refactor: implement UploadDocumentAdapter for file indexing and reindexing

This commit is contained in:
Anish Sarkar 2026-02-28 01:38:32 +05:30
parent 1e4b8d3e89
commit 23a98d802c
4 changed files with 227 additions and 97 deletions

View file

@ -2,7 +2,7 @@
import logging
from sqlalchemy import delete, select
from sqlalchemy import select
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.orm import selectinload
@ -11,12 +11,9 @@ from sqlalchemy.pool import NullPool
from app.celery_app import celery_app
from app.config import config
from app.db import Document
from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
generate_document_summary,
)
logger = logging.getLogger(__name__)
@ -54,7 +51,6 @@ def reindex_document_task(self, document_id: int, user_id: str):
async def _reindex_document(document_id: int, user_id: str):
"""Async function to reindex a document."""
async with get_celery_session_maker()() as session:
# First, get the document to get search_space_id for logging
result = await session.execute(
select(Document)
.options(selectinload(Document.chunks))
@ -66,10 +62,8 @@ async def _reindex_document(document_id: int, user_id: str):
logger.error(f"Document {document_id} not found")
return
# Initialize task logger
task_logger = TaskLoggingService(session, document.search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="document_reindex",
source="editor",
@ -83,10 +77,7 @@ async def _reindex_document(document_id: int, user_id: str):
)
try:
# Read markdown directly from source_markdown
markdown_content = document.source_markdown
if not markdown_content:
if not document.source_markdown:
await task_logger.log_task_failure(
log_entry,
f"Document {document_id} has no source_markdown to reindex",
@ -97,51 +88,17 @@ async def _reindex_document(document_id: int, user_id: str):
logger.info(f"Reindexing document {document_id} ({document.title})")
# 1. Delete old chunks explicitly
from app.db import Chunk
await session.execute(delete(Chunk).where(Chunk.document_id == document_id))
await session.flush() # Ensure old chunks are deleted
# 2. Create new chunks from source_markdown
new_chunks = await create_document_chunks(markdown_content)
# 3. Add new chunks to session
for chunk in new_chunks:
chunk.document_id = document_id
session.add(chunk)
logger.info(f"Created {len(new_chunks)} chunks for document {document_id}")
# 4. Regenerate summary
user_llm = await get_user_long_context_llm(
session, user_id, document.search_space_id
)
document_metadata = {
"title": document.title,
"document_type": document.document_type.value,
}
adapter = UploadDocumentAdapter(session)
await adapter.reindex(document=document, llm=user_llm)
summary_content, summary_embedding = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
# 5. Update document
document.content = summary_content
document.embedding = summary_embedding
document.content_needs_reindexing = False
await session.commit()
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully reindexed document: {document.title}",
{
"chunks_created": len(new_chunks),
"document_id": document_id,
},
{"document_id": document_id},
)
logger.info(f"Successfully reindexed document {document_id}")