refactor: implement UploadDocumentAdapter for file indexing and reindexing

This commit is contained in:
Anish Sarkar 2026-02-28 01:38:32 +05:30
parent 1e4b8d3e89
commit 23a98d802c
4 changed files with 227 additions and 97 deletions

View file

@ -1,47 +1,83 @@
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import DocumentStatus, DocumentType
from app.db import Document, DocumentStatus, DocumentType
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_content_hash
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
async def index_uploaded_file(
markdown_content: str,
filename: str,
etl_service: str,
search_space_id: int,
user_id: str,
session: AsyncSession,
llm,
should_summarize: bool = False,
) -> None:
connector_doc = ConnectorDocument(
title=filename,
source_markdown=markdown_content,
unique_id=filename,
document_type=DocumentType.FILE,
search_space_id=search_space_id,
created_by_id=user_id,
connector_id=None,
should_summarize=should_summarize,
should_use_code_chunker=False,
fallback_summary=markdown_content[:4000],
metadata={
"FILE_NAME": filename,
"ETL_SERVICE": etl_service,
},
)
class UploadDocumentAdapter:
def __init__(self, session: AsyncSession) -> None:
self._session = session
self._service = IndexingPipelineService(session)
service = IndexingPipelineService(session)
documents = await service.prepare_for_indexing([connector_doc])
async def index(
self,
markdown_content: str,
filename: str,
etl_service: str,
search_space_id: int,
user_id: str,
llm,
should_summarize: bool = False,
) -> None:
connector_doc = ConnectorDocument(
title=filename,
source_markdown=markdown_content,
unique_id=filename,
document_type=DocumentType.FILE,
search_space_id=search_space_id,
created_by_id=user_id,
connector_id=None,
should_summarize=should_summarize,
should_use_code_chunker=False,
fallback_summary=markdown_content[:4000],
metadata={
"FILE_NAME": filename,
"ETL_SERVICE": etl_service,
},
)
if not documents:
raise RuntimeError("prepare_for_indexing returned no documents")
documents = await self._service.prepare_for_indexing([connector_doc])
indexed = await service.index(documents[0], connector_doc, llm)
if not documents:
raise RuntimeError("prepare_for_indexing returned no documents")
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
raise RuntimeError(indexed.status.get("reason", "Indexing failed"))
indexed = await self._service.index(documents[0], connector_doc, llm)
indexed.content_needs_reindexing = False
await session.commit()
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
raise RuntimeError(indexed.status.get("reason", "Indexing failed"))
indexed.content_needs_reindexing = False
await self._session.commit()
async def reindex(self, document: Document, llm) -> None:
"""Re-index an existing document after its source_markdown has been updated."""
if not document.source_markdown:
raise RuntimeError("Document has no source_markdown to reindex")
metadata = document.document_metadata or {}
connector_doc = ConnectorDocument(
title=document.title,
source_markdown=document.source_markdown,
unique_id=document.title,
document_type=document.document_type,
search_space_id=document.search_space_id,
created_by_id=str(document.created_by_id),
connector_id=document.connector_id,
should_summarize=True,
should_use_code_chunker=False,
fallback_summary=document.source_markdown[:4000],
metadata=metadata,
)
document.content_hash = compute_content_hash(connector_doc)
indexed = await self._service.index(document, connector_doc, llm)
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
raise RuntimeError(indexed.status.get("reason", "Reindexing failed"))
indexed.content_needs_reindexing = False
await self._session.commit()