refactor: implement UploadDocumentAdapter for file indexing and reindexing

2026-04-27 17:56:25 +02:00 · 2026-02-28 01:38:32 +05:30 · 2026-02-28 01:38:32 +05:30 · 23a98d802c
commit 23a98d802c
parent 1e4b8d3e89
4 changed files with 227 additions and 97 deletions
--- a/surfsense_backend/app/tasks/celery_tasks/document_reindex_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/document_reindex_tasks.py
@ -2,7 +2,7 @@

 import logging

-from sqlalchemy import delete, select
+from sqlalchemy import select
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
 from sqlalchemy.orm import selectinload
@ -11,12 +11,9 @@ from sqlalchemy.pool import NullPool
 from app.celery_app import celery_app
 from app.config import config
 from app.db import Document
+from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import (
-    create_document_chunks,
-    generate_document_summary,
-)

 logger = logging.getLogger(__name__)

@ -54,7 +51,6 @@ def reindex_document_task(self, document_id: int, user_id: str):
 async def _reindex_document(document_id: int, user_id: str):
    """Async function to reindex a document."""
    async with get_celery_session_maker()() as session:
-        # First, get the document to get search_space_id for logging
        result = await session.execute(
            select(Document)
            .options(selectinload(Document.chunks))
@ -66,10 +62,8 @@ async def _reindex_document(document_id: int, user_id: str):
            logger.error(f"Document {document_id} not found")
            return

-        # Initialize task logger
        task_logger = TaskLoggingService(session, document.search_space_id)

-        # Log task start
        log_entry = await task_logger.log_task_start(
            task_name="document_reindex",
            source="editor",
@ -83,10 +77,7 @@ async def _reindex_document(document_id: int, user_id: str):
        )

        try:
-            # Read markdown directly from source_markdown
-            markdown_content = document.source_markdown
-
-            if not markdown_content:
+            if not document.source_markdown:
                await task_logger.log_task_failure(
                    log_entry,
                    f"Document {document_id} has no source_markdown to reindex",
@ -97,51 +88,17 @@ async def _reindex_document(document_id: int, user_id: str):

            logger.info(f"Reindexing document {document_id} ({document.title})")

-            # 1. Delete old chunks explicitly
-            from app.db import Chunk
-
-            await session.execute(delete(Chunk).where(Chunk.document_id == document_id))
-            await session.flush()  # Ensure old chunks are deleted
-
-            # 2. Create new chunks from source_markdown
-            new_chunks = await create_document_chunks(markdown_content)
-
-            # 3. Add new chunks to session
-            for chunk in new_chunks:
-                chunk.document_id = document_id
-                session.add(chunk)
-
-            logger.info(f"Created {len(new_chunks)} chunks for document {document_id}")
-
-            # 4. Regenerate summary
            user_llm = await get_user_long_context_llm(
                session, user_id, document.search_space_id
            )

-            document_metadata = {
-                "title": document.title,
-                "document_type": document.document_type.value,
-            }
+            adapter = UploadDocumentAdapter(session)
+            await adapter.reindex(document=document, llm=user_llm)

-            summary_content, summary_embedding = await generate_document_summary(
-                markdown_content, user_llm, document_metadata
-            )
-
-            # 5. Update document
-            document.content = summary_content
-            document.embedding = summary_embedding
-            document.content_needs_reindexing = False
-
-            await session.commit()
-
-            # Log success
            await task_logger.log_task_success(
                log_entry,
                f"Successfully reindexed document: {document.title}",
-                {
-                    "chunks_created": len(new_chunks),
-                    "document_id": document_id,
-                },
+                {"document_id": document_id},
            )

            logger.info(f"Successfully reindexed document {document_id}")