refactor: update safe_set_chunks function to be asynchronous and modify all connector and document processor files to use the new async implementation

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-03-15 00:44:27 -07:00
parent 49d8f41b09
commit 2b33dfe728
30 changed files with 102 additions and 106 deletions

View file

@@ -14,45 +14,37 @@ from app.db import Document
md = MarkdownifyTransformer()
async def safe_set_chunks(
    session: "AsyncSession", document: Document, chunks: list
) -> None:
    """
    Delete old chunks and assign new ones to a document.

    This replaces direct ``document.chunks = chunks``, which triggers lazy
    loading of the old collection (and MissingGreenlet errors in async
    contexts with the asyncpg driver). It also explicitly deletes
    pre-existing chunk rows so they don't accumulate across repeated
    re-indexes, since ``set_committed_value`` bypasses SQLAlchemy's
    delete-orphan cascade.

    This function is safe regardless of how the document was loaded
    (with or without selectinload).

    Args:
        session: The current async database session.
        document: The Document object to update.
        chunks: List of Chunk objects to assign.
    """
    from sqlalchemy import delete
    from sqlalchemy.orm.attributes import set_committed_value

    from app.db import Chunk

    # Explicitly remove previously persisted chunks: the delete-orphan
    # cascade cannot be relied on because set_committed_value (below)
    # sidesteps SQLAlchemy's change tracking.
    if document.id is not None:
        await session.execute(
            delete(Chunk).where(Chunk.document_id == document.id)
        )

    # Point the new chunks at the document row. NOTE(review): when
    # document.id is still None (document not yet flushed), this assigns
    # None and the FK is presumably populated at flush time — confirm
    # against callers that pass unflushed documents.
    for chunk in chunks:
        chunk.document_id = document.id

    # Assign the relationship value directly, without triggering a lazy
    # load of the old collection.
    set_committed_value(document, "chunks", chunks)

    # set_committed_value bypasses normal unit-of-work tracking, so the
    # chunk objects must be attached to the session explicitly to be
    # persisted.
    session.add_all(chunks)
def get_current_timestamp() -> datetime:

View file

@@ -227,7 +227,7 @@ async def add_circleback_meeting_document(
if summary_embedding is not None:
document.embedding = summary_embedding
document.document_metadata = document_metadata
safe_set_chunks(document, chunks)
await safe_set_chunks(session, document, chunks)
document.source_markdown = markdown_content
document.content_needs_reindexing = False
document.updated_at = get_current_timestamp()

View file

@@ -21,6 +21,7 @@ from app.utils.document_converters import (
from .base import (
check_document_by_unique_identifier,
get_current_timestamp,
safe_set_chunks,
)
@@ -154,7 +155,7 @@ async def add_extension_received_document(
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = content.metadata.model_dump()
existing_document.chunks = chunks
await safe_set_chunks(session, existing_document, chunks)
existing_document.source_markdown = combined_document_string
existing_document.updated_at = get_current_timestamp()

View file

@@ -35,6 +35,7 @@ from .base import (
check_document_by_unique_identifier,
check_duplicate_document,
get_current_timestamp,
safe_set_chunks,
)
from .markdown_processor import add_received_markdown_file_document
@@ -488,7 +489,7 @@ async def add_received_file_document_using_unstructured(
"FILE_NAME": file_name,
"ETL_SERVICE": "UNSTRUCTURED",
}
existing_document.chunks = chunks
await safe_set_chunks(session, existing_document, chunks)
existing_document.source_markdown = file_in_markdown
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
@@ -622,7 +623,7 @@ async def add_received_file_document_using_llamacloud(
"FILE_NAME": file_name,
"ETL_SERVICE": "LLAMACLOUD",
}
existing_document.chunks = chunks
await safe_set_chunks(session, existing_document, chunks)
existing_document.source_markdown = file_in_markdown
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
@@ -777,7 +778,7 @@ async def add_received_file_document_using_docling(
"FILE_NAME": file_name,
"ETL_SERVICE": "DOCLING",
}
existing_document.chunks = chunks
await safe_set_chunks(session, existing_document, chunks)
existing_document.source_markdown = file_in_markdown
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()

View file

@@ -21,6 +21,7 @@ from .base import (
check_document_by_unique_identifier,
check_duplicate_document,
get_current_timestamp,
safe_set_chunks,
)
@@ -258,7 +259,7 @@ async def add_received_markdown_file_document(
existing_document.document_metadata = {
"FILE_NAME": file_name,
}
existing_document.chunks = chunks
await safe_set_chunks(session, existing_document, chunks)
existing_document.source_markdown = file_in_markdown
existing_document.updated_at = get_current_timestamp()
existing_document.status = DocumentStatus.ready() # Mark as ready

View file

@@ -419,7 +419,7 @@ async def add_youtube_video_document(
"author": video_data.get("author_name", "Unknown"),
"thumbnail": video_data.get("thumbnail_url", ""),
}
safe_set_chunks(document, chunks)
await safe_set_chunks(session, document, chunks)
document.source_markdown = combined_document_string
document.status = DocumentStatus.ready() # READY status - fully processed
document.updated_at = get_current_timestamp()