Merge remote-tracking branch 'upstream/dev' into feat/web-search

2026-06-24 21:38:09 +02:00 · 2026-03-16 19:34:38 +05:30 · 2026-03-16 19:34:38 +05:30 · 60d12b0a70
commit 60d12b0a70
parent 31378a4d14 e0143e61ae
45 changed files with 377 additions and 198 deletions
--- a/surfsense_backend/alembic/versions/106_add_minimax_to_litellmprovider_enum.py
+++ b/surfsense_backend/alembic/versions/106_add_minimax_to_litellmprovider_enum.py
@ -0,0 +1,23 @@
+"""Add MINIMAX to LiteLLMProvider enum
+
+Revision ID: 106
+Revises: 105
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+revision: str = "106"
+down_revision: str | None = "105"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    op.execute("COMMIT")
+    op.execute("ALTER TYPE litellmprovider ADD VALUE IF NOT EXISTS 'MINIMAX'")
+
+
+def downgrade() -> None:
+    pass
--- a/surfsense_backend/app/agents/new_chat/llm_config.py
+++ b/surfsense_backend/app/agents/new_chat/llm_config.py
@ -59,6 +59,7 @@ PROVIDER_MAP = {
    "DATABRICKS": "databricks",
    "COMETAPI": "cometapi",
    "HUGGINGFACE": "huggingface",
+    "MINIMAX": "openai",
    "CUSTOM": "custom",
 }

--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@ -183,6 +183,23 @@ global_llm_configs:
    use_default_system_instructions: true
    citations_enabled: true

+  # Example: MiniMax M2.5 - High-performance with 204K context window
+  - id: -8
+    name: "Global MiniMax M2.5"
+    description: "MiniMax M2.5 with 204K context window and competitive pricing"
+    provider: "MINIMAX"
+    model_name: "MiniMax-M2.5"
+    api_key: "your-minimax-api-key-here"
+    api_base: "https://api.minimax.io/v1"
+    rpm: 60
+    tpm: 100000
+    litellm_params:
+      temperature: 1.0  # MiniMax requires temperature in (0.0, 1.0], cannot be 0
+      max_tokens: 4000
+    system_instructions: ""
+    use_default_system_instructions: true
+    citations_enabled: true
+
 # =============================================================================
 # Image Generation Configuration
 # =============================================================================
--- a/surfsense_backend/app/connectors/composio_gmail_connector.py
+++ b/surfsense_backend/app/connectors/composio_gmail_connector.py
@ -463,7 +463,7 @@ async def _process_gmail_messages_phase2(
                "connector_id": connector_id,
                "source": "composio",
            }
-            safe_set_chunks(document, chunks)
+            await safe_set_chunks(session, document, chunks)
            document.updated_at = get_current_timestamp()
            document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py
+++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py
@ -477,7 +477,7 @@ async def index_composio_google_calendar(
                    "connector_id": connector_id,
                    "source": "composio",
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/connectors/composio_google_drive_connector.py
+++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py
@ -1112,7 +1112,7 @@ async def _index_composio_drive_delta_sync(
                "connector_id": connector_id,
                "source": "composio",
            }
-            safe_set_chunks(document, chunks)
+            await safe_set_chunks(session, document, chunks)
            document.updated_at = get_current_timestamp()
            document.status = DocumentStatus.ready()

@ -1520,7 +1520,7 @@ async def _index_composio_drive_full_scan(
                "connector_id": connector_id,
                "source": "composio",
            }
-            safe_set_chunks(document, chunks)
+            await safe_set_chunks(session, document, chunks)
            document.updated_at = get_current_timestamp()
            document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -215,6 +215,7 @@ class LiteLLMProvider(StrEnum):
    COMETAPI = "COMETAPI"
    HUGGINGFACE = "HUGGINGFACE"
    GITHUB_MODELS = "GITHUB_MODELS"
+    MINIMAX = "MINIMAX"
    CUSTOM = "CUSTOM"


--- a/surfsense_backend/app/retriever/chunks_hybrid_search.py
+++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py
@ -1,3 +1,4 @@
+import asyncio
 import time
 from datetime import datetime

@ -49,7 +50,7 @@ class ChucksHybridSearchRetriever:
        # Get embedding for the query
        embedding_model = config.embedding_model_instance
        t_embed = time.perf_counter()
-        query_embedding = embedding_model.embed(query_text)
+        query_embedding = await asyncio.to_thread(embedding_model.embed, query_text)
        perf.debug(
            "[chunk_search] vector_search embedding in %.3fs",
            time.perf_counter() - t_embed,
@ -195,7 +196,7 @@ class ChucksHybridSearchRetriever:
        if query_embedding is None:
            embedding_model = config.embedding_model_instance
            t_embed = time.perf_counter()
-            query_embedding = embedding_model.embed(query_text)
+            query_embedding = await asyncio.to_thread(embedding_model.embed, query_text)
            perf.debug(
                "[chunk_search] hybrid_search embedding in %.3fs",
                time.perf_counter() - t_embed,
@ -427,4 +428,4 @@ class ChucksHybridSearchRetriever:
            search_space_id,
            document_type,
        )
-        return final_docs
+        return final_docs
--- a/surfsense_backend/app/services/linear/kb_sync_service.py
+++ b/surfsense_backend/app/services/linear/kb_sync_service.py
@ -1,11 +1,10 @@
 import logging
 from datetime import datetime

-from sqlalchemy import delete
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.connectors.linear_connector import LinearConnector
-from app.db import Chunk, Document
+from app.db import Document
 from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
    create_document_chunks,
@ -105,10 +104,6 @@ class LinearKBSyncService:
                )
                summary_embedding = embed_text(summary_content)

-            await self.db_session.execute(
-                delete(Chunk).where(Chunk.document_id == document.id)
-            )
-
            chunks = await create_document_chunks(issue_content)

            document.title = f"{issue_identifier}: {issue_title}"
@ -131,7 +126,7 @@ class LinearKBSyncService:
                "connector_id": connector_id,
            }
            flag_modified(document, "document_metadata")
-            safe_set_chunks(document, chunks)
+            await safe_set_chunks(self.db_session, document, chunks)
            document.updated_at = get_current_timestamp()

            await self.db_session.commit()
--- a/surfsense_backend/app/services/llm_router_service.py
+++ b/surfsense_backend/app/services/llm_router_service.py
@ -85,6 +85,7 @@ PROVIDER_MAP = {
    "ZHIPU": "openai",
    "GITHUB_MODELS": "github",
    "HUGGINGFACE": "huggingface",
+    "MINIMAX": "openai",
    "CUSTOM": "custom",
 }

--- a/surfsense_backend/app/services/llm_service.py
+++ b/surfsense_backend/app/services/llm_service.py
@ -127,6 +127,7 @@ async def validate_llm_config(
                "ALIBABA_QWEN": "openai",
                "MOONSHOT": "openai",
                "ZHIPU": "openai",  # GLM needs special handling
+                "MINIMAX": "openai",
                "GITHUB_MODELS": "github",
            }
            provider_prefix = provider_map.get(provider, provider.lower())
@ -277,6 +278,7 @@ async def get_search_space_llm_instance(
                    "ALIBABA_QWEN": "openai",
                    "MOONSHOT": "openai",
                    "ZHIPU": "openai",
+                "MINIMAX": "openai",
                }
                provider_prefix = provider_map.get(
                    global_config["provider"], global_config["provider"].lower()
@ -350,6 +352,7 @@ async def get_search_space_llm_instance(
                "ALIBABA_QWEN": "openai",
                "MOONSHOT": "openai",
                "ZHIPU": "openai",
+                "MINIMAX": "openai",
                "GITHUB_MODELS": "github",
            }
            provider_prefix = provider_map.get(
--- a/surfsense_backend/app/services/notion/kb_sync_service.py
+++ b/surfsense_backend/app/services/notion/kb_sync_service.py
@ -1,10 +1,9 @@
 import logging
 from datetime import datetime

-from sqlalchemy import delete
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.db import Chunk, Document
+from app.db import Document
 from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
    create_document_chunks,
@ -130,11 +129,6 @@ class NotionKBSyncService:
                summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
                summary_embedding = embed_text(summary_content)

-            logger.debug(f"Deleting old chunks for document {document_id}")
-            await self.db_session.execute(
-                delete(Chunk).where(Chunk.document_id == document.id)
-            )
-
            logger.debug("Creating new chunks")
            chunks = await create_document_chunks(full_content)
            logger.debug(f"Created {len(chunks)} chunks")
@ -147,7 +141,7 @@ class NotionKBSyncService:
                **document.document_metadata,
                "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
-            safe_set_chunks(document, chunks)
+            await safe_set_chunks(self.db_session, document, chunks)
            document.updated_at = get_current_timestamp()

            logger.debug("Committing changes to database")
--- a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
@ -432,7 +432,7 @@ async def index_airtable_records(
                        "table_name": item["table_name"],
                        "connector_id": connector_id,
                    }
-                    safe_set_chunks(document, chunks)
+                    await safe_set_chunks(session, document, chunks)
                    document.updated_at = get_current_timestamp()
                    document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@ -28,45 +28,37 @@ def get_current_timestamp() -> datetime:
    return datetime.now(UTC)


-def safe_set_chunks(document: Document, chunks: list) -> None:
+async def safe_set_chunks(
+    session: "AsyncSession", document: Document, chunks: list
+) -> None:
    """
-    Safely assign chunks to a document without triggering lazy loading.
+    Delete old chunks and assign new ones to a document.

-    ALWAYS use this instead of `document.chunks = chunks` to avoid
-    SQLAlchemy async errors (MissingGreenlet / greenlet_spawn).
-
-    Why this is needed:
-    - Direct assignment `document.chunks = chunks` triggers SQLAlchemy to
-      load the OLD chunks first (for comparison/orphan detection)
-    - This lazy loading fails in async context with asyncpg driver
-    - set_committed_value bypasses this by setting the value directly
-
-    This function is safe regardless of how the document was loaded
-    (with or without selectinload).
+    This replaces direct ``document.chunks = chunks`` which triggers lazy
+    loading (and MissingGreenlet errors in async contexts).  It also
+    explicitly deletes pre-existing chunks so they don't accumulate across
+    repeated re-indexes — ``set_committed_value`` bypasses SQLAlchemy's
+    delete-orphan cascade.

    Args:
-        document: The Document object to update
-        chunks: List of Chunk objects to assign
-
-    Example:
-        # Instead of: document.chunks = chunks (DANGEROUS!)
-        safe_set_chunks(document, chunks)  # Always safe
+        session: The current async database session.
+        document: The Document object to update.
+        chunks: List of Chunk objects to assign.
    """
-    from sqlalchemy.orm import object_session
+    from sqlalchemy import delete
    from sqlalchemy.orm.attributes import set_committed_value

-    # Keep relationship assignment lazy-load-safe.
-    set_committed_value(document, "chunks", chunks)
+    from app.db import Chunk

-    # Ensure chunk rows are actually persisted.
-    # set_committed_value bypasses normal unit-of-work tracking, so we need to
-    # explicitly attach chunk objects to the current session.
-    session = object_session(document)
-    if session is not None:
-        if document.id is not None:
-            for chunk in chunks:
-                chunk.document_id = document.id
-        session.add_all(chunks)
+    if document.id is not None:
+        await session.execute(
+            delete(Chunk).where(Chunk.document_id == document.id)
+        )
+        for chunk in chunks:
+            chunk.document_id = document.id
+
+    set_committed_value(document, "chunks", chunks)
+    session.add_all(chunks)


 def parse_date_flexible(date_str: str) -> datetime:
--- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
@ -430,7 +430,7 @@ async def index_bookstack_pages(
                document.content_hash = item["content_hash"]
                document.embedding = summary_embedding
                document.document_metadata = doc_metadata
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
@ -439,7 +439,7 @@ async def index_clickup_tasks(
                    "connector_id": connector_id,
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
@ -413,7 +413,7 @@ async def index_confluence_pages(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
@ -690,7 +690,7 @@ async def index_discord_messages(
                    "indexed_at": datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py
@ -386,7 +386,7 @@ async def index_elasticsearch_documents(
                    document.content_hash = item["content_hash"]
                    document.unique_identifier_hash = item["unique_identifier_hash"]
                    document.document_metadata = metadata
-                    safe_set_chunks(document, chunks)
+                    await safe_set_chunks(session, document, chunks)
                    document.updated_at = get_current_timestamp()
                    document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@ -415,7 +415,7 @@ async def index_github_repos(
                document.content_hash = item["content_hash"]
                document.embedding = summary_embedding
                document.document_metadata = doc_metadata
-                safe_set_chunks(document, chunks_data)
+                await safe_set_chunks(session, document, chunks_data)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@ -528,7 +528,7 @@ async def index_google_calendar_events(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
@ -451,7 +451,7 @@ async def index_google_gmail_messages(
                    "date": item["date_str"],
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
@ -393,7 +393,7 @@ async def index_jira_issues(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
@ -431,7 +431,7 @@ async def index_linear_issues(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
@ -488,7 +488,7 @@ async def index_luma_events(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@ -479,7 +479,7 @@ async def index_notion_pages(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py
@ -571,7 +571,7 @@ async def index_obsidian_vault(
                document.content_hash = content_hash
                document.embedding = embedding
                document.document_metadata = document_metadata
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
@ -564,7 +564,7 @@ async def index_slack_messages(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
@ -603,7 +603,7 @@ async def index_teams_messages(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()

--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@ -410,7 +410,7 @@ async def index_crawled_urls(
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
-                safe_set_chunks(document, chunks)
+                await safe_set_chunks(session, document, chunks)
                document.status = DocumentStatus.ready()  # READY status
                document.updated_at = get_current_timestamp()

--- a/surfsense_backend/app/tasks/document_processors/base.py
+++ b/surfsense_backend/app/tasks/document_processors/base.py
@ -14,45 +14,37 @@ from app.db import Document
 md = MarkdownifyTransformer()


-def safe_set_chunks(document: Document, chunks: list) -> None:
+async def safe_set_chunks(
+    session: "AsyncSession", document: Document, chunks: list
+) -> None:
    """
-    Safely assign chunks to a document without triggering lazy loading.
+    Delete old chunks and assign new ones to a document.

-    ALWAYS use this instead of `document.chunks = chunks` to avoid
-    SQLAlchemy async errors (MissingGreenlet / greenlet_spawn).
-
-    Why this is needed:
-    - Direct assignment `document.chunks = chunks` triggers SQLAlchemy to
-      load the OLD chunks first (for comparison/orphan detection)
-    - This lazy loading fails in async context with asyncpg driver
-    - set_committed_value bypasses this by setting the value directly
-
-    This function is safe regardless of how the document was loaded
-    (with or without selectinload).
+    This replaces direct ``document.chunks = chunks`` which triggers lazy
+    loading (and MissingGreenlet errors in async contexts).  It also
+    explicitly deletes pre-existing chunks so they don't accumulate across
+    repeated re-indexes — ``set_committed_value`` bypasses SQLAlchemy's
+    delete-orphan cascade.

    Args:
-        document: The Document object to update
-        chunks: List of Chunk objects to assign
-
-    Example:
-        # Instead of: document.chunks = chunks (DANGEROUS!)
-        safe_set_chunks(document, chunks)  # Always safe
+        session: The current async database session.
+        document: The Document object to update.
+        chunks: List of Chunk objects to assign.
    """
-    from sqlalchemy.orm import object_session
+    from sqlalchemy import delete
    from sqlalchemy.orm.attributes import set_committed_value

-    # Keep relationship assignment lazy-load-safe.
-    set_committed_value(document, "chunks", chunks)
+    from app.db import Chunk

-    # Ensure chunk rows are actually persisted.
-    # set_committed_value bypasses normal unit-of-work tracking, so we need to
-    # explicitly attach chunk objects to the current session.
-    session = object_session(document)
-    if session is not None:
-        if document.id is not None:
-            for chunk in chunks:
-                chunk.document_id = document.id
-        session.add_all(chunks)
+    if document.id is not None:
+        await session.execute(
+            delete(Chunk).where(Chunk.document_id == document.id)
+        )
+        for chunk in chunks:
+            chunk.document_id = document.id
+
+    set_committed_value(document, "chunks", chunks)
+    session.add_all(chunks)


 def get_current_timestamp() -> datetime:
--- a/surfsense_backend/app/tasks/document_processors/circleback_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/circleback_processor.py
@ -227,7 +227,7 @@ async def add_circleback_meeting_document(
        if summary_embedding is not None:
            document.embedding = summary_embedding
        document.document_metadata = document_metadata
-        safe_set_chunks(document, chunks)
+        await safe_set_chunks(session, document, chunks)
        document.source_markdown = markdown_content
        document.content_needs_reindexing = False
        document.updated_at = get_current_timestamp()
--- a/surfsense_backend/app/tasks/document_processors/extension_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/extension_processor.py
@ -21,6 +21,7 @@ from app.utils.document_converters import (
 from .base import (
    check_document_by_unique_identifier,
    get_current_timestamp,
+    safe_set_chunks,
 )


@ -154,7 +155,7 @@ async def add_extension_received_document(
            existing_document.content_hash = content_hash
            existing_document.embedding = summary_embedding
            existing_document.document_metadata = content.metadata.model_dump()
-            existing_document.chunks = chunks
+            await safe_set_chunks(session, existing_document, chunks)
            existing_document.source_markdown = combined_document_string
            existing_document.updated_at = get_current_timestamp()

--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -35,6 +35,7 @@ from .base import (
    check_document_by_unique_identifier,
    check_duplicate_document,
    get_current_timestamp,
+    safe_set_chunks,
 )
 from .markdown_processor import add_received_markdown_file_document

@ -488,7 +489,7 @@ async def add_received_file_document_using_unstructured(
                "FILE_NAME": file_name,
                "ETL_SERVICE": "UNSTRUCTURED",
            }
-            existing_document.chunks = chunks
+            await safe_set_chunks(session, existing_document, chunks)
            existing_document.source_markdown = file_in_markdown
            existing_document.content_needs_reindexing = False
            existing_document.updated_at = get_current_timestamp()
@ -622,7 +623,7 @@ async def add_received_file_document_using_llamacloud(
                "FILE_NAME": file_name,
                "ETL_SERVICE": "LLAMACLOUD",
            }
-            existing_document.chunks = chunks
+            await safe_set_chunks(session, existing_document, chunks)
            existing_document.source_markdown = file_in_markdown
            existing_document.content_needs_reindexing = False
            existing_document.updated_at = get_current_timestamp()
@ -777,7 +778,7 @@ async def add_received_file_document_using_docling(
                "FILE_NAME": file_name,
                "ETL_SERVICE": "DOCLING",
            }
-            existing_document.chunks = chunks
+            await safe_set_chunks(session, existing_document, chunks)
            existing_document.source_markdown = file_in_markdown
            existing_document.content_needs_reindexing = False
            existing_document.updated_at = get_current_timestamp()
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@ -21,6 +21,7 @@ from .base import (
    check_document_by_unique_identifier,
    check_duplicate_document,
    get_current_timestamp,
+    safe_set_chunks,
 )


@ -258,7 +259,7 @@ async def add_received_markdown_file_document(
            existing_document.document_metadata = {
                "FILE_NAME": file_name,
            }
-            existing_document.chunks = chunks
+            await safe_set_chunks(session, existing_document, chunks)
            existing_document.source_markdown = file_in_markdown
            existing_document.updated_at = get_current_timestamp()
            existing_document.status = DocumentStatus.ready()  # Mark as ready
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@ -419,7 +419,7 @@ async def add_youtube_video_document(
            "author": video_data.get("author_name", "Unknown"),
            "thumbnail": video_data.get("thumbnail_url", ""),
        }
-        safe_set_chunks(document, chunks)
+        await safe_set_chunks(session, document, chunks)
        document.source_markdown = combined_document_string
        document.status = DocumentStatus.ready()  # READY status - fully processed
        document.updated_at = get_current_timestamp()
--- a/surfsense_backend/app/tasks/surfsense_docs_indexer.py
+++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py
@ -13,12 +13,32 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import selectinload

+from sqlalchemy import delete as sa_delete
+from sqlalchemy.orm.attributes import set_committed_value
+
 from app.config import config
 from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
 from app.utils.document_converters import embed_text

 logger = logging.getLogger(__name__)

+
+async def _safe_set_docs_chunks(
+    session: AsyncSession, document: SurfsenseDocsDocument, chunks: list
+) -> None:
+    """safe_set_chunks variant for the SurfsenseDocsDocument/Chunk models."""
+    if document.id is not None:
+        await session.execute(
+            sa_delete(SurfsenseDocsChunk).where(
+                SurfsenseDocsChunk.document_id == document.id
+            )
+        )
+        for chunk in chunks:
+            chunk.document_id = document.id
+
+    set_committed_value(document, "chunks", chunks)
+    session.add_all(chunks)
+
 # Path to docs relative to project root
 DOCS_DIR = (
    Path(__file__).resolve().parent.parent.parent.parent
@ -156,7 +176,7 @@ async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, in
                existing_doc.content = content
                existing_doc.content_hash = content_hash
                existing_doc.embedding = embed_text(content)
-                existing_doc.chunks = chunks
+                await _safe_set_docs_chunks(session, existing_doc, chunks)
                existing_doc.updated_at = datetime.now(UTC)

                updated += 1