refactor: remove search_surfsense_docs tool and related references

- Deleted the `search_surfsense_docs` tool and its associated files, streamlining the agent's toolset.
- Updated various components and prompts to remove references to the now-removed tool, ensuring consistency across the codebase.
- Adjusted documentation to direct users to the SurfSense documentation link for product-related queries instead.
2026-07-14 22:52:15 +02:00 · 2026-05-28 22:35:14 -07:00 · 2026-05-28 22:35:14 -07:00 · 40ca9e6ed2
commit 40ca9e6ed2
parent 9b9e6828c7
71 changed files with 232 additions and 1676 deletions
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@ -25,7 +25,6 @@ from uuid import UUID
 import anyio
 from langchain_core.messages import HumanMessage
 from sqlalchemy.future import select
-from sqlalchemy.orm import selectinload

 from app.agents.multi_agent_chat import create_multi_agent_chat_deep_agent
 from app.agents.new_chat.chat_deepagent import create_surfsense_deep_agent
@ -55,7 +54,6 @@ from app.db import (
    NewChatThread,
    Report,
    SearchSourceConnectorType,
-    SurfsenseDocsDocument,
    async_session_maker,
    shielded_async_session,
 )
@ -77,7 +75,6 @@ from app.tasks.chat.streaming.helpers.interrupt_inspector import (
 )
 from app.utils.content_utils import bootstrap_history_from_db
 from app.utils.perf import get_perf_logger, log_system_snapshot, trim_native_heap
-from app.utils.surfsense_docs import surfsense_docs_public_url
 from app.utils.user_message_multimodal import build_human_message_content

 _background_tasks: set[asyncio.Task] = set()
@ -198,58 +195,6 @@ def _extract_chunk_parts(chunk: Any) -> dict[str, Any]:
    return out


-def format_mentioned_surfsense_docs_as_context(
-    documents: list[SurfsenseDocsDocument],
-) -> str:
-    """Format mentioned SurfSense documentation as context for the agent."""
-    if not documents:
-        return ""
-
-    context_parts = ["<mentioned_surfsense_docs>"]
-    context_parts.append(
-        "The user has explicitly mentioned the following SurfSense documentation pages. "
-        "These are official documentation about how to use SurfSense and should be used to answer questions about the application. "
-        "Use [citation:CHUNK_ID] format for citations (e.g., [citation:doc-123])."
-    )
-
-    for doc in documents:
-        public_url = surfsense_docs_public_url(doc.source)
-        metadata_json = json.dumps(
-            {"source": doc.source, "public_url": public_url}, ensure_ascii=False
-        )
-
-        context_parts.append("<document>")
-        context_parts.append("<document_metadata>")
-        context_parts.append(f"  <document_id>doc-{doc.id}</document_id>")
-        context_parts.append("  <document_type>SURFSENSE_DOCS</document_type>")
-        context_parts.append(f"  <title><![CDATA[{doc.title}]]></title>")
-        context_parts.append(f"  <url><![CDATA[{public_url}]]></url>")
-        context_parts.append(
-            f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
-        )
-        context_parts.append("</document_metadata>")
-        context_parts.append("")
-        context_parts.append("<document_content>")
-
-        if hasattr(doc, "chunks") and doc.chunks:
-            for chunk in doc.chunks:
-                context_parts.append(
-                    f"  <chunk id='doc-{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
-                )
-        else:
-            context_parts.append(
-                f"  <chunk id='doc-0'><![CDATA[{doc.content}]]></chunk>"
-            )
-
-        context_parts.append("</document_content>")
-        context_parts.append("</document>")
-        context_parts.append("")
-
-    context_parts.append("</mentioned_surfsense_docs>")
-
-    return "\n".join(context_parts)
-
-
 def extract_todos_from_deepagents(command_output) -> dict:
    """
    Extract todos from deepagents' TodoListMiddleware Command output.
@ -837,7 +782,6 @@ async def stream_new_chat(
    user_id: str | None = None,
    llm_config_id: int = -1,
    mentioned_document_ids: list[int] | None = None,
-    mentioned_surfsense_doc_ids: list[int] | None = None,
    mentioned_folder_ids: list[int] | None = None,
    mentioned_connector_ids: list[int] | None = None,
    mentioned_connectors: list[dict[str, Any]] | None = None,
@ -869,7 +813,6 @@ async def stream_new_chat(
        llm_config_id: The LLM configuration ID (default: -1 for first global config)
        needs_history_bootstrap: If True, load message history from DB (for cloned chats)
        mentioned_document_ids: Optional list of document IDs mentioned with @ in the chat
-        mentioned_surfsense_doc_ids: Optional list of SurfSense doc IDs mentioned with @ in the chat
        mentioned_folder_ids: Optional list of knowledge-base folder IDs mentioned with @ (cloud mode)
        checkpoint_id: Optional checkpoint ID to rewind/fork from (for edit/reload operations)

@ -1295,19 +1238,7 @@ async def stream_new_chat(

        # Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware
        # which merges them into the scoped filesystem with full document
-        # structure. Only SurfSense docs and report context are inlined here.
-
-        # Fetch mentioned SurfSense docs if any
-        mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
-        if mentioned_surfsense_doc_ids:
-            result = await session.execute(
-                select(SurfsenseDocsDocument)
-                .options(selectinload(SurfsenseDocsDocument.chunks))
-                .filter(
-                    SurfsenseDocsDocument.id.in_(mentioned_surfsense_doc_ids),
-                )
-            )
-            mentioned_surfsense_docs = list(result.scalars().all())
+        # structure. Only report context is inlined here.

        # Fetch the most recent report(s) in this thread so the LLM can
        # easily find report_id for versioning decisions, instead of
@ -1341,10 +1272,7 @@ async def stream_new_chat(
        agent_user_query = user_query
        accepted_folder_ids: list[int] = []
        if fs_mode == FilesystemMode.CLOUD.value and (
-            mentioned_document_ids
-            or mentioned_surfsense_doc_ids
-            or mentioned_folder_ids
-            or mentioned_documents
+            mentioned_document_ids or mentioned_folder_ids or mentioned_documents
        ):
            from app.schemas.new_chat import (
                MentionedDocumentInfo as _MentionedDocumentInfo,
@ -1370,23 +1298,17 @@ async def stream_new_chat(
                search_space_id=search_space_id,
                mentioned_documents=chip_objs,
                mentioned_document_ids=mentioned_document_ids,
-                mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
                mentioned_folder_ids=mentioned_folder_ids,
            )
            agent_user_query = substitute_in_text(user_query, resolved.token_to_path)
            accepted_folder_ids = resolved.mentioned_folder_ids

-        # Format the user query with context (SurfSense docs + reports only).
+        # Format the user query with context (reports only).
        # Uses ``agent_user_query`` so the LLM sees backtick-wrapped paths
        # instead of bare ``@title`` tokens.
        final_query = agent_user_query
        context_parts = []

-        if mentioned_surfsense_docs:
-            context_parts.append(
-                format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
-            )
-
        if mentioned_connectors:
            connector_lines = []
            for connector in mentioned_connectors:
@ -1617,12 +1539,8 @@ async def stream_new_chat(
        stream_result.content_builder = AssistantContentBuilder()

        # Initial thinking step - analyzing the request
-        if mentioned_surfsense_docs:
-            initial_title = "Analyzing referenced content"
-            action_verb = "Analyzing"
-        else:
-            initial_title = "Understanding your request"
-            action_verb = "Processing"
+        initial_title = "Understanding your request"
+        action_verb = "Processing"

        processing_parts = []
        if user_query.strip():
@ -1633,18 +1551,6 @@ async def stream_new_chat(
        else:
            processing_parts.append("(message)")

-        if mentioned_surfsense_docs:
-            doc_names = []
-            for doc in mentioned_surfsense_docs:
-                title = doc.title
-                if len(title) > 30:
-                    title = title[:27] + "..."
-                doc_names.append(title)
-            if len(doc_names) == 1:
-                processing_parts.append(f"[{doc_names[0]}]")
-            else:
-                processing_parts.append(f"[{len(doc_names)} docs]")
-
        initial_items = [f"{action_verb}: {' '.join(processing_parts)}"]
        initial_step_id = "thinking-1"

@ -1664,10 +1570,10 @@ async def stream_new_chat(
            items=initial_items,
        )

-        # These ORM objects (with eagerly-loaded chunks) can be very large.
-        # They're only needed to build context strings already copied into
-        # final_query / langchain_messages — release them before streaming.
-        del mentioned_surfsense_docs, recent_reports
+        # These ORM objects can be large. They're only needed to build context
+        # strings already copied into final_query / langchain_messages —
+        # release them before streaming.
+        del recent_reports
        del langchain_messages, final_query

        # Check if this is the first assistant response so we can generate
--- a/surfsense_backend/app/tasks/chat/streaming/context/init.py
+++ b/surfsense_backend/app/tasks/chat/streaming/context/init.py
@ -1,15 +1,11 @@
-"""Pre-agent context shaping: mentioned-doc rendering and todos extraction."""
+"""Pre-agent context shaping: todos extraction."""

 from __future__ import annotations

 from app.tasks.chat.streaming.context.deepagents_todos import (
    extract_todos_from_deepagents,
 )
-from app.tasks.chat.streaming.context.mentioned_docs import (
-    format_mentioned_surfsense_docs_as_context,
-)

 __all__ = [
    "extract_todos_from_deepagents",
-    "format_mentioned_surfsense_docs_as_context",
 ]
--- a/surfsense_backend/app/tasks/chat/streaming/context/mentioned_docs.py
+++ b/surfsense_backend/app/tasks/chat/streaming/context/mentioned_docs.py
@ -1,58 +0,0 @@
-"""Render user-mentioned SurfSense docs as XML context for the agent."""
-
-from __future__ import annotations
-
-import json
-
-from app.db import SurfsenseDocsDocument
-from app.utils.surfsense_docs import surfsense_docs_public_url
-
-
-def format_mentioned_surfsense_docs_as_context(
-    documents: list[SurfsenseDocsDocument],
-) -> str:
-    if not documents:
-        return ""
-
-    context_parts = ["<mentioned_surfsense_docs>"]
-    context_parts.append(
-        "The user has explicitly mentioned the following SurfSense documentation pages. "
-        "These are official documentation about how to use SurfSense and should be used to answer questions about the application. "
-        "Use [citation:CHUNK_ID] format for citations (e.g., [citation:doc-123])."
-    )
-
-    for doc in documents:
-        public_url = surfsense_docs_public_url(doc.source)
-        metadata_json = json.dumps(
-            {"source": doc.source, "public_url": public_url}, ensure_ascii=False
-        )
-
-        context_parts.append("<document>")
-        context_parts.append("<document_metadata>")
-        context_parts.append(f"  <document_id>doc-{doc.id}</document_id>")
-        context_parts.append("  <document_type>SURFSENSE_DOCS</document_type>")
-        context_parts.append(f"  <title><![CDATA[{doc.title}]]></title>")
-        context_parts.append(f"  <url><![CDATA[{public_url}]]></url>")
-        context_parts.append(
-            f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
-        )
-        context_parts.append("</document_metadata>")
-        context_parts.append("")
-        context_parts.append("<document_content>")
-
-        if hasattr(doc, "chunks") and doc.chunks:
-            for chunk in doc.chunks:
-                context_parts.append(
-                    f"  <chunk id='doc-{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
-                )
-        else:
-            context_parts.append(
-                f"  <chunk id='doc-0'><![CDATA[{doc.content}]]></chunk>"
-            )
-
-        context_parts.append("</document_content>")
-        context_parts.append("</document>")
-        context_parts.append("")
-
-    context_parts.append("</mentioned_surfsense_docs>")
-    return "\n".join(context_parts)
--- a/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/initial_thinking_step.py
+++ b/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/initial_thinking_step.py
@ -1,8 +1,8 @@
 """Build and emit the first ``thinking-1`` step for a new-chat turn.

 The step title and "Processing X" items are derived from what the user sent
-(text snippet, image count, mentioned doc titles) so the FE can render a
-meaningful placeholder while the agent stream warms up.
+(text snippet, image count) so the FE can render a meaningful placeholder
+while the agent stream warms up.

 ``thinking-1`` is the canonical id for this step — every subsequent
 ``thinking-N`` produced by ``stream_agent_events`` folds into the same
@ -15,7 +15,6 @@ from collections.abc import Iterator
 from dataclasses import dataclass
 from typing import Any

-from app.db import SurfsenseDocsDocument
 from app.services.new_streaming_service import VercelStreamingService


@ -37,14 +36,9 @@ def build_initial_thinking_step(
    *,
    user_query: str,
    user_image_data_urls: list[str] | None,
-    mentioned_surfsense_docs: list[SurfsenseDocsDocument],
 ) -> InitialThinkingStep:
-    if mentioned_surfsense_docs:
-        title = "Analyzing referenced content"
-        action_verb = "Analyzing"
-    else:
-        title = "Understanding your request"
-        action_verb = "Processing"
+    title = "Understanding your request"
+    action_verb = "Processing"

    processing_parts: list[str] = []
    if user_query.strip():
@ -55,18 +49,6 @@ def build_initial_thinking_step(
    else:
        processing_parts.append("(message)")

-    if mentioned_surfsense_docs:
-        doc_names: list[str] = []
-        for doc in mentioned_surfsense_docs:
-            t = doc.title
-            if len(t) > 30:
-                t = t[:27] + "..."
-            doc_names.append(t)
-        if len(doc_names) == 1:
-            processing_parts.append(f"[{doc_names[0]}]")
-        else:
-            processing_parts.append(f"[{len(doc_names)} docs]")
-
    items = [f"{action_verb}: {' '.join(processing_parts)}"]
    return InitialThinkingStep(step_id="thinking-1", title=title, items=items)

--- a/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/input_state.py
+++ b/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/input_state.py
@ -5,20 +5,17 @@ Pipeline:
  1. **History bootstrap** — only for cloned chats with no LangGraph checkpoint
     yet; flips the per-thread ``needs_history_bootstrap`` flag back to False
     once the rows are loaded.
-  2. **Mentioned SurfSense docs** — eager-load chunks so the formatter has the
-     full content without a second roundtrip.
-  3. **Recent reports** — top 3 by id desc with non-null content, so the LLM
+  2. **Recent reports** — top 3 by id desc with non-null content, so the LLM
     can resolve ``report_id`` for versioning without spelunking history.
-  4. **@-mention resolve** (cloud mode) — substitute ``@title`` tokens in the
+  3. **@-mention resolve** (cloud mode) — substitute ``@title`` tokens in the
     query with canonical ``\`/documents/...\``` paths the LLM expects.
-  5. **Context block render** — XML-wrap surfsense docs + reports, prepend to
-     the rewritten query, optionally prefix with display name for SEARCH_SPACE
+  4. **Context block render** — XML-wrap recent reports, prepend to the
+     rewritten query, optionally prefix with display name for SEARCH_SPACE
     visibility.
-  6. **HumanMessage** — multimodal content if images are attached.
+  5. **HumanMessage** — multimodal content if images are attached.

 Returns the assembled ``input_state`` dict plus side-channel data the
-orchestrator needs downstream (``accepted_folder_ids`` for runtime context;
-``mentioned_surfsense_docs`` for the initial thinking step).
+orchestrator needs downstream (``accepted_folder_ids`` for runtime context).
 """

 from __future__ import annotations
@ -30,7 +27,6 @@ from typing import Any
 from langchain_core.messages import HumanMessage
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
-from sqlalchemy.orm import selectinload

 from app.agents.new_chat.filesystem_selection import FilesystemMode
 from app.agents.new_chat.mention_resolver import resolve_mentions, substitute_in_text
@ -38,10 +34,6 @@ from app.db import (
    ChatVisibility,
    NewChatThread,
    Report,
-    SurfsenseDocsDocument,
-)
-from app.tasks.chat.streaming.context.mentioned_docs import (
-    format_mentioned_surfsense_docs_as_context,
 )
 from app.utils.content_utils import bootstrap_history_from_db
 from app.utils.user_message_multimodal import build_human_message_content
@ -55,13 +47,10 @@ class NewChatInputState:

    ``input_state`` is fed straight to the agent. ``accepted_folder_ids``
    feeds the runtime context (the resolver may have dropped some chips).
-    ``mentioned_surfsense_docs`` is consumed by the initial thinking-step
-    builder for the FE placeholder before the agent stream starts.
    """

    input_state: dict[str, Any]
    accepted_folder_ids: list[int]
-    mentioned_surfsense_docs: list[SurfsenseDocsDocument]


 async def build_new_chat_input_state(
@ -72,7 +61,6 @@ async def build_new_chat_input_state(
    user_query: str,
    user_image_data_urls: list[str] | None,
    mentioned_document_ids: list[int] | None,
-    mentioned_surfsense_doc_ids: list[int] | None,
    mentioned_folder_ids: list[int] | None,
    mentioned_documents: list[dict[str, Any]] | None,
    needs_history_bootstrap: bool,
@ -96,15 +84,6 @@ async def build_new_chat_input_state(
            thread.needs_history_bootstrap = False
            await session.commit()

-    mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
-    if mentioned_surfsense_doc_ids:
-        result = await session.execute(
-            select(SurfsenseDocsDocument)
-            .options(selectinload(SurfsenseDocsDocument.chunks))
-            .filter(SurfsenseDocsDocument.id.in_(mentioned_surfsense_doc_ids))
-        )
-        mentioned_surfsense_docs = list(result.scalars().all())
-
    # Top 3 reports keyed by id desc (newest first) with content present,
    # surfaced inline so the LLM resolves ``report_id`` for versioning without
    # digging through conversation history.
@ -125,14 +104,12 @@ async def build_new_chat_input_state(
        user_query=user_query,
        filesystem_mode=filesystem_mode,
        mentioned_document_ids=mentioned_document_ids,
-        mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
        mentioned_folder_ids=mentioned_folder_ids,
        mentioned_documents=mentioned_documents,
    )

    final_query = _render_query_with_context(
        agent_user_query=agent_user_query,
-        mentioned_surfsense_docs=mentioned_surfsense_docs,
        recent_reports=recent_reports,
    )

@ -154,7 +131,6 @@ async def build_new_chat_input_state(
    return NewChatInputState(
        input_state=input_state,
        accepted_folder_ids=accepted_folder_ids,
-        mentioned_surfsense_docs=mentioned_surfsense_docs,
    )


@ -165,7 +141,6 @@ async def _resolve_mentions_for_query(
    user_query: str,
    filesystem_mode: str,
    mentioned_document_ids: list[int] | None,
-    mentioned_surfsense_doc_ids: list[int] | None,
    mentioned_folder_ids: list[int] | None,
    mentioned_documents: list[dict[str, Any]] | None,
 ) -> tuple[str, list[int]]:
@ -187,10 +162,7 @@ async def _resolve_mentions_for_query(
    accepted_folder_ids: list[int] = []

    has_any_mention = bool(
-        mentioned_document_ids
-        or mentioned_surfsense_doc_ids
-        or mentioned_folder_ids
-        or mentioned_documents
+        mentioned_document_ids or mentioned_folder_ids or mentioned_documents
    )
    if filesystem_mode != FilesystemMode.CLOUD.value or not has_any_mention:
        return agent_user_query, accepted_folder_ids
@ -214,7 +186,6 @@ async def _resolve_mentions_for_query(
        search_space_id=search_space_id,
        mentioned_documents=chip_objs,
        mentioned_document_ids=mentioned_document_ids,
-        mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
        mentioned_folder_ids=mentioned_folder_ids,
    )
    agent_user_query = substitute_in_text(user_query, resolved.token_to_path)
@ -225,17 +196,11 @@ async def _resolve_mentions_for_query(
 def _render_query_with_context(
    *,
    agent_user_query: str,
-    mentioned_surfsense_docs: list[SurfsenseDocsDocument],
    recent_reports: list[Report],
 ) -> str:
-    """Prepend surfsense-docs + recent-reports XML blocks to the user query."""
+    """Prepend recent-reports XML block to the user query."""
    context_parts: list[str] = []

-    if mentioned_surfsense_docs:
-        context_parts.append(
-            format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
-        )
-
    if recent_reports:
        report_lines: list[str] = []
        for r in recent_reports:
--- a/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/orchestrator.py
+++ b/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/orchestrator.py
@ -123,7 +123,6 @@ async def stream_new_chat(
    user_id: str | None = None,
    llm_config_id: int = -1,
    mentioned_document_ids: list[int] | None = None,
-    mentioned_surfsense_doc_ids: list[int] | None = None,
    mentioned_folder_ids: list[int] | None = None,
    mentioned_documents: list[dict[str, Any]] | None = None,
    checkpoint_id: str | None = None,
@ -435,7 +434,6 @@ async def stream_new_chat(
            user_query=user_query,
            user_image_data_urls=user_image_data_urls,
            mentioned_document_ids=mentioned_document_ids,
-            mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
            mentioned_folder_ids=mentioned_folder_ids,
            mentioned_documents=mentioned_documents,
            needs_history_bootstrap=needs_history_bootstrap,
@ -447,7 +445,6 @@ async def stream_new_chat(
        )
        input_state = assembled.input_state
        accepted_folder_ids = assembled.accepted_folder_ids
-        mentioned_surfsense_docs = assembled.mentioned_surfsense_docs
        _perf_log.info(
            "[stream_new_chat] History bootstrap + doc/report queries in %.3fs",
            time.perf_counter() - _t0,
@ -560,7 +557,6 @@ async def stream_new_chat(
        initial_step = build_initial_thinking_step(
            user_query=user_query,
            user_image_data_urls=user_image_data_urls,
-            mentioned_surfsense_docs=mentioned_surfsense_docs,
        )
        for sse in iter_initial_thinking_step_frame(
            initial_step,
@ -575,7 +571,7 @@ async def stream_new_chat(
        # Drop the heavy ORM objects + the container that holds them so they
        # aren't retained for the entire streaming duration. ``input_state``
        # already carries the langchain_messages list independently.
-        del assembled, mentioned_surfsense_docs
+        del assembled

        title_task = spawn_title_task(
            chat_id=chat_id,
--- a/surfsense_backend/app/tasks/surfsense_docs_indexer.py
+++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py
@ -1,249 +0,0 @@
-"""
-Surfsense documentation indexer.
-Indexes MDX documentation files at startup.
-"""
-
-import hashlib
-import logging
-import re
-from datetime import UTC, datetime
-from pathlib import Path
-
-from sqlalchemy import delete as sa_delete, select
-from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import selectinload
-from sqlalchemy.orm.attributes import set_committed_value
-
-from app.config import config
-from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
-from app.utils.document_converters import embed_text
-
-logger = logging.getLogger(__name__)
-
-
-async def _safe_set_docs_chunks(
-    session: AsyncSession, document: SurfsenseDocsDocument, chunks: list
-) -> None:
-    """safe_set_chunks variant for the SurfsenseDocsDocument/Chunk models."""
-    if document.id is not None:
-        await session.execute(
-            sa_delete(SurfsenseDocsChunk).where(
-                SurfsenseDocsChunk.document_id == document.id
-            )
-        )
-        for chunk in chunks:
-            chunk.document_id = document.id
-
-    set_committed_value(document, "chunks", chunks)
-    session.add_all(chunks)
-
-
-# Path to docs relative to project root
-DOCS_DIR = (
-    Path(__file__).resolve().parent.parent.parent.parent
-    / "surfsense_web"
-    / "content"
-    / "docs"
-)
-
-
-def parse_mdx_frontmatter(content: str) -> tuple[str, str]:
-    """
-    Parse MDX file to extract frontmatter title and content.
-
-    Args:
-        content: Raw MDX file content
-
-    Returns:
-        Tuple of (title, content_without_frontmatter)
-    """
-    # Match frontmatter between --- markers
-    frontmatter_pattern = r"^---\s*\n(.*?)\n---\s*\n"
-    match = re.match(frontmatter_pattern, content, re.DOTALL)
-
-    if match:
-        frontmatter = match.group(1)
-        content_without_frontmatter = content[match.end() :]
-
-        # Extract title from frontmatter
-        title_match = re.search(r"^title:\s*(.+)$", frontmatter, re.MULTILINE)
-        title = title_match.group(1).strip() if title_match else "Untitled"
-
-        # Remove quotes if present
-        title = title.strip("\"'")
-
-        return title, content_without_frontmatter.strip()
-
-    return "Untitled", content.strip()
-
-
-def get_all_mdx_files() -> list[Path]:
-    """
-    Get all MDX files from the docs directory.
-
-    Returns:
-        List of Path objects for each MDX file
-    """
-    if not DOCS_DIR.exists():
-        logger.warning(f"Docs directory not found: {DOCS_DIR}")
-        return []
-
-    return list(DOCS_DIR.rglob("*.mdx"))
-
-
-def generate_surfsense_docs_content_hash(content: str) -> str:
-    """Generate SHA-256 hash for Surfsense docs content."""
-    return hashlib.sha256(content.encode("utf-8")).hexdigest()
-
-
-def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
-    """
-    Create chunks from Surfsense documentation content.
-
-    Args:
-        content: Document content to chunk
-
-    Returns:
-        List of SurfsenseDocsChunk objects with embeddings
-    """
-    return [
-        SurfsenseDocsChunk(
-            content=chunk.text,
-            embedding=embed_text(chunk.text),
-        )
-        for chunk in config.chunker_instance.chunk(content)
-    ]
-
-
-async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, int]:
-    """
-    Index all Surfsense documentation files.
-
-    Args:
-        session: SQLAlchemy async session
-
-    Returns:
-        Tuple of (created, updated, skipped, deleted) counts
-    """
-    created = 0
-    updated = 0
-    skipped = 0
-    deleted = 0
-
-    # Get all existing docs from database
-    existing_docs_result = await session.execute(
-        select(SurfsenseDocsDocument).options(
-            selectinload(SurfsenseDocsDocument.chunks)
-        )
-    )
-    existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()}
-
-    # Track which sources we've processed
-    processed_sources = set()
-
-    # Get all MDX files
-    mdx_files = get_all_mdx_files()
-    logger.info(f"Found {len(mdx_files)} MDX files to index")
-
-    for mdx_file in mdx_files:
-        try:
-            source = str(mdx_file.relative_to(DOCS_DIR))
-            processed_sources.add(source)
-
-            # Read file content
-            raw_content = mdx_file.read_text(encoding="utf-8")
-            title, content = parse_mdx_frontmatter(raw_content)
-            content_hash = generate_surfsense_docs_content_hash(raw_content)
-
-            if source in existing_docs:
-                existing_doc = existing_docs[source]
-
-                # Check if content changed
-                if existing_doc.content_hash == content_hash:
-                    logger.debug(f"Skipping unchanged: {source}")
-                    skipped += 1
-                    continue
-
-                # Content changed - update document
-                logger.info(f"Updating changed document: {source}")
-
-                # Create new chunks
-                chunks = create_surfsense_docs_chunks(content)
-
-                # Update document fields
-                existing_doc.title = title
-                existing_doc.content = content
-                existing_doc.content_hash = content_hash
-                existing_doc.embedding = embed_text(content)
-                await _safe_set_docs_chunks(session, existing_doc, chunks)
-                existing_doc.updated_at = datetime.now(UTC)
-
-                updated += 1
-            else:
-                # New document - create it
-                logger.info(f"Creating new document: {source}")
-
-                chunks = create_surfsense_docs_chunks(content)
-
-                document = SurfsenseDocsDocument(
-                    source=source,
-                    title=title,
-                    content=content,
-                    content_hash=content_hash,
-                    embedding=embed_text(content),
-                    chunks=chunks,
-                    updated_at=datetime.now(UTC),
-                )
-
-                session.add(document)
-                created += 1
-
-        except Exception as e:
-            logger.error(f"Error processing {mdx_file}: {e}", exc_info=True)
-            continue
-
-    # Delete documents for removed files
-    for source, doc in existing_docs.items():
-        if source not in processed_sources:
-            logger.info(f"Deleting removed document: {source}")
-            await session.delete(doc)
-            deleted += 1
-
-    # Commit all changes
-    await session.commit()
-
-    logger.info(
-        f"Indexing complete: {created} created, {updated} updated, "
-        f"{skipped} skipped, {deleted} deleted"
-    )
-
-    return created, updated, skipped, deleted
-
-
-async def seed_surfsense_docs() -> tuple[int, int, int, int]:
-    """
-    Seed Surfsense documentation into the database.
-
-    This function indexes all MDX files from the docs directory.
-    It handles creating, updating, and deleting docs based on content changes.
-
-    Returns:
-        Tuple of (created, updated, skipped, deleted) counts
-        Returns (0, 0, 0, 0) if an error occurs
-    """
-    logger.info("Starting Surfsense docs indexing...")
-
-    try:
-        async with async_session_maker() as session:
-            created, updated, skipped, deleted = await index_surfsense_docs(session)
-
-        logger.info(
-            f"Surfsense docs indexing complete: "
-            f"created={created}, updated={updated}, skipped={skipped}, deleted={deleted}"
-        )
-
-        return created, updated, skipped, deleted
-
-    except Exception as e:
-        logger.error(f"Failed to seed Surfsense docs: {e}", exc_info=True)
-        return 0, 0, 0, 0