refactor: remove search_surfsense_docs tool and related references

- Deleted the `search_surfsense_docs` tool and its associated files, streamlining the agent's toolset.
- Updated various components and prompts to remove references to the now-removed tool, ensuring consistency across the codebase.
- Adjusted documentation to direct users to the SurfSense documentation link for product-related queries instead.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-05-28 22:35:14 -07:00
parent 9b9e6828c7
commit 40ca9e6ed2
71 changed files with 232 additions and 1676 deletions

View file

@ -25,7 +25,6 @@ from uuid import UUID
import anyio
from langchain_core.messages import HumanMessage
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.agents.multi_agent_chat import create_multi_agent_chat_deep_agent
from app.agents.new_chat.chat_deepagent import create_surfsense_deep_agent
@ -55,7 +54,6 @@ from app.db import (
NewChatThread,
Report,
SearchSourceConnectorType,
SurfsenseDocsDocument,
async_session_maker,
shielded_async_session,
)
@ -77,7 +75,6 @@ from app.tasks.chat.streaming.helpers.interrupt_inspector import (
)
from app.utils.content_utils import bootstrap_history_from_db
from app.utils.perf import get_perf_logger, log_system_snapshot, trim_native_heap
from app.utils.surfsense_docs import surfsense_docs_public_url
from app.utils.user_message_multimodal import build_human_message_content
_background_tasks: set[asyncio.Task] = set()
@ -198,58 +195,6 @@ def _extract_chunk_parts(chunk: Any) -> dict[str, Any]:
return out
def format_mentioned_surfsense_docs_as_context(
documents: list[SurfsenseDocsDocument],
) -> str:
"""Format mentioned SurfSense documentation as context for the agent."""
if not documents:
return ""
context_parts = ["<mentioned_surfsense_docs>"]
context_parts.append(
"The user has explicitly mentioned the following SurfSense documentation pages. "
"These are official documentation about how to use SurfSense and should be used to answer questions about the application. "
"Use [citation:CHUNK_ID] format for citations (e.g., [citation:doc-123])."
)
for doc in documents:
public_url = surfsense_docs_public_url(doc.source)
metadata_json = json.dumps(
{"source": doc.source, "public_url": public_url}, ensure_ascii=False
)
context_parts.append("<document>")
context_parts.append("<document_metadata>")
context_parts.append(f" <document_id>doc-{doc.id}</document_id>")
context_parts.append(" <document_type>SURFSENSE_DOCS</document_type>")
context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
context_parts.append(f" <url><![CDATA[{public_url}]]></url>")
context_parts.append(
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
)
context_parts.append("</document_metadata>")
context_parts.append("")
context_parts.append("<document_content>")
if hasattr(doc, "chunks") and doc.chunks:
for chunk in doc.chunks:
context_parts.append(
f" <chunk id='doc-{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
)
else:
context_parts.append(
f" <chunk id='doc-0'><![CDATA[{doc.content}]]></chunk>"
)
context_parts.append("</document_content>")
context_parts.append("</document>")
context_parts.append("")
context_parts.append("</mentioned_surfsense_docs>")
return "\n".join(context_parts)
def extract_todos_from_deepagents(command_output) -> dict:
"""
Extract todos from deepagents' TodoListMiddleware Command output.
@ -837,7 +782,6 @@ async def stream_new_chat(
user_id: str | None = None,
llm_config_id: int = -1,
mentioned_document_ids: list[int] | None = None,
mentioned_surfsense_doc_ids: list[int] | None = None,
mentioned_folder_ids: list[int] | None = None,
mentioned_connector_ids: list[int] | None = None,
mentioned_connectors: list[dict[str, Any]] | None = None,
@ -869,7 +813,6 @@ async def stream_new_chat(
llm_config_id: The LLM configuration ID (default: -1 for first global config)
needs_history_bootstrap: If True, load message history from DB (for cloned chats)
mentioned_document_ids: Optional list of document IDs mentioned with @ in the chat
mentioned_surfsense_doc_ids: Optional list of SurfSense doc IDs mentioned with @ in the chat
mentioned_folder_ids: Optional list of knowledge-base folder IDs mentioned with @ (cloud mode)
checkpoint_id: Optional checkpoint ID to rewind/fork from (for edit/reload operations)
@ -1295,19 +1238,7 @@ async def stream_new_chat(
# Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware
# which merges them into the scoped filesystem with full document
# structure. Only SurfSense docs and report context are inlined here.
# Fetch mentioned SurfSense docs if any
mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
if mentioned_surfsense_doc_ids:
result = await session.execute(
select(SurfsenseDocsDocument)
.options(selectinload(SurfsenseDocsDocument.chunks))
.filter(
SurfsenseDocsDocument.id.in_(mentioned_surfsense_doc_ids),
)
)
mentioned_surfsense_docs = list(result.scalars().all())
# structure. Only report context is inlined here.
# Fetch the most recent report(s) in this thread so the LLM can
# easily find report_id for versioning decisions, instead of
@ -1341,10 +1272,7 @@ async def stream_new_chat(
agent_user_query = user_query
accepted_folder_ids: list[int] = []
if fs_mode == FilesystemMode.CLOUD.value and (
mentioned_document_ids
or mentioned_surfsense_doc_ids
or mentioned_folder_ids
or mentioned_documents
mentioned_document_ids or mentioned_folder_ids or mentioned_documents
):
from app.schemas.new_chat import (
MentionedDocumentInfo as _MentionedDocumentInfo,
@ -1370,23 +1298,17 @@ async def stream_new_chat(
search_space_id=search_space_id,
mentioned_documents=chip_objs,
mentioned_document_ids=mentioned_document_ids,
mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
mentioned_folder_ids=mentioned_folder_ids,
)
agent_user_query = substitute_in_text(user_query, resolved.token_to_path)
accepted_folder_ids = resolved.mentioned_folder_ids
# Format the user query with context (SurfSense docs + reports only).
# Format the user query with context (reports only).
# Uses ``agent_user_query`` so the LLM sees backtick-wrapped paths
# instead of bare ``@title`` tokens.
final_query = agent_user_query
context_parts = []
if mentioned_surfsense_docs:
context_parts.append(
format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
)
if mentioned_connectors:
connector_lines = []
for connector in mentioned_connectors:
@ -1617,12 +1539,8 @@ async def stream_new_chat(
stream_result.content_builder = AssistantContentBuilder()
# Initial thinking step - analyzing the request
if mentioned_surfsense_docs:
initial_title = "Analyzing referenced content"
action_verb = "Analyzing"
else:
initial_title = "Understanding your request"
action_verb = "Processing"
initial_title = "Understanding your request"
action_verb = "Processing"
processing_parts = []
if user_query.strip():
@ -1633,18 +1551,6 @@ async def stream_new_chat(
else:
processing_parts.append("(message)")
if mentioned_surfsense_docs:
doc_names = []
for doc in mentioned_surfsense_docs:
title = doc.title
if len(title) > 30:
title = title[:27] + "..."
doc_names.append(title)
if len(doc_names) == 1:
processing_parts.append(f"[{doc_names[0]}]")
else:
processing_parts.append(f"[{len(doc_names)} docs]")
initial_items = [f"{action_verb}: {' '.join(processing_parts)}"]
initial_step_id = "thinking-1"
@ -1664,10 +1570,10 @@ async def stream_new_chat(
items=initial_items,
)
# These ORM objects (with eagerly-loaded chunks) can be very large.
# They're only needed to build context strings already copied into
# final_query / langchain_messages — release them before streaming.
del mentioned_surfsense_docs, recent_reports
# These ORM objects can be large. They're only needed to build context
# strings already copied into final_query / langchain_messages —
# release them before streaming.
del recent_reports
del langchain_messages, final_query
# Check if this is the first assistant response so we can generate

View file

@ -1,15 +1,11 @@
"""Pre-agent context shaping: mentioned-doc rendering and todos extraction."""
"""Pre-agent context shaping: todos extraction."""
from __future__ import annotations
from app.tasks.chat.streaming.context.deepagents_todos import (
extract_todos_from_deepagents,
)
from app.tasks.chat.streaming.context.mentioned_docs import (
format_mentioned_surfsense_docs_as_context,
)
__all__ = [
"extract_todos_from_deepagents",
"format_mentioned_surfsense_docs_as_context",
]

View file

@ -1,58 +0,0 @@
"""Render user-mentioned SurfSense docs as XML context for the agent."""
from __future__ import annotations
import json
from app.db import SurfsenseDocsDocument
from app.utils.surfsense_docs import surfsense_docs_public_url
def format_mentioned_surfsense_docs_as_context(
documents: list[SurfsenseDocsDocument],
) -> str:
if not documents:
return ""
context_parts = ["<mentioned_surfsense_docs>"]
context_parts.append(
"The user has explicitly mentioned the following SurfSense documentation pages. "
"These are official documentation about how to use SurfSense and should be used to answer questions about the application. "
"Use [citation:CHUNK_ID] format for citations (e.g., [citation:doc-123])."
)
for doc in documents:
public_url = surfsense_docs_public_url(doc.source)
metadata_json = json.dumps(
{"source": doc.source, "public_url": public_url}, ensure_ascii=False
)
context_parts.append("<document>")
context_parts.append("<document_metadata>")
context_parts.append(f" <document_id>doc-{doc.id}</document_id>")
context_parts.append(" <document_type>SURFSENSE_DOCS</document_type>")
context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
context_parts.append(f" <url><![CDATA[{public_url}]]></url>")
context_parts.append(
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
)
context_parts.append("</document_metadata>")
context_parts.append("")
context_parts.append("<document_content>")
if hasattr(doc, "chunks") and doc.chunks:
for chunk in doc.chunks:
context_parts.append(
f" <chunk id='doc-{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
)
else:
context_parts.append(
f" <chunk id='doc-0'><![CDATA[{doc.content}]]></chunk>"
)
context_parts.append("</document_content>")
context_parts.append("</document>")
context_parts.append("")
context_parts.append("</mentioned_surfsense_docs>")
return "\n".join(context_parts)

View file

@ -1,8 +1,8 @@
"""Build and emit the first ``thinking-1`` step for a new-chat turn.
The step title and "Processing X" items are derived from what the user sent
(text snippet, image count, mentioned doc titles) so the FE can render a
meaningful placeholder while the agent stream warms up.
(text snippet, image count) so the FE can render a meaningful placeholder
while the agent stream warms up.
``thinking-1`` is the canonical id for this step every subsequent
``thinking-N`` produced by ``stream_agent_events`` folds into the same
@ -15,7 +15,6 @@ from collections.abc import Iterator
from dataclasses import dataclass
from typing import Any
from app.db import SurfsenseDocsDocument
from app.services.new_streaming_service import VercelStreamingService
@ -37,14 +36,9 @@ def build_initial_thinking_step(
*,
user_query: str,
user_image_data_urls: list[str] | None,
mentioned_surfsense_docs: list[SurfsenseDocsDocument],
) -> InitialThinkingStep:
if mentioned_surfsense_docs:
title = "Analyzing referenced content"
action_verb = "Analyzing"
else:
title = "Understanding your request"
action_verb = "Processing"
title = "Understanding your request"
action_verb = "Processing"
processing_parts: list[str] = []
if user_query.strip():
@ -55,18 +49,6 @@ def build_initial_thinking_step(
else:
processing_parts.append("(message)")
if mentioned_surfsense_docs:
doc_names: list[str] = []
for doc in mentioned_surfsense_docs:
t = doc.title
if len(t) > 30:
t = t[:27] + "..."
doc_names.append(t)
if len(doc_names) == 1:
processing_parts.append(f"[{doc_names[0]}]")
else:
processing_parts.append(f"[{len(doc_names)} docs]")
items = [f"{action_verb}: {' '.join(processing_parts)}"]
return InitialThinkingStep(step_id="thinking-1", title=title, items=items)

View file

@ -5,20 +5,17 @@ Pipeline:
1. **History bootstrap** only for cloned chats with no LangGraph checkpoint
yet; flips the per-thread ``needs_history_bootstrap`` flag back to False
once the rows are loaded.
2. **Mentioned SurfSense docs** eager-load chunks so the formatter has the
full content without a second roundtrip.
3. **Recent reports** top 3 by id desc with non-null content, so the LLM
2. **Recent reports** top 3 by id desc with non-null content, so the LLM
can resolve ``report_id`` for versioning without spelunking history.
4. **@-mention resolve** (cloud mode) substitute ``@title`` tokens in the
3. **@-mention resolve** (cloud mode) substitute ``@title`` tokens in the
query with canonical ``\`/documents/...\``` paths the LLM expects.
5. **Context block render** XML-wrap surfsense docs + reports, prepend to
the rewritten query, optionally prefix with display name for SEARCH_SPACE
4. **Context block render** XML-wrap recent reports, prepend to the
rewritten query, optionally prefix with display name for SEARCH_SPACE
visibility.
6. **HumanMessage** multimodal content if images are attached.
5. **HumanMessage** multimodal content if images are attached.
Returns the assembled ``input_state`` dict plus side-channel data the
orchestrator needs downstream (``accepted_folder_ids`` for runtime context;
``mentioned_surfsense_docs`` for the initial thinking step).
orchestrator needs downstream (``accepted_folder_ids`` for runtime context).
"""
from __future__ import annotations
@ -30,7 +27,6 @@ from typing import Any
from langchain_core.messages import HumanMessage
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.agents.new_chat.filesystem_selection import FilesystemMode
from app.agents.new_chat.mention_resolver import resolve_mentions, substitute_in_text
@ -38,10 +34,6 @@ from app.db import (
ChatVisibility,
NewChatThread,
Report,
SurfsenseDocsDocument,
)
from app.tasks.chat.streaming.context.mentioned_docs import (
format_mentioned_surfsense_docs_as_context,
)
from app.utils.content_utils import bootstrap_history_from_db
from app.utils.user_message_multimodal import build_human_message_content
@ -55,13 +47,10 @@ class NewChatInputState:
``input_state`` is fed straight to the agent. ``accepted_folder_ids``
feeds the runtime context (the resolver may have dropped some chips).
``mentioned_surfsense_docs`` is consumed by the initial thinking-step
builder for the FE placeholder before the agent stream starts.
"""
input_state: dict[str, Any]
accepted_folder_ids: list[int]
mentioned_surfsense_docs: list[SurfsenseDocsDocument]
async def build_new_chat_input_state(
@ -72,7 +61,6 @@ async def build_new_chat_input_state(
user_query: str,
user_image_data_urls: list[str] | None,
mentioned_document_ids: list[int] | None,
mentioned_surfsense_doc_ids: list[int] | None,
mentioned_folder_ids: list[int] | None,
mentioned_documents: list[dict[str, Any]] | None,
needs_history_bootstrap: bool,
@ -96,15 +84,6 @@ async def build_new_chat_input_state(
thread.needs_history_bootstrap = False
await session.commit()
mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
if mentioned_surfsense_doc_ids:
result = await session.execute(
select(SurfsenseDocsDocument)
.options(selectinload(SurfsenseDocsDocument.chunks))
.filter(SurfsenseDocsDocument.id.in_(mentioned_surfsense_doc_ids))
)
mentioned_surfsense_docs = list(result.scalars().all())
# Top 3 reports keyed by id desc (newest first) with content present,
# surfaced inline so the LLM resolves ``report_id`` for versioning without
# digging through conversation history.
@ -125,14 +104,12 @@ async def build_new_chat_input_state(
user_query=user_query,
filesystem_mode=filesystem_mode,
mentioned_document_ids=mentioned_document_ids,
mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
mentioned_folder_ids=mentioned_folder_ids,
mentioned_documents=mentioned_documents,
)
final_query = _render_query_with_context(
agent_user_query=agent_user_query,
mentioned_surfsense_docs=mentioned_surfsense_docs,
recent_reports=recent_reports,
)
@ -154,7 +131,6 @@ async def build_new_chat_input_state(
return NewChatInputState(
input_state=input_state,
accepted_folder_ids=accepted_folder_ids,
mentioned_surfsense_docs=mentioned_surfsense_docs,
)
@ -165,7 +141,6 @@ async def _resolve_mentions_for_query(
user_query: str,
filesystem_mode: str,
mentioned_document_ids: list[int] | None,
mentioned_surfsense_doc_ids: list[int] | None,
mentioned_folder_ids: list[int] | None,
mentioned_documents: list[dict[str, Any]] | None,
) -> tuple[str, list[int]]:
@ -187,10 +162,7 @@ async def _resolve_mentions_for_query(
accepted_folder_ids: list[int] = []
has_any_mention = bool(
mentioned_document_ids
or mentioned_surfsense_doc_ids
or mentioned_folder_ids
or mentioned_documents
mentioned_document_ids or mentioned_folder_ids or mentioned_documents
)
if filesystem_mode != FilesystemMode.CLOUD.value or not has_any_mention:
return agent_user_query, accepted_folder_ids
@ -214,7 +186,6 @@ async def _resolve_mentions_for_query(
search_space_id=search_space_id,
mentioned_documents=chip_objs,
mentioned_document_ids=mentioned_document_ids,
mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
mentioned_folder_ids=mentioned_folder_ids,
)
agent_user_query = substitute_in_text(user_query, resolved.token_to_path)
@ -225,17 +196,11 @@ async def _resolve_mentions_for_query(
def _render_query_with_context(
*,
agent_user_query: str,
mentioned_surfsense_docs: list[SurfsenseDocsDocument],
recent_reports: list[Report],
) -> str:
"""Prepend surfsense-docs + recent-reports XML blocks to the user query."""
"""Prepend recent-reports XML block to the user query."""
context_parts: list[str] = []
if mentioned_surfsense_docs:
context_parts.append(
format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
)
if recent_reports:
report_lines: list[str] = []
for r in recent_reports:

View file

@ -123,7 +123,6 @@ async def stream_new_chat(
user_id: str | None = None,
llm_config_id: int = -1,
mentioned_document_ids: list[int] | None = None,
mentioned_surfsense_doc_ids: list[int] | None = None,
mentioned_folder_ids: list[int] | None = None,
mentioned_documents: list[dict[str, Any]] | None = None,
checkpoint_id: str | None = None,
@ -435,7 +434,6 @@ async def stream_new_chat(
user_query=user_query,
user_image_data_urls=user_image_data_urls,
mentioned_document_ids=mentioned_document_ids,
mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
mentioned_folder_ids=mentioned_folder_ids,
mentioned_documents=mentioned_documents,
needs_history_bootstrap=needs_history_bootstrap,
@ -447,7 +445,6 @@ async def stream_new_chat(
)
input_state = assembled.input_state
accepted_folder_ids = assembled.accepted_folder_ids
mentioned_surfsense_docs = assembled.mentioned_surfsense_docs
_perf_log.info(
"[stream_new_chat] History bootstrap + doc/report queries in %.3fs",
time.perf_counter() - _t0,
@ -560,7 +557,6 @@ async def stream_new_chat(
initial_step = build_initial_thinking_step(
user_query=user_query,
user_image_data_urls=user_image_data_urls,
mentioned_surfsense_docs=mentioned_surfsense_docs,
)
for sse in iter_initial_thinking_step_frame(
initial_step,
@ -575,7 +571,7 @@ async def stream_new_chat(
# Drop the heavy ORM objects + the container that holds them so they
# aren't retained for the entire streaming duration. ``input_state``
# already carries the langchain_messages list independently.
del assembled, mentioned_surfsense_docs
del assembled
title_task = spawn_title_task(
chat_id=chat_id,

View file

@ -1,249 +0,0 @@
"""
Surfsense documentation indexer.
Indexes MDX documentation files at startup.
"""
import hashlib
import logging
import re
from datetime import UTC, datetime
from pathlib import Path
from sqlalchemy import delete as sa_delete, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from sqlalchemy.orm.attributes import set_committed_value
from app.config import config
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
from app.utils.document_converters import embed_text
logger = logging.getLogger(__name__)
async def _safe_set_docs_chunks(
session: AsyncSession, document: SurfsenseDocsDocument, chunks: list
) -> None:
"""safe_set_chunks variant for the SurfsenseDocsDocument/Chunk models."""
if document.id is not None:
await session.execute(
sa_delete(SurfsenseDocsChunk).where(
SurfsenseDocsChunk.document_id == document.id
)
)
for chunk in chunks:
chunk.document_id = document.id
set_committed_value(document, "chunks", chunks)
session.add_all(chunks)
# Path to docs relative to project root
DOCS_DIR = (
Path(__file__).resolve().parent.parent.parent.parent
/ "surfsense_web"
/ "content"
/ "docs"
)
def parse_mdx_frontmatter(content: str) -> tuple[str, str]:
"""
Parse MDX file to extract frontmatter title and content.
Args:
content: Raw MDX file content
Returns:
Tuple of (title, content_without_frontmatter)
"""
# Match frontmatter between --- markers
frontmatter_pattern = r"^---\s*\n(.*?)\n---\s*\n"
match = re.match(frontmatter_pattern, content, re.DOTALL)
if match:
frontmatter = match.group(1)
content_without_frontmatter = content[match.end() :]
# Extract title from frontmatter
title_match = re.search(r"^title:\s*(.+)$", frontmatter, re.MULTILINE)
title = title_match.group(1).strip() if title_match else "Untitled"
# Remove quotes if present
title = title.strip("\"'")
return title, content_without_frontmatter.strip()
return "Untitled", content.strip()
def get_all_mdx_files() -> list[Path]:
"""
Get all MDX files from the docs directory.
Returns:
List of Path objects for each MDX file
"""
if not DOCS_DIR.exists():
logger.warning(f"Docs directory not found: {DOCS_DIR}")
return []
return list(DOCS_DIR.rglob("*.mdx"))
def generate_surfsense_docs_content_hash(content: str) -> str:
"""Generate SHA-256 hash for Surfsense docs content."""
return hashlib.sha256(content.encode("utf-8")).hexdigest()
def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
"""
Create chunks from Surfsense documentation content.
Args:
content: Document content to chunk
Returns:
List of SurfsenseDocsChunk objects with embeddings
"""
return [
SurfsenseDocsChunk(
content=chunk.text,
embedding=embed_text(chunk.text),
)
for chunk in config.chunker_instance.chunk(content)
]
async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, int]:
"""
Index all Surfsense documentation files.
Args:
session: SQLAlchemy async session
Returns:
Tuple of (created, updated, skipped, deleted) counts
"""
created = 0
updated = 0
skipped = 0
deleted = 0
# Get all existing docs from database
existing_docs_result = await session.execute(
select(SurfsenseDocsDocument).options(
selectinload(SurfsenseDocsDocument.chunks)
)
)
existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()}
# Track which sources we've processed
processed_sources = set()
# Get all MDX files
mdx_files = get_all_mdx_files()
logger.info(f"Found {len(mdx_files)} MDX files to index")
for mdx_file in mdx_files:
try:
source = str(mdx_file.relative_to(DOCS_DIR))
processed_sources.add(source)
# Read file content
raw_content = mdx_file.read_text(encoding="utf-8")
title, content = parse_mdx_frontmatter(raw_content)
content_hash = generate_surfsense_docs_content_hash(raw_content)
if source in existing_docs:
existing_doc = existing_docs[source]
# Check if content changed
if existing_doc.content_hash == content_hash:
logger.debug(f"Skipping unchanged: {source}")
skipped += 1
continue
# Content changed - update document
logger.info(f"Updating changed document: {source}")
# Create new chunks
chunks = create_surfsense_docs_chunks(content)
# Update document fields
existing_doc.title = title
existing_doc.content = content
existing_doc.content_hash = content_hash
existing_doc.embedding = embed_text(content)
await _safe_set_docs_chunks(session, existing_doc, chunks)
existing_doc.updated_at = datetime.now(UTC)
updated += 1
else:
# New document - create it
logger.info(f"Creating new document: {source}")
chunks = create_surfsense_docs_chunks(content)
document = SurfsenseDocsDocument(
source=source,
title=title,
content=content,
content_hash=content_hash,
embedding=embed_text(content),
chunks=chunks,
updated_at=datetime.now(UTC),
)
session.add(document)
created += 1
except Exception as e:
logger.error(f"Error processing {mdx_file}: {e}", exc_info=True)
continue
# Delete documents for removed files
for source, doc in existing_docs.items():
if source not in processed_sources:
logger.info(f"Deleting removed document: {source}")
await session.delete(doc)
deleted += 1
# Commit all changes
await session.commit()
logger.info(
f"Indexing complete: {created} created, {updated} updated, "
f"{skipped} skipped, {deleted} deleted"
)
return created, updated, skipped, deleted
async def seed_surfsense_docs() -> tuple[int, int, int, int]:
"""
Seed Surfsense documentation into the database.
This function indexes all MDX files from the docs directory.
It handles creating, updating, and deleting docs based on content changes.
Returns:
Tuple of (created, updated, skipped, deleted) counts
Returns (0, 0, 0, 0) if an error occurs
"""
logger.info("Starting Surfsense docs indexing...")
try:
async with async_session_maker() as session:
created, updated, skipped, deleted = await index_surfsense_docs(session)
logger.info(
f"Surfsense docs indexing complete: "
f"created={created}, updated={updated}, skipped={skipped}, deleted={deleted}"
)
return created, updated, skipped, deleted
except Exception as e:
logger.error(f"Failed to seed Surfsense docs: {e}", exc_info=True)
return 0, 0, 0, 0