refactor: remove search_surfsense_docs tool and related references

- Deleted the `search_surfsense_docs` tool and its associated files, streamlining the agent's toolset.
- Updated various components and prompts to remove references to the now-removed tool, ensuring consistency across the codebase.
- Adjusted documentation to direct users to the SurfSense documentation link for product-related queries instead.
2026-05-31 19:45:15 +02:00 · 2026-05-28 22:35:14 -07:00 · 2026-05-28 22:35:14 -07:00 · 40ca9e6ed2
commit 40ca9e6ed2
parent 9b9e6828c7
71 changed files with 232 additions and 1676 deletions
--- a/surfsense_backend/app/agents/new_chat/tools/__init__.py
+++ b/surfsense_backend/app/agents/new_chat/tools/__init__.py
@@ -5,7 +5,6 @@ This module contains all the tools available to the SurfSense agent.
 To add a new tool, see the documentation in registry.py.

 Available tools:
- search_surfsense_docs: Search Surfsense documentation for usage help
 - generate_podcast: Generate audio podcasts from content
 - generate_video_presentation: Generate video presentations with slides and narration
 - generate_image: Generate images from text descriptions using AI models
@@ -31,7 +30,6 @@ from .registry import (
    get_tool_by_name,
 )
 from .scrape_webpage import create_scrape_webpage_tool
-from .search_surfsense_docs import create_search_surfsense_docs_tool
 from .update_memory import create_update_memory_tool, create_update_team_memory_tool
 from .video_presentation import create_generate_video_presentation_tool

@@ -47,7 +45,6 @@ __all__ = [
    "create_generate_podcast_tool",
    "create_generate_video_presentation_tool",
    "create_scrape_webpage_tool",
-    "create_search_surfsense_docs_tool",
    "create_update_memory_tool",
    "create_update_team_memory_tool",
    "format_documents_for_context",
--- a/surfsense_backend/app/agents/new_chat/tools/registry.py
+++ b/surfsense_backend/app/agents/new_chat/tools/registry.py
@@ -101,7 +101,6 @@ from .podcast import create_generate_podcast_tool
 from .report import create_generate_report_tool
 from .resume import create_generate_resume_tool
 from .scrape_webpage import create_scrape_webpage_tool
-from .search_surfsense_docs import create_search_surfsense_docs_tool
 from .teams import (
    create_list_teams_channels_tool,
    create_read_teams_messages_tool,
@@ -258,15 +257,6 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
        ),
        requires=[],
    ),
-    # Surfsense documentation search tool
-    ToolDefinition(
-        name="search_surfsense_docs",
-        description="Search Surfsense documentation for help with using the application",
-        factory=lambda deps: create_search_surfsense_docs_tool(
-            db_session=deps["db_session"],
-        ),
-        requires=["db_session"],
-    ),
    # =========================================================================
    # SERVICE ACCOUNT DISCOVERY
    # Generic tool for the LLM to discover connected accounts and resolve
--- a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
+++ b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py
@@ -1,174 +0,0 @@
-"""
-Surfsense documentation search tool.
-
-This tool allows the agent to search the pre-indexed Surfsense documentation
-to help users with questions about how to use the application.
-
-The documentation is indexed at deployment time from MDX files and stored
-in dedicated tables (surfsense_docs_documents, surfsense_docs_chunks).
-"""
-
-import asyncio
-import json
-
-from langchain_core.tools import tool
-from sqlalchemy import select
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
-from app.utils.document_converters import embed_text
-from app.utils.surfsense_docs import surfsense_docs_public_url
-
-
-def format_surfsense_docs_results(results: list[tuple]) -> str:
-    """
-    Format search results into XML structure for the LLM context.
-
-    Uses the same XML structure as format_documents_for_context from knowledge_base.py
-    but with 'doc-' prefix on chunk IDs. This allows:
-    - LLM to use consistent [citation:doc-XXX] format
-    - Frontend to detect 'doc-' prefix and route to surfsense docs endpoint
-
-    Args:
-        results: List of (chunk, document) tuples from the database query
-
-    Returns:
-        Formatted XML string with documentation content and citation-ready chunks
-    """
-    if not results:
-        return "No relevant Surfsense documentation found for your query."
-
-    # Group chunks by document
-    grouped: dict[int, dict] = {}
-    for chunk, doc in results:
-        public_url = surfsense_docs_public_url(doc.source)
-        if doc.id not in grouped:
-            grouped[doc.id] = {
-                "document_id": f"doc-{doc.id}",
-                "document_type": "SURFSENSE_DOCS",
-                "title": doc.title,
-                "url": public_url,
-                "metadata": {"source": doc.source, "public_url": public_url},
-                "chunks": [],
-            }
-        grouped[doc.id]["chunks"].append(
-            {
-                "chunk_id": f"doc-{chunk.id}",
-                "content": chunk.content,
-            }
-        )
-
-    # Render XML matching format_documents_for_context structure
-    parts: list[str] = []
-    for g in grouped.values():
-        metadata_json = json.dumps(g["metadata"], ensure_ascii=False)
-
-        parts.append("<document>")
-        parts.append("<document_metadata>")
-        parts.append(f"  <document_id>{g['document_id']}</document_id>")
-        parts.append(f"  <document_type>{g['document_type']}</document_type>")
-        parts.append(f"  <title><![CDATA[{g['title']}]]></title>")
-        parts.append(f"  <url><![CDATA[{g['url']}]]></url>")
-        parts.append(f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>")
-        parts.append("</document_metadata>")
-        parts.append("")
-        parts.append("<document_content>")
-
-        for ch in g["chunks"]:
-            parts.append(
-                f"  <chunk id='{ch['chunk_id']}'><![CDATA[{ch['content']}]]></chunk>"
-            )
-
-        parts.append("</document_content>")
-        parts.append("</document>")
-        parts.append("")
-
-    return "\n".join(parts).strip()
-
-
-async def search_surfsense_docs_async(
-    query: str,
-    db_session: AsyncSession,
-    top_k: int = 10,
-) -> str:
-    """
-    Search Surfsense documentation using vector similarity.
-
-    Args:
-        query: The search query about Surfsense usage
-        db_session: Database session for executing queries
-        top_k: Number of results to return
-
-    Returns:
-        Formatted string with relevant documentation content
-    """
-    # Get embedding for the query
-    query_embedding = await asyncio.to_thread(embed_text, query)
-
-    # Vector similarity search on chunks, joining with documents
-    stmt = (
-        select(SurfsenseDocsChunk, SurfsenseDocsDocument)
-        .join(
-            SurfsenseDocsDocument,
-            SurfsenseDocsChunk.document_id == SurfsenseDocsDocument.id,
-        )
-        .order_by(SurfsenseDocsChunk.embedding.op("<=>")(query_embedding))
-        .limit(top_k)
-    )
-
-    result = await db_session.execute(stmt)
-    rows = result.all()
-
-    return format_surfsense_docs_results(rows)
-
-
-def create_search_surfsense_docs_tool(db_session: AsyncSession):
-    """
-    Factory function to create the search_surfsense_docs tool.
-
-    The tool acquires its own short-lived ``AsyncSession`` per call via
-    :data:`async_session_maker` so the closure is safe to share across
-    HTTP requests by the compiled-agent cache. Capturing a per-request
-    session here would surface stale/closed sessions on cache hits.
-
-    Args:
-        db_session: Reserved for registry compatibility. Per-call sessions
-            are opened via :data:`async_session_maker` inside the tool body.
-
-    Returns:
-        A configured tool function for searching Surfsense documentation
-    """
-    del db_session  # per-call session — see docstring
-
-    @tool
-    async def search_surfsense_docs(query: str, top_k: int = 10) -> str:
-        """
-        Search Surfsense documentation for help with using the application.
-
-        Use this tool when the user asks questions about:
-        - How to use Surfsense features
-        - Installation and setup instructions
-        - Configuration options and settings
-        - Troubleshooting common issues
-        - Available connectors and integrations
-        - Browser extension usage
-        - API documentation
-
-        This searches the official Surfsense documentation that was indexed
-        at deployment time. It does NOT search the user's personal knowledge base.
-
-        Args:
-            query: The search query about Surfsense usage or features
-            top_k: Number of documentation chunks to retrieve (default: 10)
-
-        Returns:
-            Relevant documentation content formatted with chunk IDs for citations
-        """
-        async with async_session_maker() as db_session:
-            return await search_surfsense_docs_async(
-                query=query,
-                db_session=db_session,
-                top_k=top_k,
-            )
-
-    return search_surfsense_docs