refactor: remove search_surfsense_docs tool and related references

- Deleted the `search_surfsense_docs` tool and its associated files, streamlining the agent's toolset.
- Updated various components and prompts to remove references to the now-removed tool, ensuring consistency across the codebase.
- Adjusted documentation to direct users to the SurfSense documentation link for product-related queries instead.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-05-28 22:35:14 -07:00
parent 9b9e6828c7
commit 40ca9e6ed2
71 changed files with 232 additions and 1676 deletions

View file

@ -0,0 +1,129 @@
"""Drop Surfsense docs tables (feature removed end to end)
Revision ID: 146
Revises: 145
Create Date: 2026-05-28
Removes the SurfSense product-documentation feature: the
``surfsense_docs_documents`` and ``surfsense_docs_chunks`` tables (created
in revision 60) and the GIN trigram index on the title column (added in
revision 67). The docs were seeded at startup from local MDX files, so no
user data is lost. Downgrade recreates the tables and indexes.
"""
from collections.abc import Sequence
from alembic import op
from app.config import config
# revision identifiers, used by Alembic.
revision: str = "146"
down_revision: str | None = "145"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Embedding dimension is required to recreate the vector columns on downgrade.
EMBEDDING_DIM = config.embedding_model_instance.dimension
def upgrade() -> None:
"""Drop surfsense docs tables and all their indexes."""
# Trigram index from revision 67
op.execute("DROP INDEX IF EXISTS idx_surfsense_docs_title_trgm")
# Full-text search indexes
op.execute("DROP INDEX IF EXISTS surfsense_docs_chunks_search_index")
op.execute("DROP INDEX IF EXISTS surfsense_docs_documents_search_index")
# Vector indexes
op.execute("DROP INDEX IF EXISTS surfsense_docs_chunks_vector_index")
op.execute("DROP INDEX IF EXISTS surfsense_docs_documents_vector_index")
# B-tree indexes
op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_chunks_document_id")
op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_updated_at")
op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_content_hash")
op.execute("DROP INDEX IF EXISTS ix_surfsense_docs_documents_source")
# Tables (chunks first due to FK)
op.execute("DROP TABLE IF EXISTS surfsense_docs_chunks")
op.execute("DROP TABLE IF EXISTS surfsense_docs_documents")
def downgrade() -> None:
"""Recreate surfsense docs tables and indexes (reverses revisions 60 + 67)."""
op.execute(
f"""
CREATE TABLE IF NOT EXISTS surfsense_docs_documents (
id SERIAL PRIMARY KEY,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
source VARCHAR NOT NULL UNIQUE,
title VARCHAR NOT NULL,
content TEXT NOT NULL,
content_hash VARCHAR NOT NULL,
embedding vector({EMBEDDING_DIM}),
updated_at TIMESTAMP WITH TIME ZONE
);
"""
)
op.execute(
f"""
CREATE TABLE IF NOT EXISTS surfsense_docs_chunks (
id SERIAL PRIMARY KEY,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
content TEXT NOT NULL,
embedding vector({EMBEDDING_DIM}),
document_id INTEGER NOT NULL REFERENCES surfsense_docs_documents(id) ON DELETE CASCADE
);
"""
)
# B-tree indexes
op.execute(
"CREATE INDEX IF NOT EXISTS ix_surfsense_docs_documents_source ON surfsense_docs_documents(source)"
)
op.execute(
"CREATE INDEX IF NOT EXISTS ix_surfsense_docs_documents_content_hash ON surfsense_docs_documents(content_hash)"
)
op.execute(
"CREATE INDEX IF NOT EXISTS ix_surfsense_docs_documents_updated_at ON surfsense_docs_documents(updated_at)"
)
op.execute(
"CREATE INDEX IF NOT EXISTS ix_surfsense_docs_chunks_document_id ON surfsense_docs_chunks(document_id)"
)
# Vector indexes
op.execute(
"""
CREATE INDEX IF NOT EXISTS surfsense_docs_documents_vector_index
ON surfsense_docs_documents USING hnsw (embedding public.vector_cosine_ops);
"""
)
op.execute(
"""
CREATE INDEX IF NOT EXISTS surfsense_docs_chunks_vector_index
ON surfsense_docs_chunks USING hnsw (embedding public.vector_cosine_ops);
"""
)
# Full-text search indexes
op.execute(
"""
CREATE INDEX IF NOT EXISTS surfsense_docs_documents_search_index
ON surfsense_docs_documents USING gin (to_tsvector('english', content));
"""
)
op.execute(
"""
CREATE INDEX IF NOT EXISTS surfsense_docs_chunks_search_index
ON surfsense_docs_chunks USING gin (to_tsvector('english', content));
"""
)
# Trigram index from revision 67
op.execute(
"""
CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm
ON surfsense_docs_documents USING gin (title gin_trgm_ops);
"""
)

View file

@ -4,8 +4,8 @@ never invent ids you didn't see. Citation ids are resolved by exact-match
lookup; a wrong id silently breaks the link, so when in doubt, omit.
### Channel A — chunk blocks injected this turn
When `search_surfsense_docs` or `web_search` returns `<document>` /
`<chunk id='…'>` blocks in this turn:
When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
turn:
1. For each factual statement taken from those chunks, add
`[citation:chunk_id]` using the **exact** id from a visible

View file

@ -20,8 +20,8 @@ it to resolve paths the user describes in natural language ("my Q2 roadmap",
delegating to a specialist.
`<document>` and `<chunk id='…'>` blocks are chunked indexed content returned
by KB search (from `search_surfsense_docs`, or backing `<priority_documents>`).
Each chunk carries a stable `id` attribute.
by KB search (backing `<priority_documents>`). Each chunk carries a stable
`id` attribute.
If a block doesn't appear this turn, work from the conversation alone.
</dynamic_context>

View file

@ -20,8 +20,8 @@ week's planning notes") into concrete document references before delegating
to a specialist.
`<document>` and `<chunk id='…'>` blocks are chunked indexed content returned
by KB search (from `search_surfsense_docs`, or backing `<priority_documents>`).
Each chunk carries a stable `id` attribute.
by KB search (backing `<priority_documents>`). Each chunk carries a stable
`id` attribute.
If a block doesn't appear this turn, work from the conversation alone.
</dynamic_context>

View file

@ -1,19 +1,21 @@
<knowledge_base_first>
CRITICAL — ground factual answers in what you actually receive this turn:
- injected workspace context (see `<dynamic_context>`),
- results from your own tool calls (`search_surfsense_docs`, `web_search`,
`scrape_webpage`),
- results from your own tool calls (`web_search`, `scrape_webpage`),
- or substantive summaries returned by a `task` specialist you invoked.
Do **not** answer factual or informational questions from general knowledge
unless the user explicitly authorises it after you say you couldn't find
enough in those sources. The flow when nothing is found:
1. Say you couldn't find enough in their workspace, docs, or tool output.
1. Say you couldn't find enough in their workspace or tool output.
2. Ask: *"Would you like me to answer from my general knowledge instead?"*
3. Only answer from general knowledge after a clear yes.
This rule does NOT apply to: casual conversation · meta-questions about
SurfSense ("what can you do?") · formatting or analysis of content already
in chat · clear rewrite/edit instructions · lightweight web research.
For "how do I use SurfSense" / product-documentation questions, point the
user to https://www.surfsense.com/docs.
</knowledge_base_first>

View file

@ -5,7 +5,7 @@ Structured reasoning:
- For non-trivial work, `<thinking>` / short `<plan>` before tool calls is fine.
Professional objectivity:
- Accuracy over flattery; verify with **search_surfsense_docs**, **web_search**, **scrape_webpage**, or **task** when unsure — dont invent connector access.
- Accuracy over flattery; verify with **web_search**, **scrape_webpage**, or **task** when unsure — dont invent connector access.
Task management:
- For 3+ steps, use todo tooling; update statuses promptly.

View file

@ -13,6 +13,6 @@ Attribution:
Tool calls:
- Parallelise independent calls.
- Prefer **search_surfsense_docs** for SurfSense docs/product questions before **web_search** when that fits the ask.
- For SurfSense docs/product questions, point the user to https://www.surfsense.com/docs.
- Dont invent paths, chunk ids, or URLs — only values from tools or the user.
</provider_hints>

View file

@ -7,7 +7,7 @@ Output style:
- GitHub-flavoured Markdown; monospace-friendly.
Workflow (Understand → Plan → Act → Verify):
1. **Understand:** parse the ask; use **search_surfsense_docs** / injected workspace context before guessing.
1. **Understand:** parse the ask; use injected workspace context before guessing.
2. **Plan:** for multi-step work, a short plan first.
3. **Act:** only with tools you actually have on this agent (see `<tools>` and `<tool_routing>`). Connector work → **task**.
4. **Verify:** re-read or re-search only when it materially reduces risk.

View file

@ -15,6 +15,7 @@ Output style:
Tool calls:
- Parallelise independent calls in one turn.
- Prefer **search_surfsense_docs** for SurfSense-product questions, **web_search** / **scrape_webpage**
for fresh public facts; integrations and heavy workflows → **task**.
- For SurfSense-product questions, point the user to https://www.surfsense.com/docs;
use **web_search** / **scrape_webpage** for fresh public facts; integrations and
heavy workflows → **task**.
</provider_hints>

View file

@ -3,10 +3,7 @@ You have two execution channels. Pick the one that owns the work — never
simulate one with the other.
### 1. Direct tools (you call them yourself)
- `search_surfsense_docs` — SurfSense product docs (setup, configuration,
connector docs, feature behavior).
- `web_search` — search the public web (anything outside SurfSense docs and
the workspace KB).
- `web_search` — search the public web (anything outside the workspace KB).
- `scrape_webpage` — fetch the body of a specific public URL.
- `update_memory` — curate persistent memory (see `<memory_protocol>`).
- `write_todos` — maintain a structured plan when the turn series spans
@ -14,6 +11,10 @@ simulate one with the other.
`in_progress` **before** the `task` call that handles it, `completed`
once the call returns. Skip for single-step requests.
**Questions about how to use SurfSense itself** (setup, configuration,
connectors, feature behavior) — point the user to the documentation:
https://www.surfsense.com/docs. There is no docs-search tool; give the link.
**You have NO filesystem tools.** Any read, write, edit, move, rename, or
search inside the user's workspace goes through `task(knowledge_base, …)`
never via `write_file`, `ls`, or any direct file operation.

View file

@ -1 +0,0 @@
"""``search_surfsense_docs`` — description + few-shot examples."""

View file

@ -1,10 +0,0 @@
- `search_surfsense_docs` — Search official SurfSense documentation (product
help).
- Use when the user asks how SurfSense itself works — setup, configuration,
connector documentation, feature behavior, anything covered in the
product docs.
- Not a substitute for `task` when the user wants actions inside a
connected service (Gmail, Slack, Jira, Notion, etc.).
- Args: `query`, `top_k` (default 10).
- Returns doc excerpts; chunk ids may appear for attribution — see
`<citations>` for the contract.

View file

@ -1,15 +0,0 @@
<example>
user: "How do I install SurfSense?"
→ search_surfsense_docs(query="installation setup")
</example>
<example>
user: "What connectors does SurfSense support?"
→ search_surfsense_docs(query="available connectors integrations")
</example>
<example>
user: "How do I set up the Notion connector?"
→ search_surfsense_docs(query="Notion connector setup configuration")
(Changing data inside Notion itself → `task(notion, …)`, not this tool.)
</example>

View file

@ -6,7 +6,6 @@ Connector integrations, MCP, deliverables, etc. are delegated via ``task`` subag
from __future__ import annotations
MAIN_AGENT_SURFSENSE_TOOL_NAMES_ORDERED: tuple[str, ...] = (
"search_surfsense_docs",
"web_search",
"scrape_webpage",
"update_memory",

View file

@ -8,7 +8,6 @@ Gather and synthesize evidence using SurfSense research tools with clear citatio
<available_tools>
- `web_search`
- `scrape_webpage`
- `search_surfsense_docs`
</available_tools>
<tool_policy>

View file

@ -1,11 +1,9 @@
"""Research-stage tools: web search, scrape, and in-product doc search."""
"""Research-stage tools: web search and scrape."""
from .scrape_webpage import create_scrape_webpage_tool
from .search_surfsense_docs import create_search_surfsense_docs_tool
from .web_search import create_web_search_tool
__all__ = [
"create_scrape_webpage_tool",
"create_search_surfsense_docs_tool",
"create_web_search_tool",
]

View file

@ -9,7 +9,6 @@ from langchain_core.tools import BaseTool
from app.agents.new_chat.permissions import Ruleset
from .scrape_webpage import create_scrape_webpage_tool
from .search_surfsense_docs import create_search_surfsense_docs_tool
from .web_search import create_web_search_tool
NAME = "research"
@ -27,5 +26,4 @@ def load_tools(
available_connectors=d.get("available_connectors"),
),
create_scrape_webpage_tool(firecrawl_api_key=d.get("firecrawl_api_key")),
create_search_surfsense_docs_tool(db_session=d["db_session"]),
]

View file

@ -1,145 +0,0 @@
"""Semantic search over pre-indexed in-app documentation chunks for user how-to questions."""
import asyncio
import json
from langchain_core.tools import tool
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
from app.utils.document_converters import embed_text
from app.utils.surfsense_docs import surfsense_docs_public_url
def format_surfsense_docs_results(results: list[tuple]) -> str:
"""Format (chunk, document) rows as XML with ``doc-`` chunk IDs for citations and UI routing."""
if not results:
return "No relevant Surfsense documentation found for your query."
# Group chunks by document
grouped: dict[int, dict] = {}
for chunk, doc in results:
public_url = surfsense_docs_public_url(doc.source)
if doc.id not in grouped:
grouped[doc.id] = {
"document_id": f"doc-{doc.id}",
"document_type": "SURFSENSE_DOCS",
"title": doc.title,
"url": public_url,
"metadata": {"source": doc.source, "public_url": public_url},
"chunks": [],
}
grouped[doc.id]["chunks"].append(
{
"chunk_id": f"doc-{chunk.id}",
"content": chunk.content,
}
)
# Render XML matching format_documents_for_context structure
parts: list[str] = []
for g in grouped.values():
metadata_json = json.dumps(g["metadata"], ensure_ascii=False)
parts.append("<document>")
parts.append("<document_metadata>")
parts.append(f" <document_id>{g['document_id']}</document_id>")
parts.append(f" <document_type>{g['document_type']}</document_type>")
parts.append(f" <title><![CDATA[{g['title']}]]></title>")
parts.append(f" <url><![CDATA[{g['url']}]]></url>")
parts.append(f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>")
parts.append("</document_metadata>")
parts.append("")
parts.append("<document_content>")
for ch in g["chunks"]:
parts.append(
f" <chunk id='{ch['chunk_id']}'><![CDATA[{ch['content']}]]></chunk>"
)
parts.append("</document_content>")
parts.append("</document>")
parts.append("")
return "\n".join(parts).strip()
async def search_surfsense_docs_async(
query: str,
db_session: AsyncSession,
top_k: int = 10,
) -> str:
"""
Search Surfsense documentation using vector similarity.
Args:
query: The search query about Surfsense usage
db_session: Database session for executing queries
top_k: Number of results to return
Returns:
Formatted string with relevant documentation content
"""
# Get embedding for the query
query_embedding = await asyncio.to_thread(embed_text, query)
# Vector similarity search on chunks, joining with documents
stmt = (
select(SurfsenseDocsChunk, SurfsenseDocsDocument)
.join(
SurfsenseDocsDocument,
SurfsenseDocsChunk.document_id == SurfsenseDocsDocument.id,
)
.order_by(SurfsenseDocsChunk.embedding.op("<=>")(query_embedding))
.limit(top_k)
)
result = await db_session.execute(stmt)
rows = result.all()
return format_surfsense_docs_results(rows)
def create_search_surfsense_docs_tool(db_session: AsyncSession):
"""
Factory function to create the search_surfsense_docs tool.
Args:
db_session: Database session for executing queries
Returns:
A configured tool function for searching Surfsense documentation
"""
@tool
async def search_surfsense_docs(query: str, top_k: int = 10) -> str:
"""
Search Surfsense documentation for help with using the application.
Use this tool when the user asks questions about:
- How to use Surfsense features
- Installation and setup instructions
- Configuration options and settings
- Troubleshooting common issues
- Available connectors and integrations
- Browser extension usage
- API documentation
This searches the official Surfsense documentation that was indexed
at deployment time. It does NOT search the user's personal knowledge base.
Args:
query: The search query about Surfsense usage or features
top_k: Number of documentation chunks to retrieve (default: 10)
Returns:
Relevant documentation content formatted with chunk IDs for citations
"""
return await search_surfsense_docs_async(
query=query,
db_session=db_session,
top_k=top_k,
)
return search_surfsense_docs

View file

@ -104,7 +104,7 @@ class AgentFeatureFlags:
# ``tools/google_drive``, ``tools/dropbox``, ``tools/onedrive``,
# ``tools/google_calendar``, ``tools/confluence``, ``tools/discord``,
# ``tools/teams``, ``tools/luma``, ``connected_accounts``,
# ``update_memory``, ``search_surfsense_docs``) now acquire fresh
# ``update_memory``) now acquire fresh
# short-lived ``AsyncSession`` instances per call via
# :data:`async_session_maker`. The factory still accepts ``db_session``
# for registry compatibility but ``del``'s it immediately — see any

View file

@ -73,9 +73,8 @@ class ResolvedMentionSet:
``@Project Roadmap`` is never shadowed by a shorter prefix
``@Project``).
``mentioned_document_ids`` collapses doc + surfsense_doc chips into
a single ordered, deduped list because the priority middleware
treats them uniformly downstream see
``mentioned_document_ids`` is an ordered, deduped list consumed by
the priority middleware downstream see
``KnowledgePriorityMiddleware._compute_priority_paths``.
"""
@ -103,7 +102,6 @@ async def resolve_mentions(
search_space_id: int,
mentioned_documents: list[MentionedDocumentInfo] | None,
mentioned_document_ids: list[int] | None = None,
mentioned_surfsense_doc_ids: list[int] | None = None,
mentioned_folder_ids: list[int] | None = None,
) -> ResolvedMentionSet:
"""Resolve every @-mention chip on a turn into virtual paths.
@ -111,8 +109,7 @@ async def resolve_mentions(
The function takes both the ``mentioned_documents`` discriminated
list (chip metadata used for substitution + persistence) and the
parallel id arrays (``mentioned_document_ids``,
``mentioned_surfsense_doc_ids``, ``mentioned_folder_ids``) for two
reasons:
``mentioned_folder_ids``) for two reasons:
* Legacy clients that haven't migrated to the unified chip list
still send the id arrays we treat the union as authoritative.
@ -142,7 +139,6 @@ async def resolve_mentions(
dict.fromkeys(
[
*(mentioned_document_ids or []),
*(mentioned_surfsense_doc_ids or []),
*chip_doc_ids,
]
)

View file

@ -59,14 +59,13 @@ Do NOT cite document_id. Always use the chunk id.
- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
- Copy the EXACT chunk id from the XML - if it says `<chunk id='doc-123'>`, use [citation:doc-123]
- Copy the EXACT chunk id from the XML - if it says `<chunk id='5'>`, use [citation:5]
- If the chunk id is a URL like `<chunk id='https://example.com/page'>`, use [citation:https://example.com/page]
</citation_format>
<citation_examples>
CORRECT citation formats:
- [citation:5] (numeric chunk ID from knowledge base)
- [citation:doc-123] (for Surfsense documentation chunks)
- [citation:https://example.com/article] (URL chunk ID from web search results)
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3] (multiple citations)

View file

@ -7,7 +7,7 @@ CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
2. Ask the user: "Would you like me to answer from my general knowledge instead?"
3. ONLY provide a general-knowledge answer AFTER the user explicitly says yes.
- This policy does NOT apply to:
* Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?")
* Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?"). For "how do I use SurfSense" / product-documentation questions, point the user to https://www.surfsense.com/docs.
* Formatting, summarization, or analysis of content already present in the conversation
* Following user instructions that are clearly task-oriented (e.g., "rewrite this in bullet points")
* Tool-usage actions like generating reports, podcasts, images, or scraping webpages

View file

@ -7,7 +7,7 @@ CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
2. Ask: "Would you like me to answer from my general knowledge instead?"
3. ONLY provide a general-knowledge answer AFTER a team member explicitly says yes.
- This policy does NOT apply to:
* Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?")
* Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?"). For "how do I use SurfSense" / product-documentation questions, point the user to https://www.surfsense.com/docs.
* Formatting, summarization, or analysis of content already present in the conversation
* Following user instructions that are clearly task-oriented (e.g., "rewrite this in bullet points")
* Tool-usage actions like generating reports, podcasts, images, or scraping webpages

View file

@ -13,6 +13,7 @@ When to use which tool:
- Knowledge base content (Notion, GitHub, files, notes) → automatically searched
- Real-time public web data → call web_search
- Reading a specific webpage → call scrape_webpage
- SurfSense product / how-to questions (setup, configuration, connectors, feature behavior) → point the user to the documentation: https://www.surfsense.com/docs
**`task` subagents (when to delegate):**
- **`linear_specialist`** — Linear-only investigations and tool use.

View file

@ -13,6 +13,7 @@ When to use which tool:
- Knowledge base content (Notion, GitHub, files, notes) → automatically searched
- Real-time public web data → call web_search
- Reading a specific webpage → call scrape_webpage
- SurfSense product / how-to questions (setup, configuration, connectors, feature behavior) → point the user to the documentation: https://www.surfsense.com/docs
**`task` subagents (when to delegate):**
- **`linear_specialist`** — Linear-only investigations and tool use.

View file

@ -151,7 +151,6 @@ def _read_fragment(subpath: str) -> str:
# Ordered for reading flow: fundamentals first, then artifact generators,
# then memory at the end (mirrors the legacy ``_ALL_TOOL_NAMES_ORDERED``).
ALL_TOOL_NAMES_ORDERED: tuple[str, ...] = (
"search_surfsense_docs",
"web_search",
"generate_podcast",
"generate_video_presentation",

View file

@ -1,9 +0,0 @@
- User: "How do I install SurfSense?"
- Call: `search_surfsense_docs(query="installation setup")`
- User: "What connectors does SurfSense support?"
- Call: `search_surfsense_docs(query="available connectors integrations")`
- User: "How do I set up the Notion connector?"
- Call: `search_surfsense_docs(query="Notion connector setup configuration")`
- User: "How do I use Docker to run SurfSense?"
- Call: `search_surfsense_docs(query="Docker installation setup")`

View file

@ -1,7 +0,0 @@
- search_surfsense_docs: Search the official SurfSense documentation.
- Use this tool when the user asks anything about SurfSense itself (the application they are using).
- Args:
- query: The search query about SurfSense
- top_k: Number of documentation chunks to retrieve (default: 10)
- Returns: Documentation content with chunk IDs for citations (prefixed with 'doc-', e.g., [citation:doc-123])

View file

@ -1,7 +1,6 @@
---
name: email-drafting
description: Draft an email matching the user's voice, with structured intent and CTA
allowed-tools: search_surfsense_docs
---
# Email drafting

View file

@ -1,7 +1,7 @@
---
name: kb-research
description: Structured approach to finding and synthesizing information from the user's knowledge base
allowed-tools: search_surfsense_docs, scrape_webpage, read_file, ls_tree, grep, web_search
allowed-tools: scrape_webpage, read_file, ls_tree, grep, web_search
---
# Knowledge-base research

View file

@ -1,7 +1,7 @@
---
name: meeting-prep
description: Pull together briefing materials before a scheduled meeting
allowed-tools: search_surfsense_docs, web_search, scrape_webpage, read_file
allowed-tools: web_search, scrape_webpage, read_file
---
# Meeting preparation

View file

@ -1,7 +1,7 @@
---
name: report-writing
description: How to scope, draft, and revise a Markdown report artifact via generate_report
allowed-tools: generate_report, search_surfsense_docs, read_file
allowed-tools: generate_report, read_file
---
# Report writing

View file

@ -1,7 +1,6 @@
---
name: slack-summary
description: Distill a Slack channel or thread into actionable summary
allowed-tools: search_surfsense_docs
---
# Slack summarization

View file

@ -46,7 +46,6 @@ logger = logging.getLogger(__name__)
# ``glob``, ``grep``) plus the SurfSense-side read tools.
EXPLORE_READ_TOOLS: frozenset[str] = frozenset(
{
"search_surfsense_docs",
"web_search",
"scrape_webpage",
"read_file",
@ -61,7 +60,6 @@ EXPLORE_READ_TOOLS: frozenset[str] = frozenset(
# is needed, the parent should hand off to ``explore`` first.
REPORT_WRITER_TOOLS: frozenset[str] = frozenset(
{
"search_surfsense_docs",
"read_file",
"generate_report",
}
@ -222,7 +220,6 @@ EXPLORE_SYSTEM_PROMPT = """You are the **explore** subagent for SurfSense.
Conduct read-only research across the user's knowledge base, the web, and any documents the parent agent has surfaced. Return a synthesized answer with explicit citations — never speculate beyond the sources you have actually inspected.
## Tools available
- `search_surfsense_docs` fast hybrid search over the user's knowledge base.
- `web_search` only when the user's KB clearly does not contain the answer.
- `scrape_webpage` to read a URL the user or the search results provided.
- `read_file`, `ls`, `glob`, `grep` to inspect specific documents or trees the parent has flagged.
@ -242,7 +239,7 @@ Produce a single high-quality report deliverable using `generate_report`. The pa
## Workflow
1. **Outline first.** Before calling `generate_report`, write a one-paragraph outline of the sections you plan to produce. Confirm the outline reflects the parent's instructions.
2. **Source resolution.** Decide whether to call `search_surfsense_docs` and `read_file` for any final-checks, or whether the parent's earlier tool calls already cover the source set.
2. **Source resolution.** Decide whether to call `read_file` for any final-checks, or whether the parent's earlier tool calls already cover the source set.
3. **One report.** Call `generate_report` exactly once with `source_strategy` chosen per the topic and chat history (see the `report-writing` skill).
4. **Confirm.** End with a one-sentence summary in your final message never paste the report back into chat; the artifact card renders itself.
"""

View file

@ -5,7 +5,6 @@ This module contains all the tools available to the SurfSense agent.
To add a new tool, see the documentation in registry.py.
Available tools:
- search_surfsense_docs: Search Surfsense documentation for usage help
- generate_podcast: Generate audio podcasts from content
- generate_video_presentation: Generate video presentations with slides and narration
- generate_image: Generate images from text descriptions using AI models
@ -31,7 +30,6 @@ from .registry import (
get_tool_by_name,
)
from .scrape_webpage import create_scrape_webpage_tool
from .search_surfsense_docs import create_search_surfsense_docs_tool
from .update_memory import create_update_memory_tool, create_update_team_memory_tool
from .video_presentation import create_generate_video_presentation_tool
@ -47,7 +45,6 @@ __all__ = [
"create_generate_podcast_tool",
"create_generate_video_presentation_tool",
"create_scrape_webpage_tool",
"create_search_surfsense_docs_tool",
"create_update_memory_tool",
"create_update_team_memory_tool",
"format_documents_for_context",

View file

@ -101,7 +101,6 @@ from .podcast import create_generate_podcast_tool
from .report import create_generate_report_tool
from .resume import create_generate_resume_tool
from .scrape_webpage import create_scrape_webpage_tool
from .search_surfsense_docs import create_search_surfsense_docs_tool
from .teams import (
create_list_teams_channels_tool,
create_read_teams_messages_tool,
@ -258,15 +257,6 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
),
requires=[],
),
# Surfsense documentation search tool
ToolDefinition(
name="search_surfsense_docs",
description="Search Surfsense documentation for help with using the application",
factory=lambda deps: create_search_surfsense_docs_tool(
db_session=deps["db_session"],
),
requires=["db_session"],
),
# =========================================================================
# SERVICE ACCOUNT DISCOVERY
# Generic tool for the LLM to discover connected accounts and resolve

View file

@ -1,174 +0,0 @@
"""
Surfsense documentation search tool.
This tool allows the agent to search the pre-indexed Surfsense documentation
to help users with questions about how to use the application.
The documentation is indexed at deployment time from MDX files and stored
in dedicated tables (surfsense_docs_documents, surfsense_docs_chunks).
"""
import asyncio
import json
from langchain_core.tools import tool
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
from app.utils.document_converters import embed_text
from app.utils.surfsense_docs import surfsense_docs_public_url
def format_surfsense_docs_results(results: list[tuple]) -> str:
"""
Format search results into XML structure for the LLM context.
Uses the same XML structure as format_documents_for_context from knowledge_base.py
but with 'doc-' prefix on chunk IDs. This allows:
- LLM to use consistent [citation:doc-XXX] format
- Frontend to detect 'doc-' prefix and route to surfsense docs endpoint
Args:
results: List of (chunk, document) tuples from the database query
Returns:
Formatted XML string with documentation content and citation-ready chunks
"""
if not results:
return "No relevant Surfsense documentation found for your query."
# Group chunks by document
grouped: dict[int, dict] = {}
for chunk, doc in results:
public_url = surfsense_docs_public_url(doc.source)
if doc.id not in grouped:
grouped[doc.id] = {
"document_id": f"doc-{doc.id}",
"document_type": "SURFSENSE_DOCS",
"title": doc.title,
"url": public_url,
"metadata": {"source": doc.source, "public_url": public_url},
"chunks": [],
}
grouped[doc.id]["chunks"].append(
{
"chunk_id": f"doc-{chunk.id}",
"content": chunk.content,
}
)
# Render XML matching format_documents_for_context structure
parts: list[str] = []
for g in grouped.values():
metadata_json = json.dumps(g["metadata"], ensure_ascii=False)
parts.append("<document>")
parts.append("<document_metadata>")
parts.append(f" <document_id>{g['document_id']}</document_id>")
parts.append(f" <document_type>{g['document_type']}</document_type>")
parts.append(f" <title><![CDATA[{g['title']}]]></title>")
parts.append(f" <url><![CDATA[{g['url']}]]></url>")
parts.append(f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>")
parts.append("</document_metadata>")
parts.append("")
parts.append("<document_content>")
for ch in g["chunks"]:
parts.append(
f" <chunk id='{ch['chunk_id']}'><![CDATA[{ch['content']}]]></chunk>"
)
parts.append("</document_content>")
parts.append("</document>")
parts.append("")
return "\n".join(parts).strip()
async def search_surfsense_docs_async(
query: str,
db_session: AsyncSession,
top_k: int = 10,
) -> str:
"""
Search Surfsense documentation using vector similarity.
Args:
query: The search query about Surfsense usage
db_session: Database session for executing queries
top_k: Number of results to return
Returns:
Formatted string with relevant documentation content
"""
# Get embedding for the query
query_embedding = await asyncio.to_thread(embed_text, query)
# Vector similarity search on chunks, joining with documents
stmt = (
select(SurfsenseDocsChunk, SurfsenseDocsDocument)
.join(
SurfsenseDocsDocument,
SurfsenseDocsChunk.document_id == SurfsenseDocsDocument.id,
)
.order_by(SurfsenseDocsChunk.embedding.op("<=>")(query_embedding))
.limit(top_k)
)
result = await db_session.execute(stmt)
rows = result.all()
return format_surfsense_docs_results(rows)
def create_search_surfsense_docs_tool(db_session: AsyncSession):
"""
Factory function to create the search_surfsense_docs tool.
The tool acquires its own short-lived ``AsyncSession`` per call via
:data:`async_session_maker` so the closure is safe to share across
HTTP requests by the compiled-agent cache. Capturing a per-request
session here would surface stale/closed sessions on cache hits.
Args:
db_session: Reserved for registry compatibility. Per-call sessions
are opened via :data:`async_session_maker` inside the tool body.
Returns:
A configured tool function for searching Surfsense documentation
"""
del db_session # per-call session — see docstring
@tool
async def search_surfsense_docs(query: str, top_k: int = 10) -> str:
"""
Search Surfsense documentation for help with using the application.
Use this tool when the user asks questions about:
- How to use Surfsense features
- Installation and setup instructions
- Configuration options and settings
- Troubleshooting common issues
- Available connectors and integrations
- Browser extension usage
- API documentation
This searches the official Surfsense documentation that was indexed
at deployment time. It does NOT search the user's personal knowledge base.
Args:
query: The search query about Surfsense usage or features
top_k: Number of documentation chunks to retrieve (default: 10)
Returns:
Relevant documentation content formatted with chunk IDs for citations
"""
async with async_session_maker() as db_session:
return await search_surfsense_docs_async(
query=query,
db_session=db_session,
top_k=top_k,
)
return search_surfsense_docs

View file

@ -43,7 +43,6 @@ from app.rate_limiter import get_real_client_ip, limiter
from app.routes import router as crud_router
from app.routes.auth_routes import router as auth_router
from app.schemas import UserCreate, UserRead, UserUpdate
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
from app.users import SECRET, auth_backend, current_active_user, fastapi_users
from app.utils.perf import log_system_snapshot
@ -576,13 +575,6 @@ async def lifespan(app: FastAPI):
initialize_llm_router()
initialize_image_gen_router()
initialize_vision_llm_router()
try:
await asyncio.wait_for(seed_surfsense_docs(), timeout=120)
except TimeoutError:
logging.getLogger(__name__).warning(
"Surfsense docs seeding timed out after 120s — skipping. "
"Docs will be indexed on the next restart."
)
# Phase 1.7 — JIT warmup. Bounded so a stuck warmup never delays
# worker readiness. ``shield`` so Uvicorn cancelling startup

View file

@ -1150,46 +1150,6 @@ class Chunk(BaseModel, TimestampMixin):
document = relationship("Document", back_populates="chunks")
class SurfsenseDocsDocument(BaseModel, TimestampMixin):
"""
Surfsense documentation storage.
Indexed at migration time from MDX files.
"""
__tablename__ = "surfsense_docs_documents"
source = Column(
String, nullable=False, unique=True, index=True
) # File path: "connectors/slack.mdx"
title = Column(String, nullable=False)
content = Column(Text, nullable=False)
content_hash = Column(String, nullable=False, index=True) # For detecting changes
embedding = Column(Vector(config.embedding_model_instance.dimension))
updated_at = Column(TIMESTAMP(timezone=True), nullable=True, index=True)
chunks = relationship(
"SurfsenseDocsChunk",
back_populates="document",
cascade="all, delete-orphan",
)
class SurfsenseDocsChunk(BaseModel, TimestampMixin):
"""Chunk storage for Surfsense documentation."""
__tablename__ = "surfsense_docs_chunks"
content = Column(Text, nullable=False)
embedding = Column(Vector(config.embedding_model_instance.dimension))
document_id = Column(
Integer,
ForeignKey("surfsense_docs_documents.id", ondelete="CASCADE"),
nullable=False,
)
document = relationship("SurfsenseDocsDocument", back_populates="chunks")
class Podcast(BaseModel, TimestampMixin):
"""Podcast model for storing generated podcasts."""
@ -2680,11 +2640,6 @@ async def setup_indexes():
"CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated ON documents (search_space_id, updated_at DESC NULLS LAST) INCLUDE (id, title, document_type)"
)
)
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm ON surfsense_docs_documents USING gin (title gin_trgm_ops)"
)
)
async def create_db_and_tables():

View file

@ -55,7 +55,6 @@ from .search_source_connectors_routes import router as search_source_connectors_
from .search_spaces_routes import router as search_spaces_router
from .slack_add_connector_route import router as slack_add_connector_router
from .stripe_routes import router as stripe_router
from .surfsense_docs_routes import router as surfsense_docs_router
from .team_memory_routes import router as team_memory_router
from .teams_add_connector_route import router as teams_add_connector_router
from .video_presentations_routes import router as video_presentations_router
@ -108,7 +107,6 @@ router.include_router(new_llm_config_router) # LLM configs with prompt configur
router.include_router(model_list_router) # Dynamic model catalogue from OpenRouter
router.include_router(logs_router)
router.include_router(circleback_webhook_router) # Circleback meeting webhooks
router.include_router(surfsense_docs_router) # Surfsense documentation for citations
router.include_router(notifications_router) # Notifications with Zero sync
router.include_router(
mcp_oauth_router

View file

@ -1785,7 +1785,6 @@ async def handle_new_chat(
user_id=str(user.id),
llm_config_id=llm_config_id,
mentioned_document_ids=request.mentioned_document_ids,
mentioned_surfsense_doc_ids=request.mentioned_surfsense_doc_ids,
mentioned_folder_ids=request.mentioned_folder_ids,
mentioned_connector_ids=request.mentioned_connector_ids,
mentioned_connectors=mentioned_connectors_payload,
@ -2278,7 +2277,6 @@ async def regenerate_response(
user_id=str(user.id),
llm_config_id=llm_config_id,
mentioned_document_ids=request.mentioned_document_ids,
mentioned_surfsense_doc_ids=request.mentioned_surfsense_doc_ids,
mentioned_folder_ids=request.mentioned_folder_ids,
mentioned_connector_ids=request.mentioned_connector_ids,
mentioned_connectors=mentioned_connectors_payload,

View file

@ -1,172 +0,0 @@
"""
Routes for Surfsense documentation.
These endpoints support the citation system for Surfsense docs,
allowing the frontend to fetch document details when a user clicks
on a [citation:doc-XXX] link.
"""
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from app.db import (
SurfsenseDocsChunk,
SurfsenseDocsDocument,
User,
get_async_session,
)
from app.schemas import PaginatedResponse
from app.schemas.surfsense_docs import (
SurfsenseDocsChunkRead,
SurfsenseDocsDocumentRead,
SurfsenseDocsDocumentWithChunksRead,
)
from app.users import current_active_user
from app.utils.surfsense_docs import surfsense_docs_public_url
router = APIRouter()
@router.get(
"/surfsense-docs/by-chunk/{chunk_id}",
response_model=SurfsenseDocsDocumentWithChunksRead,
)
async def get_surfsense_doc_by_chunk_id(
chunk_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Retrieves a Surfsense documentation document based on a chunk ID.
This endpoint is used by the frontend to resolve [citation:doc-XXX] links.
"""
try:
# Get the chunk
chunk_result = await session.execute(
select(SurfsenseDocsChunk).filter(SurfsenseDocsChunk.id == chunk_id)
)
chunk = chunk_result.scalars().first()
if not chunk:
raise HTTPException(
status_code=404,
detail=f"Surfsense docs chunk with id {chunk_id} not found",
)
# Get the associated document with all its chunks
document_result = await session.execute(
select(SurfsenseDocsDocument)
.options(selectinload(SurfsenseDocsDocument.chunks))
.filter(SurfsenseDocsDocument.id == chunk.document_id)
)
document = document_result.scalars().first()
if not document:
raise HTTPException(
status_code=404,
detail="Surfsense docs document not found",
)
# Sort chunks by ID
sorted_chunks = sorted(document.chunks, key=lambda x: x.id)
return SurfsenseDocsDocumentWithChunksRead(
id=document.id,
title=document.title,
source=document.source,
public_url=surfsense_docs_public_url(document.source),
content=document.content,
chunks=[
SurfsenseDocsChunkRead(id=c.id, content=c.content)
for c in sorted_chunks
],
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Failed to retrieve Surfsense documentation: {e!s}",
) from e
@router.get(
"/surfsense-docs",
response_model=PaginatedResponse[SurfsenseDocsDocumentRead],
)
async def list_surfsense_docs(
page: int = 0,
page_size: int = 50,
title: str | None = None,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
List all Surfsense documentation documents.
Args:
page: Zero-based page index.
page_size: Number of items per page (default: 50).
title: Optional title filter (case-insensitive substring match).
session: Database session (injected).
user: Current authenticated user (injected).
Returns:
PaginatedResponse[SurfsenseDocsDocumentRead]: Paginated list of Surfsense docs.
"""
try:
# Base query
query = select(SurfsenseDocsDocument)
count_query = select(func.count()).select_from(SurfsenseDocsDocument)
# Filter by title if provided
if title and title.strip():
query = query.filter(SurfsenseDocsDocument.title.ilike(f"%{title}%"))
count_query = count_query.filter(
SurfsenseDocsDocument.title.ilike(f"%{title}%")
)
# Get total count
total_result = await session.execute(count_query)
total = total_result.scalar() or 0
# Calculate offset
offset = page * page_size
# Get paginated results
result = await session.execute(
query.order_by(SurfsenseDocsDocument.title).offset(offset).limit(page_size)
)
docs = result.scalars().all()
# Convert to response format
items = [
SurfsenseDocsDocumentRead(
id=doc.id,
title=doc.title,
source=doc.source,
public_url=surfsense_docs_public_url(doc.source),
content=doc.content,
created_at=doc.created_at,
updated_at=doc.updated_at,
)
for doc in docs
]
has_more = (offset + len(items)) < total
return PaginatedResponse(
items=items,
total=total,
page=page,
page_size=page_size,
has_more=has_more,
)
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Failed to list Surfsense documentation: {e!s}",
) from e

View file

@ -239,9 +239,6 @@ class NewChatRequest(BaseModel):
mentioned_document_ids: list[int] | None = (
None # Optional document IDs mentioned with @ in the chat
)
mentioned_surfsense_doc_ids: list[int] | None = (
None # Optional SurfSense documentation IDs mentioned with @ in the chat
)
mentioned_folder_ids: list[int] | None = Field(
default=None,
description=(
@ -326,7 +323,6 @@ class RegenerateRequest(BaseModel):
None # New user query (for edit). None = reload with same query
)
mentioned_document_ids: list[int] | None = None
mentioned_surfsense_doc_ids: list[int] | None = None
mentioned_folder_ids: list[int] | None = Field(
default=None,
description=(

View file

@ -1,43 +0,0 @@
"""
Schemas for Surfsense documentation.
"""
from datetime import datetime
from pydantic import BaseModel, ConfigDict
class SurfsenseDocsChunkRead(BaseModel):
"""Schema for a Surfsense docs chunk."""
id: int
content: str
model_config = ConfigDict(from_attributes=True)
class SurfsenseDocsDocumentRead(BaseModel):
"""Schema for a Surfsense docs document (without chunks)."""
id: int
title: str
source: str
public_url: str
content: str
created_at: datetime | None = None
updated_at: datetime | None = None
model_config = ConfigDict(from_attributes=True)
class SurfsenseDocsDocumentWithChunksRead(BaseModel):
"""Schema for a Surfsense docs document with its chunks."""
id: int
title: str
source: str
public_url: str
content: str
chunks: list[SurfsenseDocsChunkRead]
model_config = ConfigDict(from_attributes=True)

View file

@ -25,7 +25,6 @@ from uuid import UUID
import anyio
from langchain_core.messages import HumanMessage
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.agents.multi_agent_chat import create_multi_agent_chat_deep_agent
from app.agents.new_chat.chat_deepagent import create_surfsense_deep_agent
@ -55,7 +54,6 @@ from app.db import (
NewChatThread,
Report,
SearchSourceConnectorType,
SurfsenseDocsDocument,
async_session_maker,
shielded_async_session,
)
@ -77,7 +75,6 @@ from app.tasks.chat.streaming.helpers.interrupt_inspector import (
)
from app.utils.content_utils import bootstrap_history_from_db
from app.utils.perf import get_perf_logger, log_system_snapshot, trim_native_heap
from app.utils.surfsense_docs import surfsense_docs_public_url
from app.utils.user_message_multimodal import build_human_message_content
_background_tasks: set[asyncio.Task] = set()
@ -198,58 +195,6 @@ def _extract_chunk_parts(chunk: Any) -> dict[str, Any]:
return out
def format_mentioned_surfsense_docs_as_context(
documents: list[SurfsenseDocsDocument],
) -> str:
"""Format mentioned SurfSense documentation as context for the agent."""
if not documents:
return ""
context_parts = ["<mentioned_surfsense_docs>"]
context_parts.append(
"The user has explicitly mentioned the following SurfSense documentation pages. "
"These are official documentation about how to use SurfSense and should be used to answer questions about the application. "
"Use [citation:CHUNK_ID] format for citations (e.g., [citation:doc-123])."
)
for doc in documents:
public_url = surfsense_docs_public_url(doc.source)
metadata_json = json.dumps(
{"source": doc.source, "public_url": public_url}, ensure_ascii=False
)
context_parts.append("<document>")
context_parts.append("<document_metadata>")
context_parts.append(f" <document_id>doc-{doc.id}</document_id>")
context_parts.append(" <document_type>SURFSENSE_DOCS</document_type>")
context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
context_parts.append(f" <url><![CDATA[{public_url}]]></url>")
context_parts.append(
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
)
context_parts.append("</document_metadata>")
context_parts.append("")
context_parts.append("<document_content>")
if hasattr(doc, "chunks") and doc.chunks:
for chunk in doc.chunks:
context_parts.append(
f" <chunk id='doc-{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
)
else:
context_parts.append(
f" <chunk id='doc-0'><![CDATA[{doc.content}]]></chunk>"
)
context_parts.append("</document_content>")
context_parts.append("</document>")
context_parts.append("")
context_parts.append("</mentioned_surfsense_docs>")
return "\n".join(context_parts)
def extract_todos_from_deepagents(command_output) -> dict:
"""
Extract todos from deepagents' TodoListMiddleware Command output.
@ -837,7 +782,6 @@ async def stream_new_chat(
user_id: str | None = None,
llm_config_id: int = -1,
mentioned_document_ids: list[int] | None = None,
mentioned_surfsense_doc_ids: list[int] | None = None,
mentioned_folder_ids: list[int] | None = None,
mentioned_connector_ids: list[int] | None = None,
mentioned_connectors: list[dict[str, Any]] | None = None,
@ -869,7 +813,6 @@ async def stream_new_chat(
llm_config_id: The LLM configuration ID (default: -1 for first global config)
needs_history_bootstrap: If True, load message history from DB (for cloned chats)
mentioned_document_ids: Optional list of document IDs mentioned with @ in the chat
mentioned_surfsense_doc_ids: Optional list of SurfSense doc IDs mentioned with @ in the chat
mentioned_folder_ids: Optional list of knowledge-base folder IDs mentioned with @ (cloud mode)
checkpoint_id: Optional checkpoint ID to rewind/fork from (for edit/reload operations)
@ -1295,19 +1238,7 @@ async def stream_new_chat(
# Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware
# which merges them into the scoped filesystem with full document
# structure. Only SurfSense docs and report context are inlined here.
# Fetch mentioned SurfSense docs if any
mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
if mentioned_surfsense_doc_ids:
result = await session.execute(
select(SurfsenseDocsDocument)
.options(selectinload(SurfsenseDocsDocument.chunks))
.filter(
SurfsenseDocsDocument.id.in_(mentioned_surfsense_doc_ids),
)
)
mentioned_surfsense_docs = list(result.scalars().all())
# structure. Only report context is inlined here.
# Fetch the most recent report(s) in this thread so the LLM can
# easily find report_id for versioning decisions, instead of
@ -1341,10 +1272,7 @@ async def stream_new_chat(
agent_user_query = user_query
accepted_folder_ids: list[int] = []
if fs_mode == FilesystemMode.CLOUD.value and (
mentioned_document_ids
or mentioned_surfsense_doc_ids
or mentioned_folder_ids
or mentioned_documents
mentioned_document_ids or mentioned_folder_ids or mentioned_documents
):
from app.schemas.new_chat import (
MentionedDocumentInfo as _MentionedDocumentInfo,
@ -1370,23 +1298,17 @@ async def stream_new_chat(
search_space_id=search_space_id,
mentioned_documents=chip_objs,
mentioned_document_ids=mentioned_document_ids,
mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
mentioned_folder_ids=mentioned_folder_ids,
)
agent_user_query = substitute_in_text(user_query, resolved.token_to_path)
accepted_folder_ids = resolved.mentioned_folder_ids
# Format the user query with context (SurfSense docs + reports only).
# Format the user query with context (reports only).
# Uses ``agent_user_query`` so the LLM sees backtick-wrapped paths
# instead of bare ``@title`` tokens.
final_query = agent_user_query
context_parts = []
if mentioned_surfsense_docs:
context_parts.append(
format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
)
if mentioned_connectors:
connector_lines = []
for connector in mentioned_connectors:
@ -1617,12 +1539,8 @@ async def stream_new_chat(
stream_result.content_builder = AssistantContentBuilder()
# Initial thinking step - analyzing the request
if mentioned_surfsense_docs:
initial_title = "Analyzing referenced content"
action_verb = "Analyzing"
else:
initial_title = "Understanding your request"
action_verb = "Processing"
initial_title = "Understanding your request"
action_verb = "Processing"
processing_parts = []
if user_query.strip():
@ -1633,18 +1551,6 @@ async def stream_new_chat(
else:
processing_parts.append("(message)")
if mentioned_surfsense_docs:
doc_names = []
for doc in mentioned_surfsense_docs:
title = doc.title
if len(title) > 30:
title = title[:27] + "..."
doc_names.append(title)
if len(doc_names) == 1:
processing_parts.append(f"[{doc_names[0]}]")
else:
processing_parts.append(f"[{len(doc_names)} docs]")
initial_items = [f"{action_verb}: {' '.join(processing_parts)}"]
initial_step_id = "thinking-1"
@ -1664,10 +1570,10 @@ async def stream_new_chat(
items=initial_items,
)
# These ORM objects (with eagerly-loaded chunks) can be very large.
# They're only needed to build context strings already copied into
# final_query / langchain_messages — release them before streaming.
del mentioned_surfsense_docs, recent_reports
# These ORM objects can be large. They're only needed to build context
# strings already copied into final_query / langchain_messages —
# release them before streaming.
del recent_reports
del langchain_messages, final_query
# Check if this is the first assistant response so we can generate

View file

@ -1,15 +1,11 @@
"""Pre-agent context shaping: mentioned-doc rendering and todos extraction."""
"""Pre-agent context shaping: todos extraction."""
from __future__ import annotations
from app.tasks.chat.streaming.context.deepagents_todos import (
extract_todos_from_deepagents,
)
from app.tasks.chat.streaming.context.mentioned_docs import (
format_mentioned_surfsense_docs_as_context,
)
__all__ = [
"extract_todos_from_deepagents",
"format_mentioned_surfsense_docs_as_context",
]

View file

@ -1,58 +0,0 @@
"""Render user-mentioned SurfSense docs as XML context for the agent."""
from __future__ import annotations
import json
from app.db import SurfsenseDocsDocument
from app.utils.surfsense_docs import surfsense_docs_public_url
def format_mentioned_surfsense_docs_as_context(
documents: list[SurfsenseDocsDocument],
) -> str:
if not documents:
return ""
context_parts = ["<mentioned_surfsense_docs>"]
context_parts.append(
"The user has explicitly mentioned the following SurfSense documentation pages. "
"These are official documentation about how to use SurfSense and should be used to answer questions about the application. "
"Use [citation:CHUNK_ID] format for citations (e.g., [citation:doc-123])."
)
for doc in documents:
public_url = surfsense_docs_public_url(doc.source)
metadata_json = json.dumps(
{"source": doc.source, "public_url": public_url}, ensure_ascii=False
)
context_parts.append("<document>")
context_parts.append("<document_metadata>")
context_parts.append(f" <document_id>doc-{doc.id}</document_id>")
context_parts.append(" <document_type>SURFSENSE_DOCS</document_type>")
context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
context_parts.append(f" <url><![CDATA[{public_url}]]></url>")
context_parts.append(
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
)
context_parts.append("</document_metadata>")
context_parts.append("")
context_parts.append("<document_content>")
if hasattr(doc, "chunks") and doc.chunks:
for chunk in doc.chunks:
context_parts.append(
f" <chunk id='doc-{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
)
else:
context_parts.append(
f" <chunk id='doc-0'><![CDATA[{doc.content}]]></chunk>"
)
context_parts.append("</document_content>")
context_parts.append("</document>")
context_parts.append("")
context_parts.append("</mentioned_surfsense_docs>")
return "\n".join(context_parts)

View file

@ -1,8 +1,8 @@
"""Build and emit the first ``thinking-1`` step for a new-chat turn.
The step title and "Processing X" items are derived from what the user sent
(text snippet, image count, mentioned doc titles) so the FE can render a
meaningful placeholder while the agent stream warms up.
(text snippet, image count) so the FE can render a meaningful placeholder
while the agent stream warms up.
``thinking-1`` is the canonical id for this step every subsequent
``thinking-N`` produced by ``stream_agent_events`` folds into the same
@ -15,7 +15,6 @@ from collections.abc import Iterator
from dataclasses import dataclass
from typing import Any
from app.db import SurfsenseDocsDocument
from app.services.new_streaming_service import VercelStreamingService
@ -37,14 +36,9 @@ def build_initial_thinking_step(
*,
user_query: str,
user_image_data_urls: list[str] | None,
mentioned_surfsense_docs: list[SurfsenseDocsDocument],
) -> InitialThinkingStep:
if mentioned_surfsense_docs:
title = "Analyzing referenced content"
action_verb = "Analyzing"
else:
title = "Understanding your request"
action_verb = "Processing"
title = "Understanding your request"
action_verb = "Processing"
processing_parts: list[str] = []
if user_query.strip():
@ -55,18 +49,6 @@ def build_initial_thinking_step(
else:
processing_parts.append("(message)")
if mentioned_surfsense_docs:
doc_names: list[str] = []
for doc in mentioned_surfsense_docs:
t = doc.title
if len(t) > 30:
t = t[:27] + "..."
doc_names.append(t)
if len(doc_names) == 1:
processing_parts.append(f"[{doc_names[0]}]")
else:
processing_parts.append(f"[{len(doc_names)} docs]")
items = [f"{action_verb}: {' '.join(processing_parts)}"]
return InitialThinkingStep(step_id="thinking-1", title=title, items=items)

View file

@ -5,20 +5,17 @@ Pipeline:
1. **History bootstrap** only for cloned chats with no LangGraph checkpoint
yet; flips the per-thread ``needs_history_bootstrap`` flag back to False
once the rows are loaded.
2. **Mentioned SurfSense docs** eager-load chunks so the formatter has the
full content without a second roundtrip.
3. **Recent reports** top 3 by id desc with non-null content, so the LLM
2. **Recent reports** top 3 by id desc with non-null content, so the LLM
can resolve ``report_id`` for versioning without spelunking history.
4. **@-mention resolve** (cloud mode) substitute ``@title`` tokens in the
3. **@-mention resolve** (cloud mode) substitute ``@title`` tokens in the
query with canonical ``\`/documents/...\``` paths the LLM expects.
5. **Context block render** XML-wrap surfsense docs + reports, prepend to
the rewritten query, optionally prefix with display name for SEARCH_SPACE
4. **Context block render** XML-wrap recent reports, prepend to the
rewritten query, optionally prefix with display name for SEARCH_SPACE
visibility.
6. **HumanMessage** multimodal content if images are attached.
5. **HumanMessage** multimodal content if images are attached.
Returns the assembled ``input_state`` dict plus side-channel data the
orchestrator needs downstream (``accepted_folder_ids`` for runtime context;
``mentioned_surfsense_docs`` for the initial thinking step).
orchestrator needs downstream (``accepted_folder_ids`` for runtime context).
"""
from __future__ import annotations
@ -30,7 +27,6 @@ from typing import Any
from langchain_core.messages import HumanMessage
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.agents.new_chat.filesystem_selection import FilesystemMode
from app.agents.new_chat.mention_resolver import resolve_mentions, substitute_in_text
@ -38,10 +34,6 @@ from app.db import (
ChatVisibility,
NewChatThread,
Report,
SurfsenseDocsDocument,
)
from app.tasks.chat.streaming.context.mentioned_docs import (
format_mentioned_surfsense_docs_as_context,
)
from app.utils.content_utils import bootstrap_history_from_db
from app.utils.user_message_multimodal import build_human_message_content
@ -55,13 +47,10 @@ class NewChatInputState:
``input_state`` is fed straight to the agent. ``accepted_folder_ids``
feeds the runtime context (the resolver may have dropped some chips).
``mentioned_surfsense_docs`` is consumed by the initial thinking-step
builder for the FE placeholder before the agent stream starts.
"""
input_state: dict[str, Any]
accepted_folder_ids: list[int]
mentioned_surfsense_docs: list[SurfsenseDocsDocument]
async def build_new_chat_input_state(
@ -72,7 +61,6 @@ async def build_new_chat_input_state(
user_query: str,
user_image_data_urls: list[str] | None,
mentioned_document_ids: list[int] | None,
mentioned_surfsense_doc_ids: list[int] | None,
mentioned_folder_ids: list[int] | None,
mentioned_documents: list[dict[str, Any]] | None,
needs_history_bootstrap: bool,
@ -96,15 +84,6 @@ async def build_new_chat_input_state(
thread.needs_history_bootstrap = False
await session.commit()
mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
if mentioned_surfsense_doc_ids:
result = await session.execute(
select(SurfsenseDocsDocument)
.options(selectinload(SurfsenseDocsDocument.chunks))
.filter(SurfsenseDocsDocument.id.in_(mentioned_surfsense_doc_ids))
)
mentioned_surfsense_docs = list(result.scalars().all())
# Top 3 reports keyed by id desc (newest first) with content present,
# surfaced inline so the LLM resolves ``report_id`` for versioning without
# digging through conversation history.
@ -125,14 +104,12 @@ async def build_new_chat_input_state(
user_query=user_query,
filesystem_mode=filesystem_mode,
mentioned_document_ids=mentioned_document_ids,
mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
mentioned_folder_ids=mentioned_folder_ids,
mentioned_documents=mentioned_documents,
)
final_query = _render_query_with_context(
agent_user_query=agent_user_query,
mentioned_surfsense_docs=mentioned_surfsense_docs,
recent_reports=recent_reports,
)
@ -154,7 +131,6 @@ async def build_new_chat_input_state(
return NewChatInputState(
input_state=input_state,
accepted_folder_ids=accepted_folder_ids,
mentioned_surfsense_docs=mentioned_surfsense_docs,
)
@ -165,7 +141,6 @@ async def _resolve_mentions_for_query(
user_query: str,
filesystem_mode: str,
mentioned_document_ids: list[int] | None,
mentioned_surfsense_doc_ids: list[int] | None,
mentioned_folder_ids: list[int] | None,
mentioned_documents: list[dict[str, Any]] | None,
) -> tuple[str, list[int]]:
@ -187,10 +162,7 @@ async def _resolve_mentions_for_query(
accepted_folder_ids: list[int] = []
has_any_mention = bool(
mentioned_document_ids
or mentioned_surfsense_doc_ids
or mentioned_folder_ids
or mentioned_documents
mentioned_document_ids or mentioned_folder_ids or mentioned_documents
)
if filesystem_mode != FilesystemMode.CLOUD.value or not has_any_mention:
return agent_user_query, accepted_folder_ids
@ -214,7 +186,6 @@ async def _resolve_mentions_for_query(
search_space_id=search_space_id,
mentioned_documents=chip_objs,
mentioned_document_ids=mentioned_document_ids,
mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
mentioned_folder_ids=mentioned_folder_ids,
)
agent_user_query = substitute_in_text(user_query, resolved.token_to_path)
@ -225,17 +196,11 @@ async def _resolve_mentions_for_query(
def _render_query_with_context(
*,
agent_user_query: str,
mentioned_surfsense_docs: list[SurfsenseDocsDocument],
recent_reports: list[Report],
) -> str:
"""Prepend surfsense-docs + recent-reports XML blocks to the user query."""
"""Prepend recent-reports XML block to the user query."""
context_parts: list[str] = []
if mentioned_surfsense_docs:
context_parts.append(
format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
)
if recent_reports:
report_lines: list[str] = []
for r in recent_reports:

View file

@ -123,7 +123,6 @@ async def stream_new_chat(
user_id: str | None = None,
llm_config_id: int = -1,
mentioned_document_ids: list[int] | None = None,
mentioned_surfsense_doc_ids: list[int] | None = None,
mentioned_folder_ids: list[int] | None = None,
mentioned_documents: list[dict[str, Any]] | None = None,
checkpoint_id: str | None = None,
@ -435,7 +434,6 @@ async def stream_new_chat(
user_query=user_query,
user_image_data_urls=user_image_data_urls,
mentioned_document_ids=mentioned_document_ids,
mentioned_surfsense_doc_ids=mentioned_surfsense_doc_ids,
mentioned_folder_ids=mentioned_folder_ids,
mentioned_documents=mentioned_documents,
needs_history_bootstrap=needs_history_bootstrap,
@ -447,7 +445,6 @@ async def stream_new_chat(
)
input_state = assembled.input_state
accepted_folder_ids = assembled.accepted_folder_ids
mentioned_surfsense_docs = assembled.mentioned_surfsense_docs
_perf_log.info(
"[stream_new_chat] History bootstrap + doc/report queries in %.3fs",
time.perf_counter() - _t0,
@ -560,7 +557,6 @@ async def stream_new_chat(
initial_step = build_initial_thinking_step(
user_query=user_query,
user_image_data_urls=user_image_data_urls,
mentioned_surfsense_docs=mentioned_surfsense_docs,
)
for sse in iter_initial_thinking_step_frame(
initial_step,
@ -575,7 +571,7 @@ async def stream_new_chat(
# Drop the heavy ORM objects + the container that holds them so they
# aren't retained for the entire streaming duration. ``input_state``
# already carries the langchain_messages list independently.
del assembled, mentioned_surfsense_docs
del assembled
title_task = spawn_title_task(
chat_id=chat_id,

View file

@ -1,249 +0,0 @@
"""
Surfsense documentation indexer.
Indexes MDX documentation files at startup.
"""
import hashlib
import logging
import re
from datetime import UTC, datetime
from pathlib import Path
from sqlalchemy import delete as sa_delete, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from sqlalchemy.orm.attributes import set_committed_value
from app.config import config
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
from app.utils.document_converters import embed_text
logger = logging.getLogger(__name__)
async def _safe_set_docs_chunks(
session: AsyncSession, document: SurfsenseDocsDocument, chunks: list
) -> None:
"""safe_set_chunks variant for the SurfsenseDocsDocument/Chunk models."""
if document.id is not None:
await session.execute(
sa_delete(SurfsenseDocsChunk).where(
SurfsenseDocsChunk.document_id == document.id
)
)
for chunk in chunks:
chunk.document_id = document.id
set_committed_value(document, "chunks", chunks)
session.add_all(chunks)
# Path to docs relative to project root
DOCS_DIR = (
Path(__file__).resolve().parent.parent.parent.parent
/ "surfsense_web"
/ "content"
/ "docs"
)
def parse_mdx_frontmatter(content: str) -> tuple[str, str]:
"""
Parse MDX file to extract frontmatter title and content.
Args:
content: Raw MDX file content
Returns:
Tuple of (title, content_without_frontmatter)
"""
# Match frontmatter between --- markers
frontmatter_pattern = r"^---\s*\n(.*?)\n---\s*\n"
match = re.match(frontmatter_pattern, content, re.DOTALL)
if match:
frontmatter = match.group(1)
content_without_frontmatter = content[match.end() :]
# Extract title from frontmatter
title_match = re.search(r"^title:\s*(.+)$", frontmatter, re.MULTILINE)
title = title_match.group(1).strip() if title_match else "Untitled"
# Remove quotes if present
title = title.strip("\"'")
return title, content_without_frontmatter.strip()
return "Untitled", content.strip()
def get_all_mdx_files() -> list[Path]:
"""
Get all MDX files from the docs directory.
Returns:
List of Path objects for each MDX file
"""
if not DOCS_DIR.exists():
logger.warning(f"Docs directory not found: {DOCS_DIR}")
return []
return list(DOCS_DIR.rglob("*.mdx"))
def generate_surfsense_docs_content_hash(content: str) -> str:
"""Generate SHA-256 hash for Surfsense docs content."""
return hashlib.sha256(content.encode("utf-8")).hexdigest()
def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
"""
Create chunks from Surfsense documentation content.
Args:
content: Document content to chunk
Returns:
List of SurfsenseDocsChunk objects with embeddings
"""
return [
SurfsenseDocsChunk(
content=chunk.text,
embedding=embed_text(chunk.text),
)
for chunk in config.chunker_instance.chunk(content)
]
async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, int]:
"""
Index all Surfsense documentation files.
Args:
session: SQLAlchemy async session
Returns:
Tuple of (created, updated, skipped, deleted) counts
"""
created = 0
updated = 0
skipped = 0
deleted = 0
# Get all existing docs from database
existing_docs_result = await session.execute(
select(SurfsenseDocsDocument).options(
selectinload(SurfsenseDocsDocument.chunks)
)
)
existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()}
# Track which sources we've processed
processed_sources = set()
# Get all MDX files
mdx_files = get_all_mdx_files()
logger.info(f"Found {len(mdx_files)} MDX files to index")
for mdx_file in mdx_files:
try:
source = str(mdx_file.relative_to(DOCS_DIR))
processed_sources.add(source)
# Read file content
raw_content = mdx_file.read_text(encoding="utf-8")
title, content = parse_mdx_frontmatter(raw_content)
content_hash = generate_surfsense_docs_content_hash(raw_content)
if source in existing_docs:
existing_doc = existing_docs[source]
# Check if content changed
if existing_doc.content_hash == content_hash:
logger.debug(f"Skipping unchanged: {source}")
skipped += 1
continue
# Content changed - update document
logger.info(f"Updating changed document: {source}")
# Create new chunks
chunks = create_surfsense_docs_chunks(content)
# Update document fields
existing_doc.title = title
existing_doc.content = content
existing_doc.content_hash = content_hash
existing_doc.embedding = embed_text(content)
await _safe_set_docs_chunks(session, existing_doc, chunks)
existing_doc.updated_at = datetime.now(UTC)
updated += 1
else:
# New document - create it
logger.info(f"Creating new document: {source}")
chunks = create_surfsense_docs_chunks(content)
document = SurfsenseDocsDocument(
source=source,
title=title,
content=content,
content_hash=content_hash,
embedding=embed_text(content),
chunks=chunks,
updated_at=datetime.now(UTC),
)
session.add(document)
created += 1
except Exception as e:
logger.error(f"Error processing {mdx_file}: {e}", exc_info=True)
continue
# Delete documents for removed files
for source, doc in existing_docs.items():
if source not in processed_sources:
logger.info(f"Deleting removed document: {source}")
await session.delete(doc)
deleted += 1
# Commit all changes
await session.commit()
logger.info(
f"Indexing complete: {created} created, {updated} updated, "
f"{skipped} skipped, {deleted} deleted"
)
return created, updated, skipped, deleted
async def seed_surfsense_docs() -> tuple[int, int, int, int]:
"""
Seed Surfsense documentation into the database.
This function indexes all MDX files from the docs directory.
It handles creating, updating, and deleting docs based on content changes.
Returns:
Tuple of (created, updated, skipped, deleted) counts
Returns (0, 0, 0, 0) if an error occurs
"""
logger.info("Starting Surfsense docs indexing...")
try:
async with async_session_maker() as session:
created, updated, skipped, deleted = await index_surfsense_docs(session)
logger.info(
f"Surfsense docs indexing complete: "
f"created={created}, updated={updated}, skipped={skipped}, deleted={deleted}"
)
return created, updated, skipped, deleted
except Exception as e:
logger.error(f"Failed to seed Surfsense docs: {e}", exc_info=True)
return 0, 0, 0, 0

View file

@ -1,13 +0,0 @@
"""Utilities for SurfSense's built-in documentation index."""
from pathlib import PurePosixPath
DOCS_PUBLIC_ROOT = PurePosixPath("/docs")
def surfsense_docs_public_url(source: str) -> str:
"""Return the public docs route for an indexed documentation source path."""
docs_path = PurePosixPath(source).with_suffix("")
if docs_path.name == "index":
docs_path = docs_path.parent
return (DOCS_PUBLIC_ROOT / docs_path).as_posix()

View file

@ -1,40 +0,0 @@
#!/usr/bin/env python
"""
Seed Surfsense documentation into the database.
CLI wrapper for the seed_surfsense_docs function.
Can be run manually for debugging or re-indexing.
Usage:
python scripts/seed_surfsense_docs.py
"""
import asyncio
import sys
from pathlib import Path
# Add the parent directory to the path so we can import app modules
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
def main():
"""CLI entry point for seeding Surfsense docs."""
print("=" * 50)
print(" Surfsense Documentation Seeding")
print("=" * 50)
created, updated, skipped, deleted = asyncio.run(seed_surfsense_docs())
print()
print("Results:")
print(f" Created: {created}")
print(f" Updated: {updated}")
print(f" Skipped: {skipped}")
print(f" Deleted: {deleted}")
print("=" * 50)
if __name__ == "__main__":
main()

View file

@ -60,7 +60,6 @@ class TestReadOnlyToolsAllowed:
"glob",
"web_search",
"scrape_webpage",
"search_surfsense_docs",
"get_connected_accounts",
"write_todos",
"task",

View file

@ -22,12 +22,6 @@ from app.agents.new_chat.subagents.config import (
# ---------------------------------------------------------------------------
@tool
def search_surfsense_docs(query: str) -> str:
"""Search the user's KB."""
return ""
@tool
def web_search(query: str) -> str:
"""Search the public web."""
@ -95,7 +89,6 @@ def generate_report(topic: str) -> str:
ALL_TOOLS = [
search_surfsense_docs,
web_search,
scrape_webpage,
read_file,
@ -161,7 +154,7 @@ class TestReportWriterSubagent:
names = {t.name for t in spec["tools"]} # type: ignore[index]
assert names == REPORT_WRITER_TOOLS & {t.name for t in ALL_TOOLS}
assert "generate_report" in names
assert "search_surfsense_docs" in names
assert "read_file" in names
def test_deny_rules_block_writes_but_allow_generate_report(self) -> None:
spec = build_report_writer_subagent(tools=ALL_TOOLS)
@ -272,9 +265,9 @@ class TestFilterToolsWarningSuppression:
# Allowed set asks for two registry tools (one present, one
# not) plus a bunch of middleware-provided names.
_filter_tools(
[search_surfsense_docs],
[web_search],
allowed_names={
"search_surfsense_docs",
"web_search",
"scrape_webpage", # legitimately missing → should warn
"read_file", # mw-provided → suppressed
"ls",
@ -322,7 +315,6 @@ class TestDenyPatternsCoverage:
def test_deny_patterns_do_not_match_safe_read_tools(self) -> None:
canonical_reads = [
"search_surfsense_docs",
"read_file",
"ls_tree",
"grep",

View file

@ -25,7 +25,6 @@ from __future__ import annotations
import asyncio
import inspect
from dataclasses import dataclass
from typing import Any
from unittest.mock import AsyncMock, patch
@ -140,45 +139,28 @@ def test_orchestrators_are_async_generator_functions() -> None:
# ------------------------------------------------------------ initial thinking
@dataclass
class _FakeSurfsenseDoc:
"""Stand-in for ``SurfsenseDocsDocument`` with just the field we read."""
title: str
@pytest.mark.parametrize(
"user_query, image_urls, docs, expected_title, expected_action",
"user_query, image_urls, expected_title, expected_action",
[
("hello world", None, [], "Understanding your request", "Processing"),
("hello world", None, "Understanding your request", "Processing"),
(
"",
["data:image/png;base64,AAA"],
[],
"Understanding your request",
"Processing",
),
("", None, [], "Understanding your request", "Processing"),
(
"doc question",
None,
[_FakeSurfsenseDoc(title="My Doc")],
"Analyzing referenced content",
"Analyzing",
),
("", None, "Understanding your request", "Processing"),
],
)
def test_initial_thinking_step_branches(
user_query: str,
image_urls: list[str] | None,
docs: list[Any],
expected_title: str,
expected_action: str,
) -> None:
step = build_initial_thinking_step(
user_query=user_query,
user_image_data_urls=image_urls,
mentioned_surfsense_docs=docs, # type: ignore[arg-type]
)
assert step.step_id == "thinking-1"
assert step.title == expected_title
@ -191,7 +173,6 @@ def test_initial_thinking_step_truncates_long_query() -> None:
step = build_initial_thinking_step(
user_query=long_query,
user_image_data_urls=None,
mentioned_surfsense_docs=[],
)
# 80-char truncation + ellipsis, sandwiched after "Processing: ".
assert "..." in step.items[0]
@ -200,16 +181,6 @@ def test_initial_thinking_step_truncates_long_query() -> None:
assert payload.startswith("x" * 80) and payload.endswith("...")
def test_initial_thinking_step_collapses_many_doc_names() -> None:
docs = [_FakeSurfsenseDoc(title=f"Doc {i}") for i in range(5)]
step = build_initial_thinking_step(
user_query="q",
user_image_data_urls=None,
mentioned_surfsense_docs=docs, # type: ignore[arg-type]
)
assert "[5 docs]" in step.items[0]
# ------------------------------------------------------------ capability gate