mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-31 19:45:15 +02:00
refactor: remove search_surfsense_docs tool and related references
- Deleted the `search_surfsense_docs` tool and its associated files, streamlining the agent's toolset.
- Updated various components and prompts to remove references to the now-removed tool, ensuring consistency across the codebase.
- Adjusted documentation to direct users to the SurfSense documentation link for product-related queries instead.
This commit is contained in:
parent
9b9e6828c7
commit
40ca9e6ed2
71 changed files with 232 additions and 1676 deletions
|
|
@ -104,7 +104,7 @@ class AgentFeatureFlags:
|
|||
# ``tools/google_drive``, ``tools/dropbox``, ``tools/onedrive``,
|
||||
# ``tools/google_calendar``, ``tools/confluence``, ``tools/discord``,
|
||||
# ``tools/teams``, ``tools/luma``, ``connected_accounts``,
|
||||
# ``update_memory``, ``search_surfsense_docs``) now acquire fresh
|
||||
# ``update_memory``) now acquire fresh
|
||||
# short-lived ``AsyncSession`` instances per call via
|
||||
# :data:`async_session_maker`. The factory still accepts ``db_session``
|
||||
# for registry compatibility but ``del``'s it immediately — see any
|
||||
|
|
|
|||
|
|
@ -73,9 +73,8 @@ class ResolvedMentionSet:
|
|||
``@Project Roadmap`` is never shadowed by a shorter prefix
|
||||
``@Project``).
|
||||
|
||||
``mentioned_document_ids`` collapses doc + surfsense_doc chips into
|
||||
a single ordered, deduped list because the priority middleware
|
||||
treats them uniformly downstream — see
|
||||
``mentioned_document_ids`` is an ordered, deduped list consumed by
|
||||
the priority middleware downstream — see
|
||||
``KnowledgePriorityMiddleware._compute_priority_paths``.
|
||||
"""
|
||||
|
||||
|
|
@ -103,7 +102,6 @@ async def resolve_mentions(
|
|||
search_space_id: int,
|
||||
mentioned_documents: list[MentionedDocumentInfo] | None,
|
||||
mentioned_document_ids: list[int] | None = None,
|
||||
mentioned_surfsense_doc_ids: list[int] | None = None,
|
||||
mentioned_folder_ids: list[int] | None = None,
|
||||
) -> ResolvedMentionSet:
|
||||
"""Resolve every @-mention chip on a turn into virtual paths.
|
||||
|
|
@ -111,8 +109,7 @@ async def resolve_mentions(
|
|||
The function takes both the ``mentioned_documents`` discriminated
|
||||
list (chip metadata used for substitution + persistence) and the
|
||||
parallel id arrays (``mentioned_document_ids``,
|
||||
``mentioned_surfsense_doc_ids``, ``mentioned_folder_ids``) for two
|
||||
reasons:
|
||||
``mentioned_folder_ids``) for two reasons:
|
||||
|
||||
* Legacy clients that haven't migrated to the unified chip list
|
||||
still send the id arrays — we treat the union as authoritative.
|
||||
|
|
@ -142,7 +139,6 @@ async def resolve_mentions(
|
|||
dict.fromkeys(
|
||||
[
|
||||
*(mentioned_document_ids or []),
|
||||
*(mentioned_surfsense_doc_ids or []),
|
||||
*chip_doc_ids,
|
||||
]
|
||||
)
|
||||
|
|
|
|||
|
|
@ -59,14 +59,13 @@ Do NOT cite document_id. Always use the chunk id.
|
|||
- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
|
||||
- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
|
||||
- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
|
||||
- Copy the EXACT chunk id from the XML - if it says `<chunk id='doc-123'>`, use [citation:doc-123]
|
||||
- Copy the EXACT chunk id from the XML - if it says `<chunk id='5'>`, use [citation:5]
|
||||
- If the chunk id is a URL like `<chunk id='https://example.com/page'>`, use [citation:https://example.com/page]
|
||||
</citation_format>
|
||||
|
||||
<citation_examples>
|
||||
CORRECT citation formats:
|
||||
- [citation:5] (numeric chunk ID from knowledge base)
|
||||
- [citation:doc-123] (for Surfsense documentation chunks)
|
||||
- [citation:https://example.com/article] (URL chunk ID from web search results)
|
||||
- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3] (multiple citations)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
|
|||
2. Ask the user: "Would you like me to answer from my general knowledge instead?"
|
||||
3. ONLY provide a general-knowledge answer AFTER the user explicitly says yes.
|
||||
- This policy does NOT apply to:
|
||||
* Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?")
|
||||
* Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?"). For "how do I use SurfSense" / product-documentation questions, point the user to https://www.surfsense.com/docs.
|
||||
* Formatting, summarization, or analysis of content already present in the conversation
|
||||
* Following user instructions that are clearly task-oriented (e.g., "rewrite this in bullet points")
|
||||
* Tool-usage actions like generating reports, podcasts, images, or scraping webpages
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
|
|||
2. Ask: "Would you like me to answer from my general knowledge instead?"
|
||||
3. ONLY provide a general-knowledge answer AFTER a team member explicitly says yes.
|
||||
- This policy does NOT apply to:
|
||||
* Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?")
|
||||
* Casual conversation, greetings, or meta-questions about SurfSense itself (e.g., "what can you do?"). For "how do I use SurfSense" / product-documentation questions, point the user to https://www.surfsense.com/docs.
|
||||
* Formatting, summarization, or analysis of content already present in the conversation
|
||||
* Following user instructions that are clearly task-oriented (e.g., "rewrite this in bullet points")
|
||||
* Tool-usage actions like generating reports, podcasts, images, or scraping webpages
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ When to use which tool:
|
|||
- Knowledge base content (Notion, GitHub, files, notes) → automatically searched
|
||||
- Real-time public web data → call web_search
|
||||
- Reading a specific webpage → call scrape_webpage
|
||||
- SurfSense product / how-to questions (setup, configuration, connectors, feature behavior) → point the user to the documentation: https://www.surfsense.com/docs
|
||||
|
||||
**`task` subagents (when to delegate):**
|
||||
- **`linear_specialist`** — Linear-only investigations and tool use.
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ When to use which tool:
|
|||
- Knowledge base content (Notion, GitHub, files, notes) → automatically searched
|
||||
- Real-time public web data → call web_search
|
||||
- Reading a specific webpage → call scrape_webpage
|
||||
- SurfSense product / how-to questions (setup, configuration, connectors, feature behavior) → point the user to the documentation: https://www.surfsense.com/docs
|
||||
|
||||
**`task` subagents (when to delegate):**
|
||||
- **`linear_specialist`** — Linear-only investigations and tool use.
|
||||
|
|
|
|||
|
|
@ -151,7 +151,6 @@ def _read_fragment(subpath: str) -> str:
|
|||
# Ordered for reading flow: fundamentals first, then artifact generators,
|
||||
# then memory at the end (mirrors the legacy ``_ALL_TOOL_NAMES_ORDERED``).
|
||||
ALL_TOOL_NAMES_ORDERED: tuple[str, ...] = (
|
||||
"search_surfsense_docs",
|
||||
"web_search",
|
||||
"generate_podcast",
|
||||
"generate_video_presentation",
|
||||
|
|
|
|||
|
|
@ -1,9 +0,0 @@
|
|||
|
||||
- User: "How do I install SurfSense?"
|
||||
- Call: `search_surfsense_docs(query="installation setup")`
|
||||
- User: "What connectors does SurfSense support?"
|
||||
- Call: `search_surfsense_docs(query="available connectors integrations")`
|
||||
- User: "How do I set up the Notion connector?"
|
||||
- Call: `search_surfsense_docs(query="Notion connector setup configuration")`
|
||||
- User: "How do I use Docker to run SurfSense?"
|
||||
- Call: `search_surfsense_docs(query="Docker installation setup")`
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
|
||||
- search_surfsense_docs: Search the official SurfSense documentation.
|
||||
- Use this tool when the user asks anything about SurfSense itself (the application they are using).
|
||||
- Args:
|
||||
- query: The search query about SurfSense
|
||||
- top_k: Number of documentation chunks to retrieve (default: 10)
|
||||
- Returns: Documentation content with chunk IDs for citations (prefixed with 'doc-', e.g., [citation:doc-123])
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
name: email-drafting
|
||||
description: Draft an email matching the user's voice, with structured intent and CTA
|
||||
allowed-tools: search_surfsense_docs
|
||||
---
|
||||
|
||||
# Email drafting
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
name: kb-research
|
||||
description: Structured approach to finding and synthesizing information from the user's knowledge base
|
||||
allowed-tools: search_surfsense_docs, scrape_webpage, read_file, ls_tree, grep, web_search
|
||||
allowed-tools: scrape_webpage, read_file, ls_tree, grep, web_search
|
||||
---
|
||||
|
||||
# Knowledge-base research
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
name: meeting-prep
|
||||
description: Pull together briefing materials before a scheduled meeting
|
||||
allowed-tools: search_surfsense_docs, web_search, scrape_webpage, read_file
|
||||
allowed-tools: web_search, scrape_webpage, read_file
|
||||
---
|
||||
|
||||
# Meeting preparation
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
name: report-writing
|
||||
description: How to scope, draft, and revise a Markdown report artifact via generate_report
|
||||
allowed-tools: generate_report, search_surfsense_docs, read_file
|
||||
allowed-tools: generate_report, read_file
|
||||
---
|
||||
|
||||
# Report writing
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
name: slack-summary
|
||||
description: Distill a Slack channel or thread into actionable summary
|
||||
allowed-tools: search_surfsense_docs
|
||||
---
|
||||
|
||||
# Slack summarization
|
||||
|
|
|
|||
|
|
@ -46,7 +46,6 @@ logger = logging.getLogger(__name__)
|
|||
# ``glob``, ``grep``) plus the SurfSense-side read tools.
|
||||
EXPLORE_READ_TOOLS: frozenset[str] = frozenset(
|
||||
{
|
||||
"search_surfsense_docs",
|
||||
"web_search",
|
||||
"scrape_webpage",
|
||||
"read_file",
|
||||
|
|
@ -61,7 +60,6 @@ EXPLORE_READ_TOOLS: frozenset[str] = frozenset(
|
|||
# is needed, the parent should hand off to ``explore`` first.
|
||||
REPORT_WRITER_TOOLS: frozenset[str] = frozenset(
|
||||
{
|
||||
"search_surfsense_docs",
|
||||
"read_file",
|
||||
"generate_report",
|
||||
}
|
||||
|
|
@ -222,7 +220,6 @@ EXPLORE_SYSTEM_PROMPT = """You are the **explore** subagent for SurfSense.
|
|||
Conduct read-only research across the user's knowledge base, the web, and any documents the parent agent has surfaced. Return a synthesized answer with explicit citations — never speculate beyond the sources you have actually inspected.
|
||||
|
||||
## Tools available
|
||||
- `search_surfsense_docs` — fast hybrid search over the user's knowledge base.
|
||||
- `web_search` — only when the user's KB clearly does not contain the answer.
|
||||
- `scrape_webpage` — to read a URL the user or the search results provided.
|
||||
- `read_file`, `ls`, `glob`, `grep` — to inspect specific documents or trees the parent has flagged.
|
||||
|
|
@ -242,7 +239,7 @@ Produce a single high-quality report deliverable using `generate_report`. The pa
|
|||
|
||||
## Workflow
|
||||
1. **Outline first.** Before calling `generate_report`, write a one-paragraph outline of the sections you plan to produce. Confirm the outline reflects the parent's instructions.
|
||||
2. **Source resolution.** Decide whether to call `search_surfsense_docs` and `read_file` for any final-checks, or whether the parent's earlier tool calls already cover the source set.
|
||||
2. **Source resolution.** Decide whether to call `read_file` for any final-checks, or whether the parent's earlier tool calls already cover the source set.
|
||||
3. **One report.** Call `generate_report` exactly once with `source_strategy` chosen per the topic and chat history (see the `report-writing` skill).
|
||||
4. **Confirm.** End with a one-sentence summary in your final message — never paste the report back into chat; the artifact card renders itself.
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ This module contains all the tools available to the SurfSense agent.
|
|||
To add a new tool, see the documentation in registry.py.
|
||||
|
||||
Available tools:
|
||||
- search_surfsense_docs: Search Surfsense documentation for usage help
|
||||
- generate_podcast: Generate audio podcasts from content
|
||||
- generate_video_presentation: Generate video presentations with slides and narration
|
||||
- generate_image: Generate images from text descriptions using AI models
|
||||
|
|
@ -31,7 +30,6 @@ from .registry import (
|
|||
get_tool_by_name,
|
||||
)
|
||||
from .scrape_webpage import create_scrape_webpage_tool
|
||||
from .search_surfsense_docs import create_search_surfsense_docs_tool
|
||||
from .update_memory import create_update_memory_tool, create_update_team_memory_tool
|
||||
from .video_presentation import create_generate_video_presentation_tool
|
||||
|
||||
|
|
@ -47,7 +45,6 @@ __all__ = [
|
|||
"create_generate_podcast_tool",
|
||||
"create_generate_video_presentation_tool",
|
||||
"create_scrape_webpage_tool",
|
||||
"create_search_surfsense_docs_tool",
|
||||
"create_update_memory_tool",
|
||||
"create_update_team_memory_tool",
|
||||
"format_documents_for_context",
|
||||
|
|
|
|||
|
|
@ -101,7 +101,6 @@ from .podcast import create_generate_podcast_tool
|
|||
from .report import create_generate_report_tool
|
||||
from .resume import create_generate_resume_tool
|
||||
from .scrape_webpage import create_scrape_webpage_tool
|
||||
from .search_surfsense_docs import create_search_surfsense_docs_tool
|
||||
from .teams import (
|
||||
create_list_teams_channels_tool,
|
||||
create_read_teams_messages_tool,
|
||||
|
|
@ -258,15 +257,6 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
|
|||
),
|
||||
requires=[],
|
||||
),
|
||||
# Surfsense documentation search tool
|
||||
ToolDefinition(
|
||||
name="search_surfsense_docs",
|
||||
description="Search Surfsense documentation for help with using the application",
|
||||
factory=lambda deps: create_search_surfsense_docs_tool(
|
||||
db_session=deps["db_session"],
|
||||
),
|
||||
requires=["db_session"],
|
||||
),
|
||||
# =========================================================================
|
||||
# SERVICE ACCOUNT DISCOVERY
|
||||
# Generic tool for the LLM to discover connected accounts and resolve
|
||||
|
|
|
|||
|
|
@ -1,174 +0,0 @@
|
|||
"""
|
||||
Surfsense documentation search tool.
|
||||
|
||||
This tool allows the agent to search the pre-indexed Surfsense documentation
|
||||
to help users with questions about how to use the application.
|
||||
|
||||
The documentation is indexed at deployment time from MDX files and stored
|
||||
in dedicated tables (surfsense_docs_documents, surfsense_docs_chunks).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
from langchain_core.tools import tool
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
|
||||
from app.utils.document_converters import embed_text
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
|
||||
|
||||
def format_surfsense_docs_results(results: list[tuple]) -> str:
    """
    Format search results into XML structure for the LLM context.

    Chunks are grouped under their parent document (first-seen order is
    preserved) and every chunk ID / document ID is emitted with a ``doc-``
    prefix. This allows:
    - LLM to use consistent [citation:doc-XXX] format
    - Frontend to detect 'doc-' prefix and route to surfsense docs endpoint

    Free-text fields (title, URL, metadata JSON, chunk content) are wrapped
    in CDATA sections. Any literal ``]]>`` inside such a field is split
    across two adjacent CDATA sections so it cannot terminate the section
    early and corrupt the surrounding structure.

    Args:
        results: List of (chunk, document) tuples from the database query

    Returns:
        Formatted XML string with documentation content and citation-ready
        chunks, or a fixed "nothing found" sentence when ``results`` is empty.
    """
    if not results:
        return "No relevant Surfsense documentation found for your query."

    def cdata(text: object) -> str:
        # "]]>" is the only sequence that may not appear inside CDATA; the
        # standard workaround is to close the section after "]]" and reopen
        # it before ">".
        return "<![CDATA[" + str(text).replace("]]>", "]]]]><![CDATA[>") + "]]>"

    # Group chunks by document (dicts keep insertion order, so documents are
    # rendered in the order their first chunk was returned).
    grouped: dict[int, dict] = {}
    for chunk, doc in results:
        entry = grouped.get(doc.id)
        if entry is None:
            # Resolve the public URL once per document, not once per chunk.
            public_url = surfsense_docs_public_url(doc.source)
            entry = grouped[doc.id] = {
                "document_id": f"doc-{doc.id}",
                "document_type": "SURFSENSE_DOCS",
                "title": doc.title,
                "url": public_url,
                "metadata": {"source": doc.source, "public_url": public_url},
                "chunks": [],
            }
        entry["chunks"].append(
            {
                "chunk_id": f"doc-{chunk.id}",
                "content": chunk.content,
            }
        )

    # Render one <document> element per group.
    parts: list[str] = []
    for g in grouped.values():
        metadata_json = json.dumps(g["metadata"], ensure_ascii=False)

        parts.extend(
            (
                "<document>",
                "<document_metadata>",
                f" <document_id>{g['document_id']}</document_id>",
                f" <document_type>{g['document_type']}</document_type>",
                f" <title>{cdata(g['title'])}</title>",
                f" <url>{cdata(g['url'])}</url>",
                f" <metadata_json>{cdata(metadata_json)}</metadata_json>",
                "</document_metadata>",
                "",
                "<document_content>",
            )
        )
        parts.extend(
            f" <chunk id='{ch['chunk_id']}'>{cdata(ch['content'])}</chunk>"
            for ch in g["chunks"]
        )
        # Trailing "" yields a blank line between documents; the final one
        # is removed by strip() below.
        parts.extend(("</document_content>", "</document>", ""))

    return "\n".join(parts).strip()
|
||||
|
||||
|
||||
async def search_surfsense_docs_async(
    query: str,
    db_session: AsyncSession,
    top_k: int = 10,
) -> str:
    """
    Search Surfsense documentation using vector similarity.

    The query is embedded off the event loop, then documentation chunks are
    ranked by the ``<=>`` distance between their stored embedding and the
    query embedding, and the closest ``top_k`` are formatted for the LLM.

    Args:
        query: The search query about Surfsense usage
        db_session: Database session for executing queries
        top_k: Number of results to return. Values below 1 return the
            "no results" message without touching the embedder or database.

    Returns:
        Formatted string with relevant documentation content
    """
    # ``top_k`` and ``query`` originate from model-generated tool arguments.
    # A negative LIMIT is rejected by PostgreSQL and a zero LIMIT / blank
    # query cannot produce useful rows, so answer with the standard empty
    # result instead of paying for an embedding call and a failing query.
    if top_k < 1 or not query.strip():
        return format_surfsense_docs_results([])

    # embed_text is synchronous; run it in a worker thread so the event loop
    # is not blocked while the embedding is computed.
    query_embedding = await asyncio.to_thread(embed_text, query)

    # Nearest-neighbour search on chunks, joined with their parent documents.
    # NOTE(review): ``<=>`` is presumably pgvector's cosine-distance operator
    # (smaller is closer) — confirm against the column type.
    distance = SurfsenseDocsChunk.embedding.op("<=>")(query_embedding)
    stmt = (
        select(SurfsenseDocsChunk, SurfsenseDocsDocument)
        .join(
            SurfsenseDocsDocument,
            SurfsenseDocsChunk.document_id == SurfsenseDocsDocument.id,
        )
        .order_by(distance)
        .limit(top_k)
    )

    result = await db_session.execute(stmt)
    rows = result.all()

    return format_surfsense_docs_results(rows)
|
||||
|
||||
|
||||
def create_search_surfsense_docs_tool(db_session: AsyncSession):
    """
    Build the ``search_surfsense_docs`` agent tool.

    The returned tool opens a fresh, short-lived ``AsyncSession`` from
    :data:`async_session_maker` on every invocation. Because no per-request
    session is captured in the closure, the same tool object can be reused
    across HTTP requests by the compiled-agent cache without ever touching a
    stale or closed session.

    Args:
        db_session: Accepted only so the factory matches the registry's
            calling convention; it is discarded immediately.

    Returns:
        A configured tool function for searching Surfsense documentation
    """
    # Drop the request-scoped session right away so it cannot be captured.
    del db_session

    @tool
    async def search_surfsense_docs(query: str, top_k: int = 10) -> str:
        """
        Search Surfsense documentation for help with using the application.

        Use this tool when the user asks questions about:
        - How to use Surfsense features
        - Installation and setup instructions
        - Configuration options and settings
        - Troubleshooting common issues
        - Available connectors and integrations
        - Browser extension usage
        - API documentation

        This searches the official Surfsense documentation that was indexed
        at deployment time. It does NOT search the user's personal knowledge base.

        Args:
            query: The search query about Surfsense usage or features
            top_k: Number of documentation chunks to retrieve (default: 10)

        Returns:
            Relevant documentation content formatted with chunk IDs for citations
        """
        # One session per call, closed as soon as the search finishes.
        async with async_session_maker() as session:
            formatted = await search_surfsense_docs_async(
                query=query,
                db_session=session,
                top_k=top_k,
            )
        return formatted

    return search_surfsense_docs
|
||||
Loading…
Add table
Add a link
Reference in a new issue