From 49d675c065d4c896093aa64d6823dbccaeb34185 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 25 Jun 2026 15:26:51 +0200 Subject: [PATCH] web-search: register results on the citation registry (Channel B -> A) web_search now registers each result as a WEB_RESULT (locator {url}) and renders a block of [n] passages, returning Command(update={messages, citation_registry}) like search_knowledge_base. Collapse the duplicate research-subagent web_search into the shared tool and teach the prompts to cite web hits with [n]. --- .../prompts/tools/web_search/description.md | 5 +- .../subagents/builtins/research/agent.py | 11 +- .../builtins/research/system_prompt.md | 14 +- .../builtins/research/tools/__init__.py | 5 +- .../builtins/research/tools/index.py | 2 +- .../builtins/research/tools/web_search.py | 241 ------------------ .../agents/chat/shared/tools/web_search.py | 150 +++++++---- .../chat/shared/tools/test_web_search.py | 93 +++++++ 8 files changed, 218 insertions(+), 303 deletions(-) delete mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/web_search.py create mode 100644 surfsense_backend/tests/unit/agents/chat/shared/tools/test_web_search.py diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/description.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/description.md index df15a6284..aad604e47 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/description.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/tools/web_search/description.md @@ -4,7 +4,10 @@ facts, anything outside SurfSense docs and the workspace KB. Reach for it whenever freshness matters or you'd otherwise guess from memory. - Don't refuse with "I lack network access" — call the tool. + - Returns a `` block: each result is labelled `[n]`. Cite a + result by writing that `[n]` after the statement it supports (when + citations are enabled) — do not hand-write the URL as a markdown link. - If results are thin, say so and offer to refine the query. - Args: `query`, `top_k` (default 10, max 50). - Follow up with `scrape_webpage` on the best URL when snippets are too - shallow. Present sources with `[label](url)` markdown links. + shallow. diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/agent.py b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/agent.py index 9a694872b..e3c0ab9ae 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/agent.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/agent.py @@ -7,6 +7,9 @@ from typing import Any from langchain_core.language_models import BaseChatModel from langchain_core.tools import BaseTool +from app.agents.chat.multi_agent_chat.shared.middleware.citation_state import ( + build_citation_state_mw, +) from app.agents.chat.multi_agent_chat.subagents.shared.md_file_reader import ( read_md_file, ) @@ -31,6 +34,12 @@ def build_subagent( or "Handles research tasks for this workspace." ) system_prompt = read_md_file(__package__, "system_prompt").strip() + # web_search registers WEB_RESULT citations via Command(update=...); the + # citation-state middleware declares the channel so those [n] merge back up. + middleware_with_citations = { + **(middleware_stack or {}), + "citation_state": build_citation_state_mw(), + } return pack_subagent( name=NAME, description=description, @@ -39,5 +48,5 @@ def build_subagent( ruleset=RULESET, dependencies=dependencies, model=model, - middleware_stack=middleware_stack, + middleware_stack=middleware_with_citations, ) diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/system_prompt.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/system_prompt.md index 1b9ccaefa..3d90a4352 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/system_prompt.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/system_prompt.md @@ -17,6 +17,16 @@ Gather and synthesize evidence using SurfSense research tools with clear citatio - Never fabricate facts, citations, URLs, or quote text. + +`web_search` returns a `` block whose results are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. That `[n]` is the citation label. When a finding came from a specific result, append its `[n]` to that finding, copying the label **exactly** as shown. The caller relays these labels verbatim and the server resolves each one, so a wrong number silently breaks the citation. + +- Use the exact `[n]` shown next to the result you actually used; never renumber, guess, or invent a label. +- Before emitting an `[n]`, confirm that bracketed label appears in the `web_search` output this turn. If you can't see it, omit it. +- Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links. +- Several results behind one finding → each in its own brackets with nothing between: `[1][2]`. +- `scrape_webpage` returns raw page text with no `[n]` labels; a fact drawn only from a scrape carries no citation (report the URL in `evidence.sources` instead). + + - Do not execute connector mutations (email/calendar/docs/chat writes) or deliverable generation. @@ -47,6 +57,6 @@ Return **only** one JSON object (no markdown/prose): } Route-specific rules: -- `evidence.findings`: max 10 entries, each a single sentence stating one distinct fact. Do not paste raw paragraphs, scraped pages, or quote blocks. -- `evidence.sources`: max 10 URLs, one per finding when applicable. List each URL once. +- `evidence.findings`: max 10 entries, each a single sentence stating one distinct fact. Append the supporting `[n]` to each finding drawn from a `web_search` result. Do not paste raw paragraphs, scraped pages, or quote blocks. +- `evidence.sources`: max 10 URLs, one per finding when applicable. List each URL once. (Citations travel as `[n]`; `sources` is for transparency and for scrape-only facts that carry no `[n]`.) diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/__init__.py b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/__init__.py index 7234942b6..0c99bf222 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/__init__.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/__init__.py @@ -1,7 +1,8 @@ -"""Research-stage tools: web search and scrape.""" +"""Research-stage tools: web search (shared) and scrape.""" + +from app.agents.chat.shared.tools.web_search import create_web_search_tool from .scrape_webpage import create_scrape_webpage_tool -from .web_search import create_web_search_tool __all__ = [ "create_scrape_webpage_tool", diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/index.py b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/index.py index 1e823fafa..5fc2b5699 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/index.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/index.py @@ -7,9 +7,9 @@ from typing import Any from langchain_core.tools import BaseTool from app.agents.chat.multi_agent_chat.shared.permissions import Ruleset +from app.agents.chat.shared.tools.web_search import create_web_search_tool from .scrape_webpage import create_scrape_webpage_tool -from .web_search import create_web_search_tool NAME = "research" diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/web_search.py b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/web_search.py deleted file mode 100644 index 2fe6bd378..000000000 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/research/tools/web_search.py +++ /dev/null @@ -1,241 +0,0 @@ -"""Real-time web search: SearXNG plus configured live-search connectors (Tavily, Linkup, Baidu, etc.).""" - -import asyncio -import json -import time -from typing import Any - -from langchain_core.tools import StructuredTool -from pydantic import BaseModel, Field - -from app.db import shielded_async_session -from app.services.connector_service import ConnectorService -from app.utils.perf import get_perf_logger - -_LIVE_SEARCH_CONNECTORS: set[str] = { - "TAVILY_API", - "LINKUP_API", - "BAIDU_SEARCH_API", -} - -_LIVE_CONNECTOR_SPECS: dict[str, tuple[str, bool, bool, dict[str, Any]]] = { - "TAVILY_API": ("search_tavily", False, True, {}), - "LINKUP_API": ("search_linkup", False, False, {"mode": "standard"}), - "BAIDU_SEARCH_API": ("search_baidu", False, True, {}), -} - -_CONNECTOR_LABELS: dict[str, str] = { - "TAVILY_API": "Tavily", - "LINKUP_API": "Linkup", - "BAIDU_SEARCH_API": "Baidu", -} - - -class WebSearchInput(BaseModel): - """Input schema for the web_search tool.""" - - query: str = Field( - description="The search query to look up on the web. Use specific, descriptive terms.", - ) - top_k: int = Field( - default=10, - description="Number of results to retrieve (default: 10, max: 50).", - ) - - -def _format_web_results( - documents: list[dict[str, Any]], - *, - max_chars: int = 50_000, -) -> str: - """Format web search results into XML suitable for the LLM context.""" - if not documents: - return "No web search results found." - - parts: list[str] = [] - total_chars = 0 - - for doc in documents: - doc_info = doc.get("document") or {} - metadata = doc_info.get("metadata") or {} - title = doc_info.get("title") or "Web Result" - url = metadata.get("url") or "" - content = (doc.get("content") or "").strip() - source = metadata.get("document_type") or doc.get("source") or "WEB_SEARCH" - if not content: - continue - - metadata_json = json.dumps(metadata, ensure_ascii=False) - doc_xml = "\n".join( - [ - "", - "", - f" {source}", - f" <![CDATA[{title}]]>", - f" ", - f" ", - "", - "", - f" ", - "", - "", - "", - ] - ) - - if total_chars + len(doc_xml) > max_chars: - parts.append("") - break - - parts.append(doc_xml) - total_chars += len(doc_xml) - - return "\n".join(parts).strip() or "No web search results found." - - -async def _search_live_connector( - connector: str, - query: str, - search_space_id: int, - top_k: int, - semaphore: asyncio.Semaphore, -) -> list[dict[str, Any]]: - """Dispatch a single live-search connector (Tavily / Linkup / Baidu).""" - perf = get_perf_logger() - spec = _LIVE_CONNECTOR_SPECS.get(connector) - if spec is None: - return [] - - method_name, _includes_date_range, includes_top_k, extra_kwargs = spec - kwargs: dict[str, Any] = { - "user_query": query, - "search_space_id": search_space_id, - **extra_kwargs, - } - if includes_top_k: - kwargs["top_k"] = top_k - - try: - t0 = time.perf_counter() - async with semaphore, shielded_async_session() as session: - svc = ConnectorService(session, search_space_id) - _, chunks = await getattr(svc, method_name)(**kwargs) - perf.info( - "[web_search] connector=%s results=%d in %.3fs", - connector, - len(chunks), - time.perf_counter() - t0, - ) - return chunks - except Exception as e: - perf.warning("[web_search] connector=%s FAILED: %s", connector, e) - return [] - - -def create_web_search_tool( - search_space_id: int | None = None, - available_connectors: list[str] | None = None, -) -> StructuredTool: - """Factory for the ``web_search`` tool. - - Dispatches in parallel to the platform SearXNG instance and any - user-configured live-search connectors (Tavily, Linkup, Baidu). - """ - active_live_connectors: list[str] = [] - if available_connectors: - active_live_connectors = [ - c for c in available_connectors if c in _LIVE_SEARCH_CONNECTORS - ] - - engine_names = ["SearXNG (platform default)"] - engine_names.extend(_CONNECTOR_LABELS.get(c, c) for c in active_live_connectors) - engines_summary = ", ".join(engine_names) - - description = ( - "Search the web for real-time information. " - "Use this for current events, news, prices, weather, public facts, or any " - "question that requires up-to-date information from the internet.\n\n" - f"Active search engines: {engines_summary}.\n" - "All configured engines are queried in parallel and results are merged." - ) - - _search_space_id = search_space_id - _active_live = active_live_connectors - - async def _web_search_impl(query: str, top_k: int = 10) -> str: - from app.services import web_search_service - - perf = get_perf_logger() - t0 = time.perf_counter() - clamped_top_k = min(max(1, top_k), 50) - - semaphore = asyncio.Semaphore(4) - tasks: list[asyncio.Task[list[dict[str, Any]]]] = [] - - if web_search_service.is_available(): - - async def _searxng() -> list[dict[str, Any]]: - async with semaphore: - _result_obj, docs = await web_search_service.search( - query=query, - top_k=clamped_top_k, - ) - return docs - - tasks.append(asyncio.ensure_future(_searxng())) - - if _search_space_id is not None: - for connector in _active_live: - tasks.append( - asyncio.ensure_future( - _search_live_connector( - connector=connector, - query=query, - search_space_id=_search_space_id, - top_k=clamped_top_k, - semaphore=semaphore, - ) - ) - ) - - if not tasks: - return "Web search is not available — no search engines are configured." - - results_lists = await asyncio.gather(*tasks, return_exceptions=True) - - all_documents: list[dict[str, Any]] = [] - for result in results_lists: - if isinstance(result, BaseException): - perf.warning("[web_search] a search engine failed: %s", result) - continue - all_documents.extend(result) - - seen_urls: set[str] = set() - deduplicated: list[dict[str, Any]] = [] - for doc in all_documents: - url = ((doc.get("document") or {}).get("metadata") or {}).get("url", "") - if url and url in seen_urls: - continue - if url: - seen_urls.add(url) - deduplicated.append(doc) - - formatted = _format_web_results(deduplicated) - - perf.info( - "[web_search] query=%r engines=%d results=%d deduped=%d chars=%d in %.3fs", - query[:60], - len(tasks), - len(all_documents), - len(deduplicated), - len(formatted), - time.perf_counter() - t0, - ) - return formatted - - return StructuredTool( - name="web_search", - description=description, - coroutine=_web_search_impl, - args_schema=WebSearchInput, - ) diff --git a/surfsense_backend/app/agents/chat/shared/tools/web_search.py b/surfsense_backend/app/agents/chat/shared/tools/web_search.py index c67db541c..424225b30 100644 --- a/surfsense_backend/app/agents/chat/shared/tools/web_search.py +++ b/surfsense_backend/app/agents/chat/shared/tools/web_search.py @@ -4,20 +4,40 @@ Web search tool for the SurfSense agent. Provides a unified tool for real-time web searches that dispatches to all configured search engines: the platform SearXNG instance (always available) plus any user-configured live-search connectors (Tavily, Linkup, Baidu). + +Each result is registered into the conversation citation registry as a +``WEB_RESULT`` and rendered with a server-assigned ``[n]`` label, so the model +cites the web exactly like the knowledge base — one ``[n]`` spine, no special +web citation form. """ -import asyncio -import json -import time -from typing import Any +from __future__ import annotations -from langchain_core.tools import StructuredTool -from pydantic import BaseModel, Field +import asyncio +import time +from typing import TYPE_CHECKING, Annotated, Any +from urllib.parse import urlparse + +from langchain.tools import ToolRuntime +from langchain_core.messages import ToolMessage +from langchain_core.tools import BaseTool, StructuredTool +from langgraph.types import Command from app.db import shielded_async_session from app.services.connector_service import ConnectorService from app.utils.perf import get_perf_logger +if TYPE_CHECKING: + from app.agents.chat.multi_agent_chat.shared.document_render import ( + RenderableDocument, + ) + +# NOTE: imports from ``app.agents.chat.multi_agent_chat`` are done lazily inside +# the functions below. This module lives under ``app.agents.chat.shared`` but is +# imported during the ``multi_agent_chat`` package's own init cascade (via the +# research subagent); importing that package at module load would re-enter a +# partially-initialized module. Lazy imports break that cycle. + _LIVE_SEARCH_CONNECTORS: set[str] = { "TAVILY_API", "LINKUP_API", @@ -37,28 +57,29 @@ _CONNECTOR_LABELS: dict[str, str] = { } -class WebSearchInput(BaseModel): - """Input schema for the web_search tool.""" - - query: str = Field( - description="The search query to look up on the web. Use specific, descriptive terms.", - ) - top_k: int = Field( - default=10, - description="Number of results to retrieve (default: 10, max: 50).", - ) +def _web_source_label(url: str) -> str: + """A compact, human-readable source for the ```` attr.""" + domain = urlparse(url).netloc.removeprefix("www.") if url else "" + return f"Web · {domain}" if domain else "Web" -def _format_web_results( +def _to_renderable_web_documents( documents: list[dict[str, Any]], *, max_chars: int = 50_000, -) -> str: - """Format web search results into XML suitable for the LLM context.""" - if not documents: - return "No web search results found." +) -> list[RenderableDocument]: + """Map raw web results to renderable documents, one passage (the snippet) each. - parts: list[str] = [] + A result with no URL is skipped: ``url`` is the citation locator, so without + it the result cannot be registered or resolved. + """ + from app.agents.chat.multi_agent_chat.shared.citations import CitationSourceType + from app.agents.chat.multi_agent_chat.shared.document_render import ( + RenderableDocument, + RenderablePassage, + ) + + renderables: list[RenderableDocument] = [] total_chars = 0 for doc in documents: @@ -67,36 +88,28 @@ def _format_web_results( title = doc_info.get("title") or "Web Result" url = metadata.get("url") or "" content = (doc.get("content") or "").strip() - source = metadata.get("document_type") or doc.get("source") or "WEB_SEARCH" - if not content: + if not content or not url: continue - metadata_json = json.dumps(metadata, ensure_ascii=False) - doc_xml = "\n".join( - [ - "", - "", - f" {source}", - f" <![CDATA[{title}]]>", - f" ", - f" ", - "", - "", - f" ", - "", - "", - "", - ] - ) - - if total_chars + len(doc_xml) > max_chars: - parts.append("") + total_chars += len(content) + if total_chars > max_chars: break - parts.append(doc_xml) - total_chars += len(doc_xml) + renderables.append( + RenderableDocument( + title=title, + source=_web_source_label(url), + passages=[ + RenderablePassage( + content=content, + locator={"url": url}, + source_type=CitationSourceType.WEB_RESULT, + ) + ], + ) + ) - return "\n".join(parts).strip() or "No web search results found." + return renderables async def _search_live_connector( @@ -141,7 +154,7 @@ async def _search_live_connector( def create_web_search_tool( search_space_id: int | None = None, available_connectors: list[str] | None = None, -) -> StructuredTool: +) -> BaseTool: """Factory for the ``web_search`` tool. Dispatches in parallel to the platform SearXNG instance and any @@ -168,7 +181,17 @@ def create_web_search_tool( _search_space_id = search_space_id _active_live = active_live_connectors - async def _web_search_impl(query: str, top_k: int = 10) -> str: + async def _web_search_impl( + query: Annotated[ + str, + "The search query to look up on the web. Use specific, descriptive terms.", + ], + runtime: ToolRuntime, + top_k: Annotated[ + int, + "Number of results to retrieve (default: 10, max: 50).", + ] = 10, + ) -> Command | str: from app.services import web_search_service perf = get_perf_logger() @@ -226,22 +249,39 @@ def create_web_search_tool( seen_urls.add(url) deduplicated.append(doc) - formatted = _format_web_results(deduplicated) + from app.agents.chat.multi_agent_chat.shared.citations import load_registry + from app.agents.chat.multi_agent_chat.shared.document_render import ( + render_web_results, + ) + + registry = load_registry(getattr(runtime, "state", None)) + renderables = _to_renderable_web_documents(deduplicated) + rendered = render_web_results(renderables, registry) perf.info( - "[web_search] query=%r engines=%d results=%d deduped=%d chars=%d in %.3fs", + "[web_search] query=%r engines=%d results=%d deduped=%d renderable=%d in %.3fs", query[:60], len(tasks), len(all_documents), len(deduplicated), - len(formatted), + len(renderables), time.perf_counter() - t0, ) - return formatted - return StructuredTool( + if rendered is None: + return "No web search results found." + + return Command( + update={ + "messages": [ + ToolMessage(content=rendered, tool_call_id=runtime.tool_call_id) + ], + "citation_registry": registry, + } + ) + + return StructuredTool.from_function( name="web_search", description=description, coroutine=_web_search_impl, - args_schema=WebSearchInput, ) diff --git a/surfsense_backend/tests/unit/agents/chat/shared/tools/test_web_search.py b/surfsense_backend/tests/unit/agents/chat/shared/tools/test_web_search.py new file mode 100644 index 000000000..7137bfdfc --- /dev/null +++ b/surfsense_backend/tests/unit/agents/chat/shared/tools/test_web_search.py @@ -0,0 +1,93 @@ +"""Tests for the shared ``web_search`` tool's citable-result adaptation. + +The tool's network path (SearXNG + live connectors) is out of scope here; these +cover the pure mapping from raw web results to renderable, citable documents and +the end-to-end registration of ``WEB_RESULT`` ``[n]`` labels. +""" + +from __future__ import annotations + +import pytest + +from app.agents.chat.multi_agent_chat.shared.citations import ( + CitationRegistry, + CitationSourceType, +) +from app.agents.chat.multi_agent_chat.shared.document_render import render_web_results +from app.agents.chat.shared.tools.web_search import ( + _to_renderable_web_documents, + _web_source_label, +) + +pytestmark = pytest.mark.unit + + +def _raw_result(url: str, title: str, content: str) -> dict: + return { + "document": {"title": title, "metadata": {"url": url}}, + "content": content, + } + + +def test_web_source_label_strips_scheme_and_www() -> None: + assert _web_source_label("https://www.example.com/path") == "Web · example.com" + assert _web_source_label("http://news.site.org/a/b") == "Web · news.site.org" + assert _web_source_label("") == "Web" + + +def test_adapter_maps_each_result_to_one_web_passage() -> None: + docs = _to_renderable_web_documents( + [ + _raw_result("https://a.com/x", "Alpha", "alpha body"), + _raw_result("https://b.com/y", "Beta", "beta body"), + ] + ) + + assert [d.title for d in docs] == ["Alpha", "Beta"] + passages = [p for d in docs for p in d.passages] + assert all(p.source_type is CitationSourceType.WEB_RESULT for p in passages) + assert passages[0].locator == {"url": "https://a.com/x"} + assert passages[0].content == "alpha body" + + +def test_adapter_skips_results_without_url_or_content() -> None: + docs = _to_renderable_web_documents( + [ + _raw_result("", "No URL", "has content"), + _raw_result("https://c.com/z", "Empty", " "), + _raw_result("https://d.com/w", "Good", "real content"), + ] + ) + + assert [d.title for d in docs] == ["Good"] + + +def test_adapter_truncates_on_char_budget() -> None: + big = "x" * 30 + docs = _to_renderable_web_documents( + [ + _raw_result("https://a.com", "A", big), + _raw_result("https://b.com", "B", big), + _raw_result("https://c.com", "C", big), + ], + max_chars=50, + ) + + # First fits (30), second crosses 50 and stops the loop. + assert [d.title for d in docs] == ["A"] + + +def test_end_to_end_registers_web_results_for_citation() -> None: + registry = CitationRegistry() + docs = _to_renderable_web_documents( + [_raw_result("https://example.com/a", "Example", "the answer is 42")] + ) + + block = render_web_results(docs, registry) + + assert block is not None + assert "[1] the answer is 42" in block + entry = registry.resolve(1) + assert entry is not None + assert entry.source_type is CitationSourceType.WEB_RESULT + assert entry.locator == {"url": "https://example.com/a"}