citations: consolidate prompts, retire eager path, refresh ADR

Rewrite the main-agent citation contract to a single [n] channel and sync the orphaned system_prompt_composer surface to match; drop stale [citation:chunk_id] / <chunk_index> references from dynamic_context and provider hints. Reuse the shared hybrid search in the deliverables report (citations omitted for now) and delete the orphaned report KB helper. Remove the dead eager KnowledgePriorityMiddleware wiring (knowledge_priority + stack) and its legacy browse test. Update ADR 0001 to reflect the cutover.
2026-06-26 21:39:43 +02:00 · 2026-06-25 15:27:09 +02:00 · 2026-06-25 15:27:09 +02:00 · ce15016533
commit ce15016533
parent 49d675c065
20 changed files with 316 additions and 1127 deletions
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/knowledge_priority.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/knowledge_priority.py
@@ -1,42 +0,0 @@
-"""KB priority planner: <priority_documents> injection."""
-
-from __future__ import annotations
-
-from langchain_core.language_models import BaseChatModel
-
-from app.agents.chat.multi_agent_chat.shared.filesystem_selection import FilesystemMode
-from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import (
-    KnowledgePriorityMiddleware,
-)
-from app.services.llm_service import get_planner_llm
-
-
-def build_knowledge_priority_mw(
-    *,
-    llm: BaseChatModel,
-    search_space_id: int,
-    filesystem_mode: FilesystemMode,
-    available_connectors: list[str] | None,
-    available_document_types: list[str] | None,
-    mentioned_document_ids: list[int] | None,
-    preinjection_enabled: bool = True,
-) -> KnowledgePriorityMiddleware:
-    """Build the KB priority middleware.
-
-    When ``preinjection_enabled`` is False (the lazy default), the middleware
-    runs in mentions-only mode: it skips the expensive planner LLM + embedding
-    + hybrid search and only surfaces explicit @-mentions. The main agent is
-    expected to pull relevant KB content on demand via the
-    ``search_knowledge_base`` tool instead.
-    """
-    return KnowledgePriorityMiddleware(
-        llm=llm,
-        planner_llm=get_planner_llm(),
-        search_space_id=search_space_id,
-        filesystem_mode=filesystem_mode,
-        available_connectors=available_connectors,
-        available_document_types=available_document_types,
-        mentioned_document_ids=mentioned_document_ids,
-        inject_system_message=False,
-        mentions_only=not preinjection_enabled,
-    )
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/stack.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/stack.py
@@ -1,10 +1,11 @@
 """Main-agent middleware list assembly: one line per slot.

 The main agent is a pure router — filesystem reads/writes are owned by the
-``knowledge_base`` subagent and delegated via the ``task`` tool. The stack
-here only renders KB context (workspace tree + priority docs), projects it
-into system messages, and commits any subagent-side staged writes at end of
-turn (cloud mode).
+``knowledge_base`` subagent and delegated via the ``task`` tool. Knowledge-base
+retrieval is pull-based: the ``search_knowledge_base`` tool runs the hybrid
+search on demand and renders ``<retrieved_context>`` with ``[n]`` citation
+labels. The stack here computes the workspace tree, commits any subagent-side
+staged writes at end of turn (cloud mode), and wires the supporting middleware.
 """

 from __future__ import annotations
@@ -33,9 +34,6 @@ from app.agents.chat.multi_agent_chat.shared.middleware.anthropic_cache import (
 from app.agents.chat.multi_agent_chat.shared.middleware.compaction import (
    build_compaction_mw,
 )
-from app.agents.chat.multi_agent_chat.shared.middleware.kb_context_projection import (
-    build_kb_context_projection_mw,
-)
 from app.agents.chat.multi_agent_chat.shared.middleware.patch_tool_calls import (
    build_patch_tool_calls_mw,
 )
@@ -84,7 +82,6 @@ from .context_editing import build_context_editing_mw
 from .dedup_hitl import build_dedup_hitl_mw
 from .doom_loop import build_doom_loop_mw
 from .kb_persistence import build_kb_persistence_mw
-from .knowledge_priority import build_knowledge_priority_mw
 from .knowledge_tree import build_knowledge_tree_mw
 from .noop_injection import build_noop_injection_mw
 from .otel_span import build_otel_mw
@@ -237,16 +234,6 @@ def build_main_agent_deepagent_middleware(
            search_space_id=search_space_id,
            llm=llm,
        ),
-        build_knowledge_priority_mw(
-            llm=llm,
-            search_space_id=search_space_id,
-            filesystem_mode=filesystem_mode,
-            available_connectors=available_connectors,
-            available_document_types=available_document_types,
-            mentioned_document_ids=mentioned_document_ids,
-            preinjection_enabled=flags.enable_kb_priority_preinjection,
-        ),
-        build_kb_context_projection_mw(),
        build_kb_persistence_mw(
            filesystem_mode=filesystem_mode,
            search_space_id=search_space_id,
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/skills/builtin/kb-research/SKILL.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/skills/builtin/kb-research/SKILL.md
@@ -15,7 +15,7 @@ allowed-tools: scrape_webpage, read_file, ls_tree, grep, web_search
 1. Decompose the user's question into 2-4 specific, citation-worthy sub-questions.
 2. For each sub-question, run **one** targeted KB search (focused on terms the user would have written, not synonyms). Open the most relevant 2-3 documents fully via `read_file` if their excerpts are too short.
 3. Use `grep` to find supporting passages in long files instead of re-reading them end to end.
-4. Cite every claim with `[citation:chunk_id]` exactly as the chunk tag specifies.
+4. Cite every claim with the `[n]` label shown on the passage you used (search results and `read_file` output both carry them); never write a chunk id, URL, or title yourself.

 ## What good output looks like
 - Short paragraphs with inline citations.
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md
@@ -1,12 +1,13 @@
 <citations>
 Citation markers are **disabled** in this configuration.

-Do NOT include `[citation:…]` markers anywhere, even if tool descriptions or
+Do NOT include `[n]` citation labels or `[citation:…]` markers anywhere, even if
+tool output (`<retrieved_context>`, `<web_results>`), tool descriptions, or
 examples reference them. Ignore citation-format reminders elsewhere in this
 prompt when they conflict with this block.

 1. Answer in plain prose. Optional markdown links to public URLs when
   sources are URLs.
 2. Do not expose raw chunk ids, document ids, or internal ids to the user.
-3. Present KB or docs facts naturally without attribution markers.
+3. Present KB, web, or docs facts naturally without attribution markers.
 </citations>
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
@@ -1,42 +1,16 @@
 <citations>
-Citations reach the answer through two channels. Use whichever applies — and
-never invent ids you didn't see. Citation ids are resolved by exact-match
-lookup; a wrong id silently breaks the link, so when in doubt, omit.
+Cite with one token: the bracket label `[n]`. Every citable result —
+`search_knowledge_base` passages, `web_search` results, and prose from a
+`task` knowledge_base/research specialist — already carries `[n]` labels on a
+single shared count. Those labels are the only citation you write; the server
+resolves each one back to its source after the turn.

-### Channel A — chunk blocks injected this turn
-When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
-turn:
-
-1. For each factual statement taken from those chunks, add
-   `[citation:chunk_id]` using the **exact** id from a visible
-   `<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim);
-   do not retype from memory.
-2. `<document_id>` is the parent doc id, **not** a citation source —
-   only ids inside `<chunk id='…'>` count.
-3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
-   each id copied individually).
-4. Never invent, normalise, or guess at adjacent ids; if unsure, omit.
-5. Plain brackets only — no markdown links, no footnote numbering.
-
-### Channel B — citations relayed by a `task` specialist
-A `task(...)` tool message may contain `[citation:<chunk_id>]` markers
-the specialist already attached to its prose. The specialist saw the
-underlying `<chunk id='…'>` blocks; you didn't. So:
-
-1. **Preserve those markers verbatim** in your final answer — do not
-   reformat, renumber, drop, or wrap them in markdown links. When you
-   paraphrase a specialist sentence, copy the marker character-for-
-   character; do not regenerate the id from memory (LLMs reliably
-   corrupt nearby digits).
-2. Keep each marker attached to the sentence the specialist attached
-   it to.
-3. Do **not** add new `[citation:…]` markers of your own to a
-   specialist's prose; if a fact has no marker, the specialist
-   couldn't tie it to a chunk and neither can you.
-4. When a specialist returns JSON, the citation markers live inside
-   the prose-bearing fields (e.g. a summary or excerpt). Pull them
-   along with the surrounding sentence when you quote.
-
-If neither channel surfaces citation markers this turn, do not fabricate
-them.
+1. Put the label right after the claim it supports.
+2. Several sources for one claim: stack brackets, `[1][2]`.
+3. Copy labels exactly as shown, a specialist's included — never renumber them,
+   add your own, or write the underlying title, date, id, or URL instead.
+4. Write the bare `[n]` and nothing else: no `[citation:...]`, no markdown links,
+   no footnote marks, no "References" section.
+5. Only label claims the sources support. If nothing shown backs a claim — or you
+   never saw a label — leave it uncited; never invent one.
 </citations>
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md
@@ -11,17 +11,16 @@ your answer, not as the task itself.
 `<priority_documents>` lists the workspace documents most relevant to the
 latest user message, ranked by relevance score, with `[USER-MENTIONED]`
 flagged on anything the user explicitly referenced. When the task is about
-workspace content, read these first; matched passages inside each document
-are flagged via `<chunk_index>` so you can jump straight to them.
+workspace content, read these first.

 `<workspace_tree>` shows the full `/documents/` folder and file layout. Use
 it to resolve paths the user describes in natural language ("my Q2 roadmap",
 "last week's meeting notes") into concrete document references before
 delegating to a specialist.

-`<document>` and `<chunk id='…'>` blocks are chunked indexed content returned
-by KB search (backing `<priority_documents>`). Each chunk carries a stable
-`id` attribute.
+`<retrieved_context>` blocks hold knowledge-base passages from
+`search_knowledge_base`; each `<document>` inside is in excerpt view and every
+passage is prefixed with an `[n]` citation label.

 If a block doesn't appear this turn, work from the conversation alone.
 </dynamic_context>
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md
@@ -10,18 +10,16 @@ budget. Treat it as background colour for your answer, not as the task itself.
 `<priority_documents>` lists the workspace documents most relevant to the
 latest user message, ranked by relevance score, with `[USER-MENTIONED]`
 flagged on anything someone in the thread explicitly referenced. When the
-task is about workspace content, read these first; matched passages inside
-each document are flagged via `<chunk_index>` so you can jump straight to
-them.
+task is about workspace content, read these first.

 `<workspace_tree>` shows the full `/documents/` folder and file layout. Use
 it to resolve paths described in natural language ("the Q2 roadmap", "last
 week's planning notes") into concrete document references before delegating
 to a specialist.

-`<document>` and `<chunk id='…'>` blocks are chunked indexed content returned
-by KB search (backing `<priority_documents>`). Each chunk carries a stable
-`id` attribute.
+`<retrieved_context>` blocks hold knowledge-base passages from
+`search_knowledge_base`; each `<document>` inside is in excerpt view and every
+passage is prefixed with an `[n]` citation label.

 If a block doesn't appear this turn, work from the conversation alone.
 </dynamic_context>
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md
@@ -8,8 +8,8 @@ Tool discipline:
 - Typically one investigative tool per turn unless several independent read-only queries are clearly needed; don’t repeat identical calls.

 Attribution:
-- When citations are **enabled** (see citation block above) and you answer from chunk-tagged documents, use `[citation:chunk_id]` exactly as specified there.
-- When citations are **disabled**, never emit `[citation:…]` — plain prose and links per tool guidance.
+- When citations are **enabled** (see citation block above) and you answer from labelled passages, cite with the bare `[n]` label exactly as specified there.
+- When citations are **disabled**, never emit `[n]` or `[citation:…]` — plain prose and links per tool guidance.

 Style:
 - No emojis unless asked; flat lists for short answers.
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md
@@ -3,7 +3,7 @@ You are running on an OpenAI Codex-class model (SurfSense **main agent**).

 Output style:
 - Concise; don’t paste huge fetch blobs — summarize.
-- When citations are **enabled** and you rely on chunk-tagged docs, references may use `[citation:chunk_id]` per the citation block above; when **disabled**, use prose and URLs only.
+- When citations are **enabled** and you rely on labelled passages, cite with the bare `[n]` label per the citation block above; when **disabled**, use prose and URLs only.
 - Numbered lists work well when the user should reply with a single option index.
 - No emojis; single-level bullets.

--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
@@ -1,762 +0,0 @@
-"""
-Knowledge base search tool for the SurfSense agent.
-
-This module provides:
-- Connector constants and normalization
-- Async knowledge base search across multiple connectors
-- Document formatting for LLM context
-"""
-
-import asyncio
-import contextlib
-import json
-import re
-import time
-from datetime import datetime
-from typing import Any
-
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import NATIVE_TO_LEGACY_DOCTYPE, shielded_async_session
-from app.services.connector_service import ConnectorService
-from app.utils.perf import get_perf_logger
-
-# Connectors that call external live-search APIs. These are handled by the
-# ``web_search`` tool and must be excluded from knowledge-base searches.
-_LIVE_SEARCH_CONNECTORS: set[str] = {
-    "TAVILY_API",
-    "LINKUP_API",
-    "BAIDU_SEARCH_API",
-}
-
-# Patterns that indicate the query has no meaningful search signal.
-# plainto_tsquery('english', '*') produces an empty tsquery and an embedding
-# of '*' is random noise, so both keyword and semantic search degrade to
-# arbitrary ordering — large documents (many chunks) dominate by chance.
-_DEGENERATE_QUERY_RE = re.compile(
-    r"^[\s*?_.#@!\-/\\]+$"  # only wildcards, punctuation, whitespace
-)
-
-# Max chunks per document when doing a recency-based browse instead of
-# a real search.  We want breadth (many docs) over depth (many chunks).
-_BROWSE_MAX_CHUNKS_PER_DOC = 5
-
-
-def _is_degenerate_query(query: str) -> bool:
-    """Return True when the query carries no meaningful search signal.
-
-    Catches wildcard patterns (``*``, ``**``), empty / whitespace-only
-    strings, and single-character non-word tokens.  These queries cause
-    both keyword search (empty tsquery) and semantic search (meaningless
-    embedding) to return effectively random results.
-    """
-    stripped = query.strip()
-    if not stripped:
-        return True
-    return bool(_DEGENERATE_QUERY_RE.match(stripped))
-
-
-async def _browse_recent_documents(
-    search_space_id: int,
-    document_type: str | list[str] | None,
-    top_k: int,
-    start_date: datetime | None,
-    end_date: datetime | None,
-) -> list[dict[str, Any]]:
-    """Return the most-recent documents (recency-ordered, no search ranking).
-
-    Used as a fallback when the search query is degenerate (e.g. ``*``) and
-    semantic / keyword search would produce arbitrary results.  Returns
-    document-grouped dicts in the same shape as ``_combined_rrf_search``
-    so the rest of the pipeline works unchanged.
-    """
-    from sqlalchemy import select
-    from sqlalchemy.orm import joinedload
-
-    from app.db import Chunk, Document, DocumentType
-
-    perf = get_perf_logger()
-    t0 = time.perf_counter()
-
-    base_conditions = [Document.search_space_id == search_space_id]
-
-    if document_type is not None:
-        type_list = (
-            document_type if isinstance(document_type, list) else [document_type]
-        )
-        doc_type_enums = []
-        for dt in type_list:
-            if isinstance(dt, str):
-                with contextlib.suppress(KeyError):
-                    doc_type_enums.append(DocumentType[dt])
-            else:
-                doc_type_enums.append(dt)
-        if not doc_type_enums:
-            return []
-        if len(doc_type_enums) == 1:
-            base_conditions.append(Document.document_type == doc_type_enums[0])
-        else:
-            base_conditions.append(Document.document_type.in_(doc_type_enums))
-
-    if start_date is not None:
-        base_conditions.append(Document.updated_at >= start_date)
-    if end_date is not None:
-        base_conditions.append(Document.updated_at <= end_date)
-
-    async with shielded_async_session() as session:
-        doc_query = (
-            select(Document)
-            .options(joinedload(Document.search_space))
-            .where(*base_conditions)
-            .order_by(Document.updated_at.desc())
-            .limit(top_k)
-        )
-        result = await session.execute(doc_query)
-        documents = result.scalars().unique().all()
-
-        if not documents:
-            return []
-
-        doc_ids = [d.id for d in documents]
-
-        chunk_query = (
-            select(Chunk)
-            .where(Chunk.document_id.in_(doc_ids))
-            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
-        )
-        chunk_result = await session.execute(chunk_query)
-        raw_chunks = chunk_result.scalars().all()
-
-    doc_chunk_counts: dict[int, int] = {}
-    doc_chunks: dict[int, list[dict]] = {d.id: [] for d in documents}
-    for chunk in raw_chunks:
-        did = chunk.document_id
-        count = doc_chunk_counts.get(did, 0)
-        if count < _BROWSE_MAX_CHUNKS_PER_DOC:
-            doc_chunks[did].append({"chunk_id": chunk.id, "content": chunk.content})
-            doc_chunk_counts[did] = count + 1
-
-    results: list[dict[str, Any]] = []
-    for doc in documents:
-        chunks_list = doc_chunks.get(doc.id, [])
-        results.append(
-            {
-                "document_id": doc.id,
-                "content": "\n\n".join(
-                    c["content"] for c in chunks_list if c.get("content")
-                ),
-                "score": 0.0,
-                "chunks": chunks_list,
-                "document": {
-                    "id": doc.id,
-                    "title": doc.title,
-                    "document_type": doc.document_type.value
-                    if getattr(doc, "document_type", None)
-                    else None,
-                    "metadata": doc.document_metadata or {},
-                },
-                "source": doc.document_type.value
-                if getattr(doc, "document_type", None)
-                else None,
-            }
-        )
-
-    perf.info(
-        "[kb_browse] recency browse in %.3fs docs=%d space=%d type=%s",
-        time.perf_counter() - t0,
-        len(results),
-        search_space_id,
-        document_type,
-    )
-    return results
-
-
-# =============================================================================
-# Connector Constants and Normalization
-# =============================================================================
-
-# Canonical connector values used internally by ConnectorService
-# Includes all document types and search source connectors
-_ALL_CONNECTORS: list[str] = [
-    "EXTENSION",
-    "FILE",
-    "SLACK_CONNECTOR",
-    "TEAMS_CONNECTOR",
-    "NOTION_CONNECTOR",
-    "YOUTUBE_VIDEO",
-    "GITHUB_CONNECTOR",
-    "ELASTICSEARCH_CONNECTOR",
-    "LINEAR_CONNECTOR",
-    "JIRA_CONNECTOR",
-    "CONFLUENCE_CONNECTOR",
-    "CLICKUP_CONNECTOR",
-    "GOOGLE_CALENDAR_CONNECTOR",
-    "GOOGLE_GMAIL_CONNECTOR",
-    "GOOGLE_DRIVE_FILE",
-    "DISCORD_CONNECTOR",
-    "AIRTABLE_CONNECTOR",
-    "LUMA_CONNECTOR",
-    "NOTE",
-    "BOOKSTACK_CONNECTOR",
-    "CRAWLED_URL",
-    "CIRCLEBACK",
-    "OBSIDIAN_CONNECTOR",
-    "ONEDRIVE_FILE",
-    "DROPBOX_FILE",
-]
-
-# Human-readable descriptions for each connector type
-# Used for generating dynamic docstrings and informing the LLM
-CONNECTOR_DESCRIPTIONS: dict[str, str] = {
-    "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
-    "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
-    "NOTE": "SurfSense Notes (notes created inside SurfSense)",
-    "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
-    "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
-    "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
-    "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
-    "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
-    "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
-    "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
-    "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
-    "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
-    "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
-    "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
-    "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
-    "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
-    "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
-    "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
-    "LUMA_CONNECTOR": "Luma events and meetings",
-    "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
-    "CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
-    "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
-    "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
-    "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
-    "ONEDRIVE_FILE": "Microsoft OneDrive files and documents (personal cloud storage)",
-    "DROPBOX_FILE": "Dropbox files and documents (cloud storage)",
-}
-
-
-def _normalize_connectors(
-    connectors_to_search: list[str] | None,
-    available_connectors: list[str] | None = None,
-) -> list[str]:
-    """Normalize model-supplied connectors to canonical ConnectorService types.
-
-    Maps user-facing aliases (e.g. WEBCRAWLER_CONNECTOR), drops unknowns, and
-    constrains to ``available_connectors`` when given. Empty input defaults to
-    all available connectors (minus live-search ones).
-    """
-    valid_set = (
-        set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
-    )
-    valid_set -= _LIVE_SEARCH_CONNECTORS
-
-    if not connectors_to_search:
-        base = (
-            list(available_connectors)
-            if available_connectors
-            else list(_ALL_CONNECTORS)
-        )
-        return [c for c in base if c not in _LIVE_SEARCH_CONNECTORS]
-
-    normalized: list[str] = []
-    for raw in connectors_to_search:
-        c = (raw or "").strip().upper()
-        if not c:
-            continue
-        if c == "WEBCRAWLER_CONNECTOR":
-            c = "CRAWLED_URL"
-        normalized.append(c)
-
-    # De-dupe (order-preserving), keeping only known + available connectors.
-    seen: set[str] = set()
-    out: list[str] = []
-    for c in normalized:
-        if c in seen:
-            continue
-        if c not in _ALL_CONNECTORS:
-            continue
-        if c not in valid_set:
-            continue
-        seen.add(c)
-        out.append(c)
-
-    # Nothing matched: fall back to all available.
-    if not out:
-        base = (
-            list(available_connectors)
-            if available_connectors
-            else list(_ALL_CONNECTORS)
-        )
-        return [c for c in base if c not in _LIVE_SEARCH_CONNECTORS]
-    return out
-
-
-# =============================================================================
-# Document Formatting
-# =============================================================================
-
-
-# Fraction of the model's context window (in characters) that a single tool
-# result is allowed to occupy.  The remainder is reserved for system prompt,
-# conversation history, and model output.  With ~4 chars/token this gives a
-# tool result ≈ 25 % of the context budget in tokens.
-_TOOL_OUTPUT_CONTEXT_FRACTION = 0.25
-_CHARS_PER_TOKEN = 4
-
-# Hard-floor / ceiling so the budget is always sensible regardless of what
-# the model reports.
-_MIN_TOOL_OUTPUT_CHARS = 20_000  # ~5K tokens
-_MAX_TOOL_OUTPUT_CHARS = 200_000  # ~50K tokens
-_MAX_CHUNK_CHARS = 8_000
-
-# Rank-adaptive per-document budget allocation.
-# Top-ranked (most relevant) documents get a larger share of the budget so
-# we pack as much high-quality context as possible.
-#
-#   fraction(rank) = _TOP_DOC_BUDGET_FRACTION / (1 + rank * _RANK_DECAY)
-#
-# Examples (128K budget, 8K chunk cap):
-#   rank 0 → 40% → 6 chunks   |  rank 3 → 19% → 3 chunks
-#   rank 1 → 30% → 4 chunks   |  rank 10 → 10% → 3 chunks (floor)
-#   rank 2 → 24% → 3 chunks   |
-_TOP_DOC_BUDGET_FRACTION = 0.40
-_RANK_DECAY = 0.35
-_MIN_CHUNKS_PER_DOC = 3
-
-
-def _compute_tool_output_budget(max_input_tokens: int | None) -> int:
-    """Derive a character budget from the model's context window.
-
-    Uses ``litellm.get_model_info`` via the value already resolved by
-    ``ChatLiteLLMRouter`` / ``ChatLiteLLM`` and passed through the dependency
-    chain as ``max_input_tokens``.  Falls back to a conservative default when
-    the value is unavailable.
-    """
-    if max_input_tokens is None or max_input_tokens <= 0:
-        return _MIN_TOOL_OUTPUT_CHARS  # conservative fallback
-
-    budget = int(max_input_tokens * _CHARS_PER_TOKEN * _TOOL_OUTPUT_CONTEXT_FRACTION)
-    return max(_MIN_TOOL_OUTPUT_CHARS, min(budget, _MAX_TOOL_OUTPUT_CHARS))
-
-
-_INTERNAL_METADATA_KEYS: frozenset[str] = frozenset(
-    {
-        "message_id",
-        "thread_id",
-        "event_id",
-        "calendar_id",
-        "google_drive_file_id",
-        "onedrive_file_id",
-        "dropbox_file_id",
-        "page_id",
-        "issue_id",
-        "connector_id",
-    }
-)
-
-
-def format_documents_for_context(
-    documents: list[dict[str, Any]],
-    *,
-    max_chars: int = _MAX_TOOL_OUTPUT_CHARS,
-    max_chunk_chars: int = _MAX_CHUNK_CHARS,
-    max_chunks_per_doc: int = 0,
-) -> str:
-    """Format retrieved documents into an XML context string for the LLM.
-
-    Documents are emitted highest-relevance first until ``max_chars`` is hit.
-    ``max_chunks_per_doc=0`` auto-computes a rank-adaptive cap so top results get
-    more chunks and no single large document monopolizes the budget.
-    """
-    if not documents:
-        return ""
-
-    # Group chunks by document id, preserving chunk_id so [citation:123] works.
-    # ConnectorService returns document-grouped results ({document, chunks, source}).
-    grouped: dict[str, dict[str, Any]] = {}
-
-    for doc in documents:
-        document_info = (doc.get("document") or {}) if isinstance(doc, dict) else {}
-        metadata = (
-            (document_info.get("metadata") or {})
-            if isinstance(document_info, dict)
-            else {}
-        )
-        if not metadata and isinstance(doc, dict):
-            # Some result shapes may place metadata at the top level.
-            metadata = doc.get("metadata") or {}
-
-        source = (
-            (doc.get("source") if isinstance(doc, dict) else None)
-            or document_info.get("document_type")
-            or metadata.get("document_type")
-            or "UNKNOWN"
-        )
-
-        # Identity: prefer document_id, else type+title+url.
-        document_id_val = document_info.get("id")
-        title = (
-            document_info.get("title") or metadata.get("title") or "Untitled Document"
-        )
-        url = (
-            metadata.get("url")
-            or metadata.get("source")
-            or metadata.get("page_url")
-            or ""
-        )
-
-        doc_key = (
-            str(document_id_val)
-            if document_id_val is not None
-            else f"{source}::{title}::{url}"
-        )
-
-        if doc_key not in grouped:
-            grouped[doc_key] = {
-                "document_id": document_id_val
-                if document_id_val is not None
-                else doc_key,
-                "document_type": metadata.get("document_type") or source,
-                "title": title,
-                "url": url,
-                "metadata": metadata,
-                "chunks": [],
-            }
-
-        # Prefer document-grouped chunks when present.
-        chunks_list = doc.get("chunks") if isinstance(doc, dict) else None
-        if isinstance(chunks_list, list) and chunks_list:
-            for ch in chunks_list:
-                if not isinstance(ch, dict):
-                    continue
-                chunk_id = ch.get("chunk_id") or ch.get("id")
-                content = (ch.get("content") or "").strip()
-                if not content:
-                    continue
-                grouped[doc_key]["chunks"].append(
-                    {"chunk_id": chunk_id, "content": content}
-                )
-            continue
-
-        # Fallback: treat this as a flat chunk-like object
-        if not isinstance(doc, dict):
-            continue
-        chunk_id = doc.get("chunk_id") or doc.get("id")
-        content = (doc.get("content") or "").strip()
-        if not content:
-            continue
-        grouped[doc_key]["chunks"].append({"chunk_id": chunk_id, "content": content})
-
-    # Live search connectors whose results should be cited by URL rather than
-    # a numeric chunk_id (the numeric IDs are meaningless auto-incremented counters).
-    live_search_connectors = {
-        "TAVILY_API",
-        "LINKUP_API",
-        "BAIDU_SEARCH_API",
-    }
-
-    parts: list[str] = []
-    total_chars = 0
-    total_docs = len(grouped)
-
-    for doc_idx, g in enumerate(grouped.values()):
-        metadata_clean = {
-            k: v for k, v in g["metadata"].items() if k not in _INTERNAL_METADATA_KEYS
-        }
-        metadata_json = json.dumps(metadata_clean, ensure_ascii=False)
-        is_live_search = g["document_type"] in live_search_connectors
-
-        doc_lines: list[str] = [
-            "<document>",
-            "<document_metadata>",
-            f"  <document_id>{g['document_id']}</document_id>",
-            f"  <document_type>{g['document_type']}</document_type>",
-            f"  <title><![CDATA[{g['title']}]]></title>",
-            f"  <url><![CDATA[{g['url']}]]></url>",
-            f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
-            "</document_metadata>",
-            "",
-            "<document_content>",
-        ]
-
-        # Rank-adaptive per-document chunk cap: top results get more chunks.
-        if max_chunks_per_doc > 0:
-            chunks_allowed = max_chunks_per_doc
-        else:
-            doc_fraction = _TOP_DOC_BUDGET_FRACTION / (1 + doc_idx * _RANK_DECAY)
-            max_doc_chars = int(max_chars * doc_fraction)
-            xml_overhead = 500
-            chunks_allowed = max(
-                (max_doc_chars - xml_overhead) // max(max_chunk_chars, 1),
-                _MIN_CHUNKS_PER_DOC,
-            )
-
-        chunks = g["chunks"]
-        if len(chunks) > chunks_allowed:
-            chunks = chunks[:chunks_allowed]
-
-        for ch in chunks:
-            ch_content = ch["content"]
-            if max_chunk_chars and len(ch_content) > max_chunk_chars:
-                ch_content = ch_content[:max_chunk_chars] + "\n...(truncated)"
-            ch_id = g["url"] if (is_live_search and g["url"]) else ch["chunk_id"]
-            if ch_id is None:
-                doc_lines.append(f"  <chunk><![CDATA[{ch_content}]]></chunk>")
-            else:
-                doc_lines.append(
-                    f"  <chunk id='{ch_id}'><![CDATA[{ch_content}]]></chunk>"
-                )
-
-        doc_lines.extend(["</document_content>", "</document>", ""])
-
-        doc_xml = "\n".join(doc_lines)
-        doc_len = len(doc_xml)
-
-        if total_chars + doc_len > max_chars:
-            remaining = total_docs - doc_idx
-            if doc_idx == 0:
-                parts.append(doc_xml)
-                total_chars += doc_len
-            parts.append(
-                f"<!-- Output truncated: {remaining} more document(s) omitted "
-                f"(budget {max_chars} chars). Refine your query or reduce top_k "
-                f"to retrieve different results. -->"
-            )
-            break
-
-        parts.append(doc_xml)
-        total_chars += doc_len
-
-    result = "\n".join(parts).strip()
-
-    # Hard safety net: if the result is still over budget (e.g. a single massive
-    # first document), forcibly truncate with a closing comment.
-    if len(result) > max_chars:
-        truncation_msg = "\n<!-- ...output forcibly truncated to fit context window -->"
-        result = result[: max_chars - len(truncation_msg)] + truncation_msg
-
-    return result
-
-
-# =============================================================================
-# Knowledge Base Search
-# =============================================================================
-
-
-async def search_knowledge_base_async(
-    query: str,
-    search_space_id: int,
-    db_session: AsyncSession,
-    connector_service: ConnectorService,
-    connectors_to_search: list[str] | None = None,
-    top_k: int = 10,
-    start_date: datetime | None = None,
-    end_date: datetime | None = None,
-    available_connectors: list[str] | None = None,
-    available_document_types: list[str] | None = None,
-    max_input_tokens: int | None = None,
-) -> str:
-    """Search the knowledge base across connectors and return formatted results.
-
-    ``available_document_types`` lets local connectors with no indexed data be
-    skipped (no embedding / DB round-trip), and ``max_input_tokens`` sizes the
-    output to the model's context window.
-    """
-    perf = get_perf_logger()
-    t0 = time.perf_counter()
-
-    deduplicated = await search_knowledge_base_raw_async(
-        query=query,
-        search_space_id=search_space_id,
-        db_session=db_session,
-        connector_service=connector_service,
-        connectors_to_search=connectors_to_search,
-        top_k=top_k,
-        start_date=start_date,
-        end_date=end_date,
-        available_connectors=available_connectors,
-        available_document_types=available_document_types,
-    )
-
-    if not deduplicated:
-        return "No documents found in the knowledge base. The search space has no indexed content yet."
-
-    # Use browse chunk cap for degenerate queries, otherwise adaptive chunking.
-    max_chunks_per_doc = (
-        _BROWSE_MAX_CHUNKS_PER_DOC if _is_degenerate_query(query) else 0
-    )
-    output_budget = _compute_tool_output_budget(max_input_tokens)
-    result = format_documents_for_context(
-        deduplicated,
-        max_chars=output_budget,
-        max_chunks_per_doc=max_chunks_per_doc,
-    )
-
-    if len(result) > output_budget:
-        perf.warning(
-            "[kb_search] output STILL exceeds budget after format (%d > %d), "
-            "hard truncation should have fired",
-            len(result),
-            output_budget,
-        )
-
-    perf.info(
-        "[kb_search] TOTAL in %.3fs total_docs=%d deduped=%d output_chars=%d "
-        "budget=%d max_input_tokens=%s space=%d",
-        time.perf_counter() - t0,
-        len(deduplicated),
-        len(deduplicated),
-        len(result),
-        output_budget,
-        max_input_tokens,
-        search_space_id,
-    )
-    return result
-
-
-async def search_knowledge_base_raw_async(
-    query: str,
-    search_space_id: int,
-    db_session: AsyncSession,
-    connector_service: ConnectorService,
-    connectors_to_search: list[str] | None = None,
-    top_k: int = 10,
-    start_date: datetime | None = None,
-    end_date: datetime | None = None,
-    available_connectors: list[str] | None = None,
-    available_document_types: list[str] | None = None,
-    query_embedding: list[float] | None = None,
-) -> list[dict[str, Any]]:
-    """Search knowledge base and return raw document dicts (no XML formatting)."""
-    perf = get_perf_logger()
-    t0 = time.perf_counter()
-    all_documents: list[dict[str, Any]] = []
-
-    # Preserve the public signature for compatibility even if values are unused.
-    _ = (db_session, connector_service)
-
-    from app.agents.chat.multi_agent_chat.shared.date_filters import resolve_date_range
-
-    resolved_start_date, resolved_end_date = resolve_date_range(
-        start_date=start_date,
-        end_date=end_date,
-    )
-
-    connectors = _normalize_connectors(connectors_to_search, available_connectors)
-
-    if available_document_types:
-        doc_types_set = set(available_document_types)
-        connectors = [
-            c
-            for c in connectors
-            if c in doc_types_set
-            or NATIVE_TO_LEGACY_DOCTYPE.get(c, "") in doc_types_set
-        ]
-
-    if not connectors:
-        return []
-
-    if _is_degenerate_query(query):
-        perf.info(
-            "[kb_search_raw] degenerate query %r detected - recency browse",
-            query,
-        )
-        browse_connectors = connectors if connectors else [None]  # type: ignore[list-item]
-        expanded_browse = []
-        for connector in browse_connectors:
-            if connector is not None and connector in NATIVE_TO_LEGACY_DOCTYPE:
-                expanded_browse.append([connector, NATIVE_TO_LEGACY_DOCTYPE[connector]])
-            else:
-                expanded_browse.append(connector)
-        browse_results = await asyncio.gather(
-            *[
-                _browse_recent_documents(
-                    search_space_id=search_space_id,
-                    document_type=connector,
-                    top_k=top_k,
-                    start_date=resolved_start_date,
-                    end_date=resolved_end_date,
-                )
-                for connector in expanded_browse
-            ]
-        )
-        for docs in browse_results:
-            all_documents.extend(docs)
-    else:
-        if query_embedding is None:
-            from app.config import config as app_config
-
-            query_embedding = app_config.embedding_model_instance.embed(query)
-
-        max_parallel_searches = 4
-        semaphore = asyncio.Semaphore(max_parallel_searches)
-
-        async def _search_one_connector(connector: str) -> list[dict[str, Any]]:
-            try:
-                async with semaphore, shielded_async_session() as isolated_session:
-                    svc = ConnectorService(isolated_session, search_space_id)
-                    return await svc._combined_rrf_search(
-                        query_text=query,
-                        search_space_id=search_space_id,
-                        document_type=connector,
-                        top_k=top_k,
-                        start_date=resolved_start_date,
-                        end_date=resolved_end_date,
-                        query_embedding=query_embedding,
-                    )
-            except Exception as exc:
-                perf.warning("[kb_search_raw] connector=%s FAILED: %s", connector, exc)
-                return []
-
-        connector_results = await asyncio.gather(
-            *[_search_one_connector(connector) for connector in connectors]
-        )
-        for docs in connector_results:
-            all_documents.extend(docs)
-
-    seen_doc_ids: set[Any] = set()
-    seen_content_hashes: set[int] = set()
-    deduplicated: list[dict[str, Any]] = []
-
-    def _content_fingerprint(document: dict[str, Any]) -> int | None:
-        chunks = document.get("chunks")
-        if isinstance(chunks, list):
-            chunk_texts = []
-            for chunk in chunks:
-                if not isinstance(chunk, dict):
-                    continue
-                chunk_content = (chunk.get("content") or "").strip()
-                if chunk_content:
-                    chunk_texts.append(chunk_content)
-            if chunk_texts:
-                return hash("||".join(chunk_texts))
-        flat_content = (document.get("content") or "").strip()
-        if flat_content:
-            return hash(flat_content)
-        return None
-
-    for doc in all_documents:
-        doc_id = (doc.get("document", {}) or {}).get("id")
-        if doc_id is not None:
-            if doc_id in seen_doc_ids:
-                continue
-            seen_doc_ids.add(doc_id)
-            deduplicated.append(doc)
-            continue
-        content_hash = _content_fingerprint(doc)
-        if content_hash is not None and content_hash in seen_content_hashes:
-            continue
-        if content_hash is not None:
-            seen_content_hashes.add(content_hash)
-        deduplicated.append(doc)
-
-    deduplicated.sort(key=lambda doc: doc.get("score", 0), reverse=True)
-    perf.info(
-        "[kb_search_raw] done in %.3fs total=%d deduped=%d",
-        time.perf_counter() - t0,
-        len(all_documents),
-        len(deduplicated),
-    )
-    return deduplicated
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/report.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/report.py
@ -23,6 +23,45 @@ from app.services.llm_service import get_agent_llm

 logger = logging.getLogger(__name__)

+
+def _report_search_types(
+    available_connectors: list[str] | None,
+    available_document_types: list[str] | None,
+) -> tuple[str, ...] | None:
+    """Merge connector and document-type names into one search scope.
+
+    Returns the distinct names from both lists as a sorted tuple, or ``None``
+    when neither list has entries. ``None`` means "search every indexed type";
+    a tuple limits the shared KB search to the types the search space has.
+    """
+    # Falsy inputs (None or empty) contribute nothing to the merged set.
+    merged = {*(available_document_types or ()), *(available_connectors or ())}
+    if not merged:
+        return None
+    return tuple(sorted(merged))
+
+
+def _render_kb_hits_for_report(hits: list[Any]) -> str:
+    """Format KB hits as titled plain-text sections for the report writer.
+
+    Each hit with non-blank chunk text becomes ``## <title> (<label>)`` followed
+    by its stripped chunks; hits with no text are dropped. No ``[n]`` labels or
+    chunk ids are emitted because reports do not carry citations for now.
+    """
+    from app.agents.chat.multi_agent_chat.shared.document_render import source_label
+
+    sections: list[str] = []
+    for hit in hits:
+        label = source_label(hit.document_type, hit.metadata)
+        heading = hit.title if not label else f"{hit.title} ({label})"
+        # Strip each chunk once; blank chunks are filtered out on the next line.
+        stripped = (chunk.content.strip() for chunk in hit.chunks)
+        text = "\n\n".join(piece for piece in stripped if piece)
+        if text:
+            sections.append(f"## {heading}\n\n{text}")
+    return "\n\n".join(sections)
+
+
 # ─── Shared Formatting Rules ────────────────────────────────────────────────
 # Reusable formatting instructions appended to section-level and review prompts.

@ -788,31 +827,46 @@ def create_generate_report_tool(
                    f"{query_count} queries: {search_queries[:5]}"
                )
                try:
-                    from .knowledge_base import search_knowledge_base_async
+                    from app.agents.chat.multi_agent_chat.shared.retrieval.hybrid_search import (
+                        search_chunks,
+                    )
+                    from app.agents.chat.multi_agent_chat.shared.retrieval.models import (
+                        DocumentHit,
+                        SearchScope,
+                    )
+
+                    scope = SearchScope(
+                        document_types=_report_search_types(
+                            available_connectors, available_document_types
+                        )
+                    )

                    # Each query gets its own short-lived session.
-                    async def _run_single_query(q: str) -> str:
+                    async def _run_single_query(q: str) -> list[DocumentHit]:
                        async with shielded_async_session() as kb_session:
-                            kb_connector_svc = ConnectorService(
-                                kb_session, search_space_id
-                            )
-                            return await search_knowledge_base_async(
-                                query=q,
+                            return await search_chunks(
+                                kb_session,
                                search_space_id=search_space_id,
-                                db_session=kb_session,
-                                connector_service=kb_connector_svc,
+                                query=q,
+                                scope=scope,
                                top_k=10,
-                                available_connectors=available_connectors,
-                                available_document_types=available_document_types,
                            )

-                    kb_results = await asyncio.gather(
+                    hits_per_query = await asyncio.gather(
                        *[_run_single_query(q) for q in search_queries[:5]]
                    )

-                    kb_text_parts = [r for r in kb_results if r and r.strip()]
-                    if kb_text_parts:
-                        kb_combined = "\n\n---\n\n".join(kb_text_parts)
+                    seen_doc_ids: set[int] = set()
+                    merged_hits: list[DocumentHit] = []
+                    for hits in hits_per_query:
+                        for hit in hits:
+                            if hit.document_id in seen_doc_ids:
+                                continue
+                            seen_doc_ids.add(hit.document_id)
+                            merged_hits.append(hit)
+
+                    kb_combined = _render_kb_hits_for_report(merged_hits)
+                    if kb_combined.strip():
                        if effective_source.strip():
                            effective_source = (
                                effective_source
@ -822,20 +876,17 @@ def create_generate_report_tool(
                        else:
                            effective_source = kb_combined

-                        # Count docs found (rough: count <document> tags)
-                        doc_count = kb_combined.count("<document>")
+                        doc_count = len(merged_hits)
                        dispatch_custom_event(
                            "report_progress",
                            {
                                "phase": "kb_search_done",
-                                "message": f"Found {doc_count} relevant documents"
-                                if doc_count
-                                else f"Found results from {len(kb_text_parts)} queries",
+                                "message": f"Found {doc_count} relevant documents",
                            },
                        )
                        logger.info(
                            f"[generate_report] KB search added ~{len(kb_combined)} chars "
-                            f"from {len(kb_text_parts)} queries"
+                            f"from {doc_count} documents"
                        )
                    else:
                        dispatch_custom_event(
--- a/surfsense_backend/app/prompts/system_prompt_composer/base/citation_contract.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/base/citation_contract.md
@ -1,43 +0,0 @@
-<citation_instructions>
-You can cite the sources shown to you. Cited material arrives in labeled blocks
-such as <retrieved_context> (and some tool results). Inside them, every passage
-begins with a bracketed number — that number is its citation label: [1], [2], [3].
-
-How to cite:
-- When a statement relies on a passage, put that passage's label right after the
-  statement: "We pushed the launch to March 10 [1]."
-- For several sources behind one statement, write each label in its own brackets
-  with nothing between them — [1][2]. Never merge them as [1, 2] and never use a
-  range like [1-3].
-- Put the label at the end of the clause or sentence it supports.
-
-Rules:
-- Cite ONLY labels that were shown to you. The bracketed number is the single
-  thing you copy — never cite a title, a date, "chunk 4 of 19", a document id, or
-  a URL.
-- Never invent a label and never renumber. If nothing shown supports a claim,
-  write it without a citation instead of guessing.
-- Attribute only claims drawn from the provided sources; leave your own general
-  knowledge uncited.
-- Plain square brackets only. No parentheses around them, no links or markdown
-  links like [1](http://...), no footnote marks like ¹.
-- Do not add a "References" or "Sources" section; citations stay inline.
-
-Example of context you might receive:
-<retrieved_context>
-Document: "Q3 Launch Notes"  (Slack · #launch · 2026-03-02)
-  [1] We agreed to push the launch to March 10.
-  [2] Marketing will be notified next week.
-Document: "Release Timeline"  (Notion · 2026-02-28)
-  [3] Dates floated were March 10 and March 17.
-</retrieved_context>
-
-Correct:
-The launch moved to March 10 [1][3], and marketing is told next week [2].
-
-Incorrect — do not produce any of these:
-- The launch moved to March 10 [1, 3].          (merged brackets)
-- The launch moved to March 10 ([1]).            (parentheses)
-- The launch moved to March 10 [citation:1].     (you never write this form)
-- The launch moved to March 10 [4].              (label was never shown)
-</citation_instructions>
--- a/surfsense_backend/app/prompts/system_prompt_composer/base/citations_off.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/base/citations_off.md
@ -1,16 +1,13 @@
 <citation_instructions>
-IMPORTANT: Citations are DISABLED for this configuration.
+Citation markers are **disabled** in this configuration.

-DO NOT include any citations in your responses. Specifically:
-1. Do NOT use the [citation:chunk_id] format anywhere in your response.
-2. Do NOT reference document IDs, chunk IDs, or source IDs.
-3. Simply provide the information naturally without any citation markers.
-4. Write your response as if you're having a normal conversation, incorporating the information from your knowledge seamlessly.
+Do NOT include `[n]` citation labels or `[citation:…]` markers anywhere, even if
+tool output (`<retrieved_context>`) or examples reference them. Ignore
+citation-format reminders elsewhere in this prompt when they conflict with this
+block.

-When answering questions based on documents from the knowledge base:
-- Present the information directly and confidently
-- Do not mention that information comes from specific documents or chunks
-- Integrate facts naturally into your response without attribution markers
-
-Your goal is to provide helpful, informative answers in a clean, readable format without any citation notation.
+1. Answer in plain prose. Markdown links to public URLs are optional when
+   sources are URLs.
+2. Do not expose raw chunk ids, document ids, or internal ids to the user.
+3. Present knowledge-base or web facts naturally without attribution markers.
 </citation_instructions>
--- a/surfsense_backend/app/prompts/system_prompt_composer/base/citations_on.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/base/citations_on.md
@ -1,89 +1,16 @@
 <citation_instructions>
-CRITICAL CITATION REQUIREMENTS:
+Cite with one token: the bracket label `[n]`. Cited material arrives in labeled
+blocks such as `<retrieved_context>` (and some tool results); inside them every
+passage begins with its `[n]` label on a single shared count. Those labels are
+the only citation you write; the server resolves each one back to its source
+after the turn.

-1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
-2. Make sure ALL factual statements from the documents have proper citations.
-3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
-4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
-5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
-6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
-7. Do not return citations as clickable links.
-8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
-9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
-10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
-11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
-
-<document_structure_example>
-The documents you receive are structured like this:
-
-**Knowledge base documents (numeric chunk IDs):**
-<document>
-<document_metadata>
-  <document_id>42</document_id>
-  <document_type>GITHUB_CONNECTOR</document_type>
-  <title><![CDATA[Some repo / file / issue title]]></title>
-  <url><![CDATA[https://example.com]]></url>
-  <metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
-</document_metadata>
-
-<document_content>
-  <chunk id='123'><![CDATA[First chunk text...]]></chunk>
-  <chunk id='124'><![CDATA[Second chunk text...]]></chunk>
-</document_content>
-</document>
-
-**Web search results (URL chunk IDs):**
-<document>
-<document_metadata>
-  <document_type>WEB_SEARCH</document_type>
-  <title><![CDATA[Some web search result]]></title>
-  <url><![CDATA[https://example.com/article]]></url>
-</document_metadata>
-
-<document_content>
-  <chunk id='https://example.com/article'><![CDATA[Content from web search...]]></chunk>
-</document_content>
-</document>
-
-IMPORTANT: You MUST cite using the EXACT chunk ids from the `<chunk id='...'>` tags.
-- For knowledge base documents, chunk ids are numeric (e.g. 123, 124) or prefixed (e.g. doc-45).
-- For live web search results, chunk ids are URLs (e.g. https://example.com/article).
-Do NOT cite document_id. Always use the chunk id.
-</document_structure_example>
-
-<citation_format>
-- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
-- Citations should appear at the end of the sentence containing the information they support
-- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
-- No need to return references section. Just citations in answer.
-- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
-- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
-- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
-- Copy the EXACT chunk id from the XML - if it says `<chunk id='5'>`, use [citation:5]
-- If the chunk id is a URL like `<chunk id='https://example.com/page'>`, use [citation:https://example.com/page]
-</citation_format>
-
-<citation_examples>
-CORRECT citation formats:
-- [citation:5] (numeric chunk ID from knowledge base)
-- [citation:https://example.com/article] (URL chunk ID from web search results)
-- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3] (multiple citations)
-
-INCORRECT citation formats (DO NOT use):
-- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
-- Using parentheses around brackets: ([citation:5])
-- Using hyperlinked text: [link to source 5](https://example.com)
-- Using footnote style: ... library¹
-- Making up source IDs when source_id is unknown
-- Using old IEEE format: [1], [2], [3]
-- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
-</citation_examples>
-
-<citation_output_example>
-Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
-
-According to web search results, the key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:https://docs.python.org/3/library/asyncio.html]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
-
-However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
-</citation_output_example>
+1. Put the label right after the claim it supports.
+2. Several sources for one claim: stack brackets, `[1][2]`.
+3. Copy labels exactly as shown — never renumber them, add your own, or write the
+   underlying title, date, id, or URL instead.
+4. Write the bare `[n]` and nothing else: no `[citation:...]`, no markdown links
+   like `[1](http://…)`, no footnote marks, no "References" section.
+5. Only label claims the sources support. If nothing shown backs a claim — or you
+   never saw a label — leave it uncited; never invent one.
 </citation_instructions>
--- a/surfsense_backend/app/prompts/system_prompt_composer/providers/deepseek.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/providers/deepseek.md
@ -9,7 +9,7 @@ Reasoning hygiene (R1-aware):
 Output style:
 - Be concise. Default to a one-paragraph answer; expand only when the user asks for detail.
 - Don't open with sycophantic phrasing ("Great question", "Sure, here you go"). Lead with the answer or the next action.
-- For factual answers, cite once with `[citation:chunk_id]` and stop.
+- For factual answers, cite once with the passage's `[n]` label and stop.

 Tool calls:
 - Issue independent tool calls in parallel within a single turn.
--- a/surfsense_backend/app/prompts/system_prompt_composer/providers/grok.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/providers/grok.md
@ -5,7 +5,7 @@ Maximum terseness:
 - Answer in fewer than 4 lines unless the user asks for detail. One-word answers are best when they suffice.
 - No preamble ("The answer is", "Here's what I'll do"), no postamble ("Hope that helps", "Let me know"). Get straight to the answer.
 - Avoid restating the user's question.
-- For factual lookups inside the knowledge base, give the answer with a single `[citation:chunk_id]` and stop.
+- For factual lookups inside the knowledge base, give the answer with a single `[n]` label and stop.

 Tool discipline:
 - Use exactly ONE tool per assistant turn when investigating; wait for the result before deciding the next call. Do not loop on the same tool with the same arguments — pick a result and act.
--- a/surfsense_backend/tests/integration/google_unification/conftest.py
+++ b/surfsense_backend/tests/integration/google_unification/conftest.py
@ -3,7 +3,6 @@
 from __future__ import annotations

 import uuid
-from contextlib import asynccontextmanager
 from datetime import UTC, datetime
 from unittest.mock import MagicMock

@ -227,23 +226,6 @@ def patched_embed(monkeypatch):
    return mock


-@pytest.fixture
-def patched_shielded_session(async_engine, monkeypatch):
-    """Replace ``shielded_async_session`` in the knowledge_base module
-    with one that yields sessions from the test engine."""
-    test_maker = async_sessionmaker(async_engine, expire_on_commit=False)
-
-    @asynccontextmanager
-    async def _test_shielded():
-        async with test_maker() as session:
-            yield session
-
-    monkeypatch.setattr(
-        "app.agents.chat.multi_agent_chat.subagents.builtins.deliverables.tools.knowledge_base.shielded_async_session",
-        _test_shielded,
-    )
-
-
 # ---------------------------------------------------------------------------
 # Indexer test helpers
 # ---------------------------------------------------------------------------
--- a/surfsense_backend/tests/integration/google_unification/test_browse_includes_legacy_docs.py
+++ b/surfsense_backend/tests/integration/google_unification/test_browse_includes_legacy_docs.py
@ -1,46 +0,0 @@
-"""Integration test: _browse_recent_documents returns docs of multiple types.
-
-Exercises the browse path (degenerate-query fallback) with a real PostgreSQL
-database.  Verifies that passing a list of document types correctly returns
-documents of all listed types -- the same ``.in_()`` SQL path used by hybrid
-search but through the browse/recency-ordered code path.
-"""
-
-from __future__ import annotations
-
-import pytest
-
-pytestmark = pytest.mark.integration
-
-
-async def test_browse_recent_documents_with_list_type_returns_both(
-    committed_google_data, patched_shielded_session
-):
-    """_browse_recent_documents returns docs of all types when given a list."""
-    from app.agents.chat.multi_agent_chat.subagents.builtins.deliverables.tools.knowledge_base import (
-        _browse_recent_documents,
-    )
-
-    space_id = committed_google_data["search_space_id"]
-
-    results = await _browse_recent_documents(
-        search_space_id=space_id,
-        document_type=["GOOGLE_DRIVE_FILE", "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"],
-        top_k=10,
-        start_date=None,
-        end_date=None,
-    )
-
-    returned_types = set()
-    for doc in results:
-        doc_info = doc.get("document", {})
-        dtype = doc_info.get("document_type")
-        if dtype:
-            returned_types.add(dtype)
-
-    assert "GOOGLE_DRIVE_FILE" in returned_types, (
-        "Native Drive docs should appear in browse results"
-    )
-    assert "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" in returned_types, (
-        "Legacy Composio Drive docs should appear in browse results"
-    )
--- a/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py
+++ b/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py
@ -86,9 +86,10 @@ class TestCompose:
        # Tools
        assert "<tools>" in prompt
        assert "</tools>" in prompt
-        # Citations on by default
+        # Citations on by default — the [n] / <retrieved_context> contract
        assert "<citation_instructions>" in prompt
-        assert "[citation:chunk_id]" in prompt
+        assert "<retrieved_context>" in prompt
+        assert "[1][2]" in prompt

    def test_team_visibility_uses_team_variants(self, fixed_today: datetime) -> None:
        prompt = compose_system_prompt(
@ -116,9 +117,9 @@ class TestCompose:
    def test_citations_disabled_swaps_block(self, fixed_today: datetime) -> None:
        prompt_on = compose_system_prompt(today=fixed_today, citations_enabled=True)
        prompt_off = compose_system_prompt(today=fixed_today, citations_enabled=False)
-        assert "Citations are DISABLED" in prompt_off
-        assert "Citations are DISABLED" not in prompt_on
-        assert "[citation:chunk_id]" in prompt_on
+        assert "Citation markers are **disabled**" in prompt_off
+        assert "Citation markers are **disabled**" not in prompt_on
+        assert "<retrieved_context>" in prompt_on

    def test_enabled_tool_filter_only_includes_listed_tools(
        self, fixed_today: datetime