From ce150165335573e2bc15f75efe44b42325e0e401 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 25 Jun 2026 15:27:09 +0200
Subject: [PATCH] citations: consolidate prompts, retire eager path, refresh
 ADR

Rewrite the main-agent citation contract to a single [n] channel and sync
the orphaned system_prompt_composer surface to match; drop stale
[citation:chunk_id] / <chunk_index> references from dynamic_context and
provider hints. Reuse the shared hybrid search in the deliverables report
(citations omitted for now) and delete the orphaned report KB helper.
Remove the dead eager KnowledgePriorityMiddleware wiring (knowledge_priority
+ stack) and its legacy browse test. Update ADR 0001 to reflect the cutover.
---
 ...1-rag-citation-and-context-architecture.md | 197 ++++-
 .../middleware/knowledge_priority.py          |  42 -
 .../main_agent/middleware/stack.py            |  23 +-
 .../skills/builtin/kb-research/SKILL.md       |   2 +-
 .../system_prompt/prompts/citations/off.md    |   5 +-
 .../system_prompt/prompts/citations/on.md     |  52 +-
 .../prompts/dynamic_context/private.md        |   9 +-
 .../prompts/dynamic_context/team.md           |  10 +-
 .../system_prompt/prompts/providers/grok.md   |   4 +-
 .../prompts/providers/openai_codex.md         |   2 +-
 .../deliverables/tools/knowledge_base.py      | 762 ------------------
 .../builtins/deliverables/tools/report.py     |  93 ++-
 .../base/citation_contract.md                 |  43 -
 .../base/citations_off.md                     |  21 +-
 .../base/citations_on.md                      |  99 +--
 .../providers/deepseek.md                     |   2 +-
 .../system_prompt_composer/providers/grok.md  |   2 +-
 .../google_unification/conftest.py            |  18 -
 .../test_browse_includes_legacy_docs.py       |  46 --
 .../agents/new_chat/prompts/test_composer.py  |  11 +-
 20 files changed, 316 insertions(+), 1127 deletions(-)
 delete mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/knowledge_priority.py
 delete mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
 delete mode 100644 surfsense_backend/app/prompts/system_prompt_composer/base/citation_contract.md
 delete mode 100644 surfsense_backend/tests/integration/google_unification/test_browse_includes_legacy_docs.py

diff --git a/docs/adr/0001-rag-citation-and-context-architecture.md b/docs/adr/0001-rag-citation-and-context-architecture.md
index a0e8a1b95..688df2d1a 100644
--- a/docs/adr/0001-rag-citation-and-context-architecture.md
+++ b/docs/adr/0001-rag-citation-and-context-architecture.md
@@ -383,8 +383,9 @@ Remove from the hot path:
 - `fetch_mentioned_documents` eager chunk pull.
 - `<priority_documents>` pre-injection and `KbContextProjectionMiddleware`
   priority projection.
-- `kb_priority` / `kb_matched_chunk_ids` state plumbing (deleted per §8.10; add a
-  dedicated `citation_registry` field instead).
+- `kb_priority` state plumbing (deleted per §8.10; add a dedicated
+  `citation_registry` field instead). `kb_matched_chunk_ids` is already gone
+  (build-order Step 5).
 
 Keep / add:
 
@@ -425,26 +426,71 @@ Keep / add:
     pointer only, loaded **on demand** via a `read_chat(thread_id)` tool that reuses
     the access-checked `referenced_chat_context` resolver and registers each surfaced
     turn as `chat_turn`. ✅
+12. **One document render for both surfaces.** RAG excerpts
+    (`search_knowledge_base`) and full reads (`read_file`) render through a *single*
+    document renderer — same envelope, same `[n]` contract. Completeness is carried
+    by `view="excerpt"` vs `view="full"`, **not** an `is_complete` boolean and **not**
+    a numeric coverage count: `view="excerpt"` alone tells the model it saw a slice.
+    (A `chunks_shown`/`total_chunks` count was considered and dropped — it never had a
+    total to show for search excerpts, and full reads already say `view="full"`.) Raw
+    ids and `metadata_json` are dropped from the model's view.
+    **No `<chunk_index>` seek table** — a full read returns the whole document as one
+    numbered document block (an index keyed by internal ids gives the agent no actionable
+    signal, and any `[n]`-keyed/preview index adds cognitive load that risks
+    degrading the primary answer). Supersedes the standalone `<retrieved_context>`
+    shape and the removed `is_complete`. See §12. (planned)
 
 ## 9. Open items
 
-_None — all decisions locked. See §8._
+_All decisions locked (§8). Decision #12 is locked but **not yet built** — see the
+§12 schema and the rollout follow-ups._
 
-## 10. Rollout (suggested)
+## 10. Rollout
 
-1. Citation registry + resolver (state + register/resolve) — no behavior change yet.
-2. `search_knowledge_base` returns registered chunks; render `<retrieved_context>`;
-   normalize `[n]` → `[citation:n]`.
-3. Wire reranker; add chunk overlap in indexing.
-4. Convert mentions to ambient scope + `scope` arg; delete priority pre-injection.
-5. Move workspace tree to ambient plane.
-6. Extend registry to connector/web/chat sources.
+### Already built in parallel (committed, not yet wired)
 
-Built in parallel ahead of cutover (not yet wired): `shared/retrieval/`,
-`shared/retrieved_context/`, `shared/citations/`, and the new on-contract prompt
-`base/citation_contract.md` (teaches `[n]` / `[1][2]`). At cutover its contents
-replace `base/citations_on.md` and `citation_contract.md` is deleted, so the
-composer needs no change; `citations_off.md` stays as-is.
+`shared/citations/` (registry, markers, normalizer), `shared/retrieved_context/`
+(renderer), `shared/retrieval/` (hybrid search + rerank + service), hybrid-search
+behavior tests, and the on-contract prompt `base/citation_contract.md`
+(`[n]` / `[1][2]`).
+
+### Two findings that shape the cutover
+
+- **The agent is already pull-based by default.** `enable_kb_priority_preinjection`
+  is `False` and `KnowledgePriorityMiddleware` runs `mentions_only=True`; an
+  on-demand `search_knowledge_base` tool already exists. So the cutover *upgrades
+  the existing pull tool to the citation spine* — it does not remove eager RAG
+  (already gated off).
+- **The production citation prompt is local to the agent**, at
+  `main_agent/system_prompt/prompts/citations/on.md` (two-channel
+  `[citation:chunk_id]`). The composer's `base/citations_on.md` only serves the
+  anonymous/automation path. Both must learn the `[n]` contract.
+
+### Phased cutover
+
+0. **Registry on state.** Add `citation_registry: CitationRegistry` to
+   `SurfSenseFilesystemState` with a replace reducer; confirm checkpointer
+   round-trip.
+1. **Swap the KB tool.** Rewrite `search_knowledge_base` to call
+   `search_knowledge_base_context` (renders `<retrieved_context>` with `[n]`,
+   mutates the registry) and persist the registry via `Command(update=...)`.
+2. **Normalize `[n]` → `[citation:<payload>]`.** Finalize-time first (rewrite the
+   completed assistant text from the checkpointed registry before DB persist);
+   buffered live-stream normalization is a follow-up. Bare-`[n]` only, so
+   web_search `[citation:url]` markers are untouched.
+3. **Prompt contract (both surfaces).** Update `main_agent/.../citations/on.md`
+   (production) to teach the `[n]` channel alongside the existing web_search/`task`
+   channels; reconcile the composer path by folding `citation_contract.md` into
+   `base/citations_on.md` (then delete `citation_contract.md`). `citations_off.md`
+   stays.
+4. **Mentions → scope.** Map `@document`/`@folder` mentions to
+   `SearchScope(document_ids=…)` for the tool; retire `kb_priority` mention
+   surfacing.
+5. **Remove the old eager path.** Retire `KnowledgePriorityMiddleware`,
+   `kb_context_projection`, and the old `search_knowledge_base` hybrid helper in
+   `knowledge_search.py`; later `ChucksHybridSearchRetriever` (after migrating
+   `ConnectorService`). Migrate `web_search` to register `WEB_RESULT` so all
+   citations unify on `[n]` — **done**, see §12 build-order Step 6.
 
 ---
 
@@ -458,3 +504,122 @@ lost:
   collapse *perceived* latency from pull-based retrieval. See §4.5. This is the
   mitigation for pull's only real cost, but it touches the streaming pipeline, not
   the retrieval/citation path — so it ships independently.
+
+---
+
+## 12. Unified document render (search + read)
+
+The model meets a knowledge-base document in two moments: as **excerpts** from a
+search, and as a **full read** of one object. Today these use two unrelated
+shapes (compact text for search; `<document_metadata>` + `<chunk_index>` +
+`<chunk id>` XML for reads), with two different citation tokens. That doubles the
+schema the model must learn and is a hallucination surface. We collapse both onto
+**one renderer**.
+
+### Principles
+
+- **One envelope, two views.** The same renderer renders a document whether it
+  arrives partial (search) or complete (read). Only the `view` and the set of
+  passages shown differ.
+- **`[n]` is the only citable token**, in both views, assigned by the shared
+  registry (find-or-create). A chunk first seen in search keeps its `[n]` when the
+  same doc is later read in full.
+- **Completeness is the `view` word, nothing more.** A search result is inherently
+  excerpts; a read is inherently the whole object. No `is_complete` flag, no numeric
+  coverage count. `view="excerpt"` tells the model it saw a slice (so it should read
+  the doc before claiming the doc "only" says X); `view="full"` says it has the whole
+  object. A `chunks_shown`/`total_chunks` count was considered and rejected: search
+  excerpts have no total on hand (and we won't add a count query for it), and full
+  reads are already self-evident from `view`.
+- **Drop noise.** Raw `document_id` / `chunk_id` and the `metadata_json` blob
+  leave the model's view (they stay server-side as registry keys). The model
+  sees `title`, `source`, and `[n]` passages.
+- **No seek table.** A full read returns the whole document as one numbered
+  document block; the `<chunk_index>` line-range map is dropped. It was keyed by internal
+  `chunk_id` (which the model never sees), so it gave the agent nothing actionable
+  to seek by. Re-keying it to `[n]` or adding chunk previews would only add cognitive
+  load the agent must reconcile against the actual content — a hallucination/quality
+  risk that outweighs the token savings on the rare genuinely-large read. Simpler:
+  hand over the document, numbered, and let the model read it.
+
+### Shape
+
+Excerpt (from `search_knowledge_base`):
+
+```xml
+<document title="Q3 Launch Notes" source="Slack · #launch · 2026-03-02" view="excerpt">
+  [3] We agreed to push launch to March 10.
+  [4] Marketing will be notified next week.
+</document>
+```
+
+Full (from a read):
+
+```xml
+<document title="Q3 Launch Notes" source="Slack · #launch · 2026-03-02" view="full">
+  [3] We agreed to push launch to March 10.
+  [4] Marketing will be notified next week.
+  [7] …
+  …(all chunks, numbered)
+</document>
+```
+
+`<retrieved_context>` becomes simply "N documents in excerpt view"; a read is
+"one document in full view". This supersedes the standalone `<retrieved_context>`
+renderer decision and confirms the earlier removal of `is_complete`.
+
+### Build order (one step at a time)
+
+1. **Registry merge reducer** — `citation_registry` merges (find-or-create union,
+   re-mint on collision) instead of replacing, so parent/subagent (and parallel)
+   registrations stay globally consistent. Pure; independently testable. ✅
+2. **One document renderer** with a `view` parameter; point `search_knowledge_base`
+   at it (excerpt view), replacing today's `retrieved_context` renderer. ✅
+3. **Register-on-read + full view** — the KB read path registers its chunks and
+   renders through the same renderer (full view); the whole document is returned
+   numbered, with **no `<chunk_index>`**. The `read_file` tool loads the document
+   via `KBPostgresBackend.aload_document`, renders it against the conversation
+   registry, and persists `citation_registry`; `build_document_xml` is deleted. ✅
+4. **Retire Channel C** — now that KB reads emit `[n]` (Step 3), the
+   knowledge_base read/specialist path cites bare `[n]` instead of
+   `[citation:chunk_id]`. The KB subagent prompts (cloud/desktop, full/read-only)
+   and `description_readonly.md` were rewritten to the `<document view="full">`
+   `[n]` format, the `evidence.chunk_ids` field became `evidence.citations`, and
+   `citations/on.md` folds the KB relay into Channel A (preserve `[n]` from a
+   specialist verbatim). Channel C is **narrowed, not deleted**: it still covers
+   `task` specialists that emit `[citation:id]` — today only the deliverables
+   `knowledge_base` tool, which builds its own `<chunk id>` XML and is not yet on
+   the registry/`[n]` spine. Migrating that tool (and then fully deleting
+   Channel C) is a follow-up. ✅
+5. **Delete `kb_matched_chunk_ids`** — with no seek table and no `matched` flag, the
+   search→read highlighting hand-off has no consumer. Removed: the state field
+   (`filesystem_state.py`) and its reducer default (`reducers.py`); the
+   `search_knowledge_base` tool's `_matched_chunk_ids` writer; the dead
+   `KnowledgePriorityMiddleware` writes plus the `matched_chunk_ids` return of
+   `_materialize_priority` (`knowledge_search.py`); and the stale
+   `<chunk_index>` / `matched="true"` / `<chunk id>` rendering prose in the cloud
+   filesystem prompt (`cloud.py`), rewritten to the `<document view="full">` `[n]`
+   read format. The `resolver.py` docstring reference was dropped and the two
+   integration assertions that read the field now assert scope confinement via the
+   rendered `<retrieved_context>` titles. (The retriever-layer `matched_chunk_ids`
+   in `chunks_hybrid_search.py` is a separate output shape and is untouched.) ✅
+6. **Web onto the registry (Channel B → A)** — `web_search` now registers each
+   result as a `WEB_RESULT` (locator `{url}`) and renders a `<web_results>` block
+   of `<document view="excerpt">` blocks with `[n]` labels, returning a
+   `Command(update={messages, citation_registry})` like `search_knowledge_base`.
+   `markers.py` already maps `WEB_RESULT → url`, so `[n]` resolves end-to-end with
+   no frontend change. To enable this, the renderer was generalized: a
+   `RenderablePassage` now carries a generic `locator: dict` (KB fills
+   `{document_id, chunk_id}`; web fills `{url}`) instead of fixed KB fields, and a
+   dedicated **citation-state middleware** declares the `citation_registry` channel
+   for the `research` subagent (which doesn't use the filesystem state). The two
+   duplicate `web_search` implementations were collapsed into the shared
+   `app/agents/chat/shared/tools/web_search.py`; the `research` copy was deleted.
+   Prompts updated: `citations/on.md` drops the web channel (web is now Channel A
+   `[n]`; only the legacy `[citation:id]` specialist relay remains, relabelled
+   Channel B), the research subagent prompt cites `[n]`, the main `web_search`
+   description teaches `<web_results>`/`[n]`, `off.md` suppresses `[n]` too, and
+   stale `<chunk_index>`/`[citation:chunk_id]` references in `dynamic_context` and
+   the grok/openai_codex provider hints were corrected to `[n]`. `scrape_webpage`
+   stays uncited (raw page text, no `[n]`) — a fact from a scrape reports its URL
+   instead. Connectors and chat turns remain unmigrated (future workstreams). ✅
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/knowledge_priority.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/knowledge_priority.py
deleted file mode 100644
index 787dbe402..000000000
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/knowledge_priority.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""KB priority planner: <priority_documents> injection."""
-
-from __future__ import annotations
-
-from langchain_core.language_models import BaseChatModel
-
-from app.agents.chat.multi_agent_chat.shared.filesystem_selection import FilesystemMode
-from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import (
-    KnowledgePriorityMiddleware,
-)
-from app.services.llm_service import get_planner_llm
-
-
-def build_knowledge_priority_mw(
-    *,
-    llm: BaseChatModel,
-    search_space_id: int,
-    filesystem_mode: FilesystemMode,
-    available_connectors: list[str] | None,
-    available_document_types: list[str] | None,
-    mentioned_document_ids: list[int] | None,
-    preinjection_enabled: bool = True,
-) -> KnowledgePriorityMiddleware:
-    """Build the KB priority middleware.
-
-    When ``preinjection_enabled`` is False (the lazy default), the middleware
-    runs in mentions-only mode: it skips the expensive planner LLM + embedding
-    + hybrid search and only surfaces explicit @-mentions. The main agent is
-    expected to pull relevant KB content on demand via the
-    ``search_knowledge_base`` tool instead.
-    """
-    return KnowledgePriorityMiddleware(
-        llm=llm,
-        planner_llm=get_planner_llm(),
-        search_space_id=search_space_id,
-        filesystem_mode=filesystem_mode,
-        available_connectors=available_connectors,
-        available_document_types=available_document_types,
-        mentioned_document_ids=mentioned_document_ids,
-        inject_system_message=False,
-        mentions_only=not preinjection_enabled,
-    )
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/stack.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/stack.py
index 675898d4c..d766367de 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/stack.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/stack.py
@@ -1,10 +1,11 @@
 """Main-agent middleware list assembly: one line per slot.
 
 The main agent is a pure router — filesystem reads/writes are owned by the
-``knowledge_base`` subagent and delegated via the ``task`` tool. The stack
-here only renders KB context (workspace tree + priority docs), projects it
-into system messages, and commits any subagent-side staged writes at end of
-turn (cloud mode).
+``knowledge_base`` subagent and delegated via the ``task`` tool. Knowledge-base
+retrieval is pull-based: the ``search_knowledge_base`` tool runs the hybrid
+search on demand and renders ``<retrieved_context>`` with ``[n]`` citation
+labels. The stack here computes the workspace tree, commits any subagent-side
+staged writes at end of turn (cloud mode), and wires the supporting middleware.
 """
 
 from __future__ import annotations
@@ -33,9 +34,6 @@ from app.agents.chat.multi_agent_chat.shared.middleware.anthropic_cache import (
 from app.agents.chat.multi_agent_chat.shared.middleware.compaction import (
     build_compaction_mw,
 )
-from app.agents.chat.multi_agent_chat.shared.middleware.kb_context_projection import (
-    build_kb_context_projection_mw,
-)
 from app.agents.chat.multi_agent_chat.shared.middleware.patch_tool_calls import (
     build_patch_tool_calls_mw,
 )
@@ -84,7 +82,6 @@ from .context_editing import build_context_editing_mw
 from .dedup_hitl import build_dedup_hitl_mw
 from .doom_loop import build_doom_loop_mw
 from .kb_persistence import build_kb_persistence_mw
-from .knowledge_priority import build_knowledge_priority_mw
 from .knowledge_tree import build_knowledge_tree_mw
 from .noop_injection import build_noop_injection_mw
 from .otel_span import build_otel_mw
@@ -237,16 +234,6 @@ def build_main_agent_deepagent_middleware(
             search_space_id=search_space_id,
             llm=llm,
         ),
-        build_knowledge_priority_mw(
-            llm=llm,
-            search_space_id=search_space_id,
-            filesystem_mode=filesystem_mode,
-            available_connectors=available_connectors,
-            available_document_types=available_document_types,
-            mentioned_document_ids=mentioned_document_ids,
-            preinjection_enabled=flags.enable_kb_priority_preinjection,
-        ),
-        build_kb_context_projection_mw(),
         build_kb_persistence_mw(
             filesystem_mode=filesystem_mode,
             search_space_id=search_space_id,
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/skills/builtin/kb-research/SKILL.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/skills/builtin/kb-research/SKILL.md
index 0f0b5ffbb..5730c3122 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/skills/builtin/kb-research/SKILL.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/skills/builtin/kb-research/SKILL.md
@@ -15,7 +15,7 @@ allowed-tools: scrape_webpage, read_file, ls_tree, grep, web_search
 1. Decompose the user's question into 2-4 specific, citation-worthy sub-questions.
 2. For each sub-question, run **one** targeted KB search (focused on terms the user would have written, not synonyms). Open the most relevant 2-3 documents fully via `read_file` if their excerpts are too short.
 3. Use `grep` to find supporting passages in long files instead of re-reading them end to end.
-4. Cite every claim with `[citation:chunk_id]` exactly as the chunk tag specifies.
+4. Cite every claim with the `[n]` label shown on the passage you used (search results and `read_file` output both carry them); never write a chunk id, URL, or title yourself.
 
 ## What good output looks like
 - Short paragraphs with inline citations.
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md
index 42cb099a6..ce80cf7e2 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/off.md
@@ -1,12 +1,13 @@
 <citations>
 Citation markers are **disabled** in this configuration.
 
-Do NOT include `[citation:…]` markers anywhere, even if tool descriptions or
+Do NOT include `[n]` citation labels or `[citation:…]` markers anywhere, even if
+tool output (`<retrieved_context>`, `<web_results>`), tool descriptions, or
 examples reference them. Ignore citation-format reminders elsewhere in this
 prompt when they conflict with this block.
 
 1. Answer in plain prose. Optional markdown links to public URLs when
    sources are URLs.
 2. Do not expose raw chunk ids, document ids, or internal ids to the user.
-3. Present KB or docs facts naturally without attribution markers.
+3. Present KB, web, or docs facts naturally without attribution markers.
 </citations>
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
index 2abd95d5a..a42873fcb 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
@@ -1,42 +1,16 @@
 <citations>
-Citations reach the answer through two channels. Use whichever applies — and
-never invent ids you didn't see. Citation ids are resolved by exact-match
-lookup; a wrong id silently breaks the link, so when in doubt, omit.
+Cite with one token: the bracket label `[n]`. Every citable result —
+`search_knowledge_base` passages, `web_search` results, and prose from a
+`task` knowledge_base/research specialist — already carries `[n]` labels on a
+single shared count. Those labels are the only citation you write; the server
+resolves each one back to its source after the turn.
 
-### Channel A — chunk blocks injected this turn
-When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
-turn:
-
-1. For each factual statement taken from those chunks, add
-   `[citation:chunk_id]` using the **exact** id from a visible
-   `<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim);
-   do not retype from memory.
-2. `<document_id>` is the parent doc id, **not** a citation source —
-   only ids inside `<chunk id='…'>` count.
-3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
-   each id copied individually).
-4. Never invent, normalise, or guess at adjacent ids; if unsure, omit.
-5. Plain brackets only — no markdown links, no footnote numbering.
-
-### Channel B — citations relayed by a `task` specialist
-A `task(...)` tool message may contain `[citation:<chunk_id>]` markers
-the specialist already attached to its prose. The specialist saw the
-underlying `<chunk id='…'>` blocks; you didn't. So:
-
-1. **Preserve those markers verbatim** in your final answer — do not
-   reformat, renumber, drop, or wrap them in markdown links. When you
-   paraphrase a specialist sentence, copy the marker character-for-
-   character; do not regenerate the id from memory (LLMs reliably
-   corrupt nearby digits).
-2. Keep each marker attached to the sentence the specialist attached
-   it to.
-3. Do **not** add new `[citation:…]` markers of your own to a
-   specialist's prose; if a fact has no marker, the specialist
-   couldn't tie it to a chunk and neither can you.
-4. When a specialist returns JSON, the citation markers live inside
-   the prose-bearing fields (e.g. a summary or excerpt). Pull them
-   along with the surrounding sentence when you quote.
-
-If neither channel surfaces citation markers this turn, do not fabricate
-them.
+1. Put the label right after the claim it supports.
+2. Several sources for one claim: stack brackets, `[1][2]`.
+3. Copy labels exactly as shown, a specialist's included — never renumber them,
+   add your own, or write the underlying title, date, id, or URL instead.
+4. Write the bare `[n]` and nothing else: no `[citation:...]`, no markdown links,
+   no footnote marks, no "References" section.
+5. Only label claims the sources support. If nothing shown backs a claim — or you
+   never saw a label — leave it uncited; never invent one.
 </citations>
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md
index 8f2bfca4e..3dce76981 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/private.md
@@ -11,17 +11,16 @@ your answer, not as the task itself.
 `<priority_documents>` lists the workspace documents most relevant to the
 latest user message, ranked by relevance score, with `[USER-MENTIONED]`
 flagged on anything the user explicitly referenced. When the task is about
-workspace content, read these first; matched passages inside each document
-are flagged via `<chunk_index>` so you can jump straight to them.
+workspace content, read these first.
 
 `<workspace_tree>` shows the full `/documents/` folder and file layout. Use
 it to resolve paths the user describes in natural language ("my Q2 roadmap",
 "last week's meeting notes") into concrete document references before
 delegating to a specialist.
 
-`<document>` and `<chunk id='…'>` blocks are chunked indexed content returned
-by KB search (backing `<priority_documents>`). Each chunk carries a stable
-`id` attribute.
+`<retrieved_context>` blocks hold knowledge-base passages from
+`search_knowledge_base`; each `<document>` inside is in excerpt view and every
+passage is prefixed with an `[n]` citation label.
 
 If a block doesn't appear this turn, work from the conversation alone.
 </dynamic_context>
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md
index a5892c23a..7657af663 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/dynamic_context/team.md
@@ -10,18 +10,16 @@ budget. Treat it as background colour for your answer, not as the task itself.
 `<priority_documents>` lists the workspace documents most relevant to the
 latest user message, ranked by relevance score, with `[USER-MENTIONED]`
 flagged on anything someone in the thread explicitly referenced. When the
-task is about workspace content, read these first; matched passages inside
-each document are flagged via `<chunk_index>` so you can jump straight to
-them.
+task is about workspace content, read these first.
 
 `<workspace_tree>` shows the full `/documents/` folder and file layout. Use
 it to resolve paths described in natural language ("the Q2 roadmap", "last
 week's planning notes") into concrete document references before delegating
 to a specialist.
 
-`<document>` and `<chunk id='…'>` blocks are chunked indexed content returned
-by KB search (backing `<priority_documents>`). Each chunk carries a stable
-`id` attribute.
+`<retrieved_context>` blocks hold knowledge-base passages from
+`search_knowledge_base`; each `<document>` inside is in excerpt view and every
+passage is prefixed with an `[n]` citation label.
 
 If a block doesn't appear this turn, work from the conversation alone.
 </dynamic_context>
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md
index 3219e10d3..3a68fba16 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/grok.md
@@ -8,8 +8,8 @@ Tool discipline:
 - Typically one investigative tool per turn unless several independent read-only queries are clearly needed; don’t repeat identical calls.
 
 Attribution:
-- When citations are **enabled** (see citation block above) and you answer from chunk-tagged documents, use `[citation:chunk_id]` exactly as specified there.
-- When citations are **disabled**, never emit `[citation:…]` — plain prose and links per tool guidance.
+- When citations are **enabled** (see citation block above) and you answer from labelled passages, cite with the bare `[n]` label exactly as specified there.
+- When citations are **disabled**, never emit `[n]` or `[citation:…]` — plain prose and links per tool guidance.
 
 Style:
 - No emojis unless asked; flat lists for short answers.
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md
index aad52f995..79689ab80 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/providers/openai_codex.md
@@ -3,7 +3,7 @@ You are running on an OpenAI Codex-class model (SurfSense **main agent**).
 
 Output style:
 - Concise; don’t paste huge fetch blobs — summarize.
-- When citations are **enabled** and you rely on chunk-tagged docs, references may use `[citation:chunk_id]` per the citation block above; when **disabled**, use prose and URLs only.
+- When citations are **enabled** and you rely on labelled passages, cite with the bare `[n]` label per the citation block above; when **disabled**, use prose and URLs only.
 - Numbered lists work well when the user should reply with a single option index.
 - No emojis; single-level bullets.
 
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
deleted file mode 100644
index d89124990..000000000
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
+++ /dev/null
@@ -1,762 +0,0 @@
-"""
-Knowledge base search tool for the SurfSense agent.
-
-This module provides:
-- Connector constants and normalization
-- Async knowledge base search across multiple connectors
-- Document formatting for LLM context
-"""
-
-import asyncio
-import contextlib
-import json
-import re
-import time
-from datetime import datetime
-from typing import Any
-
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import NATIVE_TO_LEGACY_DOCTYPE, shielded_async_session
-from app.services.connector_service import ConnectorService
-from app.utils.perf import get_perf_logger
-
-# Connectors that call external live-search APIs. These are handled by the
-# ``web_search`` tool and must be excluded from knowledge-base searches.
-_LIVE_SEARCH_CONNECTORS: set[str] = {
-    "TAVILY_API",
-    "LINKUP_API",
-    "BAIDU_SEARCH_API",
-}
-
-# Patterns that indicate the query has no meaningful search signal.
-# plainto_tsquery('english', '*') produces an empty tsquery and an embedding
-# of '*' is random noise, so both keyword and semantic search degrade to
-# arbitrary ordering — large documents (many chunks) dominate by chance.
-_DEGENERATE_QUERY_RE = re.compile(
-    r"^[\s*?_.#@!\-/\\]+$"  # only wildcards, punctuation, whitespace
-)
-
-# Max chunks per document when doing a recency-based browse instead of
-# a real search.  We want breadth (many docs) over depth (many chunks).
-_BROWSE_MAX_CHUNKS_PER_DOC = 5
-
-
-def _is_degenerate_query(query: str) -> bool:
-    """Return True when the query carries no meaningful search signal.
-
-    Catches wildcard patterns (``*``, ``**``), empty / whitespace-only
-    strings, and single-character non-word tokens.  These queries cause
-    both keyword search (empty tsquery) and semantic search (meaningless
-    embedding) to return effectively random results.
-    """
-    stripped = query.strip()
-    if not stripped:
-        return True
-    return bool(_DEGENERATE_QUERY_RE.match(stripped))
-
-
-async def _browse_recent_documents(
-    search_space_id: int,
-    document_type: str | list[str] | None,
-    top_k: int,
-    start_date: datetime | None,
-    end_date: datetime | None,
-) -> list[dict[str, Any]]:
-    """Return the most-recent documents (recency-ordered, no search ranking).
-
-    Used as a fallback when the search query is degenerate (e.g. ``*``) and
-    semantic / keyword search would produce arbitrary results.  Returns
-    document-grouped dicts in the same shape as ``_combined_rrf_search``
-    so the rest of the pipeline works unchanged.
-    """
-    from sqlalchemy import select
-    from sqlalchemy.orm import joinedload
-
-    from app.db import Chunk, Document, DocumentType
-
-    perf = get_perf_logger()
-    t0 = time.perf_counter()
-
-    base_conditions = [Document.search_space_id == search_space_id]
-
-    if document_type is not None:
-        type_list = (
-            document_type if isinstance(document_type, list) else [document_type]
-        )
-        doc_type_enums = []
-        for dt in type_list:
-            if isinstance(dt, str):
-                with contextlib.suppress(KeyError):
-                    doc_type_enums.append(DocumentType[dt])
-            else:
-                doc_type_enums.append(dt)
-        if not doc_type_enums:
-            return []
-        if len(doc_type_enums) == 1:
-            base_conditions.append(Document.document_type == doc_type_enums[0])
-        else:
-            base_conditions.append(Document.document_type.in_(doc_type_enums))
-
-    if start_date is not None:
-        base_conditions.append(Document.updated_at >= start_date)
-    if end_date is not None:
-        base_conditions.append(Document.updated_at <= end_date)
-
-    async with shielded_async_session() as session:
-        doc_query = (
-            select(Document)
-            .options(joinedload(Document.search_space))
-            .where(*base_conditions)
-            .order_by(Document.updated_at.desc())
-            .limit(top_k)
-        )
-        result = await session.execute(doc_query)
-        documents = result.scalars().unique().all()
-
-        if not documents:
-            return []
-
-        doc_ids = [d.id for d in documents]
-
-        chunk_query = (
-            select(Chunk)
-            .where(Chunk.document_id.in_(doc_ids))
-            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
-        )
-        chunk_result = await session.execute(chunk_query)
-        raw_chunks = chunk_result.scalars().all()
-
-    doc_chunk_counts: dict[int, int] = {}
-    doc_chunks: dict[int, list[dict]] = {d.id: [] for d in documents}
-    for chunk in raw_chunks:
-        did = chunk.document_id
-        count = doc_chunk_counts.get(did, 0)
-        if count < _BROWSE_MAX_CHUNKS_PER_DOC:
-            doc_chunks[did].append({"chunk_id": chunk.id, "content": chunk.content})
-            doc_chunk_counts[did] = count + 1
-
-    results: list[dict[str, Any]] = []
-    for doc in documents:
-        chunks_list = doc_chunks.get(doc.id, [])
-        results.append(
-            {
-                "document_id": doc.id,
-                "content": "\n\n".join(
-                    c["content"] for c in chunks_list if c.get("content")
-                ),
-                "score": 0.0,
-                "chunks": chunks_list,
-                "document": {
-                    "id": doc.id,
-                    "title": doc.title,
-                    "document_type": doc.document_type.value
-                    if getattr(doc, "document_type", None)
-                    else None,
-                    "metadata": doc.document_metadata or {},
-                },
-                "source": doc.document_type.value
-                if getattr(doc, "document_type", None)
-                else None,
-            }
-        )
-
-    perf.info(
-        "[kb_browse] recency browse in %.3fs docs=%d space=%d type=%s",
-        time.perf_counter() - t0,
-        len(results),
-        search_space_id,
-        document_type,
-    )
-    return results
-
-
-# =============================================================================
-# Connector Constants and Normalization
-# =============================================================================
-
-# Canonical connector values used internally by ConnectorService
-# Includes all document types and search source connectors
-_ALL_CONNECTORS: list[str] = [
-    "EXTENSION",
-    "FILE",
-    "SLACK_CONNECTOR",
-    "TEAMS_CONNECTOR",
-    "NOTION_CONNECTOR",
-    "YOUTUBE_VIDEO",
-    "GITHUB_CONNECTOR",
-    "ELASTICSEARCH_CONNECTOR",
-    "LINEAR_CONNECTOR",
-    "JIRA_CONNECTOR",
-    "CONFLUENCE_CONNECTOR",
-    "CLICKUP_CONNECTOR",
-    "GOOGLE_CALENDAR_CONNECTOR",
-    "GOOGLE_GMAIL_CONNECTOR",
-    "GOOGLE_DRIVE_FILE",
-    "DISCORD_CONNECTOR",
-    "AIRTABLE_CONNECTOR",
-    "LUMA_CONNECTOR",
-    "NOTE",
-    "BOOKSTACK_CONNECTOR",
-    "CRAWLED_URL",
-    "CIRCLEBACK",
-    "OBSIDIAN_CONNECTOR",
-    "ONEDRIVE_FILE",
-    "DROPBOX_FILE",
-]
-
-# Human-readable descriptions for each connector type
-# Used for generating dynamic docstrings and informing the LLM
-CONNECTOR_DESCRIPTIONS: dict[str, str] = {
-    "EXTENSION": "Web content saved via SurfSense browser extension (personal browsing history)",
-    "FILE": "User-uploaded documents (PDFs, Word, etc.) (personal files)",
-    "NOTE": "SurfSense Notes (notes created inside SurfSense)",
-    "SLACK_CONNECTOR": "Slack conversations and shared content (personal workspace communications)",
-    "TEAMS_CONNECTOR": "Microsoft Teams messages and conversations (personal Teams communications)",
-    "NOTION_CONNECTOR": "Notion workspace pages and databases (personal knowledge management)",
-    "YOUTUBE_VIDEO": "YouTube video transcripts and metadata (personally saved videos)",
-    "GITHUB_CONNECTOR": "GitHub repository content and issues (personal repositories and interactions)",
-    "ELASTICSEARCH_CONNECTOR": "Elasticsearch indexed documents and data (personal Elasticsearch instances)",
-    "LINEAR_CONNECTOR": "Linear project issues and discussions (personal project management)",
-    "JIRA_CONNECTOR": "Jira project issues, tickets, and comments (personal project tracking)",
-    "CONFLUENCE_CONNECTOR": "Confluence pages and comments (personal project documentation)",
-    "CLICKUP_CONNECTOR": "ClickUp tasks and project data (personal task management)",
-    "GOOGLE_CALENDAR_CONNECTOR": "Google Calendar events, meetings, and schedules (personal calendar)",
-    "GOOGLE_GMAIL_CONNECTOR": "Google Gmail emails and conversations (personal emails)",
-    "GOOGLE_DRIVE_FILE": "Google Drive files and documents (personal cloud storage)",
-    "DISCORD_CONNECTOR": "Discord server conversations and shared content (personal community)",
-    "AIRTABLE_CONNECTOR": "Airtable records, tables, and database content (personal data)",
-    "LUMA_CONNECTOR": "Luma events and meetings",
-    "WEBCRAWLER_CONNECTOR": "Webpages indexed by SurfSense (personally selected websites)",
-    "CRAWLED_URL": "Webpages indexed by SurfSense (personally selected websites)",
-    "BOOKSTACK_CONNECTOR": "BookStack pages (personal documentation)",
-    "CIRCLEBACK": "Circleback meeting notes, transcripts, and action items",
-    "OBSIDIAN_CONNECTOR": "Obsidian vault notes and markdown files (personal notes)",
-    "ONEDRIVE_FILE": "Microsoft OneDrive files and documents (personal cloud storage)",
-    "DROPBOX_FILE": "Dropbox files and documents (cloud storage)",
-}
-
-
-def _normalize_connectors(
-    connectors_to_search: list[str] | None,
-    available_connectors: list[str] | None = None,
-) -> list[str]:
-    """Normalize model-supplied connectors to canonical ConnectorService types.
-
-    Maps user-facing aliases (e.g. WEBCRAWLER_CONNECTOR), drops unknowns, and
-    constrains to ``available_connectors`` when given. Empty input defaults to
-    all available connectors (minus live-search ones).
-    """
-    valid_set = (
-        set(available_connectors) if available_connectors else set(_ALL_CONNECTORS)
-    )
-    valid_set -= _LIVE_SEARCH_CONNECTORS
-
-    if not connectors_to_search:
-        base = (
-            list(available_connectors)
-            if available_connectors
-            else list(_ALL_CONNECTORS)
-        )
-        return [c for c in base if c not in _LIVE_SEARCH_CONNECTORS]
-
-    normalized: list[str] = []
-    for raw in connectors_to_search:
-        c = (raw or "").strip().upper()
-        if not c:
-            continue
-        if c == "WEBCRAWLER_CONNECTOR":
-            c = "CRAWLED_URL"
-        normalized.append(c)
-
-    # De-dupe (order-preserving), keeping only known + available connectors.
-    seen: set[str] = set()
-    out: list[str] = []
-    for c in normalized:
-        if c in seen:
-            continue
-        if c not in _ALL_CONNECTORS:
-            continue
-        if c not in valid_set:
-            continue
-        seen.add(c)
-        out.append(c)
-
-    # Nothing matched: fall back to all available.
-    if not out:
-        base = (
-            list(available_connectors)
-            if available_connectors
-            else list(_ALL_CONNECTORS)
-        )
-        return [c for c in base if c not in _LIVE_SEARCH_CONNECTORS]
-    return out
-
-
-# =============================================================================
-# Document Formatting
-# =============================================================================
-
-
-# Fraction of the model's context window (in characters) that a single tool
-# result is allowed to occupy.  The remainder is reserved for system prompt,
-# conversation history, and model output.  With ~4 chars/token this gives a
-# tool result ≈ 25 % of the context budget in tokens.
-_TOOL_OUTPUT_CONTEXT_FRACTION = 0.25
-_CHARS_PER_TOKEN = 4
-
-# Hard-floor / ceiling so the budget is always sensible regardless of what
-# the model reports.
-_MIN_TOOL_OUTPUT_CHARS = 20_000  # ~5K tokens
-_MAX_TOOL_OUTPUT_CHARS = 200_000  # ~50K tokens
-_MAX_CHUNK_CHARS = 8_000
-
-# Rank-adaptive per-document budget allocation.
-# Top-ranked (most relevant) documents get a larger share of the budget so
-# we pack as much high-quality context as possible.
-#
-#   fraction(rank) = _TOP_DOC_BUDGET_FRACTION / (1 + rank * _RANK_DECAY)
-#
-# Examples (128K budget, 8K chunk cap):
-#   rank 0 → 40% → 6 chunks   |  rank 3 → 19% → 3 chunks
-#   rank 1 → 30% → 4 chunks   |  rank 10 → 10% → 3 chunks (floor)
-#   rank 2 → 24% → 3 chunks   |
-_TOP_DOC_BUDGET_FRACTION = 0.40
-_RANK_DECAY = 0.35
-_MIN_CHUNKS_PER_DOC = 3
-
-
-def _compute_tool_output_budget(max_input_tokens: int | None) -> int:
-    """Derive a character budget from the model's context window.
-
-    Uses ``litellm.get_model_info`` via the value already resolved by
-    ``ChatLiteLLMRouter`` / ``ChatLiteLLM`` and passed through the dependency
-    chain as ``max_input_tokens``.  Falls back to a conservative default when
-    the value is unavailable.
-    """
-    if max_input_tokens is None or max_input_tokens <= 0:
-        return _MIN_TOOL_OUTPUT_CHARS  # conservative fallback
-
-    budget = int(max_input_tokens * _CHARS_PER_TOKEN * _TOOL_OUTPUT_CONTEXT_FRACTION)
-    return max(_MIN_TOOL_OUTPUT_CHARS, min(budget, _MAX_TOOL_OUTPUT_CHARS))
-
-
-_INTERNAL_METADATA_KEYS: frozenset[str] = frozenset(
-    {
-        "message_id",
-        "thread_id",
-        "event_id",
-        "calendar_id",
-        "google_drive_file_id",
-        "onedrive_file_id",
-        "dropbox_file_id",
-        "page_id",
-        "issue_id",
-        "connector_id",
-    }
-)
-
-
-def format_documents_for_context(
-    documents: list[dict[str, Any]],
-    *,
-    max_chars: int = _MAX_TOOL_OUTPUT_CHARS,
-    max_chunk_chars: int = _MAX_CHUNK_CHARS,
-    max_chunks_per_doc: int = 0,
-) -> str:
-    """Format retrieved documents into an XML context string for the LLM.
-
-    Documents are emitted highest-relevance first until ``max_chars`` is hit.
-    ``max_chunks_per_doc=0`` auto-computes a rank-adaptive cap so top results get
-    more chunks and no single large document monopolizes the budget.
-    """
-    if not documents:
-        return ""
-
-    # Group chunks by document id, preserving chunk_id so [citation:123] works.
-    # ConnectorService returns document-grouped results ({document, chunks, source}).
-    grouped: dict[str, dict[str, Any]] = {}
-
-    for doc in documents:
-        document_info = (doc.get("document") or {}) if isinstance(doc, dict) else {}
-        metadata = (
-            (document_info.get("metadata") or {})
-            if isinstance(document_info, dict)
-            else {}
-        )
-        if not metadata and isinstance(doc, dict):
-            # Some result shapes may place metadata at the top level.
-            metadata = doc.get("metadata") or {}
-
-        source = (
-            (doc.get("source") if isinstance(doc, dict) else None)
-            or document_info.get("document_type")
-            or metadata.get("document_type")
-            or "UNKNOWN"
-        )
-
-        # Identity: prefer document_id, else type+title+url.
-        document_id_val = document_info.get("id")
-        title = (
-            document_info.get("title") or metadata.get("title") or "Untitled Document"
-        )
-        url = (
-            metadata.get("url")
-            or metadata.get("source")
-            or metadata.get("page_url")
-            or ""
-        )
-
-        doc_key = (
-            str(document_id_val)
-            if document_id_val is not None
-            else f"{source}::{title}::{url}"
-        )
-
-        if doc_key not in grouped:
-            grouped[doc_key] = {
-                "document_id": document_id_val
-                if document_id_val is not None
-                else doc_key,
-                "document_type": metadata.get("document_type") or source,
-                "title": title,
-                "url": url,
-                "metadata": metadata,
-                "chunks": [],
-            }
-
-        # Prefer document-grouped chunks when present.
-        chunks_list = doc.get("chunks") if isinstance(doc, dict) else None
-        if isinstance(chunks_list, list) and chunks_list:
-            for ch in chunks_list:
-                if not isinstance(ch, dict):
-                    continue
-                chunk_id = ch.get("chunk_id") or ch.get("id")
-                content = (ch.get("content") or "").strip()
-                if not content:
-                    continue
-                grouped[doc_key]["chunks"].append(
-                    {"chunk_id": chunk_id, "content": content}
-                )
-            continue
-
-        # Fallback: treat this as a flat chunk-like object
-        if not isinstance(doc, dict):
-            continue
-        chunk_id = doc.get("chunk_id") or doc.get("id")
-        content = (doc.get("content") or "").strip()
-        if not content:
-            continue
-        grouped[doc_key]["chunks"].append({"chunk_id": chunk_id, "content": content})
-
-    # Live search connectors whose results should be cited by URL rather than
-    # a numeric chunk_id (the numeric IDs are meaningless auto-incremented counters).
-    live_search_connectors = {
-        "TAVILY_API",
-        "LINKUP_API",
-        "BAIDU_SEARCH_API",
-    }
-
-    parts: list[str] = []
-    total_chars = 0
-    total_docs = len(grouped)
-
-    for doc_idx, g in enumerate(grouped.values()):
-        metadata_clean = {
-            k: v for k, v in g["metadata"].items() if k not in _INTERNAL_METADATA_KEYS
-        }
-        metadata_json = json.dumps(metadata_clean, ensure_ascii=False)
-        is_live_search = g["document_type"] in live_search_connectors
-
-        doc_lines: list[str] = [
-            "<document>",
-            "<document_metadata>",
-            f"  <document_id>{g['document_id']}</document_id>",
-            f"  <document_type>{g['document_type']}</document_type>",
-            f"  <title><![CDATA[{g['title']}]]></title>",
-            f"  <url><![CDATA[{g['url']}]]></url>",
-            f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
-            "</document_metadata>",
-            "",
-            "<document_content>",
-        ]
-
-        # Rank-adaptive per-document chunk cap: top results get more chunks.
-        if max_chunks_per_doc > 0:
-            chunks_allowed = max_chunks_per_doc
-        else:
-            doc_fraction = _TOP_DOC_BUDGET_FRACTION / (1 + doc_idx * _RANK_DECAY)
-            max_doc_chars = int(max_chars * doc_fraction)
-            xml_overhead = 500
-            chunks_allowed = max(
-                (max_doc_chars - xml_overhead) // max(max_chunk_chars, 1),
-                _MIN_CHUNKS_PER_DOC,
-            )
-
-        chunks = g["chunks"]
-        if len(chunks) > chunks_allowed:
-            chunks = chunks[:chunks_allowed]
-
-        for ch in chunks:
-            ch_content = ch["content"]
-            if max_chunk_chars and len(ch_content) > max_chunk_chars:
-                ch_content = ch_content[:max_chunk_chars] + "\n...(truncated)"
-            ch_id = g["url"] if (is_live_search and g["url"]) else ch["chunk_id"]
-            if ch_id is None:
-                doc_lines.append(f"  <chunk><![CDATA[{ch_content}]]></chunk>")
-            else:
-                doc_lines.append(
-                    f"  <chunk id='{ch_id}'><![CDATA[{ch_content}]]></chunk>"
-                )
-
-        doc_lines.extend(["</document_content>", "</document>", ""])
-
-        doc_xml = "\n".join(doc_lines)
-        doc_len = len(doc_xml)
-
-        if total_chars + doc_len > max_chars:
-            remaining = total_docs - doc_idx
-            if doc_idx == 0:
-                parts.append(doc_xml)
-                total_chars += doc_len
-            parts.append(
-                f"<!-- Output truncated: {remaining} more document(s) omitted "
-                f"(budget {max_chars} chars). Refine your query or reduce top_k "
-                f"to retrieve different results. -->"
-            )
-            break
-
-        parts.append(doc_xml)
-        total_chars += doc_len
-
-    result = "\n".join(parts).strip()
-
-    # Hard safety net: if the result is still over budget (e.g. a single massive
-    # first document), forcibly truncate with a closing comment.
-    if len(result) > max_chars:
-        truncation_msg = "\n<!-- ...output forcibly truncated to fit context window -->"
-        result = result[: max_chars - len(truncation_msg)] + truncation_msg
-
-    return result
-
-
-# =============================================================================
-# Knowledge Base Search
-# =============================================================================
-
-
-async def search_knowledge_base_async(
-    query: str,
-    search_space_id: int,
-    db_session: AsyncSession,
-    connector_service: ConnectorService,
-    connectors_to_search: list[str] | None = None,
-    top_k: int = 10,
-    start_date: datetime | None = None,
-    end_date: datetime | None = None,
-    available_connectors: list[str] | None = None,
-    available_document_types: list[str] | None = None,
-    max_input_tokens: int | None = None,
-) -> str:
-    """Search the knowledge base across connectors and return formatted results.
-
-    ``available_document_types`` lets local connectors with no indexed data be
-    skipped (no embedding / DB round-trip), and ``max_input_tokens`` sizes the
-    output to the model's context window.
-    """
-    perf = get_perf_logger()
-    t0 = time.perf_counter()
-
-    deduplicated = await search_knowledge_base_raw_async(
-        query=query,
-        search_space_id=search_space_id,
-        db_session=db_session,
-        connector_service=connector_service,
-        connectors_to_search=connectors_to_search,
-        top_k=top_k,
-        start_date=start_date,
-        end_date=end_date,
-        available_connectors=available_connectors,
-        available_document_types=available_document_types,
-    )
-
-    if not deduplicated:
-        return "No documents found in the knowledge base. The search space has no indexed content yet."
-
-    # Use browse chunk cap for degenerate queries, otherwise adaptive chunking.
-    max_chunks_per_doc = (
-        _BROWSE_MAX_CHUNKS_PER_DOC if _is_degenerate_query(query) else 0
-    )
-    output_budget = _compute_tool_output_budget(max_input_tokens)
-    result = format_documents_for_context(
-        deduplicated,
-        max_chars=output_budget,
-        max_chunks_per_doc=max_chunks_per_doc,
-    )
-
-    if len(result) > output_budget:
-        perf.warning(
-            "[kb_search] output STILL exceeds budget after format (%d > %d), "
-            "hard truncation should have fired",
-            len(result),
-            output_budget,
-        )
-
-    perf.info(
-        "[kb_search] TOTAL in %.3fs total_docs=%d deduped=%d output_chars=%d "
-        "budget=%d max_input_tokens=%s space=%d",
-        time.perf_counter() - t0,
-        len(deduplicated),
-        len(deduplicated),
-        len(result),
-        output_budget,
-        max_input_tokens,
-        search_space_id,
-    )
-    return result
-
-
-async def search_knowledge_base_raw_async(
-    query: str,
-    search_space_id: int,
-    db_session: AsyncSession,
-    connector_service: ConnectorService,
-    connectors_to_search: list[str] | None = None,
-    top_k: int = 10,
-    start_date: datetime | None = None,
-    end_date: datetime | None = None,
-    available_connectors: list[str] | None = None,
-    available_document_types: list[str] | None = None,
-    query_embedding: list[float] | None = None,
-) -> list[dict[str, Any]]:
-    """Search knowledge base and return raw document dicts (no XML formatting)."""
-    perf = get_perf_logger()
-    t0 = time.perf_counter()
-    all_documents: list[dict[str, Any]] = []
-
-    # Preserve the public signature for compatibility even if values are unused.
-    _ = (db_session, connector_service)
-
-    from app.agents.chat.multi_agent_chat.shared.date_filters import resolve_date_range
-
-    resolved_start_date, resolved_end_date = resolve_date_range(
-        start_date=start_date,
-        end_date=end_date,
-    )
-
-    connectors = _normalize_connectors(connectors_to_search, available_connectors)
-
-    if available_document_types:
-        doc_types_set = set(available_document_types)
-        connectors = [
-            c
-            for c in connectors
-            if c in doc_types_set
-            or NATIVE_TO_LEGACY_DOCTYPE.get(c, "") in doc_types_set
-        ]
-
-    if not connectors:
-        return []
-
-    if _is_degenerate_query(query):
-        perf.info(
-            "[kb_search_raw] degenerate query %r detected - recency browse",
-            query,
-        )
-        browse_connectors = connectors if connectors else [None]  # type: ignore[list-item]
-        expanded_browse = []
-        for connector in browse_connectors:
-            if connector is not None and connector in NATIVE_TO_LEGACY_DOCTYPE:
-                expanded_browse.append([connector, NATIVE_TO_LEGACY_DOCTYPE[connector]])
-            else:
-                expanded_browse.append(connector)
-        browse_results = await asyncio.gather(
-            *[
-                _browse_recent_documents(
-                    search_space_id=search_space_id,
-                    document_type=connector,
-                    top_k=top_k,
-                    start_date=resolved_start_date,
-                    end_date=resolved_end_date,
-                )
-                for connector in expanded_browse
-            ]
-        )
-        for docs in browse_results:
-            all_documents.extend(docs)
-    else:
-        if query_embedding is None:
-            from app.config import config as app_config
-
-            query_embedding = app_config.embedding_model_instance.embed(query)
-
-        max_parallel_searches = 4
-        semaphore = asyncio.Semaphore(max_parallel_searches)
-
-        async def _search_one_connector(connector: str) -> list[dict[str, Any]]:
-            try:
-                async with semaphore, shielded_async_session() as isolated_session:
-                    svc = ConnectorService(isolated_session, search_space_id)
-                    return await svc._combined_rrf_search(
-                        query_text=query,
-                        search_space_id=search_space_id,
-                        document_type=connector,
-                        top_k=top_k,
-                        start_date=resolved_start_date,
-                        end_date=resolved_end_date,
-                        query_embedding=query_embedding,
-                    )
-            except Exception as exc:
-                perf.warning("[kb_search_raw] connector=%s FAILED: %s", connector, exc)
-                return []
-
-        connector_results = await asyncio.gather(
-            *[_search_one_connector(connector) for connector in connectors]
-        )
-        for docs in connector_results:
-            all_documents.extend(docs)
-
-    seen_doc_ids: set[Any] = set()
-    seen_content_hashes: set[int] = set()
-    deduplicated: list[dict[str, Any]] = []
-
-    def _content_fingerprint(document: dict[str, Any]) -> int | None:
-        chunks = document.get("chunks")
-        if isinstance(chunks, list):
-            chunk_texts = []
-            for chunk in chunks:
-                if not isinstance(chunk, dict):
-                    continue
-                chunk_content = (chunk.get("content") or "").strip()
-                if chunk_content:
-                    chunk_texts.append(chunk_content)
-            if chunk_texts:
-                return hash("||".join(chunk_texts))
-        flat_content = (document.get("content") or "").strip()
-        if flat_content:
-            return hash(flat_content)
-        return None
-
-    for doc in all_documents:
-        doc_id = (doc.get("document", {}) or {}).get("id")
-        if doc_id is not None:
-            if doc_id in seen_doc_ids:
-                continue
-            seen_doc_ids.add(doc_id)
-            deduplicated.append(doc)
-            continue
-        content_hash = _content_fingerprint(doc)
-        if content_hash is not None and content_hash in seen_content_hashes:
-            continue
-        if content_hash is not None:
-            seen_content_hashes.add(content_hash)
-        deduplicated.append(doc)
-
-    deduplicated.sort(key=lambda doc: doc.get("score", 0), reverse=True)
-    perf.info(
-        "[kb_search_raw] done in %.3fs total=%d deduped=%d",
-        time.perf_counter() - t0,
-        len(all_documents),
-        len(deduplicated),
-    )
-    return deduplicated
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/report.py b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/report.py
index ea831b891..c80a2a565 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/report.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/report.py
@@ -23,6 +23,45 @@ from app.services.llm_service import get_agent_llm
 
 logger = logging.getLogger(__name__)
 
+
+def _report_search_types(
+    available_connectors: list[str] | None,
+    available_document_types: list[str] | None,
+) -> tuple[str, ...] | None:
+    """Build the document-type scope for the shared KB search.
+
+    ``None`` means "search every indexed type"; a tuple narrows the scope to the
+    connectors/document types the search space actually has.
+    """
+    types: set[str] = set()
+    if available_document_types:
+        types.update(available_document_types)
+    if available_connectors:
+        types.update(available_connectors)
+    return tuple(sorted(types)) or None
+
+
+def _render_kb_hits_for_report(hits: list[Any]) -> str:
+    """Render KB hits as plain titled source text for the report writer.
+
+    Citations are intentionally omitted from reports for now, so no ``[n]``
+    labels or chunk ids are emitted — just titled document content for grounding.
+    """
+    from app.agents.chat.multi_agent_chat.shared.document_render import source_label
+
+    blocks: list[str] = []
+    for hit in hits:
+        label = source_label(hit.document_type, hit.metadata)
+        header = f"{hit.title} ({label})" if label else hit.title
+        body = "\n\n".join(
+            chunk.content.strip() for chunk in hit.chunks if chunk.content.strip()
+        )
+        if not body:
+            continue
+        blocks.append(f"## {header}\n\n{body}")
+    return "\n\n".join(blocks)
+
+
 # ─── Shared Formatting Rules ────────────────────────────────────────────────
 # Reusable formatting instructions appended to section-level and review prompts.
 
@@ -788,31 +827,46 @@ def create_generate_report_tool(
                     f"{query_count} queries: {search_queries[:5]}"
                 )
                 try:
-                    from .knowledge_base import search_knowledge_base_async
+                    from app.agents.chat.multi_agent_chat.shared.retrieval.hybrid_search import (
+                        search_chunks,
+                    )
+                    from app.agents.chat.multi_agent_chat.shared.retrieval.models import (
+                        DocumentHit,
+                        SearchScope,
+                    )
+
+                    scope = SearchScope(
+                        document_types=_report_search_types(
+                            available_connectors, available_document_types
+                        )
+                    )
 
                     # Each query gets its own short-lived session.
-                    async def _run_single_query(q: str) -> str:
+                    async def _run_single_query(q: str) -> list[DocumentHit]:
                         async with shielded_async_session() as kb_session:
-                            kb_connector_svc = ConnectorService(
-                                kb_session, search_space_id
-                            )
-                            return await search_knowledge_base_async(
-                                query=q,
+                            return await search_chunks(
+                                kb_session,
                                 search_space_id=search_space_id,
-                                db_session=kb_session,
-                                connector_service=kb_connector_svc,
+                                query=q,
+                                scope=scope,
                                 top_k=10,
-                                available_connectors=available_connectors,
-                                available_document_types=available_document_types,
                             )
 
-                    kb_results = await asyncio.gather(
+                    hits_per_query = await asyncio.gather(
                         *[_run_single_query(q) for q in search_queries[:5]]
                     )
 
-                    kb_text_parts = [r for r in kb_results if r and r.strip()]
-                    if kb_text_parts:
-                        kb_combined = "\n\n---\n\n".join(kb_text_parts)
+                    seen_doc_ids: set[int] = set()
+                    merged_hits: list[DocumentHit] = []
+                    for hits in hits_per_query:
+                        for hit in hits:
+                            if hit.document_id in seen_doc_ids:
+                                continue
+                            seen_doc_ids.add(hit.document_id)
+                            merged_hits.append(hit)
+
+                    kb_combined = _render_kb_hits_for_report(merged_hits)
+                    if kb_combined.strip():
                         if effective_source.strip():
                             effective_source = (
                                 effective_source
@@ -822,20 +876,17 @@ def create_generate_report_tool(
                         else:
                             effective_source = kb_combined
 
-                        # Count docs found (rough: count <document> tags)
-                        doc_count = kb_combined.count("<document>")
+                        doc_count = len(merged_hits)
                         dispatch_custom_event(
                             "report_progress",
                             {
                                 "phase": "kb_search_done",
-                                "message": f"Found {doc_count} relevant documents"
-                                if doc_count
-                                else f"Found results from {len(kb_text_parts)} queries",
+                                "message": f"Found {doc_count} relevant documents",
                             },
                         )
                         logger.info(
                             f"[generate_report] KB search added ~{len(kb_combined)} chars "
-                            f"from {len(kb_text_parts)} queries"
+                            f"from {doc_count} documents"
                         )
                     else:
                         dispatch_custom_event(
diff --git a/surfsense_backend/app/prompts/system_prompt_composer/base/citation_contract.md b/surfsense_backend/app/prompts/system_prompt_composer/base/citation_contract.md
deleted file mode 100644
index ba32addb6..000000000
--- a/surfsense_backend/app/prompts/system_prompt_composer/base/citation_contract.md
+++ /dev/null
@@ -1,43 +0,0 @@
-<citation_instructions>
-You can cite the sources shown to you. Cited material arrives in labeled blocks
-such as <retrieved_context> (and some tool results). Inside them, every passage
-begins with a bracketed number — that number is its citation label: [1], [2], [3].
-
-How to cite:
-- When a statement relies on a passage, put that passage's label right after the
-  statement: "We pushed the launch to March 10 [1]."
-- For several sources behind one statement, write each label in its own brackets
-  with nothing between them — [1][2]. Never merge them as [1, 2] and never use a
-  range like [1-3].
-- Put the label at the end of the clause or sentence it supports.
-
-Rules:
-- Cite ONLY labels that were shown to you. The bracketed number is the single
-  thing you copy — never cite a title, a date, "chunk 4 of 19", a document id, or
-  a URL.
-- Never invent a label and never renumber. If nothing shown supports a claim,
-  write it without a citation instead of guessing.
-- Attribute only claims drawn from the provided sources; leave your own general
-  knowledge uncited.
-- Plain square brackets only. No parentheses around them, no links or markdown
-  links like [1](http://...), no footnote marks like ¹.
-- Do not add a "References" or "Sources" section; citations stay inline.
-
-Example of context you might receive:
-<retrieved_context>
-Document: "Q3 Launch Notes"  (Slack · #launch · 2026-03-02)
-  [1] We agreed to push the launch to March 10.
-  [2] Marketing will be notified next week.
-Document: "Release Timeline"  (Notion · 2026-02-28)
-  [3] Dates floated were March 10 and March 17.
-</retrieved_context>
-
-Correct:
-The launch moved to March 10 [1][3], and marketing is told next week [2].
-
-Incorrect — do not produce any of these:
-- The launch moved to March 10 [1, 3].          (merged brackets)
-- The launch moved to March 10 ([1]).            (parentheses)
-- The launch moved to March 10 [citation:1].     (you never write this form)
-- The launch moved to March 10 [4].              (label was never shown)
-</citation_instructions>
diff --git a/surfsense_backend/app/prompts/system_prompt_composer/base/citations_off.md b/surfsense_backend/app/prompts/system_prompt_composer/base/citations_off.md
index 8288886e9..d8857adc3 100644
--- a/surfsense_backend/app/prompts/system_prompt_composer/base/citations_off.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/base/citations_off.md
@@ -1,16 +1,13 @@
 <citation_instructions>
-IMPORTANT: Citations are DISABLED for this configuration.
+Citation markers are **disabled** in this configuration.
 
-DO NOT include any citations in your responses. Specifically:
-1. Do NOT use the [citation:chunk_id] format anywhere in your response.
-2. Do NOT reference document IDs, chunk IDs, or source IDs.
-3. Simply provide the information naturally without any citation markers.
-4. Write your response as if you're having a normal conversation, incorporating the information from your knowledge seamlessly.
+Do NOT include `[n]` citation labels or `[citation:…]` markers anywhere, even if
+tool output (`<retrieved_context>`) or examples reference them. Ignore
+citation-format reminders elsewhere in this prompt when they conflict with this
+block.
 
-When answering questions based on documents from the knowledge base:
-- Present the information directly and confidently
-- Do not mention that information comes from specific documents or chunks
-- Integrate facts naturally into your response without attribution markers
-
-Your goal is to provide helpful, informative answers in a clean, readable format without any citation notation.
+1. Answer in plain prose. Optional markdown links to public URLs when sources
+   are URLs.
+2. Do not expose raw chunk ids, document ids, or internal ids to the user.
+3. Present knowledge-base or web facts naturally without attribution markers.
 </citation_instructions>
diff --git a/surfsense_backend/app/prompts/system_prompt_composer/base/citations_on.md b/surfsense_backend/app/prompts/system_prompt_composer/base/citations_on.md
index 3562ce66e..85a8e1355 100644
--- a/surfsense_backend/app/prompts/system_prompt_composer/base/citations_on.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/base/citations_on.md
@@ -1,89 +1,16 @@
 <citation_instructions>
-CRITICAL CITATION REQUIREMENTS:
+Cite with one token: the bracket label `[n]`. Cited material arrives in labeled
+blocks such as `<retrieved_context>` (and some tool results); inside them every
+passage begins with its `[n]` label on a single shared count. Those labels are
+the only citation you write; the server resolves each one back to its source
+after the turn.
 
-1. For EVERY piece of information you include from the documents, add a citation in the format [citation:chunk_id] where chunk_id is the exact value from the `<chunk id='...'>` tag inside `<document_content>`.
-2. Make sure ALL factual statements from the documents have proper citations.
-3. If multiple chunks support the same point, include all relevant citations [citation:chunk_id1], [citation:chunk_id2].
-4. You MUST use the exact chunk_id values from the `<chunk id='...'>` attributes. Do not create your own citation numbers.
-5. Every citation MUST be in the format [citation:chunk_id] where chunk_id is the exact chunk id value.
-6. Never modify or change the chunk_id - always use the original values exactly as provided in the chunk tags.
-7. Do not return citations as clickable links.
-8. Never format citations as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only.
-9. Citations must ONLY appear as [citation:chunk_id] or [citation:chunk_id1], [citation:chunk_id2] format - never with parentheses, hyperlinks, or other formatting.
-10. Never make up chunk IDs. Only use chunk_id values that are explicitly provided in the `<chunk id='...'>` tags.
-11. If you are unsure about a chunk_id, do not include a citation rather than guessing or making one up.
-
-<document_structure_example>
-The documents you receive are structured like this:
-
-**Knowledge base documents (numeric chunk IDs):**
-<document>
-<document_metadata>
-  <document_id>42</document_id>
-  <document_type>GITHUB_CONNECTOR</document_type>
-  <title><![CDATA[Some repo / file / issue title]]></title>
-  <url><![CDATA[https://example.com]]></url>
-  <metadata_json><![CDATA[{{"any":"other metadata"}}]]></metadata_json>
-</document_metadata>
-
-<document_content>
-  <chunk id='123'><![CDATA[First chunk text...]]></chunk>
-  <chunk id='124'><![CDATA[Second chunk text...]]></chunk>
-</document_content>
-</document>
-
-**Web search results (URL chunk IDs):**
-<document>
-<document_metadata>
-  <document_type>WEB_SEARCH</document_type>
-  <title><![CDATA[Some web search result]]></title>
-  <url><![CDATA[https://example.com/article]]></url>
-</document_metadata>
-
-<document_content>
-  <chunk id='https://example.com/article'><![CDATA[Content from web search...]]></chunk>
-</document_content>
-</document>
-
-IMPORTANT: You MUST cite using the EXACT chunk ids from the `<chunk id='...'>` tags.
-- For knowledge base documents, chunk ids are numeric (e.g. 123, 124) or prefixed (e.g. doc-45).
-- For live web search results, chunk ids are URLs (e.g. https://example.com/article).
-Do NOT cite document_id. Always use the chunk id.
-</document_structure_example>
-
-<citation_format>
-- Every fact from the documents must have a citation in the format [citation:chunk_id] where chunk_id is the EXACT id value from a `<chunk id='...'>` tag
-- Citations should appear at the end of the sentence containing the information they support
-- Multiple citations should be separated by commas: [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3]
-- No need to return references section. Just citations in answer.
-- NEVER create your own citation format - use the exact chunk_id values from the documents in the [citation:chunk_id] format
-- NEVER format citations as clickable links or as markdown links like "([citation:5](https://example.com))". Always use plain square brackets only
-- NEVER make up chunk IDs if you are unsure about the chunk_id. It is better to omit the citation than to guess
-- Copy the EXACT chunk id from the XML - if it says `<chunk id='5'>`, use [citation:5]
-- If the chunk id is a URL like `<chunk id='https://example.com/page'>`, use [citation:https://example.com/page]
-</citation_format>
-
-<citation_examples>
-CORRECT citation formats:
-- [citation:5] (numeric chunk ID from knowledge base)
-- [citation:https://example.com/article] (URL chunk ID from web search results)
-- [citation:chunk_id1], [citation:chunk_id2], [citation:chunk_id3] (multiple citations)
-
-INCORRECT citation formats (DO NOT use):
-- Using parentheses and markdown links: ([citation:5](https://github.com/MODSetter/SurfSense))
-- Using parentheses around brackets: ([citation:5])
-- Using hyperlinked text: [link to source 5](https://example.com)
-- Using footnote style: ... library¹
-- Making up source IDs when source_id is unknown
-- Using old IEEE format: [1], [2], [3]
-- Using source types instead of IDs: [citation:GITHUB_CONNECTOR] instead of [citation:5]
-</citation_examples>
-
-<citation_output_example>
-Based on your GitHub repositories and video content, Python's asyncio library provides tools for writing concurrent code using the async/await syntax [citation:5]. It's particularly useful for I/O-bound and high-level structured network code [citation:5].
-
-According to web search results, the key advantage of asyncio is that it can improve performance by allowing other code to run while waiting for I/O operations to complete [citation:https://docs.python.org/3/library/asyncio.html]. This makes it excellent for scenarios like web scraping, API calls, database operations, or any situation where your program spends time waiting for external resources.
-
-However, from your video learning, it's important to note that asyncio is not suitable for CPU-bound tasks as it runs on a single thread [citation:12]. For computationally intensive work, you'd want to use multiprocessing instead.
-</citation_output_example>
+1. Put the label right after the claim it supports.
+2. Several sources for one claim: stack brackets, `[1][2]`.
+3. Copy labels exactly as shown — never renumber them, add your own, or write the
+   underlying title, date, id, or URL instead.
+4. Write the bare `[n]` and nothing else: no `[citation:...]`, no markdown links
+   like `[1](http://…)`, no footnote marks, no "References" section.
+5. Only label claims the sources support. If nothing shown backs a claim — or you
+   never saw a label — leave it uncited; never invent one.
 </citation_instructions>
diff --git a/surfsense_backend/app/prompts/system_prompt_composer/providers/deepseek.md b/surfsense_backend/app/prompts/system_prompt_composer/providers/deepseek.md
index 8acf008ca..3e22f48bf 100644
--- a/surfsense_backend/app/prompts/system_prompt_composer/providers/deepseek.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/providers/deepseek.md
@@ -9,7 +9,7 @@ Reasoning hygiene (R1-aware):
 Output style:
 - Be concise. Default to a one-paragraph answer; expand only when the user asks for detail.
 - Don't open with sycophantic phrasing ("Great question", "Sure, here you go"). Lead with the answer or the next action.
-- For factual answers, cite once with `[citation:chunk_id]` and stop.
+- For factual answers, cite once with the passage's `[n]` label and stop.
 
 Tool calls:
 - Issue independent tool calls in parallel within a single turn.
diff --git a/surfsense_backend/app/prompts/system_prompt_composer/providers/grok.md b/surfsense_backend/app/prompts/system_prompt_composer/providers/grok.md
index 95b8fcc14..0368f4ae8 100644
--- a/surfsense_backend/app/prompts/system_prompt_composer/providers/grok.md
+++ b/surfsense_backend/app/prompts/system_prompt_composer/providers/grok.md
@@ -5,7 +5,7 @@ Maximum terseness:
 - Answer in fewer than 4 lines unless the user asks for detail. One-word answers are best when they suffice.
 - No preamble ("The answer is", "Here's what I'll do"), no postamble ("Hope that helps", "Let me know"). Get straight to the answer.
 - Avoid restating the user's question.
-- For factual lookups inside the knowledge base, give the answer with a single `[citation:chunk_id]` and stop.
+- For factual lookups inside the knowledge base, give the answer with a single `[n]` label and stop.
 
 Tool discipline:
 - Use exactly ONE tool per assistant turn when investigating; wait for the result before deciding the next call. Do not loop on the same tool with the same arguments — pick a result and act.
diff --git a/surfsense_backend/tests/integration/google_unification/conftest.py b/surfsense_backend/tests/integration/google_unification/conftest.py
index 390442fdd..151ee98e3 100644
--- a/surfsense_backend/tests/integration/google_unification/conftest.py
+++ b/surfsense_backend/tests/integration/google_unification/conftest.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import uuid
-from contextlib import asynccontextmanager
 from datetime import UTC, datetime
 from unittest.mock import MagicMock
 
@@ -227,23 +226,6 @@ def patched_embed(monkeypatch):
     return mock
 
 
-@pytest.fixture
-def patched_shielded_session(async_engine, monkeypatch):
-    """Replace ``shielded_async_session`` in the knowledge_base module
-    with one that yields sessions from the test engine."""
-    test_maker = async_sessionmaker(async_engine, expire_on_commit=False)
-
-    @asynccontextmanager
-    async def _test_shielded():
-        async with test_maker() as session:
-            yield session
-
-    monkeypatch.setattr(
-        "app.agents.chat.multi_agent_chat.subagents.builtins.deliverables.tools.knowledge_base.shielded_async_session",
-        _test_shielded,
-    )
-
-
 # ---------------------------------------------------------------------------
 # Indexer test helpers
 # ---------------------------------------------------------------------------
diff --git a/surfsense_backend/tests/integration/google_unification/test_browse_includes_legacy_docs.py b/surfsense_backend/tests/integration/google_unification/test_browse_includes_legacy_docs.py
deleted file mode 100644
index f0d5c6c6c..000000000
--- a/surfsense_backend/tests/integration/google_unification/test_browse_includes_legacy_docs.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""Integration test: _browse_recent_documents returns docs of multiple types.
-
-Exercises the browse path (degenerate-query fallback) with a real PostgreSQL
-database.  Verifies that passing a list of document types correctly returns
-documents of all listed types -- the same ``.in_()`` SQL path used by hybrid
-search but through the browse/recency-ordered code path.
-"""
-
-from __future__ import annotations
-
-import pytest
-
-pytestmark = pytest.mark.integration
-
-
-async def test_browse_recent_documents_with_list_type_returns_both(
-    committed_google_data, patched_shielded_session
-):
-    """_browse_recent_documents returns docs of all types when given a list."""
-    from app.agents.chat.multi_agent_chat.subagents.builtins.deliverables.tools.knowledge_base import (
-        _browse_recent_documents,
-    )
-
-    space_id = committed_google_data["search_space_id"]
-
-    results = await _browse_recent_documents(
-        search_space_id=space_id,
-        document_type=["GOOGLE_DRIVE_FILE", "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"],
-        top_k=10,
-        start_date=None,
-        end_date=None,
-    )
-
-    returned_types = set()
-    for doc in results:
-        doc_info = doc.get("document", {})
-        dtype = doc_info.get("document_type")
-        if dtype:
-            returned_types.add(dtype)
-
-    assert "GOOGLE_DRIVE_FILE" in returned_types, (
-        "Native Drive docs should appear in browse results"
-    )
-    assert "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" in returned_types, (
-        "Legacy Composio Drive docs should appear in browse results"
-    )
diff --git a/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py b/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py
index 4f0369e12..0140bd606 100644
--- a/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py
+++ b/surfsense_backend/tests/unit/agents/new_chat/prompts/test_composer.py
@@ -86,9 +86,10 @@ class TestCompose:
         # Tools
         assert "<tools>" in prompt
         assert "</tools>" in prompt
-        # Citations on by default
+        # Citations on by default — the [n] / <retrieved_context> contract
         assert "<citation_instructions>" in prompt
-        assert "[citation:chunk_id]" in prompt
+        assert "<retrieved_context>" in prompt
+        assert "[1][2]" in prompt
 
     def test_team_visibility_uses_team_variants(self, fixed_today: datetime) -> None:
         prompt = compose_system_prompt(
@@ -116,9 +117,9 @@ class TestCompose:
     def test_citations_disabled_swaps_block(self, fixed_today: datetime) -> None:
         prompt_on = compose_system_prompt(today=fixed_today, citations_enabled=True)
         prompt_off = compose_system_prompt(today=fixed_today, citations_enabled=False)
-        assert "Citations are DISABLED" in prompt_off
-        assert "Citations are DISABLED" not in prompt_on
-        assert "[citation:chunk_id]" in prompt_on
+        assert "Citation markers are **disabled**" in prompt_off
+        assert "Citation markers are **disabled**" not in prompt_on
+        assert "<retrieved_context>" in prompt_on
 
     def test_enabled_tool_filter_only_includes_listed_tools(
         self, fixed_today: datetime