search-kb: on-demand KB tool on the [n] spine; drop kb_matched_chunk_ids

The main agent's search_knowledge_base tool runs the hybrid spine, renders
a <retrieved_context> of numbered [n] passages, and persists the registry.
KB subagent prompts teach citing [n] from <document view="full"> reads
(evidence.chunk_ids -> evidence.citations). Delete the now-unused
search->read highlighting hand-off: the kb_matched_chunk_ids state field,
its reducer default, the tool's _matched_chunk_ids writer, and the dead
KnowledgePriorityMiddleware writes.
This commit is contained in:
CREDO23 2026-06-25 15:26:39 +02:00
parent 04a76b163b
commit c98bdea5cf
16 changed files with 518 additions and 325 deletions

View file

@ -1,12 +1,11 @@
"""On-demand ``search_knowledge_base`` main-agent tool (OpenCode-style lazy RAG).
"""On-demand ``search_knowledge_base`` main-agent tool (citation-spine RAG).
The main agent no longer receives eagerly pre-injected KB context on every
turn (see :class:`KnowledgePriorityMiddleware`, now gated off by default).
Instead it calls this tool only when it decides it needs knowledge-base
content. The tool runs a single hybrid search (embed + DB search, ~0.5s),
formats the top matches for the model, and writes ``kb_matched_chunk_ids``
into graph state so matched-section highlighting is preserved when the agent
later reads a document via ``task(knowledge_base)``.
The main agent calls this when it decides it needs knowledge-base content. The
tool runs one hybrid search, renders the matched passages as a
``<retrieved_context>`` block whose passages carry server-assigned ``[n]``
labels, and persists the conversation's ``CitationRegistry`` onto graph state so
the ``[n]`` -> ``[citation:<payload>]`` normalizer can resolve them after the
turn.
"""
from __future__ import annotations
@ -18,153 +17,70 @@ from langchain.tools import ToolRuntime
from langchain_core.messages import ToolMessage
from langchain_core.tools import BaseTool, StructuredTool
from langgraph.types import Command
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import (
search_knowledge_base as _hybrid_search_kb,
from app.agents.chat.multi_agent_chat.shared.citations import load_registry
from app.agents.chat.multi_agent_chat.shared.retrieval import SearchScope, build_context
from app.agents.chat.multi_agent_chat.shared.retrieval.hybrid_search import (
search_chunks,
)
from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import (
SurfSenseFilesystemState,
)
from app.agents.chat.runtime.path_resolver import (
PathIndex,
build_path_index,
doc_to_virtual_path,
)
from app.db import Document, shielded_async_session
from app.agents.chat.runtime.references import referenced_document_ids
from app.db import shielded_async_session
from app.utils.perf import get_perf_logger
_perf_log = get_perf_logger()
_DEFAULT_TOP_K = 5
_MAX_TOP_K = 20
_PER_DOC_SNIPPET_CHARS = 1200
_MAX_TOTAL_CHARS = 16_000
_TOOL_DESCRIPTION = (
"Search the user's knowledge base (their indexed documents, files, and "
"connector content) for passages relevant to a query, using hybrid "
"semantic + keyword retrieval.\n\n"
"Use this FIRST to ground any factual or informational answer about the "
"user's own documents, notes, or connected sources. The workspace tree "
"shows which files exist; this tool pulls the actual relevant content. "
"Each hit returns the document's virtual path, a relevance score, and the "
"matched snippets. If you need a document's full text, delegate a read to "
"the knowledge_base specialist via `task` using the returned path.\n\n"
"user's own documents, notes, or connected sources. It returns a "
"<retrieved_context> block: each matched passage is labelled [n]. Cite a "
"passage by writing that [n] after the statement it supports.\n\n"
"Write a focused, specific query containing the concrete entities, "
"acronyms, people, projects, or terms you are looking for."
)
async def _resolve_virtual_paths(
results: list[dict[str, Any]],
def _search_types(
available_connectors: list[str] | None,
available_document_types: list[str] | None,
) -> tuple[str, ...] | None:
"""Merge connector + document-type filters into a scope; ``None`` if unrestricted."""
types: set[str] = set()
if available_document_types:
types.update(available_document_types)
if available_connectors:
types.update(available_connectors)
return tuple(sorted(types)) or None
async def _build_search_scope(
session: AsyncSession,
*,
search_space_id: int,
) -> dict[int, str]:
"""Resolve ``Document.id`` -> canonical virtual path for the search hits."""
doc_ids = [
doc_id
for doc_id in (
(doc.get("document") or {}).get("id")
for doc in results
if isinstance(doc, dict)
)
if isinstance(doc_id, int)
]
if not doc_ids:
return {}
async with shielded_async_session() as session:
index: PathIndex = await build_path_index(session, search_space_id)
folder_rows = await session.execute(
select(Document.id, Document.folder_id).where(
Document.search_space_id == search_space_id,
Document.id.in_(doc_ids),
)
)
folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
paths: dict[int, str] = {}
for doc in results:
doc_meta = doc.get("document") or {}
doc_id = doc_meta.get("id")
if not isinstance(doc_id, int):
continue
folder_id = folder_by_doc_id.get(doc_id, doc_meta.get("folder_id"))
paths[doc_id] = doc_to_virtual_path(
doc_id=doc_id,
title=str(doc_meta.get("title") or "untitled"),
folder_id=folder_id if isinstance(folder_id, int) else None,
index=index,
)
return paths
def _format_hits(
results: list[dict[str, Any]],
*,
paths: dict[int, str],
query: str,
) -> str:
"""Render search hits as a compact, model-readable block."""
if not results:
return (
f"No knowledge-base matches found for query: {query!r}.\n"
"Tell the user nothing relevant was found in their workspace, or "
"try a different query."
)
lines: list[str] = [f"<knowledge_base_results query={query!r}>"]
total = len(lines[0])
for rank, doc in enumerate(results, start=1):
doc_meta = doc.get("document") or {}
doc_id = doc_meta.get("id")
title = str(doc_meta.get("title") or "untitled")
doc_type = doc_meta.get("document_type") or doc.get("source") or "document"
score = doc.get("score")
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
path = paths.get(doc_id) if isinstance(doc_id, int) else None
header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
f"\n path: {path}" if path else ""
)
content = (doc.get("content") or "").strip()
if content:
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
body = "\n " + snippet.replace("\n", "\n ")
else:
body = "\n (no preview available; read the document for details)"
entry = header + body
if total + len(entry) > _MAX_TOTAL_CHARS:
lines.append("\n<!-- additional matches truncated to fit context -->")
break
lines.append(entry)
total += len(entry)
lines.append(
"\n\nTo read a full document, delegate to the knowledge_base specialist "
"with `task`, referencing the path above."
document_types: tuple[str, ...] | None,
runtime: ToolRuntime[None, SurfSenseFilesystemState],
) -> SearchScope:
"""Assemble the retrieval scope: workspace document-type filter + @-mention pins."""
ctx = getattr(runtime, "context", None)
document_ids = await referenced_document_ids(
session,
search_space_id=search_space_id,
document_ids=getattr(ctx, "mentioned_document_ids", None),
folder_ids=getattr(ctx, "mentioned_folder_ids", None),
)
return SearchScope(
document_types=document_types,
document_ids=document_ids or None,
)
lines.append("\n</knowledge_base_results>")
return "".join(lines)
def _matched_chunk_ids(results: list[dict[str, Any]]) -> dict[int, list[int]]:
"""Extract ``Document.id`` -> matched chunk ids for state hand-off."""
matched: dict[int, list[int]] = {}
for doc in results:
doc_id = (doc.get("document") or {}).get("id")
if not isinstance(doc_id, int):
continue
chunk_ids = doc.get("matched_chunk_ids") or []
normalized = [int(cid) for cid in chunk_ids if isinstance(cid, int | str)]
if normalized:
matched[doc_id] = normalized
return matched
def create_search_knowledge_base_tool(
@ -176,8 +92,7 @@ def create_search_knowledge_base_tool(
"""Factory for the on-demand ``search_knowledge_base`` tool."""
_space_id = search_space_id
_connectors = available_connectors
_doc_types = available_document_types
_document_types = _search_types(available_connectors, available_document_types)
async def _impl(
query: Annotated[
@ -195,34 +110,45 @@ def create_search_knowledge_base_tool(
return "Error: provide a non-empty search query."
clamped_top_k = min(max(1, top_k), _MAX_TOP_K)
t0 = time.perf_counter()
results = await _hybrid_search_kb(
query=cleaned_query,
search_space_id=_space_id,
available_connectors=_connectors,
available_document_types=_doc_types,
top_k=clamped_top_k,
)
registry = load_registry(getattr(runtime, "state", None))
paths = await _resolve_virtual_paths(results, search_space_id=_space_id)
rendered = _format_hits(results, paths=paths, query=cleaned_query)
matched = _matched_chunk_ids(results)
t0 = time.perf_counter()
async with shielded_async_session() as session:
scope = await _build_search_scope(
session,
search_space_id=_space_id,
document_types=_document_types,
runtime=runtime,
)
hits = await search_chunks(
session,
search_space_id=_space_id,
query=cleaned_query,
scope=scope,
top_k=clamped_top_k,
)
rendered = build_context(cleaned_query, hits, registry)
_perf_log.info(
"[search_knowledge_base] tool query=%r results=%d chars=%d in %.3fs",
"[search_knowledge_base] tool query=%r docs=%d in %.3fs",
cleaned_query[:60],
len(results),
len(rendered),
len(hits),
time.perf_counter() - t0,
)
if rendered is None:
return (
f"No knowledge-base matches found for query: {cleaned_query!r}.\n"
"Tell the user nothing relevant was found in their workspace, or "
"try a different query."
)
update: dict[str, Any] = {
"messages": [
ToolMessage(content=rendered, tool_call_id=runtime.tool_call_id)
],
"citation_registry": registry,
}
if matched:
update["kb_matched_chunk_ids"] = matched
return Command(update=update)
return StructuredTool.from_function(

View file

@ -5,11 +5,6 @@ This middleware runs ``before_agent`` on every turn and writes:
* ``state["kb_priority"]`` — the top-K most relevant documents for the
current user message, used to render a ``<priority_documents>`` system
message immediately before the user turn.
* ``state["kb_matched_chunk_ids"]`` internal hand-off mapping
(``Document.id`` matched chunk IDs) consumed by
:class:`KBPostgresBackend._load_file_data` when the agent first reads each
document, so the XML wrapper can flag matched sections in
``<chunk_index>``.
The previous "scoped filesystem" behaviour (synthetic ``ls`` + state
``files`` seeding) is intentionally removed: documents are now lazy-loaded
@ -816,7 +811,6 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
]
update: dict[str, Any] = {
"kb_priority": priority,
"kb_matched_chunk_ids": {},
}
if self.inject_system_message:
new_messages = list(state.get("messages") or [])
@ -930,7 +924,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
merged.append(doc)
_t_materialize = time.perf_counter()
priority, matched_chunk_ids = await self._materialize_priority(merged)
priority = await self._materialize_priority(merged)
if folder_mention_ids:
folder_entries = await self._materialize_folder_priority(folder_mention_ids)
@ -957,7 +951,6 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
update: dict[str, Any] = {
"kb_priority": priority,
"kb_matched_chunk_ids": matched_chunk_ids,
}
if self.inject_system_message:
new_messages = list(messages)
@ -1016,13 +1009,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
async def _materialize_priority(
self, merged: list[dict[str, Any]]
) -> tuple[list[dict[str, Any]], dict[int, list[int]]]:
"""Resolve canonical paths and matched chunk ids for the priority list."""
) -> list[dict[str, Any]]:
"""Resolve canonical paths for the priority list."""
priority: list[dict[str, Any]] = []
matched_chunk_ids: dict[int, list[int]] = {}
if not merged:
return priority, matched_chunk_ids
return priority
_t0 = time.perf_counter()
async with shielded_async_session() as session:
@ -1067,18 +1059,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
"mentioned": bool(doc.get("_user_mentioned")),
}
)
if isinstance(doc_id, int):
chunk_ids = doc.get("matched_chunk_ids") or []
if chunk_ids:
matched_chunk_ids[doc_id] = [
int(cid) for cid in chunk_ids if isinstance(cid, int | str)
]
_perf_log.info(
"[kb_priority.materialize] db=%.3fs docs=%d",
time.perf_counter() - _t0,
len(merged),
)
return priority, matched_chunk_ids
return priority
__all__ = [

View file

@ -14,8 +14,8 @@ extra fields needed to implement Postgres-backed virtual filesystem semantics:
* ``dirty_path_tool_calls`` — sidecar map ``path -> latest tool_call_id`` for
dirty paths; used to bind the per-path snapshot to an action_id.
* ``kb_priority`` — top-K priority hints rendered into a system message.
* ``kb_matched_chunk_ids`` — internal hand-off for matched-chunk highlighting.
* ``kb_anon_doc`` — Redis-loaded anonymous document (if any).
* ``citation_registry`` — per-conversation ``[n]`` -> source map for citations.
* ``tree_version`` — bumped by persistence; invalidates the tree render cache.
* ``workspace_tree_text`` — pre-rendered ``<workspace_tree>`` body for the turn.
@ -30,9 +30,11 @@ from typing import Annotated, Any, NotRequired
from deepagents.middleware.filesystem import FilesystemState
from typing_extensions import TypedDict
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from app.agents.chat.multi_agent_chat.shared.receipts.receipt import Receipt
from app.agents.chat.multi_agent_chat.shared.state.reducers import (
_add_unique_reducer,
_citation_registry_merge_reducer,
_dict_merge_with_tombstones_reducer,
_int_counter_merge_reducer,
_list_append_reducer,
@ -162,12 +164,16 @@ class SurfSenseFilesystemState(FilesystemState):
kb_priority: NotRequired[Annotated[list[KbPriorityEntry], _replace_reducer]]
"""Top-K priority hints rendered as a system message before the user turn."""
kb_matched_chunk_ids: NotRequired[Annotated[dict[int, list[int]], _replace_reducer]]
"""Internal: ``Document.id`` -> list of matched chunk IDs from hybrid search."""
kb_anon_doc: NotRequired[Annotated[KbAnonDoc | None, _replace_reducer]]
"""Anonymous-session document loaded from Redis (read-only, no DB row)."""
citation_registry: NotRequired[
Annotated[CitationRegistry, _citation_registry_merge_reducer]
]
"""Per-conversation ``[n]`` -> source map; written by retrieval, read by the
normalizer. Merges (union, find-or-create) so parallel/subagent registrations
stay globally consistent instead of clobbering each other."""
tree_version: NotRequired[Annotated[int, _replace_reducer]]
"""Monotonically increasing counter; bumped when commits change the KB tree."""

View file

@ -2,7 +2,7 @@
These reducers back the extra state fields used by the cloud-mode filesystem
agent (`cwd`, `staged_dirs`, `pending_moves`, `dirty_paths`, `doc_id_by_path`,
`kb_priority`, `kb_matched_chunk_ids`, `kb_anon_doc`, `tree_version`).
`kb_priority`, `kb_anon_doc`, `tree_version`).
Tools mutate these fields ONLY via `Command(update={...})` returns; the
reducers are responsible for merging successive updates atomically and for
@ -20,6 +20,8 @@ from __future__ import annotations
from typing import Any, Final, TypeVar
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
_CLEAR: Final[str] = "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00"
"""Reset sentinel; pass it inside a list/dict update to request a reset.
@ -204,6 +206,41 @@ def _int_counter_merge_reducer(
return base
def _as_registry(value: Any) -> CitationRegistry | None:
"""Coerce a state value into a ``CitationRegistry``.
The checkpointer serializes ``Command.update`` via ``ormsgpack`` *before*
reducers run, so an update can arrive as a plain ``dict`` rather than a model.
"""
if value is None:
return None
if isinstance(value, CitationRegistry):
return value
if isinstance(value, dict):
return CitationRegistry.model_validate(value)
return None
def _citation_registry_merge_reducer(
    left: Any,
    right: Any,
) -> CitationRegistry | None:
    """Combine two citation registries by union rather than replacement.

    Both sides are first coerced with ``_as_registry``. When only one side is
    a usable registry, that side is returned as-is (``None`` if neither is).
    When both are present the result is ``left.merge(right)``: find-or-create
    across both sides so ``[n]`` stays globally consistent when branches
    (parent + subagents, parallel tool calls) each register into a registry
    forked from the same base. Collisions re-mint rather than drop. See
    :meth:`CitationRegistry.merge`.
    """
    incoming = _as_registry(right)
    current = _as_registry(left)
    if current is None or incoming is None:
        # At most one side is usable: keep whichever exists.
        return current if incoming is None else incoming
    return current.merge(incoming)
def _initial_filesystem_state() -> dict[str, Any]:
"""Default empty values for SurfSense filesystem state fields.
@ -222,7 +259,6 @@ def _initial_filesystem_state() -> dict[str, Any]:
"dirty_paths": [],
"dirty_path_tool_calls": {},
"kb_priority": [],
"kb_matched_chunk_ids": {},
"kb_anon_doc": None,
"tree_version": 0,
}
@ -231,6 +267,7 @@ def _initial_filesystem_state() -> dict[str, Any]:
__all__ = [
"_CLEAR",
"_add_unique_reducer",
"_citation_registry_merge_reducer",
"_dict_merge_with_tombstones_reducer",
"_initial_filesystem_state",
"_int_counter_merge_reducer",

View file

@ -2,4 +2,4 @@ Read-only specialist for the user's workspace (documents and folders). Use to fi
Pass your full question as one string. The specialist runs in isolation: it cannot see this thread, so include any path hints, filters, or constraints it needs.
The specialist returns plain prose with absolute paths and `[citation:<chunk_id>]` markers when claims came from KB-indexed chunks. Preserve those markers verbatim if you forward the answer.
The specialist returns plain prose with absolute paths and `[n]` citation labels when claims came from KB-indexed documents. Preserve those labels verbatim if you forward the answer.

View file

@ -35,42 +35,31 @@ Map outcomes to your `status`:
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
## Chunk citations in your prose
## Citations in your prose
When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
When `read_file` returns a KB-indexed document under `/documents/`, it comes back as a `<document … view="full">` block whose passages are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. That `[n]` is the citation label. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific passage, append its `[n]` to the sentence stating that fact, copying the label **exactly** as shown. The caller relays these labels verbatim and the server resolves each one, so a wrong number silently breaks the citation.
### Where chunk ids live in `read_file` output
### Where the labels live in `read_file` output
A KB document's XML has three numeric attributes — only **one** is a citation source:
A KB document reads back like this — only the bracketed `[n]` is a citation label:
```
<document>
<document_metadata>
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
...
</document_metadata>
<chunk_index>
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
<entry chunk_id="129" lines="23-30" matched="true"/>
</chunk_index>
<document_content>
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
<chunk id='129'><![CDATA[…]]></chunk>
</document_content>
<document title="Q2 Roadmap" source="File" view="full">
[3] First milestone is …
[4] Second milestone is …
</document>
```
### Rules
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
- Never cite `<document_id>` — that's the parent doc, not a chunk.
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
- Use the **exact** `[n]` shown next to the passage you actually quoted or paraphrased. Copy it digit-for-digit; do **not** retype from memory or renumber.
- Before emitting an `[n]`, confirm that bracketed label appears in the `read_file` output you are summarising this turn. If you can't see it, omit the citation.
- Labels are **not** sequential by position — a passage may be `[7]` while the one above it is `[3]` (numbering is shared across the whole conversation). Copy what you see; never guess an adjacent number.
- Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links, no parentheses, no footnote numbers.
- Several passages behind one point → each in its own brackets with nothing between: `[3][4]`. Never `[3, 4]` and never a range like `[3-4]`.
- Prefer **fewer accurate citations** over many speculative ones.
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
- Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none.
- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits.
- Tool results without `[n]` labels (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no label and need none.
- Populate `evidence.citations` with **only** the labels you actually emitted — same numbers.
## Examples
@ -89,7 +78,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
"path": "/documents/meetings/2026-05-11-meeting.md",
"matched_candidates": null,
"content_excerpt": null,
"chunk_ids": null
"citations": null
},
"next_step": null,
"missing_fields": null,
@ -121,7 +110,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
],
"content_excerpt": null,
"chunk_ids": null
"citations": null
},
"next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"],
@ -142,7 +131,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null,
"chunk_ids": string[] | null
"citations": number[] | null
},
"next_step": string | null,
"missing_fields": string[] | null,

View file

@ -33,11 +33,11 @@ Map outcomes to your `status`:
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.)
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.)
## Chunk citations in your prose
## Citations in your prose
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work.
In desktop mode your filesystem tools read local files only, which are not KB-indexed and carry no `[n]` citation labels. Do not emit `[n]` or `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work.
## Examples
@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
"path": "/notes/meetings/2026-05-11-meeting.md",
"matched_candidates": null,
"content_excerpt": null,
"chunk_ids": null
"citations": null
},
"next_step": null,
"missing_fields": null,
@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
],
"content_excerpt": null,
"chunk_ids": null
"citations": null
},
"next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"],
@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null,
"chunk_ids": string[] | null
"citations": number[] | null
},
"next_step": string | null,
"missing_fields": string[] | null,

View file

@ -28,41 +28,30 @@ Reply in plain prose:
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
## Chunk citations
## Citations
When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
When the evidence for a claim came from a `read_file` response for a KB-indexed document under `/documents/`, the document reads back as a `<document … view="full">` block whose passages are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. That `[n]` is the citation label. Append the relevant `[n]` to the sentence stating the claim, copying it **exactly** as shown. The caller passes these labels through verbatim and the server resolves each one, so a wrong number silently breaks the citation.
### Where chunk ids live in `read_file` output
### Where the labels live in `read_file` output
A KB document's XML has three numeric attributes — only **one** is a citation source:
A KB document reads back like this — only the bracketed `[n]` is a citation label:
```
<document>
<document_metadata>
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
...
</document_metadata>
<chunk_index>
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
<entry chunk_id="129" lines="23-30" matched="true"/>
</chunk_index>
<document_content>
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
<chunk id='129'><![CDATA[…]]></chunk>
</document_content>
<document title="Q2 Roadmap" source="File" view="full">
[3] First milestone is …
[4] Second milestone is …
</document>
```
### Rules
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
- Never cite `<document_id>` — that's the parent doc, not a chunk.
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids.
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation.
- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference.
- Use the **exact** `[n]` shown next to the passage you actually quoted or paraphrased. Copy it digit-for-digit; do **not** retype from memory or renumber.
- Before emitting an `[n]`, confirm that bracketed label appears in the `read_file` output you are summarising this turn. If you can't see it, omit the citation.
- Labels are **not** sequential by position — a passage may be `[7]` while the one above it is `[3]` (numbering is shared across the whole conversation). Copy what you see; never guess an adjacent number.
- Prefer **fewer accurate citations** over many speculative ones. One correct `[3]` is more useful than a string of wrong numbers.
- Several passages behind one point → each in its own brackets with nothing between: `[3][4]`. Never `[3, 4]` and never a range like `[3-4]`.
- Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links, no parentheses, no footnote numbers.
- If a claim came from a tool result that did **not** carry `[n]` labels (`ls`, `glob`, `grep` listings, error strings), skip the citation.
- The absolute path under `/documents/` is always required; `[n]` labels are additive, they do not replace the path reference.
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].`
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [3][4].`

View file

@ -29,6 +29,6 @@ Reply in plain prose:
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
## Chunk citations
## Citations
In desktop mode your filesystem tools read local files only, and local-file `read_file` responses do **not** carry `<chunk id='…'>` tags. Cite each claim with the absolute local path; do not emit `[citation:…]` markers — your caller has nothing to resolve them against.
In desktop mode your filesystem tools read local files only, which are not KB-indexed and carry no `[n]` citation labels. Cite each claim with the absolute local path; do not emit `[n]` or `[citation:…]` markers — your caller has nothing to resolve them against.

View file

@ -13,7 +13,7 @@ from app.schemas.new_chat import MentionedDocumentInfo
from .chat import resolve_chat_references
from .connectors import resolve_connector_references
from .documents import resolve_document_references
from .documents import referenced_document_ids, resolve_document_references
from .folders import resolve_folder_references
from .models import (
ChatReference,
@ -89,6 +89,7 @@ __all__ = [
"FolderReference",
"Reference",
"ReferenceKind",
"referenced_document_ids",
"render_reference_pointers",
"resolve_references",
]

View file

@ -0,0 +1,13 @@
"""Resolve ``@document`` references.
Two concerns, one subject: ``resolver`` turns document ids into pointer
references for the model, ``referenced`` turns ``@document`` / ``@folder``
mentions into the document ids a retrieval is confined to.
"""
from __future__ import annotations
from .referenced import referenced_document_ids
from .resolver import resolve_document_references
__all__ = ["referenced_document_ids", "resolve_document_references"]

View file

@ -0,0 +1,39 @@
"""Resolve ``@document`` / ``@folder`` mentions to the documents they point at.
Reference resolution, not retrieval: this answers "which knowledge-base
documents did the user point at this turn?". ``@document`` ids pass through;
``@folder`` ids expand to the documents directly inside each folder within this
search space (direct children only, not nested subfolders). The caller turns the
returned ids into a retrieval ``SearchScope``.
"""
from __future__ import annotations
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document
async def referenced_document_ids(
    session: AsyncSession,
    *,
    search_space_id: int,
    document_ids: list[int] | None = None,
    folder_ids: list[int] | None = None,
) -> tuple[int, ...]:
    """Return the sorted, de-duplicated ids of the documents the user pointed at.

    Args:
        session: Async database session; only used when ``folder_ids`` is
            non-empty.
        search_space_id: Search space the folder lookup is restricted to.
        document_ids: Explicitly mentioned document ids, passed through as-is.
        folder_ids: Mentioned folder ids; each contributes the ids of the
            documents whose ``folder_id`` is that folder.

    Returns:
        A tuple of unique document ids in ascending order; empty when nothing
        was referenced.
    """
    referenced: set[int] = set(document_ids) if document_ids else set()
    wanted_folders = list(folder_ids) if folder_ids else []

    # No folders mentioned: skip the database round-trip entirely.
    if not wanted_folders:
        return tuple(sorted(referenced))

    folder_members = select(Document.id).where(
        Document.search_space_id == search_space_id,
        Document.folder_id.in_(wanted_folders),
    )
    result = await session.execute(folder_members)
    referenced.update(result.scalars().all())
    return tuple(sorted(referenced))
__all__ = ["referenced_document_ids"]

View file

@ -8,7 +8,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.runtime.path_resolver import PathIndex, doc_to_virtual_path
from app.db import Document
from .models import DocumentReference
from ..models import DocumentReference
async def resolve_document_references(

View file

@ -0,0 +1,237 @@
"""Behavior tests for the ``search_knowledge_base`` main-agent tool.
These exercise the tool through its public contract: seed a real document,
invoke the tool, and assert on the ``Command`` it returns: the rendered
``<retrieved_context>`` carries ``[n]`` labels and the citation registry handed
back on state is populated.
The tool's own DB session is redirected to the test session, and the embedding
leg is pinned so the search is deterministic without a live model.
"""
from __future__ import annotations
import contextlib
import uuid
from types import SimpleNamespace
import pytest
from langchain_core.messages import ToolMessage
from langgraph.types import Command
from app.agents.chat.multi_agent_chat.main_agent.tools import search_knowledge_base
from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
create_search_knowledge_base_tool,
)
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from app.config import config
from app.db import Chunk, Document, DocumentType, Folder
pytestmark = pytest.mark.integration
_DIM = config.embedding_model_instance.dimension
def _axis(index: int) -> list[float]:
    """Build a vector of length ``_DIM`` that is ``0.0`` everywhere except ``1.0`` at ``index``."""
    unit = [0.0 for _ in range(_DIM)]
    unit[index] = 1.0
    return unit
async def _add_document(
    db_session,
    *,
    search_space_id: int,
    title: str,
    text: str,
    folder_id: int | None = None,
):
    """Add a ``FILE`` document with one chunk to the session and return the document.

    The document is created with ``status={"state": "ready"}`` and a random
    ``content_hash``; its single chunk holds the same ``text`` at position 0
    with the ``_axis(0)`` embedding.
    """
    doc = Document(
        search_space_id=search_space_id,
        folder_id=folder_id,
        title=title,
        document_type=DocumentType.FILE,
        content=text,
        content_hash=uuid.uuid4().hex,
        status={"state": "ready"},
    )
    db_session.add(doc)
    # Flush before building the chunk, which reads ``doc.id``
    # (presumably populated by the flush).
    await db_session.flush()

    only_chunk = Chunk(
        document_id=doc.id,
        position=0,
        content=text,
        embedding=_axis(0),
    )
    db_session.add(only_chunk)
    await db_session.flush()
    return doc
async def _add_folder(db_session, *, search_space_id: int, name: str = "Folder"):
    """Add a folder (position ``"0"``) to the session, flush, and return it."""
    new_folder = Folder(
        search_space_id=search_space_id,
        name=name,
        position="0",
    )
    db_session.add(new_folder)
    await db_session.flush()
    return new_folder
@pytest.fixture
def _tool_uses_test_session(db_session, monkeypatch):
    """Patch the tool module's ``shielded_async_session`` to yield the test session."""

    @contextlib.asynccontextmanager
    async def _yield_test_session():
        # Hand back the fixture's session instead of opening a new one.
        yield db_session

    monkeypatch.setattr(
        search_knowledge_base,
        "shielded_async_session",
        _yield_test_session,
    )
@pytest.fixture
def _pinned_embedding(monkeypatch):
    """Replace the embedding model's ``embed`` so every query maps to ``_axis(0)``."""

    def _fixed_embed(_query):
        # The query text is ignored; the result is always the same vector.
        return _axis(0)

    monkeypatch.setattr(config.embedding_model_instance, "embed", _fixed_embed)
async def _invoke(tool, query: str, state: dict | None = None, context=None):
runtime = SimpleNamespace(
state=state or {}, tool_call_id="call-1", context=context
)
return await tool.coroutine(query, runtime)
def _mentions(*, document_ids=(), folder_ids=()):
return SimpleNamespace(
mentioned_document_ids=list(document_ids),
mentioned_folder_ids=list(folder_ids),
)
async def test_tool_returns_retrieved_context_with_numbered_passages(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """A matching search returns a ``Command`` whose ToolMessage shows ``<retrieved_context>`` and ``[1]``."""
    space_id = db_search_space.id
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Asyncio Guide",
        text="The asyncio library enables concurrency.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)

    outcome = await _invoke(search_tool, "asyncio")

    assert isinstance(outcome, Command)
    tool_message = outcome.update["messages"][0]
    assert isinstance(tool_message, ToolMessage)
    rendered = tool_message.content
    assert "<retrieved_context>" in rendered
    assert "[1]" in rendered
async def test_tool_populates_citation_registry_on_state(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """The state update returned by the tool carries a non-empty ``CitationRegistry``."""
    space_id = db_search_space.id
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Asyncio Guide",
        text="The asyncio library enables concurrency.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)

    outcome = await _invoke(search_tool, "asyncio")

    citation_registry = outcome.update["citation_registry"]
    assert isinstance(citation_registry, CitationRegistry)
    # At least one passage must have been registered under an [n] label.
    assert citation_registry.by_n
async def test_tool_reuses_existing_registry_numbering(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """Searching the same passage twice with the carried registry keeps a single entry."""
    space_id = db_search_space.id
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Asyncio Guide",
        text="The asyncio library enables concurrency.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)

    initial = await _invoke(search_tool, "asyncio")
    # Feed the first call's registry back in as the graph state for the second call.
    prior_state = {"citation_registry": initial.update["citation_registry"]}
    repeat = await _invoke(search_tool, "asyncio", state=prior_state)

    # Same passage searched twice keeps a single [n] (find-or-create).
    registered = repeat.update["citation_registry"].by_n
    assert len(registered) == 1
async def test_tool_reports_no_matches_without_touching_state(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """With nothing to match, the tool returns a plain string rather than a ``Command``."""
    search_tool = create_search_knowledge_base_tool(
        search_space_id=db_search_space.id
    )

    outcome = await _invoke(search_tool, "nonexistent-term-zzz")

    # A bare string means no state update was issued.
    assert isinstance(outcome, str)
    assert "No knowledge-base matches" in outcome
async def test_tool_rejects_empty_query(
    db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """A whitespace-only query yields a string response mentioning ``non-empty``."""
    search_tool = create_search_knowledge_base_tool(
        search_space_id=db_search_space.id
    )

    outcome = await _invoke(search_tool, " ")

    assert isinstance(outcome, str)
    assert "non-empty" in outcome
async def test_document_mention_confines_search_to_pinned_doc(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """Mentioning one document restricts the rendered results to that document."""
    space_id = db_search_space.id
    mentioned_doc = await _add_document(
        db_session,
        search_space_id=space_id,
        title="Pinned",
        text="asyncio appears in the pinned doc.",
    )
    # A second matching document that must stay out of the results.
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Other",
        text="asyncio appears in the other doc.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)
    mention_context = _mentions(document_ids=[mentioned_doc.id])

    outcome = await _invoke(search_tool, "asyncio", context=mention_context)

    # Search is confined to the pinned doc: only its content is rendered.
    rendered = outcome.update["messages"][0].content
    assert "Pinned" in rendered
    assert "Other" not in rendered
async def test_folder_mention_confines_search_to_folder_documents(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """Mentioning a folder restricts the rendered results to the document inside it."""
    space_id = db_search_space.id
    mentioned_folder = await _add_folder(db_session, search_space_id=space_id)
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Inside",
        text="asyncio appears inside the folder.",
        folder_id=mentioned_folder.id,
    )
    # A matching document with no folder that must stay out of the results.
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Outside",
        text="asyncio appears outside the folder.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)
    mention_context = _mentions(folder_ids=[mentioned_folder.id])

    outcome = await _invoke(search_tool, "asyncio", context=mention_context)

    # Search is confined to the folder's document: only its content is rendered.
    rendered = outcome.update["messages"][0].content
    assert "Inside" in rendered
    assert "Outside" not in rendered

View file

@ -4,9 +4,14 @@ from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.citations import (
CitationRegistry,
CitationSourceType,
)
from app.agents.chat.multi_agent_chat.shared.state.reducers import (
_CLEAR,
_add_unique_reducer,
_citation_registry_merge_reducer,
_dict_merge_with_tombstones_reducer,
_initial_filesystem_state,
_list_append_reducer,
@ -93,6 +98,57 @@ class TestDictMergeWithTombstones:
}
def _kb_registry(chunk_id: int) -> CitationRegistry:
    """Build a registry with one ``KB_CHUNK`` entry for document 1 and ``chunk_id``."""
    locator = {"document_id": 1, "chunk_id": chunk_id}
    single_entry = CitationRegistry()
    single_entry.register(CitationSourceType.KB_CHUNK, locator)
    return single_entry
class TestCitationRegistryMergeReducer:
    """Checks ``_citation_registry_merge_reducer`` with ``None``, registry and dict inputs."""

    @staticmethod
    def _chunk_ids(registry) -> set:
        """Collect the ``chunk_id`` of every entry registered in ``registry``."""
        return {entry.locator["chunk_id"] for entry in registry.by_n.values()}

    def test_none_left_returns_right(self):
        incoming = _kb_registry(10)
        assert _citation_registry_merge_reducer(None, incoming) is incoming

    def test_none_right_returns_left(self):
        existing = _kb_registry(10)
        assert _citation_registry_merge_reducer(existing, None) is existing

    def test_both_none_returns_none(self):
        assert _citation_registry_merge_reducer(None, None) is None

    def test_unions_two_registries(self):
        combined = _citation_registry_merge_reducer(
            _kb_registry(10), _kb_registry(11)
        )
        assert self._chunk_ids(combined) == {10, 11}

    def test_coerces_serialized_dict_update(self):
        # The checkpointer serializes Command.update via ormsgpack before the
        # reducer runs, so `right` can arrive as a plain dict.
        existing = _kb_registry(10)
        serialized_update = _kb_registry(11).model_dump()
        combined = _citation_registry_merge_reducer(existing, serialized_update)
        assert self._chunk_ids(combined) == {10, 11}

    def test_coerces_both_sides_from_dict(self):
        serialized_existing = _kb_registry(10).model_dump()
        serialized_update = _kb_registry(11).model_dump()
        combined = _citation_registry_merge_reducer(
            serialized_existing, serialized_update
        )
        assert isinstance(combined, CitationRegistry)
        assert self._chunk_ids(combined) == {10, 11}
class TestInitialFilesystemState:
def test_default_shape(self):
state = _initial_filesystem_state()
@ -106,7 +162,6 @@ class TestInitialFilesystemState:
assert state["dirty_paths"] == []
assert state["dirty_path_tool_calls"] == {}
assert state["kb_priority"] == []
assert state["kb_matched_chunk_ids"] == {}
assert state["kb_anon_doc"] is None
assert state["tree_version"] == 0

View file

@ -6,9 +6,6 @@ import pytest
from langchain_core.messages import AIMessage, HumanMessage
from app.agents.chat.multi_agent_chat.shared.middleware import knowledge_search as ks
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
build_document_xml as _build_document_xml,
)
from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import (
KBSearchPlan,
KnowledgePriorityMiddleware,
@ -59,88 +56,6 @@ class TestResolveSearchTypes:
assert result.count("FILE") == 1
# ── _build_document_xml ────────────────────────────────────────────────
class TestBuildDocumentXml:
    """Checks the XML produced by ``_build_document_xml`` for a three-chunk document."""

    @pytest.fixture
    def sample_document(self):
        """A ``FILE`` document (id 42) with three chunks, ids 101-103."""
        return {
            "document_id": 42,
            "document": {
                "id": 42,
                "document_type": "FILE",
                "title": "Test Doc",
                "metadata": {"url": "https://example.com"},
            },
            "chunks": [
                {"chunk_id": 101, "content": "First chunk content"},
                {"chunk_id": 102, "content": "Second chunk content"},
                {"chunk_id": 103, "content": "Third chunk content"},
            ],
        }

    def test_contains_document_metadata(self, sample_document):
        xml = _build_document_xml(sample_document)
        assert "<document_id>42</document_id>" in xml
        assert "<document_type>FILE</document_type>" in xml
        assert "Test Doc" in xml

    def test_contains_chunk_index(self, sample_document):
        xml = _build_document_xml(sample_document)
        assert "<chunk_index>" in xml
        assert "</chunk_index>" in xml
        for chunk_id in (101, 102, 103):
            assert f'chunk_id="{chunk_id}"' in xml

    def test_matched_chunks_flagged_in_index(self, sample_document):
        xml = _build_document_xml(sample_document, matched_chunk_ids={101, 103})
        xml_lines = xml.split("\n")
        expected_flags = {101: True, 102: False, 103: True}
        for chunk_id, should_match in expected_flags.items():
            tagged = [line for line in xml_lines if f'chunk_id="{chunk_id}"' in line]
            # Without this guard the flag checks below would pass vacuously
            # when the chunk never shows up in the output.
            assert tagged, f"chunk_id={chunk_id} not found in output"
            for line in tagged:
                assert ('matched="true"' in line) is should_match

    def test_chunk_content_in_document_content_section(self, sample_document):
        xml = _build_document_xml(sample_document)
        assert "<document_content>" in xml
        assert "First chunk content" in xml
        assert "Second chunk content" in xml
        assert "Third chunk content" in xml

    def test_line_numbers_in_chunk_index_are_accurate(self, sample_document):
        """Verify that the line ranges in chunk_index actually point to the right content."""
        import re

        xml = _build_document_xml(sample_document, matched_chunk_ids={101})
        xml_lines = xml.split("\n")
        index_entries = [
            line
            for line in xml_lines
            if 'chunk_id="101"' in line and "lines=" in line
        ]
        if not index_entries:
            pytest.fail("chunk_id=101 entry not found in chunk_index")

        # Only the first matching index entry is inspected.
        entry = index_entries[0]
        m = re.search(r'lines="(\d+)-(\d+)"', entry)
        assert m, f"No lines= attribute found in: {entry}"
        start = int(m.group(1))
        # The range is used as a 1-based line number below; reject values that
        # would silently wrap around (0 -> index -1) or fall off the end.
        assert 1 <= start <= len(xml_lines), f"start line {start} out of range"
        target_line = xml_lines[start - 1]
        assert "101" in target_line
        assert "First chunk content" in target_line

    def test_splits_into_lines_correctly(self, sample_document):
        """Each chunk occupies exactly one line (no embedded newlines)."""
        xml = _build_document_xml(sample_document)
        chunk_lines = [
            line
            for line in xml.split("\n")
            if "<![CDATA[" in line and "<chunk" in line
        ]
        assert len(chunk_lines) == 3
# ── planner parsing / date normalization ───────────────────────────────