search-kb: on-demand KB tool on the [n] spine; drop kb_matched_chunk_ids

The main agent's search_knowledge_base tool runs the hybrid spine, renders
a <retrieved_context> of numbered [n] passages, and persists the registry.
KB subagent prompts teach citing [n] from <document view="full"> reads
(evidence.chunk_ids -> evidence.citations). Delete the now-unused
search->read highlighting hand-off: the kb_matched_chunk_ids state field,
its reducer default, the tool's _matched_chunk_ids writer, and the dead
KnowledgePriorityMiddleware writes.
This commit is contained in:
CREDO23 2026-06-25 15:26:39 +02:00
parent 04a76b163b
commit c98bdea5cf
16 changed files with 518 additions and 325 deletions

View file

@ -1,12 +1,11 @@
"""On-demand ``search_knowledge_base`` main-agent tool (OpenCode-style lazy RAG). """On-demand ``search_knowledge_base`` main-agent tool (citation-spine RAG).
The main agent no longer receives eagerly pre-injected KB context on every The main agent calls this when it decides it needs knowledge-base content. The
turn (see :class:`KnowledgePriorityMiddleware`, now gated off by default). tool runs one hybrid search, renders the matched passages as a
Instead it calls this tool only when it decides it needs knowledge-base ``<retrieved_context>`` block whose passages carry server-assigned ``[n]``
content. The tool runs a single hybrid search (embed + DB search, ~0.5s), labels, and persists the conversation's ``CitationRegistry`` onto graph state so
formats the top matches for the model, and writes ``kb_matched_chunk_ids`` the ``[n]`` -> ``[citation:<payload>]`` normalizer can resolve them after the
into graph state so matched-section highlighting is preserved when the agent turn.
later reads a document via ``task(knowledge_base)``.
""" """
from __future__ import annotations from __future__ import annotations
@ -18,153 +17,70 @@ from langchain.tools import ToolRuntime
from langchain_core.messages import ToolMessage from langchain_core.messages import ToolMessage
from langchain_core.tools import BaseTool, StructuredTool from langchain_core.tools import BaseTool, StructuredTool
from langgraph.types import Command from langgraph.types import Command
from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import ( from app.agents.chat.multi_agent_chat.shared.citations import load_registry
search_knowledge_base as _hybrid_search_kb, from app.agents.chat.multi_agent_chat.shared.retrieval import SearchScope, build_context
from app.agents.chat.multi_agent_chat.shared.retrieval.hybrid_search import (
search_chunks,
) )
from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import ( from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import (
SurfSenseFilesystemState, SurfSenseFilesystemState,
) )
from app.agents.chat.runtime.path_resolver import ( from app.agents.chat.runtime.references import referenced_document_ids
PathIndex, from app.db import shielded_async_session
build_path_index,
doc_to_virtual_path,
)
from app.db import Document, shielded_async_session
from app.utils.perf import get_perf_logger from app.utils.perf import get_perf_logger
_perf_log = get_perf_logger() _perf_log = get_perf_logger()
_DEFAULT_TOP_K = 5 _DEFAULT_TOP_K = 5
_MAX_TOP_K = 20 _MAX_TOP_K = 20
_PER_DOC_SNIPPET_CHARS = 1200
_MAX_TOTAL_CHARS = 16_000
_TOOL_DESCRIPTION = ( _TOOL_DESCRIPTION = (
"Search the user's knowledge base (their indexed documents, files, and " "Search the user's knowledge base (their indexed documents, files, and "
"connector content) for passages relevant to a query, using hybrid " "connector content) for passages relevant to a query, using hybrid "
"semantic + keyword retrieval.\n\n" "semantic + keyword retrieval.\n\n"
"Use this FIRST to ground any factual or informational answer about the " "Use this FIRST to ground any factual or informational answer about the "
"user's own documents, notes, or connected sources. The workspace tree " "user's own documents, notes, or connected sources. It returns a "
"shows which files exist; this tool pulls the actual relevant content. " "<retrieved_context> block: each matched passage is labelled [n]. Cite a "
"Each hit returns the document's virtual path, a relevance score, and the " "passage by writing that [n] after the statement it supports.\n\n"
"matched snippets. If you need a document's full text, delegate a read to "
"the knowledge_base specialist via `task` using the returned path.\n\n"
"Write a focused, specific query containing the concrete entities, " "Write a focused, specific query containing the concrete entities, "
"acronyms, people, projects, or terms you are looking for." "acronyms, people, projects, or terms you are looking for."
) )
async def _resolve_virtual_paths( def _search_types(
results: list[dict[str, Any]], available_connectors: list[str] | None,
available_document_types: list[str] | None,
) -> tuple[str, ...] | None:
"""Merge connector + document-type filters into a scope; ``None`` if unrestricted."""
types: set[str] = set()
if available_document_types:
types.update(available_document_types)
if available_connectors:
types.update(available_connectors)
return tuple(sorted(types)) or None
async def _build_search_scope(
session: AsyncSession,
*, *,
search_space_id: int, search_space_id: int,
) -> dict[int, str]: document_types: tuple[str, ...] | None,
"""Resolve ``Document.id`` -> canonical virtual path for the search hits.""" runtime: ToolRuntime[None, SurfSenseFilesystemState],
doc_ids = [ ) -> SearchScope:
doc_id """Assemble the retrieval scope: workspace document-type filter + @-mention pins."""
for doc_id in ( ctx = getattr(runtime, "context", None)
(doc.get("document") or {}).get("id") document_ids = await referenced_document_ids(
for doc in results session,
if isinstance(doc, dict) search_space_id=search_space_id,
) document_ids=getattr(ctx, "mentioned_document_ids", None),
if isinstance(doc_id, int) folder_ids=getattr(ctx, "mentioned_folder_ids", None),
] )
if not doc_ids: return SearchScope(
return {} document_types=document_types,
document_ids=document_ids or None,
async with shielded_async_session() as session:
index: PathIndex = await build_path_index(session, search_space_id)
folder_rows = await session.execute(
select(Document.id, Document.folder_id).where(
Document.search_space_id == search_space_id,
Document.id.in_(doc_ids),
)
)
folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
paths: dict[int, str] = {}
for doc in results:
doc_meta = doc.get("document") or {}
doc_id = doc_meta.get("id")
if not isinstance(doc_id, int):
continue
folder_id = folder_by_doc_id.get(doc_id, doc_meta.get("folder_id"))
paths[doc_id] = doc_to_virtual_path(
doc_id=doc_id,
title=str(doc_meta.get("title") or "untitled"),
folder_id=folder_id if isinstance(folder_id, int) else None,
index=index,
)
return paths
def _format_hits(
results: list[dict[str, Any]],
*,
paths: dict[int, str],
query: str,
) -> str:
"""Render search hits as a compact, model-readable block."""
if not results:
return (
f"No knowledge-base matches found for query: {query!r}.\n"
"Tell the user nothing relevant was found in their workspace, or "
"try a different query."
)
lines: list[str] = [f"<knowledge_base_results query={query!r}>"]
total = len(lines[0])
for rank, doc in enumerate(results, start=1):
doc_meta = doc.get("document") or {}
doc_id = doc_meta.get("id")
title = str(doc_meta.get("title") or "untitled")
doc_type = doc_meta.get("document_type") or doc.get("source") or "document"
score = doc.get("score")
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
path = paths.get(doc_id) if isinstance(doc_id, int) else None
header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
f"\n path: {path}" if path else ""
)
content = (doc.get("content") or "").strip()
if content:
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
body = "\n " + snippet.replace("\n", "\n ")
else:
body = "\n (no preview available; read the document for details)"
entry = header + body
if total + len(entry) > _MAX_TOTAL_CHARS:
lines.append("\n<!-- additional matches truncated to fit context -->")
break
lines.append(entry)
total += len(entry)
lines.append(
"\n\nTo read a full document, delegate to the knowledge_base specialist "
"with `task`, referencing the path above."
) )
lines.append("\n</knowledge_base_results>")
return "".join(lines)
def _matched_chunk_ids(results: list[dict[str, Any]]) -> dict[int, list[int]]:
"""Extract ``Document.id`` -> matched chunk ids for state hand-off."""
matched: dict[int, list[int]] = {}
for doc in results:
doc_id = (doc.get("document") or {}).get("id")
if not isinstance(doc_id, int):
continue
chunk_ids = doc.get("matched_chunk_ids") or []
normalized = [int(cid) for cid in chunk_ids if isinstance(cid, int | str)]
if normalized:
matched[doc_id] = normalized
return matched
def create_search_knowledge_base_tool( def create_search_knowledge_base_tool(
@ -176,8 +92,7 @@ def create_search_knowledge_base_tool(
"""Factory for the on-demand ``search_knowledge_base`` tool.""" """Factory for the on-demand ``search_knowledge_base`` tool."""
_space_id = search_space_id _space_id = search_space_id
_connectors = available_connectors _document_types = _search_types(available_connectors, available_document_types)
_doc_types = available_document_types
async def _impl( async def _impl(
query: Annotated[ query: Annotated[
@ -195,34 +110,45 @@ def create_search_knowledge_base_tool(
return "Error: provide a non-empty search query." return "Error: provide a non-empty search query."
clamped_top_k = min(max(1, top_k), _MAX_TOP_K) clamped_top_k = min(max(1, top_k), _MAX_TOP_K)
t0 = time.perf_counter() registry = load_registry(getattr(runtime, "state", None))
results = await _hybrid_search_kb(
query=cleaned_query,
search_space_id=_space_id,
available_connectors=_connectors,
available_document_types=_doc_types,
top_k=clamped_top_k,
)
paths = await _resolve_virtual_paths(results, search_space_id=_space_id) t0 = time.perf_counter()
rendered = _format_hits(results, paths=paths, query=cleaned_query) async with shielded_async_session() as session:
matched = _matched_chunk_ids(results) scope = await _build_search_scope(
session,
search_space_id=_space_id,
document_types=_document_types,
runtime=runtime,
)
hits = await search_chunks(
session,
search_space_id=_space_id,
query=cleaned_query,
scope=scope,
top_k=clamped_top_k,
)
rendered = build_context(cleaned_query, hits, registry)
_perf_log.info( _perf_log.info(
"[search_knowledge_base] tool query=%r results=%d chars=%d in %.3fs", "[search_knowledge_base] tool query=%r docs=%d in %.3fs",
cleaned_query[:60], cleaned_query[:60],
len(results), len(hits),
len(rendered),
time.perf_counter() - t0, time.perf_counter() - t0,
) )
if rendered is None:
return (
f"No knowledge-base matches found for query: {cleaned_query!r}.\n"
"Tell the user nothing relevant was found in their workspace, or "
"try a different query."
)
update: dict[str, Any] = { update: dict[str, Any] = {
"messages": [ "messages": [
ToolMessage(content=rendered, tool_call_id=runtime.tool_call_id) ToolMessage(content=rendered, tool_call_id=runtime.tool_call_id)
], ],
"citation_registry": registry,
} }
if matched:
update["kb_matched_chunk_ids"] = matched
return Command(update=update) return Command(update=update)
return StructuredTool.from_function( return StructuredTool.from_function(

View file

@ -5,11 +5,6 @@ This middleware runs ``before_agent`` on every turn and writes:
* ``state["kb_priority"]`` the top-K most relevant documents for the * ``state["kb_priority"]`` the top-K most relevant documents for the
current user message, used to render a ``<priority_documents>`` system current user message, used to render a ``<priority_documents>`` system
message immediately before the user turn. message immediately before the user turn.
* ``state["kb_matched_chunk_ids"]`` internal hand-off mapping
(``Document.id`` -> matched chunk IDs) consumed by
:class:`KBPostgresBackend._load_file_data` when the agent first reads each
document, so the XML wrapper can flag matched sections in
``<chunk_index>``.
The previous "scoped filesystem" behaviour (synthetic ``ls`` + state The previous "scoped filesystem" behaviour (synthetic ``ls`` + state
``files`` seeding) is intentionally removed: documents are now lazy-loaded ``files`` seeding) is intentionally removed: documents are now lazy-loaded
@ -816,7 +811,6 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
] ]
update: dict[str, Any] = { update: dict[str, Any] = {
"kb_priority": priority, "kb_priority": priority,
"kb_matched_chunk_ids": {},
} }
if self.inject_system_message: if self.inject_system_message:
new_messages = list(state.get("messages") or []) new_messages = list(state.get("messages") or [])
@ -930,7 +924,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
merged.append(doc) merged.append(doc)
_t_materialize = time.perf_counter() _t_materialize = time.perf_counter()
priority, matched_chunk_ids = await self._materialize_priority(merged) priority = await self._materialize_priority(merged)
if folder_mention_ids: if folder_mention_ids:
folder_entries = await self._materialize_folder_priority(folder_mention_ids) folder_entries = await self._materialize_folder_priority(folder_mention_ids)
@ -957,7 +951,6 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
update: dict[str, Any] = { update: dict[str, Any] = {
"kb_priority": priority, "kb_priority": priority,
"kb_matched_chunk_ids": matched_chunk_ids,
} }
if self.inject_system_message: if self.inject_system_message:
new_messages = list(messages) new_messages = list(messages)
@ -1016,13 +1009,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
async def _materialize_priority( async def _materialize_priority(
self, merged: list[dict[str, Any]] self, merged: list[dict[str, Any]]
) -> tuple[list[dict[str, Any]], dict[int, list[int]]]: ) -> list[dict[str, Any]]:
"""Resolve canonical paths and matched chunk ids for the priority list.""" """Resolve canonical paths for the priority list."""
priority: list[dict[str, Any]] = [] priority: list[dict[str, Any]] = []
matched_chunk_ids: dict[int, list[int]] = {}
if not merged: if not merged:
return priority, matched_chunk_ids return priority
_t0 = time.perf_counter() _t0 = time.perf_counter()
async with shielded_async_session() as session: async with shielded_async_session() as session:
@ -1067,18 +1059,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
"mentioned": bool(doc.get("_user_mentioned")), "mentioned": bool(doc.get("_user_mentioned")),
} }
) )
if isinstance(doc_id, int):
chunk_ids = doc.get("matched_chunk_ids") or []
if chunk_ids:
matched_chunk_ids[doc_id] = [
int(cid) for cid in chunk_ids if isinstance(cid, int | str)
]
_perf_log.info( _perf_log.info(
"[kb_priority.materialize] db=%.3fs docs=%d", "[kb_priority.materialize] db=%.3fs docs=%d",
time.perf_counter() - _t0, time.perf_counter() - _t0,
len(merged), len(merged),
) )
return priority, matched_chunk_ids return priority
__all__ = [ __all__ = [

View file

@ -14,8 +14,8 @@ extra fields needed to implement Postgres-backed virtual filesystem semantics:
* ``dirty_path_tool_calls`` sidecar map ``path -> latest tool_call_id`` for * ``dirty_path_tool_calls`` sidecar map ``path -> latest tool_call_id`` for
dirty paths; used to bind the per-path snapshot to an action_id. dirty paths; used to bind the per-path snapshot to an action_id.
* ``kb_priority`` top-K priority hints rendered into a system message. * ``kb_priority`` top-K priority hints rendered into a system message.
* ``kb_matched_chunk_ids`` internal hand-off for matched-chunk highlighting.
* ``kb_anon_doc`` Redis-loaded anonymous document (if any). * ``kb_anon_doc`` Redis-loaded anonymous document (if any).
* ``citation_registry`` per-conversation ``[n]`` -> source map for citations.
* ``tree_version`` bumped by persistence; invalidates the tree render cache. * ``tree_version`` bumped by persistence; invalidates the tree render cache.
* ``workspace_tree_text`` pre-rendered ``<workspace_tree>`` body for the turn. * ``workspace_tree_text`` pre-rendered ``<workspace_tree>`` body for the turn.
@ -30,9 +30,11 @@ from typing import Annotated, Any, NotRequired
from deepagents.middleware.filesystem import FilesystemState from deepagents.middleware.filesystem import FilesystemState
from typing_extensions import TypedDict from typing_extensions import TypedDict
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from app.agents.chat.multi_agent_chat.shared.receipts.receipt import Receipt from app.agents.chat.multi_agent_chat.shared.receipts.receipt import Receipt
from app.agents.chat.multi_agent_chat.shared.state.reducers import ( from app.agents.chat.multi_agent_chat.shared.state.reducers import (
_add_unique_reducer, _add_unique_reducer,
_citation_registry_merge_reducer,
_dict_merge_with_tombstones_reducer, _dict_merge_with_tombstones_reducer,
_int_counter_merge_reducer, _int_counter_merge_reducer,
_list_append_reducer, _list_append_reducer,
@ -162,12 +164,16 @@ class SurfSenseFilesystemState(FilesystemState):
kb_priority: NotRequired[Annotated[list[KbPriorityEntry], _replace_reducer]] kb_priority: NotRequired[Annotated[list[KbPriorityEntry], _replace_reducer]]
"""Top-K priority hints rendered as a system message before the user turn.""" """Top-K priority hints rendered as a system message before the user turn."""
kb_matched_chunk_ids: NotRequired[Annotated[dict[int, list[int]], _replace_reducer]]
"""Internal: ``Document.id`` -> list of matched chunk IDs from hybrid search."""
kb_anon_doc: NotRequired[Annotated[KbAnonDoc | None, _replace_reducer]] kb_anon_doc: NotRequired[Annotated[KbAnonDoc | None, _replace_reducer]]
"""Anonymous-session document loaded from Redis (read-only, no DB row).""" """Anonymous-session document loaded from Redis (read-only, no DB row)."""
citation_registry: NotRequired[
Annotated[CitationRegistry, _citation_registry_merge_reducer]
]
"""Per-conversation ``[n]`` -> source map; written by retrieval, read by the
normalizer. Merges (union, find-or-create) so parallel/subagent registrations
stay globally consistent instead of clobbering each other."""
tree_version: NotRequired[Annotated[int, _replace_reducer]] tree_version: NotRequired[Annotated[int, _replace_reducer]]
"""Monotonically increasing counter; bumped when commits change the KB tree.""" """Monotonically increasing counter; bumped when commits change the KB tree."""

View file

@ -2,7 +2,7 @@
These reducers back the extra state fields used by the cloud-mode filesystem These reducers back the extra state fields used by the cloud-mode filesystem
agent (`cwd`, `staged_dirs`, `pending_moves`, `dirty_paths`, `doc_id_by_path`, agent (`cwd`, `staged_dirs`, `pending_moves`, `dirty_paths`, `doc_id_by_path`,
`kb_priority`, `kb_matched_chunk_ids`, `kb_anon_doc`, `tree_version`). `kb_priority`, `kb_anon_doc`, `tree_version`).
Tools mutate these fields ONLY via `Command(update={...})` returns; the Tools mutate these fields ONLY via `Command(update={...})` returns; the
reducers are responsible for merging successive updates atomically and for reducers are responsible for merging successive updates atomically and for
@ -20,6 +20,8 @@ from __future__ import annotations
from typing import Any, Final, TypeVar from typing import Any, Final, TypeVar
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
_CLEAR: Final[str] = "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00" _CLEAR: Final[str] = "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00"
"""Reset sentinel; pass it inside a list/dict update to request a reset. """Reset sentinel; pass it inside a list/dict update to request a reset.
@ -204,6 +206,41 @@ def _int_counter_merge_reducer(
return base return base
def _as_registry(value: Any) -> CitationRegistry | None:
"""Coerce a state value into a ``CitationRegistry``.
The checkpointer serializes ``Command.update`` via ``ormsgpack`` *before*
reducers run, so an update can arrive as a plain ``dict`` rather than a model.
"""
if value is None:
return None
if isinstance(value, CitationRegistry):
return value
if isinstance(value, dict):
return CitationRegistry.model_validate(value)
return None
def _citation_registry_merge_reducer(
    left: Any,
    right: Any,
) -> CitationRegistry | None:
    """Reducer that unions citation registries rather than overwriting.

    Both sides are coerced with ``_as_registry`` first (the update side, then
    the existing side). If only one side yields a registry, that one is
    returned; if neither does, the result is ``None``. When both are present
    the existing registry's ``merge`` is called with the incoming one. That
    merge is documented as find-or-create across both sides, keeping ``[n]``
    globally consistent when branches (parent + subagents, parallel tool
    calls) register into registries forked from the same base, with
    collisions re-minted rather than dropped. See
    :meth:`CitationRegistry.merge`.
    """
    incoming, existing = _as_registry(right), _as_registry(left)
    if existing is None or incoming is None:
        # At most one usable side: keep whichever exists (possibly ``None``).
        return existing if incoming is None else incoming
    return existing.merge(incoming)
def _initial_filesystem_state() -> dict[str, Any]: def _initial_filesystem_state() -> dict[str, Any]:
"""Default empty values for SurfSense filesystem state fields. """Default empty values for SurfSense filesystem state fields.
@ -222,7 +259,6 @@ def _initial_filesystem_state() -> dict[str, Any]:
"dirty_paths": [], "dirty_paths": [],
"dirty_path_tool_calls": {}, "dirty_path_tool_calls": {},
"kb_priority": [], "kb_priority": [],
"kb_matched_chunk_ids": {},
"kb_anon_doc": None, "kb_anon_doc": None,
"tree_version": 0, "tree_version": 0,
} }
@ -231,6 +267,7 @@ def _initial_filesystem_state() -> dict[str, Any]:
__all__ = [ __all__ = [
"_CLEAR", "_CLEAR",
"_add_unique_reducer", "_add_unique_reducer",
"_citation_registry_merge_reducer",
"_dict_merge_with_tombstones_reducer", "_dict_merge_with_tombstones_reducer",
"_initial_filesystem_state", "_initial_filesystem_state",
"_int_counter_merge_reducer", "_int_counter_merge_reducer",

View file

@ -2,4 +2,4 @@ Read-only specialist for the user's workspace (documents and folders). Use to fi
Pass your full question as one string. The specialist runs in isolation: it cannot see this thread, so include any path hints, filters, or constraints it needs. Pass your full question as one string. The specialist runs in isolation: it cannot see this thread, so include any path hints, filters, or constraints it needs.
The specialist returns plain prose with absolute paths and `[citation:<chunk_id>]` markers when claims came from KB-indexed chunks. Preserve those markers verbatim if you forward the answer. The specialist returns plain prose with absolute paths and `[n]` citation labels when claims came from KB-indexed documents. Preserve those labels verbatim if you forward the answer.

View file

@ -35,42 +35,31 @@ Map outcomes to your `status`:
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
## Chunk citations in your prose ## Citations in your prose
When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation. When `read_file` returns a KB-indexed document under `/documents/`, it comes back as a `<document … view="full">` block whose passages are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. That `[n]` is the citation label. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific passage, append its `[n]` to the sentence stating that fact, copying the label **exactly** as shown. The caller relays these labels verbatim and the server resolves each one, so a wrong number silently breaks the citation.
### Where chunk ids live in `read_file` output ### Where the labels live in `read_file` output
A KB document's XML has three numeric attributes — only **one** is a citation source: A KB document reads back like this — only the bracketed `[n]` is a citation label:
``` ```
<document> <document title="Q2 Roadmap" source="File" view="full">
<document_metadata> [3] First milestone is …
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations. [4] Second milestone is …
...
</document_metadata>
<chunk_index>
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
<entry chunk_id="129" lines="23-30" matched="true"/>
</chunk_index>
<document_content>
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
<chunk id='129'><![CDATA[…]]></chunk>
</document_content>
</document> </document>
``` ```
### Rules ### Rules
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory. - Use the **exact** `[n]` shown next to the passage you actually quoted or paraphrased. Copy it digit-for-digit; do **not** retype from memory or renumber.
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation. - Before emitting an `[n]`, confirm that bracketed label appears in the `read_file` output you are summarising this turn. If you can't see it, omit the citation.
- Never cite `<document_id>` — that's the parent doc, not a chunk. - Labels are **not** sequential by position — a passage may be `[7]` while the one above it is `[3]` (numbering is shared across the whole conversation). Copy what you see; never guess an adjacent number.
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick. - Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links, no parentheses, no footnote numbers.
- Several passages behind one point → each in its own brackets with nothing between: `[3][4]`. Never `[3, 4]` and never a range like `[3-4]`.
- Prefer **fewer accurate citations** over many speculative ones. - Prefer **fewer accurate citations** over many speculative ones.
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`. - Tool results without `[n]` labels (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no label and need none.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers. - Populate `evidence.citations` with **only** the labels you actually emitted — same numbers.
- Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none.
- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits.
## Examples ## Examples
@ -89,7 +78,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
"path": "/documents/meetings/2026-05-11-meeting.md", "path": "/documents/meetings/2026-05-11-meeting.md",
"matched_candidates": null, "matched_candidates": null,
"content_excerpt": null, "content_excerpt": null,
"chunk_ids": null "citations": null
}, },
"next_step": null, "next_step": null,
"missing_fields": null, "missing_fields": null,
@ -121,7 +110,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" } { "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
], ],
"content_excerpt": null, "content_excerpt": null,
"chunk_ids": null "citations": null
}, },
"next_step": "Ask the user which design doc to update.", "next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"], "missing_fields": ["path"],
@ -142,7 +131,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null, "path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null, "matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null, "content_excerpt": string | null,
"chunk_ids": string[] | null "citations": number[] | null
}, },
"next_step": string | null, "next_step": string | null,
"missing_fields": string[] | null, "missing_fields": string[] | null,

View file

@ -33,11 +33,11 @@ Map outcomes to your `status`:
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`. - HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.) You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.)
## Chunk citations in your prose ## Citations in your prose
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work. In desktop mode your filesystem tools read local files only, which are not KB-indexed and carry no `[n]` citation labels. Do not emit `[n]` or `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work.
## Examples ## Examples
@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
"path": "/notes/meetings/2026-05-11-meeting.md", "path": "/notes/meetings/2026-05-11-meeting.md",
"matched_candidates": null, "matched_candidates": null,
"content_excerpt": null, "content_excerpt": null,
"chunk_ids": null "citations": null
}, },
"next_step": null, "next_step": null,
"missing_fields": null, "missing_fields": null,
@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" } { "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
], ],
"content_excerpt": null, "content_excerpt": null,
"chunk_ids": null "citations": null
}, },
"next_step": "Ask the user which design doc to update.", "next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"], "missing_fields": ["path"],
@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null, "path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null, "matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null, "content_excerpt": string | null,
"chunk_ids": string[] | null "citations": number[] | null
}, },
"next_step": string | null, "next_step": string | null,
"missing_fields": string[] | null, "missing_fields": string[] | null,

View file

@ -28,41 +28,30 @@ Reply in plain prose:
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content. - If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop. - If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
## Chunk citations ## Citations
When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation. When the evidence for a claim came from a `read_file` response for a KB-indexed document under `/documents/`, the document reads back as a `<document … view="full">` block whose passages are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. That `[n]` is the citation label. Append the relevant `[n]` to the sentence stating the claim, copying it **exactly** as shown. The caller passes these labels through verbatim and the server resolves each one, so a wrong number silently breaks the citation.
### Where chunk ids live in `read_file` output ### Where the labels live in `read_file` output
A KB document's XML has three numeric attributes — only **one** is a citation source: A KB document reads back like this — only the bracketed `[n]` is a citation label:
``` ```
<document> <document title="Q2 Roadmap" source="File" view="full">
<document_metadata> [3] First milestone is …
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations. [4] Second milestone is …
...
</document_metadata>
<chunk_index>
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
<entry chunk_id="129" lines="23-30" matched="true"/>
</chunk_index>
<document_content>
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
<chunk id='129'><![CDATA[…]]></chunk>
</document_content>
</document> </document>
``` ```
### Rules ### Rules
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory. - Use the **exact** `[n]` shown next to the passage you actually quoted or paraphrased. Copy it digit-for-digit; do **not** retype from memory or renumber.
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation. - Before emitting an `[n]`, confirm that bracketed label appears in the `read_file` output you are summarising this turn. If you can't see it, omit the citation.
- Never cite `<document_id>` — that's the parent doc, not a chunk. - Labels are **not** sequential by position — a passage may be `[7]` while the one above it is `[3]` (numbering is shared across the whole conversation). Copy what you see; never guess an adjacent number.
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick. - Prefer **fewer accurate citations** over many speculative ones. One correct `[3]` is more useful than a string of wrong numbers.
- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids. - Several passages behind one point → each in its own brackets with nothing between: `[3][4]`. Never `[3, 4]` and never a range like `[3-4]`.
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`. - Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links, no parentheses, no footnote numbers.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers. - If a claim came from a tool result that did **not** carry `[n]` labels (`ls`, `glob`, `grep` listings, error strings), skip the citation.
- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation. - The absolute path under `/documents/` is always required; `[n]` labels are additive, they do not replace the path reference.
- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference.
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].` Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [3][4].`

View file

@ -29,6 +29,6 @@ Reply in plain prose:
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content. - If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop. - If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
## Chunk citations ## Citations
In desktop mode your filesystem tools read local files only, and local-file `read_file` responses do **not** carry `<chunk id='…'>` tags. Cite each claim with the absolute local path; do not emit `[citation:…]` markers — your caller has nothing to resolve them against. In desktop mode your filesystem tools read local files only, which are not KB-indexed and carry no `[n]` citation labels. Cite each claim with the absolute local path; do not emit `[n]` or `[citation:…]` markers — your caller has nothing to resolve them against.

View file

@ -13,7 +13,7 @@ from app.schemas.new_chat import MentionedDocumentInfo
from .chat import resolve_chat_references from .chat import resolve_chat_references
from .connectors import resolve_connector_references from .connectors import resolve_connector_references
from .documents import resolve_document_references from .documents import referenced_document_ids, resolve_document_references
from .folders import resolve_folder_references from .folders import resolve_folder_references
from .models import ( from .models import (
ChatReference, ChatReference,
@ -89,6 +89,7 @@ __all__ = [
"FolderReference", "FolderReference",
"Reference", "Reference",
"ReferenceKind", "ReferenceKind",
"referenced_document_ids",
"render_reference_pointers", "render_reference_pointers",
"resolve_references", "resolve_references",
] ]

View file

@ -0,0 +1,13 @@
"""Resolve ``@document`` references.
Two concerns, one subject: ``resolver`` turns document ids into pointer
references for the model, ``referenced`` turns ``@document`` / ``@folder``
mentions into the document ids a retrieval is confined to.
"""
from __future__ import annotations
from .referenced import referenced_document_ids
from .resolver import resolve_document_references
__all__ = ["referenced_document_ids", "resolve_document_references"]

View file

@ -0,0 +1,39 @@
"""Resolve ``@document`` / ``@folder`` mentions to the documents they point at.
Reference resolution, not retrieval: this answers "which knowledge-base
documents did the user point at this turn?". ``@document`` ids pass through;
``@folder`` ids expand to the documents directly inside each folder within this
search space (direct children only, not nested subfolders). The caller turns the
returned ids into a retrieval ``SearchScope``.
"""
from __future__ import annotations
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document
async def referenced_document_ids(
    session: AsyncSession,
    *,
    search_space_id: int,
    document_ids: list[int] | None = None,
    folder_ids: list[int] | None = None,
) -> tuple[int, ...]:
    """Return the sorted, de-duplicated ids of the documents the user referenced.

    Ids in ``document_ids`` are taken as given. Every id in ``folder_ids`` adds
    the documents whose ``folder_id`` equals it within ``search_space_id``.
    An empty tuple means nothing was referenced.
    """
    referenced: set[int] = set(document_ids or [])
    folder_filter = [*(folder_ids or [])]
    if not folder_filter:
        # No folder mentions, so there is nothing to look up in the database.
        return tuple(sorted(referenced))

    folder_members = select(Document.id).where(
        Document.search_space_id == search_space_id,
        Document.folder_id.in_(folder_filter),
    )
    result = await session.execute(folder_members)
    for member_id in result.scalars().all():
        referenced.add(member_id)
    return tuple(sorted(referenced))
__all__ = ["referenced_document_ids"]

View file

@ -8,7 +8,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.runtime.path_resolver import PathIndex, doc_to_virtual_path from app.agents.chat.runtime.path_resolver import PathIndex, doc_to_virtual_path
from app.db import Document from app.db import Document
from .models import DocumentReference from ..models import DocumentReference
async def resolve_document_references( async def resolve_document_references(

View file

@ -0,0 +1,237 @@
"""Behavior tests for the ``search_knowledge_base`` main-agent tool.
These exercise the tool through its public contract: seed a real document,
invoke the tool, and assert on the ``Command`` it returns: the rendered
``<retrieved_context>`` carries ``[n]`` labels and the citation registry handed
back on state is populated.
The tool's own DB session is redirected to the test session, and the embedding
leg is pinned so the search is deterministic without a live model.
"""
from __future__ import annotations
import contextlib
import uuid
from types import SimpleNamespace
import pytest
from langchain_core.messages import ToolMessage
from langgraph.types import Command
from app.agents.chat.multi_agent_chat.main_agent.tools import search_knowledge_base
from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
create_search_knowledge_base_tool,
)
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from app.config import config
from app.db import Chunk, Document, DocumentType, Folder
pytestmark = pytest.mark.integration
_DIM = config.embedding_model_instance.dimension
def _axis(index: int) -> list[float]:
    """Build a ``_DIM``-length vector of zeros with ``1.0`` at ``index``."""
    unit = [0.0 for _ in range(_DIM)]
    unit[index] = 1.0
    return unit
async def _add_document(
    db_session,
    *,
    search_space_id: int,
    title: str,
    text: str,
    folder_id: int | None = None,
):
    """Add a ready ``FILE`` document with a single chunk and return the document.

    The document is flushed before the chunk is built, and the chunk then
    references ``document.id``. The one chunk holds the full ``text`` at
    position 0 and carries the ``_axis(0)`` embedding.
    """
    seeded = Document(
        search_space_id=search_space_id,
        folder_id=folder_id,
        title=title,
        document_type=DocumentType.FILE,
        content=text,
        content_hash=uuid.uuid4().hex,
        status={"state": "ready"},
    )
    db_session.add(seeded)
    await db_session.flush()

    only_chunk = Chunk(
        document_id=seeded.id,
        position=0,
        content=text,
        embedding=_axis(0),
    )
    db_session.add(only_chunk)
    await db_session.flush()
    return seeded
async def _add_folder(db_session, *, search_space_id: int, name: str = "Folder"):
    """Add a folder at position ``"0"`` in the search space, flush, and return it."""
    created = Folder(search_space_id=search_space_id, name=name, position="0")
    db_session.add(created)
    await db_session.flush()
    return created
@pytest.fixture
def _tool_uses_test_session(db_session, monkeypatch):
    """Make the tool module's ``shielded_async_session`` yield the test session."""

    @contextlib.asynccontextmanager
    async def _yield_test_session():
        yield db_session

    monkeypatch.setattr(
        search_knowledge_base,
        "shielded_async_session",
        _yield_test_session,
    )
@pytest.fixture
def _pinned_embedding(monkeypatch):
    """Stub the embedding model's ``embed`` so every query maps to ``_axis(0)``."""

    def _constant_embedding(_query):
        return _axis(0)

    monkeypatch.setattr(
        config.embedding_model_instance,
        "embed",
        _constant_embedding,
    )
async def _invoke(tool, query: str, state: dict | None = None, context=None):
runtime = SimpleNamespace(
state=state or {}, tool_call_id="call-1", context=context
)
return await tool.coroutine(query, runtime)
def _mentions(*, document_ids=(), folder_ids=()):
    """Build a context stand-in carrying the mentioned document and folder ids.

    Both id collections are copied into fresh lists.
    """
    context = SimpleNamespace()
    context.mentioned_document_ids = [*document_ids]
    context.mentioned_folder_ids = [*folder_ids]
    return context
async def test_tool_returns_retrieved_context_with_numbered_passages(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """A matching search returns a ``Command`` whose tool message is numbered."""
    space_id = db_search_space.id
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Asyncio Guide",
        text="The asyncio library enables concurrency.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)

    outcome = await _invoke(search_tool, "asyncio")

    assert isinstance(outcome, Command)
    tool_message = outcome.update["messages"][0]
    assert isinstance(tool_message, ToolMessage)
    rendered = tool_message.content
    assert "<retrieved_context>" in rendered
    assert "[1]" in rendered
async def test_tool_populates_citation_registry_on_state(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """The state update carries a non-empty ``CitationRegistry``."""
    space_id = db_search_space.id
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Asyncio Guide",
        text="The asyncio library enables concurrency.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)

    outcome = await _invoke(search_tool, "asyncio")

    handed_back = outcome.update["citation_registry"]
    assert isinstance(handed_back, CitationRegistry)
    # At least one passage must have been registered under an [n] label.
    assert handed_back.by_n
async def test_tool_reuses_existing_registry_numbering(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """Searching the same passage again with the carried registry adds no label."""
    space_id = db_search_space.id
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Asyncio Guide",
        text="The asyncio library enables concurrency.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)

    first_run = await _invoke(search_tool, "asyncio")
    state_with_registry = {
        "citation_registry": first_run.update["citation_registry"]
    }
    second_run = await _invoke(search_tool, "asyncio", state=state_with_registry)

    # Find-or-create: the passage seen twice still owns exactly one [n].
    labels = second_run.update["citation_registry"].by_n
    assert len(labels) == 1
async def test_tool_reports_no_matches_without_touching_state(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """With nothing seeded, the tool answers with a plain no-matches string."""
    search_tool = create_search_knowledge_base_tool(
        search_space_id=db_search_space.id
    )

    outcome = await _invoke(search_tool, "nonexistent-term-zzz")

    assert isinstance(outcome, str)
    assert "No knowledge-base matches" in outcome
async def test_tool_rejects_empty_query(
    db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """A whitespace-only query is answered with a ``non-empty`` string message."""
    search_tool = create_search_knowledge_base_tool(
        search_space_id=db_search_space.id
    )

    outcome = await _invoke(search_tool, " ")

    assert isinstance(outcome, str)
    assert "non-empty" in outcome
async def test_document_mention_confines_search_to_pinned_doc(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """Mentioning one document keeps the other matching document out of the result."""
    space_id = db_search_space.id
    mentioned_doc = await _add_document(
        db_session,
        search_space_id=space_id,
        title="Pinned",
        text="asyncio appears in the pinned doc.",
    )
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Other",
        text="asyncio appears in the other doc.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)

    turn_context = _mentions(document_ids=[mentioned_doc.id])
    outcome = await _invoke(search_tool, "asyncio", context=turn_context)

    # Only the pinned document may show up in the rendered passages.
    rendered = outcome.update["messages"][0].content
    assert "Pinned" in rendered
    assert "Other" not in rendered
async def test_folder_mention_confines_search_to_folder_documents(
    db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
):
    """Mentioning a folder keeps a matching document outside it out of the result."""
    space_id = db_search_space.id
    mentioned_folder = await _add_folder(db_session, search_space_id=space_id)
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Inside",
        text="asyncio appears inside the folder.",
        folder_id=mentioned_folder.id,
    )
    await _add_document(
        db_session,
        search_space_id=space_id,
        title="Outside",
        text="asyncio appears outside the folder.",
    )
    search_tool = create_search_knowledge_base_tool(search_space_id=space_id)

    turn_context = _mentions(folder_ids=[mentioned_folder.id])
    outcome = await _invoke(search_tool, "asyncio", context=turn_context)

    # Only the document filed in the folder may show up in the rendered passages.
    rendered = outcome.update["messages"][0].content
    assert "Inside" in rendered
    assert "Outside" not in rendered

View file

@ -4,9 +4,14 @@ from __future__ import annotations
import pytest import pytest
from app.agents.chat.multi_agent_chat.shared.citations import (
CitationRegistry,
CitationSourceType,
)
from app.agents.chat.multi_agent_chat.shared.state.reducers import ( from app.agents.chat.multi_agent_chat.shared.state.reducers import (
_CLEAR, _CLEAR,
_add_unique_reducer, _add_unique_reducer,
_citation_registry_merge_reducer,
_dict_merge_with_tombstones_reducer, _dict_merge_with_tombstones_reducer,
_initial_filesystem_state, _initial_filesystem_state,
_list_append_reducer, _list_append_reducer,
@ -93,6 +98,57 @@ class TestDictMergeWithTombstones:
} }
def _kb_registry(chunk_id: int) -> CitationRegistry:
    """Return a new registry with one ``KB_CHUNK`` entry for ``chunk_id``.

    The entry's locator always uses ``document_id`` 1.
    """
    locator = {"document_id": 1, "chunk_id": chunk_id}
    fresh = CitationRegistry()
    fresh.register(CitationSourceType.KB_CHUNK, locator)
    return fresh
class TestCitationRegistryMergeReducer:
    """Merge behavior of ``_citation_registry_merge_reducer``."""

    @staticmethod
    def _registered_chunk_ids(registry) -> set:
        """Collect the ``chunk_id`` of every entry in ``registry.by_n``."""
        return {entry.locator["chunk_id"] for entry in registry.by_n.values()}

    def test_none_left_returns_right(self):
        incoming = _kb_registry(10)
        assert _citation_registry_merge_reducer(None, incoming) is incoming

    def test_none_right_returns_left(self):
        existing = _kb_registry(10)
        assert _citation_registry_merge_reducer(existing, None) is existing

    def test_both_none_returns_none(self):
        assert _citation_registry_merge_reducer(None, None) is None

    def test_unions_two_registries(self):
        combined = _citation_registry_merge_reducer(
            _kb_registry(10), _kb_registry(11)
        )
        assert self._registered_chunk_ids(combined) == {10, 11}

    def test_coerces_serialized_dict_update(self):
        # Command.update is serialized by the checkpointer (ormsgpack) before
        # the reducer runs, so the incoming side may be a plain dict.
        existing = _kb_registry(10)
        incoming_as_dict = _kb_registry(11).model_dump()
        combined = _citation_registry_merge_reducer(existing, incoming_as_dict)
        assert self._registered_chunk_ids(combined) == {10, 11}

    def test_coerces_both_sides_from_dict(self):
        existing_as_dict = _kb_registry(10).model_dump()
        incoming_as_dict = _kb_registry(11).model_dump()
        combined = _citation_registry_merge_reducer(
            existing_as_dict, incoming_as_dict
        )
        assert isinstance(combined, CitationRegistry)
        assert self._registered_chunk_ids(combined) == {10, 11}
class TestInitialFilesystemState: class TestInitialFilesystemState:
def test_default_shape(self): def test_default_shape(self):
state = _initial_filesystem_state() state = _initial_filesystem_state()
@ -106,7 +162,6 @@ class TestInitialFilesystemState:
assert state["dirty_paths"] == [] assert state["dirty_paths"] == []
assert state["dirty_path_tool_calls"] == {} assert state["dirty_path_tool_calls"] == {}
assert state["kb_priority"] == [] assert state["kb_priority"] == []
assert state["kb_matched_chunk_ids"] == {}
assert state["kb_anon_doc"] is None assert state["kb_anon_doc"] is None
assert state["tree_version"] == 0 assert state["tree_version"] == 0

View file

@ -6,9 +6,6 @@ import pytest
from langchain_core.messages import AIMessage, HumanMessage from langchain_core.messages import AIMessage, HumanMessage
from app.agents.chat.multi_agent_chat.shared.middleware import knowledge_search as ks from app.agents.chat.multi_agent_chat.shared.middleware import knowledge_search as ks
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
build_document_xml as _build_document_xml,
)
from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import ( from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import (
KBSearchPlan, KBSearchPlan,
KnowledgePriorityMiddleware, KnowledgePriorityMiddleware,
@ -59,88 +56,6 @@ class TestResolveSearchTypes:
assert result.count("FILE") == 1 assert result.count("FILE") == 1
# ── _build_document_xml ────────────────────────────────────────────────
class TestBuildDocumentXml:
    """Checks on the XML text produced by ``_build_document_xml``."""

    @pytest.fixture
    def sample_document(self):
        """A three-chunk ``FILE`` document payload with id 42."""
        document_info = {
            "id": 42,
            "document_type": "FILE",
            "title": "Test Doc",
            "metadata": {"url": "https://example.com"},
        }
        chunk_rows = [
            {"chunk_id": 101, "content": "First chunk content"},
            {"chunk_id": 102, "content": "Second chunk content"},
            {"chunk_id": 103, "content": "Third chunk content"},
        ]
        return {
            "document_id": 42,
            "document": document_info,
            "chunks": chunk_rows,
        }

    def test_contains_document_metadata(self, sample_document):
        rendered = _build_document_xml(sample_document)
        assert "<document_id>42</document_id>" in rendered
        assert "<document_type>FILE</document_type>" in rendered
        assert "Test Doc" in rendered

    def test_contains_chunk_index(self, sample_document):
        rendered = _build_document_xml(sample_document)
        assert "<chunk_index>" in rendered
        assert "</chunk_index>" in rendered
        assert 'chunk_id="101"' in rendered
        assert 'chunk_id="102"' in rendered
        assert 'chunk_id="103"' in rendered

    def test_matched_chunks_flagged_in_index(self, sample_document):
        rendered = _build_document_xml(
            sample_document, matched_chunk_ids={101, 103}
        )
        # (marker, whether the row carrying it must be flagged as matched)
        expectations = (
            ('chunk_id="101"', True),
            ('chunk_id="102"', False),
            ('chunk_id="103"', True),
        )
        for row in rendered.split("\n"):
            for marker, should_be_flagged in expectations:
                if marker not in row:
                    continue
                if should_be_flagged:
                    assert 'matched="true"' in row
                else:
                    assert 'matched="true"' not in row

    def test_chunk_content_in_document_content_section(self, sample_document):
        rendered = _build_document_xml(sample_document)
        assert "<document_content>" in rendered
        assert "First chunk content" in rendered
        assert "Second chunk content" in rendered
        assert "Third chunk content" in rendered

    def test_line_numbers_in_chunk_index_are_accurate(self, sample_document):
        """Verify that the line ranges in chunk_index actually point to the right content."""
        import re

        rendered = _build_document_xml(sample_document, matched_chunk_ids={101})
        rows = rendered.split("\n")
        index_row = next(
            (row for row in rows if 'chunk_id="101"' in row and "lines=" in row),
            None,
        )
        if index_row is None:
            pytest.fail("chunk_id=101 entry not found in chunk_index")

        span = re.search(r'lines="(\d+)-(\d+)"', index_row)
        assert span, f"No lines= attribute found in: {index_row}"
        first_line, _last_line = (int(bound) for bound in span.groups())
        # The index is 1-based, so shift by one to address the list of rows.
        pointed_at = rows[first_line - 1]
        assert "101" in pointed_at
        assert "First chunk content" in pointed_at

    def test_splits_into_lines_correctly(self, sample_document):
        """Each chunk occupies exactly one line (no embedded newlines)."""
        rendered = _build_document_xml(sample_document)
        chunk_row_count = sum(
            1
            for row in rendered.split("\n")
            if "<![CDATA[" in row and "<chunk" in row
        )
        assert chunk_row_count == 3
# ── planner parsing / date normalization ─────────────────────────────── # ── planner parsing / date normalization ───────────────────────────────