document-render: unify KB/web rendering on numbered [n] passages

Add a shared document_render package that renders sources as
<document view="excerpt|full"> blocks with server-assigned [n] passage
labels (KB locator {document_id, chunk_id}, web locator {url}). Wire the
KB read backend (kb_postgres) and read_file to the new renderer and drop
the legacy per-document XML renderer (document_xml, retrieved_context) and
the old chunk_index / matched="true" / <chunk id> read format.
This commit is contained in:
CREDO23 2026-06-25 15:26:16 +02:00
parent 852ab3a576
commit 265888d21c
25 changed files with 846 additions and 501 deletions

View file

@ -0,0 +1,25 @@
"""Render citable documents for the model: one shape for search, read, and web.
``render_document`` emits one ``<document title= source= view="excerpt|full">``
block whose passages carry server-assigned ``[n]`` labels. ``render_search_context``
wraps KB excerpt blocks in ``<retrieved_context>``; ``render_web_results`` wraps web
excerpt blocks in ``<web_results>``. Both cite with the same ``[n]`` spine.
"""
from __future__ import annotations
from .document import render_document
from .models import DocumentView, RenderableDocument, RenderablePassage
from .search_context import render_search_context
from .source_label import source_label
from .web_results import render_web_results
__all__ = [
"DocumentView",
"RenderableDocument",
"RenderablePassage",
"render_document",
"render_search_context",
"render_web_results",
"source_label",
]

View file

@ -0,0 +1,70 @@
"""Render one citable document as a ``<document>`` block.
Every citable surface (KB search excerpts, KB full reads, web results) uses the
same block; ``view`` and the passages shown are what differ. Each passage is
registered for citation as it renders, so its ``[n]`` resolves back to its source
later.
"""
from __future__ import annotations
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from .models import DocumentView, RenderableDocument, RenderablePassage
def render_document(
document: RenderableDocument,
*,
view: DocumentView,
registry: CitationRegistry,
) -> str | None:
"""Render one ``<document>`` block, registering each passage for citation.
Returns ``None`` when the document has no passage to show. Mutates ``registry``
(find-or-create).
"""
if not document.passages:
return None
lines = [_open_tag(document, view)]
for passage in document.passages:
lines.append(_render_passage(document, passage, registry))
lines.append("</document>")
return "\n".join(lines)
def _open_tag(document: RenderableDocument, view: DocumentView) -> str:
attrs = [f'title="{_attr(document.title)}"']
if document.source:
attrs.append(f'source="{_attr(document.source)}"')
attrs.append(f'view="{view}"')
return f"<document {' '.join(attrs)}>"
def _render_passage(
document: RenderableDocument,
passage: RenderablePassage,
registry: CitationRegistry,
) -> str:
n = registry.register(
passage.source_type,
passage.locator,
{"title": document.title, "source": document.source},
)
label = f" [{n}] "
body = passage.content.strip().replace("\n", "\n" + " " * len(label))
return f"{label}{body}"
def _attr(value: str) -> str:
collapsed = " ".join(str(value).split())
return (
collapsed.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
)
__all__ = ["render_document"]

View file

@ -0,0 +1,42 @@
"""Inputs for rendering a citable document for the model.
A passage is one citable unit what the model cites with ``[n]``. A document
groups the passages shown from one source. The same shapes feed every citable
surface: KB search excerpts, KB full reads, and web results.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Literal
from app.agents.chat.multi_agent_chat.shared.citations import CitationSourceType
DocumentView = Literal["excerpt", "full"]
"""How much of the source is shown: a search slice, or the whole object."""
@dataclass(frozen=True)
class RenderablePassage:
"""One citable unit: what the model cites with ``[n]``.
``locator`` is the source-specific identity registered for this passage (a KB
chunk's ``{document_id, chunk_id}``, a web result's ``{url}``). ``source_type``
selects how that locator resolves to a frontend payload.
"""
content: str
locator: dict[str, Any]
source_type: CitationSourceType = CitationSourceType.KB_CHUNK
@dataclass(frozen=True)
class RenderableDocument:
"""A source document and the passages to render from it, in order."""
title: str
source: str | None = None
passages: list[RenderablePassage] = field(default_factory=list)
__all__ = ["DocumentView", "RenderableDocument", "RenderablePassage"]

View file

@ -0,0 +1,53 @@
"""Wrap search excerpts in the ``<retrieved_context>`` block.
Each document renders through the shared ``render_document``; this module adds the
container and the one-time header that teaches the model how to read and cite.
"""
from __future__ import annotations
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from .document import render_document
from .models import RenderableDocument
_HEADER = (
"These are excerpts from the user's knowledge base, selected for this query.\n"
"A document is a full source (a file, a Slack thread, a Notion page); each\n"
"<document> below is in excerpt view, so you are seeing only the chunks that\n"
"matched this query, not the whole source. Cite a chunk with its [n]. Read the\n"
"document for full context before claiming it only says X."
)
def render_search_context(
documents: list[RenderableDocument],
registry: CitationRegistry,
) -> str | None:
"""Render retrieved documents as excerpt blocks inside ``<retrieved_context>``.
Returns ``None`` when no document has a passage to show, so the caller can skip
the block. Mutates ``registry`` (find-or-create), so a passage seen again in a
later turn keeps its original ``[n]``.
"""
blocks = [
block
for document in documents
if (
block := render_document(document, view="excerpt", registry=registry)
)
is not None
]
if not blocks:
return None
return (
"<retrieved_context>\n"
+ _HEADER
+ "\n"
+ "\n".join(blocks)
+ "\n</retrieved_context>"
)
__all__ = ["render_search_context"]

View file

@ -1,8 +1,10 @@
"""Build a short, honest source label for a retrieved document.
"""Build a short, honest source label for a knowledge-base document.
A label orients the model about where a passage came from e.g. ``Slack`` or
``Web · docs.python.org``. It is derived only from the document's type and any
URL in its metadata, so it never asserts detail we don't actually have.
URL in its metadata, so it never asserts detail we don't actually have. Search
hits and full reads both build their ``<document source=>`` from here, so the
label a passage carries is identical whichever surface it arrives through.
"""
from __future__ import annotations

View file

@ -0,0 +1,54 @@
"""Wrap live web-search results in a ``<web_results>`` block.
Each result renders through the shared ``render_document`` (excerpt view), so a
web result is cited with ``[n]`` exactly like a knowledge-base passage. Only the
container and header differ they tell the model these came from the public web,
not the user's workspace.
"""
from __future__ import annotations
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from .document import render_document
from .models import RenderableDocument
_HEADER = (
"These are live results from a public web search for this query. Each\n"
"<document> below is one result in excerpt view; cite a result with its [n]\n"
"after the statement it supports. Scrape the URL for full context before\n"
"making a definitive claim from a snippet."
)
def render_web_results(
documents: list[RenderableDocument],
registry: CitationRegistry,
) -> str | None:
"""Render web results as excerpt blocks inside ``<web_results>``.
Returns ``None`` when no result has content to show, so the caller can skip
the block. Mutates ``registry`` (find-or-create), so a URL seen again keeps
its original ``[n]``.
"""
blocks = [
block
for document in documents
if (
block := render_document(document, view="excerpt", registry=registry)
)
is not None
]
if not blocks:
return None
return (
"<web_results>\n"
+ _HEADER
+ "\n"
+ "\n".join(blocks)
+ "\n</web_results>"
)
__all__ = ["render_web_results"]

View file

@ -1,103 +0,0 @@
"""Shared XML builder for KB documents.
Produces the citation-friendly XML used by every read of a knowledge-base
document (lazy-loaded by :class:`KBPostgresBackend` and synthetic anonymous
files). The XML carries a ``<chunk_index>`` near the top so the LLM can jump
directly to matched-chunk line ranges via ``read_file(offset=, limit=)``.
Extracted from the original ``knowledge_search.py`` so the backend, the
priority middleware, and any future renderer share a single implementation.
"""
from __future__ import annotations
import json
from typing import Any
def build_document_xml(
document: dict[str, Any],
matched_chunk_ids: set[int] | None = None,
) -> str:
"""Build citation-friendly XML with a ``<chunk_index>`` for smart seeking.
Args:
document: Dict shape produced by hybrid search / lazy-load helpers.
Expected keys: ``document`` (with ``id``, ``title``,
``document_type``, ``metadata``) and ``chunks``
(list of ``{chunk_id, content}``).
matched_chunk_ids: Optional set of chunk IDs to flag as
``matched="true"`` in the chunk index.
"""
matched = matched_chunk_ids or set()
doc_meta = document.get("document") or {}
metadata = (doc_meta.get("metadata") or {}) if isinstance(doc_meta, dict) else {}
document_id = doc_meta.get("id", document.get("document_id", "unknown"))
document_type = doc_meta.get("document_type", document.get("source", "UNKNOWN"))
title = doc_meta.get("title") or metadata.get("title") or "Untitled Document"
url = (
metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
)
metadata_json = json.dumps(metadata, ensure_ascii=False)
metadata_lines: list[str] = [
"<document>",
"<document_metadata>",
f" <document_id>{document_id}</document_id>",
f" <document_type>{document_type}</document_type>",
f" <title><![CDATA[{title}]]></title>",
f" <url><![CDATA[{url}]]></url>",
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
"</document_metadata>",
"",
]
chunks = document.get("chunks") or []
chunk_entries: list[tuple[int | None, str]] = []
if isinstance(chunks, list):
for chunk in chunks:
if not isinstance(chunk, dict):
continue
chunk_id = chunk.get("chunk_id") or chunk.get("id")
chunk_content = str(chunk.get("content", "")).strip()
if not chunk_content:
continue
if chunk_id is None:
xml = f" <chunk><![CDATA[{chunk_content}]]></chunk>"
else:
xml = f" <chunk id='{chunk_id}'><![CDATA[{chunk_content}]]></chunk>"
chunk_entries.append((chunk_id, xml))
index_overhead = 1 + len(chunk_entries) + 1 + 1 + 1
first_chunk_line = len(metadata_lines) + index_overhead + 1
current_line = first_chunk_line
index_entry_lines: list[str] = []
for cid, xml_str in chunk_entries:
num_lines = xml_str.count("\n") + 1
end_line = current_line + num_lines - 1
matched_attr = ' matched="true"' if cid is not None and cid in matched else ""
if cid is not None:
index_entry_lines.append(
f' <entry chunk_id="{cid}" lines="{current_line}-{end_line}"{matched_attr}/>'
)
else:
index_entry_lines.append(
f' <entry lines="{current_line}-{end_line}"{matched_attr}/>'
)
current_line = end_line + 1
lines = metadata_lines.copy()
lines.append("<chunk_index>")
lines.extend(index_entry_lines)
lines.append("</chunk_index>")
lines.append("")
lines.append("<document_content>")
for _, xml_str in chunk_entries:
lines.append(xml_str)
lines.extend(["</document_content>", "</document>"])
return "\n".join(lines)
__all__ = ["build_document_xml"]

View file

@ -42,8 +42,15 @@ from langchain.tools import ToolRuntime
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
build_document_xml,
from app.agents.chat.multi_agent_chat.shared.citations import (
CitationRegistry,
CitationSourceType,
)
from app.agents.chat.multi_agent_chat.shared.document_render import (
RenderableDocument,
RenderablePassage,
render_document,
source_label,
)
from app.agents.chat.runtime.path_resolver import (
DOCUMENTS_ROOT,
@ -59,6 +66,21 @@ _TEMP_PREFIX = "temp_"
_GREP_MAX_TOTAL_MATCHES = 50
_GREP_MAX_PER_DOC = 5
_EMPTY_DOCUMENT_NOTICE = "(This document has no readable content.)"
def render_full_document(
document: RenderableDocument,
registry: CitationRegistry,
) -> str:
"""Render a whole KB document (``view="full"``), registering each chunk's ``[n]``.
Falls back to a short notice when the document has no chunks, so a read never
returns blank.
"""
rendered = render_document(document, view="full", registry=registry)
return rendered if rendered is not None else _EMPTY_DOCUMENT_NOTICE
def _basename(path: str) -> str:
return path.rsplit("/", 1)[-1]
@ -127,13 +149,6 @@ class KBPostgresBackend(BackendProtocol):
anon = self.state.get("kb_anon_doc")
return anon if isinstance(anon, dict) else None
def _matched_chunk_ids(self, doc_id: int) -> set[int]:
mapping = self.state.get("kb_matched_chunk_ids") or {}
try:
return set(mapping.get(doc_id, []) or [])
except TypeError:
return set()
@staticmethod
def _file_data_size(file_data: dict[str, Any]) -> int:
try:
@ -466,80 +481,93 @@ class KBPostgresBackend(BackendProtocol):
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
return asyncio.run(self.aread(file_path, offset, limit))
async def _load_file_data(
async def aload_document(
self,
path: str,
) -> tuple[dict[str, Any], int | None] | None:
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
) -> tuple[RenderableDocument, int | None] | None:
"""Lazy-load a virtual KB document as a :class:`RenderableDocument`.
Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
to any known document. ``doc_id`` is ``None`` for the synthetic
anonymous document so the caller doesn't track it as a DB-backed file.
Returns ``(document, doc_id)`` with every chunk in document order, or
``None`` if the path maps to no known document. ``doc_id`` is ``None``
for the synthetic anonymous upload so the caller doesn't track it as a
DB-backed file. Pure data rendering and citation registration happen in
the caller (see :meth:`_load_file_data` and the ``read_file`` tool).
"""
anon = self._kb_anon_doc()
if anon and str(anon.get("path") or "") == path:
doc_payload = {
"document_id": -1,
"chunks": list(anon.get("chunks") or []),
"matched_chunk_ids": [],
"document": {
"id": -1,
"title": anon.get("title") or "uploaded_document",
"document_type": "FILE",
"metadata": {"source": "anonymous_upload"},
},
"source": "FILE",
}
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
file_data = create_file_data(xml)
return file_data, None
document = RenderableDocument(
title=str(anon.get("title") or "uploaded_document"),
source="Uploaded file",
passages=[
RenderablePassage(
content=str(chunk.get("content", "")),
locator={
"document_id": -1,
"chunk_id": int(chunk["chunk_id"]),
},
source_type=CitationSourceType.ANON_CHUNK,
)
for chunk in (anon.get("chunks") or [])
if isinstance(chunk, dict) and chunk.get("chunk_id") is not None
],
)
return document, None
if not path.startswith(DOCUMENTS_ROOT):
return None
async with shielded_async_session() as session:
document = await virtual_path_to_doc(
document_row = await virtual_path_to_doc(
session,
search_space_id=self.search_space_id,
virtual_path=path,
)
if document is None:
if document_row is None:
return None
chunk_rows = await session.execute(
select(Chunk.id, Chunk.content)
.where(Chunk.document_id == document.id)
.where(Chunk.document_id == document_row.id)
.order_by(Chunk.position, Chunk.id)
)
chunks = [
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
]
chunks = chunk_rows.all()
doc_payload = {
"document_id": document.id,
"chunks": chunks,
"matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
"document": {
"id": document.id,
"title": document.title,
"document_type": (
document.document_type.value
if getattr(document, "document_type", None) is not None
else "UNKNOWN"
),
"metadata": dict(document.document_metadata or {}),
},
"source": (
document.document_type.value
if getattr(document, "document_type", None) is not None
else "UNKNOWN"
),
}
xml = build_document_xml(
doc_payload,
matched_chunk_ids=self._matched_chunk_ids(document.id),
document_type = (
document_row.document_type.value
if getattr(document_row, "document_type", None) is not None
else None
)
file_data = create_file_data(xml)
return file_data, document.id
metadata = dict(document_row.document_metadata or {})
document = RenderableDocument(
title=document_row.title,
source=source_label(document_type, metadata),
passages=[
RenderablePassage(
content=row.content,
locator={"document_id": document_row.id, "chunk_id": row.id},
)
for row in chunks
],
)
return document, document_row.id
async def _load_file_data(
self,
path: str,
) -> tuple[dict[str, Any], int | None] | None:
"""Render a virtual KB document into a deepagents ``FileData``.
Used by the filesystem ops (move/edit existence + content staging) and the
backend's own ``aread``/``aedit``. These have no conversation registry to
persist into, so the ``[n]`` labels are minted into a throwaway registry
the canonical, citation-persisting read is the ``read_file`` tool, which
renders from :meth:`aload_document` against the state registry.
"""
loaded = await self.aload_document(path)
if loaded is None:
return None
document, doc_id = loaded
rendered = render_full_document(document, CitationRegistry())
return create_file_data(rendered), doc_id
# ------------------------------------------------------------------ writes
@ -1037,4 +1065,5 @@ __all__ = [
"KBPostgresBackend",
"list_tree_listing",
"paginate_listing",
"render_full_document",
]

View file

@ -37,8 +37,8 @@ def build_backend_resolver(
In cloud mode the resolver returns a fresh :class:`KBPostgresBackend`
bound to the current ``runtime`` so the backend can read staging state
(``staged_dirs``, ``pending_moves``, ``files`` cache, ``kb_anon_doc``,
``kb_matched_chunk_ids``) for each tool call. When no ``search_space_id``
(``staged_dirs``, ``pending_moves``, ``files`` cache, ``kb_anon_doc``)
for each tool call. When no ``search_space_id``
is provided, the resolver falls back to :class:`StateBackend` (used by
sub-agents and tests that don't need DB-backed reads).

View file

@ -35,26 +35,20 @@ current working directory (`cwd`, default `/documents`).
turn alongside any new/edited documents. Snapshot/revert is enabled
for every destructive operation when action logging is on.
## Reading Documents Efficiently
## Reading Documents
Documents are formatted as XML. Each document contains:
- `<document_metadata>` title, type, URL, etc.
- `<chunk_index>` a table of every chunk with its **line range** and a
`matched="true"` flag for chunks that matched the search query.
- `<document_content>` the actual chunks in original document order.
**Workflow**: when reading a large document, read the first ~20 lines to see
the `<chunk_index>`, identify chunks marked `matched="true"`, then use
`read_file(path, offset=<start_line>, limit=<lines>)` to jump directly to
those sections instead of reading the entire file sequentially.
Use `<chunk id='...'>` values as citation IDs in your answers.
A knowledge-base document is returned as a `<document view="full">` block
the whole source, with each passage labelled `[n]`. `view="full"` means you are
seeing the complete document, not an excerpt. Use `read_file(path, offset, limit)`
to page through a large document. Cite a passage by writing its `[n]` after the
statement it supports the same `[n]` that passage had in
`search_knowledge_base` results.
## Priority List
You receive a `<priority_documents>` system message each turn listing the
top-K paths most relevant to the user's query (by hybrid search). Read those
first matched sections are flagged inside each document's `<chunk_index>`.
first.
## Workspace Tree

View file

@ -10,11 +10,11 @@ Usage:
- By default, reads up to 100 lines from the beginning.
- Use `offset` and `limit` for pagination when files are large.
- Results include line numbers.
- Documents contain a `<chunk_index>` near the top listing every chunk with
its line range and a `matched="true"` flag for search-relevant chunks.
Read the index first, then jump to matched chunks with
`read_file(path, offset=<start_line>, limit=<num_lines>)`.
- Use chunk IDs (`<chunk id='...'>`) as citations in answers.
- A knowledge-base document is returned as a `<document view="full">` block:
the whole source, with each passage labelled `[n]`. `view="full"` means you are
seeing the complete document, not an excerpt.
- Cite a passage by writing its `[n]` after the statement it supports the same
`[n]` you would use for that passage from `search_knowledge_base`.
"""

View file

@ -4,14 +4,20 @@ from __future__ import annotations
from typing import TYPE_CHECKING, Annotated, Any
from deepagents.backends.utils import format_read_response, validate_path
from deepagents.backends.utils import (
create_file_data,
format_read_response,
validate_path,
)
from langchain.tools import ToolRuntime
from langchain_core.messages import ToolMessage
from langchain_core.tools import BaseTool, StructuredTool
from langgraph.types import Command
from app.agents.chat.multi_agent_chat.shared.citations import load_registry
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.kb_postgres import (
KBPostgresBackend,
render_full_document,
)
from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import (
SurfSenseFilesystemState,
@ -55,10 +61,12 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
backend = mw._get_backend(runtime)
if isinstance(backend, KBPostgresBackend):
loaded = await backend._load_file_data(validated)
loaded = await backend.aload_document(validated)
if loaded is None:
return f"Error: File '{validated}' not found"
file_data, doc_id = loaded
document, doc_id = loaded
registry = load_registry(runtime.state)
file_data = create_file_data(render_full_document(document, registry))
rendered = format_read_response(file_data, offset, limit)
update: dict[str, Any] = {
"files": {validated: file_data},
@ -68,6 +76,7 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
tool_call_id=runtime.tool_call_id,
)
],
"citation_registry": registry,
}
if doc_id is not None:
update["doc_id_by_path"] = {validated: doc_id}

View file

@ -1,31 +1,29 @@
"""Turn retriever ``DocumentHit``s into renderable ``RetrievedDocument``s."""
"""Turn retriever ``DocumentHit``s into renderable documents."""
from __future__ import annotations
from app.agents.chat.multi_agent_chat.shared.retrieved_context import (
RetrievedDocument,
RetrievedPassage,
from app.agents.chat.multi_agent_chat.shared.document_render import (
RenderableDocument,
RenderablePassage,
source_label,
)
from .models import DocumentHit
from .source_label import source_label
def to_retrieved_document(hit: DocumentHit) -> RetrievedDocument:
"""Map one hit to the shape the ``<retrieved_context>`` renderer consumes."""
return RetrievedDocument(
document_id=hit.document_id,
def to_renderable_document(hit: DocumentHit) -> RenderableDocument:
"""Map one hit to the shape the document-fragment renderer consumes."""
return RenderableDocument(
title=hit.title,
source_label=source_label(hit.document_type, hit.metadata),
source=source_label(hit.document_type, hit.metadata),
passages=[
RetrievedPassage(
document_id=hit.document_id,
chunk_id=chunk.chunk_id,
RenderablePassage(
content=chunk.content,
locator={"document_id": hit.document_id, "chunk_id": chunk.chunk_id},
)
for chunk in hit.chunks
],
)
__all__ = ["to_retrieved_document"]
__all__ = ["to_renderable_document"]

View file

@ -2,7 +2,7 @@
``SearchScope`` is the optional filter a search runs under. ``DocumentHit`` /
``ChunkHit`` are the retriever's typed output — matched chunks grouped by their
document which the adapter turns into renderable ``RetrievedDocument``s.
document which the adapter turns into renderable ``RenderableDocument``s.
"""
from __future__ import annotations

View file

@ -11,11 +11,11 @@ from typing import TYPE_CHECKING
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from app.agents.chat.multi_agent_chat.shared.retrieved_context import (
render_retrieved_context,
from app.agents.chat.multi_agent_chat.shared.document_render import (
render_search_context,
)
from .adapter import to_retrieved_document
from .adapter import to_renderable_document
from .hybrid_search import search_chunks
from .models import DocumentHit, SearchScope
from .reranking import rerank_hits
@ -59,8 +59,8 @@ def build_context(
) -> str | None:
"""Rerank → adapt → render. Pure given ``hits``, so it is unit-testable."""
ranked = rerank_hits(query, hits, reranker)
documents = [to_retrieved_document(hit) for hit in ranked]
return render_retrieved_context(documents, registry)
documents = [to_renderable_document(hit) for hit in ranked]
return render_search_context(documents, registry)
__all__ = ["build_context", "search_knowledge_base_context"]

View file

@ -1,16 +0,0 @@
"""Retrieved knowledge-base evidence rendered as the ``<retrieved_context>`` block.
Turns retrieved chunks into the model-facing block and registers each passage
into the citation registry so ``[n]`` resolves back to a real chunk.
"""
from __future__ import annotations
from .models import RetrievedDocument, RetrievedPassage
from .renderer import render_retrieved_context
__all__ = [
"RetrievedDocument",
"RetrievedPassage",
"render_retrieved_context",
]

View file

@ -1,32 +0,0 @@
"""Data shapes for retrieved knowledge-base evidence.
A passage is one matched chunk (the citable unit); a document groups the
passages that came from the same source. The renderer turns these into the
model-facing ``<retrieved_context>`` block.
"""
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass(frozen=True)
class RetrievedPassage:
"""One matched chunk: the unit the model cites with ``[n]``."""
document_id: int
chunk_id: int
content: str
@dataclass(frozen=True)
class RetrievedDocument:
"""A source document and the passages retrieved from it, in order."""
document_id: int
title: str
source_label: str | None = None
passages: list[RetrievedPassage] = field(default_factory=list)
__all__ = ["RetrievedDocument", "RetrievedPassage"]

View file

@ -1,89 +0,0 @@
"""Render retrieved documents into the model-facing ``<retrieved_context>`` block.
Each passage is registered into the citation registry as it is rendered, so the
``[n]`` the model sees is the same label the server can later resolve back to a
chunk. ``[n]`` is the only citable integer per passage by design nothing else
in a line is a number the model could echo by mistake.
"""
from __future__ import annotations
from app.agents.chat.multi_agent_chat.shared.citations import (
CitationRegistry,
CitationSourceType,
)
from .models import RetrievedDocument, RetrievedPassage
_HEADER = (
"These are excerpts from the user's knowledge base, selected for this query.\n"
"A document is a full source (a file, a Slack thread, a Notion page); a chunk\n"
"is one ordered fragment of it. You are seeing only the chunks that matched\n"
"this query, not the whole source.\n"
"Cite a chunk with [n]."
)
def render_retrieved_context(
documents: list[RetrievedDocument],
registry: CitationRegistry,
) -> str | None:
"""Render retrieved documents and register each passage for citation.
Returns ``None`` when there is no passage to show so the caller can skip the
block. Mutates ``registry`` (find-or-create), so a passage seen again in a
later turn keeps its original ``[n]``.
"""
blocks = [
block
for document in documents
if (block := _render_document(document, registry)) is not None
]
if not blocks:
return None
return "<retrieved_context>\n" + _HEADER + "\n" + "\n".join(blocks) + "\n</retrieved_context>"
def _render_document(
document: RetrievedDocument,
registry: CitationRegistry,
) -> str | None:
"""Render one document header and its passages; ``None`` if it has none."""
if not document.passages:
return None
lines = [_render_header(document)]
for passage in document.passages:
lines.append(_render_passage(document, passage, registry))
return "\n".join(lines)
def _render_header(document: RetrievedDocument) -> str:
"""``Document: "Title" (source)``."""
source = f" ({document.source_label})" if document.source_label else ""
return f'Document: "{_clean(document.title)}"{source}'
def _render_passage(
document: RetrievedDocument,
passage: RetrievedPassage,
registry: CitationRegistry,
) -> str:
"""`` [n] <chunk content>`` with continuation lines indented under it."""
n = registry.register(
CitationSourceType.KB_CHUNK,
{"document_id": passage.document_id, "chunk_id": passage.chunk_id},
{"title": document.title, "source": document.source_label},
)
label = f" [{n}] "
body = passage.content.strip().replace("\n", "\n" + " " * len(label))
return f"{label}{body}"
def _clean(text: str) -> str:
"""Collapse whitespace so a title stays on the header line."""
return " ".join(text.split())
__all__ = ["render_retrieved_context"]

View file

@ -0,0 +1,152 @@
"""Tests for the shared ``render_document`` (one ``<document>`` block)."""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.citations import (
CitationRegistry,
CitationSourceType,
)
from app.agents.chat.multi_agent_chat.shared.document_render import (
RenderableDocument,
RenderablePassage,
render_document,
)
pytestmark = pytest.mark.unit
def _document(
document_id: int,
title: str,
chunk_ids: list[int],
*,
source: str | None = None,
) -> RenderableDocument:
return RenderableDocument(
title=title,
source=source,
passages=[
RenderablePassage(
content=f"text {cid}",
locator={"document_id": document_id, "chunk_id": cid},
)
for cid in chunk_ids
],
)
def test_returns_none_when_no_passages() -> None:
registry = CitationRegistry()
assert (
render_document(_document(1, "Empty", []), view="excerpt", registry=registry)
is None
)
def test_excerpt_open_and_close_tags() -> None:
registry = CitationRegistry()
block = render_document(
_document(1, "Q3 Launch Notes", [880], source="Slack · #launch"),
view="excerpt",
registry=registry,
)
assert block is not None
assert block.startswith(
'<document title="Q3 Launch Notes" source="Slack · #launch" view="excerpt">'
)
assert block.endswith("</document>")
def test_full_view_renders_view_attribute() -> None:
registry = CitationRegistry()
block = render_document(_document(1, "Doc", [880]), view="full", registry=registry)
assert block is not None
assert '<document title="Doc" view="full">' in block
def test_source_attribute_omitted_when_absent() -> None:
registry = CitationRegistry()
block = render_document(
_document(1, "Plain", [1]), view="excerpt", registry=registry
)
assert block is not None
assert block.startswith('<document title="Plain" view="excerpt">')
def test_registers_passages_with_chunk_locators() -> None:
registry = CitationRegistry()
render_document(
_document(1, "Doc", [880], source="Slack"),
view="excerpt",
registry=registry,
)
entry = registry.resolve(1)
assert entry is not None
assert entry.source_type is CitationSourceType.KB_CHUNK
assert entry.locator == {"document_id": 1, "chunk_id": 880}
assert entry.display == {"title": "Doc", "source": "Slack"}
def test_passages_get_monotonic_labels() -> None:
registry = CitationRegistry()
block = render_document(
_document(1, "Doc", [880, 881]), view="excerpt", registry=registry
)
assert block is not None
assert " [1] text 880" in block
assert " [2] text 881" in block
def test_multiline_passage_indents_under_label() -> None:
registry = CitationRegistry()
document = RenderableDocument(
title="Doc",
passages=[
RenderablePassage(
content="line one\nline two",
locator={"document_id": 1, "chunk_id": 5},
)
],
)
block = render_document(document, view="excerpt", registry=registry)
assert block is not None
assert " [1] line one\n line two" in block
def test_attribute_values_are_escaped() -> None:
registry = CitationRegistry()
block = render_document(
_document(1, 'A & B <c> "d"', [1], source="x & y"),
view="excerpt",
registry=registry,
)
assert block is not None
assert 'title="A &amp; B &lt;c&gt; &quot;d&quot;"' in block
assert 'source="x &amp; y"' in block
def test_same_passage_reuses_label_across_calls() -> None:
registry = CitationRegistry()
document = _document(1, "Doc", [880])
render_document(document, view="excerpt", registry=registry)
render_document(document, view="full", registry=registry)
assert registry.next_n == 2

View file

@ -0,0 +1,94 @@
"""Tests for the ``<retrieved_context>`` wrapper around excerpt documents."""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
from app.agents.chat.multi_agent_chat.shared.document_render import (
RenderableDocument,
RenderablePassage,
render_search_context,
)
pytestmark = pytest.mark.unit
def _document(
document_id: int,
title: str,
chunk_ids: list[int],
*,
source: str | None = None,
) -> RenderableDocument:
return RenderableDocument(
title=title,
source=source,
passages=[
RenderablePassage(
content=f"text {cid}",
locator={"document_id": document_id, "chunk_id": cid},
)
for cid in chunk_ids
],
)
def test_returns_none_when_nothing_to_show() -> None:
registry = CitationRegistry()
assert render_search_context([], registry) is None
assert render_search_context([_document(1, "Empty", [])], registry) is None
def test_assigns_monotonic_labels_across_documents() -> None:
registry = CitationRegistry()
block = render_search_context(
[
_document(1, "Q3 Launch Notes", [880, 881], source="Slack"),
_document(2, "Timeline", [12], source="Notion"),
],
registry,
)
assert block is not None
assert "[1] text 880" in block
assert "[2] text 881" in block
assert "[3] text 12" in block
def test_wraps_in_retrieved_context_and_teaches_excerpt_and_citation() -> None:
registry = CitationRegistry()
block = render_search_context([_document(1, "Doc", [1])], registry)
assert block is not None
assert block.startswith("<retrieved_context>")
assert block.endswith("</retrieved_context>")
assert "excerpt view" in block
assert "Cite a chunk with its [n]." in block
def test_documents_render_as_excerpt_blocks() -> None:
registry = CitationRegistry()
block = render_search_context(
[_document(1, "Q3", [1], source="Slack · #launch")], registry
)
assert block is not None
assert '<document title="Q3" source="Slack · #launch" view="excerpt">' in block
assert "</document>" in block
def test_same_passage_reuses_label_across_calls() -> None:
registry = CitationRegistry()
document = _document(1, "Doc", [880])
render_search_context([document], registry)
block = render_search_context([document], registry)
assert block is not None
assert "[1] text 880" in block
assert registry.next_n == 2

View file

@ -4,7 +4,7 @@ from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.retrieval.source_label import source_label
from app.agents.chat.multi_agent_chat.shared.document_render import source_label
pytestmark = pytest.mark.unit

View file

@ -0,0 +1,82 @@
"""Tests for the ``<web_results>`` wrapper around web-result excerpt documents."""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.citations import (
CitationRegistry,
CitationSourceType,
)
from app.agents.chat.multi_agent_chat.shared.document_render import (
RenderableDocument,
RenderablePassage,
render_web_results,
)
pytestmark = pytest.mark.unit
def _web_doc(url: str, title: str, content: str) -> RenderableDocument:
return RenderableDocument(
title=title,
source=f"Web · {url.split('//', 1)[-1].split('/', 1)[0]}",
passages=[
RenderablePassage(
content=content,
locator={"url": url},
source_type=CitationSourceType.WEB_RESULT,
)
],
)
def test_returns_none_when_nothing_to_show() -> None:
registry = CitationRegistry()
assert render_web_results([], registry) is None
def test_wraps_in_web_results_container() -> None:
registry = CitationRegistry()
block = render_web_results(
[_web_doc("https://example.com/a", "Example", "the answer is 42")],
registry,
)
assert block is not None
assert block.startswith("<web_results>")
assert block.endswith("</web_results>")
assert "cite a result with its [n]" in block
assert '<document title="Example" source="Web · example.com" view="excerpt">' in block
assert "[1] the answer is 42" in block
def test_registers_each_result_as_web_result_with_url_locator() -> None:
registry = CitationRegistry()
render_web_results(
[
_web_doc("https://a.com/x", "A", "alpha"),
_web_doc("https://b.com/y", "B", "beta"),
],
registry,
)
first = registry.resolve(1)
second = registry.resolve(2)
assert first is not None and second is not None
assert first.source_type is CitationSourceType.WEB_RESULT
assert first.locator == {"url": "https://a.com/x"}
assert second.locator == {"url": "https://b.com/y"}
def test_same_url_reuses_label_across_calls() -> None:
registry = CitationRegistry()
doc = _web_doc("https://example.com/a", "Example", "stable fact")
render_web_results([doc], registry)
render_web_results([doc], registry)
assert registry.next_n == 2

View file

@ -1,11 +1,11 @@
"""Tests for mapping a DocumentHit to a renderable RetrievedDocument."""
"""Tests for mapping a DocumentHit to a renderable document."""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.retrieval.adapter import (
to_retrieved_document,
to_renderable_document,
)
from app.agents.chat.multi_agent_chat.shared.retrieval.models import (
ChunkHit,
@ -15,7 +15,7 @@ from app.agents.chat.multi_agent_chat.shared.retrieval.models import (
pytestmark = pytest.mark.unit
def test_maps_identity_source_label_and_passages() -> None:
def test_maps_identity_source_and_passages() -> None:
hit = DocumentHit(
document_id=42,
title="Q3 Launch Notes",
@ -28,13 +28,14 @@ def test_maps_identity_source_label_and_passages() -> None:
],
)
document = to_retrieved_document(hit)
document = to_renderable_document(hit)
assert document.document_id == 42
assert document.title == "Q3 Launch Notes"
assert document.source_label == "Slack"
assert [(p.chunk_id, p.content) for p in document.passages] == [(880, "a"), (881, "b")]
assert all(p.document_id == 42 for p in document.passages)
assert document.source == "Slack"
assert [
(p.locator["chunk_id"], p.content) for p in document.passages
] == [(880, "a"), (881, "b")]
assert all(p.locator["document_id"] == 42 for p in document.passages)
def test_document_with_no_chunks_maps_to_no_passages() -> None:
@ -47,4 +48,4 @@ def test_document_with_no_chunks_maps_to_no_passages() -> None:
chunks=[],
)
assert to_retrieved_document(hit).passages == []
assert to_renderable_document(hit).passages == []

View file

@ -1,144 +0,0 @@
"""Tests for the <retrieved_context> renderer and its citation registration."""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.citations import (
CitationRegistry,
CitationSourceType,
)
from app.agents.chat.multi_agent_chat.shared.retrieved_context import (
RetrievedDocument,
RetrievedPassage,
render_retrieved_context,
)
pytestmark = pytest.mark.unit
def _document(
document_id: int,
title: str,
chunk_ids: list[int],
*,
source_label: str | None = None,
) -> RetrievedDocument:
return RetrievedDocument(
document_id=document_id,
title=title,
source_label=source_label,
passages=[
RetrievedPassage(document_id=document_id, chunk_id=cid, content=f"text {cid}")
for cid in chunk_ids
],
)
def test_returns_none_when_nothing_to_show() -> None:
registry = CitationRegistry()
assert render_retrieved_context([], registry) is None
assert render_retrieved_context([_document(1, "Empty", [])], registry) is None
def test_assigns_monotonic_labels_across_documents() -> None:
registry = CitationRegistry()
block = render_retrieved_context(
[
_document(1, "Q3 Launch Notes", [880, 881], source_label="Slack"),
_document(2, "Timeline", [12], source_label="Notion"),
],
registry,
)
assert block is not None
assert "[1] text 880" in block
assert "[2] text 881" in block
assert "[3] text 12" in block
def test_registers_passages_with_chunk_locators() -> None:
registry = CitationRegistry()
render_retrieved_context([_document(1, "Doc", [880])], registry)
entry = registry.resolve(1)
assert entry is not None
assert entry.source_type is CitationSourceType.KB_CHUNK
assert entry.locator == {"document_id": 1, "chunk_id": 880}
assert entry.display["title"] == "Doc"
def test_header_shows_source_when_present() -> None:
registry = CitationRegistry()
block = render_retrieved_context(
[
_document(1, "Q3", [1], source_label="Slack · #launch"),
_document(2, "Plan", [2]),
],
registry,
)
assert block is not None
assert 'Document: "Q3" (Slack · #launch)' in block
assert 'Document: "Plan"' in block
def test_wraps_block_and_explains_chunk_vs_document() -> None:
registry = CitationRegistry()
block = render_retrieved_context([_document(1, "Doc", [1])], registry)
assert block is not None
assert block.startswith("<retrieved_context>")
assert block.endswith("</retrieved_context>")
assert "Cite a chunk with [n]." in block
def test_multiline_passage_is_indented_under_label() -> None:
registry = CitationRegistry()
document = RetrievedDocument(
document_id=1,
title="Doc",
passages=[RetrievedPassage(document_id=1, chunk_id=5, content="line one\nline two")],
)
block = render_retrieved_context([document], registry)
assert block is not None
assert " [1] line one\n line two" in block
def test_continuation_indent_tracks_label_width() -> None:
registry = CitationRegistry()
# Burn labels 1..9 so the tenth passage renders as [10] (a 7-char label).
documents = [_document(i, f"Doc {i}", [i]) for i in range(1, 10)]
documents.append(
RetrievedDocument(
document_id=10,
title="Doc 10",
passages=[
RetrievedPassage(document_id=10, chunk_id=10, content="line one\nline two")
],
)
)
block = render_retrieved_context(documents, registry)
assert block is not None
assert " [10] line one\n line two" in block
def test_same_passage_reuses_label_across_calls() -> None:
registry = CitationRegistry()
document = _document(1, "Doc", [880])
render_retrieved_context([document], registry)
block = render_retrieved_context([document], registry)
assert block is not None
assert "[1] text 880" in block
assert registry.next_n == 2

View file

@ -0,0 +1,124 @@
"""Unit tests for the KB read path: full-view render + anonymous-doc loading.
DB-backed loads are exercised by the integration suite; here we lock the pure
pieces ``render_full_document`` and the anonymous-upload branch of
``aload_document`` which need no database.
"""
from __future__ import annotations
from types import SimpleNamespace
import pytest
from app.agents.chat.multi_agent_chat.shared.citations import (
CitationRegistry,
CitationSourceType,
)
from app.agents.chat.multi_agent_chat.shared.document_render import (
RenderableDocument,
RenderablePassage,
)
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.kb_postgres import (
KBPostgresBackend,
render_full_document,
)
pytestmark = pytest.mark.unit
def _backend(state: dict) -> KBPostgresBackend:
return KBPostgresBackend(search_space_id=1, runtime=SimpleNamespace(state=state))
def test_render_full_document_uses_full_view_and_registers() -> None:
registry = CitationRegistry()
document = RenderableDocument(
title="Launch Notes",
source="Slack",
passages=[
RenderablePassage(
content="push to March 10",
locator={"document_id": 7, "chunk_id": 880},
),
],
)
rendered = render_full_document(document, registry)
assert '<document title="Launch Notes" source="Slack" view="full">' in rendered
assert "[1] push to March 10" in rendered
entry = registry.resolve(1)
assert entry is not None
assert entry.locator == {"document_id": 7, "chunk_id": 880}
def test_render_full_document_reuses_search_label() -> None:
"""A chunk already registered from search keeps its [n] on a later full read."""
registry = CitationRegistry()
n = registry.register(
CitationSourceType.KB_CHUNK,
{"document_id": 7, "chunk_id": 880},
{"title": "Launch Notes", "source": "Slack"},
)
document = RenderableDocument(
title="Launch Notes",
source="Slack",
passages=[
RenderablePassage(
content="new chunk",
locator={"document_id": 7, "chunk_id": 881},
),
RenderablePassage(
content="push to March 10",
locator={"document_id": 7, "chunk_id": 880},
),
],
)
rendered = render_full_document(document, registry)
assert f"[{n}] push to March 10" in rendered
assert "[2] new chunk" in rendered
def test_render_full_document_empty_falls_back_to_notice() -> None:
registry = CitationRegistry()
document = RenderableDocument(title="Empty", passages=[])
assert render_full_document(document, registry) == (
"(This document has no readable content.)"
)
async def test_aload_document_anonymous_upload() -> None:
backend = _backend(
{
"kb_anon_doc": {
"path": "/anon_upload.md",
"title": "Quarterly Report",
"chunks": [
{"chunk_id": -1, "content": "revenue grew"},
{"chunk_id": -2, "content": "costs fell"},
],
}
}
)
loaded = await backend.aload_document("/anon_upload.md")
assert loaded is not None
document, doc_id = loaded
assert doc_id is None
assert document.title == "Quarterly Report"
assert [p.locator["chunk_id"] for p in document.passages] == [-1, -2]
assert all(p.locator["document_id"] == -1 for p in document.passages)
assert all(
p.source_type is CitationSourceType.ANON_CHUNK for p in document.passages
)
async def test_aload_document_unknown_path_returns_none() -> None:
backend = _backend({})
assert await backend.aload_document("/not/under/documents.md") is None