mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
document-render: unify KB/web rendering on numbered [n] passages
Add a shared document_render package that renders sources as
<document view="excerpt|full"> blocks with server-assigned [n] passage
labels (KB locator {document_id, chunk_id}, web locator {url}). Wire the
KB read backend (kb_postgres) and read_file to the new renderer and drop
the legacy per-document XML renderer (document_xml, retrieved_context) and
the old chunk_index / matched="true" / <chunk id> read format.
This commit is contained in:
parent
852ab3a576
commit
265888d21c
25 changed files with 846 additions and 501 deletions
|
|
@ -0,0 +1,25 @@
|
|||
"""Render citable documents for the model: one shape for search, read, and web.
|
||||
|
||||
``render_document`` emits one ``<document title=… source=… view="excerpt|full">``
|
||||
block whose passages carry server-assigned ``[n]`` labels. ``render_search_context``
|
||||
wraps KB excerpt blocks in ``<retrieved_context>``; ``render_web_results`` wraps web
|
||||
excerpt blocks in ``<web_results>``. Both cite with the same ``[n]`` spine.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .document import render_document
|
||||
from .models import DocumentView, RenderableDocument, RenderablePassage
|
||||
from .search_context import render_search_context
|
||||
from .source_label import source_label
|
||||
from .web_results import render_web_results
|
||||
|
||||
__all__ = [
|
||||
"DocumentView",
|
||||
"RenderableDocument",
|
||||
"RenderablePassage",
|
||||
"render_document",
|
||||
"render_search_context",
|
||||
"render_web_results",
|
||||
"source_label",
|
||||
]
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
"""Render one citable document as a ``<document>`` block.
|
||||
|
||||
Every citable surface (KB search excerpts, KB full reads, web results) uses the
|
||||
same block; ``view`` and the passages shown are what differ. Each passage is
|
||||
registered for citation as it renders, so its ``[n]`` resolves back to its source
|
||||
later.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
|
||||
|
||||
from .models import DocumentView, RenderableDocument, RenderablePassage
|
||||
|
||||
|
||||
def render_document(
|
||||
document: RenderableDocument,
|
||||
*,
|
||||
view: DocumentView,
|
||||
registry: CitationRegistry,
|
||||
) -> str | None:
|
||||
"""Render one ``<document>`` block, registering each passage for citation.
|
||||
|
||||
Returns ``None`` when the document has no passage to show. Mutates ``registry``
|
||||
(find-or-create).
|
||||
"""
|
||||
if not document.passages:
|
||||
return None
|
||||
|
||||
lines = [_open_tag(document, view)]
|
||||
for passage in document.passages:
|
||||
lines.append(_render_passage(document, passage, registry))
|
||||
lines.append("</document>")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _open_tag(document: RenderableDocument, view: DocumentView) -> str:
|
||||
attrs = [f'title="{_attr(document.title)}"']
|
||||
if document.source:
|
||||
attrs.append(f'source="{_attr(document.source)}"')
|
||||
attrs.append(f'view="{view}"')
|
||||
return f"<document {' '.join(attrs)}>"
|
||||
|
||||
|
||||
def _render_passage(
|
||||
document: RenderableDocument,
|
||||
passage: RenderablePassage,
|
||||
registry: CitationRegistry,
|
||||
) -> str:
|
||||
n = registry.register(
|
||||
passage.source_type,
|
||||
passage.locator,
|
||||
{"title": document.title, "source": document.source},
|
||||
)
|
||||
label = f" [{n}] "
|
||||
body = passage.content.strip().replace("\n", "\n" + " " * len(label))
|
||||
return f"{label}{body}"
|
||||
|
||||
|
||||
def _attr(value: str) -> str:
|
||||
collapsed = " ".join(str(value).split())
|
||||
return (
|
||||
collapsed.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">")
|
||||
.replace('"', """)
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["render_document"]
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
"""Inputs for rendering a citable document for the model.
|
||||
|
||||
A passage is one citable unit — what the model cites with ``[n]``. A document
|
||||
groups the passages shown from one source. The same shapes feed every citable
|
||||
surface: KB search excerpts, KB full reads, and web results.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Literal
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import CitationSourceType
|
||||
|
||||
DocumentView = Literal["excerpt", "full"]
|
||||
"""How much of the source is shown: a search slice, or the whole object."""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RenderablePassage:
|
||||
"""One citable unit: what the model cites with ``[n]``.
|
||||
|
||||
``locator`` is the source-specific identity registered for this passage (a KB
|
||||
chunk's ``{document_id, chunk_id}``, a web result's ``{url}``). ``source_type``
|
||||
selects how that locator resolves to a frontend payload.
|
||||
"""
|
||||
|
||||
content: str
|
||||
locator: dict[str, Any]
|
||||
source_type: CitationSourceType = CitationSourceType.KB_CHUNK
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RenderableDocument:
|
||||
"""A source document and the passages to render from it, in order."""
|
||||
|
||||
title: str
|
||||
source: str | None = None
|
||||
passages: list[RenderablePassage] = field(default_factory=list)
|
||||
|
||||
|
||||
__all__ = ["DocumentView", "RenderableDocument", "RenderablePassage"]
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
"""Wrap search excerpts in the ``<retrieved_context>`` block.
|
||||
|
||||
Each document renders through the shared ``render_document``; this module adds the
|
||||
container and the one-time header that teaches the model how to read and cite.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
|
||||
|
||||
from .document import render_document
|
||||
from .models import RenderableDocument
|
||||
|
||||
_HEADER = (
|
||||
"These are excerpts from the user's knowledge base, selected for this query.\n"
|
||||
"A document is a full source (a file, a Slack thread, a Notion page); each\n"
|
||||
"<document> below is in excerpt view, so you are seeing only the chunks that\n"
|
||||
"matched this query, not the whole source. Cite a chunk with its [n]. Read the\n"
|
||||
"document for full context before claiming it only says X."
|
||||
)
|
||||
|
||||
|
||||
def render_search_context(
|
||||
documents: list[RenderableDocument],
|
||||
registry: CitationRegistry,
|
||||
) -> str | None:
|
||||
"""Render retrieved documents as excerpt blocks inside ``<retrieved_context>``.
|
||||
|
||||
Returns ``None`` when no document has a passage to show, so the caller can skip
|
||||
the block. Mutates ``registry`` (find-or-create), so a passage seen again in a
|
||||
later turn keeps its original ``[n]``.
|
||||
"""
|
||||
blocks = [
|
||||
block
|
||||
for document in documents
|
||||
if (
|
||||
block := render_document(document, view="excerpt", registry=registry)
|
||||
)
|
||||
is not None
|
||||
]
|
||||
if not blocks:
|
||||
return None
|
||||
|
||||
return (
|
||||
"<retrieved_context>\n"
|
||||
+ _HEADER
|
||||
+ "\n"
|
||||
+ "\n".join(blocks)
|
||||
+ "\n</retrieved_context>"
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["render_search_context"]
|
||||
|
|
@ -1,8 +1,10 @@
|
|||
"""Build a short, honest source label for a retrieved document.
|
||||
"""Build a short, honest source label for a knowledge-base document.
|
||||
|
||||
A label orients the model about where a passage came from — e.g. ``Slack`` or
|
||||
``Web · docs.python.org``. It is derived only from the document's type and any
|
||||
URL in its metadata, so it never asserts detail we don't actually have.
|
||||
URL in its metadata, so it never asserts detail we don't actually have. Search
|
||||
hits and full reads both build their ``<document source=…>`` from here, so the
|
||||
label a passage carries is identical whichever surface it arrives through.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
"""Wrap live web-search results in a ``<web_results>`` block.
|
||||
|
||||
Each result renders through the shared ``render_document`` (excerpt view), so a
|
||||
web result is cited with ``[n]`` exactly like a knowledge-base passage. Only the
|
||||
container and header differ — they tell the model these came from the public web,
|
||||
not the user's workspace.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
|
||||
|
||||
from .document import render_document
|
||||
from .models import RenderableDocument
|
||||
|
||||
_HEADER = (
|
||||
"These are live results from a public web search for this query. Each\n"
|
||||
"<document> below is one result in excerpt view; cite a result with its [n]\n"
|
||||
"after the statement it supports. Scrape the URL for full context before\n"
|
||||
"making a definitive claim from a snippet."
|
||||
)
|
||||
|
||||
|
||||
def render_web_results(
|
||||
documents: list[RenderableDocument],
|
||||
registry: CitationRegistry,
|
||||
) -> str | None:
|
||||
"""Render web results as excerpt blocks inside ``<web_results>``.
|
||||
|
||||
Returns ``None`` when no result has content to show, so the caller can skip
|
||||
the block. Mutates ``registry`` (find-or-create), so a URL seen again keeps
|
||||
its original ``[n]``.
|
||||
"""
|
||||
blocks = [
|
||||
block
|
||||
for document in documents
|
||||
if (
|
||||
block := render_document(document, view="excerpt", registry=registry)
|
||||
)
|
||||
is not None
|
||||
]
|
||||
if not blocks:
|
||||
return None
|
||||
|
||||
return (
|
||||
"<web_results>\n"
|
||||
+ _HEADER
|
||||
+ "\n"
|
||||
+ "\n".join(blocks)
|
||||
+ "\n</web_results>"
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["render_web_results"]
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
"""Shared XML builder for KB documents.
|
||||
|
||||
Produces the citation-friendly XML used by every read of a knowledge-base
|
||||
document (lazy-loaded by :class:`KBPostgresBackend` and synthetic anonymous
|
||||
files). The XML carries a ``<chunk_index>`` near the top so the LLM can jump
|
||||
directly to matched-chunk line ranges via ``read_file(offset=…, limit=…)``.
|
||||
|
||||
Extracted from the original ``knowledge_search.py`` so the backend, the
|
||||
priority middleware, and any future renderer share a single implementation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
|
||||
def build_document_xml(
|
||||
document: dict[str, Any],
|
||||
matched_chunk_ids: set[int] | None = None,
|
||||
) -> str:
|
||||
"""Build citation-friendly XML with a ``<chunk_index>`` for smart seeking.
|
||||
|
||||
Args:
|
||||
document: Dict shape produced by hybrid search / lazy-load helpers.
|
||||
Expected keys: ``document`` (with ``id``, ``title``,
|
||||
``document_type``, ``metadata``) and ``chunks``
|
||||
(list of ``{chunk_id, content}``).
|
||||
matched_chunk_ids: Optional set of chunk IDs to flag as
|
||||
``matched="true"`` in the chunk index.
|
||||
"""
|
||||
matched = matched_chunk_ids or set()
|
||||
|
||||
doc_meta = document.get("document") or {}
|
||||
metadata = (doc_meta.get("metadata") or {}) if isinstance(doc_meta, dict) else {}
|
||||
document_id = doc_meta.get("id", document.get("document_id", "unknown"))
|
||||
document_type = doc_meta.get("document_type", document.get("source", "UNKNOWN"))
|
||||
title = doc_meta.get("title") or metadata.get("title") or "Untitled Document"
|
||||
url = (
|
||||
metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
|
||||
)
|
||||
metadata_json = json.dumps(metadata, ensure_ascii=False)
|
||||
|
||||
metadata_lines: list[str] = [
|
||||
"<document>",
|
||||
"<document_metadata>",
|
||||
f" <document_id>{document_id}</document_id>",
|
||||
f" <document_type>{document_type}</document_type>",
|
||||
f" <title><![CDATA[{title}]]></title>",
|
||||
f" <url><![CDATA[{url}]]></url>",
|
||||
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
|
||||
"</document_metadata>",
|
||||
"",
|
||||
]
|
||||
|
||||
chunks = document.get("chunks") or []
|
||||
chunk_entries: list[tuple[int | None, str]] = []
|
||||
if isinstance(chunks, list):
|
||||
for chunk in chunks:
|
||||
if not isinstance(chunk, dict):
|
||||
continue
|
||||
chunk_id = chunk.get("chunk_id") or chunk.get("id")
|
||||
chunk_content = str(chunk.get("content", "")).strip()
|
||||
if not chunk_content:
|
||||
continue
|
||||
if chunk_id is None:
|
||||
xml = f" <chunk><![CDATA[{chunk_content}]]></chunk>"
|
||||
else:
|
||||
xml = f" <chunk id='{chunk_id}'><![CDATA[{chunk_content}]]></chunk>"
|
||||
chunk_entries.append((chunk_id, xml))
|
||||
|
||||
index_overhead = 1 + len(chunk_entries) + 1 + 1 + 1
|
||||
first_chunk_line = len(metadata_lines) + index_overhead + 1
|
||||
|
||||
current_line = first_chunk_line
|
||||
index_entry_lines: list[str] = []
|
||||
for cid, xml_str in chunk_entries:
|
||||
num_lines = xml_str.count("\n") + 1
|
||||
end_line = current_line + num_lines - 1
|
||||
matched_attr = ' matched="true"' if cid is not None and cid in matched else ""
|
||||
if cid is not None:
|
||||
index_entry_lines.append(
|
||||
f' <entry chunk_id="{cid}" lines="{current_line}-{end_line}"{matched_attr}/>'
|
||||
)
|
||||
else:
|
||||
index_entry_lines.append(
|
||||
f' <entry lines="{current_line}-{end_line}"{matched_attr}/>'
|
||||
)
|
||||
current_line = end_line + 1
|
||||
|
||||
lines = metadata_lines.copy()
|
||||
lines.append("<chunk_index>")
|
||||
lines.extend(index_entry_lines)
|
||||
lines.append("</chunk_index>")
|
||||
lines.append("")
|
||||
lines.append("<document_content>")
|
||||
for _, xml_str in chunk_entries:
|
||||
lines.append(xml_str)
|
||||
lines.extend(["</document_content>", "</document>"])
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
__all__ = ["build_document_xml"]
|
||||
|
|
@ -42,8 +42,15 @@ from langchain.tools import ToolRuntime
|
|||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
|
||||
build_document_xml,
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import (
|
||||
CitationRegistry,
|
||||
CitationSourceType,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.document_render import (
|
||||
RenderableDocument,
|
||||
RenderablePassage,
|
||||
render_document,
|
||||
source_label,
|
||||
)
|
||||
from app.agents.chat.runtime.path_resolver import (
|
||||
DOCUMENTS_ROOT,
|
||||
|
|
@ -59,6 +66,21 @@ _TEMP_PREFIX = "temp_"
|
|||
_GREP_MAX_TOTAL_MATCHES = 50
|
||||
_GREP_MAX_PER_DOC = 5
|
||||
|
||||
_EMPTY_DOCUMENT_NOTICE = "(This document has no readable content.)"
|
||||
|
||||
|
||||
def render_full_document(
|
||||
document: RenderableDocument,
|
||||
registry: CitationRegistry,
|
||||
) -> str:
|
||||
"""Render a whole KB document (``view="full"``), registering each chunk's ``[n]``.
|
||||
|
||||
Falls back to a short notice when the document has no chunks, so a read never
|
||||
returns blank.
|
||||
"""
|
||||
rendered = render_document(document, view="full", registry=registry)
|
||||
return rendered if rendered is not None else _EMPTY_DOCUMENT_NOTICE
|
||||
|
||||
|
||||
def _basename(path: str) -> str:
|
||||
return path.rsplit("/", 1)[-1]
|
||||
|
|
@ -127,13 +149,6 @@ class KBPostgresBackend(BackendProtocol):
|
|||
anon = self.state.get("kb_anon_doc")
|
||||
return anon if isinstance(anon, dict) else None
|
||||
|
||||
def _matched_chunk_ids(self, doc_id: int) -> set[int]:
|
||||
mapping = self.state.get("kb_matched_chunk_ids") or {}
|
||||
try:
|
||||
return set(mapping.get(doc_id, []) or [])
|
||||
except TypeError:
|
||||
return set()
|
||||
|
||||
@staticmethod
|
||||
def _file_data_size(file_data: dict[str, Any]) -> int:
|
||||
try:
|
||||
|
|
@ -466,80 +481,93 @@ class KBPostgresBackend(BackendProtocol):
|
|||
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
|
||||
return asyncio.run(self.aread(file_path, offset, limit))
|
||||
|
||||
async def _load_file_data(
|
||||
async def aload_document(
|
||||
self,
|
||||
path: str,
|
||||
) -> tuple[dict[str, Any], int | None] | None:
|
||||
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
|
||||
) -> tuple[RenderableDocument, int | None] | None:
|
||||
"""Lazy-load a virtual KB document as a :class:`RenderableDocument`.
|
||||
|
||||
Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
|
||||
to any known document. ``doc_id`` is ``None`` for the synthetic
|
||||
anonymous document so the caller doesn't track it as a DB-backed file.
|
||||
Returns ``(document, doc_id)`` with every chunk in document order, or
|
||||
``None`` if the path maps to no known document. ``doc_id`` is ``None``
|
||||
for the synthetic anonymous upload so the caller doesn't track it as a
|
||||
DB-backed file. Pure data — rendering and citation registration happen in
|
||||
the caller (see :meth:`_load_file_data` and the ``read_file`` tool).
|
||||
"""
|
||||
anon = self._kb_anon_doc()
|
||||
if anon and str(anon.get("path") or "") == path:
|
||||
doc_payload = {
|
||||
"document_id": -1,
|
||||
"chunks": list(anon.get("chunks") or []),
|
||||
"matched_chunk_ids": [],
|
||||
"document": {
|
||||
"id": -1,
|
||||
"title": anon.get("title") or "uploaded_document",
|
||||
"document_type": "FILE",
|
||||
"metadata": {"source": "anonymous_upload"},
|
||||
},
|
||||
"source": "FILE",
|
||||
}
|
||||
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
|
||||
file_data = create_file_data(xml)
|
||||
return file_data, None
|
||||
document = RenderableDocument(
|
||||
title=str(anon.get("title") or "uploaded_document"),
|
||||
source="Uploaded file",
|
||||
passages=[
|
||||
RenderablePassage(
|
||||
content=str(chunk.get("content", "")),
|
||||
locator={
|
||||
"document_id": -1,
|
||||
"chunk_id": int(chunk["chunk_id"]),
|
||||
},
|
||||
source_type=CitationSourceType.ANON_CHUNK,
|
||||
)
|
||||
for chunk in (anon.get("chunks") or [])
|
||||
if isinstance(chunk, dict) and chunk.get("chunk_id") is not None
|
||||
],
|
||||
)
|
||||
return document, None
|
||||
|
||||
if not path.startswith(DOCUMENTS_ROOT):
|
||||
return None
|
||||
|
||||
async with shielded_async_session() as session:
|
||||
document = await virtual_path_to_doc(
|
||||
document_row = await virtual_path_to_doc(
|
||||
session,
|
||||
search_space_id=self.search_space_id,
|
||||
virtual_path=path,
|
||||
)
|
||||
if document is None:
|
||||
if document_row is None:
|
||||
return None
|
||||
chunk_rows = await session.execute(
|
||||
select(Chunk.id, Chunk.content)
|
||||
.where(Chunk.document_id == document.id)
|
||||
.where(Chunk.document_id == document_row.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunks = [
|
||||
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
|
||||
]
|
||||
chunks = chunk_rows.all()
|
||||
|
||||
doc_payload = {
|
||||
"document_id": document.id,
|
||||
"chunks": chunks,
|
||||
"matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
|
||||
"document": {
|
||||
"id": document.id,
|
||||
"title": document.title,
|
||||
"document_type": (
|
||||
document.document_type.value
|
||||
if getattr(document, "document_type", None) is not None
|
||||
else "UNKNOWN"
|
||||
),
|
||||
"metadata": dict(document.document_metadata or {}),
|
||||
},
|
||||
"source": (
|
||||
document.document_type.value
|
||||
if getattr(document, "document_type", None) is not None
|
||||
else "UNKNOWN"
|
||||
),
|
||||
}
|
||||
xml = build_document_xml(
|
||||
doc_payload,
|
||||
matched_chunk_ids=self._matched_chunk_ids(document.id),
|
||||
document_type = (
|
||||
document_row.document_type.value
|
||||
if getattr(document_row, "document_type", None) is not None
|
||||
else None
|
||||
)
|
||||
file_data = create_file_data(xml)
|
||||
return file_data, document.id
|
||||
metadata = dict(document_row.document_metadata or {})
|
||||
document = RenderableDocument(
|
||||
title=document_row.title,
|
||||
source=source_label(document_type, metadata),
|
||||
passages=[
|
||||
RenderablePassage(
|
||||
content=row.content,
|
||||
locator={"document_id": document_row.id, "chunk_id": row.id},
|
||||
)
|
||||
for row in chunks
|
||||
],
|
||||
)
|
||||
return document, document_row.id
|
||||
|
||||
async def _load_file_data(
|
||||
self,
|
||||
path: str,
|
||||
) -> tuple[dict[str, Any], int | None] | None:
|
||||
"""Render a virtual KB document into a deepagents ``FileData``.
|
||||
|
||||
Used by the filesystem ops (move/edit existence + content staging) and the
|
||||
backend's own ``aread``/``aedit``. These have no conversation registry to
|
||||
persist into, so the ``[n]`` labels are minted into a throwaway registry —
|
||||
the canonical, citation-persisting read is the ``read_file`` tool, which
|
||||
renders from :meth:`aload_document` against the state registry.
|
||||
"""
|
||||
loaded = await self.aload_document(path)
|
||||
if loaded is None:
|
||||
return None
|
||||
document, doc_id = loaded
|
||||
rendered = render_full_document(document, CitationRegistry())
|
||||
return create_file_data(rendered), doc_id
|
||||
|
||||
# ------------------------------------------------------------------ writes
|
||||
|
||||
|
|
@ -1037,4 +1065,5 @@ __all__ = [
|
|||
"KBPostgresBackend",
|
||||
"list_tree_listing",
|
||||
"paginate_listing",
|
||||
"render_full_document",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -37,8 +37,8 @@ def build_backend_resolver(
|
|||
|
||||
In cloud mode the resolver returns a fresh :class:`KBPostgresBackend`
|
||||
bound to the current ``runtime`` so the backend can read staging state
|
||||
(``staged_dirs``, ``pending_moves``, ``files`` cache, ``kb_anon_doc``,
|
||||
``kb_matched_chunk_ids``) for each tool call. When no ``search_space_id``
|
||||
(``staged_dirs``, ``pending_moves``, ``files`` cache, ``kb_anon_doc``)
|
||||
for each tool call. When no ``search_space_id``
|
||||
is provided, the resolver falls back to :class:`StateBackend` (used by
|
||||
sub-agents and tests that don't need DB-backed reads).
|
||||
|
||||
|
|
|
|||
|
|
@ -35,26 +35,20 @@ current working directory (`cwd`, default `/documents`).
|
|||
turn alongside any new/edited documents. Snapshot/revert is enabled
|
||||
for every destructive operation when action logging is on.
|
||||
|
||||
## Reading Documents Efficiently
|
||||
## Reading Documents
|
||||
|
||||
Documents are formatted as XML. Each document contains:
|
||||
- `<document_metadata>` — title, type, URL, etc.
|
||||
- `<chunk_index>` — a table of every chunk with its **line range** and a
|
||||
`matched="true"` flag for chunks that matched the search query.
|
||||
- `<document_content>` — the actual chunks in original document order.
|
||||
|
||||
**Workflow**: when reading a large document, read the first ~20 lines to see
|
||||
the `<chunk_index>`, identify chunks marked `matched="true"`, then use
|
||||
`read_file(path, offset=<start_line>, limit=<lines>)` to jump directly to
|
||||
those sections instead of reading the entire file sequentially.
|
||||
|
||||
Use `<chunk id='...'>` values as citation IDs in your answers.
|
||||
A knowledge-base document is returned as a `<document … view="full">` block —
|
||||
the whole source, with each passage labelled `[n]`. `view="full"` means you are
|
||||
seeing the complete document, not an excerpt. Use `read_file(path, offset, limit)`
|
||||
to page through a large document. Cite a passage by writing its `[n]` after the
|
||||
statement it supports — the same `[n]` that passage had in
|
||||
`search_knowledge_base` results.
|
||||
|
||||
## Priority List
|
||||
|
||||
You receive a `<priority_documents>` system message each turn listing the
|
||||
top-K paths most relevant to the user's query (by hybrid search). Read those
|
||||
first — matched sections are flagged inside each document's `<chunk_index>`.
|
||||
first.
|
||||
|
||||
## Workspace Tree
|
||||
|
||||
|
|
|
|||
|
|
@ -10,11 +10,11 @@ Usage:
|
|||
- By default, reads up to 100 lines from the beginning.
|
||||
- Use `offset` and `limit` for pagination when files are large.
|
||||
- Results include line numbers.
|
||||
- Documents contain a `<chunk_index>` near the top listing every chunk with
|
||||
its line range and a `matched="true"` flag for search-relevant chunks.
|
||||
Read the index first, then jump to matched chunks with
|
||||
`read_file(path, offset=<start_line>, limit=<num_lines>)`.
|
||||
- Use chunk IDs (`<chunk id='...'>`) as citations in answers.
|
||||
- A knowledge-base document is returned as a `<document … view="full">` block:
|
||||
the whole source, with each passage labelled `[n]`. `view="full"` means you are
|
||||
seeing the complete document, not an excerpt.
|
||||
- Cite a passage by writing its `[n]` after the statement it supports — the same
|
||||
`[n]` you would use for that passage from `search_knowledge_base`.
|
||||
"""
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -4,14 +4,20 @@ from __future__ import annotations
|
|||
|
||||
from typing import TYPE_CHECKING, Annotated, Any
|
||||
|
||||
from deepagents.backends.utils import format_read_response, validate_path
|
||||
from deepagents.backends.utils import (
|
||||
create_file_data,
|
||||
format_read_response,
|
||||
validate_path,
|
||||
)
|
||||
from langchain.tools import ToolRuntime
|
||||
from langchain_core.messages import ToolMessage
|
||||
from langchain_core.tools import BaseTool, StructuredTool
|
||||
from langgraph.types import Command
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import load_registry
|
||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.kb_postgres import (
|
||||
KBPostgresBackend,
|
||||
render_full_document,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import (
|
||||
SurfSenseFilesystemState,
|
||||
|
|
@ -55,10 +61,12 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
|||
|
||||
backend = mw._get_backend(runtime)
|
||||
if isinstance(backend, KBPostgresBackend):
|
||||
loaded = await backend._load_file_data(validated)
|
||||
loaded = await backend.aload_document(validated)
|
||||
if loaded is None:
|
||||
return f"Error: File '{validated}' not found"
|
||||
file_data, doc_id = loaded
|
||||
document, doc_id = loaded
|
||||
registry = load_registry(runtime.state)
|
||||
file_data = create_file_data(render_full_document(document, registry))
|
||||
rendered = format_read_response(file_data, offset, limit)
|
||||
update: dict[str, Any] = {
|
||||
"files": {validated: file_data},
|
||||
|
|
@ -68,6 +76,7 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
|
|||
tool_call_id=runtime.tool_call_id,
|
||||
)
|
||||
],
|
||||
"citation_registry": registry,
|
||||
}
|
||||
if doc_id is not None:
|
||||
update["doc_id_by_path"] = {validated: doc_id}
|
||||
|
|
|
|||
|
|
@ -1,31 +1,29 @@
|
|||
"""Turn retriever ``DocumentHit``s into renderable ``RetrievedDocument``s."""
|
||||
"""Turn retriever ``DocumentHit``s into renderable documents."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.retrieved_context import (
|
||||
RetrievedDocument,
|
||||
RetrievedPassage,
|
||||
from app.agents.chat.multi_agent_chat.shared.document_render import (
|
||||
RenderableDocument,
|
||||
RenderablePassage,
|
||||
source_label,
|
||||
)
|
||||
|
||||
from .models import DocumentHit
|
||||
from .source_label import source_label
|
||||
|
||||
|
||||
def to_retrieved_document(hit: DocumentHit) -> RetrievedDocument:
|
||||
"""Map one hit to the shape the ``<retrieved_context>`` renderer consumes."""
|
||||
return RetrievedDocument(
|
||||
document_id=hit.document_id,
|
||||
def to_renderable_document(hit: DocumentHit) -> RenderableDocument:
|
||||
"""Map one hit to the shape the document-fragment renderer consumes."""
|
||||
return RenderableDocument(
|
||||
title=hit.title,
|
||||
source_label=source_label(hit.document_type, hit.metadata),
|
||||
source=source_label(hit.document_type, hit.metadata),
|
||||
passages=[
|
||||
RetrievedPassage(
|
||||
document_id=hit.document_id,
|
||||
chunk_id=chunk.chunk_id,
|
||||
RenderablePassage(
|
||||
content=chunk.content,
|
||||
locator={"document_id": hit.document_id, "chunk_id": chunk.chunk_id},
|
||||
)
|
||||
for chunk in hit.chunks
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["to_retrieved_document"]
|
||||
__all__ = ["to_renderable_document"]
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
``SearchScope`` is the optional filter a search runs under. ``DocumentHit`` /
|
||||
``ChunkHit`` are the retriever's typed output — matched chunks grouped by their
|
||||
document — which the adapter turns into renderable ``RetrievedDocument``s.
|
||||
document — which the adapter turns into renderable ``RenderableDocument``s.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -11,11 +11,11 @@ from typing import TYPE_CHECKING
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
|
||||
from app.agents.chat.multi_agent_chat.shared.retrieved_context import (
|
||||
render_retrieved_context,
|
||||
from app.agents.chat.multi_agent_chat.shared.document_render import (
|
||||
render_search_context,
|
||||
)
|
||||
|
||||
from .adapter import to_retrieved_document
|
||||
from .adapter import to_renderable_document
|
||||
from .hybrid_search import search_chunks
|
||||
from .models import DocumentHit, SearchScope
|
||||
from .reranking import rerank_hits
|
||||
|
|
@ -59,8 +59,8 @@ def build_context(
|
|||
) -> str | None:
|
||||
"""Rerank → adapt → render. Pure given ``hits``, so it is unit-testable."""
|
||||
ranked = rerank_hits(query, hits, reranker)
|
||||
documents = [to_retrieved_document(hit) for hit in ranked]
|
||||
return render_retrieved_context(documents, registry)
|
||||
documents = [to_renderable_document(hit) for hit in ranked]
|
||||
return render_search_context(documents, registry)
|
||||
|
||||
|
||||
__all__ = ["build_context", "search_knowledge_base_context"]
|
||||
|
|
|
|||
|
|
@ -1,16 +0,0 @@
|
|||
"""Retrieved knowledge-base evidence rendered as the ``<retrieved_context>`` block.
|
||||
|
||||
Turns retrieved chunks into the model-facing block and registers each passage
|
||||
into the citation registry so ``[n]`` resolves back to a real chunk.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .models import RetrievedDocument, RetrievedPassage
|
||||
from .renderer import render_retrieved_context
|
||||
|
||||
__all__ = [
|
||||
"RetrievedDocument",
|
||||
"RetrievedPassage",
|
||||
"render_retrieved_context",
|
||||
]
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
"""Data shapes for retrieved knowledge-base evidence.
|
||||
|
||||
A passage is one matched chunk (the citable unit); a document groups the
|
||||
passages that came from the same source. The renderer turns these into the
|
||||
model-facing ``<retrieved_context>`` block.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RetrievedPassage:
|
||||
"""One matched chunk: the unit the model cites with ``[n]``."""
|
||||
|
||||
document_id: int
|
||||
chunk_id: int
|
||||
content: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RetrievedDocument:
|
||||
"""A source document and the passages retrieved from it, in order."""
|
||||
|
||||
document_id: int
|
||||
title: str
|
||||
source_label: str | None = None
|
||||
passages: list[RetrievedPassage] = field(default_factory=list)
|
||||
|
||||
|
||||
__all__ = ["RetrievedDocument", "RetrievedPassage"]
|
||||
|
|
@ -1,89 +0,0 @@
|
|||
"""Render retrieved documents into the model-facing ``<retrieved_context>`` block.
|
||||
|
||||
Each passage is registered into the citation registry as it is rendered, so the
|
||||
``[n]`` the model sees is the same label the server can later resolve back to a
|
||||
chunk. ``[n]`` is the only citable integer per passage by design — nothing else
|
||||
in a line is a number the model could echo by mistake.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import (
|
||||
CitationRegistry,
|
||||
CitationSourceType,
|
||||
)
|
||||
|
||||
from .models import RetrievedDocument, RetrievedPassage
|
||||
|
||||
_HEADER = (
|
||||
"These are excerpts from the user's knowledge base, selected for this query.\n"
|
||||
"A document is a full source (a file, a Slack thread, a Notion page); a chunk\n"
|
||||
"is one ordered fragment of it. You are seeing only the chunks that matched\n"
|
||||
"this query, not the whole source.\n"
|
||||
"Cite a chunk with [n]."
|
||||
)
|
||||
|
||||
|
||||
def render_retrieved_context(
|
||||
documents: list[RetrievedDocument],
|
||||
registry: CitationRegistry,
|
||||
) -> str | None:
|
||||
"""Render retrieved documents and register each passage for citation.
|
||||
|
||||
Returns ``None`` when there is no passage to show so the caller can skip the
|
||||
block. Mutates ``registry`` (find-or-create), so a passage seen again in a
|
||||
later turn keeps its original ``[n]``.
|
||||
"""
|
||||
blocks = [
|
||||
block
|
||||
for document in documents
|
||||
if (block := _render_document(document, registry)) is not None
|
||||
]
|
||||
if not blocks:
|
||||
return None
|
||||
|
||||
return "<retrieved_context>\n" + _HEADER + "\n" + "\n".join(blocks) + "\n</retrieved_context>"
|
||||
|
||||
|
||||
def _render_document(
|
||||
document: RetrievedDocument,
|
||||
registry: CitationRegistry,
|
||||
) -> str | None:
|
||||
"""Render one document header and its passages; ``None`` if it has none."""
|
||||
if not document.passages:
|
||||
return None
|
||||
|
||||
lines = [_render_header(document)]
|
||||
for passage in document.passages:
|
||||
lines.append(_render_passage(document, passage, registry))
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _render_header(document: RetrievedDocument) -> str:
|
||||
"""``Document: "Title" (source)``."""
|
||||
source = f" ({document.source_label})" if document.source_label else ""
|
||||
return f'Document: "{_clean(document.title)}"{source}'
|
||||
|
||||
|
||||
def _render_passage(
|
||||
document: RetrievedDocument,
|
||||
passage: RetrievedPassage,
|
||||
registry: CitationRegistry,
|
||||
) -> str:
|
||||
"""`` [n] <chunk content>`` with continuation lines indented under it."""
|
||||
n = registry.register(
|
||||
CitationSourceType.KB_CHUNK,
|
||||
{"document_id": passage.document_id, "chunk_id": passage.chunk_id},
|
||||
{"title": document.title, "source": document.source_label},
|
||||
)
|
||||
label = f" [{n}] "
|
||||
body = passage.content.strip().replace("\n", "\n" + " " * len(label))
|
||||
return f"{label}{body}"
|
||||
|
||||
|
||||
def _clean(text: str) -> str:
|
||||
"""Collapse whitespace so a title stays on the header line."""
|
||||
return " ".join(text.split())
|
||||
|
||||
|
||||
__all__ = ["render_retrieved_context"]
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
"""Tests for the shared ``render_document`` (one ``<document>`` block)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import (
|
||||
CitationRegistry,
|
||||
CitationSourceType,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.document_render import (
|
||||
RenderableDocument,
|
||||
RenderablePassage,
|
||||
render_document,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _document(
|
||||
document_id: int,
|
||||
title: str,
|
||||
chunk_ids: list[int],
|
||||
*,
|
||||
source: str | None = None,
|
||||
) -> RenderableDocument:
|
||||
return RenderableDocument(
|
||||
title=title,
|
||||
source=source,
|
||||
passages=[
|
||||
RenderablePassage(
|
||||
content=f"text {cid}",
|
||||
locator={"document_id": document_id, "chunk_id": cid},
|
||||
)
|
||||
for cid in chunk_ids
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def test_returns_none_when_no_passages() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
assert (
|
||||
render_document(_document(1, "Empty", []), view="excerpt", registry=registry)
|
||||
is None
|
||||
)
|
||||
|
||||
|
||||
def test_excerpt_open_and_close_tags() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_document(
|
||||
_document(1, "Q3 Launch Notes", [880], source="Slack · #launch"),
|
||||
view="excerpt",
|
||||
registry=registry,
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert block.startswith(
|
||||
'<document title="Q3 Launch Notes" source="Slack · #launch" view="excerpt">'
|
||||
)
|
||||
assert block.endswith("</document>")
|
||||
|
||||
|
||||
def test_full_view_renders_view_attribute() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_document(_document(1, "Doc", [880]), view="full", registry=registry)
|
||||
|
||||
assert block is not None
|
||||
assert '<document title="Doc" view="full">' in block
|
||||
|
||||
|
||||
def test_source_attribute_omitted_when_absent() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_document(
|
||||
_document(1, "Plain", [1]), view="excerpt", registry=registry
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert block.startswith('<document title="Plain" view="excerpt">')
|
||||
|
||||
|
||||
def test_registers_passages_with_chunk_locators() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
render_document(
|
||||
_document(1, "Doc", [880], source="Slack"),
|
||||
view="excerpt",
|
||||
registry=registry,
|
||||
)
|
||||
|
||||
entry = registry.resolve(1)
|
||||
assert entry is not None
|
||||
assert entry.source_type is CitationSourceType.KB_CHUNK
|
||||
assert entry.locator == {"document_id": 1, "chunk_id": 880}
|
||||
assert entry.display == {"title": "Doc", "source": "Slack"}
|
||||
|
||||
|
||||
def test_passages_get_monotonic_labels() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_document(
|
||||
_document(1, "Doc", [880, 881]), view="excerpt", registry=registry
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert " [1] text 880" in block
|
||||
assert " [2] text 881" in block
|
||||
|
||||
|
||||
def test_multiline_passage_indents_under_label() -> None:
|
||||
registry = CitationRegistry()
|
||||
document = RenderableDocument(
|
||||
title="Doc",
|
||||
passages=[
|
||||
RenderablePassage(
|
||||
content="line one\nline two",
|
||||
locator={"document_id": 1, "chunk_id": 5},
|
||||
)
|
||||
],
|
||||
)
|
||||
|
||||
block = render_document(document, view="excerpt", registry=registry)
|
||||
|
||||
assert block is not None
|
||||
assert " [1] line one\n line two" in block
|
||||
|
||||
|
||||
def test_attribute_values_are_escaped() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_document(
|
||||
_document(1, 'A & B <c> "d"', [1], source="x & y"),
|
||||
view="excerpt",
|
||||
registry=registry,
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert 'title="A & B <c> "d""' in block
|
||||
assert 'source="x & y"' in block
|
||||
|
||||
|
||||
def test_same_passage_reuses_label_across_calls() -> None:
|
||||
registry = CitationRegistry()
|
||||
document = _document(1, "Doc", [880])
|
||||
|
||||
render_document(document, view="excerpt", registry=registry)
|
||||
render_document(document, view="full", registry=registry)
|
||||
|
||||
assert registry.next_n == 2
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
"""Tests for the ``<retrieved_context>`` wrapper around excerpt documents."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
|
||||
from app.agents.chat.multi_agent_chat.shared.document_render import (
|
||||
RenderableDocument,
|
||||
RenderablePassage,
|
||||
render_search_context,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _document(
|
||||
document_id: int,
|
||||
title: str,
|
||||
chunk_ids: list[int],
|
||||
*,
|
||||
source: str | None = None,
|
||||
) -> RenderableDocument:
|
||||
return RenderableDocument(
|
||||
title=title,
|
||||
source=source,
|
||||
passages=[
|
||||
RenderablePassage(
|
||||
content=f"text {cid}",
|
||||
locator={"document_id": document_id, "chunk_id": cid},
|
||||
)
|
||||
for cid in chunk_ids
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def test_returns_none_when_nothing_to_show() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
assert render_search_context([], registry) is None
|
||||
assert render_search_context([_document(1, "Empty", [])], registry) is None
|
||||
|
||||
|
||||
def test_assigns_monotonic_labels_across_documents() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_search_context(
|
||||
[
|
||||
_document(1, "Q3 Launch Notes", [880, 881], source="Slack"),
|
||||
_document(2, "Timeline", [12], source="Notion"),
|
||||
],
|
||||
registry,
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert "[1] text 880" in block
|
||||
assert "[2] text 881" in block
|
||||
assert "[3] text 12" in block
|
||||
|
||||
|
||||
def test_wraps_in_retrieved_context_and_teaches_excerpt_and_citation() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_search_context([_document(1, "Doc", [1])], registry)
|
||||
|
||||
assert block is not None
|
||||
assert block.startswith("<retrieved_context>")
|
||||
assert block.endswith("</retrieved_context>")
|
||||
assert "excerpt view" in block
|
||||
assert "Cite a chunk with its [n]." in block
|
||||
|
||||
|
||||
def test_documents_render_as_excerpt_blocks() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_search_context(
|
||||
[_document(1, "Q3", [1], source="Slack · #launch")], registry
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert '<document title="Q3" source="Slack · #launch" view="excerpt">' in block
|
||||
assert "</document>" in block
|
||||
|
||||
|
||||
def test_same_passage_reuses_label_across_calls() -> None:
|
||||
registry = CitationRegistry()
|
||||
document = _document(1, "Doc", [880])
|
||||
|
||||
render_search_context([document], registry)
|
||||
block = render_search_context([document], registry)
|
||||
|
||||
assert block is not None
|
||||
assert "[1] text 880" in block
|
||||
assert registry.next_n == 2
|
||||
|
|
@ -4,7 +4,7 @@ from __future__ import annotations
|
|||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.retrieval.source_label import source_label
|
||||
from app.agents.chat.multi_agent_chat.shared.document_render import source_label
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
"""Tests for the ``<web_results>`` wrapper around web-result excerpt documents."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import (
|
||||
CitationRegistry,
|
||||
CitationSourceType,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.document_render import (
|
||||
RenderableDocument,
|
||||
RenderablePassage,
|
||||
render_web_results,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _web_doc(url: str, title: str, content: str) -> RenderableDocument:
|
||||
return RenderableDocument(
|
||||
title=title,
|
||||
source=f"Web · {url.split('//', 1)[-1].split('/', 1)[0]}",
|
||||
passages=[
|
||||
RenderablePassage(
|
||||
content=content,
|
||||
locator={"url": url},
|
||||
source_type=CitationSourceType.WEB_RESULT,
|
||||
)
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def test_returns_none_when_nothing_to_show() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
assert render_web_results([], registry) is None
|
||||
|
||||
|
||||
def test_wraps_in_web_results_container() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_web_results(
|
||||
[_web_doc("https://example.com/a", "Example", "the answer is 42")],
|
||||
registry,
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert block.startswith("<web_results>")
|
||||
assert block.endswith("</web_results>")
|
||||
assert "cite a result with its [n]" in block
|
||||
assert '<document title="Example" source="Web · example.com" view="excerpt">' in block
|
||||
assert "[1] the answer is 42" in block
|
||||
|
||||
|
||||
def test_registers_each_result_as_web_result_with_url_locator() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
render_web_results(
|
||||
[
|
||||
_web_doc("https://a.com/x", "A", "alpha"),
|
||||
_web_doc("https://b.com/y", "B", "beta"),
|
||||
],
|
||||
registry,
|
||||
)
|
||||
|
||||
first = registry.resolve(1)
|
||||
second = registry.resolve(2)
|
||||
assert first is not None and second is not None
|
||||
assert first.source_type is CitationSourceType.WEB_RESULT
|
||||
assert first.locator == {"url": "https://a.com/x"}
|
||||
assert second.locator == {"url": "https://b.com/y"}
|
||||
|
||||
|
||||
def test_same_url_reuses_label_across_calls() -> None:
|
||||
registry = CitationRegistry()
|
||||
doc = _web_doc("https://example.com/a", "Example", "stable fact")
|
||||
|
||||
render_web_results([doc], registry)
|
||||
render_web_results([doc], registry)
|
||||
|
||||
assert registry.next_n == 2
|
||||
|
|
@ -1,11 +1,11 @@
|
|||
"""Tests for mapping a DocumentHit to a renderable RetrievedDocument."""
|
||||
"""Tests for mapping a DocumentHit to a renderable document."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.retrieval.adapter import (
|
||||
to_retrieved_document,
|
||||
to_renderable_document,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.retrieval.models import (
|
||||
ChunkHit,
|
||||
|
|
@ -15,7 +15,7 @@ from app.agents.chat.multi_agent_chat.shared.retrieval.models import (
|
|||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def test_maps_identity_source_label_and_passages() -> None:
|
||||
def test_maps_identity_source_and_passages() -> None:
|
||||
hit = DocumentHit(
|
||||
document_id=42,
|
||||
title="Q3 Launch Notes",
|
||||
|
|
@ -28,13 +28,14 @@ def test_maps_identity_source_label_and_passages() -> None:
|
|||
],
|
||||
)
|
||||
|
||||
document = to_retrieved_document(hit)
|
||||
document = to_renderable_document(hit)
|
||||
|
||||
assert document.document_id == 42
|
||||
assert document.title == "Q3 Launch Notes"
|
||||
assert document.source_label == "Slack"
|
||||
assert [(p.chunk_id, p.content) for p in document.passages] == [(880, "a"), (881, "b")]
|
||||
assert all(p.document_id == 42 for p in document.passages)
|
||||
assert document.source == "Slack"
|
||||
assert [
|
||||
(p.locator["chunk_id"], p.content) for p in document.passages
|
||||
] == [(880, "a"), (881, "b")]
|
||||
assert all(p.locator["document_id"] == 42 for p in document.passages)
|
||||
|
||||
|
||||
def test_document_with_no_chunks_maps_to_no_passages() -> None:
|
||||
|
|
@ -47,4 +48,4 @@ def test_document_with_no_chunks_maps_to_no_passages() -> None:
|
|||
chunks=[],
|
||||
)
|
||||
|
||||
assert to_retrieved_document(hit).passages == []
|
||||
assert to_renderable_document(hit).passages == []
|
||||
|
|
|
|||
|
|
@ -1,144 +0,0 @@
|
|||
"""Tests for the <retrieved_context> renderer and its citation registration."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import (
|
||||
CitationRegistry,
|
||||
CitationSourceType,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.retrieved_context import (
|
||||
RetrievedDocument,
|
||||
RetrievedPassage,
|
||||
render_retrieved_context,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _document(
|
||||
document_id: int,
|
||||
title: str,
|
||||
chunk_ids: list[int],
|
||||
*,
|
||||
source_label: str | None = None,
|
||||
) -> RetrievedDocument:
|
||||
return RetrievedDocument(
|
||||
document_id=document_id,
|
||||
title=title,
|
||||
source_label=source_label,
|
||||
passages=[
|
||||
RetrievedPassage(document_id=document_id, chunk_id=cid, content=f"text {cid}")
|
||||
for cid in chunk_ids
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def test_returns_none_when_nothing_to_show() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
assert render_retrieved_context([], registry) is None
|
||||
assert render_retrieved_context([_document(1, "Empty", [])], registry) is None
|
||||
|
||||
|
||||
def test_assigns_monotonic_labels_across_documents() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_retrieved_context(
|
||||
[
|
||||
_document(1, "Q3 Launch Notes", [880, 881], source_label="Slack"),
|
||||
_document(2, "Timeline", [12], source_label="Notion"),
|
||||
],
|
||||
registry,
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert "[1] text 880" in block
|
||||
assert "[2] text 881" in block
|
||||
assert "[3] text 12" in block
|
||||
|
||||
|
||||
def test_registers_passages_with_chunk_locators() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
render_retrieved_context([_document(1, "Doc", [880])], registry)
|
||||
|
||||
entry = registry.resolve(1)
|
||||
assert entry is not None
|
||||
assert entry.source_type is CitationSourceType.KB_CHUNK
|
||||
assert entry.locator == {"document_id": 1, "chunk_id": 880}
|
||||
assert entry.display["title"] == "Doc"
|
||||
|
||||
|
||||
def test_header_shows_source_when_present() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_retrieved_context(
|
||||
[
|
||||
_document(1, "Q3", [1], source_label="Slack · #launch"),
|
||||
_document(2, "Plan", [2]),
|
||||
],
|
||||
registry,
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert 'Document: "Q3" (Slack · #launch)' in block
|
||||
assert 'Document: "Plan"' in block
|
||||
|
||||
|
||||
def test_wraps_block_and_explains_chunk_vs_document() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_retrieved_context([_document(1, "Doc", [1])], registry)
|
||||
|
||||
assert block is not None
|
||||
assert block.startswith("<retrieved_context>")
|
||||
assert block.endswith("</retrieved_context>")
|
||||
assert "Cite a chunk with [n]." in block
|
||||
|
||||
|
||||
def test_multiline_passage_is_indented_under_label() -> None:
|
||||
registry = CitationRegistry()
|
||||
document = RetrievedDocument(
|
||||
document_id=1,
|
||||
title="Doc",
|
||||
passages=[RetrievedPassage(document_id=1, chunk_id=5, content="line one\nline two")],
|
||||
)
|
||||
|
||||
block = render_retrieved_context([document], registry)
|
||||
|
||||
assert block is not None
|
||||
assert " [1] line one\n line two" in block
|
||||
|
||||
|
||||
def test_continuation_indent_tracks_label_width() -> None:
|
||||
registry = CitationRegistry()
|
||||
# Burn labels 1..9 so the tenth passage renders as [10] (a 7-char label).
|
||||
documents = [_document(i, f"Doc {i}", [i]) for i in range(1, 10)]
|
||||
documents.append(
|
||||
RetrievedDocument(
|
||||
document_id=10,
|
||||
title="Doc 10",
|
||||
passages=[
|
||||
RetrievedPassage(document_id=10, chunk_id=10, content="line one\nline two")
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
block = render_retrieved_context(documents, registry)
|
||||
|
||||
assert block is not None
|
||||
assert " [10] line one\n line two" in block
|
||||
|
||||
|
||||
def test_same_passage_reuses_label_across_calls() -> None:
|
||||
registry = CitationRegistry()
|
||||
document = _document(1, "Doc", [880])
|
||||
|
||||
render_retrieved_context([document], registry)
|
||||
block = render_retrieved_context([document], registry)
|
||||
|
||||
assert block is not None
|
||||
assert "[1] text 880" in block
|
||||
assert registry.next_n == 2
|
||||
124
surfsense_backend/tests/unit/middleware/test_kb_postgres_read.py
Normal file
124
surfsense_backend/tests/unit/middleware/test_kb_postgres_read.py
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
"""Unit tests for the KB read path: full-view render + anonymous-doc loading.
|
||||
|
||||
DB-backed loads are exercised by the integration suite; here we lock the pure
|
||||
pieces — ``render_full_document`` and the anonymous-upload branch of
|
||||
``aload_document`` — which need no database.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations import (
|
||||
CitationRegistry,
|
||||
CitationSourceType,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.document_render import (
|
||||
RenderableDocument,
|
||||
RenderablePassage,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.kb_postgres import (
|
||||
KBPostgresBackend,
|
||||
render_full_document,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _backend(state: dict) -> KBPostgresBackend:
|
||||
return KBPostgresBackend(search_space_id=1, runtime=SimpleNamespace(state=state))
|
||||
|
||||
|
||||
def test_render_full_document_uses_full_view_and_registers() -> None:
|
||||
registry = CitationRegistry()
|
||||
document = RenderableDocument(
|
||||
title="Launch Notes",
|
||||
source="Slack",
|
||||
passages=[
|
||||
RenderablePassage(
|
||||
content="push to March 10",
|
||||
locator={"document_id": 7, "chunk_id": 880},
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
rendered = render_full_document(document, registry)
|
||||
|
||||
assert '<document title="Launch Notes" source="Slack" view="full">' in rendered
|
||||
assert "[1] push to March 10" in rendered
|
||||
entry = registry.resolve(1)
|
||||
assert entry is not None
|
||||
assert entry.locator == {"document_id": 7, "chunk_id": 880}
|
||||
|
||||
|
||||
def test_render_full_document_reuses_search_label() -> None:
|
||||
"""A chunk already registered from search keeps its [n] on a later full read."""
|
||||
registry = CitationRegistry()
|
||||
n = registry.register(
|
||||
CitationSourceType.KB_CHUNK,
|
||||
{"document_id": 7, "chunk_id": 880},
|
||||
{"title": "Launch Notes", "source": "Slack"},
|
||||
)
|
||||
document = RenderableDocument(
|
||||
title="Launch Notes",
|
||||
source="Slack",
|
||||
passages=[
|
||||
RenderablePassage(
|
||||
content="new chunk",
|
||||
locator={"document_id": 7, "chunk_id": 881},
|
||||
),
|
||||
RenderablePassage(
|
||||
content="push to March 10",
|
||||
locator={"document_id": 7, "chunk_id": 880},
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
rendered = render_full_document(document, registry)
|
||||
|
||||
assert f"[{n}] push to March 10" in rendered
|
||||
assert "[2] new chunk" in rendered
|
||||
|
||||
|
||||
def test_render_full_document_empty_falls_back_to_notice() -> None:
|
||||
registry = CitationRegistry()
|
||||
document = RenderableDocument(title="Empty", passages=[])
|
||||
|
||||
assert render_full_document(document, registry) == (
|
||||
"(This document has no readable content.)"
|
||||
)
|
||||
|
||||
|
||||
async def test_aload_document_anonymous_upload() -> None:
|
||||
backend = _backend(
|
||||
{
|
||||
"kb_anon_doc": {
|
||||
"path": "/anon_upload.md",
|
||||
"title": "Quarterly Report",
|
||||
"chunks": [
|
||||
{"chunk_id": -1, "content": "revenue grew"},
|
||||
{"chunk_id": -2, "content": "costs fell"},
|
||||
],
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
loaded = await backend.aload_document("/anon_upload.md")
|
||||
|
||||
assert loaded is not None
|
||||
document, doc_id = loaded
|
||||
assert doc_id is None
|
||||
assert document.title == "Quarterly Report"
|
||||
assert [p.locator["chunk_id"] for p in document.passages] == [-1, -2]
|
||||
assert all(p.locator["document_id"] == -1 for p in document.passages)
|
||||
assert all(
|
||||
p.source_type is CitationSourceType.ANON_CHUNK for p in document.passages
|
||||
)
|
||||
|
||||
|
||||
async def test_aload_document_unknown_path_returns_none() -> None:
|
||||
backend = _backend({})
|
||||
|
||||
assert await backend.aload_document("/not/under/documents.md") is None
|
||||
Loading…
Add table
Add a link
Reference in a new issue