From 265888d21c33ba8754090aacf40189819c8063ce Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 25 Jun 2026 15:26:16 +0200 Subject: [PATCH] document-render: unify KB/web rendering on numbered [n] passages Add a shared document_render package that renders sources as blocks with server-assigned [n] passage labels (KB locator {document_id, chunk_id}, web locator {url}). Wire the KB read backend (kb_postgres) and read_file to the new renderer and drop the legacy per-document XML renderer (document_xml, retrieved_context) and the old chunk_index / matched="true" / read format. --- .../shared/document_render/__init__.py | 25 +++ .../shared/document_render/document.py | 70 ++++++++ .../shared/document_render/models.py | 42 +++++ .../shared/document_render/search_context.py | 53 ++++++ .../source_label.py | 6 +- .../shared/document_render/web_results.py | 54 +++++++ .../filesystem/backends/document_xml.py | 103 ------------ .../filesystem/backends/kb_postgres.py | 151 ++++++++++------- .../filesystem/backends/resolver.py | 4 +- .../filesystem/system_prompt/cloud.py | 22 +-- .../filesystem/tools/read_file/description.py | 10 +- .../filesystem/tools/read_file/index.py | 15 +- .../shared/retrieval/adapter.py | 26 ++- .../shared/retrieval/models.py | 2 +- .../shared/retrieval/service.py | 10 +- .../shared/retrieved_context/__init__.py | 16 -- .../shared/retrieved_context/models.py | 32 ---- .../shared/retrieved_context/renderer.py | 89 ---------- .../shared/document_render/test_document.py | 152 ++++++++++++++++++ .../document_render/test_search_context.py | 94 +++++++++++ .../test_source_label.py | 2 +- .../document_render/test_web_results.py | 82 ++++++++++ .../shared/retrieval/test_adapter.py | 19 +-- .../shared/retrieved_context/test_renderer.py | 144 ----------------- .../unit/middleware/test_kb_postgres_read.py | 124 ++++++++++++++ 25 files changed, 846 insertions(+), 501 deletions(-) create mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/__init__.py create mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/document.py create mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/models.py create mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/search_context.py rename surfsense_backend/app/agents/chat/multi_agent_chat/shared/{retrieval => document_render}/source_label.py (89%) create mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/web_results.py delete mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/document_xml.py delete mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/__init__.py delete mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/models.py delete mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/renderer.py create mode 100644 surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_document.py create mode 100644 surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_search_context.py rename surfsense_backend/tests/unit/agents/multi_agent_chat/shared/{retrieval => document_render}/test_source_label.py (91%) create mode 100644 surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_web_results.py delete mode 100644 surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieved_context/test_renderer.py create mode 100644 surfsense_backend/tests/unit/middleware/test_kb_postgres_read.py diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/__init__.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/__init__.py new file mode 100644 index 000000000..42368891d --- /dev/null +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/__init__.py @@ -0,0 +1,25 @@ +"""Render citable documents for the model: one shape for search, read, and web. + +``render_document`` emits one ```` +block whose passages carry server-assigned ``[n]`` labels. ``render_search_context`` +wraps KB excerpt blocks in ````; ``render_web_results`` wraps web +excerpt blocks in ````. Both cite with the same ``[n]`` spine. +""" + +from __future__ import annotations + +from .document import render_document +from .models import DocumentView, RenderableDocument, RenderablePassage +from .search_context import render_search_context +from .source_label import source_label +from .web_results import render_web_results + +__all__ = [ + "DocumentView", + "RenderableDocument", + "RenderablePassage", + "render_document", + "render_search_context", + "render_web_results", + "source_label", +] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/document.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/document.py new file mode 100644 index 000000000..83181ff69 --- /dev/null +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/document.py @@ -0,0 +1,70 @@ +"""Render one citable document as a ```` block. + +Every citable surface (KB search excerpts, KB full reads, web results) uses the +same block; ``view`` and the passages shown are what differ. Each passage is +registered for citation as it renders, so its ``[n]`` resolves back to its source +later. +""" + +from __future__ import annotations + +from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry + +from .models import DocumentView, RenderableDocument, RenderablePassage + + +def render_document( + document: RenderableDocument, + *, + view: DocumentView, + registry: CitationRegistry, +) -> str | None: + """Render one ```` block, registering each passage for citation. + + Returns ``None`` when the document has no passage to show. Mutates ``registry`` + (find-or-create). + """ + if not document.passages: + return None + + lines = [_open_tag(document, view)] + for passage in document.passages: + lines.append(_render_passage(document, passage, registry)) + lines.append("") + return "\n".join(lines) + + +def _open_tag(document: RenderableDocument, view: DocumentView) -> str: + attrs = [f'title="{_attr(document.title)}"'] + if document.source: + attrs.append(f'source="{_attr(document.source)}"') + attrs.append(f'view="{view}"') + return f"" + + +def _render_passage( + document: RenderableDocument, + passage: RenderablePassage, + registry: CitationRegistry, +) -> str: + n = registry.register( + passage.source_type, + passage.locator, + {"title": document.title, "source": document.source}, + ) + label = f" [{n}] " + body = passage.content.strip().replace("\n", "\n" + " " * len(label)) + return f"{label}{body}" + + +def _attr(value: str) -> str: + collapsed = " ".join(str(value).split()) + return ( + collapsed.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) + + +__all__ = ["render_document"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/models.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/models.py new file mode 100644 index 000000000..45cdb1865 --- /dev/null +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/models.py @@ -0,0 +1,42 @@ +"""Inputs for rendering a citable document for the model. + +A passage is one citable unit — what the model cites with ``[n]``. A document +groups the passages shown from one source. The same shapes feed every citable +surface: KB search excerpts, KB full reads, and web results. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Literal + +from app.agents.chat.multi_agent_chat.shared.citations import CitationSourceType + +DocumentView = Literal["excerpt", "full"] +"""How much of the source is shown: a search slice, or the whole object.""" + + +@dataclass(frozen=True) +class RenderablePassage: + """One citable unit: what the model cites with ``[n]``. + + ``locator`` is the source-specific identity registered for this passage (a KB + chunk's ``{document_id, chunk_id}``, a web result's ``{url}``). ``source_type`` + selects how that locator resolves to a frontend payload. + """ + + content: str + locator: dict[str, Any] + source_type: CitationSourceType = CitationSourceType.KB_CHUNK + + +@dataclass(frozen=True) +class RenderableDocument: + """A source document and the passages to render from it, in order.""" + + title: str + source: str | None = None + passages: list[RenderablePassage] = field(default_factory=list) + + +__all__ = ["DocumentView", "RenderableDocument", "RenderablePassage"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/search_context.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/search_context.py new file mode 100644 index 000000000..418a2142d --- /dev/null +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/search_context.py @@ -0,0 +1,53 @@ +"""Wrap search excerpts in the ```` block. + +Each document renders through the shared ``render_document``; this module adds the +container and the one-time header that teaches the model how to read and cite. +""" + +from __future__ import annotations + +from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry + +from .document import render_document +from .models import RenderableDocument + +_HEADER = ( + "These are excerpts from the user's knowledge base, selected for this query.\n" + "A document is a full source (a file, a Slack thread, a Notion page); each\n" + " below is in excerpt view, so you are seeing only the chunks that\n" + "matched this query, not the whole source. Cite a chunk with its [n]. Read the\n" + "document for full context before claiming it only says X." +) + + +def render_search_context( + documents: list[RenderableDocument], + registry: CitationRegistry, +) -> str | None: + """Render retrieved documents as excerpt blocks inside ````. + + Returns ``None`` when no document has a passage to show, so the caller can skip + the block. Mutates ``registry`` (find-or-create), so a passage seen again in a + later turn keeps its original ``[n]``. + """ + blocks = [ + block + for document in documents + if ( + block := render_document(document, view="excerpt", registry=registry) + ) + is not None + ] + if not blocks: + return None + + return ( + "\n" + + _HEADER + + "\n" + + "\n".join(blocks) + + "\n" + ) + + +__all__ = ["render_search_context"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/source_label.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/source_label.py similarity index 89% rename from surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/source_label.py rename to surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/source_label.py index 12b3ad6ac..03878b2f4 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/source_label.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/source_label.py @@ -1,8 +1,10 @@ -"""Build a short, honest source label for a retrieved document. +"""Build a short, honest source label for a knowledge-base document. A label orients the model about where a passage came from — e.g. ``Slack`` or ``Web · docs.python.org``. It is derived only from the document's type and any -URL in its metadata, so it never asserts detail we don't actually have. +URL in its metadata, so it never asserts detail we don't actually have. Search +hits and full reads both build their ```` from here, so the +label a passage carries is identical whichever surface it arrives through. """ from __future__ import annotations diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/web_results.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/web_results.py new file mode 100644 index 000000000..b310c7b3a --- /dev/null +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/document_render/web_results.py @@ -0,0 +1,54 @@ +"""Wrap live web-search results in a ```` block. + +Each result renders through the shared ``render_document`` (excerpt view), so a +web result is cited with ``[n]`` exactly like a knowledge-base passage. Only the +container and header differ — they tell the model these came from the public web, +not the user's workspace. +""" + +from __future__ import annotations + +from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry + +from .document import render_document +from .models import RenderableDocument + +_HEADER = ( + "These are live results from a public web search for this query. Each\n" + " below is one result in excerpt view; cite a result with its [n]\n" + "after the statement it supports. Scrape the URL for full context before\n" + "making a definitive claim from a snippet." +) + + +def render_web_results( + documents: list[RenderableDocument], + registry: CitationRegistry, +) -> str | None: + """Render web results as excerpt blocks inside ````. + + Returns ``None`` when no result has content to show, so the caller can skip + the block. Mutates ``registry`` (find-or-create), so a URL seen again keeps + its original ``[n]``. + """ + blocks = [ + block + for document in documents + if ( + block := render_document(document, view="excerpt", registry=registry) + ) + is not None + ] + if not blocks: + return None + + return ( + "\n" + + _HEADER + + "\n" + + "\n".join(blocks) + + "\n" + ) + + +__all__ = ["render_web_results"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/document_xml.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/document_xml.py deleted file mode 100644 index 60e586ae1..000000000 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/document_xml.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Shared XML builder for KB documents. - -Produces the citation-friendly XML used by every read of a knowledge-base -document (lazy-loaded by :class:`KBPostgresBackend` and synthetic anonymous -files). The XML carries a ```` near the top so the LLM can jump -directly to matched-chunk line ranges via ``read_file(offset=…, limit=…)``. - -Extracted from the original ``knowledge_search.py`` so the backend, the -priority middleware, and any future renderer share a single implementation. -""" - -from __future__ import annotations - -import json -from typing import Any - - -def build_document_xml( - document: dict[str, Any], - matched_chunk_ids: set[int] | None = None, -) -> str: - """Build citation-friendly XML with a ```` for smart seeking. - - Args: - document: Dict shape produced by hybrid search / lazy-load helpers. - Expected keys: ``document`` (with ``id``, ``title``, - ``document_type``, ``metadata``) and ``chunks`` - (list of ``{chunk_id, content}``). - matched_chunk_ids: Optional set of chunk IDs to flag as - ``matched="true"`` in the chunk index. - """ - matched = matched_chunk_ids or set() - - doc_meta = document.get("document") or {} - metadata = (doc_meta.get("metadata") or {}) if isinstance(doc_meta, dict) else {} - document_id = doc_meta.get("id", document.get("document_id", "unknown")) - document_type = doc_meta.get("document_type", document.get("source", "UNKNOWN")) - title = doc_meta.get("title") or metadata.get("title") or "Untitled Document" - url = ( - metadata.get("url") or metadata.get("source") or metadata.get("page_url") or "" - ) - metadata_json = json.dumps(metadata, ensure_ascii=False) - - metadata_lines: list[str] = [ - "", - "", - f" {document_id}", - f" {document_type}", - f" <![CDATA[{title}]]>", - f" ", - f" ", - "", - "", - ] - - chunks = document.get("chunks") or [] - chunk_entries: list[tuple[int | None, str]] = [] - if isinstance(chunks, list): - for chunk in chunks: - if not isinstance(chunk, dict): - continue - chunk_id = chunk.get("chunk_id") or chunk.get("id") - chunk_content = str(chunk.get("content", "")).strip() - if not chunk_content: - continue - if chunk_id is None: - xml = f" " - else: - xml = f" " - chunk_entries.append((chunk_id, xml)) - - index_overhead = 1 + len(chunk_entries) + 1 + 1 + 1 - first_chunk_line = len(metadata_lines) + index_overhead + 1 - - current_line = first_chunk_line - index_entry_lines: list[str] = [] - for cid, xml_str in chunk_entries: - num_lines = xml_str.count("\n") + 1 - end_line = current_line + num_lines - 1 - matched_attr = ' matched="true"' if cid is not None and cid in matched else "" - if cid is not None: - index_entry_lines.append( - f' ' - ) - else: - index_entry_lines.append( - f' ' - ) - current_line = end_line + 1 - - lines = metadata_lines.copy() - lines.append("") - lines.extend(index_entry_lines) - lines.append("") - lines.append("") - lines.append("") - for _, xml_str in chunk_entries: - lines.append(xml_str) - lines.extend(["", ""]) - return "\n".join(lines) - - -__all__ = ["build_document_xml"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py index e13196537..cb0f4cc69 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py @@ -42,8 +42,15 @@ from langchain.tools import ToolRuntime from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import ( - build_document_xml, +from app.agents.chat.multi_agent_chat.shared.citations import ( + CitationRegistry, + CitationSourceType, +) +from app.agents.chat.multi_agent_chat.shared.document_render import ( + RenderableDocument, + RenderablePassage, + render_document, + source_label, ) from app.agents.chat.runtime.path_resolver import ( DOCUMENTS_ROOT, @@ -59,6 +66,21 @@ _TEMP_PREFIX = "temp_" _GREP_MAX_TOTAL_MATCHES = 50 _GREP_MAX_PER_DOC = 5 +_EMPTY_DOCUMENT_NOTICE = "(This document has no readable content.)" + + +def render_full_document( + document: RenderableDocument, + registry: CitationRegistry, +) -> str: + """Render a whole KB document (``view="full"``), registering each chunk's ``[n]``. + + Falls back to a short notice when the document has no chunks, so a read never + returns blank. + """ + rendered = render_document(document, view="full", registry=registry) + return rendered if rendered is not None else _EMPTY_DOCUMENT_NOTICE + def _basename(path: str) -> str: return path.rsplit("/", 1)[-1] @@ -127,13 +149,6 @@ class KBPostgresBackend(BackendProtocol): anon = self.state.get("kb_anon_doc") return anon if isinstance(anon, dict) else None - def _matched_chunk_ids(self, doc_id: int) -> set[int]: - mapping = self.state.get("kb_matched_chunk_ids") or {} - try: - return set(mapping.get(doc_id, []) or []) - except TypeError: - return set() - @staticmethod def _file_data_size(file_data: dict[str, Any]) -> int: try: @@ -466,80 +481,93 @@ class KBPostgresBackend(BackendProtocol): def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override] return asyncio.run(self.aread(file_path, offset, limit)) - async def _load_file_data( + async def aload_document( self, path: str, - ) -> tuple[dict[str, Any], int | None] | None: - """Lazy-load a virtual KB document into a deepagents ``FileData``. + ) -> tuple[RenderableDocument, int | None] | None: + """Lazy-load a virtual KB document as a :class:`RenderableDocument`. - Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map - to any known document. ``doc_id`` is ``None`` for the synthetic - anonymous document so the caller doesn't track it as a DB-backed file. + Returns ``(document, doc_id)`` with every chunk in document order, or + ``None`` if the path maps to no known document. ``doc_id`` is ``None`` + for the synthetic anonymous upload so the caller doesn't track it as a + DB-backed file. Pure data — rendering and citation registration happen in + the caller (see :meth:`_load_file_data` and the ``read_file`` tool). """ anon = self._kb_anon_doc() if anon and str(anon.get("path") or "") == path: - doc_payload = { - "document_id": -1, - "chunks": list(anon.get("chunks") or []), - "matched_chunk_ids": [], - "document": { - "id": -1, - "title": anon.get("title") or "uploaded_document", - "document_type": "FILE", - "metadata": {"source": "anonymous_upload"}, - }, - "source": "FILE", - } - xml = build_document_xml(doc_payload, matched_chunk_ids=set()) - file_data = create_file_data(xml) - return file_data, None + document = RenderableDocument( + title=str(anon.get("title") or "uploaded_document"), + source="Uploaded file", + passages=[ + RenderablePassage( + content=str(chunk.get("content", "")), + locator={ + "document_id": -1, + "chunk_id": int(chunk["chunk_id"]), + }, + source_type=CitationSourceType.ANON_CHUNK, + ) + for chunk in (anon.get("chunks") or []) + if isinstance(chunk, dict) and chunk.get("chunk_id") is not None + ], + ) + return document, None if not path.startswith(DOCUMENTS_ROOT): return None async with shielded_async_session() as session: - document = await virtual_path_to_doc( + document_row = await virtual_path_to_doc( session, search_space_id=self.search_space_id, virtual_path=path, ) - if document is None: + if document_row is None: return None chunk_rows = await session.execute( select(Chunk.id, Chunk.content) - .where(Chunk.document_id == document.id) + .where(Chunk.document_id == document_row.id) .order_by(Chunk.position, Chunk.id) ) - chunks = [ - {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all() - ] + chunks = chunk_rows.all() - doc_payload = { - "document_id": document.id, - "chunks": chunks, - "matched_chunk_ids": list(self._matched_chunk_ids(document.id)), - "document": { - "id": document.id, - "title": document.title, - "document_type": ( - document.document_type.value - if getattr(document, "document_type", None) is not None - else "UNKNOWN" - ), - "metadata": dict(document.document_metadata or {}), - }, - "source": ( - document.document_type.value - if getattr(document, "document_type", None) is not None - else "UNKNOWN" - ), - } - xml = build_document_xml( - doc_payload, - matched_chunk_ids=self._matched_chunk_ids(document.id), + document_type = ( + document_row.document_type.value + if getattr(document_row, "document_type", None) is not None + else None ) - file_data = create_file_data(xml) - return file_data, document.id + metadata = dict(document_row.document_metadata or {}) + document = RenderableDocument( + title=document_row.title, + source=source_label(document_type, metadata), + passages=[ + RenderablePassage( + content=row.content, + locator={"document_id": document_row.id, "chunk_id": row.id}, + ) + for row in chunks + ], + ) + return document, document_row.id + + async def _load_file_data( + self, + path: str, + ) -> tuple[dict[str, Any], int | None] | None: + """Render a virtual KB document into a deepagents ``FileData``. + + Used by the filesystem ops (move/edit existence + content staging) and the + backend's own ``aread``/``aedit``. These have no conversation registry to + persist into, so the ``[n]`` labels are minted into a throwaway registry — + the canonical, citation-persisting read is the ``read_file`` tool, which + renders from :meth:`aload_document` against the state registry. + """ + loaded = await self.aload_document(path) + if loaded is None: + return None + document, doc_id = loaded + rendered = render_full_document(document, CitationRegistry()) + return create_file_data(rendered), doc_id # ------------------------------------------------------------------ writes @@ -1037,4 +1065,5 @@ __all__ = [ "KBPostgresBackend", "list_tree_listing", "paginate_listing", + "render_full_document", ] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/resolver.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/resolver.py index 6c35f369f..4553df7ff 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/resolver.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/resolver.py @@ -37,8 +37,8 @@ def build_backend_resolver( In cloud mode the resolver returns a fresh :class:`KBPostgresBackend` bound to the current ``runtime`` so the backend can read staging state - (``staged_dirs``, ``pending_moves``, ``files`` cache, ``kb_anon_doc``, - ``kb_matched_chunk_ids``) for each tool call. When no ``search_space_id`` + (``staged_dirs``, ``pending_moves``, ``files`` cache, ``kb_anon_doc``) + for each tool call. When no ``search_space_id`` is provided, the resolver falls back to :class:`StateBackend` (used by sub-agents and tests that don't need DB-backed reads). diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/system_prompt/cloud.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/system_prompt/cloud.py index 98dbbaaab..3366ac601 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/system_prompt/cloud.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/system_prompt/cloud.py @@ -35,26 +35,20 @@ current working directory (`cwd`, default `/documents`). turn alongside any new/edited documents. Snapshot/revert is enabled for every destructive operation when action logging is on. -## Reading Documents Efficiently +## Reading Documents -Documents are formatted as XML. Each document contains: -- `` — title, type, URL, etc. -- `` — a table of every chunk with its **line range** and a - `matched="true"` flag for chunks that matched the search query. -- `` — the actual chunks in original document order. - -**Workflow**: when reading a large document, read the first ~20 lines to see -the ``, identify chunks marked `matched="true"`, then use -`read_file(path, offset=, limit=)` to jump directly to -those sections instead of reading the entire file sequentially. - -Use `` values as citation IDs in your answers. +A knowledge-base document is returned as a `` block — +the whole source, with each passage labelled `[n]`. `view="full"` means you are +seeing the complete document, not an excerpt. Use `read_file(path, offset, limit)` +to page through a large document. Cite a passage by writing its `[n]` after the +statement it supports — the same `[n]` that passage had in +`search_knowledge_base` results. ## Priority List You receive a `` system message each turn listing the top-K paths most relevant to the user's query (by hybrid search). Read those -first — matched sections are flagged inside each document's ``. +first. ## Workspace Tree diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/description.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/description.py index b10ca4acc..3d1c6b69f 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/description.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/description.py @@ -10,11 +10,11 @@ Usage: - By default, reads up to 100 lines from the beginning. - Use `offset` and `limit` for pagination when files are large. - Results include line numbers. -- Documents contain a `` near the top listing every chunk with - its line range and a `matched="true"` flag for search-relevant chunks. - Read the index first, then jump to matched chunks with - `read_file(path, offset=, limit=)`. -- Use chunk IDs (``) as citations in answers. +- A knowledge-base document is returned as a `` block: + the whole source, with each passage labelled `[n]`. `view="full"` means you are + seeing the complete document, not an excerpt. +- Cite a passage by writing its `[n]` after the statement it supports — the same + `[n]` you would use for that passage from `search_knowledge_base`. """ diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py index 5c20619d6..07dfec57e 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py @@ -4,14 +4,20 @@ from __future__ import annotations from typing import TYPE_CHECKING, Annotated, Any -from deepagents.backends.utils import format_read_response, validate_path +from deepagents.backends.utils import ( + create_file_data, + format_read_response, + validate_path, +) from langchain.tools import ToolRuntime from langchain_core.messages import ToolMessage from langchain_core.tools import BaseTool, StructuredTool from langgraph.types import Command +from app.agents.chat.multi_agent_chat.shared.citations import load_registry from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.kb_postgres import ( KBPostgresBackend, + render_full_document, ) from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import ( SurfSenseFilesystemState, @@ -55,10 +61,12 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool: backend = mw._get_backend(runtime) if isinstance(backend, KBPostgresBackend): - loaded = await backend._load_file_data(validated) + loaded = await backend.aload_document(validated) if loaded is None: return f"Error: File '{validated}' not found" - file_data, doc_id = loaded + document, doc_id = loaded + registry = load_registry(runtime.state) + file_data = create_file_data(render_full_document(document, registry)) rendered = format_read_response(file_data, offset, limit) update: dict[str, Any] = { "files": {validated: file_data}, @@ -68,6 +76,7 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool: tool_call_id=runtime.tool_call_id, ) ], + "citation_registry": registry, } if doc_id is not None: update["doc_id_by_path"] = {validated: doc_id} diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/adapter.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/adapter.py index 3e7ee79d3..cf4263451 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/adapter.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/adapter.py @@ -1,31 +1,29 @@ -"""Turn retriever ``DocumentHit``s into renderable ``RetrievedDocument``s.""" +"""Turn retriever ``DocumentHit``s into renderable documents.""" from __future__ import annotations -from app.agents.chat.multi_agent_chat.shared.retrieved_context import ( - RetrievedDocument, - RetrievedPassage, +from app.agents.chat.multi_agent_chat.shared.document_render import ( + RenderableDocument, + RenderablePassage, + source_label, ) from .models import DocumentHit -from .source_label import source_label -def to_retrieved_document(hit: DocumentHit) -> RetrievedDocument: - """Map one hit to the shape the ```` renderer consumes.""" - return RetrievedDocument( - document_id=hit.document_id, +def to_renderable_document(hit: DocumentHit) -> RenderableDocument: + """Map one hit to the shape the document-fragment renderer consumes.""" + return RenderableDocument( title=hit.title, - source_label=source_label(hit.document_type, hit.metadata), + source=source_label(hit.document_type, hit.metadata), passages=[ - RetrievedPassage( - document_id=hit.document_id, - chunk_id=chunk.chunk_id, + RenderablePassage( content=chunk.content, + locator={"document_id": hit.document_id, "chunk_id": chunk.chunk_id}, ) for chunk in hit.chunks ], ) -__all__ = ["to_retrieved_document"] +__all__ = ["to_renderable_document"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/models.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/models.py index c45df41c5..4c4174a4f 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/models.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/models.py @@ -2,7 +2,7 @@ ``SearchScope`` is the optional filter a search runs under. ``DocumentHit`` / ``ChunkHit`` are the retriever's typed output — matched chunks grouped by their -document — which the adapter turns into renderable ``RetrievedDocument``s. +document — which the adapter turns into renderable ``RenderableDocument``s. """ from __future__ import annotations diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/service.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/service.py index 812592ff8..e9cfa18dd 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/service.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/service.py @@ -11,11 +11,11 @@ from typing import TYPE_CHECKING from sqlalchemy.ext.asyncio import AsyncSession from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry -from app.agents.chat.multi_agent_chat.shared.retrieved_context import ( - render_retrieved_context, +from app.agents.chat.multi_agent_chat.shared.document_render import ( + render_search_context, ) -from .adapter import to_retrieved_document +from .adapter import to_renderable_document from .hybrid_search import search_chunks from .models import DocumentHit, SearchScope from .reranking import rerank_hits @@ -59,8 +59,8 @@ def build_context( ) -> str | None: """Rerank → adapt → render. Pure given ``hits``, so it is unit-testable.""" ranked = rerank_hits(query, hits, reranker) - documents = [to_retrieved_document(hit) for hit in ranked] - return render_retrieved_context(documents, registry) + documents = [to_renderable_document(hit) for hit in ranked] + return render_search_context(documents, registry) __all__ = ["build_context", "search_knowledge_base_context"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/__init__.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/__init__.py deleted file mode 100644 index 842740da7..000000000 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Retrieved knowledge-base evidence rendered as the ```` block. - -Turns retrieved chunks into the model-facing block and registers each passage -into the citation registry so ``[n]`` resolves back to a real chunk. -""" - -from __future__ import annotations - -from .models import RetrievedDocument, RetrievedPassage -from .renderer import render_retrieved_context - -__all__ = [ - "RetrievedDocument", - "RetrievedPassage", - "render_retrieved_context", -] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/models.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/models.py deleted file mode 100644 index 7ea1012d2..000000000 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/models.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Data shapes for retrieved knowledge-base evidence. - -A passage is one matched chunk (the citable unit); a document groups the -passages that came from the same source. The renderer turns these into the -model-facing ```` block. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field - - -@dataclass(frozen=True) -class RetrievedPassage: - """One matched chunk: the unit the model cites with ``[n]``.""" - - document_id: int - chunk_id: int - content: str - - -@dataclass(frozen=True) -class RetrievedDocument: - """A source document and the passages retrieved from it, in order.""" - - document_id: int - title: str - source_label: str | None = None - passages: list[RetrievedPassage] = field(default_factory=list) - - -__all__ = ["RetrievedDocument", "RetrievedPassage"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/renderer.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/renderer.py deleted file mode 100644 index 8225f757b..000000000 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieved_context/renderer.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Render retrieved documents into the model-facing ```` block. - -Each passage is registered into the citation registry as it is rendered, so the -``[n]`` the model sees is the same label the server can later resolve back to a -chunk. ``[n]`` is the only citable integer per passage by design — nothing else -in a line is a number the model could echo by mistake. -""" - -from __future__ import annotations - -from app.agents.chat.multi_agent_chat.shared.citations import ( - CitationRegistry, - CitationSourceType, -) - -from .models import RetrievedDocument, RetrievedPassage - -_HEADER = ( - "These are excerpts from the user's knowledge base, selected for this query.\n" - "A document is a full source (a file, a Slack thread, a Notion page); a chunk\n" - "is one ordered fragment of it. You are seeing only the chunks that matched\n" - "this query, not the whole source.\n" - "Cite a chunk with [n]." -) - - -def render_retrieved_context( - documents: list[RetrievedDocument], - registry: CitationRegistry, -) -> str | None: - """Render retrieved documents and register each passage for citation. - - Returns ``None`` when there is no passage to show so the caller can skip the - block. Mutates ``registry`` (find-or-create), so a passage seen again in a - later turn keeps its original ``[n]``. - """ - blocks = [ - block - for document in documents - if (block := _render_document(document, registry)) is not None - ] - if not blocks: - return None - - return "\n" + _HEADER + "\n" + "\n".join(blocks) + "\n" - - -def _render_document( - document: RetrievedDocument, - registry: CitationRegistry, -) -> str | None: - """Render one document header and its passages; ``None`` if it has none.""" - if not document.passages: - return None - - lines = [_render_header(document)] - for passage in document.passages: - lines.append(_render_passage(document, passage, registry)) - return "\n".join(lines) - - -def _render_header(document: RetrievedDocument) -> str: - """``Document: "Title" (source)``.""" - source = f" ({document.source_label})" if document.source_label else "" - return f'Document: "{_clean(document.title)}"{source}' - - -def _render_passage( - document: RetrievedDocument, - passage: RetrievedPassage, - registry: CitationRegistry, -) -> str: - """`` [n] `` with continuation lines indented under it.""" - n = registry.register( - CitationSourceType.KB_CHUNK, - {"document_id": passage.document_id, "chunk_id": passage.chunk_id}, - {"title": document.title, "source": document.source_label}, - ) - label = f" [{n}] " - body = passage.content.strip().replace("\n", "\n" + " " * len(label)) - return f"{label}{body}" - - -def _clean(text: str) -> str: - """Collapse whitespace so a title stays on the header line.""" - return " ".join(text.split()) - - -__all__ = ["render_retrieved_context"] diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_document.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_document.py new file mode 100644 index 000000000..6c4cb7c25 --- /dev/null +++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_document.py @@ -0,0 +1,152 @@ +"""Tests for the shared ``render_document`` (one ```` block).""" + +from __future__ import annotations + +import pytest + +from app.agents.chat.multi_agent_chat.shared.citations import ( + CitationRegistry, + CitationSourceType, +) +from app.agents.chat.multi_agent_chat.shared.document_render import ( + RenderableDocument, + RenderablePassage, + render_document, +) + +pytestmark = pytest.mark.unit + + +def _document( + document_id: int, + title: str, + chunk_ids: list[int], + *, + source: str | None = None, +) -> RenderableDocument: + return RenderableDocument( + title=title, + source=source, + passages=[ + RenderablePassage( + content=f"text {cid}", + locator={"document_id": document_id, "chunk_id": cid}, + ) + for cid in chunk_ids + ], + ) + + +def test_returns_none_when_no_passages() -> None: + registry = CitationRegistry() + + assert ( + render_document(_document(1, "Empty", []), view="excerpt", registry=registry) + is None + ) + + +def test_excerpt_open_and_close_tags() -> None: + registry = CitationRegistry() + + block = render_document( + _document(1, "Q3 Launch Notes", [880], source="Slack · #launch"), + view="excerpt", + registry=registry, + ) + + assert block is not None + assert block.startswith( + '' + ) + assert block.endswith("") + + +def test_full_view_renders_view_attribute() -> None: + registry = CitationRegistry() + + block = render_document(_document(1, "Doc", [880]), view="full", registry=registry) + + assert block is not None + assert '' in block + + +def test_source_attribute_omitted_when_absent() -> None: + registry = CitationRegistry() + + block = render_document( + _document(1, "Plain", [1]), view="excerpt", registry=registry + ) + + assert block is not None + assert block.startswith('') + + +def test_registers_passages_with_chunk_locators() -> None: + registry = CitationRegistry() + + render_document( + _document(1, "Doc", [880], source="Slack"), + view="excerpt", + registry=registry, + ) + + entry = registry.resolve(1) + assert entry is not None + assert entry.source_type is CitationSourceType.KB_CHUNK + assert entry.locator == {"document_id": 1, "chunk_id": 880} + assert entry.display == {"title": "Doc", "source": "Slack"} + + +def test_passages_get_monotonic_labels() -> None: + registry = CitationRegistry() + + block = render_document( + _document(1, "Doc", [880, 881]), view="excerpt", registry=registry + ) + + assert block is not None + assert " [1] text 880" in block + assert " [2] text 881" in block + + +def test_multiline_passage_indents_under_label() -> None: + registry = CitationRegistry() + document = RenderableDocument( + title="Doc", + passages=[ + RenderablePassage( + content="line one\nline two", + locator={"document_id": 1, "chunk_id": 5}, + ) + ], + ) + + block = render_document(document, view="excerpt", registry=registry) + + assert block is not None + assert " [1] line one\n line two" in block + + +def test_attribute_values_are_escaped() -> None: + registry = CitationRegistry() + + block = render_document( + _document(1, 'A & B "d"', [1], source="x & y"), + view="excerpt", + registry=registry, + ) + + assert block is not None + assert 'title="A & B <c> "d""' in block + assert 'source="x & y"' in block + + +def test_same_passage_reuses_label_across_calls() -> None: + registry = CitationRegistry() + document = _document(1, "Doc", [880]) + + render_document(document, view="excerpt", registry=registry) + render_document(document, view="full", registry=registry) + + assert registry.next_n == 2 diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_search_context.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_search_context.py new file mode 100644 index 000000000..6b22d81a7 --- /dev/null +++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_search_context.py @@ -0,0 +1,94 @@ +"""Tests for the ```` wrapper around excerpt documents.""" + +from __future__ import annotations + +import pytest + +from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry +from app.agents.chat.multi_agent_chat.shared.document_render import ( + RenderableDocument, + RenderablePassage, + render_search_context, +) + +pytestmark = pytest.mark.unit + + +def _document( + document_id: int, + title: str, + chunk_ids: list[int], + *, + source: str | None = None, +) -> RenderableDocument: + return RenderableDocument( + title=title, + source=source, + passages=[ + RenderablePassage( + content=f"text {cid}", + locator={"document_id": document_id, "chunk_id": cid}, + ) + for cid in chunk_ids + ], + ) + + +def test_returns_none_when_nothing_to_show() -> None: + registry = CitationRegistry() + + assert render_search_context([], registry) is None + assert render_search_context([_document(1, "Empty", [])], registry) is None + + +def test_assigns_monotonic_labels_across_documents() -> None: + registry = CitationRegistry() + + block = render_search_context( + [ + _document(1, "Q3 Launch Notes", [880, 881], source="Slack"), + _document(2, "Timeline", [12], source="Notion"), + ], + registry, + ) + + assert block is not None + assert "[1] text 880" in block + assert "[2] text 881" in block + assert "[3] text 12" in block + + +def test_wraps_in_retrieved_context_and_teaches_excerpt_and_citation() -> None: + registry = CitationRegistry() + + block = render_search_context([_document(1, "Doc", [1])], registry) + + assert block is not None + assert block.startswith("") + assert block.endswith("") + assert "excerpt view" in block + assert "Cite a chunk with its [n]." in block + + +def test_documents_render_as_excerpt_blocks() -> None: + registry = CitationRegistry() + + block = render_search_context( + [_document(1, "Q3", [1], source="Slack · #launch")], registry + ) + + assert block is not None + assert '' in block + assert "" in block + + +def test_same_passage_reuses_label_across_calls() -> None: + registry = CitationRegistry() + document = _document(1, "Doc", [880]) + + render_search_context([document], registry) + block = render_search_context([document], registry) + + assert block is not None + assert "[1] text 880" in block + assert registry.next_n == 2 diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieval/test_source_label.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_source_label.py similarity index 91% rename from surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieval/test_source_label.py rename to surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_source_label.py index 54c74fb0b..ee492269f 100644 --- a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieval/test_source_label.py +++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_source_label.py @@ -4,7 +4,7 @@ from __future__ import annotations import pytest -from app.agents.chat.multi_agent_chat.shared.retrieval.source_label import source_label +from app.agents.chat.multi_agent_chat.shared.document_render import source_label pytestmark = pytest.mark.unit diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_web_results.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_web_results.py new file mode 100644 index 000000000..75cf0e1fb --- /dev/null +++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/document_render/test_web_results.py @@ -0,0 +1,82 @@ +"""Tests for the ```` wrapper around web-result excerpt documents.""" + +from __future__ import annotations + +import pytest + +from app.agents.chat.multi_agent_chat.shared.citations import ( + CitationRegistry, + CitationSourceType, +) +from app.agents.chat.multi_agent_chat.shared.document_render import ( + RenderableDocument, + RenderablePassage, + render_web_results, +) + +pytestmark = pytest.mark.unit + + +def _web_doc(url: str, title: str, content: str) -> RenderableDocument: + return RenderableDocument( + title=title, + source=f"Web · {url.split('//', 1)[-1].split('/', 1)[0]}", + passages=[ + RenderablePassage( + content=content, + locator={"url": url}, + source_type=CitationSourceType.WEB_RESULT, + ) + ], + ) + + +def test_returns_none_when_nothing_to_show() -> None: + registry = CitationRegistry() + + assert render_web_results([], registry) is None + + +def test_wraps_in_web_results_container() -> None: + registry = CitationRegistry() + + block = render_web_results( + [_web_doc("https://example.com/a", "Example", "the answer is 42")], + registry, + ) + + assert block is not None + assert block.startswith("") + assert block.endswith("") + assert "cite a result with its [n]" in block + assert '' in block + assert "[1] the answer is 42" in block + + +def test_registers_each_result_as_web_result_with_url_locator() -> None: + registry = CitationRegistry() + + render_web_results( + [ + _web_doc("https://a.com/x", "A", "alpha"), + _web_doc("https://b.com/y", "B", "beta"), + ], + registry, + ) + + first = registry.resolve(1) + second = registry.resolve(2) + assert first is not None and second is not None + assert first.source_type is CitationSourceType.WEB_RESULT + assert first.locator == {"url": "https://a.com/x"} + assert second.locator == {"url": "https://b.com/y"} + + +def test_same_url_reuses_label_across_calls() -> None: + registry = CitationRegistry() + doc = _web_doc("https://example.com/a", "Example", "stable fact") + + render_web_results([doc], registry) + render_web_results([doc], registry) + + assert registry.next_n == 2 diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieval/test_adapter.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieval/test_adapter.py index c38cc624d..fd700dd1d 100644 --- a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieval/test_adapter.py +++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieval/test_adapter.py @@ -1,11 +1,11 @@ -"""Tests for mapping a DocumentHit to a renderable RetrievedDocument.""" +"""Tests for mapping a DocumentHit to a renderable document.""" from __future__ import annotations import pytest from app.agents.chat.multi_agent_chat.shared.retrieval.adapter import ( - to_retrieved_document, + to_renderable_document, ) from app.agents.chat.multi_agent_chat.shared.retrieval.models import ( ChunkHit, @@ -15,7 +15,7 @@ from app.agents.chat.multi_agent_chat.shared.retrieval.models import ( pytestmark = pytest.mark.unit -def test_maps_identity_source_label_and_passages() -> None: +def test_maps_identity_source_and_passages() -> None: hit = DocumentHit( document_id=42, title="Q3 Launch Notes", @@ -28,13 +28,14 @@ def test_maps_identity_source_label_and_passages() -> None: ], ) - document = to_retrieved_document(hit) + document = to_renderable_document(hit) - assert document.document_id == 42 assert document.title == "Q3 Launch Notes" - assert document.source_label == "Slack" - assert [(p.chunk_id, p.content) for p in document.passages] == [(880, "a"), (881, "b")] - assert all(p.document_id == 42 for p in document.passages) + assert document.source == "Slack" + assert [ + (p.locator["chunk_id"], p.content) for p in document.passages + ] == [(880, "a"), (881, "b")] + assert all(p.locator["document_id"] == 42 for p in document.passages) def test_document_with_no_chunks_maps_to_no_passages() -> None: @@ -47,4 +48,4 @@ def test_document_with_no_chunks_maps_to_no_passages() -> None: chunks=[], ) - assert to_retrieved_document(hit).passages == [] + assert to_renderable_document(hit).passages == [] diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieved_context/test_renderer.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieved_context/test_renderer.py deleted file mode 100644 index 6067cac02..000000000 --- a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/retrieved_context/test_renderer.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Tests for the renderer and its citation registration.""" - -from __future__ import annotations - -import pytest - -from app.agents.chat.multi_agent_chat.shared.citations import ( - CitationRegistry, - CitationSourceType, -) -from app.agents.chat.multi_agent_chat.shared.retrieved_context import ( - RetrievedDocument, - RetrievedPassage, - render_retrieved_context, -) - -pytestmark = pytest.mark.unit - - -def _document( - document_id: int, - title: str, - chunk_ids: list[int], - *, - source_label: str | None = None, -) -> RetrievedDocument: - return RetrievedDocument( - document_id=document_id, - title=title, - source_label=source_label, - passages=[ - RetrievedPassage(document_id=document_id, chunk_id=cid, content=f"text {cid}") - for cid in chunk_ids - ], - ) - - -def test_returns_none_when_nothing_to_show() -> None: - registry = CitationRegistry() - - assert render_retrieved_context([], registry) is None - assert render_retrieved_context([_document(1, "Empty", [])], registry) is None - - -def test_assigns_monotonic_labels_across_documents() -> None: - registry = CitationRegistry() - - block = render_retrieved_context( - [ - _document(1, "Q3 Launch Notes", [880, 881], source_label="Slack"), - _document(2, "Timeline", [12], source_label="Notion"), - ], - registry, - ) - - assert block is not None - assert "[1] text 880" in block - assert "[2] text 881" in block - assert "[3] text 12" in block - - -def test_registers_passages_with_chunk_locators() -> None: - registry = CitationRegistry() - - render_retrieved_context([_document(1, "Doc", [880])], registry) - - entry = registry.resolve(1) - assert entry is not None - assert entry.source_type is CitationSourceType.KB_CHUNK - assert entry.locator == {"document_id": 1, "chunk_id": 880} - assert entry.display["title"] == "Doc" - - -def test_header_shows_source_when_present() -> None: - registry = CitationRegistry() - - block = render_retrieved_context( - [ - _document(1, "Q3", [1], source_label="Slack · #launch"), - _document(2, "Plan", [2]), - ], - registry, - ) - - assert block is not None - assert 'Document: "Q3" (Slack · #launch)' in block - assert 'Document: "Plan"' in block - - -def test_wraps_block_and_explains_chunk_vs_document() -> None: - registry = CitationRegistry() - - block = render_retrieved_context([_document(1, "Doc", [1])], registry) - - assert block is not None - assert block.startswith("") - assert block.endswith("") - assert "Cite a chunk with [n]." in block - - -def test_multiline_passage_is_indented_under_label() -> None: - registry = CitationRegistry() - document = RetrievedDocument( - document_id=1, - title="Doc", - passages=[RetrievedPassage(document_id=1, chunk_id=5, content="line one\nline two")], - ) - - block = render_retrieved_context([document], registry) - - assert block is not None - assert " [1] line one\n line two" in block - - -def test_continuation_indent_tracks_label_width() -> None: - registry = CitationRegistry() - # Burn labels 1..9 so the tenth passage renders as [10] (a 7-char label). - documents = [_document(i, f"Doc {i}", [i]) for i in range(1, 10)] - documents.append( - RetrievedDocument( - document_id=10, - title="Doc 10", - passages=[ - RetrievedPassage(document_id=10, chunk_id=10, content="line one\nline two") - ], - ) - ) - - block = render_retrieved_context(documents, registry) - - assert block is not None - assert " [10] line one\n line two" in block - - -def test_same_passage_reuses_label_across_calls() -> None: - registry = CitationRegistry() - document = _document(1, "Doc", [880]) - - render_retrieved_context([document], registry) - block = render_retrieved_context([document], registry) - - assert block is not None - assert "[1] text 880" in block - assert registry.next_n == 2 diff --git a/surfsense_backend/tests/unit/middleware/test_kb_postgres_read.py b/surfsense_backend/tests/unit/middleware/test_kb_postgres_read.py new file mode 100644 index 000000000..8117a6bdb --- /dev/null +++ b/surfsense_backend/tests/unit/middleware/test_kb_postgres_read.py @@ -0,0 +1,124 @@ +"""Unit tests for the KB read path: full-view render + anonymous-doc loading. + +DB-backed loads are exercised by the integration suite; here we lock the pure +pieces — ``render_full_document`` and the anonymous-upload branch of +``aload_document`` — which need no database. +""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from app.agents.chat.multi_agent_chat.shared.citations import ( + CitationRegistry, + CitationSourceType, +) +from app.agents.chat.multi_agent_chat.shared.document_render import ( + RenderableDocument, + RenderablePassage, +) +from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.kb_postgres import ( + KBPostgresBackend, + render_full_document, +) + +pytestmark = pytest.mark.unit + + +def _backend(state: dict) -> KBPostgresBackend: + return KBPostgresBackend(search_space_id=1, runtime=SimpleNamespace(state=state)) + + +def test_render_full_document_uses_full_view_and_registers() -> None: + registry = CitationRegistry() + document = RenderableDocument( + title="Launch Notes", + source="Slack", + passages=[ + RenderablePassage( + content="push to March 10", + locator={"document_id": 7, "chunk_id": 880}, + ), + ], + ) + + rendered = render_full_document(document, registry) + + assert '' in rendered + assert "[1] push to March 10" in rendered + entry = registry.resolve(1) + assert entry is not None + assert entry.locator == {"document_id": 7, "chunk_id": 880} + + +def test_render_full_document_reuses_search_label() -> None: + """A chunk already registered from search keeps its [n] on a later full read.""" + registry = CitationRegistry() + n = registry.register( + CitationSourceType.KB_CHUNK, + {"document_id": 7, "chunk_id": 880}, + {"title": "Launch Notes", "source": "Slack"}, + ) + document = RenderableDocument( + title="Launch Notes", + source="Slack", + passages=[ + RenderablePassage( + content="new chunk", + locator={"document_id": 7, "chunk_id": 881}, + ), + RenderablePassage( + content="push to March 10", + locator={"document_id": 7, "chunk_id": 880}, + ), + ], + ) + + rendered = render_full_document(document, registry) + + assert f"[{n}] push to March 10" in rendered + assert "[2] new chunk" in rendered + + +def test_render_full_document_empty_falls_back_to_notice() -> None: + registry = CitationRegistry() + document = RenderableDocument(title="Empty", passages=[]) + + assert render_full_document(document, registry) == ( + "(This document has no readable content.)" + ) + + +async def test_aload_document_anonymous_upload() -> None: + backend = _backend( + { + "kb_anon_doc": { + "path": "/anon_upload.md", + "title": "Quarterly Report", + "chunks": [ + {"chunk_id": -1, "content": "revenue grew"}, + {"chunk_id": -2, "content": "costs fell"}, + ], + } + } + ) + + loaded = await backend.aload_document("/anon_upload.md") + + assert loaded is not None + document, doc_id = loaded + assert doc_id is None + assert document.title == "Quarterly Report" + assert [p.locator["chunk_id"] for p in document.passages] == [-1, -2] + assert all(p.locator["document_id"] == -1 for p in document.passages) + assert all( + p.source_type is CitationSourceType.ANON_CHUNK for p in document.passages + ) + + +async def test_aload_document_unknown_path_returns_none() -> None: + backend = _backend({}) + + assert await backend.aload_document("/not/under/documents.md") is None