feat: serve numbered source_markdown reads with citation preamble

2026-07-02 22:01:05 +02:00 · 2026-06-19 17:37:41 +02:00 · 2026-06-19 17:37:41 +02:00 · 188ae053ac
commit 188ae053ac
parent fc17b9becd
6 changed files with 78 additions and 44 deletions
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
@ -45,6 +45,10 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
    build_document_xml,
 )
 from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
    build_read_preamble,
    compute_matched_line_ranges,
 )
 from app.agents.chat.runtime.path_resolver import (
    DOCUMENTS_ROOT,
    build_path_index,
@ -64,6 +68,12 @@ def _basename(path: str) -> str:
    return path.rsplit("/", 1)[-1]
 def _metadata_url(metadata: dict[str, Any]) -> str:
    return (
        metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
    )
 def _is_under(child: str, parent: str) -> bool:
    """Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
    if parent == "/":
@ -460,8 +470,11 @@ class KBPostgresBackend(BackendProtocol):
        loaded = await self._load_file_data(file_path)
        if loaded is None:
            return f"Error: File '{file_path}' not found"
-        file_data, _ = loaded
+        file_data, _, preamble = loaded
-        return format_read_response(file_data, offset, limit)
+        body = format_read_response(file_data, offset, limit)
        if preamble and offset == 0:
            return preamble + body
        return body
    def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str:  # type: ignore[override]
        return asyncio.run(self.aread(file_path, offset, limit))
@ -469,12 +482,14 @@ class KBPostgresBackend(BackendProtocol):
    async def _load_file_data(
        self,
        path: str,
-    ) -> tuple[dict[str, Any], int | None] | None:
+    ) -> tuple[dict[str, Any], int | None, str | None] | None:
        """Lazy-load a virtual KB document into a deepagents ``FileData``.
-        Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
+        Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path
-        to any known document. ``doc_id`` is ``None`` for the synthetic
+        doesn't map to any known document. ``doc_id`` is ``None`` for the
-        anonymous document so the caller doesn't track it as a DB-backed file.
+        synthetic anonymous document. ``preamble`` is the metadata header to
        show above a numbered ``source_markdown`` body (``None`` for the legacy
        chunk-reconstructed XML reads used when a document has no body).
        """
        anon = self._kb_anon_doc()
        if anon and str(anon.get("path") or "") == path:
@ -492,7 +507,7 @@ class KBPostgresBackend(BackendProtocol):
            }
            xml = build_document_xml(doc_payload, matched_chunk_ids=set())
            file_data = create_file_data(xml)
-            return file_data, None
+            return file_data, None, None
        if not path.startswith(DOCUMENTS_ROOT):
            return None
@ -505,41 +520,58 @@ class KBPostgresBackend(BackendProtocol):
            )
            if document is None:
                return None
-            chunk_rows = await session.execute(
+            source_markdown = document.source_markdown or ""
-                select(Chunk.id, Chunk.content)
+            document_type = (
                .where(Chunk.document_id == document.id)
                .order_by(Chunk.position, Chunk.id)
            )
            chunks = [
                {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
            ]
        doc_payload = {
            "document_id": document.id,
            "chunks": chunks,
            "matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
            "document": {
                "id": document.id,
                "title": document.title,
                "document_type": (
                    document.document_type.value
                    if getattr(document, "document_type", None) is not None
                    else "UNKNOWN"
                ),
                "metadata": dict(document.document_metadata or {}),
            },
            "source": (
                document.document_type.value
                if getattr(document, "document_type", None) is not None
                else "UNKNOWN"
-            ),
+            )
            metadata = dict(document.document_metadata or {})
            chunk_rows = await session.execute(
                select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char)
                .where(Chunk.document_id == document.id)
                .order_by(Chunk.position, Chunk.id)
            )
            chunk_records = chunk_rows.all()
            document_id = document.id
            document_title = document.title
        matched = self._matched_chunk_ids(document_id)
        # Canonical read: serve the verbatim body with cat -n line numbers that
        # line up with chunk char spans, so the agent cites real source lines.
        if source_markdown:
            ranges = compute_matched_line_ranges(
                source_markdown,
                [(r.id, r.start_char, r.end_char) for r in chunk_records],
                matched,
            )
            preamble = build_read_preamble(
                document_id=document_id,
                document_type=document_type,
                title=document_title,
                url=_metadata_url(metadata),
                matched_line_ranges=ranges,
            )
            return create_file_data(source_markdown), document_id, preamble
        # Legacy fallback: no canonical body, reconstruct from chunks as XML.
        doc_payload = {
            "document_id": document_id,
            "chunks": [
                {"chunk_id": r.id, "content": r.content} for r in chunk_records
            ],
            "matched_chunk_ids": list(matched),
            "document": {
                "id": document_id,
                "title": document_title,
                "document_type": document_type,
                "metadata": metadata,
            },
            "source": document_type,
        }
-        xml = build_document_xml(
+        xml = build_document_xml(doc_payload, matched_chunk_ids=matched)
-            doc_payload,
+        return create_file_data(xml), document_id, None
            matched_chunk_ids=self._matched_chunk_ids(document.id),
        )
        file_data = create_file_data(xml)
        return file_data, document.id
    # ------------------------------------------------------------------ writes
@ -571,7 +603,7 @@ class KBPostgresBackend(BackendProtocol):
            loaded = await self._load_file_data(file_path)
            if loaded is None:
                return EditResult(error=f"Error: File '{file_path}' not found")
-            file_data, _ = loaded
+            file_data, _, _ = loaded
        content = file_data_to_string(file_data)
        result = perform_string_replacement(
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py
@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
            loaded = await backend._load_file_data(validated)
            if loaded is None:
                return f"Error: File '{validated}' not found"
-            _, doc_id_to_attach = loaded
+            _, doc_id_to_attach, _ = loaded
        res: EditResult = await backend.aedit(
            validated, old_string, new_string, replace_all=replace_all
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py
@ -75,7 +75,7 @@ async def cloud_move_file(
        loaded = await backend._load_file_data(source)
        if loaded is None:
            return f"Error: source '{source}' not found."
-        source_file_data, loaded_doc_id = loaded
+        source_file_data, loaded_doc_id, _ = loaded
        if source_doc_id is None:
            source_doc_id = loaded_doc_id
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py
@ -58,8 +58,10 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
            loaded = await backend._load_file_data(validated)
            if loaded is None:
                return f"Error: File '{validated}' not found"
-            file_data, doc_id = loaded
+            file_data, doc_id, preamble = loaded
            rendered = format_read_response(file_data, offset, limit)
            if preamble and offset == 0:
                rendered = preamble + rendered
            update: dict[str, Any] = {
                "files": {validated: file_data},
                "messages": [
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py
@ -74,7 +74,7 @@ async def cloud_rm(
        loaded = await backend._load_file_data(validated)
        if loaded is None:
            return f"Error: file '{validated}' not found."
-        _, resolved_doc_id = loaded
+        _, resolved_doc_id, _ = loaded
    files_update: dict[str, Any] = {validated: None}
    update: dict[str, Any] = {
--- a/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
+++ b/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
    def __init__(self, *, children=None, file_data=None) -> None:
        self.als_info = AsyncMock(return_value=children or [])
        self._load_file_data = AsyncMock(
-            return_value=(file_data, 17) if file_data is not None else None
+            return_value=(file_data, 17, None) if file_data is not None else None
        )