Mirror of https://github.com/MODSetter/SurfSense.git
synced 2026-05-04 13:22:41 +02:00
feat: updated file management for main agent
This commit is contained in:
parent 8d50f90060
commit 05ca4c0b9f
27 changed files with 5054 additions and 1803 deletions
103  surfsense_backend/app/agents/new_chat/document_xml.py  Normal file
@@ -0,0 +1,103 @@
"""Shared XML builder for KB documents.
|
||||
|
||||
Produces the citation-friendly XML used by every read of a knowledge-base
|
||||
document (lazy-loaded by :class:`KBPostgresBackend` and synthetic anonymous
|
||||
files). The XML carries a ``<chunk_index>`` near the top so the LLM can jump
|
||||
directly to matched-chunk line ranges via ``read_file(offset=…, limit=…)``.
|
||||
|
||||
Extracted from the original ``knowledge_search.py`` so the backend, the
|
||||
priority middleware, and any future renderer share a single implementation.
|
||||
"""

from __future__ import annotations

import json
from typing import Any

def build_document_xml(
    document: dict[str, Any],
    matched_chunk_ids: set[int] | None = None,
) -> str:
    """Build citation-friendly XML with a ``<chunk_index>`` for smart seeking.

    Args:
        document: Dict shape produced by hybrid search / lazy-load helpers.
            Expected keys: ``document`` (with ``id``, ``title``,
            ``document_type``, ``metadata``) and ``chunks``
            (list of ``{chunk_id, content}``).
        matched_chunk_ids: Optional set of chunk IDs to flag as
            ``matched="true"`` in the chunk index.
    """
    matched = matched_chunk_ids or set()

    doc_meta = document.get("document") or {}
    metadata = (doc_meta.get("metadata") or {}) if isinstance(doc_meta, dict) else {}
    document_id = doc_meta.get("id", document.get("document_id", "unknown"))
    document_type = doc_meta.get("document_type", document.get("source", "UNKNOWN"))
    title = doc_meta.get("title") or metadata.get("title") or "Untitled Document"
    url = (
        metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
    )
    metadata_json = json.dumps(metadata, ensure_ascii=False)

    metadata_lines: list[str] = [
        "<document>",
        "<document_metadata>",
        f" <document_id>{document_id}</document_id>",
        f" <document_type>{document_type}</document_type>",
        f" <title><![CDATA[{title}]]></title>",
        f" <url><![CDATA[{url}]]></url>",
        f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
        "</document_metadata>",
        "",
    ]

    chunks = document.get("chunks") or []
    chunk_entries: list[tuple[int | None, str]] = []
    if isinstance(chunks, list):
        for chunk in chunks:
            if not isinstance(chunk, dict):
                continue
            chunk_id = chunk.get("chunk_id") or chunk.get("id")
            chunk_content = str(chunk.get("content", "")).strip()
            if not chunk_content:
                continue
            if chunk_id is None:
                xml = f" <chunk><![CDATA[{chunk_content}]]></chunk>"
            else:
                xml = f" <chunk id='{chunk_id}'><![CDATA[{chunk_content}]]></chunk>"
            chunk_entries.append((chunk_id, xml))

    # Lines consumed by the index block itself: "<chunk_index>" + one entry
    # per chunk + "</chunk_index>" + a blank line + "<document_content>".
    index_overhead = 1 + len(chunk_entries) + 1 + 1 + 1
    first_chunk_line = len(metadata_lines) + index_overhead + 1

    current_line = first_chunk_line
    index_entry_lines: list[str] = []
    for cid, xml_str in chunk_entries:
        num_lines = xml_str.count("\n") + 1
        end_line = current_line + num_lines - 1
        matched_attr = ' matched="true"' if cid is not None and cid in matched else ""
        if cid is not None:
            index_entry_lines.append(
                f' <entry chunk_id="{cid}" lines="{current_line}-{end_line}"{matched_attr}/>'
            )
        else:
            index_entry_lines.append(
                f' <entry lines="{current_line}-{end_line}"{matched_attr}/>'
            )
        current_line = end_line + 1

    lines = metadata_lines.copy()
    lines.append("<chunk_index>")
    lines.extend(index_entry_lines)
    lines.append("</chunk_index>")
    lines.append("")
    lines.append("<document_content>")
    for _, xml_str in chunk_entries:
        lines.append(xml_str)
    lines.extend(["</document_content>", "</document>"])
    return "\n".join(lines)


__all__ = ["build_document_xml"]
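
For context, here is a minimal usage sketch. The sample document dict, IDs, and values are hypothetical, and the import path is assumed from the file's location in the repo:

    from app.agents.new_chat.document_xml import build_document_xml

    doc = {
        "document": {
            "id": 42,
            "title": "Quarterly Report",
            "document_type": "FILE",
            "metadata": {"url": "https://example.com/q3.pdf"},
        },
        "chunks": [
            {"chunk_id": 7, "content": "Revenue grew 12% quarter over quarter."},
            {"chunk_id": 8, "content": "Headcount stayed flat."},
        ],
    }

    print(build_document_xml(doc, matched_chunk_ids={8}))

With two single-line chunks, the nine metadata lines plus the six index-overhead lines put the first chunk on line 16, so the index reads <entry chunk_id="7" lines="16-16"/> and <entry chunk_id="8" lines="17-17" matched="true"/>. An agent can then seek straight to the matched chunk with read_file(offset=17, limit=1), assuming read_file treats offset as a 1-based line number as the module docstring suggests.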