Mirror of https://github.com/MODSetter/SurfSense.git
synced 2026-05-04 13:22:41 +02:00
feat: updated file management for main agent
This commit is contained in:
parent 8d50f90060
commit 05ca4c0b9f
27 changed files with 5054 additions and 1803 deletions
103  surfsense_backend/app/agents/new_chat/document_xml.py  Normal file
@@ -0,0 +1,103 @@
"""Shared XML builder for KB documents.
|
||||
|
||||
Produces the citation-friendly XML used by every read of a knowledge-base
|
||||
document (lazy-loaded by :class:`KBPostgresBackend` and synthetic anonymous
|
||||
files). The XML carries a ``<chunk_index>`` near the top so the LLM can jump
|
||||
directly to matched-chunk line ranges via ``read_file(offset=…, limit=…)``.
|
||||
|
||||
Extracted from the original ``knowledge_search.py`` so the backend, the
|
||||
priority middleware, and any future renderer share a single implementation.
|
||||
"""

from __future__ import annotations

import json
from typing import Any

def build_document_xml(
    document: dict[str, Any],
    matched_chunk_ids: set[int] | None = None,
) -> str:
    """Build citation-friendly XML with a ``<chunk_index>`` for smart seeking.

    Args:
        document: Dict shape produced by hybrid search / lazy-load helpers.
            Expected keys: ``document`` (with ``id``, ``title``,
            ``document_type``, ``metadata``) and ``chunks``
            (list of ``{chunk_id, content}``).
        matched_chunk_ids: Optional set of chunk IDs to flag as
            ``matched="true"`` in the chunk index.
    """
    matched = matched_chunk_ids or set()

    doc_meta = document.get("document") or {}
    metadata = (doc_meta.get("metadata") or {}) if isinstance(doc_meta, dict) else {}
    document_id = doc_meta.get("id", document.get("document_id", "unknown"))
    document_type = doc_meta.get("document_type", document.get("source", "UNKNOWN"))
    title = doc_meta.get("title") or metadata.get("title") or "Untitled Document"
    url = (
        metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
    )
    metadata_json = json.dumps(metadata, ensure_ascii=False)

    metadata_lines: list[str] = [
        "<document>",
        "<document_metadata>",
        f" <document_id>{document_id}</document_id>",
        f" <document_type>{document_type}</document_type>",
        f" <title><![CDATA[{title}]]></title>",
        f" <url><![CDATA[{url}]]></url>",
        f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>",
        "</document_metadata>",
        "",
    ]

    chunks = document.get("chunks") or []
    chunk_entries: list[tuple[int | None, str]] = []
    if isinstance(chunks, list):
        for chunk in chunks:
            if not isinstance(chunk, dict):
                continue
            chunk_id = chunk.get("chunk_id") or chunk.get("id")
            chunk_content = str(chunk.get("content", "")).strip()
            if not chunk_content:
                continue
            if chunk_id is None:
                xml = f" <chunk><![CDATA[{chunk_content}]]></chunk>"
            else:
                xml = f" <chunk id='{chunk_id}'><![CDATA[{chunk_content}]]></chunk>"
            chunk_entries.append((chunk_id, xml))

    # Lines consumed by the index block itself: "<chunk_index>" + one entry
    # per chunk + "</chunk_index>" + a blank line + "<document_content>".
    index_overhead = 1 + len(chunk_entries) + 1 + 1 + 1
    first_chunk_line = len(metadata_lines) + index_overhead + 1

    current_line = first_chunk_line
    index_entry_lines: list[str] = []
    for cid, xml_str in chunk_entries:
        num_lines = xml_str.count("\n") + 1
        end_line = current_line + num_lines - 1
        matched_attr = ' matched="true"' if cid is not None and cid in matched else ""
        if cid is not None:
            index_entry_lines.append(
                f' <entry chunk_id="{cid}" lines="{current_line}-{end_line}"{matched_attr}/>'
            )
        else:
            index_entry_lines.append(
                f' <entry lines="{current_line}-{end_line}"{matched_attr}/>'
            )
        current_line = end_line + 1

    lines = metadata_lines.copy()
    lines.append("<chunk_index>")
    lines.extend(index_entry_lines)
    lines.append("</chunk_index>")
    lines.append("")
    lines.append("<document_content>")
    for _, xml_str in chunk_entries:
        lines.append(xml_str)
    lines.extend(["</document_content>", "</document>"])
    return "\n".join(lines)


__all__ = ["build_document_xml"]
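
For context, here is a minimal usage sketch. The sample document dict, IDs, and values are hypothetical, and the import path is assumed from the file's location in the repo:

    from app.agents.new_chat.document_xml import build_document_xml

    doc = {
        "document": {
            "id": 42,
            "title": "Quarterly Report",
            "document_type": "FILE",
            "metadata": {"url": "https://example.com/q3.pdf"},
        },
        "chunks": [
            {"chunk_id": 7, "content": "Revenue grew 12% quarter over quarter."},
            {"chunk_id": 8, "content": "Headcount stayed flat."},
        ],
    }

    print(build_document_xml(doc, matched_chunk_ids={8}))

With two single-line chunks, the nine metadata lines plus the six index-overhead lines put the first chunk on line 16, so the index reads <entry chunk_id="7" lines="16-16"/> and <entry chunk_id="8" lines="17-17" matched="true"/>. An agent can then seek straight to the matched chunk with read_file(offset=17, limit=1), assuming read_file treats offset as a 1-based line number as the module docstring suggests.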