"""Shared XML builder for KB documents. Produces the citation-friendly XML used by every read of a knowledge-base document (lazy-loaded by :class:`KBPostgresBackend` and synthetic anonymous files). The XML carries a ```` near the top so the LLM can jump directly to matched-chunk line ranges via ``read_file(offset=…, limit=…)``. Extracted from the original ``knowledge_search.py`` so the backend, the priority middleware, and any future renderer share a single implementation. """ from __future__ import annotations import json from typing import Any def build_document_xml( document: dict[str, Any], matched_chunk_ids: set[int] | None = None, ) -> str: """Build citation-friendly XML with a ```` for smart seeking. Args: document: Dict shape produced by hybrid search / lazy-load helpers. Expected keys: ``document`` (with ``id``, ``title``, ``document_type``, ``metadata``) and ``chunks`` (list of ``{chunk_id, content}``). matched_chunk_ids: Optional set of chunk IDs to flag as ``matched="true"`` in the chunk index. """ matched = matched_chunk_ids or set() doc_meta = document.get("document") or {} metadata = (doc_meta.get("metadata") or {}) if isinstance(doc_meta, dict) else {} document_id = doc_meta.get("id", document.get("document_id", "unknown")) document_type = doc_meta.get("document_type", document.get("source", "UNKNOWN")) title = doc_meta.get("title") or metadata.get("title") or "Untitled Document" url = ( metadata.get("url") or metadata.get("source") or metadata.get("page_url") or "" ) metadata_json = json.dumps(metadata, ensure_ascii=False) metadata_lines: list[str] = [ "", "", f" {document_id}", f" {document_type}", f" <![CDATA[{title}]]>", f" ", f" ", "", "", ] chunks = document.get("chunks") or [] chunk_entries: list[tuple[int | None, str]] = [] if isinstance(chunks, list): for chunk in chunks: if not isinstance(chunk, dict): continue chunk_id = chunk.get("chunk_id") or chunk.get("id") chunk_content = str(chunk.get("content", "")).strip() if not chunk_content: continue if chunk_id is None: xml = f" " else: xml = f" " chunk_entries.append((chunk_id, xml)) index_overhead = 1 + len(chunk_entries) + 1 + 1 + 1 first_chunk_line = len(metadata_lines) + index_overhead + 1 current_line = first_chunk_line index_entry_lines: list[str] = [] for cid, xml_str in chunk_entries: num_lines = xml_str.count("\n") + 1 end_line = current_line + num_lines - 1 matched_attr = ' matched="true"' if cid is not None and cid in matched else "" if cid is not None: index_entry_lines.append( f' ' ) else: index_entry_lines.append( f' ' ) current_line = end_line + 1 lines = metadata_lines.copy() lines.append("") lines.extend(index_entry_lines) lines.append("") lines.append("") lines.append("") for _, xml_str in chunk_entries: lines.append(xml_str) lines.extend(["", ""]) return "\n".join(lines) __all__ = ["build_document_xml"]