diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py new file mode 100644 index 000000000..ced77096f --- /dev/null +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py @@ -0,0 +1,73 @@ +"""Read preamble for canonical (numbered ``source_markdown``) KB reads. + +The KB read tool numbers the body lines ``cat -n`` style, so serving the raw +``source_markdown`` makes those line numbers line up exactly with the chunk +char spans and the editor highlight. This module renders the small header the +agent sees above that body: document identity plus the matched line ranges to +seek to, and a concrete reminder of the line-citation token shape. +""" + +from __future__ import annotations + +from collections.abc import Iterable + +from app.utils.text_spans import char_span_to_line_range + + +def _format_range(start: int, end: int) -> str: + return f"{start}" if start == end else f"{start}-{end}" + + +def compute_matched_line_ranges( + source_markdown: str, + chunks: Iterable[tuple[int, int | None, int | None]], + matched_chunk_ids: set[int], +) -> list[tuple[int, int]]: + """Map matched chunks to sorted, de-duplicated 1-based line ranges. + + ``chunks`` are ``(chunk_id, start_char, end_char)`` triples. Chunks without + spans (legacy rows) are skipped — they have no resolvable location. + """ + ranges: set[tuple[int, int]] = set() + for chunk_id, start_char, end_char in chunks: + if chunk_id not in matched_chunk_ids: + continue + if start_char is None or end_char is None: + continue + ranges.add(char_span_to_line_range(source_markdown, start_char, end_char)) + return sorted(ranges) + + +def build_read_preamble( + *, + document_id: int, + document_type: str, + title: str, + url: str, + matched_line_ranges: list[tuple[int, int]], +) -> str: + """Render the metadata header shown above a numbered ``source_markdown`` body. + + ``matched_line_ranges`` are 1-based inclusive line ranges (already derived + from chunk char spans) to point the agent at the relevant lines. + """ + lines = [ + "", + f" {document_id}", + f" {document_type}", + f" <![CDATA[{title}]]>", + f" ", + ] + if matched_line_ranges: + ranges = ", ".join(_format_range(s, e) for s, e in matched_line_ranges) + lines.append(f" {ranges}") + lines.append("") + lines.append( + f"Cite lines from this document as [citation:d{document_id}#L-] " + "using the line numbers shown below." + ) + lines.append("") + return "\n".join(lines) + + +__all__ = ["build_read_preamble", "compute_matched_line_ranges"]