feat: embed line-citation tokens in search hits

2026-06-20 21:18:13 +02:00 · 2026-06-19 17:37:41 +02:00 · 2026-06-19 17:37:41 +02:00 · 73dd4e8e3a
commit 73dd4e8e3a
parent 188ae053ac
2 changed files with 38 additions and 21 deletions
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
@ -112,20 +112,25 @@ async def _resolve_doc_context(
    return paths, bodies


-def _line_label(chunk: dict[str, Any], body: str | None) -> str:
-    """``[lines X-Y]`` for a span-bearing chunk, or '' when spans are absent."""
+def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
+    """Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
    start = chunk.get("start_char")
    end = chunk.get("end_char")
-    if not body or not isinstance(start, int) or not isinstance(end, int):
+    if (
+        not body
+        or not isinstance(doc_id, int)
+        or not isinstance(start, int)
+        or not isinstance(end, int)
+    ):
        return ""
    start_line, end_line = char_span_to_line_range(body, start, end)
-    if start_line == end_line:
-        return f"[line {start_line}]"
-    return f"[lines {start_line}-{end_line}]"
+    return f"[citation:d{doc_id}#L{start_line}-{end_line}]"


-def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
-    """Render one matched chunk as an indented, line-annotated passage."""
+def _render_passage(
+    chunk: dict[str, Any], body: str | None, doc_id: int | None
+) -> str | None:
+    """Render one matched chunk as an indented passage tagged with its token."""
    content = (chunk.get("content") or "").strip()
    if not content:
        return None
@ -133,12 +138,14 @@ def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
    if len(content) > _PER_DOC_SNIPPET_CHARS:
        snippet += " ..."
    indented = snippet.replace("\n", "\n   ")
-    label = _line_label(chunk, body)
-    head = f"\n   {label}" if label else ""
+    token = _citation_token(chunk, body, doc_id)
+    head = f"\n   {token}" if token else ""
    return f"{head}\n   {indented}"


-def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
+def _matched_passages(
+    doc: dict[str, Any], body: str | None, doc_id: int | None
+) -> str:
    """Render the RRF-matched chunks; '' when none can be rendered."""
    by_id = {
        c.get("chunk_id"): c
@ -150,7 +157,7 @@ def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
        chunk = by_id.get(chunk_id)
        if chunk is None:
            continue
-        passage = _render_passage(chunk, body)
+        passage = _render_passage(chunk, body, doc_id)
        if passage:
            rendered.append(passage)
    return "".join(rendered)
@ -194,11 +201,12 @@ def _format_hits(
        path = paths.get(doc_id) if isinstance(doc_id, int) else None
        body = bodies.get(doc_id) if isinstance(doc_id, int) else None

-        header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
+        id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
+        header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
            f"\n   path: {path}" if path else ""
        )

-        passages = _matched_passages(doc, body)
+        passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
        entry = header + (passages or _fallback_snippet(doc))
        if total + len(entry) > _MAX_TOTAL_CHARS:
            lines.append("\n<!-- additional matches truncated to fit context -->")
@ -207,8 +215,9 @@ def _format_hits(
        total += len(entry)

    lines.append(
-        "\n\nTo read a full document, delegate to the knowledge_base specialist "
-        "with `task`, referencing the path above."
+        "\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
+        "verbatim. To quote more context or read the full document, delegate to "
+        "the knowledge_base specialist with `task` using the path above."
    )
    lines.append("\n</knowledge_base_results>")
    return "".join(lines)
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
@ -51,20 +51,28 @@ def test_renders_matched_passage_not_top_of_doc() -> None:
    assert "Intro paragraph." not in out


-def test_includes_line_range_when_spans_present() -> None:
+def test_emits_copyable_line_citation_token_when_spans_present() -> None:
    out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
-    # "Matched passage here." sits on line 3 of the body.
-    assert "line 3" in out
+    # "Matched passage here." sits on line 3 of the body; the hit must surface
+    # a ready-to-copy token so the agent can cite without a separate read.
+    assert "[citation:d7#L3-3]" in out


-def test_omits_line_range_when_spans_absent() -> None:
+def test_header_includes_document_id() -> None:
+    out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
+    assert "id=7" in out
+
+
+def test_omits_citation_token_when_spans_absent() -> None:
    hit = _hit()
    for chunk in hit["chunks"]:
        chunk["start_char"] = None
        chunk["end_char"] = None
    out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
    assert "Matched passage here." in out
-    assert "[line" not in out
+    # No concrete, copyable token for this document without spans (the closing
+    # instruction's placeholder template doesn't count).
+    assert "[citation:d7#L" not in out


 def test_falls_back_to_content_when_no_matched_ids() -> None: