From 73dd4e8e3a3e8026880cf738cef6008d584ce5a3 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 17:37:41 +0200
Subject: [PATCH] feat: embed line-citation tokens in search hits

---
 .../main_agent/tools/search_knowledge_base.py | 41 +++++++++++--------
 .../tools/test_search_knowledge_base.py       | 18 +++++---
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
index ad47816f9..0696dc92e 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
@@ -112,20 +112,25 @@ async def _resolve_doc_context(
     return paths, bodies
 
 
-def _line_label(chunk: dict[str, Any], body: str | None) -> str:
-    """``[lines X-Y]`` for a span-bearing chunk, or '' when spans are absent."""
+def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
+    """Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
     start = chunk.get("start_char")
     end = chunk.get("end_char")
-    if not body or not isinstance(start, int) or not isinstance(end, int):
+    if (
+        not body
+        or not isinstance(doc_id, int)
+        or not isinstance(start, int)
+        or not isinstance(end, int)
+    ):
         return ""
     start_line, end_line = char_span_to_line_range(body, start, end)
-    if start_line == end_line:
-        return f"[line {start_line}]"
-    return f"[lines {start_line}-{end_line}]"
+    return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
 
 
-def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
-    """Render one matched chunk as an indented, line-annotated passage."""
+def _render_passage(
+    chunk: dict[str, Any], body: str | None, doc_id: int | None
+) -> str | None:
+    """Render one matched chunk as an indented passage tagged with its token."""
     content = (chunk.get("content") or "").strip()
     if not content:
         return None
@@ -133,12 +138,14 @@ def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
     if len(content) > _PER_DOC_SNIPPET_CHARS:
         snippet += " ..."
     indented = snippet.replace("\n", "\n   ")
-    label = _line_label(chunk, body)
-    head = f"\n   {label}" if label else ""
+    token = _citation_token(chunk, body, doc_id)
+    head = f"\n   {token}" if token else ""
     return f"{head}\n   {indented}"
 
 
-def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
+def _matched_passages(
+    doc: dict[str, Any], body: str | None, doc_id: int | None
+) -> str:
     """Render the RRF-matched chunks; '' when none can be rendered."""
     by_id = {
         c.get("chunk_id"): c
@@ -150,7 +157,7 @@ def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
         chunk = by_id.get(chunk_id)
         if chunk is None:
             continue
-        passage = _render_passage(chunk, body)
+        passage = _render_passage(chunk, body, doc_id)
         if passage:
             rendered.append(passage)
     return "".join(rendered)
@@ -194,11 +201,12 @@ def _format_hits(
         path = paths.get(doc_id) if isinstance(doc_id, int) else None
         body = bodies.get(doc_id) if isinstance(doc_id, int) else None
 
-        header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
+        id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
+        header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
             f"\n   path: {path}" if path else ""
         )
 
-        passages = _matched_passages(doc, body)
+        passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
         entry = header + (passages or _fallback_snippet(doc))
         if total + len(entry) > _MAX_TOTAL_CHARS:
             lines.append("\n<!-- additional matches truncated to fit context -->")
@@ -207,8 +215,9 @@ def _format_hits(
         total += len(entry)
 
     lines.append(
-        "\n\nTo read a full document, delegate to the knowledge_base specialist "
-        "with `task`, referencing the path above."
+        "\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
+        "verbatim. To quote more context or read the full document, delegate to "
+        "the knowledge_base specialist with `task` using the path above."
     )
     lines.append("\n</knowledge_base_results>")
     return "".join(lines)
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
index eadfcd30d..e068792b1 100644
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
@@ -51,20 +51,28 @@ def test_renders_matched_passage_not_top_of_doc() -> None:
     assert "Intro paragraph." not in out
 
 
-def test_includes_line_range_when_spans_present() -> None:
+def test_emits_copyable_line_citation_token_when_spans_present() -> None:
     out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
-    # "Matched passage here." sits on line 3 of the body.
-    assert "line 3" in out
+    # "Matched passage here." sits on line 3 of the body; the hit must surface
+    # a ready-to-copy token so the agent can cite without a separate read.
+    assert "[citation:d7#L3-3]" in out
 
 
-def test_omits_line_range_when_spans_absent() -> None:
+def test_header_includes_document_id() -> None:
+    out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
+    assert "id=7" in out
+
+
+def test_omits_citation_token_when_spans_absent() -> None:
     hit = _hit()
     for chunk in hit["chunks"]:
         chunk["start_char"] = None
         chunk["end_char"] = None
     out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
     assert "Matched passage here." in out
-    assert "[line" not in out
+    # No concrete, copyable token for this document without spans (the closing
+    # instruction's placeholder template doesn't count).
+    assert "[citation:d7#L" not in out
 
 
 def test_falls_back_to_content_when_no_matched_ids() -> None: