From 73dd4e8e3a3e8026880cf738cef6008d584ce5a3 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 19 Jun 2026 17:37:41 +0200 Subject: [PATCH] feat: embed line-citation tokens in search hits --- .../main_agent/tools/search_knowledge_base.py | 41 +++++++++++-------- .../tools/test_search_knowledge_base.py | 18 +++++--- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py index ad47816f9..0696dc92e 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py @@ -112,20 +112,25 @@ async def _resolve_doc_context( return paths, bodies -def _line_label(chunk: dict[str, Any], body: str | None) -> str: - """``[lines X-Y]`` for a span-bearing chunk, or '' when spans are absent.""" +def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str: + """Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans.""" start = chunk.get("start_char") end = chunk.get("end_char") - if not body or not isinstance(start, int) or not isinstance(end, int): + if ( + not body + or not isinstance(doc_id, int) + or not isinstance(start, int) + or not isinstance(end, int) + ): return "" start_line, end_line = char_span_to_line_range(body, start, end) - if start_line == end_line: - return f"[line {start_line}]" - return f"[lines {start_line}-{end_line}]" + return f"[citation:d{doc_id}#L{start_line}-{end_line}]" -def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None: - """Render one matched chunk as an indented, line-annotated passage.""" +def _render_passage( + chunk: dict[str, Any], body: str | None, doc_id: int | None +) -> str | None: + """Render one matched chunk as an indented passage tagged with its token.""" content = (chunk.get("content") or "").strip() if not content: return None @@ -133,12 +138,14 @@ def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None: if len(content) > _PER_DOC_SNIPPET_CHARS: snippet += " ..." indented = snippet.replace("\n", "\n ") - label = _line_label(chunk, body) - head = f"\n {label}" if label else "" + token = _citation_token(chunk, body, doc_id) + head = f"\n {token}" if token else "" return f"{head}\n {indented}" -def _matched_passages(doc: dict[str, Any], body: str | None) -> str: +def _matched_passages( + doc: dict[str, Any], body: str | None, doc_id: int | None +) -> str: """Render the RRF-matched chunks; '' when none can be rendered.""" by_id = { c.get("chunk_id"): c @@ -150,7 +157,7 @@ def _matched_passages(doc: dict[str, Any], body: str | None) -> str: chunk = by_id.get(chunk_id) if chunk is None: continue - passage = _render_passage(chunk, body) + passage = _render_passage(chunk, body, doc_id) if passage: rendered.append(passage) return "".join(rendered) @@ -194,11 +201,12 @@ def _format_hits( path = paths.get(doc_id) if isinstance(doc_id, int) else None body = bodies.get(doc_id) if isinstance(doc_id, int) else None - header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + ( + id_str = f"id={doc_id}, " if isinstance(doc_id, int) else "" + header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + ( f"\n path: {path}" if path else "" ) - passages = _matched_passages(doc, body) + passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None) entry = header + (passages or _fallback_snippet(doc)) if total + len(entry) > _MAX_TOTAL_CHARS: lines.append("\n") @@ -207,8 +215,9 @@ def _format_hits( total += len(entry) lines.append( - "\n\nTo read a full document, delegate to the knowledge_base specialist " - "with `task`, referencing the path above." + "\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token " + "verbatim. To quote more context or read the full document, delegate to " + "the knowledge_base specialist with `task` using the path above." ) lines.append("\n") return "".join(lines) diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py index eadfcd30d..e068792b1 100644 --- a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py +++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py @@ -51,20 +51,28 @@ def test_renders_matched_passage_not_top_of_doc() -> None: assert "Intro paragraph." not in out -def test_includes_line_range_when_spans_present() -> None: +def test_emits_copyable_line_citation_token_when_spans_present() -> None: out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q") - # "Matched passage here." sits on line 3 of the body. - assert "line 3" in out + # "Matched passage here." sits on line 3 of the body; the hit must surface + # a ready-to-copy token so the agent can cite without a separate read. + assert "[citation:d7#L3-3]" in out -def test_omits_line_range_when_spans_absent() -> None: +def test_header_includes_document_id() -> None: + out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q") + assert "id=7" in out + + +def test_omits_citation_token_when_spans_absent() -> None: hit = _hit() for chunk in hit["chunks"]: chunk["start_char"] = None chunk["end_char"] = None out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q") assert "Matched passage here." in out - assert "[line" not in out + # No concrete, copyable token for this document without spans (the closing + # instruction's placeholder template doesn't count). + assert "[citation:d7#L" not in out def test_falls_back_to_content_when_no_matched_ids() -> None: