feat: embed line-citation tokens in search hits

This commit is contained in:
CREDO23 2026-06-19 17:37:41 +02:00
parent 188ae053ac
commit 73dd4e8e3a
2 changed files with 38 additions and 21 deletions

View file

@ -112,20 +112,25 @@ async def _resolve_doc_context(
return paths, bodies
def _line_label(chunk: dict[str, Any], body: str | None) -> str:
"""``[lines X-Y]`` for a span-bearing chunk, or '' when spans are absent."""
def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
"""Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
start = chunk.get("start_char")
end = chunk.get("end_char")
if not body or not isinstance(start, int) or not isinstance(end, int):
if (
not body
or not isinstance(doc_id, int)
or not isinstance(start, int)
or not isinstance(end, int)
):
return ""
start_line, end_line = char_span_to_line_range(body, start, end)
if start_line == end_line:
return f"[line {start_line}]"
return f"[lines {start_line}-{end_line}]"
return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
"""Render one matched chunk as an indented, line-annotated passage."""
def _render_passage(
chunk: dict[str, Any], body: str | None, doc_id: int | None
) -> str | None:
"""Render one matched chunk as an indented passage tagged with its token."""
content = (chunk.get("content") or "").strip()
if not content:
return None
@ -133,12 +138,14 @@ def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
indented = snippet.replace("\n", "\n ")
label = _line_label(chunk, body)
head = f"\n {label}" if label else ""
token = _citation_token(chunk, body, doc_id)
head = f"\n {token}" if token else ""
return f"{head}\n {indented}"
def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
def _matched_passages(
doc: dict[str, Any], body: str | None, doc_id: int | None
) -> str:
"""Render the RRF-matched chunks; '' when none can be rendered."""
by_id = {
c.get("chunk_id"): c
@ -150,7 +157,7 @@ def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
chunk = by_id.get(chunk_id)
if chunk is None:
continue
passage = _render_passage(chunk, body)
passage = _render_passage(chunk, body, doc_id)
if passage:
rendered.append(passage)
return "".join(rendered)
@ -194,11 +201,12 @@ def _format_hits(
path = paths.get(doc_id) if isinstance(doc_id, int) else None
body = bodies.get(doc_id) if isinstance(doc_id, int) else None
header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
f"\n path: {path}" if path else ""
)
passages = _matched_passages(doc, body)
passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
entry = header + (passages or _fallback_snippet(doc))
if total + len(entry) > _MAX_TOTAL_CHARS:
lines.append("\n<!-- additional matches truncated to fit context -->")
@ -207,8 +215,9 @@ def _format_hits(
total += len(entry)
lines.append(
"\n\nTo read a full document, delegate to the knowledge_base specialist "
"with `task`, referencing the path above."
"\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
"verbatim. To quote more context or read the full document, delegate to "
"the knowledge_base specialist with `task` using the path above."
)
lines.append("\n</knowledge_base_results>")
return "".join(lines)

View file

@ -51,20 +51,28 @@ def test_renders_matched_passage_not_top_of_doc() -> None:
assert "Intro paragraph." not in out
def test_includes_line_range_when_spans_present() -> None:
def test_emits_copyable_line_citation_token_when_spans_present() -> None:
out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
# "Matched passage here." sits on line 3 of the body.
assert "line 3" in out
# "Matched passage here." sits on line 3 of the body; the hit must surface
# a ready-to-copy token so the agent can cite without a separate read.
assert "[citation:d7#L3-3]" in out
def test_omits_line_range_when_spans_absent() -> None:
def test_header_includes_document_id() -> None:
out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
assert "id=7" in out
def test_omits_citation_token_when_spans_absent() -> None:
hit = _hit()
for chunk in hit["chunks"]:
chunk["start_char"] = None
chunk["end_char"] = None
out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
assert "Matched passage here." in out
assert "[line" not in out
# No concrete, copyable token for this document without spans (the closing
# instruction's placeholder template doesn't count).
assert "[citation:d7#L" not in out
def test_falls_back_to_content_when_no_matched_ids() -> None: