Mirror of https://github.com/MODSetter/SurfSense.git (synced 2026-06-26 21:39:43 +02:00)
feat: embed line-citation tokens in search hits
This commit is contained in:
parent 188ae053ac
commit 73dd4e8e3a
2 changed files with 38 additions and 21 deletions
|
|
@@ -112,20 +112,25 @@ async def _resolve_doc_context(
|
||||||
return paths, bodies
|
return paths, bodies
|
||||||
|
|
||||||
|
|
||||||
def _line_label(chunk: dict[str, Any], body: str | None) -> str:
|
def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
|
||||||
"""``[lines X-Y]`` for a span-bearing chunk, or '' when spans are absent."""
|
"""Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
|
||||||
start = chunk.get("start_char")
|
start = chunk.get("start_char")
|
||||||
end = chunk.get("end_char")
|
end = chunk.get("end_char")
|
||||||
if not body or not isinstance(start, int) or not isinstance(end, int):
|
if (
|
||||||
|
not body
|
||||||
|
or not isinstance(doc_id, int)
|
||||||
|
or not isinstance(start, int)
|
||||||
|
or not isinstance(end, int)
|
||||||
|
):
|
||||||
return ""
|
return ""
|
||||||
start_line, end_line = char_span_to_line_range(body, start, end)
|
start_line, end_line = char_span_to_line_range(body, start, end)
|
||||||
if start_line == end_line:
|
return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
|
||||||
return f"[line {start_line}]"
|
|
||||||
return f"[lines {start_line}-{end_line}]"
|
|
||||||
|
|
||||||
|
|
||||||
def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
|
def _render_passage(
|
||||||
"""Render one matched chunk as an indented, line-annotated passage."""
|
chunk: dict[str, Any], body: str | None, doc_id: int | None
|
||||||
|
) -> str | None:
|
||||||
|
"""Render one matched chunk as an indented passage tagged with its token."""
|
||||||
content = (chunk.get("content") or "").strip()
|
content = (chunk.get("content") or "").strip()
|
||||||
if not content:
|
if not content:
|
||||||
return None
|
return None
|
||||||
|
|
@@ -133,12 +138,14 @@ def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
|
||||||
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
||||||
snippet += " ..."
|
snippet += " ..."
|
||||||
indented = snippet.replace("\n", "\n ")
|
indented = snippet.replace("\n", "\n ")
|
||||||
label = _line_label(chunk, body)
|
token = _citation_token(chunk, body, doc_id)
|
||||||
head = f"\n {label}" if label else ""
|
head = f"\n {token}" if token else ""
|
||||||
return f"{head}\n {indented}"
|
return f"{head}\n {indented}"
|
||||||
|
|
||||||
|
|
||||||
def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
|
def _matched_passages(
|
||||||
|
doc: dict[str, Any], body: str | None, doc_id: int | None
|
||||||
|
) -> str:
|
||||||
"""Render the RRF-matched chunks; '' when none can be rendered."""
|
"""Render the RRF-matched chunks; '' when none can be rendered."""
|
||||||
by_id = {
|
by_id = {
|
||||||
c.get("chunk_id"): c
|
c.get("chunk_id"): c
|
||||||
|
|
@@ -150,7 +157,7 @@ def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
|
||||||
chunk = by_id.get(chunk_id)
|
chunk = by_id.get(chunk_id)
|
||||||
if chunk is None:
|
if chunk is None:
|
||||||
continue
|
continue
|
||||||
passage = _render_passage(chunk, body)
|
passage = _render_passage(chunk, body, doc_id)
|
||||||
if passage:
|
if passage:
|
||||||
rendered.append(passage)
|
rendered.append(passage)
|
||||||
return "".join(rendered)
|
return "".join(rendered)
|
||||||
|
|
@@ -194,11 +201,12 @@ def _format_hits(
|
||||||
path = paths.get(doc_id) if isinstance(doc_id, int) else None
|
path = paths.get(doc_id) if isinstance(doc_id, int) else None
|
||||||
body = bodies.get(doc_id) if isinstance(doc_id, int) else None
|
body = bodies.get(doc_id) if isinstance(doc_id, int) else None
|
||||||
|
|
||||||
header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
|
id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
|
||||||
|
header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
|
||||||
f"\n path: {path}" if path else ""
|
f"\n path: {path}" if path else ""
|
||||||
)
|
)
|
||||||
|
|
||||||
passages = _matched_passages(doc, body)
|
passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
|
||||||
entry = header + (passages or _fallback_snippet(doc))
|
entry = header + (passages or _fallback_snippet(doc))
|
||||||
if total + len(entry) > _MAX_TOTAL_CHARS:
|
if total + len(entry) > _MAX_TOTAL_CHARS:
|
||||||
lines.append("\n<!-- additional matches truncated to fit context -->")
|
lines.append("\n<!-- additional matches truncated to fit context -->")
|
||||||
|
|
@@ -207,8 +215,9 @@ def _format_hits(
|
||||||
total += len(entry)
|
total += len(entry)
|
||||||
|
|
||||||
lines.append(
|
lines.append(
|
||||||
"\n\nTo read a full document, delegate to the knowledge_base specialist "
|
"\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
|
||||||
"with `task`, referencing the path above."
|
"verbatim. To quote more context or read the full document, delegate to "
|
||||||
|
"the knowledge_base specialist with `task` using the path above."
|
||||||
)
|
)
|
||||||
lines.append("\n</knowledge_base_results>")
|
lines.append("\n</knowledge_base_results>")
|
||||||
return "".join(lines)
|
return "".join(lines)
|
||||||
|
|
|
||||||
|
|
@@ -51,20 +51,28 @@ def test_renders_matched_passage_not_top_of_doc() -> None:
|
||||||
assert "Intro paragraph." not in out
|
assert "Intro paragraph." not in out
|
||||||
|
|
||||||
|
|
||||||
def test_includes_line_range_when_spans_present() -> None:
|
def test_emits_copyable_line_citation_token_when_spans_present() -> None:
|
||||||
out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
||||||
# "Matched passage here." sits on line 3 of the body.
|
# "Matched passage here." sits on line 3 of the body; the hit must surface
|
||||||
assert "line 3" in out
|
# a ready-to-copy token so the agent can cite without a separate read.
|
||||||
|
assert "[citation:d7#L3-3]" in out
|
||||||
|
|
||||||
|
|
||||||
def test_omits_line_range_when_spans_absent() -> None:
|
def test_header_includes_document_id() -> None:
|
||||||
|
out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
||||||
|
assert "id=7" in out
|
||||||
|
|
||||||
|
|
||||||
|
def test_omits_citation_token_when_spans_absent() -> None:
|
||||||
hit = _hit()
|
hit = _hit()
|
||||||
for chunk in hit["chunks"]:
|
for chunk in hit["chunks"]:
|
||||||
chunk["start_char"] = None
|
chunk["start_char"] = None
|
||||||
chunk["end_char"] = None
|
chunk["end_char"] = None
|
||||||
out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
|
||||||
assert "Matched passage here." in out
|
assert "Matched passage here." in out
|
||||||
assert "[line" not in out
|
# No concrete, copyable token for this document without spans (the closing
|
||||||
|
# instruction's placeholder template doesn't count).
|
||||||
|
assert "[citation:d7#L" not in out
|
||||||
|
|
||||||
|
|
||||||
def test_falls_back_to_content_when_no_matched_ids() -> None:
|
def test_falls_back_to_content_when_no_matched_ids() -> None:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue