mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-28 21:49:40 +02:00
citations: rewrite model [n] ordinals to frontend [citation:] markers
This commit is contained in:
parent
b043911325
commit
6bb20df510
5 changed files with 236 additions and 0 deletions
|
|
@ -5,7 +5,9 @@ Server-side only; the model sees only the bare ``[n]``.
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
from .markers import to_frontend_payload
|
||||
from .models import CitationEntry, CitationSourceType
|
||||
from .normalizer import normalize_citations
|
||||
from .registry import CitationRegistry, make_key
|
||||
|
||||
__all__ = [
|
||||
|
|
@ -13,4 +15,6 @@ __all__ = [
|
|||
"CitationRegistry",
|
||||
"CitationSourceType",
|
||||
"make_key",
|
||||
"normalize_citations",
|
||||
"to_frontend_payload",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
"""Map a registered citation to the frontend ``[citation:<payload>]`` payload.
|
||||
|
||||
The citation renderer understands a chunk id (``42``), a negative chunk id for
|
||||
anonymous uploads (``-3``), and a URL. This is the seam that turns a server-side
|
||||
source into one the renderer can resolve; it grows as more source kinds become
|
||||
renderable. Kinds with no renderable form yet return ``None`` so the marker is
|
||||
dropped rather than emitted broken.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .models import CitationEntry, CitationSourceType
|
||||
|
||||
|
||||
def to_frontend_payload(entry: CitationEntry) -> str | None:
    """Return the text placed inside ``[citation:<payload>]`` for ``entry``.

    Chunk-backed sources (``KB_CHUNK``, ``ANON_CHUNK``) yield the locator's
    ``chunk_id`` as a string; ``WEB_RESULT`` yields the locator's ``url``.
    ``None`` is returned when the needed locator field is absent (or the URL
    is empty) and for every other source type, so the caller can drop the
    marker instead of emitting one that cannot be rendered.
    """
    fields = entry.locator
    kind = entry.source_type

    if kind == CitationSourceType.KB_CHUNK or kind == CitationSourceType.ANON_CHUNK:
        chunk_id = fields.get("chunk_id")
        if chunk_id is None:
            return None
        return str(chunk_id)

    if kind == CitationSourceType.WEB_RESULT:
        # A missing key and an empty string are both treated as "no URL".
        return fields.get("url") or None

    # Any other source kind has no renderable form here.
    return None
|
||||
|
||||
|
||||
__all__ = ["to_frontend_payload"]
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
"""Rewrite model ``[n]`` citations into frontend ``[citation:<payload>]`` markers.
|
||||
|
||||
The model cites with tiny ordinals ``[n]`` — one per bracket. Several citations
|
||||
are just several brackets (``[1][2]`` or ``[1], [2]``). Each ordinal is resolved
|
||||
through the registry and replaced with a marker the citation renderer
|
||||
understands. Unknown or not-yet-renderable ordinals are dropped, so a bad
|
||||
citation disappears rather than misleads. Code spans are left untouched.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
|
||||
from .markers import to_frontend_payload
|
||||
from .registry import CitationRegistry
|
||||
|
||||
# Regions the rewriter must never touch: fenced blocks (```...```) are tried
# first, then single-line inline spans (`...`). One combined pattern, mirroring
# the frontend's code-region pattern, so ordinals shown in examples survive.
_CODE_REGION: re.Pattern[str] = re.compile(r"```[\s\S]*?```|`[^`\n]+`")

# Exactly one ordinal per bracket (`[1]`, `[12]`), captured as group 1. The
# lookbehind rejects a bracket glued to a word character, which keeps index
# expressions such as `arr[1]` intact; in `[1][2]` the second bracket follows
# `]`, so both citations still match.
_ORDINAL: re.Pattern[str] = re.compile(r"(?<!\w)\[\s*(\d+)\s*\]")
|
||||
|
||||
|
||||
def normalize_citations(text: str, registry: CitationRegistry) -> str:
    """Rewrite every ``[n]`` outside code regions via ``registry``.

    Falsy ``text`` is handed back as-is. Otherwise each non-code span has its
    bracketed ordinals substituted by the rewriter built from ``registry``;
    ordinals the rewriter cannot turn into a marker are replaced with nothing.
    """
    if not text:
        return text

    substitute = _ordinal_rewriter(registry)

    def rewrite_span(span: str) -> str:
        # Runs once per stretch of prose between code regions.
        return _ORDINAL.sub(substitute, span)

    return _outside_code(text, rewrite_span)
|
||||
|
||||
|
||||
def _ordinal_rewriter(registry: CitationRegistry) -> Callable[[re.Match[str]], str]:
    """Build the ``re.sub`` callback that turns one ``[n]`` match into a marker.

    The returned function reads the ordinal from group 1 of the match, looks it
    up with ``registry.resolve`` and returns ``[citation:<payload>]``. It
    returns an empty string, dropping the citation, when the ordinal does not
    resolve, when the entry has no frontend payload, or when the digit run is
    too long for ``int`` to parse.
    """

    def rewrite(match: re.Match[str]) -> str:
        try:
            ordinal = int(match.group(1))
        except ValueError:
            # CPython 3.11+ refuses to convert very long digit strings
            # (default limit: 4300 digits). Such a bracket cannot be a real
            # ordinal, so drop it like any other unknown citation rather than
            # letting the exception abort the whole rewrite.
            return ""
        entry = registry.resolve(ordinal)
        payload = to_frontend_payload(entry) if entry else None
        return f"[citation:{payload}]" if payload is not None else ""

    return rewrite
|
||||
|
||||
|
||||
def _outside_code(text: str, transform: Callable[[str], str]) -> str:
    """Run ``transform`` over the prose between code regions and rejoin.

    Text matched by ``_CODE_REGION`` is copied through unchanged; everything
    before, between and after those matches is passed to ``transform`` (the
    trailing remainder is always transformed, even when it is empty).
    """
    pieces: list[str] = []
    cursor = 0
    for code in _CODE_REGION.finditer(text):
        start, end = code.span()
        pieces.append(transform(text[cursor:start]))
        pieces.append(code.group(0))
        cursor = end
    pieces.append(transform(text[cursor:]))
    return "".join(pieces)
|
||||
|
||||
|
||||
__all__ = ["normalize_citations"]
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
"""Tests for citation-entry → frontend payload mapping."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations.markers import (
|
||||
to_frontend_payload,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.citations.models import (
|
||||
CitationEntry,
|
||||
CitationSourceType,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _entry(source_type: CitationSourceType, locator: dict) -> CitationEntry:
    """Build a ``CitationEntry`` with ``n=1`` for the given source type and locator."""
    fields = {"n": 1, "source_type": source_type, "locator": locator}
    return CitationEntry(**fields)
|
||||
|
||||
|
||||
def test_kb_chunk_maps_to_chunk_id() -> None:
    """A KB chunk's payload is its ``chunk_id`` as a string; extra keys do not matter."""
    locator = {"chunk_id": 42, "document_id": 7}
    payload = to_frontend_payload(_entry(CitationSourceType.KB_CHUNK, locator))
    assert payload == "42"
|
||||
|
||||
|
||||
def test_anon_chunk_keeps_negative_id() -> None:
    """An anonymous chunk keeps the sign of its negative ``chunk_id``."""
    locator = {"chunk_id": -3}
    payload = to_frontend_payload(_entry(CitationSourceType.ANON_CHUNK, locator))
    assert payload == "-3"
|
||||
|
||||
|
||||
def test_web_result_maps_to_url() -> None:
    """A web result's payload is its URL, unchanged."""
    locator = {"url": "https://example.com/a"}
    payload = to_frontend_payload(_entry(CitationSourceType.WEB_RESULT, locator))
    assert payload == "https://example.com/a"
|
||||
|
||||
|
||||
def test_not_yet_renderable_kind_is_dropped() -> None:
    """A source kind without a renderable form (here ``CHAT_TURN``) yields ``None``."""
    locator = {"thread_id": 1, "turn": 2}
    payload = to_frontend_payload(_entry(CitationSourceType.CHAT_TURN, locator))
    assert payload is None
|
||||
|
||||
|
||||
def test_missing_locator_field_is_dropped() -> None:
    """A KB chunk whose locator has no ``chunk_id`` yields ``None``."""
    empty_locator: dict = {}
    payload = to_frontend_payload(_entry(CitationSourceType.KB_CHUNK, empty_locator))
    assert payload is None
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
"""Tests for rewriting model ``[n]`` ordinals into frontend citation markers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.citations.models import CitationSourceType
|
||||
from app.agents.chat.multi_agent_chat.shared.citations.normalizer import (
|
||||
normalize_citations,
|
||||
)
|
||||
from app.agents.chat.multi_agent_chat.shared.citations.registry import CitationRegistry
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _registry_with_chunks(*chunk_ids: int) -> CitationRegistry:
    """Return a fresh registry with one ``KB_CHUNK`` registered per id, in order.

    The tests in this module expect ``[1]`` to refer to the first id passed,
    ``[2]`` to the second, and so on.
    """
    populated = CitationRegistry()
    for cid in chunk_ids:
        locator = {"chunk_id": cid}
        populated.register(CitationSourceType.KB_CHUNK, locator)
    return populated
|
||||
|
||||
|
||||
def test_single_ordinal_is_rewritten() -> None:
    """A lone ``[1]`` becomes the marker for the first registered chunk."""
    rewritten = normalize_citations("We shipped it [1].", _registry_with_chunks(42))
    assert rewritten == "We shipped it [citation:42]."
|
||||
|
||||
|
||||
def test_adjacent_brackets_are_each_rewritten() -> None:
    """Back-to-back brackets ``[1][2]`` are rewritten independently."""
    rewritten = normalize_citations("Both agree [1][2].", _registry_with_chunks(42, 7))
    assert rewritten == "Both agree [citation:42][citation:7]."
|
||||
|
||||
|
||||
def test_comma_separated_brackets_are_each_rewritten() -> None:
    """Brackets separated by ``, `` are rewritten and the separator is kept."""
    rewritten = normalize_citations("Both agree [1], [2].", _registry_with_chunks(42, 7))
    assert rewritten == "Both agree [citation:42], [citation:7]."
|
||||
|
||||
|
||||
def test_unknown_ordinal_is_dropped() -> None:
    """An ordinal the registry cannot resolve is removed from the text.

    Only the bracket itself is deleted; the space on each side of it stays, so
    the expected output contains two consecutive spaces.
    """
    registry = _registry_with_chunks(42)

    assert normalize_citations("Maybe [9] is real.", registry) == "Maybe  is real."
|
||||
|
||||
|
||||
def test_unknown_ordinal_among_known_is_dropped() -> None:
    """An unresolved ordinal next to a resolved one vanishes; the resolved one stays."""
    rewritten = normalize_citations("See [1][9].", _registry_with_chunks(42))
    assert rewritten == "See [citation:42]."
|
||||
|
||||
|
||||
def test_web_result_rewrites_to_url() -> None:
    """An ordinal that resolves to a web result is rewritten to a URL marker."""
    web_registry = CitationRegistry()
    web_registry.register(CitationSourceType.WEB_RESULT, {"url": "https://example.com"})

    rewritten = normalize_citations("Per the docs [1].", web_registry)

    assert rewritten == "Per the docs [citation:https://example.com]."
|
||||
|
||||
|
||||
def test_index_expression_is_left_alone() -> None:
    """A bracket attached to a word (``arr[1]``) is not treated as a citation."""
    prose = "Read arr[1] carefully."
    assert normalize_citations(prose, _registry_with_chunks(42)) == "Read arr[1] carefully."
|
||||
|
||||
|
||||
def test_ordinals_inside_inline_code_are_untouched() -> None:
    """Inline code passes through verbatim while prose ordinals are rewritten."""
    rewritten = normalize_citations("Use `list[1]` here [1].", _registry_with_chunks(42))
    assert rewritten == "Use `list[1]` here [citation:42]."
|
||||
|
||||
|
||||
def test_ordinals_inside_fenced_code_are_untouched() -> None:
    """A fenced block is preserved; ordinals before and after it are rewritten."""
    source = "Before [1].\n```\nx = a[1]\n```\nAfter [1]."
    expected = "Before [citation:42].\n```\nx = a[1]\n```\nAfter [citation:42]."

    assert normalize_citations(source, _registry_with_chunks(42)) == expected
|
||||
|
||||
|
||||
def test_empty_text_is_returned_unchanged() -> None:
    """Empty input comes back as the empty string."""
    registry = _registry_with_chunks(42)
    assert normalize_citations("", registry) == ""
|
||||
Loading…
Add table
Add a link
Reference in a new issue