citations: rewrite model [n] ordinals to frontend [citation:] markers

This commit is contained in:
CREDO23 2026-06-25 06:48:25 +02:00
parent b043911325
commit 6bb20df510
5 changed files with 236 additions and 0 deletions

View file

@ -5,7 +5,9 @@ Server-side only; the model sees only the bare ``[n]``.
from __future__ import annotations
from .markers import to_frontend_payload
from .models import CitationEntry, CitationSourceType
from .normalizer import normalize_citations
from .registry import CitationRegistry, make_key
__all__ = [
@ -13,4 +15,6 @@ __all__ = [
"CitationRegistry",
"CitationSourceType",
"make_key",
"normalize_citations",
"to_frontend_payload",
]

View file

@ -0,0 +1,29 @@
"""Map a registered citation to the frontend ``[citation:<payload>]`` payload.
The citation renderer understands a chunk id (``42``), a negative chunk id for
anonymous uploads (``-3``), and a URL. This is the seam that turns a server-side
source into one the renderer can resolve; it grows as more source kinds become
renderable. Kinds with no renderable form yet return ``None`` so the marker is
dropped rather than emitted broken.
"""
from __future__ import annotations
from .models import CitationEntry, CitationSourceType
def to_frontend_payload(entry: CitationEntry) -> str | None:
    """Return the text that goes inside ``[citation:...]`` for ``entry``.

    Chunk-backed sources (KB and anonymous chunks) render as their stringified
    ``chunk_id``; web results render as their ``url``. A missing chunk id, a
    missing or empty URL, or any other source kind yields ``None`` so the
    caller can drop the marker instead of emitting a broken one.
    """
    fields = entry.locator
    kind = entry.source_type

    if kind in (CitationSourceType.KB_CHUNK, CitationSourceType.ANON_CHUNK):
        chunk_id = fields.get("chunk_id")
        if chunk_id is None:
            return None
        return str(chunk_id)

    if kind == CitationSourceType.WEB_RESULT:
        # An empty URL is as unusable as a missing one.
        return fields.get("url") or None

    return None
# Names exported by ``from <this module> import *``.
__all__ = ["to_frontend_payload"]

View file

@ -0,0 +1,60 @@
"""Rewrite model ``[n]`` citations into frontend ``[citation:<payload>]`` markers.
The model cites with tiny ordinals ``[n]``, one per bracket. Several citations
are just several brackets (``[1][2]`` or ``[1], [2]``). Each ordinal is resolved
through the registry and replaced with a marker the citation renderer
understands. Unknown or not-yet-renderable ordinals are dropped, so a bad
citation disappears rather than misleads. Code spans are left untouched.
"""
from __future__ import annotations
import re
from collections.abc import Callable
from .markers import to_frontend_payload
from .registry import CitationRegistry
# Code regions to skip: fenced blocks (```...```, which may span lines) and
# single-line inline spans (`...`). Mirrors the frontend's single code-region
# pattern so ordinals inside examples are never rewritten.
_CODE_REGION = re.compile(r"```[\s\S]*?```|`[^`\n]+`")
# A single ordinal in a bracket: `[1]`, `[12]`; whitespace inside the bracket
# (`[ 1 ]`) is tolerated and group 1 captures the digits. The lookbehind rejects
# a preceding word character, so index expressions like `arr[1]` are left alone,
# while adjacent citations like `[1][2]` (second bracket follows `]`) are both
# rewritten.
_ORDINAL = re.compile(r"(?<!\w)\[\s*(\d+)\s*\]")
def normalize_citations(text: str, registry: CitationRegistry) -> str:
    """Rewrite every ``[n]`` in ``text`` to its frontend marker.

    Ordinals that do not resolve to a renderable payload are removed. Code
    regions are passed through verbatim, and falsy input (such as the empty
    string) is returned unchanged.
    """
    if not text:
        return text

    substitute = _ordinal_rewriter(registry)

    def rewrite_prose(prose: str) -> str:
        # Applied only to the stretches of text that lie outside code regions.
        return _ORDINAL.sub(substitute, prose)

    return _outside_code(text, rewrite_prose)
def _ordinal_rewriter(registry: CitationRegistry) -> Callable[[re.Match[str]], str]:
    """Build the ``re.sub`` callback that turns one ``[n]`` match into a marker.

    The returned callable expects a match whose group 1 holds the ordinal's
    digits. It returns ``[citation:<payload>]`` when ``registry.resolve`` yields
    an entry with a renderable payload, and ``""`` (dropping the citation) in
    every other case.
    """

    def rewrite(match: re.Match[str]) -> str:
        try:
            ordinal = int(match.group(1))
        except ValueError:
            # CPython limits the length of str -> int conversions (4300 digits
            # by default since 3.11). A digit run that long cannot be a real
            # ordinal, so drop it like any other unknown citation rather than
            # letting one pathological bracket abort the whole rewrite.
            return ""
        entry = registry.resolve(ordinal)
        payload = to_frontend_payload(entry) if entry else None
        return f"[citation:{payload}]" if payload is not None else ""

    return rewrite
def _outside_code(text: str, transform: Callable[[str], str]) -> str:
    """Run ``transform`` over the text between code regions and reassemble.

    Everything matched by ``_CODE_REGION`` is copied through unchanged; each
    stretch before, between and after those matches (empty stretches included)
    is passed through ``transform``.
    """
    pieces: list[str] = []
    cursor = 0
    for code in _CODE_REGION.finditer(text):
        begin, end = code.span()
        pieces += (transform(text[cursor:begin]), text[begin:end])
        cursor = end
    # Trailing text after the last code region (or the whole text if none).
    pieces.append(transform(text[cursor:]))
    return "".join(pieces)
# Names exported by ``from <this module> import *``; the helpers stay private.
__all__ = ["normalize_citations"]

View file

@ -0,0 +1,49 @@
"""Tests for citation-entry → frontend payload mapping."""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.citations.markers import (
to_frontend_payload,
)
from app.agents.chat.multi_agent_chat.shared.citations.models import (
CitationEntry,
CitationSourceType,
)
pytestmark = pytest.mark.unit
def _entry(source_type: CitationSourceType, locator: dict) -> CitationEntry:
    """Build a citation entry with ``n=1`` for the given source kind and locator."""
    return CitationEntry(
        locator=locator,
        source_type=source_type,
        n=1,
    )
def test_kb_chunk_maps_to_chunk_id() -> None:
    """A KB chunk renders as its chunk id; other locator keys do not matter."""
    locator = {"chunk_id": 42, "document_id": 7}
    payload = to_frontend_payload(_entry(CitationSourceType.KB_CHUNK, locator))
    assert payload == "42"
def test_anon_chunk_keeps_negative_id() -> None:
    """An anonymous-upload chunk keeps the sign of its chunk id in the payload."""
    anon = _entry(CitationSourceType.ANON_CHUNK, {"chunk_id": -3})
    payload = to_frontend_payload(anon)
    assert payload == "-3"
def test_web_result_maps_to_url() -> None:
    """A web result renders as its URL, unchanged."""
    url = "https://example.com/a"
    web = _entry(CitationSourceType.WEB_RESULT, {"url": url})
    assert to_frontend_payload(web) == url
def test_not_yet_renderable_kind_is_dropped() -> None:
    """A source kind with no frontend form yields no payload."""
    locator = {"thread_id": 1, "turn": 2}
    chat_turn = _entry(CitationSourceType.CHAT_TURN, locator)
    assert to_frontend_payload(chat_turn) is None
def test_missing_locator_field_is_dropped() -> None:
    """A KB chunk whose locator lacks ``chunk_id`` yields no payload."""
    empty_locator: dict = {}
    incomplete = _entry(CitationSourceType.KB_CHUNK, empty_locator)
    assert to_frontend_payload(incomplete) is None

View file

@ -0,0 +1,94 @@
"""Tests for rewriting model ``[n]`` ordinals into frontend citation markers."""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.shared.citations.models import CitationSourceType
from app.agents.chat.multi_agent_chat.shared.citations.normalizer import (
normalize_citations,
)
from app.agents.chat.multi_agent_chat.shared.citations.registry import CitationRegistry
pytestmark = pytest.mark.unit
def _registry_with_chunks(*chunk_ids: int) -> CitationRegistry:
    """Return a registry with one KB chunk registered per id, in the given order."""
    populated = CitationRegistry()
    for cid in chunk_ids:
        locator = {"chunk_id": cid}
        populated.register(CitationSourceType.KB_CHUNK, locator)
    return populated
def test_single_ordinal_is_rewritten() -> None:
    """A lone ``[1]`` becomes the marker for the registered chunk."""
    rewritten = normalize_citations("We shipped it [1].", _registry_with_chunks(42))
    assert rewritten == "We shipped it [citation:42]."
def test_adjacent_brackets_are_each_rewritten() -> None:
    """Back-to-back brackets ``[1][2]`` are rewritten independently."""
    source = "Both agree [1][2]."
    expected = "Both agree [citation:42][citation:7]."
    assert normalize_citations(source, _registry_with_chunks(42, 7)) == expected
def test_comma_separated_brackets_are_each_rewritten() -> None:
    """Brackets separated by ``, `` are rewritten and the separator is kept."""
    source = "Both agree [1], [2]."
    expected = "Both agree [citation:42], [citation:7]."
    assert normalize_citations(source, _registry_with_chunks(42, 7)) == expected
def test_unknown_ordinal_is_dropped() -> None:
    """An ordinal the registry cannot resolve is removed from the text."""
    registry = _registry_with_chunks(42)
    # Only the bracket itself is removed: the space before it and the space
    # after it both remain, hence the double space in the expected text.
    assert normalize_citations("Maybe [9] is real.", registry) == "Maybe  is real."
def test_unknown_ordinal_among_known_is_dropped() -> None:
    """An unresolved ordinal next to a resolved one is removed on its own."""
    rewritten = normalize_citations("See [1][9].", _registry_with_chunks(42))
    assert rewritten == "See [citation:42]."
def test_web_result_rewrites_to_url() -> None:
    """A registered web result is cited by its URL."""
    web_registry = CitationRegistry()
    web_registry.register(
        CitationSourceType.WEB_RESULT,
        {"url": "https://example.com"},
    )
    rewritten = normalize_citations("Per the docs [1].", web_registry)
    assert rewritten == "Per the docs [citation:https://example.com]."
def test_index_expression_is_left_alone() -> None:
    """A bracket that directly follows a word character is not a citation."""
    prose = "Read arr[1] carefully."
    assert normalize_citations(prose, _registry_with_chunks(42)) == prose
def test_ordinals_inside_inline_code_are_untouched() -> None:
    """Inline code is copied verbatim while prose around it is still rewritten."""
    source = "Use `list[1]` here [1]."
    expected = "Use `list[1]` here [citation:42]."
    assert normalize_citations(source, _registry_with_chunks(42)) == expected
def test_ordinals_inside_fenced_code_are_untouched() -> None:
    """A fenced block is copied verbatim; prose on both sides is rewritten."""
    source = "Before [1].\n```\nx = a[1]\n```\nAfter [1]."
    expected = "Before [citation:42].\n```\nx = a[1]\n```\nAfter [citation:42]."
    rewritten = normalize_citations(source, _registry_with_chunks(42))
    assert rewritten == expected
def test_empty_text_is_returned_unchanged() -> None:
    """Empty input comes back as the empty string."""
    registry = _registry_with_chunks(42)
    rewritten = normalize_citations("", registry)
    assert rewritten == ""