citations: rewrite model [n] ordinals to frontend [citation:] markers

2026-06-28 21:49:40 +02:00 · 2026-06-25 06:48:25 +02:00 · 2026-06-25 06:48:25 +02:00 · 6bb20df510
commit 6bb20df510
parent b043911325
5 changed files with 236 additions and 0 deletions
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/__init__.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/__init__.py
@@ -5,7 +5,9 @@ Server-side only; the model sees only the bare ``[n]``.

 from __future__ import annotations

+from .markers import to_frontend_payload
 from .models import CitationEntry, CitationSourceType
+from .normalizer import normalize_citations
 from .registry import CitationRegistry, make_key

 __all__ = [
@@ -13,4 +15,6 @@ __all__ = [
    "CitationRegistry",
    "CitationSourceType",
    "make_key",
+    "normalize_citations",
+    "to_frontend_payload",
 ]
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/markers.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/markers.py
@@ -0,0 +1,29 @@
+"""Map a registered citation to the frontend ``[citation:<payload>]`` payload.
+
+The citation renderer understands a chunk id (``42``), a negative chunk id for
+anonymous uploads (``-3``), and a URL. This is the seam that turns a server-side
+source into one the renderer can resolve; it grows as more source kinds become
+renderable. Kinds with no renderable form yet return ``None`` so the marker is
+dropped rather than emitted broken.
+"""
+
+from __future__ import annotations
+
+from .models import CitationEntry, CitationSourceType
+
+
+def to_frontend_payload(entry: CitationEntry) -> str | None:
+    """Return the payload written after ``citation:``, or ``None`` if not renderable."""
+    fields, kind = entry.locator, entry.source_type
+    if kind in (CitationSourceType.KB_CHUNK, CitationSourceType.ANON_CHUNK):
+        # Compare against ``None`` explicitly so a chunk id of 0 is still emitted.
+        raw_id = fields.get("chunk_id")
+        return None if raw_id is None else str(raw_id)
+    if kind == CitationSourceType.WEB_RESULT:
+        # A missing or empty URL collapses to ``None``.
+        return fields.get("url") or None
+    # Every other source kind has no renderable form here.
+    return None
+
+
+__all__ = ["to_frontend_payload"]
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/normalizer.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/normalizer.py
@@ -0,0 +1,60 @@
+"""Rewrite model ``[n]`` citations into frontend ``[citation:<payload>]`` markers.
+
+The model cites with tiny ordinals ``[n]`` — one per bracket. Several citations
+are just several brackets (``[1][2]`` or ``[1], [2]``). Each ordinal is resolved
+through the registry and replaced with a marker the citation renderer
+understands. Unknown or not-yet-renderable ordinals are dropped, so a bad
+citation disappears rather than misleads. Code spans are left untouched.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Callable
+
+from .markers import to_frontend_payload
+from .registry import CitationRegistry
+
+# Code regions: a ```fence``` (ends at the nearest closing fence) or a one-line
+# `inline` span. Mirrors the frontend's pattern so ordinals in code stay verbatim.
+_CODE_REGION = re.compile(r"```[\s\S]*?```|`[^`\n]+`")
+
+# One ordinal per bracket: `[1]`, `[12]`. Not preceded by a word char, so `arr[1]`
+# is left alone while adjacent `[1][2]` both match. Capped at 9 digits: longer runs
+# are not ordinals and, unbounded, could exceed int()'s parse limit (ValueError).
+_ORDINAL = re.compile(r"(?<!\w)\[\s*(\d{1,9})\s*\]")
+
+
+def normalize_citations(text: str, registry: CitationRegistry) -> str:
+    """Swap every resolvable ``[n]`` outside code for its marker; delete the rest."""
+    if text:
+        to_marker = _ordinal_rewriter(registry)
+        # Only prose segments reach the substitution; code regions pass through.
+        return _outside_code(text, lambda prose: _ORDINAL.sub(to_marker, prose))
+    return text
+
+
+def _ordinal_rewriter(registry: CitationRegistry) -> Callable[[re.Match[str]], str]:
+    """Return a ``re.sub`` callback bound to ``registry``: marker text, or ``""``."""
+
+    def to_marker(hit: re.Match[str]) -> str:
+        resolved = registry.resolve(int(hit[1]))
+        inner = to_frontend_payload(resolved) if resolved else None
+        return "" if inner is None else f"[citation:{inner}]"
+
+    return to_marker
+
+
+def _outside_code(text: str, transform: Callable[[str], str]) -> str:
+    """Run ``transform`` over the text between code regions; copy code as-is."""
+    pieces: list[str] = []
+    cursor = 0
+    for code in _CODE_REGION.finditer(text):
+        begin, end = code.span()
+        pieces += [transform(text[cursor:begin]), code[0]]
+        cursor = end
+    pieces.append(transform(text[cursor:]))
+    return "".join(pieces)
+
+
+__all__ = ["normalize_citations"]
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/citations/test_markers.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/citations/test_markers.py
@@ -0,0 +1,49 @@
+"""Tests for citation-entry → frontend payload mapping."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.chat.multi_agent_chat.shared.citations.markers import (
+    to_frontend_payload,
+)
+from app.agents.chat.multi_agent_chat.shared.citations.models import (
+    CitationEntry,
+    CitationSourceType,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _entry(source_type: CitationSourceType, locator: dict) -> CitationEntry:
+    return CitationEntry(locator=locator, source_type=source_type, n=1)
+
+
+def test_kb_chunk_maps_to_chunk_id() -> None:
+    """A KB chunk is cited by its chunk id, rendered as a string."""
+    kb_chunk = _entry(CitationSourceType.KB_CHUNK, {"chunk_id": 42, "document_id": 7})
+    assert to_frontend_payload(kb_chunk) == "42"
+
+
+def test_anon_chunk_keeps_negative_id() -> None:
+    """An anonymous-upload chunk keeps the minus sign of its id."""
+    anon_chunk = _entry(CitationSourceType.ANON_CHUNK, {"chunk_id": -3})
+    assert to_frontend_payload(anon_chunk) == "-3"
+
+
+def test_web_result_maps_to_url() -> None:
+    """A web result is cited by its URL, passed through unchanged."""
+    web_hit = _entry(CitationSourceType.WEB_RESULT, {"url": "https://example.com/a"})
+    assert to_frontend_payload(web_hit) == "https://example.com/a"
+
+
+def test_not_yet_renderable_kind_is_dropped() -> None:
+    """A source kind without a frontend form yields no payload."""
+    chat_turn = _entry(CitationSourceType.CHAT_TURN, {"thread_id": 1, "turn": 2})
+    assert to_frontend_payload(chat_turn) is None
+
+
+def test_missing_locator_field_is_dropped() -> None:
+    """A KB chunk whose locator lacks ``chunk_id`` yields no payload."""
+    bare_chunk = _entry(CitationSourceType.KB_CHUNK, {})
+    assert to_frontend_payload(bare_chunk) is None
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/citations/test_normalizer.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/shared/citations/test_normalizer.py
@@ -0,0 +1,94 @@
+"""Tests for rewriting model ``[n]`` ordinals into frontend citation markers."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.chat.multi_agent_chat.shared.citations.models import CitationSourceType
+from app.agents.chat.multi_agent_chat.shared.citations.normalizer import (
+    normalize_citations,
+)
+from app.agents.chat.multi_agent_chat.shared.citations.registry import CitationRegistry
+
+pytestmark = pytest.mark.unit
+
+
+def _registry_with_chunks(*chunk_ids: int) -> CitationRegistry:
+    seeded = CitationRegistry()
+    for locator in ({"chunk_id": cid} for cid in chunk_ids):
+        seeded.register(CitationSourceType.KB_CHUNK, locator)
+    return seeded
+
+
+def test_single_ordinal_is_rewritten() -> None:
+    """A lone ``[1]`` is replaced by the marker for the only registered chunk."""
+    chunks = _registry_with_chunks(42)
+    rewritten = normalize_citations("We shipped it [1].", chunks)
+
+    assert rewritten == "We shipped it [citation:42]."
+
+
+def test_adjacent_brackets_are_each_rewritten() -> None:
+    """Back-to-back ``[1][2]`` resolve independently, in registration order."""
+    chunks = _registry_with_chunks(42, 7)
+    rewritten = normalize_citations("Both agree [1][2].", chunks)
+
+    assert rewritten == "Both agree [citation:42][citation:7]."
+
+
+def test_comma_separated_brackets_are_each_rewritten() -> None:
+    """Brackets split by ``, `` are each rewritten; the separator is kept."""
+    chunks = _registry_with_chunks(42, 7)
+    rewritten = normalize_citations("Both agree [1], [2].", chunks)
+
+    assert rewritten == "Both agree [citation:42], [citation:7]."
+
+
+def test_unknown_ordinal_is_dropped() -> None:
+    """An unregistered ordinal is removed; the spaces around it are kept."""
+    chunks = _registry_with_chunks(42)
+    assert normalize_citations("Maybe [9] is real.", chunks) == "Maybe  is real."
+
+
+def test_unknown_ordinal_among_known_is_dropped() -> None:
+    """Dropping an unknown ordinal leaves its resolvable neighbour intact."""
+    chunks = _registry_with_chunks(42)
+    assert normalize_citations("See [1][9].", chunks) == "See [citation:42]."
+
+
+def test_web_result_rewrites_to_url() -> None:
+    """A web-result ordinal is rewritten to a marker carrying the URL."""
+    sources = CitationRegistry()
+    sources.register(CitationSourceType.WEB_RESULT, {"url": "https://example.com"})
+    rewritten = normalize_citations("Per the docs [1].", sources)
+
+    assert rewritten == "Per the docs [citation:https://example.com]."
+
+
+def test_index_expression_is_left_alone() -> None:
+    """A bracket glued to an identifier (``arr[1]``) is not treated as a citation."""
+    chunks = _registry_with_chunks(42)
+    rewritten = normalize_citations("Read arr[1] carefully.", chunks)
+
+    assert rewritten == "Read arr[1] carefully."
+
+
+def test_ordinals_inside_inline_code_are_untouched() -> None:
+    """Inline code is copied verbatim while the prose ordinal is still rewritten."""
+    chunks = _registry_with_chunks(42)
+    rewritten = normalize_citations("Use `list[1]` here [1].", chunks)
+
+    assert rewritten == "Use `list[1]` here [citation:42]."
+
+
+def test_ordinals_inside_fenced_code_are_untouched() -> None:
+    """A fenced block is copied verbatim; ordinals on either side are rewritten."""
+    chunks = _registry_with_chunks(42)
+    source = "Before [1].\n```\nx = a[1]\n```\nAfter [1]."
+    expected = "Before [citation:42].\n```\nx = a[1]\n```\nAfter [citation:42]."
+
+    assert normalize_citations(source, chunks) == expected
+
+
+def test_empty_text_is_returned_unchanged() -> None:
+    assert normalize_citations(text="", registry=_registry_with_chunks(42)) == ""