diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/__init__.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/__init__.py index f722717e9..91640483b 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/__init__.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/__init__.py @@ -5,7 +5,9 @@ Server-side only; the model sees only the bare ``[n]``. from __future__ import annotations +from .markers import to_frontend_payload from .models import CitationEntry, CitationSourceType +from .normalizer import normalize_citations from .registry import CitationRegistry, make_key __all__ = [ @@ -13,4 +15,6 @@ __all__ = [ "CitationRegistry", "CitationSourceType", "make_key", + "normalize_citations", + "to_frontend_payload", ] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/markers.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/markers.py new file mode 100644 index 000000000..7b6cae917 --- /dev/null +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/citations/markers.py @@ -0,0 +1,29 @@ +"""Map a registered citation to the frontend ``[citation:]`` payload. + +The citation renderer understands a chunk id (``42``), a negative chunk id for +anonymous uploads (``-3``), and a URL. This is the seam that turns a server-side +source into one the renderer can resolve; it grows as more source kinds become +renderable. Kinds with no renderable form yet return ``None`` so the marker is +dropped rather than emitted broken. 
def to_frontend_payload(entry: "CitationEntry") -> str | None:
    """Return the text placed inside ``[citation:...]`` for ``entry``.

    ``KB_CHUNK`` and ``ANON_CHUNK`` entries yield their locator's
    ``chunk_id`` rendered as a string; ``WEB_RESULT`` entries yield their
    locator's ``url``. Any other source kind, or a locator that lacks the
    needed field, yields ``None`` so the caller can drop the marker.
    """
    fields = entry.locator
    kind = entry.source_type

    if kind in (CitationSourceType.KB_CHUNK, CitationSourceType.ANON_CHUNK):
        chunk_id = fields.get("chunk_id")
        if chunk_id is None:
            return None
        return str(chunk_id)

    if kind == CitationSourceType.WEB_RESULT:
        # An empty URL is as unusable as a missing one.
        return fields.get("url") or None

    return None


__all__ = ["to_frontend_payload"]
# Fenced (```...```) and inline (`...`) code; mirrors the frontend's single
# code-region pattern so ordinals inside examples are never rewritten.
_CODE_REGION = re.compile(r"```[\s\S]*?```|`[^`\n]+`")

# A single ordinal in a bracket: `[1]`, `[12]`. Not preceded by a word char, so
# index expressions like `arr[1]` are left alone, while adjacent citations like
# `[1][2]` (second bracket follows `]`) are both rewritten.
_ORDINAL = re.compile(r"(?<!\w)\[(\d+)\]")


def normalize_citations(text: str, registry: "CitationRegistry") -> str:
    """Replace each ``[n]`` with its resolved marker; drop the unresolved.

    Args:
        text: Model output that may contain ``[n]`` ordinals.
        registry: Object whose ``resolve(n)`` returns the entry registered
            for ordinal ``n`` (or a falsy value when there is none).

    Returns:
        ``text`` with every ordinal outside code spans replaced by
        ``[citation:<payload>]`` or removed. Falsy input is returned as-is.
    """
    if not text:
        return text

    rewrite = _ordinal_rewriter(registry)
    return _outside_code(text, lambda span: _ORDINAL.sub(rewrite, span))


def _ordinal_rewriter(registry: "CitationRegistry") -> Callable[[re.Match[str]], str]:
    """Build the substitution that turns one ordinal into a marker (or drops it)."""

    def rewrite(match: re.Match[str]) -> str:
        try:
            ordinal = int(match.group(1))
        except ValueError:
            # ``int`` rejects digit runs longer than the interpreter's
            # string-conversion limit. Treat such a bracket like any other
            # unresolved ordinal instead of crashing the whole rewrite.
            return ""
        entry = registry.resolve(ordinal)
        payload = to_frontend_payload(entry) if entry else None
        return f"[citation:{payload}]" if payload is not None else ""

    return rewrite


def _outside_code(text: str, transform: Callable[[str], str]) -> str:
    """Apply ``transform`` to non-code spans only; code regions pass through verbatim."""
    parts = []
    last = 0
    for region in _CODE_REGION.finditer(text):
        # Text between the previous code region and this one is prose.
        parts.append(transform(text[last : region.start()]))
        parts.append(region.group(0))
        last = region.end()
    # Trailing prose after the final code region (or the whole text).
    parts.append(transform(text[last:]))
    return "".join(parts)


__all__ = ["normalize_citations"]
import pytest

from app.agents.chat.multi_agent_chat.shared.citations.markers import (
    to_frontend_payload,
)
from app.agents.chat.multi_agent_chat.shared.citations.models import (
    CitationEntry,
    CitationSourceType,
)
from app.agents.chat.multi_agent_chat.shared.citations.normalizer import (
    normalize_citations,
)
from app.agents.chat.multi_agent_chat.shared.citations.registry import CitationRegistry

pytestmark = pytest.mark.unit


# --- citation entry -> frontend payload (test_markers.py) ---


def _entry(source_type: CitationSourceType, locator: dict) -> CitationEntry:
    """Build a citation entry with ordinal 1 for the given kind and locator."""
    return CitationEntry(n=1, source_type=source_type, locator=locator)


def test_kb_chunk_maps_to_chunk_id() -> None:
    entry = _entry(CitationSourceType.KB_CHUNK, {"chunk_id": 42, "document_id": 7})

    assert to_frontend_payload(entry) == "42"


def test_anon_chunk_keeps_negative_id() -> None:
    entry = _entry(CitationSourceType.ANON_CHUNK, {"chunk_id": -3})

    assert to_frontend_payload(entry) == "-3"


def test_web_result_maps_to_url() -> None:
    entry = _entry(CitationSourceType.WEB_RESULT, {"url": "https://example.com/a"})

    assert to_frontend_payload(entry) == "https://example.com/a"


def test_not_yet_renderable_kind_is_dropped() -> None:
    entry = _entry(CitationSourceType.CHAT_TURN, {"thread_id": 1, "turn": 2})

    assert to_frontend_payload(entry) is None


def test_missing_locator_field_is_dropped() -> None:
    entry = _entry(CitationSourceType.KB_CHUNK, {})

    assert to_frontend_payload(entry) is None


# --- model ``[n]`` ordinals -> frontend markers (test_normalizer.py) ---


def _registry_with_chunks(*chunk_ids: int) -> CitationRegistry:
    """Register one KB chunk per id, in the order given."""
    registry = CitationRegistry()
    for chunk_id in chunk_ids:
        registry.register(CitationSourceType.KB_CHUNK, {"chunk_id": chunk_id})
    return registry


def test_single_ordinal_is_rewritten() -> None:
    registry = _registry_with_chunks(42)

    assert normalize_citations("We shipped it [1].", registry) == (
        "We shipped it [citation:42]."
    )


def test_adjacent_brackets_are_each_rewritten() -> None:
    registry = _registry_with_chunks(42, 7)

    assert normalize_citations("Both agree [1][2].", registry) == (
        "Both agree [citation:42][citation:7]."
    )


def test_comma_separated_brackets_are_each_rewritten() -> None:
    registry = _registry_with_chunks(42, 7)

    assert normalize_citations("Both agree [1], [2].", registry) == (
        "Both agree [citation:42], [citation:7]."
    )


def test_unknown_ordinal_is_dropped() -> None:
    registry = _registry_with_chunks(42)

    # Only the bracket is removed, so the spaces on both sides of it remain.
    assert normalize_citations("Maybe [9] is real.", registry) == "Maybe  is real."


def test_unknown_ordinal_among_known_is_dropped() -> None:
    registry = _registry_with_chunks(42)

    assert normalize_citations("See [1][9].", registry) == "See [citation:42]."


def test_web_result_rewrites_to_url() -> None:
    registry = CitationRegistry()
    registry.register(CitationSourceType.WEB_RESULT, {"url": "https://example.com"})

    assert normalize_citations("Per the docs [1].", registry) == (
        "Per the docs [citation:https://example.com]."
    )


def test_index_expression_is_left_alone() -> None:
    registry = _registry_with_chunks(42)

    assert normalize_citations("Read arr[1] carefully.", registry) == (
        "Read arr[1] carefully."
    )


def test_ordinals_inside_inline_code_are_untouched() -> None:
    registry = _registry_with_chunks(42)

    assert normalize_citations("Use `list[1]` here [1].", registry) == (
        "Use `list[1]` here [citation:42]."
    )


def test_ordinals_inside_fenced_code_are_untouched() -> None:
    registry = _registry_with_chunks(42)
    text = "Before [1].\n```\nx = a[1]\n```\nAfter [1]."

    assert normalize_citations(text, registry) == (
        "Before [citation:42].\n```\nx = a[1]\n```\nAfter [citation:42]."
    )


def test_empty_text_is_returned_unchanged() -> None:
    assert normalize_citations("", _registry_with_chunks(42)) == ""