citations: normalize word-glued [n] markers

The model frequently writes citations glued to the preceding word
(docs[17]); the (?<!\w) lookbehind (added to dodge arr[1] array indexing)
silently skipped these, leaving raw [n] that fails to render and reads
like array access. Drop the lookbehind so glued citations resolve. Array/index
syntax inside inline code or fenced blocks stays protected by the existing
code-region carve-out; a glued [n] in plain prose is now treated as a citation,
and one that does not resolve is dropped.
This commit is contained in:
CREDO23 2026-06-25 16:28:31 +02:00
parent ce15016533
commit 5bda944321
2 changed files with 30 additions and 7 deletions

View file

@ -19,10 +19,14 @@ from .registry import CitationRegistry
# code-region pattern so ordinals inside examples are never rewritten.
_CODE_REGION = re.compile(r"```[\s\S]*?```|`[^`\n]+`")
# A single ordinal in a bracket: `[1]`, `[12]`. Not preceded by a word char, so
# index expressions like `arr[1]` are left alone, while adjacent citations like
# `[1][2]` (second bracket follows `]`) are both rewritten.
_ORDINAL = re.compile(r"(?<!\w)\[\s*(\d+)\s*\]")
# A single ordinal in a bracket: `[1]`, `[12]`. We deliberately match even when
# glued to the preceding word (`docs[17]`) because the model very frequently
# writes citations that way — requiring a non-word char before `[` (to dodge
# `arr[1]`) silently skipped those citations, leaving raw `[n]` that both fails to
# render and reads like array indexing. Genuine code/array syntax is protected
# only when it sits inside a code region (`_CODE_REGION`: fenced blocks and
# inline backtick spans); outside one, a glued `[n]` is treated as a citation and
# an unresolved ordinal is dropped. Adjacent citations `[1][2]` are each rewritten.
_ORDINAL = re.compile(r"\[\s*(\d+)\s*\]")
def normalize_citations(text: str, registry: CitationRegistry) -> str:

View file

@ -65,11 +65,30 @@ def test_web_result_rewrites_to_url() -> None:
)
def test_index_expression_is_left_alone() -> None:
def test_word_glued_citation_is_rewritten() -> None:
# The model frequently writes citations glued to the preceding word
# (``docs[1]``); these must still resolve to a marker, not leak as raw text.
registry = _registry_with_chunks(42)
assert normalize_citations("Read arr[1] carefully.", registry) == (
"Read arr[1] carefully."
assert normalize_citations("verifying against docs[1].", registry) == (
"verifying against docs[citation:42]."
)
def test_word_glued_unknown_ordinal_drops() -> None:
# A glued ordinal that doesn't resolve drops harmlessly (no broken marker,
# no raw ``[n]`` leak) rather than being preserved as array-index syntax.
registry = _registry_with_chunks(42)
assert normalize_citations("see notes[9] later", registry) == "see notes later"
def test_array_index_inside_code_is_left_alone() -> None:
# Array/index syntax inside an inline code span is protected by the code-region carve-out.
registry = _registry_with_chunks(42)
assert normalize_citations("Read `arr[1]` carefully.", registry) == (
"Read `arr[1]` carefully."
)