mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
citations: normalize word-glued [n] markers
The model frequently writes citations glued to the preceding word (docs[17]); the (?<!\w) lookbehind (added to dodge arr[1] array indexing) silently skipped these, leaving raw [n] that fails to render and reads like array access. Drop the lookbehind so glued citations resolve; genuine code/array syntax stays protected by the existing code-region carve-out and unresolved ordinals still drop harmlessly.
This commit is contained in:
parent
ce15016533
commit
5bda944321
2 changed files with 30 additions and 7 deletions
|
|
@ -19,10 +19,14 @@ from .registry import CitationRegistry
|
|||
# code-region pattern so ordinals inside examples are never rewritten.
|
||||
_CODE_REGION = re.compile(r"```[\s\S]*?```|`[^`\n]+`")
|
||||
|
||||
# A single ordinal in a bracket: `[1]`, `[12]`. Not preceded by a word char, so
|
||||
# index expressions like `arr[1]` are left alone, while adjacent citations like
|
||||
# `[1][2]` (second bracket follows `]`) are both rewritten.
|
||||
_ORDINAL = re.compile(r"(?<!\w)\[\s*(\d+)\s*\]")
|
||||
# A single ordinal in a bracket: `[1]`, `[12]`. We deliberately match even when
|
||||
# glued to the preceding word (`docs[17]`) because the model very frequently
|
||||
# writes citations that way — requiring a non-word char before `[` (to dodge
|
||||
# `arr[1]`) silently skipped those citations, leaving raw `[n]` that both fails to
|
||||
# render and reads like array indexing. Genuine code/array syntax is instead
|
||||
# protected by the code-region carve-out below; an unresolved ordinal drops
|
||||
# harmlessly. Adjacent citations `[1][2]` are each rewritten.
|
||||
_ORDINAL = re.compile(r"\[\s*(\d+)\s*\]")
|
||||
|
||||
|
||||
def normalize_citations(text: str, registry: CitationRegistry) -> str:
|
||||
|
|
|
|||
|
|
@ -65,11 +65,30 @@ def test_web_result_rewrites_to_url() -> None:
|
|||
)
|
||||
|
||||
|
||||
def test_index_expression_is_left_alone() -> None:
|
||||
def test_word_glued_citation_is_rewritten() -> None:
|
||||
# The model frequently writes citations glued to the preceding word
|
||||
# (``docs[1]``); these must still resolve to a marker, not leak as raw text.
|
||||
registry = _registry_with_chunks(42)
|
||||
|
||||
assert normalize_citations("Read arr[1] carefully.", registry) == (
|
||||
"Read arr[1] carefully."
|
||||
assert normalize_citations("verifying against docs[1].", registry) == (
|
||||
"verifying against docs[citation:42]."
|
||||
)
|
||||
|
||||
|
||||
def test_word_glued_unknown_ordinal_drops() -> None:
    """A word-glued ordinal that does not resolve is removed from the text.

    The expected output carries neither a citation marker nor the raw
    ``[9]``: the bracket is not preserved as if it were array-index syntax.
    """
    source = "see notes[9] later"
    expected = "see notes later"

    # NOTE(review): presumably the registry only knows chunk 42, so ordinal 9
    # has nothing to resolve to -- confirm against _registry_with_chunks.
    rewritten = normalize_citations(source, _registry_with_chunks(42))

    assert rewritten == expected
|
||||
|
||||
|
||||
def test_array_index_inside_code_is_left_alone() -> None:
    """Index syntax inside an inline code span passes through unchanged."""
    snippet = "Read `arr[1]` carefully."

    result = normalize_citations(snippet, _registry_with_chunks(42))

    # The backticked span is a code region, so its ``[1]`` must not be
    # rewritten into a citation marker or removed.
    assert result == snippet
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue