fix(core): UTF-8 char boundary panic in find_content_position (#16) (#24)

`search_from = abs_pos + 1` landed mid-char when a rejected match
started on a multi-byte UTF-8 character, panicking on the next
`markdown[search_from..]` slice. Advance by `needle.len()` instead —
always a valid char boundary, and skips the whole rejected match
instead of re-scanning inside it.

Repro: webclaw https://bruler.ru/about_brand -f json
Before: panic "byte index 782 is not a char boundary; it is inside 'Ч'"
After:  extracts 2.3KB of clean Cyrillic markdown with 7 sections

Two regression tests cover multi-byte rejected matches and
all-rejected cycles in Cyrillic text.

Closes #16

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-04-17 12:02:52 +02:00 committed by GitHub
parent 095ae5d4b1
commit 7f0420bbf0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 49 additions and 8 deletions

View file

@ -684,7 +684,11 @@ fn find_content_position(markdown: &str, needle: &str) -> Option<usize> {
if !is_inside_image_syntax(markdown, abs_pos) {
return Some(abs_pos);
}
search_from = abs_pos + 1;
// Advance past the rejected match. `abs_pos + needle.len()` is always a
// valid UTF-8 char boundary (end of the matched substring); `abs_pos + 1`
// is not guaranteed to be one, and panicked on the next slice whenever the
// match started on a multi-byte char (Cyrillic, CJK, accented Latin, emoji).
// See issue #16.
search_from = abs_pos + needle.len();
}
None
}
@ -859,6 +863,36 @@ mod tests {
Html::parse_document(html)
}
/// Regression: issue #16 — `find_content_position` used to advance
/// `search_from` by 1 byte after an image-syntax rejection. When the
/// rejected match began with a multi-byte UTF-8 character (Cyrillic, CJK,
/// accented Latin, emoji) that bump landed mid-char and the next
/// `markdown[search_from..]` slice panicked.
#[test]
fn find_content_position_does_not_panic_on_multibyte_rejected_match() {
    // The needle must START with a multi-byte char ('Ч' is 2 bytes). With an
    // ASCII-leading needle, `abs_pos + 1` is still a char boundary, so the old
    // bug would never fire and this test would guard nothing.
    let needle = "Чистая история";
    // The needle appears first inside image alt text (expected to be rejected
    // as image syntax), then again as plain content after Cyrillic prose.
    let markdown =
        "![alt Чистая история](/img.png) Наша история Brûler d'Amour. Чистая история appears here.";
    let pos = find_content_position(markdown, needle).expect("second occurrence should be found");
    assert!(
        markdown.is_char_boundary(pos),
        "returned offset must be a char boundary"
    );
    // Safe to slice now that the boundary has been checked.
    assert!(
        markdown[pos..].starts_with(needle),
        "returned offset must point at an occurrence of the needle"
    );
}
/// Regression: issue #16 — when every occurrence of the needle is rejected,
/// `find_content_position` has to step past each one in turn. Each step must
/// land on a char boundary even though the needle begins with a multi-byte
/// character.
#[test]
fn find_content_position_survives_all_rejected_in_cyrillic() {
    // The needle starts with 'Ч' (2 bytes), so the old `abs_pos + 1` bump would
    // land mid-char after the first rejection and panic on the next slice.
    // An ASCII-leading needle would not exercise that path.
    let needle = "Чай";
    // Both occurrences sit inside image alt text, so each one is expected to
    // be rejected and the search should finish with `None`.
    let markdown = "Наша история ![foo Чай bar](a.png) Ещё текст ![Чай](b.png) Конец";
    assert_eq!(find_content_position(markdown, needle), None);
}
/// Helper: extract with default options (backward-compatible).
fn extract_default(doc: &Html, base_url: Option<&Url>) -> Content {
extract_content(doc, base_url, &ExtractionOptions::default())