From 12d938fabf29ae916d2a2654291df952c0f5d979 Mon Sep 17 00:00:00 2001 From: Valerio Date: Fri, 17 Apr 2026 11:58:54 +0200 Subject: [PATCH] fix(core): UTF-8 char boundary panic in find_content_position (#16) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `search_from = abs_pos + 1` landed mid-char when a rejected match started on a multi-byte UTF-8 character, panicking on the next `markdown[search_from..]` slice. Advance by `needle.len()` instead — always a valid char boundary, and skips the whole rejected match instead of re-scanning inside it. Repro: webclaw https://bruler.ru/about_brand -f json Before: panic "byte index 782 is not a char boundary; it is inside 'Ч'" After: extracts 2.3KB of clean Cyrillic markdown with 7 sections Two regression tests cover multi-byte rejected matches and all-rejected cycles in Cyrillic text. Closes #16 Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 7 ++++++ Cargo.lock | 12 +++++----- Cargo.toml | 2 +- crates/webclaw-core/src/extractor.rs | 36 +++++++++++++++++++++++++++- 4 files changed, 49 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b923073..a49e066 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.3.18] — 2026-04-16 + +### Fixed +- **UTF-8 char boundary panic in `webclaw-core::extractor::find_content_position` (#16).** After rejecting a match that fell inside image syntax (`![...](...)`), the scan advanced `search_from` by a single byte. If the rejected match started on a multi-byte character (Cyrillic, CJK, accented Latin, emoji), the next `markdown[search_from..]` slice landed mid-char and panicked with `byte index N is not a char boundary; it is inside 'X'`. Repro was `webclaw https://bruler.ru/about_brand -f json`. 
Now advances by `needle.len()` — always a valid char boundary, and faster because it skips the whole rejected match instead of re-scanning inside it. Two regression tests cover multi-byte rejected matches and all-rejected cycles in Cyrillic text. + +--- + ## [0.3.17] — 2026-04-16 ### Changed diff --git a/Cargo.lock b/Cargo.lock index a57de1b..479abad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3102,7 +3102,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.3.17" +version = "0.3.18" dependencies = [ "clap", "dotenvy", @@ -3123,7 +3123,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.3.17" +version = "0.3.18" dependencies = [ "ego-tree", "once_cell", @@ -3141,7 +3141,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.3.17" +version = "0.3.18" dependencies = [ "bytes", "calamine", @@ -3163,7 +3163,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.3.17" +version = "0.3.18" dependencies = [ "async-trait", "reqwest", @@ -3176,7 +3176,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.3.17" +version = "0.3.18" dependencies = [ "dirs", "dotenvy", @@ -3197,7 +3197,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.3.17" +version = "0.3.18" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index e757628..f36e439 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.3.17" +version = "0.3.18" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-core/src/extractor.rs b/crates/webclaw-core/src/extractor.rs index 7ba7fc8..132d585 100644 --- a/crates/webclaw-core/src/extractor.rs +++ b/crates/webclaw-core/src/extractor.rs @@ -684,7 +684,11 @@ fn find_content_position(markdown: &str, needle: &str) -> Option<usize> { if !is_inside_image_syntax(markdown, abs_pos) { return Some(abs_pos); } - search_from = 
abs_pos + 1; + // Advance past the rejected match. `abs_pos + needle.len()` is always a + // valid UTF-8 char boundary (end of the matched substring); `abs_pos + 1` + // is not guaranteed to be one, and panicked on the next slice when the match started on a + // multi-byte char (Cyrillic, CJK, accented Latin, emoji). See issue #16. NOTE(review): assumes `needle` is non-empty (a zero-length needle would not advance `search_from`) — confirm callers. + search_from = abs_pos + needle.len(); } None } @@ -859,6 +863,36 @@ mod tests { Html::parse_document(html) } + /// Regression: issue #16 — `find_content_position` used to advance + /// `search_from` by 1 byte after an image-syntax rejection, which landed + /// mid-char when the rejected match started on a multi-byte UTF-8 char (Cyrillic, CJK, accented Latin, emoji) + /// and panicked on the next `markdown[search_from..]` slice. + #[test] + fn find_content_position_does_not_panic_on_multibyte_rejected_match() { + // `needle` appears first inside image syntax (must be rejected), then + // again as plain content after a block of Cyrillic prose. NOTE(review): this needle starts + // with an ASCII byte, so the old `abs_pos + 1` bump was still a char boundary here; reproducing the original panic needs a needle that starts with a multi-byte char (e.g. 'Ч') — confirm. + let markdown = + "![alt needle text](/img.png) Наша история Brûler d'Amour. needle text appears here."; + let pos = find_content_position(markdown, "needle text"); + assert!(pos.is_some(), "second occurrence should be found"); + assert!( + markdown.is_char_boundary(pos.unwrap()), + "returned offset must be a char boundary" + ); + } + + #[test] + fn find_content_position_survives_all_rejected_in_cyrillic() { + // Every occurrence of `needle` sits inside image syntax, so the + // function must walk the whole string rejecting each one and + // return None cleanly. NOTE(review): `needle` is ASCII, so the old `+1` bump stayed on a + // char boundary for this input; this exercises the all-rejected path rather than the original panic — confirm. + let markdown = + "Наша история ![foo needle bar](a.png) Ещё текст ![needle](b.png) Конец 'Ч'"; + assert_eq!(find_content_position(markdown, "needle"), None); + } + /// Helper: extract with default options (backward-compatible). 
fn extract_default(doc: &Html, base_url: Option<&Url>) -> Content { extract_content(doc, base_url, &ExtractionOptions::default())