mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix(core): UTF-8 char boundary panic in find_content_position (#16)
`search_from = abs_pos + 1` landed mid-char when a rejected match started on a multi-byte UTF-8 character, panicking on the next `markdown[search_from..]` slice. Advance by `needle.len()` instead — always a valid char boundary, and skips the whole rejected match instead of re-scanning inside it. Repro: `webclaw https://bruler.ru/about_brand -f json`. Before: panic "byte index 782 is not a char boundary; it is inside 'Ч'". After: extracts 2.3KB of clean Cyrillic markdown with 7 sections. Two regression tests cover multi-byte rejected matches and all-rejected cycles in Cyrillic text. Closes #16. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
095ae5d4b1
commit
12d938fabf
4 changed files with 49 additions and 8 deletions
|
|
@ -3,6 +3,13 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.3.18] — 2026-04-16
|
||||
|
||||
### Fixed
|
||||
- **UTF-8 char boundary panic in `webclaw-core::extractor::find_content_position` (#16).** After rejecting a match that fell inside image syntax (`![alt](url)`), the scan advanced `search_from` by a single byte. If the rejected match started on a multi-byte character (Cyrillic, CJK, accented Latin, emoji), the next `markdown[search_from..]` slice landed mid-char and panicked with `byte index N is not a char boundary; it is inside 'X'`. Repro was `webclaw https://bruler.ru/about_brand -f json`. Now advances by `needle.len()` — always a valid char boundary, and faster because it skips the whole rejected match instead of re-scanning inside it. Two regression tests cover multi-byte rejected matches and all-rejected cycles in Cyrillic text.
|
||||
|
||||
---
|
||||
|
||||
## [0.3.17] — 2026-04-16
|
||||
|
||||
### Changed
|
||||
|
|
|
|||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3102,7 +3102,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3123,7 +3123,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3141,7 +3141,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"calamine",
|
||||
|
|
@ -3163,7 +3163,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3176,7 +3176,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3197,7 +3197,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.3.17"
|
||||
version = "0.3.18"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -684,7 +684,11 @@ fn find_content_position(markdown: &str, needle: &str) -> Option<usize> {
|
|||
if !is_inside_image_syntax(markdown, abs_pos) {
|
||||
return Some(abs_pos);
|
||||
}
|
||||
search_from = abs_pos + 1;
|
||||
// Advance past the rejected match. `abs_pos + needle.len()` is always a
|
||||
// valid UTF-8 char boundary (end of the matched substring); `abs_pos + 1`
|
||||
// is not, and panics on the next slice when the match starts on a
|
||||
// multi-byte char (Cyrillic, CJK, accented Latin, emoji). See issue #16.
|
||||
search_from = abs_pos + needle.len();
|
||||
}
|
||||
None
|
||||
}
|
||||
|
|
@ -859,6 +863,36 @@ mod tests {
|
|||
Html::parse_document(html)
|
||||
}
|
||||
|
||||
/// Regression: issue #16 — `find_content_position` used to advance
/// `search_from` by 1 byte after an image-syntax rejection, which landed
/// mid-char on multi-byte UTF-8 input (Cyrillic, CJK, accented Latin, emoji)
/// and panicked on the next `markdown[search_from..]` slice.
#[test]
fn find_content_position_does_not_panic_on_multibyte_rejected_match() {
    // The needle deliberately starts with 'Ч', a 2-byte UTF-8 character:
    // only then does `abs_pos + 1` fall inside a character. An ASCII needle
    // can never trigger the original panic.
    let needle = "Чайная история";
    // First occurrence is the alt text of an image (expected to be rejected);
    // the second is plain content after a block of Cyrillic/accented prose.
    let markdown =
        "![Чайная история](hero.png) Наша история Brûler d'Amour. Чайная история appears here.";

    let pos = find_content_position(markdown, needle)
        .expect("second occurrence should be found");

    assert!(
        markdown.is_char_boundary(pos),
        "returned offset must be a char boundary"
    );
    // Safe to slice: the boundary was asserted just above.
    assert!(
        markdown[pos..].starts_with(needle),
        "returned offset must point at an occurrence of the needle"
    );
}
|
||||
|
||||
/// Regression: issue #16 — when every occurrence of the needle is rejected,
/// the scan must walk the whole string and return `None` without ever
/// slicing `markdown` at a non-char-boundary offset.
#[test]
fn find_content_position_survives_all_rejected_in_cyrillic() {
    // Every occurrence of the needle sits inside image syntax, so the
    // function must reject each one in turn. The needle starts with the
    // 2-byte character 'Ч', so with the old `+ 1` advance `search_from`
    // landed mid-char right after the first rejection and the next slice
    // panicked. With the fix it should return None cleanly.
    let markdown = "Наша история ![Чай](a.png) Ещё текст ![Чай](b.png) Конец";
    assert_eq!(find_content_position(markdown, "Чай"), None);
}
|
||||
|
||||
/// Helper: extract with default options (backward-compatible).
|
||||
fn extract_default(doc: &Html, base_url: Option<&Url>) -> Content {
|
||||
extract_content(doc, base_url, &ExtractionOptions::default())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue