From 12d938fabf29ae916d2a2654291df952c0f5d979 Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Fri, 17 Apr 2026 11:58:54 +0200
Subject: [PATCH] fix(core): UTF-8 char boundary panic in find_content_position
 (#16)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`search_from = abs_pos + 1` landed mid-char when a rejected match
started on a multi-byte UTF-8 character, panicking on the next
`markdown[search_from..]` slice. Advance by `needle.len()` instead —
always a valid char boundary, and skips the whole rejected match
instead of re-scanning inside it.

Repro: webclaw https://bruler.ru/about_brand -f json
Before: panic "byte index 782 is not a char boundary; it is inside 'Ч'"
After:  extracts 2.3KB of clean Cyrillic markdown with 7 sections

Two regression tests cover multi-byte rejected matches and
all-rejected cycles in Cyrillic text.

Closes #16

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                         |  7 ++++++
 Cargo.lock                           | 12 +++++-----
 Cargo.toml                           |  2 +-
 crates/webclaw-core/src/extractor.rs | 36 +++++++++++++++++++++++++++-
 4 files changed, 49 insertions(+), 8 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b923073..a49e066 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,13 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.3.18] — 2026-04-16
+
+### Fixed
+- **UTF-8 char boundary panic in `webclaw-core::extractor::find_content_position` (#16).** After rejecting a match that fell inside image syntax (`![...](...)`), the scan advanced `search_from` by a single byte. If the rejected match started on a multi-byte character (Cyrillic, CJK, accented Latin, emoji), the next `markdown[search_from..]` slice landed mid-char and panicked with `byte index N is not a char boundary; it is inside 'X'`. Repro was `webclaw https://bruler.ru/about_brand -f json`. Now advances by `needle.len()` — always a valid char boundary, and faster because it skips the whole rejected match instead of re-scanning inside it. Two regression tests cover multi-byte rejected matches and all-rejected cycles in Cyrillic text.
+
+---
+
 ## [0.3.17] — 2026-04-16
 
 ### Changed
diff --git a/Cargo.lock b/Cargo.lock
index a57de1b..479abad 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3102,7 +3102,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-cli"
-version = "0.3.17"
+version = "0.3.18"
 dependencies = [
  "clap",
  "dotenvy",
@@ -3123,7 +3123,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-core"
-version = "0.3.17"
+version = "0.3.18"
 dependencies = [
  "ego-tree",
  "once_cell",
@@ -3141,7 +3141,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-fetch"
-version = "0.3.17"
+version = "0.3.18"
 dependencies = [
  "bytes",
  "calamine",
@@ -3163,7 +3163,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-llm"
-version = "0.3.17"
+version = "0.3.18"
 dependencies = [
  "async-trait",
  "reqwest",
@@ -3176,7 +3176,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-mcp"
-version = "0.3.17"
+version = "0.3.18"
 dependencies = [
  "dirs",
  "dotenvy",
@@ -3197,7 +3197,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-pdf"
-version = "0.3.17"
+version = "0.3.18"
 dependencies = [
  "pdf-extract",
  "thiserror",
diff --git a/Cargo.toml b/Cargo.toml
index e757628..f36e439 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]
 
 [workspace.package]
-version = "0.3.17"
+version = "0.3.18"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
diff --git a/crates/webclaw-core/src/extractor.rs b/crates/webclaw-core/src/extractor.rs
index 7ba7fc8..132d585 100644
--- a/crates/webclaw-core/src/extractor.rs
+++ b/crates/webclaw-core/src/extractor.rs
@@ -684,7 +684,11 @@ fn find_content_position(markdown: &str, needle: &str) -> Option<usize> {
         if !is_inside_image_syntax(markdown, abs_pos) {
             return Some(abs_pos);
         }
-        search_from = abs_pos + 1;
+        // Advance past the rejected match. `abs_pos + needle.len()` is always a
+        // valid UTF-8 char boundary (end of the matched substring); `abs_pos + 1`
+        // is not, and panics on the next slice when the match starts on a
+        // multi-byte char (Cyrillic, CJK, accented Latin, emoji). See issue #16.
+        search_from = abs_pos + needle.len();
     }
     None
 }
@@ -859,6 +863,36 @@ mod tests {
         Html::parse_document(html)
     }
 
+    /// Regression: issue #16 — `find_content_position` used to advance
+    /// `search_from` by 1 byte after an image-syntax rejection, which landed
+    /// mid-char whenever the rejected match began with a multi-byte UTF-8 char
+    /// and panicked on the next `markdown[search_from..]` slice.
+    #[test]
+    fn find_content_position_does_not_panic_on_multibyte_rejected_match() {
+        // The needle must START with a multi-byte char to hit the old bug; an
+        // ASCII-initial needle keeps `abs_pos + 1` on a valid boundary. First
+        // hit sits inside image syntax (rejected), second is plain content.
+        let markdown =
+            "![alt Чистый вкус](/img.png) Наша история Brûler d'Amour. Чистый вкус appears here.";
+        let pos = find_content_position(markdown, "Чистый вкус");
+        assert_eq!(pos, markdown.rfind("Чистый вкус"), "second occurrence should be found");
+        assert!(
+            markdown.is_char_boundary(pos.unwrap()),
+            "returned offset must be a char boundary"
+        );
+    }
+
+    #[test]
+    fn find_content_position_survives_all_rejected_in_cyrillic() {
+        // Every occurrence of the Cyrillic needle sits inside image syntax, so
+        // the function must walk the whole string rejecting each one. With the
+        // `+1` bug the first rejection left `search_from` inside the leading
+        // 2-byte 'Ч' and the next slice panicked. Now it must return None.
+        let markdown =
+            "Наша история ![foo Чистый bar](a.png) Ещё текст ![Чистый](b.png) Конец 'Ч'";
+        assert_eq!(find_content_position(markdown, "Чистый"), None);
+    }
+
     /// Helper: extract with default options (backward-compatible).
     fn extract_default(doc: &Html, base_url: Option<&Url>) -> Content {
         extract_content(doc, base_url, &ExtractionOptions::default())