fix: harden LLM providers, UTF-8 handling, and webhook/batch reliability

- webclaw-llm: add explicit request + connect timeouts to the reqwest client in every provider (anthropic, openai, ollama) with a shorter timeout on the ollama health check, so a stalled provider fails fast.
- webclaw-llm: fix a panic when truncating a provider error body that contains multibyte characters near the 500-char cut (char-safe take).
- webclaw-core: snap the endpoint-scan budget cut to a UTF-8 char boundary so oversized scripts with non-ASCII content no longer panic.
- webclaw-core: rewrite js_literal_to_json to copy raw bytes instead of `byte as char`, preserving multibyte UTF-8 in SvelteKit string values rather than producing Latin-1 mojibake.
- webclaw-cli: have fire_webhook return its JoinHandle and await it at the crawl/batch/batch-llm call sites, removing the fixed 500ms sleeps.
- webclaw-mcp: drop the up-front DNS pre-validation loop in batch that aborted the whole request on one bad URL; the fetch layer already applies the same SSRF guard per URL and reports per-URL errors.
- webclaw-fetch: include the port in the warmup homepage URL so hosts on a non-default port are warmed correctly.

Adds regression tests for the UTF-8 endpoint-scan and SvelteKit cases.
2026-06-23 02:48:06 +02:00 · 2026-06-09 21:10:15 +02:00 · 2026-06-09 21:10:15 +02:00 · 499345046c
commit 499345046c
parent d0d7b835f2
9 changed files with 117 additions and 51 deletions
--- a/crates/webclaw-core/src/endpoints.rs
+++ b/crates/webclaw-core/src/endpoints.rs
@ -233,7 +233,13 @@ pub fn extract_endpoints(
        }
        let slice = if text.len() > *budget {
            *truncated = true;
-            &text[..*budget]
+            // Snap the cut to a UTF-8 char boundary so non-ASCII content
+            // (multibyte codepoints straddling the budget) can't panic.
+            let mut cut = (*budget).min(text.len());
+            while cut > 0 && !text.is_char_boundary(cut) {
+                cut -= 1;
+            }
+            &text[..cut]
        } else {
            text
        };
@ -512,4 +518,16 @@ mod tests {
        );
        assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
    }
+
+    #[test]
+    fn scan_truncation_at_non_ascii_boundary_does_not_panic() {
+        // A bundle larger than the scan budget, padded with a multibyte char
+        // ('é' is 2 bytes) so the cut can land mid-codepoint. The old
+        // `&text[..budget]` slice panicked in that case; the boundary snap
+        // must not.
+        // NOTE(review): the padding starts at byte 0, so every 'é' begins on an
+        // even offset and only an odd cut offset splits a codepoint. Confirm the
+        // budget left when this bundle is scanned is odd; otherwise prepend one
+        // ASCII byte to the padding so the regression path is actually hit.
+        let pad = "é".repeat(MAX_SCAN_BYTES); // 2 × MAX_SCAN_BYTES bytes of padding
+        let bundle = format!("{pad} fetch(\"/api/x\")");
+        let bundles = vec![("big.js".to_string(), bundle)];
+        let r = extract_endpoints("<html></html>", "https://example.com/", &bundles);
+        assert!(r.truncated, "oversized bundle should mark truncated");
+    }
 }
--- a/crates/webclaw-core/src/structured_data.rs
+++ b/crates/webclaw-core/src/structured_data.rs
@ -178,7 +178,12 @@ pub fn extract_sveltekit(html: &str) -> Vec<Value> {
 /// Preserves already-quoted keys and string values.
 fn js_literal_to_json(input: &str) -> String {
    let bytes = input.as_bytes();
-    let mut out = String::with_capacity(input.len() + input.len() / 10);
+    // Accumulate raw bytes, not `byte as char`. The input is valid UTF-8 and we
+    // only ever copy its bytes verbatim or insert ASCII quotes, so the result is
+    // guaranteed valid UTF-8 — copying byte-by-byte preserves multibyte
+    // codepoints (e.g. accented/CJK string values) instead of mangling them
+    // into Latin-1 mojibake.
+    let mut out: Vec<u8> = Vec::with_capacity(input.len() + input.len() / 10);
    let mut i = 0;
    let len = bytes.len();

@ -187,14 +192,14 @@ fn js_literal_to_json(input: &str) -> String {

        // Skip through strings
        if b == b'"' {
-            out.push('"');
+            out.push(b'"');
            i += 1;
            while i < len {
                let c = bytes[i];
-                out.push(c as char);
+                out.push(c);
                i += 1;
                if c == b'\\' && i < len {
-                    out.push(bytes[i] as char);
+                    out.push(bytes[i]);
                    i += 1;
                } else if c == b'"' {
                    break;
@ -205,11 +210,11 @@ fn js_literal_to_json(input: &str) -> String {

        // After {, [ or , — look for unquoted key followed by :
        if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
-            out.push(b as char);
+            out.push(b);
            i += 1;
            // Skip whitespace
            while i < len && bytes[i].is_ascii_whitespace() {
-                out.push(bytes[i] as char);
+                out.push(bytes[i]);
                i += 1;
            }
            // Check if next is an unquoted identifier (key)
@ -218,29 +223,30 @@ fn js_literal_to_json(input: &str) -> String {
                while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                    i += 1;
                }
-                let key = &input[key_start..i];
+                let key = &bytes[key_start..i];
                // Skip whitespace after key
                while i < len && bytes[i].is_ascii_whitespace() {
                    i += 1;
                }
                // If followed by :, it's an unquoted key — quote it
                if i < len && bytes[i] == b':' {
-                    out.push('"');
-                    out.push_str(key);
-                    out.push('"');
+                    out.push(b'"');
+                    out.extend_from_slice(key);
+                    out.push(b'"');
                } else {
                    // Not a key — might be a bare value like true/false/null
-                    out.push_str(key);
+                    out.extend_from_slice(key);
                }
            }
            continue;
        }

-        out.push(b as char);
+        out.push(b);
        i += 1;
    }

-    out
+    // Safe: we only copied bytes from valid-UTF-8 `input` plus ASCII quotes.
+    String::from_utf8(out).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
 }

 /// Replace raw newlines/tabs inside JSON string values with escape sequences.
@ -440,4 +446,17 @@ newline"}"#;
        assert_eq!(parsed["text"], "line1\nline2");
        assert_eq!(parsed["raw"], "has\nnewline");
    }
+
+    #[test]
+    fn js_literal_to_json_preserves_multibyte_utf8() {
+        // Unquoted ASCII keys with accented, CJK, and emoji string values (the
+        // shape SvelteKit emits), covering 2-, 3-, and 4-byte UTF-8 sequences.
+        // The old `byte as char` path turned the multibyte values into Latin-1
+        // mojibake; they must now survive intact.
+        let input = r#"{name:"déjà vu", city:"東京", emoji:"🌱"}"#;
+        let json = js_literal_to_json(input);
+        let parsed: Value = serde_json::from_str(&json).unwrap();
+        assert_eq!(parsed["name"], "déjà vu");
+        assert_eq!(parsed["city"], "東京");
+        assert_eq!(parsed["emoji"], "🌱");
+    }
 }