fix: harden resource limits, path safety, and WASM build (#46)

Security audit follow-up across the workspace:

- webclaw-core: keep the crate WASM-safe. quickjs/rquickjs is now a cfg(not(target_arch = "wasm32")) target dependency and the extraction entry point uses a direct call on wasm instead of spawning a thread, so it builds and runs on wasm32 with or without default features.
- webclaw-core: bound the structured-data scrubber recursion (depth cap) so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the stack.
- webclaw-fetch: stream the response body with a running ceiling so a small highly compressed payload cannot inflate to gigabytes in memory; redact user:pass@ from proxy URLs before they reach error strings.
- webclaw-cli: contain output filenames inside the chosen directory (reject .. / absolute, drop traversal path segments), run --webhook URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s, and make research slug truncation char-safe.
- webclaw-mcp: char-safe slug truncation (no multibyte slice panic).
- setup.sh / deploy/hetzner.sh: replace eval on read input with printf -v, and mask auth key / API token in console output.
- CI: enforce the wasm32 build invariant for webclaw-core.

Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
2026-07-26 07:51:01 +02:00 · 2026-05-19 17:03:52 +02:00 · 2026-05-19 17:03:52 +02:00 · be8bcfebd9
commit be8bcfebd9
parent aab51bea91
13 changed files with 454 additions and 47 deletions
--- a/crates/webclaw-core/Cargo.toml
+++ b/crates/webclaw-core/Cargo.toml
@ -20,6 +20,11 @@ url = { version = "2", features = ["serde"] }
 regex = "1"
 once_cell = "1"
 similar = "2"
+
+# rquickjs links a C library and cannot build for wasm32. Gating it per
+# target keeps the `quickjs` feature usable on native while leaving the
+# crate WASM-safe even with default features enabled.
+[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }

 [dev-dependencies]
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -9,7 +9,7 @@ pub mod diff;
 pub mod domain;
 pub mod error;
 pub mod extractor;
-#[cfg(feature = "quickjs")]
+#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
 pub mod js_eval;
 pub mod llm;
 pub mod markdown;
@ -46,9 +46,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
 /// `url`     — optional source URL, used for resolving relative links and domain detection
 /// `options` — controls include/exclude selectors, main content mode, and raw HTML output
 ///
-/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
-/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
-/// main-thread stack on Windows.
+/// On native targets, spawns extraction on a thread with an 8 MB stack to
+/// handle deeply nested HTML (e.g., Express.co.uk live blogs) without
+/// overflowing the default 1-2 MB main-thread stack on Windows.
+///
+/// On `wasm32`, threads are unavailable (`std::thread::spawn` panics at
+/// runtime), so extraction runs inline on the caller's stack.
+#[cfg(not(target_arch = "wasm32"))]
 pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
@ -70,6 +74,16 @@ pub fn extract_with_options(
        .unwrap_or(Err(ExtractError::NoContent))
 }

+/// `wasm32` variant: WASM has no threads, so extraction runs inline on the caller's stack.
+#[cfg(target_arch = "wasm32")]
+pub fn extract_with_options(
+    html: &str,
+    url: Option<&str>,
+    options: &ExtractionOptions,
+) -> Result<ExtractionResult, ExtractError> {
+    extract_with_options_inner(html, url, options) // direct call: no thread, no enlarged stack
+}
+
 fn extract_with_options_inner(
    html: &str,
    url: Option<&str>,
@ -187,7 +201,7 @@ fn extract_with_options_inner(
    // QuickJS: execute inline <script> tags to capture JS-assigned data blobs
    // (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
    // static JSON data island extraction above with runtime-evaluated data.
-    #[cfg(feature = "quickjs")]
+    #[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
    {
        let blobs = js_eval::extract_js_data(html);
        if !blobs.is_empty() {
@ -603,4 +617,36 @@ mod tests {
            "Should extract content from deep nesting"
        );
    }
+
+    #[test]
+    fn wasm_direct_call_path_extracts_content() {
+        // On wasm32, `extract_with_options` calls `extract_with_options_inner`
+        // inline (no thread spawn). Call the inner function directly so that
+        // code path stays covered on native CI, then check it yields the same
+        // markdown as the public entry point (threaded on native targets).
+        let html = r#"
+        <html lang="en">
+        <head><title>WASM Path</title></head>
+        <body><article><h1>Heading</h1><p>WASM-safe extraction body content.</p></article></body>
+        </html>"#;
+        let opts = ExtractionOptions::default();
+
+        let inner = extract_with_options_inner(html, Some("https://example.com"), &opts)
+            .expect("inner extraction (wasm path) should succeed");
+        assert!(
+            inner
+                .content
+                .markdown
+                .contains("WASM-safe extraction body content"),
+            "wasm direct-call path should extract body, got: {}",
+            inner.content.markdown
+        );
+
+        let threaded = extract_with_options(html, Some("https://example.com"), &opts)
+            .expect("threaded extraction should succeed");
+        assert_eq!(
+            inner.content.markdown, threaded.content.markdown,
+            "wasm path and threaded path must produce identical content"
+        );
+    }
 }
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -58,7 +58,7 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
        .cloned()
        .collect();
    for value in &mut useful {
-        scrub_body_fields(value);
+        scrub_body_fields(value, 0);
    }
    if !useful.is_empty() {
        let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
@ -117,10 +117,21 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
 }

 /// Recursively remove long fields that duplicate the rendered markdown body.
-fn scrub_body_fields(v: &mut serde_json::Value) {
+///
+/// `depth` guards against stack exhaustion from attacker-controlled
+/// JSON-LD / `__NEXT_DATA__` blobs with pathological nesting: past
+/// `MAX_SCRUB_DEPTH` levels we stop descending and leave the subtree
+/// as-is (it is still size-capped by the `STRUCTURED_DATA_MAX_BYTES`
+/// budget in `to_llm_text`).
+fn scrub_body_fields(v: &mut serde_json::Value, depth: usize) {
    const BODY_KEYS: &[&str] = &["articleBody"];
    const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
    const LONG_THRESHOLD: usize = 500;
+    const MAX_SCRUB_DEPTH: usize = 64;
+
+    if depth >= MAX_SCRUB_DEPTH {
+        return;
+    }

    match v {
        serde_json::Value::Object(map) => {
@ -136,12 +147,12 @@ fn scrub_body_fields(v: &mut serde_json::Value) {
                true
            });
            for value in map.values_mut() {
-                scrub_body_fields(value);
+                scrub_body_fields(value, depth + 1);
            }
        }
        serde_json::Value::Array(values) => {
            for value in values {
-                scrub_body_fields(value);
+                scrub_body_fields(value, depth + 1);
            }
        }
        _ => {}
@ -908,4 +919,53 @@ mod tests {
            "Compact untyped array dropped: {out}"
        );
    }
+
+    /// Follow the single `"n"` child link from `value` and return the depth
+    /// (root = 0) of the first object that still has an `articleBody` key,
+    /// i.e. was NOT scrubbed; `None` if the chain ends without finding one.
+    fn first_unscrubbed_article_body_depth(mut value: &serde_json::Value) -> Option<usize> {
+        let mut depth = 0;
+        loop {
+            let obj = value.as_object()?;
+            if obj.contains_key("articleBody") {
+                return Some(depth);
+            }
+            value = obj.get("n")?;
+            depth += 1;
+        }
+    }
+
+    #[test]
+    fn scrub_body_fields_bounds_recursion_on_deep_nesting() {
+        // Attacker-controlled JSON-LD / __NEXT_DATA__ with pathological
+        // nesting must not recurse without bound. Build an 81-level chain
+        // (root = depth 0), past the 64-level cap, where every level carries
+        // a scrub-able `articleBody`. Depths 0..=63 get scrubbed; depth 64
+        // keeps its `articleBody` because recursion stopped there — that is
+        // the bound we assert. (Kept shallow on purpose: serde_json drops
+        // Values recursively, so a 10k-deep value would overflow the stack
+        // just being dropped.)
+        const DEPTH: usize = 80;
+        let mut node = serde_json::json!({ "articleBody": "x".repeat(600) });
+        for _ in 0..DEPTH {
+            node = serde_json::json!({
+                "articleBody": "x".repeat(600),
+                "n": node,
+            });
+        }
+
+        scrub_body_fields(&mut node, 0);
+
+        let stopped_at = first_unscrubbed_article_body_depth(&node)
+            .expect("recursion must stop and leave a deep articleBody intact");
+        // Shallower levels were scrubbed; the first survivor sits right at the cap.
+        assert_eq!(
+            stopped_at, 64,
+            "recursion should stop at the depth cap, stopped at {stopped_at}"
+        );
+        assert!(
+            node.as_object().unwrap().get("articleBody").is_none(),
+            "shallow articleBody must still be scrubbed"
+        );
+    }
 }