fix: harden resource limits, path safety, and WASM build (#46)

Security audit follow-up across the workspace:

- webclaw-core: keep the crate WASM-safe. rquickjs (pulled in by the
  `quickjs` feature) is now a cfg(not(target_arch = "wasm32")) target
  dependency, and the extraction entry point calls the extractor directly
  on wasm instead of spawning a thread, so the crate builds and runs on
  wasm32 with or without default features.
- webclaw-core: bound the structured-data scrubber recursion (depth cap)
  so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the
  stack.
- webclaw-fetch: stream the response body with a running ceiling so a
  small highly compressed payload cannot inflate to gigabytes in memory;
  redact user:pass@ from proxy URLs before they reach error strings.
- webclaw-cli: contain output filenames inside the chosen directory
  (reject .. / absolute, drop traversal path segments), run --webhook
  URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s,
  and make research slug truncation char-safe.
- webclaw-mcp: char-safe slug truncation (no multibyte slice panic).
- setup.sh / deploy/hetzner.sh: replace eval on read input with
  printf -v, and mask auth key / API token in console output.
- CI: enforce the wasm32 build invariant for webclaw-core.

Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
This commit is contained in:
Valerio 2026-05-19 17:03:52 +02:00 committed by GitHub
parent aab51bea91
commit be8bcfebd9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 454 additions and 47 deletions

View file

@ -20,6 +20,11 @@ url = { version = "2", features = ["serde"] }
regex = "1"
once_cell = "1"
similar = "2"
# rquickjs links a C library and cannot build for wasm32. Gating it per
# target keeps the `quickjs` feature usable on native while leaving the
# crate WASM-safe even with default features enabled.
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
[dev-dependencies]

View file

@ -9,7 +9,7 @@ pub mod diff;
pub mod domain;
pub mod error;
pub mod extractor;
#[cfg(feature = "quickjs")]
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
pub mod js_eval;
pub mod llm;
pub mod markdown;
@ -46,9 +46,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
/// `url` — optional source URL, used for resolving relative links and domain detection
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
///
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
/// main-thread stack on Windows.
/// On native targets, spawns extraction on a thread with an 8 MB stack to
/// handle deeply nested HTML (e.g., Express.co.uk live blogs) without
/// overflowing the default 1-2 MB main-thread stack on Windows.
///
/// On `wasm32`, threads are unavailable (`std::thread::spawn` panics at
/// runtime), so extraction runs inline on the caller's stack.
#[cfg(not(target_arch = "wasm32"))]
pub fn extract_with_options(
html: &str,
url: Option<&str>,
@ -70,6 +74,16 @@ pub fn extract_with_options(
.unwrap_or(Err(ExtractError::NoContent))
}
/// WASM has no threads; run extraction directly on the caller's stack.
///
/// This is the `target_arch = "wasm32"` build of `extract_with_options`. It
/// takes the same arguments and returns the same `Result` as the native
/// build, but forwards straight to `extract_with_options_inner` and hands
/// its result back unchanged.
///
/// Because no dedicated thread is created here, recursion depth is limited
/// by whatever stack the caller is already running on.
#[cfg(target_arch = "wasm32")]
pub fn extract_with_options(
html: &str,
url: Option<&str>,
options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
extract_with_options_inner(html, url, options)
}
fn extract_with_options_inner(
html: &str,
url: Option<&str>,
@ -187,7 +201,7 @@ fn extract_with_options_inner(
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
// static JSON data island extraction above with runtime-evaluated data.
#[cfg(feature = "quickjs")]
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
{
let blobs = js_eval::extract_js_data(html);
if !blobs.is_empty() {
@ -603,4 +617,36 @@ mod tests {
"Should extract content from deep nesting"
);
}
#[test]
fn wasm_direct_call_path_extracts_content() {
    // The wasm32 build of `extract_with_options` is a plain call into
    // `extract_with_options_inner` (no thread is spawned). Calling the
    // inner function here keeps that code path exercised on native CI,
    // and comparing against the public entry point pins that both routes
    // yield the same markdown.
    const SOURCE_URL: &str = "https://example.com";
    let page = r#"
<html lang="en">
<head><title>WASM Path</title></head>
<body><article><h1>Heading</h1><p>WASM-safe extraction body content.</p></article></body>
</html>"#;
    let options = ExtractionOptions::default();

    // Route 1: the direct (wasm-style) call.
    let direct = extract_with_options_inner(page, Some(SOURCE_URL), &options)
        .expect("inner extraction (wasm path) should succeed");
    let direct_markdown = &direct.content.markdown;
    assert!(
        direct_markdown.contains("WASM-safe extraction body content"),
        "wasm direct-call path should extract body, got: {}",
        direct_markdown
    );

    // Route 2: the public entry point (threaded on native targets).
    let via_entry_point = extract_with_options(page, Some(SOURCE_URL), &options)
        .expect("threaded extraction should succeed");
    assert_eq!(
        *direct_markdown, via_entry_point.content.markdown,
        "wasm path and threaded path must produce identical content"
    );
}
}

View file

@ -58,7 +58,7 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
.cloned()
.collect();
for value in &mut useful {
scrub_body_fields(value);
scrub_body_fields(value, 0);
}
if !useful.is_empty() {
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
@ -117,10 +117,21 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
}
/// Recursively remove long fields that duplicate the rendered markdown body.
fn scrub_body_fields(v: &mut serde_json::Value) {
///
/// `depth` guards against stack exhaustion from attacker-controlled
/// JSON-LD / `__NEXT_DATA__` blobs with pathological nesting: past
/// [`MAX_SCRUB_DEPTH`] levels we stop descending and leave the subtree
/// as-is (it is still size-capped by the `STRUCTURED_DATA_MAX_BYTES`
/// budget in `to_llm_text`).
fn scrub_body_fields(v: &mut serde_json::Value, depth: usize) {
const BODY_KEYS: &[&str] = &["articleBody"];
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
const LONG_THRESHOLD: usize = 500;
const MAX_SCRUB_DEPTH: usize = 64;
if depth >= MAX_SCRUB_DEPTH {
return;
}
match v {
serde_json::Value::Object(map) => {
@ -136,12 +147,12 @@ fn scrub_body_fields(v: &mut serde_json::Value) {
true
});
for value in map.values_mut() {
scrub_body_fields(value);
scrub_body_fields(value, depth + 1);
}
}
serde_json::Value::Array(values) => {
for value in values {
scrub_body_fields(value);
scrub_body_fields(value, depth + 1);
}
}
_ => {}
@ -908,4 +919,53 @@ mod tests {
"Compact untyped array dropped: {out}"
);
}
/// Follow the chain of `"n"` children starting at `value` and report how
/// many links were traversed before reaching the first object that still
/// has an `articleBody` key (i.e. one the scrubber did not touch).
///
/// Returns `None` if the walk hits a non-object value or an object with no
/// `"n"` child before any `articleBody` is found.
fn first_unscrubbed_article_body_depth(mut value: &serde_json::Value) -> Option<usize> {
    for level in 0.. {
        let fields = value.as_object()?;
        if fields.contains_key("articleBody") {
            return Some(level);
        }
        // Step one link deeper; a missing link ends the search.
        value = fields.get("n")?;
    }
    // The open-ended range never finishes on its own; this arm only exists
    // to give the function a value after the loop.
    None
}
#[test]
fn scrub_body_fields_bounds_recursion_on_deep_nesting() {
    // Hostile JSON-LD / __NEXT_DATA__ can be nested arbitrarily deep, so
    // the scrubber must stop descending at its depth cap. The fixture is
    // a single chain of objects linked through "n", slightly longer than
    // the 64-level cap, with a long (scrub-able) `articleBody` on every
    // link. Links above the cap lose their `articleBody`; the first link
    // at the cap keeps it because the scrubber returned before touching
    // it, and that surviving key is what the test observes.
    //
    // The chain is deliberately short: serde_json drops Values
    // recursively, so a 10k-deep value would overflow the stack just
    // being dropped.
    const DEPTH: usize = 80;
    let long_body = || "x".repeat(600);

    let innermost = serde_json::json!({ "articleBody": long_body() });
    let mut chain = (0..DEPTH).fold(innermost, |child, _| {
        serde_json::json!({
            "articleBody": long_body(),
            "n": child,
        })
    });

    scrub_body_fields(&mut chain, 0);

    // The first surviving `articleBody` must sit exactly at the cap.
    let stopped_at = first_unscrubbed_article_body_depth(&chain)
        .expect("recursion must stop and leave a deep articleBody intact");
    assert_eq!(
        stopped_at, 64,
        "recursion should stop at the depth cap, stopped at {stopped_at}"
    );

    // The outermost object is well within the cap, so it was scrubbed.
    let root = chain.as_object().unwrap();
    assert!(
        !root.contains_key("articleBody"),
        "shallow articleBody must still be scrubbed"
    );
}
}