mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: harden resource limits, path safety, and WASM build (#46)
Security audit follow-up across the workspace:
- webclaw-core: keep the crate WASM-safe. quickjs/rquickjs is now a cfg(not(wasm32)) target dependency and the extraction entry point uses a direct call on wasm instead of spawning a thread, so it builds and runs on wasm32 with or without default features.
- webclaw-core: bound the structured-data scrubber recursion (depth cap) so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the stack.
- webclaw-fetch: stream the response body with a running ceiling so a small highly compressed payload cannot inflate to gigabytes in memory; redact user:pass@ from proxy URLs before they reach error strings.
- webclaw-cli: contain output filenames inside the chosen directory (reject .. / absolute, drop traversal path segments), run --webhook URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s, and make research slug truncation char-safe.
- webclaw-mcp: char-safe slug truncation (no multibyte slice panic).
- setup.sh / deploy/hetzner.sh: replace eval on read input with printf -v, and mask auth key / API token in console output.
- CI: enforce the wasm32 build invariant for webclaw-core.
Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
This commit is contained in:
parent
aab51bea91
commit
be8bcfebd9
13 changed files with 454 additions and 47 deletions
|
|
@ -20,6 +20,11 @@ url = { version = "2", features = ["serde"] }
|
|||
regex = "1"
|
||||
once_cell = "1"
|
||||
similar = "2"
|
||||
|
||||
# rquickjs links a C library and cannot build for wasm32. Gating it per
|
||||
# target keeps the `quickjs` feature usable on native while leaving the
|
||||
# crate WASM-safe even with default features enabled.
|
||||
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ pub mod diff;
|
|||
pub mod domain;
|
||||
pub mod error;
|
||||
pub mod extractor;
|
||||
#[cfg(feature = "quickjs")]
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
pub mod js_eval;
|
||||
pub mod llm;
|
||||
pub mod markdown;
|
||||
|
|
@ -46,9 +46,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
|
|||
/// `url` — optional source URL, used for resolving relative links and domain detection
|
||||
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
||||
///
|
||||
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
|
||||
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
|
||||
/// main-thread stack on Windows.
|
||||
/// On native targets, spawns extraction on a thread with an 8 MB stack to
|
||||
/// handle deeply nested HTML (e.g., Express.co.uk live blogs) without
|
||||
/// overflowing the default 1-2 MB main-thread stack on Windows.
|
||||
///
|
||||
/// On `wasm32`, threads are unavailable (`std::thread::spawn` panics at
|
||||
/// runtime), so extraction runs inline on the caller's stack.
|
||||
#[cfg(not(target_arch = "wasm32"))]
|
||||
pub fn extract_with_options(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
|
|
@ -70,6 +74,16 @@ pub fn extract_with_options(
|
|||
.unwrap_or(Err(ExtractError::NoContent))
|
||||
}
|
||||
|
||||
/// `wasm32` variant of the option-driven extraction entry point.
///
/// WASM has no threads (`std::thread::spawn` panics at runtime there), so
/// this variant runs extraction directly on the caller's stack by
/// delegating to `extract_with_options_inner`.
///
/// `html` — the HTML document to extract from
/// `url` — optional source URL, used for resolving relative links and domain detection
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
///
/// Returns whatever `extract_with_options_inner` returns, unchanged.
|
||||
#[cfg(target_arch = "wasm32")]
|
||||
pub fn extract_with_options(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
options: &ExtractionOptions,
|
||||
) -> Result<ExtractionResult, ExtractError> {
|
||||
extract_with_options_inner(html, url, options)
|
||||
}
|
||||
|
||||
fn extract_with_options_inner(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
|
|
@ -187,7 +201,7 @@ fn extract_with_options_inner(
|
|||
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
||||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||
// static JSON data island extraction above with runtime-evaluated data.
|
||||
#[cfg(feature = "quickjs")]
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
{
|
||||
let blobs = js_eval::extract_js_data(html);
|
||||
if !blobs.is_empty() {
|
||||
|
|
@ -603,4 +617,36 @@ mod tests {
|
|||
"Should extract content from deep nesting"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn wasm_direct_call_path_extracts_content() {
    // The wasm32 build of `extract_with_options` is a plain call into
    // `extract_with_options_inner` (no thread is spawned). Calling the inner
    // function directly keeps that code path covered on native CI, and the
    // comparison below pins it to the output of the public entry point.
    let page = r#"
<html lang="en">
<head><title>WASM Path</title></head>
<body><article><h1>Heading</h1><p>WASM-safe extraction body content.</p></article></body>
</html>"#;
    let page_url = Some("https://example.com");
    let options = ExtractionOptions::default();

    // Direct (inline) path: what wasm32 callers actually execute.
    let direct = extract_with_options_inner(page, page_url, &options)
        .expect("inner extraction (wasm path) should succeed");
    let direct_markdown = &direct.content.markdown;
    assert!(
        direct_markdown.contains("WASM-safe extraction body content"),
        "wasm direct-call path should extract body, got: {}",
        direct_markdown
    );

    // Public entry point: on native targets this goes through the worker thread.
    let via_public = extract_with_options(page, page_url, &options)
        .expect("threaded extraction should succeed");
    assert_eq!(
        direct.content.markdown, via_public.content.markdown,
        "wasm path and threaded path must produce identical content"
    );
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
|||
.cloned()
|
||||
.collect();
|
||||
for value in &mut useful {
|
||||
scrub_body_fields(value);
|
||||
scrub_body_fields(value, 0);
|
||||
}
|
||||
if !useful.is_empty() {
|
||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||
|
|
@ -117,10 +117,21 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
|||
}
|
||||
|
||||
/// Recursively remove long fields that duplicate the rendered markdown body.
|
||||
fn scrub_body_fields(v: &mut serde_json::Value) {
|
||||
///
|
||||
/// `depth` guards against stack exhaustion from attacker-controlled
|
||||
/// JSON-LD / `__NEXT_DATA__` blobs with pathological nesting: past
|
||||
/// [`MAX_SCRUB_DEPTH`] levels we stop descending and leave the subtree
|
||||
/// as-is (it is still size-capped by the `STRUCTURED_DATA_MAX_BYTES`
|
||||
/// budget in `to_llm_text`).
|
||||
fn scrub_body_fields(v: &mut serde_json::Value, depth: usize) {
|
||||
const BODY_KEYS: &[&str] = &["articleBody"];
|
||||
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
|
||||
const LONG_THRESHOLD: usize = 500;
|
||||
const MAX_SCRUB_DEPTH: usize = 64;
|
||||
|
||||
if depth >= MAX_SCRUB_DEPTH {
|
||||
return;
|
||||
}
|
||||
|
||||
match v {
|
||||
serde_json::Value::Object(map) => {
|
||||
|
|
@ -136,12 +147,12 @@ fn scrub_body_fields(v: &mut serde_json::Value) {
|
|||
true
|
||||
});
|
||||
for value in map.values_mut() {
|
||||
scrub_body_fields(value);
|
||||
scrub_body_fields(value, depth + 1);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(values) => {
|
||||
for value in values {
|
||||
scrub_body_fields(value);
|
||||
scrub_body_fields(value, depth + 1);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
|
|
@ -908,4 +919,53 @@ mod tests {
|
|||
"Compact untyped array dropped: {out}"
|
||||
);
|
||||
}
|
||||
|
||||
/// Walk `value` down its single `"n"` child link and return the depth
|
||||
/// at which an `articleBody` key is still present (i.e. was NOT
|
||||
/// scrubbed). Used to observe exactly where the recursion stopped.
|
||||
fn first_unscrubbed_article_body_depth(mut value: &serde_json::Value) -> Option<usize> {
|
||||
let mut depth = 0;
|
||||
loop {
|
||||
let obj = value.as_object()?;
|
||||
if obj.contains_key("articleBody") {
|
||||
return Some(depth);
|
||||
}
|
||||
value = obj.get("n")?;
|
||||
depth += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn scrub_body_fields_bounds_recursion_on_deep_nesting() {
    // Pathologically nested, attacker-controlled JSON-LD / __NEXT_DATA__
    // must not drive unbounded recursion. The fixture is a single chain
    // slightly deeper than the 64-level cap, with a scrub-able
    // `articleBody` on every level. Everything inside the cap is scrubbed;
    // the first level beyond it keeps its `articleBody` because the walk
    // stopped there, and that boundary is what gets asserted.
    //
    // The chain is deliberately shallow: serde_json drops `Value`s
    // recursively, so something like a 10k-deep value would overflow the
    // stack merely by being dropped.
    const DEPTH: usize = 80;

    // Start from the innermost leaf and wrap it DEPTH times.
    let leaf = serde_json::json!({ "articleBody": "x".repeat(600) });
    let mut chain = (0..DEPTH).fold(leaf, |child, _| {
        serde_json::json!({
            "articleBody": "x".repeat(600),
            "n": child,
        })
    });

    scrub_body_fields(&mut chain, 0);

    // Levels above the cap were scrubbed; the survivor sits right at the cap.
    let stopped_at = first_unscrubbed_article_body_depth(&chain)
        .expect("recursion must stop and leave a deep articleBody intact");
    assert_eq!(
        stopped_at, 64,
        "recursion should stop at the depth cap, stopped at {stopped_at}"
    );

    let root = chain.as_object().unwrap();
    assert!(
        root.get("articleBody").is_none(),
        "shallow articleBody must still be scrubbed"
    );
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue