mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-07 22:15:12 +02:00
fix: harden resource limits, path safety, and WASM build (#46)
Security audit follow-up across the workspace:

- webclaw-core: keep the crate WASM-safe. quickjs/rquickjs is now a cfg(not(wasm32)) target dependency and the extraction entry point uses a direct call on wasm instead of spawning a thread, so it builds and runs on wasm32 with or without default features.
- webclaw-core: bound the structured-data scrubber recursion (depth cap) so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the stack.
- webclaw-fetch: stream the response body with a running ceiling so a small highly compressed payload cannot inflate to gigabytes in memory; redact user:pass@ from proxy URLs before they reach error strings.
- webclaw-cli: contain output filenames inside the chosen directory (reject .. / absolute, drop traversal path segments), run --webhook URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s, and make research slug truncation char-safe.
- webclaw-mcp: char-safe slug truncation (no multibyte slice panic).
- setup.sh / deploy/hetzner.sh: replace eval on read input with printf -v, and mask auth key / API token in console output.
- CI: enforce the wasm32 build invariant for webclaw-core.

Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
This commit is contained in:
parent
aab51bea91
commit
be8bcfebd9
13 changed files with 454 additions and 47 deletions
|
|
@ -9,7 +9,7 @@ pub mod diff;
|
|||
pub mod domain;
|
||||
pub mod error;
|
||||
pub mod extractor;
|
||||
#[cfg(feature = "quickjs")]
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
pub mod js_eval;
|
||||
pub mod llm;
|
||||
pub mod markdown;
|
||||
|
|
@ -46,9 +46,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
|
|||
/// `url` — optional source URL, used for resolving relative links and domain detection
|
||||
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
||||
///
|
||||
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
|
||||
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
|
||||
/// main-thread stack on Windows.
|
||||
/// On native targets, spawns extraction on a thread with an 8 MB stack to
|
||||
/// handle deeply nested HTML (e.g., Express.co.uk live blogs) without
|
||||
/// overflowing the default 1-2 MB main-thread stack on Windows.
|
||||
///
|
||||
/// On `wasm32`, threads are unavailable (`std::thread::spawn` panics at
|
||||
/// runtime), so extraction runs inline on the caller's stack.
|
||||
#[cfg(not(target_arch = "wasm32"))]
|
||||
pub fn extract_with_options(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
|
|
@ -70,6 +74,16 @@ pub fn extract_with_options(
|
|||
.unwrap_or(Err(ExtractError::NoContent))
|
||||
}
|
||||
|
||||
/// `wasm32` variant of the extraction entry point.
///
/// WASM has no threads; run extraction directly on the caller's stack.
/// This variant is compiled only when `target_arch = "wasm32"` and simply
/// delegates to `extract_with_options_inner`.
///
/// `html` — the HTML input, forwarded unchanged
/// `url` — optional source URL, forwarded unchanged
/// `options` — extraction options, forwarded unchanged
///
/// Returns exactly what `extract_with_options_inner` returns.
///
/// NOTE(review): no enlarged stack is set up on this path, so very deeply
/// nested HTML presumably depends on the host's stack limit — confirm.
|
||||
#[cfg(target_arch = "wasm32")]
|
||||
pub fn extract_with_options(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
options: &ExtractionOptions,
|
||||
) -> Result<ExtractionResult, ExtractError> {
|
||||
extract_with_options_inner(html, url, options)
|
||||
}
|
||||
|
||||
fn extract_with_options_inner(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
|
|
@ -187,7 +201,7 @@ fn extract_with_options_inner(
|
|||
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
||||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||
// static JSON data island extraction above with runtime-evaluated data.
|
||||
#[cfg(feature = "quickjs")]
|
||||
#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
|
||||
{
|
||||
let blobs = js_eval::extract_js_data(html);
|
||||
if !blobs.is_empty() {
|
||||
|
|
@ -603,4 +617,36 @@ mod tests {
|
|||
"Should extract content from deep nesting"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wasm_direct_call_path_extracts_content() {
|
||||
// On wasm32 `extract_with_options` runs `extract_with_options_inner`
|
||||
// inline (no thread spawn). Exercise that exact entry point here so
|
||||
// the WASM path stays covered on native CI, and assert it produces
|
||||
// the same content as the public threaded entry point.
|
||||
let html = r#"
|
||||
<html lang="en">
|
||||
<head><title>WASM Path</title></head>
|
||||
<body><article><h1>Heading</h1><p>WASM-safe extraction body content.</p></article></body>
|
||||
</html>"#;
|
||||
let opts = ExtractionOptions::default();
|
||||
|
||||
// Call the inner function directly: this is the same call the wasm32
// entry point makes, with no thread in between.
let inner = extract_with_options_inner(html, Some("https://example.com"), &opts)
|
||||
.expect("inner extraction (wasm path) should succeed");
|
||||
assert!(
|
||||
inner
|
||||
.content
|
||||
.markdown
|
||||
.contains("WASM-safe extraction body content"),
|
||||
"wasm direct-call path should extract body, got: {}",
|
||||
inner.content.markdown
|
||||
);
|
||||
|
||||
// Same html, URL and options, but through the public entry point.
// NOTE(review): if this test were built for wasm32, the public entry
// point would itself be the inline variant and the comparison below
// would be trivially true.
let threaded = extract_with_options(html, Some("https://example.com"), &opts)
|
||||
.expect("threaded extraction should succeed");
|
||||
// Both calls received identical inputs, so the markdown must match exactly.
assert_eq!(
|
||||
inner.content.markdown, threaded.content.markdown,
|
||||
"wasm path and threaded path must produce identical content"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue