diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0b14bcc..78e5223 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,6 +31,21 @@ jobs: - run: cargo fmt --check --all - run: cargo clippy --all -- -D warnings + wasm: + name: WASM + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + targets: wasm32-unknown-unknown + - uses: Swatinem/rust-cache@v2 + # webclaw-core must stay WASM-safe (zero network deps, no threads). + # Check both with and without default features so the quickjs gate + # can't regress. + - run: cargo check --target wasm32-unknown-unknown -p webclaw-core + - run: cargo check --target wasm32-unknown-unknown -p webclaw-core --no-default-features + docs: name: Docs runs-on: ubuntu-latest diff --git a/CHANGELOG.md b/CHANGELOG.md index e833578..4400ff1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.6.3] — 2026-05-19 + +### Fixed +- Hardened resource and path-safety limits across the CLI, MCP server, and self-hosted API: oversized or highly compressed responses are capped while streaming, deeply nested page data can no longer exhaust memory, output filenames stay inside the chosen directory, webhook URLs are validated like every other fetch, and multibyte search queries no longer crash slug generation. 
+ +--- + ## [0.6.2] — 2026-05-18 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 273c0c0..04c093c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.6.2" +version = "0.6.3" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.6.2" +version = "0.6.3" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.6.2" +version = "0.6.3" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.6.2" +version = "0.6.3" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.6.2" +version = "0.6.3" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.6.2" +version = "0.6.3" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.6.2" +version = "0.6.3" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 5a3bfc6..2c21290 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.6.2" +version = "0.6.3" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 1c8515f..1348824 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -613,7 +613,15 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String { Err(_) => (String::new(), String::new(), None), }; - let mut stem = path.trim_matches('/').to_string(); + // Drop empty / "." / ".." 
/// Validate a caller-supplied output name (the `filename` column of a
/// `url,filename` CSV) so that it cannot escape the output directory.
///
/// The name is inspected component by component, purely lexically (the
/// filesystem is never touched):
///
/// * an empty name is refused,
/// * any `..` component is refused,
/// * root and prefix components (absolute paths, drive prefixes) are refused,
/// * a name made only of `.` components (`"."`, `"./"`) is refused because it
///   does not name a file.
///
/// On success the name is returned unchanged as a relative path.
///
/// # Errors
///
/// Returns a human-readable message explaining why the name was refused.
fn safe_relative_filename(filename: &str) -> Result<std::path::PathBuf, String> {
    use std::path::Component;

    let candidate = Path::new(filename);
    if candidate.as_os_str().is_empty() {
        return Err("empty output filename".to_string());
    }

    // Tracks whether at least one real (non-`.`) component was seen.
    let mut names_a_file = false;
    for comp in candidate.components() {
        match comp {
            Component::Normal(_) => names_a_file = true,
            Component::CurDir => {}
            Component::ParentDir => {
                return Err(format!("refusing path with '..' component: {filename}"));
            }
            Component::RootDir | Component::Prefix(_) => {
                return Err(format!("refusing absolute output path: {filename}"));
            }
        }
    }

    // "." or "./" would resolve to the output directory itself, not a file.
    if !names_a_file {
        return Err(format!(
            "refusing output path that names no file: {filename}"
        ));
    }

    Ok(candidate.to_path_buf())
}
fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> { - let dest = dir.join(filename); + let rel = safe_relative_filename(filename)?; + let dest = dir.join(&rel); + + std::fs::create_dir_all(dir) + .map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?; + let base = dir + .canonicalize() + .map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?; + if let Some(parent) = dest.parent() { std::fs::create_dir_all(parent) .map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?; + let canon_parent = parent + .canonicalize() + .map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?; + if !canon_parent.starts_with(&base) { + return Err(format!( + "refusing to write outside output dir: {}", + dest.display() + )); + } } + std::fs::write(&dest, content) .map_err(|e| format!("failed to write {}: {e}", dest.display()))?; let word_count = content.split_whitespace().count(); @@ -1679,6 +1733,13 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) { serde_json::to_string(payload).unwrap_or_default() }; tokio::spawn(async move { + // SSRF guard: a webhook URL is user-supplied and otherwise bypasses + // the fetch-layer protections, so resolve + reject private/internal + // destinations before sending the payload. + if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await { + eprintln!("[webhook] refusing unsafe URL: {e}"); + return; + } match reqwest::Client::builder() .timeout(std::time::Duration::from_secs(10)) .build() @@ -1750,7 +1811,9 @@ async fn run_watch_single( ); loop { - tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await; + // Clamp to >=1s: `--watch-interval 0` would otherwise spin the + // fetch loop with zero delay and hammer the target. 
+ tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await; if cancelled.load(Ordering::Relaxed) { eprintln!("[watch] Stopped"); @@ -1842,7 +1905,9 @@ async fn run_watch_multi( let mut check_number = 0u64; loop { - tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await; + // Clamp to >=1s: `--watch-interval 0` would otherwise spin the + // fetch loop with zero delay and hammer the target. + tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await; if cancelled.load(Ordering::Relaxed) { eprintln!("[watch] Stopped"); @@ -2321,7 +2386,9 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> { .collect::>() .join("-") .to_lowercase(); - let slug = if slug.len() > 50 { &slug[..50] } else { &slug }; + // char-safe truncation: byte slicing panics if char 50 + // lands mid-codepoint (multibyte queries). + let slug: String = slug.chars().take(50).collect(); let filename = format!("research-{slug}.json"); let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default(); @@ -2773,4 +2840,66 @@ mod tests { assert_eq!(content, "hello"); let _ = std::fs::remove_dir_all(&dir); } + + #[test] + fn url_to_filename_strips_traversal_segments() { + // `..` / `.` / empty path segments must not survive into the path. + let out = url_to_filename( + "https://example.com/../../etc/passwd", + &OutputFormat::Markdown, + ); + assert!(!out.contains(".."), "traversal leaked: {out}"); + assert_eq!(out, "etc/passwd.md"); + let out2 = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json); + assert_eq!(out2, "a/b/c.json"); + } + + #[test] + fn safe_relative_filename_rejects_escapes() { + assert!(safe_relative_filename("../escape.md").is_err()); + assert!(safe_relative_filename("a/../../b.md").is_err()); + assert!(safe_relative_filename("/etc/passwd").is_err()); + assert!(safe_relative_filename("").is_err()); + // Normal nested relative names stay allowed. 
+ assert!(safe_relative_filename("nested/deep/file.md").is_ok()); + assert!(safe_relative_filename("./ok.md").is_ok()); + } + + #[test] + fn write_to_file_refuses_traversal_filename() { + let dir = std::env::temp_dir().join("webclaw_test_traversal_dir"); + let _ = std::fs::remove_dir_all(&dir); + // CSV-supplied `url,filename` traversal attempt. + let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err(); + assert!(err.contains("refusing"), "unexpected error: {err}"); + assert!( + !std::path::Path::new("/tmp/webclaw_pwned.md").exists(), + "traversal write escaped the output dir" + ); + let _ = std::fs::remove_dir_all(&dir); + } + + #[test] + fn research_slug_truncation_is_char_safe() { + // Multibyte query: byte-slicing at 50 would panic mid-codepoint. + let query = "日本語".repeat(40); // 120 chars, 3 bytes each + let slug: String = query + .chars() + .map(|c| { + if c.is_alphanumeric() || c == ' ' { + c + } else { + ' ' + } + }) + .collect::() + .split_whitespace() + .collect::>() + .join("-") + .to_lowercase(); + let slug: String = slug.chars().take(50).collect(); + assert!(slug.chars().count() <= 50); + // Round-trips through formatting without panicking. + let _ = format!("research-{slug}.json"); + } } diff --git a/crates/webclaw-core/Cargo.toml b/crates/webclaw-core/Cargo.toml index 497e002..19b2e08 100644 --- a/crates/webclaw-core/Cargo.toml +++ b/crates/webclaw-core/Cargo.toml @@ -20,6 +20,11 @@ url = { version = "2", features = ["serde"] } regex = "1" once_cell = "1" similar = "2" + +# rquickjs links a C library and cannot build for wasm32. Gating it per +# target keeps the `quickjs` feature usable on native while leaving the +# crate WASM-safe even with default features enabled. 
+[target.'cfg(not(target_arch = "wasm32"))'.dependencies] rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true } [dev-dependencies] diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index 80dbb5c..a3e0725 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -9,7 +9,7 @@ pub mod diff; pub mod domain; pub mod error; pub mod extractor; -#[cfg(feature = "quickjs")] +#[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))] pub mod js_eval; pub mod llm; pub mod markdown; @@ -46,9 +46,13 @@ pub fn extract(html: &str, url: Option<&str>) -> Result, @@ -70,6 +74,16 @@ pub fn extract_with_options( .unwrap_or(Err(ExtractError::NoContent)) } +/// WASM has no threads; run extraction directly on the caller's stack. +#[cfg(target_arch = "wasm32")] +pub fn extract_with_options( + html: &str, + url: Option<&str>, + options: &ExtractionOptions, +) -> Result { + extract_with_options_inner(html, url, options) +} + fn extract_with_options_inner( html: &str, url: Option<&str>, @@ -187,7 +201,7 @@ fn extract_with_options_inner( // QuickJS: execute inline