mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: harden resource limits, path safety, and WASM build (#46)
Security audit follow-up across the workspace: - webclaw-core: keep the crate WASM-safe. quickjs/rquickjs is now a cfg(not(wasm32)) target dependency and the extraction entry point uses a direct call on wasm instead of spawning a thread, so it builds and runs on wasm32 with or without default features. - webclaw-core: bound the structured-data scrubber recursion (depth cap) so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the stack. - webclaw-fetch: stream the response body with a running ceiling so a small highly compressed payload cannot inflate to gigabytes in memory; redact user:pass@ from proxy URLs before they reach error strings. - webclaw-cli: contain output filenames inside the chosen directory (reject .. / absolute, drop traversal path segments), run --webhook URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s, and make research slug truncation char-safe. - webclaw-mcp: char-safe slug truncation (no multibyte slice panic). - setup.sh / deploy/hetzner.sh: replace eval on read input with printf -v, and mask auth key / API token in console output. - CI: enforce the wasm32 build invariant for webclaw-core. Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
This commit is contained in:
parent
aab51bea91
commit
be8bcfebd9
13 changed files with 454 additions and 47 deletions
|
|
@ -613,7 +613,15 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
|||
Err(_) => (String::new(), String::new(), None),
|
||||
};
|
||||
|
||||
let mut stem = path.trim_matches('/').to_string();
|
||||
// Drop empty / "." / ".." path segments so a URL path like
|
||||
// `/../../etc/passwd` can't climb out of the output directory.
|
||||
let cleaned_path: String = path
|
||||
.split('/')
|
||||
.filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
|
||||
.collect::<Vec<_>>()
|
||||
.join("/");
|
||||
|
||||
let mut stem = cleaned_path;
|
||||
if stem.is_empty() {
|
||||
// Use hostname for root URLs to avoid collisions in batch mode
|
||||
let clean_host = host.strip_prefix("www.").unwrap_or(&host);
|
||||
|
|
@ -640,13 +648,59 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
|||
format!("{sanitized}.{ext}")
|
||||
}
|
||||
|
||||
/// Reject a caller-supplied (CSV `url,filename`) name that could escape the
|
||||
/// output directory: absolute paths, drive prefixes, root, or any `..`
|
||||
/// component. Returns the validated relative path on success.
|
||||
fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
|
||||
let candidate = Path::new(filename);
|
||||
use std::path::Component;
|
||||
for comp in candidate.components() {
|
||||
match comp {
|
||||
Component::Normal(_) | Component::CurDir => {}
|
||||
Component::ParentDir => {
|
||||
return Err(format!("refusing path with '..' component: {filename}"));
|
||||
}
|
||||
Component::RootDir | Component::Prefix(_) => {
|
||||
return Err(format!("refusing absolute output path: {filename}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
if candidate.as_os_str().is_empty() {
|
||||
return Err("empty output filename".to_string());
|
||||
}
|
||||
Ok(candidate.to_path_buf())
|
||||
}
|
||||
|
||||
/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
|
||||
///
|
||||
/// `filename` may originate from an attacker-controlled `--urls-file`
|
||||
/// (`url,filename` CSV). It is validated for traversal, and the canonical
|
||||
/// destination directory is asserted to stay under the canonical output
|
||||
/// directory before any write.
|
||||
fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
|
||||
let dest = dir.join(filename);
|
||||
let rel = safe_relative_filename(filename)?;
|
||||
let dest = dir.join(&rel);
|
||||
|
||||
std::fs::create_dir_all(dir)
|
||||
.map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?;
|
||||
let base = dir
|
||||
.canonicalize()
|
||||
.map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;
|
||||
|
||||
if let Some(parent) = dest.parent() {
|
||||
std::fs::create_dir_all(parent)
|
||||
.map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
|
||||
let canon_parent = parent
|
||||
.canonicalize()
|
||||
.map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
|
||||
if !canon_parent.starts_with(&base) {
|
||||
return Err(format!(
|
||||
"refusing to write outside output dir: {}",
|
||||
dest.display()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
std::fs::write(&dest, content)
|
||||
.map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
|
||||
let word_count = content.split_whitespace().count();
|
||||
|
|
@ -1679,6 +1733,13 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
|||
serde_json::to_string(payload).unwrap_or_default()
|
||||
};
|
||||
tokio::spawn(async move {
|
||||
// SSRF guard: a webhook URL is user-supplied and otherwise bypasses
|
||||
// the fetch-layer protections, so resolve + reject private/internal
|
||||
// destinations before sending the payload.
|
||||
if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
|
||||
eprintln!("[webhook] refusing unsafe URL: {e}");
|
||||
return;
|
||||
}
|
||||
match reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(10))
|
||||
.build()
|
||||
|
|
@ -1750,7 +1811,9 @@ async fn run_watch_single(
|
|||
);
|
||||
|
||||
loop {
|
||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
||||
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
|
||||
// fetch loop with zero delay and hammer the target.
|
||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
|
||||
|
||||
if cancelled.load(Ordering::Relaxed) {
|
||||
eprintln!("[watch] Stopped");
|
||||
|
|
@ -1842,7 +1905,9 @@ async fn run_watch_multi(
|
|||
let mut check_number = 0u64;
|
||||
|
||||
loop {
|
||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
||||
// Clamp to >=1s: `--watch-interval 0` would otherwise spin the
|
||||
// fetch loop with zero delay and hammer the target.
|
||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
|
||||
|
||||
if cancelled.load(Ordering::Relaxed) {
|
||||
eprintln!("[watch] Stopped");
|
||||
|
|
@ -2321,7 +2386,9 @@ async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
|
|||
.collect::<Vec<_>>()
|
||||
.join("-")
|
||||
.to_lowercase();
|
||||
let slug = if slug.len() > 50 { &slug[..50] } else { &slug };
|
||||
// char-safe truncation: byte slicing panics if char 50
|
||||
// lands mid-codepoint (multibyte queries).
|
||||
let slug: String = slug.chars().take(50).collect();
|
||||
let filename = format!("research-{slug}.json");
|
||||
|
||||
let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
|
||||
|
|
@ -2773,4 +2840,66 @@ mod tests {
|
|||
assert_eq!(content, "hello");
|
||||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn url_to_filename_strips_traversal_segments() {
|
||||
// `..` / `.` / empty path segments must not survive into the path.
|
||||
let out = url_to_filename(
|
||||
"https://example.com/../../etc/passwd",
|
||||
&OutputFormat::Markdown,
|
||||
);
|
||||
assert!(!out.contains(".."), "traversal leaked: {out}");
|
||||
assert_eq!(out, "etc/passwd.md");
|
||||
let out2 = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json);
|
||||
assert_eq!(out2, "a/b/c.json");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn safe_relative_filename_rejects_escapes() {
|
||||
assert!(safe_relative_filename("../escape.md").is_err());
|
||||
assert!(safe_relative_filename("a/../../b.md").is_err());
|
||||
assert!(safe_relative_filename("/etc/passwd").is_err());
|
||||
assert!(safe_relative_filename("").is_err());
|
||||
// Normal nested relative names stay allowed.
|
||||
assert!(safe_relative_filename("nested/deep/file.md").is_ok());
|
||||
assert!(safe_relative_filename("./ok.md").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn write_to_file_refuses_traversal_filename() {
|
||||
let dir = std::env::temp_dir().join("webclaw_test_traversal_dir");
|
||||
let _ = std::fs::remove_dir_all(&dir);
|
||||
// CSV-supplied `url,filename` traversal attempt.
|
||||
let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err();
|
||||
assert!(err.contains("refusing"), "unexpected error: {err}");
|
||||
assert!(
|
||||
!std::path::Path::new("/tmp/webclaw_pwned.md").exists(),
|
||||
"traversal write escaped the output dir"
|
||||
);
|
||||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn research_slug_truncation_is_char_safe() {
|
||||
// Multibyte query: byte-slicing at 50 would panic mid-codepoint.
|
||||
let query = "日本語".repeat(40); // 120 chars, 3 bytes each
|
||||
let slug: String = query
|
||||
.chars()
|
||||
.map(|c| {
|
||||
if c.is_alphanumeric() || c == ' ' {
|
||||
c
|
||||
} else {
|
||||
' '
|
||||
}
|
||||
})
|
||||
.collect::<String>()
|
||||
.split_whitespace()
|
||||
.collect::<Vec<_>>()
|
||||
.join("-")
|
||||
.to_lowercase();
|
||||
let slug: String = slug.chars().take(50).collect();
|
||||
assert!(slug.chars().count() <= 50);
|
||||
// Round-trips through formatting without panicking.
|
||||
let _ = format!("research-{slug}.json");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue