fix: harden resource limits, path safety, and WASM build (#46)

Security audit follow-up across the workspace:

- webclaw-core: keep the crate WASM-safe. quickjs/rquickjs is now a cfg(not(wasm32)) target dependency, and the extraction entry point uses a direct call on wasm instead of spawning a thread, so it builds and runs on wasm32 with or without default features.
- webclaw-core: bound the structured-data scrubber recursion (depth cap) so deeply nested attacker JSON-LD / __NEXT_DATA__ cannot exhaust the stack.
- webclaw-fetch: stream the response body with a running ceiling so a small, highly compressed payload cannot inflate to gigabytes in memory; redact user:pass@ from proxy URLs before they reach error strings.
- webclaw-cli: contain output filenames inside the chosen directory (reject `..` / absolute paths, drop traversal path segments), run --webhook URLs through the public-URL SSRF guard, clamp --watch-interval to >=1s, and make research slug truncation char-safe.
- webclaw-mcp: char-safe slug truncation (no multibyte slice panic).
- setup.sh / deploy/hetzner.sh: replace eval on read input with printf -v, and mask the auth key / API token in console output.
- CI: enforce the wasm32 build invariant for webclaw-core.

Tests added for every behavioral change. Bump to 0.6.3 + CHANGELOG.
2026-07-26 07:51:01 +02:00 · 2026-05-19 17:03:52 +02:00 · 2026-05-19 17:03:52 +02:00 · be8bcfebd9
commit be8bcfebd9
parent aab51bea91
13 changed files with 454 additions and 47 deletions
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -95,12 +95,30 @@ struct Response {
 /// per page in collapse_whitespace + strip_markdown).
 const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;

+/// Running decompression-bomb guard: fails with `FetchError::BodyDecode` as soon as the
+/// bytes already buffered plus the next decompressed chunk exceed [`MAX_BODY_BYTES`] (a total
+/// exactly at the cap still passes). The sum saturates, so a huge chunk length can't wrap it.
+fn check_body_ceiling(buffered: usize, next_chunk: usize) -> Result<(), FetchError> {
+    let total = (buffered as u64).saturating_add(next_chunk as u64);
+    if total > MAX_BODY_BYTES {
+        return Err(FetchError::BodyDecode(format!(
+            "response body exceeds cap {MAX_BODY_BYTES} bytes (decompressed)"
+        )));
+    }
+    Ok(())
+}
+
 impl Response {
-    /// Buffer a wreq response into an owned Response. Rejects bodies that
-    /// advertise a Content-Length beyond [`MAX_BODY_BYTES`] before we pay
-    /// the allocation, and truncates after the fact as a belt-and-braces
-    /// check against a lying server.
-    async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
+    /// Buffer a wreq response into an owned Response.
+    ///
+    /// Rejects bodies that advertise a Content-Length beyond
+    /// [`MAX_BODY_BYTES`] before we pay any allocation, then streams the
+    /// body chunk-by-chunk while enforcing a running ceiling. `chunk()`
+    /// yields *post-decompression* bytes (gzip/brotli/zstd/deflate are
+    /// negotiated), so a tiny compressed payload that inflates to
+    /// gigabytes is aborted as soon as the accumulated size crosses the
+    /// cap — it never gets fully buffered in memory.
+    async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
        if let Some(len) = resp.content_length()
            && len > MAX_BODY_BYTES
        {
@ -111,21 +129,22 @@ impl Response {
        let status = resp.status().as_u16();
        let url = resp.uri().to_string();
        let headers = resp.headers().clone();
-        let body = resp
-            .bytes()
+
+        let mut buf = bytes::BytesMut::new();
+        while let Some(chunk) = resp
+            .chunk()
            .await
-            .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
-        if body.len() as u64 > MAX_BODY_BYTES {
-            return Err(FetchError::BodyDecode(format!(
-                "response body {} bytes exceeds cap {MAX_BODY_BYTES}",
-                body.len()
-            )));
+            .map_err(|e| FetchError::BodyDecode(e.to_string()))?
+        {
+            check_body_ceiling(buf.len(), chunk.len())?;
+            buf.extend_from_slice(&chunk);
        }
+
        Ok(Self {
            status,
            url,
            headers,
-            body,
+            body: buf.freeze(),
        })
    }

@ -896,6 +915,28 @@ mod tests {
        assert!(err.result.is_err());
    }

+    #[test]
+    fn body_ceiling_allows_under_cap() {
+        assert!(check_body_ceiling(0, 1024).is_ok());
+        assert!(check_body_ceiling(MAX_BODY_BYTES as usize - 1, 1).is_ok());
+    }
+
+    #[test]
+    fn body_ceiling_rejects_at_and_over_cap() {
+        // A total exactly at the cap is allowed (see the test above); cap + 1 is the first rejection.
+        assert!(check_body_ceiling(MAX_BODY_BYTES as usize, 1).is_err());
+        // A small buffer plus a huge inflated chunk (decompression bomb)
+        // is caught on the very first oversized chunk.
+        let err = check_body_ceiling(16, 64 * 1024 * 1024).unwrap_err();
+        assert!(matches!(err, FetchError::BodyDecode(_)));
+    }
+
+    #[test]
+    fn body_ceiling_saturates_on_overflow() {
+        // usize::MAX chunk must not wrap the running sum to a small value.
+        assert!(check_body_ceiling(usize::MAX, usize::MAX).is_err());
+    }
+
    #[test]
    fn test_batch_extract_result_struct() {
        let err = BatchExtractResult {
--- a/crates/webclaw-fetch/src/tls.rs
+++ b/crates/webclaw-fetch/src/tls.rs
@ -533,8 +533,9 @@ pub fn build_client(
        .timeout(timeout);

    if let Some(proxy_url) = proxy {
-        let proxy =
-            wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
+        let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {
+            FetchError::Build(format!("invalid proxy {}", redact_proxy_url(proxy_url)))
+        })?;
        builder = builder.proxy(proxy);
    } else {
        builder = builder.dns_resolver(PublicDnsResolver);
@ -545,6 +546,24 @@ pub fn build_client(
        .map_err(|e| FetchError::Build(e.to_string()))
 }

+/// Render a proxy URL safe to log: drop any `user:pass@` userinfo so
+/// rotating-proxy credentials never reach error strings or tracing.
+/// Returns the constant `<proxy redacted>` if the input does not parse or the userinfo can't be cleared.
+fn redact_proxy_url(raw: &str) -> String {
+    match url::Url::parse(raw) {
+        Ok(mut u) => {
+            // Best-effort: URLs the setters reject (e.g. no host) can't have
+            // their userinfo cleared; return the placeholder early rather than
+            // risk returning the raw string with credentials.
+            if u.set_username("").is_err() || u.set_password(None).is_err() {
+                return "<proxy redacted>".to_string();
+            }
+            u.to_string()
+        }
+        Err(_) => "<proxy redacted>".to_string(),
+    }
+}
+
 fn ssrf_safe_redirect_policy(
    follow_redirects: bool,
    max_redirects: usize,
@ -567,3 +586,41 @@ fn ssrf_safe_redirect_policy(
        })
    })
 }
+
+#[cfg(test)]
+mod tests {
+    use super::redact_proxy_url;
+
+    #[test]
+    fn redacts_userinfo_from_proxy_url() {
+        let red = redact_proxy_url("http://user123:s3cr3tPass@proxy.example.com:8080");
+        assert!(!red.contains("user123"), "username leaked: {red}");
+        assert!(!red.contains("s3cr3tPass"), "password leaked: {red}");
+        assert!(red.contains("proxy.example.com"), "host lost: {red}");
+        assert!(red.contains("8080"), "port lost: {red}");
+    }
+
+    #[test]
+    fn redacts_long_token_residential_proxy() {
+        // Residential-style: long structured credential with embedded
+        // tokens in the username and an unescaped `@` in the password.
+        let red =
+            redact_proxy_url("http://acct-zone-resi-country-xx:p@ss-word@gw.proxy.example:12321");
+        assert!(!red.contains("acct-zone-resi"), "username leaked: {red}");
+        assert!(!red.contains("p@ss-word"), "password leaked: {red}");
+        assert!(red.contains("gw.proxy.example"));
+    }
+
+    #[test]
+    fn unparseable_proxy_does_not_echo_input() {
+        let red = redact_proxy_url("user:pass@not a url");
+        assert_eq!(red, "<proxy redacted>");
+    }
+
+    #[test]
+    fn proxy_without_credentials_is_preserved() {
+        let red = redact_proxy_url("http://proxy.example.com:3128");
+        assert!(red.contains("proxy.example.com"));
+        assert!(red.contains("3128"));
+    }
+}