feat(fetch,llm): DoS hardening + glob validation + cleanup (P2) (#22)

* feat(fetch,llm): DoS hardening via response caps + glob validation (P2)

Response body caps:
- webclaw-fetch::Response::from_wreq now rejects bodies over 50 MB. Checks
  Content-Length up front (before the allocation) and the actual
  .bytes() length after (belt-and-braces against lying upstreams).
  Previously the HTML -> markdown conversion downstream could allocate
  multiple String copies per page; a 100 MB page would OOM the process.
- webclaw-llm providers (anthropic/openai/ollama) share a new
  response_json_capped helper with a 5 MB cap. Protects against a
  malicious or runaway provider response exhausting memory.
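The shape of that check, sketched against a plain byte buffer (names and the exact error type here are illustrative; the real `response_json_capped` helper operates on a provider HTTP response and also inspects Content-Length before buffering):

```rust
/// Illustrative constant mirroring the 5 MB provider-response cap
/// described above.
const MAX_JSON_BYTES: usize = 5 * 1024 * 1024;

/// Minimal sketch of the cap: refuse to hand an oversized body to the
/// JSON parser, so a runaway provider response fails fast instead of
/// exhausting memory during deserialization.
fn check_json_cap(body: &[u8]) -> Result<&[u8], String> {
    if body.len() > MAX_JSON_BYTES {
        return Err(format!(
            "provider response {} bytes exceeds cap {MAX_JSON_BYTES}",
            body.len()
        ));
    }
    Ok(body)
}
```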

Crawler frontier cap: after each BFS depth level, if the frontier
exceeds max(max_pages * 10, 100) entries it is truncated to
max(max_pages * 5, 50), keeping the most recently discovered links.
Dense pages (tag clouds, search results) used to push the frontier into
the tens of thousands of entries even after max_pages halted new fetches.

Glob pattern validation: user-supplied include_patterns /
exclude_patterns are rejected at Crawler::new if they contain more
than 4 `**` wildcards or exceed 1024 chars. The backtracking matcher
degrades exponentially on deeply-nested `**` against long paths.

Cleanup:
- Removed blanket #![allow(dead_code)] from webclaw-cli/src/main.rs;
  no warnings surfaced, the suppression was obsolete.
- core/.gitignore: replaced overbroad *.json with specific local-
  artifact patterns (previous rule would have swallowed package.json,
  components.json, .smithery/*.json).

Tests: +4 validate_glob tests. Full workspace test: 283 passed
(webclaw-core + webclaw-fetch + webclaw-llm).

Version: 0.3.15 -> 0.3.16
CHANGELOG updated.

Refs: docs/AUDIT-2026-04-16.md (P2 section)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: gitignore CLI research dumps, drop accidentally-tracked file

research-*.json output from `webclaw ... --research ...` was silently
swept into git once the preceding commit relaxed the blanket *.json rule.
The old blanket *.json rule was hiding both this legitimate scratch
file AND packages/create-webclaw/server.json (MCP registry config that
we DO want tracked).

Removes the research dump from git and adds a narrower research-*.json
ignore pattern so future CLI output doesn't get re-tracked by accident.
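The added ignore entry looks roughly like this (surrounding rules are repo-specific and omitted):

```gitignore
# CLI research dumps (scratch output from `webclaw ... --research ...`);
# narrower than the old blanket *.json so server.json stays tracked.
research-*.json
```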

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Valerio 2026-04-16 19:44:08 +02:00 committed by GitHub
parent 7773c8af2a
commit d69c50a31d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 219 additions and 13 deletions


@@ -87,9 +87,27 @@ struct Response {
body: bytes::Bytes,
}
/// Maximum fetched body size. A single 50 MB HTML document is already
/// several orders of magnitude past any realistic page; larger responses
/// are either malicious (log bomb, zip-bomb decompressed) or streaming
/// bugs. Caps the blast radius of the HTML → markdown conversion
/// downstream (which could otherwise allocate multiple full-size Strings
/// per page in collapse_whitespace + strip_markdown).
const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;
impl Response {
/// Buffer a wreq response into an owned Response. Rejects bodies that
/// advertise a Content-Length beyond [`MAX_BODY_BYTES`] before we pay
/// the allocation, and re-checks the actual byte count afterwards as a
/// belt-and-braces guard against a lying server.
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
if let Some(len) = resp.content_length()
&& len > MAX_BODY_BYTES
{
return Err(FetchError::BodyDecode(format!(
"response body {len} bytes exceeds cap {MAX_BODY_BYTES}"
)));
}
let status = resp.status().as_u16();
let url = resp.uri().to_string();
let headers = resp.headers().clone();
@@ -97,6 +115,12 @@ impl Response {
.bytes()
.await
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
if body.len() as u64 > MAX_BODY_BYTES {
return Err(FetchError::BodyDecode(format!(
"response body {} bytes exceeds cap {MAX_BODY_BYTES}",
body.len()
)));
}
Ok(Self {
status,
url,


@@ -137,6 +137,19 @@ impl Crawler {
let seed_origin = origin_key(&seed);
let seed_root_domain = root_domain(&seed);
// Reject pathological user-supplied glob patterns before they can
// exercise the recursive `**` handler in glob_match_inner. The
// matcher is a straight backtracking implementation; a deeply
// nested `**/**/**/...` pattern against a long path can degrade
// to exponential time per link checked, per page crawled.
for pat in config
.include_patterns
.iter()
.chain(config.exclude_patterns.iter())
{
validate_glob(pat)?;
}
let client = FetchClient::new(config.fetch.clone())?;
Ok(Self {
@@ -387,6 +400,26 @@ impl Crawler {
}
}
// Cap frontier size independently of max_pages. Pages like
// search-result listings or tag clouds can emit thousands of
// links per page; without this a single dense page could push
// the frontier into the tens of thousands of entries and keep
// String allocations alive even after max_pages halts crawling.
// Trim aggressively once we exceed 10× max_pages, keeping the
// most recently discovered entries which are still on-topic
// (breadth-first = siblings of the last page we saw).
let frontier_cap = self.config.max_pages.saturating_mul(10).max(100);
if next_frontier.len() > frontier_cap {
let keep = self.config.max_pages.saturating_mul(5).max(50);
warn!(
frontier = next_frontier.len(),
cap = frontier_cap,
trimmed_to = keep,
"frontier exceeded cap, truncating"
);
next_frontier.truncate(keep);
}
frontier = next_frontier;
}
@@ -546,6 +579,49 @@ fn normalize(url: &Url) -> String {
format!("{scheme}://{host}{port_suffix}{path}{query}")
}
/// Maximum number of `**` wildcards allowed in a single user glob. Each
/// additional `**` multiplies the backtracking fan-out of `glob_match_inner`
/// against adversarial paths; 4 is a practical ceiling for legitimate
/// nested include/exclude patterns and still keeps the matcher linear-ish.
const MAX_GLOB_DOUBLESTAR: usize = 4;
/// Maximum glob pattern length. Keeps a single pattern from taking
/// megabytes of RAM if someone copy-pastes garbage into --include.
const MAX_GLOB_LEN: usize = 1024;
/// Validate a user-supplied glob pattern before it hits the matcher.
/// Rejects patterns that would drive `glob_match_inner` into pathological
/// backtracking (too many `**`, excessive length).
fn validate_glob(pat: &str) -> Result<(), FetchError> {
if pat.len() > MAX_GLOB_LEN {
return Err(FetchError::Build(format!(
"glob pattern exceeds {MAX_GLOB_LEN} chars ({} given)",
pat.len()
)));
}
// Count non-overlapping occurrences of `**`.
let bytes = pat.as_bytes();
let mut count = 0usize;
let mut i = 0;
while i + 1 < bytes.len() {
if bytes[i] == b'*' && bytes[i + 1] == b'*' {
count += 1;
// Skip run of consecutive `*` so `***` counts as one.
while i < bytes.len() && bytes[i] == b'*' {
i += 1;
}
} else {
i += 1;
}
}
if count > MAX_GLOB_DOUBLESTAR {
return Err(FetchError::Build(format!(
"glob pattern has {count} `**` wildcards (max {MAX_GLOB_DOUBLESTAR})"
)));
}
Ok(())
}
/// Simple glob matching for URL paths. Supports:
/// - `*` matches any characters within a single path segment (no `/`)
/// - `**` matches any characters including `/` (any number of segments)
@@ -700,6 +776,37 @@ mod tests {
assert_eq!(root_domain(&url), "example.com");
}
// -- validate_glob tests --
#[test]
fn validate_glob_accepts_reasonable_patterns() {
assert!(validate_glob("/api/*").is_ok());
assert!(validate_glob("/api/**").is_ok());
assert!(validate_glob("/docs/**/page-*.html").is_ok());
assert!(validate_glob("/a/**/b/**/c/**/d/**").is_ok());
}
#[test]
fn validate_glob_rejects_too_many_doublestars() {
// 5 `**` exceeds MAX_GLOB_DOUBLESTAR = 4.
let pat = "/a/**/b/**/c/**/d/**/e/**";
let err = validate_glob(pat).unwrap_err();
assert!(matches!(err, FetchError::Build(ref m) if m.contains("`**` wildcards")));
}
#[test]
fn validate_glob_treats_triple_star_as_one() {
// `***` is still one run, should not count as 2.
assert!(validate_glob("/a/***/b/***/c/***/d/***").is_ok());
}
#[test]
fn validate_glob_rejects_oversized_pattern() {
let giant = "x".repeat(2048);
let err = validate_glob(&giant).unwrap_err();
assert!(matches!(err, FetchError::Build(ref m) if m.contains("exceeds")));
}
// -- glob_match tests --
#[test]