diff --git a/.gitignore b/.gitignore
index f97d040..7a5a785 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,15 @@
 target/
 .DS_Store
 .env
+.env.*
 proxies.txt
 .claude/skills/
-*.json
+# Scratch / local artifacts (previously covered by overbroad `*.json`,
+# which would have also swallowed package.json, components.json,
+# .smithery/*.json if they were ever modified).
+*.local.json
+local-test-results.json
+# The CLI research command dumps JSON output keyed on the query; those
+# dumps aren't code and shouldn't live in git. Track deliberately-saved
+# research output under a different name.
+research-*.json
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 96ed417..5079bbc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,19 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.3.16] — 2026-04-16
+
+### Hardened
+- **Response body caps across fetch + LLM providers (P2).** Every HTTP response buffered from the network is now rejected if it exceeds a hard size cap. `webclaw-fetch::Response::from_wreq` caps HTML/doc responses at 50 MB (rejecting on Content-Length before we pay for the allocation, and again after `bytes().await` as a belt-and-braces check against servers that lie about their size); `webclaw-llm` providers (anthropic / openai / ollama) cap JSON responses at 5 MB via a shared `response_json_capped` helper. Previously an adversarial or runaway upstream could push unbounded memory into the process. Closes the DoS-via-giant-body class of bugs noted in the audit.
+- **Crawler frontier cap (P2).** If the frontier grows past `max(max_pages × 10, 100)` entries after a depth level, it is truncated to `max(max_pages × 5, 50)` entries, dropping the excess links. Dense pages (tag clouds, search results) used to push the frontier into the tens of thousands even after `max_pages` halted new fetches, keeping string allocations alive long after the crawl was effectively done.
+- **Glob pattern validation (P2).** User-supplied `include_patterns` / `exclude_patterns` passed to the crawler are now rejected if they contain more than 4 `**` wildcards or exceed 1024 chars. The backtracking matcher degrades exponentially on deeply nested `**` against long paths; this keeps adversarial config files from weaponising it.
+
+### Cleanup
+- **Removed blanket `#![allow(dead_code)]` in `webclaw-cli/src/main.rs`.** No dead code surfaced; the suppression was obsolete.
+- **`.gitignore`: replaced overbroad `*.json` with specific local-artifact patterns.** The previous rule would have swallowed `package.json` / `components.json` / `.smithery/*.json` if they were ever modified.
+
+---
+
 ## [0.3.15] — 2026-04-16
 
 ### Fixed
diff --git a/Cargo.lock b/Cargo.lock
index 4bf0ec4..09bec62 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3102,7 +3102,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-cli"
-version = "0.3.15"
+version = "0.3.16"
 dependencies = [
  "clap",
  "dotenvy",
@@ -3123,7 +3123,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-core"
-version = "0.3.15"
+version = "0.3.16"
 dependencies = [
  "ego-tree",
  "once_cell",
@@ -3141,7 +3141,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-fetch"
-version = "0.3.15"
+version = "0.3.16"
 dependencies = [
  "bytes",
  "calamine",
@@ -3163,7 +3163,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-llm"
-version = "0.3.15"
+version = "0.3.16"
 dependencies = [
  "async-trait",
  "reqwest",
@@ -3176,7 +3176,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-mcp"
-version = "0.3.15"
+version = "0.3.16"
 dependencies = [
  "dirs",
  "dotenvy",
@@ -3197,7 +3197,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-pdf"
-version = "0.3.15"
+version = "0.3.16"
 dependencies = [
  "pdf-extract",
  "thiserror",
diff --git a/Cargo.toml b/Cargo.toml
index 97ead31..f8587ca 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]
 
 [workspace.package]
-version = "0.3.15"
+version = "0.3.16"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index e520d4f..8070d63 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -1,4 +1,3 @@
-#![allow(dead_code)]
 /// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
 /// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
 mod cloud;
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index 2bee533..cc6378a 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -87,9 +87,27 @@ struct Response {
     body: bytes::Bytes,
 }
 
+/// Maximum fetched body size. A single 50 MB HTML document is already
+/// several orders of magnitude past any realistic page; larger responses
+/// are either malicious (log bomb, zip-bomb decompressed) or streaming
+/// bugs. Caps the blast radius of the HTML → markdown conversion
+/// downstream (which could otherwise allocate multiple full-size Strings
+/// per page in collapse_whitespace + strip_markdown).
+const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024;
+
 impl Response {
-    /// Buffer a wreq response into an owned Response.
+    /// Buffer a wreq response into an owned Response. Rejects bodies that
+    /// advertise a Content-Length beyond [`MAX_BODY_BYTES`] before we pay
+    /// the allocation, and rejects again after the fact as a belt-and-braces
+    /// check against a lying server.
     async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
+        if let Some(len) = resp.content_length()
+            && len > MAX_BODY_BYTES
+        {
+            return Err(FetchError::BodyDecode(format!(
+                "response body {len} bytes exceeds cap {MAX_BODY_BYTES}"
+            )));
+        }
         let status = resp.status().as_u16();
         let url = resp.uri().to_string();
         let headers = resp.headers().clone();
@@ -97,6 +115,12 @@ impl Response {
             .bytes()
             .await
             .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
+        if body.len() as u64 > MAX_BODY_BYTES {
+            return Err(FetchError::BodyDecode(format!(
+                "response body {} bytes exceeds cap {MAX_BODY_BYTES}",
+                body.len()
+            )));
+        }
         Ok(Self {
             status,
             url,
diff --git a/crates/webclaw-fetch/src/crawler.rs b/crates/webclaw-fetch/src/crawler.rs
index bfb86a6..740c479 100644
--- a/crates/webclaw-fetch/src/crawler.rs
+++ b/crates/webclaw-fetch/src/crawler.rs
@@ -137,6 +137,19 @@ impl Crawler {
         let seed_origin = origin_key(&seed);
         let seed_root_domain = root_domain(&seed);
 
+        // Reject pathological user-supplied glob patterns before they can
+        // exercise the recursive `**` handler in glob_match_inner. The
+        // matcher is a straight backtracking implementation; a deeply
+        // nested `**/**/**/...` pattern against a long path can degrade
+        // to exponential time per link checked, per page crawled.
+        for pat in config
+            .include_patterns
+            .iter()
+            .chain(config.exclude_patterns.iter())
+        {
+            validate_glob(pat)?;
+        }
+
         let client = FetchClient::new(config.fetch.clone())?;
 
         Ok(Self {
@@ -387,6 +400,26 @@ impl Crawler {
                 }
             }
 
+            // Cap frontier size independently of max_pages. Pages like
+            // search-result listings or tag clouds can emit thousands of
+            // links per page; without this a single dense page could push
+            // the frontier into the tens of thousands of entries and keep
+            // String allocations alive even after max_pages halts crawling.
+            // Trim aggressively once we exceed 10× max_pages; truncate()
+            // keeps the head of the frontier, i.e. the links discovered
+            // from the first pages processed at this depth.
+            let frontier_cap = self.config.max_pages.saturating_mul(10).max(100);
+            if next_frontier.len() > frontier_cap {
+                let keep = self.config.max_pages.saturating_mul(5).max(50);
+                warn!(
+                    frontier = next_frontier.len(),
+                    cap = frontier_cap,
+                    trimmed_to = keep,
+                    "frontier exceeded cap, truncating"
+                );
+                next_frontier.truncate(keep);
+            }
+
             frontier = next_frontier;
         }
 
@@ -546,6 +579,49 @@ fn normalize(url: &Url) -> String {
     format!("{scheme}://{host}{port_suffix}{path}{query}")
 }
 
+/// Maximum number of `**` wildcards allowed in a single user glob. Each
+/// additional `**` multiplies the backtracking fan-out of `glob_match_inner`
+/// against adversarial paths; 4 is a practical ceiling for legitimate
+/// nested include/exclude patterns and still keeps the matcher linear-ish.
+const MAX_GLOB_DOUBLESTAR: usize = 4;
+
+/// Maximum glob pattern length. Keeps a single pattern from taking
+/// megabytes of RAM if someone copy-pastes garbage into --include.
+const MAX_GLOB_LEN: usize = 1024;
+
+/// Validate a user-supplied glob pattern before it hits the matcher.
+/// Rejects patterns that would drive `glob_match_inner` into pathological
+/// backtracking (too many `**`, excessive length).
+fn validate_glob(pat: &str) -> Result<(), FetchError> {
+    if pat.len() > MAX_GLOB_LEN {
+        return Err(FetchError::Build(format!(
+            "glob pattern exceeds {MAX_GLOB_LEN} chars ({} given)",
+            pat.len()
+        )));
+    }
+    // Count non-overlapping occurrences of `**`.
+    let bytes = pat.as_bytes();
+    let mut count = 0usize;
+    let mut i = 0;
+    while i + 1 < bytes.len() {
+        if bytes[i] == b'*' && bytes[i + 1] == b'*' {
+            count += 1;
+            // Skip run of consecutive `*` so `***` counts as one.
+            while i < bytes.len() && bytes[i] == b'*' {
+                i += 1;
+            }
+        } else {
+            i += 1;
+        }
+    }
+    if count > MAX_GLOB_DOUBLESTAR {
+        return Err(FetchError::Build(format!(
+            "glob pattern has {count} `**` wildcards (max {MAX_GLOB_DOUBLESTAR})"
+        )));
+    }
+    Ok(())
+}
+
 /// Simple glob matching for URL paths. Supports:
 /// - `*` matches any characters within a single path segment (no `/`)
 /// - `**` matches any characters including `/` (any number of segments)
@@ -700,6 +776,37 @@ mod tests {
         assert_eq!(root_domain(&url), "example.com");
     }
 
+    // -- validate_glob tests --
+
+    #[test]
+    fn validate_glob_accepts_reasonable_patterns() {
+        assert!(validate_glob("/api/*").is_ok());
+        assert!(validate_glob("/api/**").is_ok());
+        assert!(validate_glob("/docs/**/page-*.html").is_ok());
+        assert!(validate_glob("/a/**/b/**/c/**/d/**").is_ok());
+    }
+
+    #[test]
+    fn validate_glob_rejects_too_many_doublestars() {
+        // 5 `**` exceeds MAX_GLOB_DOUBLESTAR = 4.
+        let pat = "/a/**/b/**/c/**/d/**/e/**";
+        let err = validate_glob(pat).unwrap_err();
+        assert!(matches!(err, FetchError::Build(ref m) if m.contains("`**` wildcards")));
+    }
+
+    #[test]
+    fn validate_glob_treats_triple_star_as_one() {
+        // `***` is still one run, should not count as 2.
+        assert!(validate_glob("/a/***/b/***/c/***/d/***").is_ok());
+    }
+
+    #[test]
+    fn validate_glob_rejects_oversized_pattern() {
+        let giant = "x".repeat(2048);
+        let err = validate_glob(&giant).unwrap_err();
+        assert!(matches!(err, FetchError::Build(ref m) if m.contains("exceeds")));
+    }
+
     // -- glob_match tests --
 
     #[test]
diff --git a/crates/webclaw-llm/src/providers/anthropic.rs b/crates/webclaw-llm/src/providers/anthropic.rs
index 9852e27..71ca1f9 100644
--- a/crates/webclaw-llm/src/providers/anthropic.rs
+++ b/crates/webclaw-llm/src/providers/anthropic.rs
@@ -95,7 +95,9 @@ impl LlmProvider for AnthropicProvider {
             )));
         }
 
-        let json: serde_json::Value = resp.json().await?;
+        // Read body with a size cap so a malicious or misbehaving
+        // endpoint can't allocate unbounded memory via resp.json().
+        let json = super::response_json_capped(resp).await?;
 
         // Anthropic response: {"content": [{"type": "text", "text": "..."}]}
         let raw = json["content"][0]["text"]
diff --git a/crates/webclaw-llm/src/providers/mod.rs b/crates/webclaw-llm/src/providers/mod.rs
index 907b88e..1e6412b 100644
--- a/crates/webclaw-llm/src/providers/mod.rs
+++ b/crates/webclaw-llm/src/providers/mod.rs
@@ -2,6 +2,8 @@ pub mod anthropic;
 pub mod ollama;
 pub mod openai;
 
+use crate::error::LlmError;
+
 /// Load an API key from an explicit override or an environment variable.
 /// Returns `None` if neither is set or the value is empty.
 pub(crate) fn load_api_key(override_key: Option<String>, env_var: &str) -> Option<String> {
@@ -9,6 +11,36 @@ pub(crate) fn load_api_key(override_key: Option<String>, env_var: &str) -> Opti
     if key.is_empty() { None } else { Some(key) }
 }
 
+/// Maximum bytes we'll pull from an LLM provider response. 5 MB is already
+/// ~5× the largest real payload any of these providers emits for normal
+/// completions; anything bigger is either a streaming bug on their end or
+/// an adversarial response aimed at exhausting our memory.
+pub(crate) const MAX_RESPONSE_BYTES: u64 = 5 * 1024 * 1024;
+
+/// Read a provider response as JSON, capping total bytes at
+/// [`MAX_RESPONSE_BYTES`]. Rejects via Content-Length if the server is
+/// honest about size; otherwise reads to completion and checks the actual
+/// byte length so an unbounded body still can't swallow unbounded memory.
+pub(crate) async fn response_json_capped(
+    resp: reqwest::Response,
+) -> Result<serde_json::Value, LlmError> {
+    if let Some(len) = resp.content_length()
+        && len > MAX_RESPONSE_BYTES
+    {
+        return Err(LlmError::ProviderError(format!(
+            "response body {len} bytes exceeds cap {MAX_RESPONSE_BYTES}"
+        )));
+    }
+    let bytes = resp.bytes().await?;
+    if bytes.len() as u64 > MAX_RESPONSE_BYTES {
+        return Err(LlmError::ProviderError(format!(
+            "response body {} bytes exceeds cap {MAX_RESPONSE_BYTES}",
+            bytes.len()
+        )));
+    }
+    serde_json::from_slice(&bytes).map_err(|e| LlmError::InvalidJson(format!("response body: {e}")))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/webclaw-llm/src/providers/ollama.rs b/crates/webclaw-llm/src/providers/ollama.rs
index 4971525..9ee66c9 100644
--- a/crates/webclaw-llm/src/providers/ollama.rs
+++ b/crates/webclaw-llm/src/providers/ollama.rs
@@ -80,7 +80,9 @@ impl LlmProvider for OllamaProvider {
             )));
         }
 
-        let json: serde_json::Value = resp.json().await?;
+        // Cap response body size to defend against adversarial payloads
+        // or a runaway local model streaming gigabytes.
+        let json = super::response_json_capped(resp).await?;
 
         let raw = json["message"]["content"]
             .as_str()
diff --git a/crates/webclaw-llm/src/providers/openai.rs b/crates/webclaw-llm/src/providers/openai.rs
index 49825cd..6422cc4 100644
--- a/crates/webclaw-llm/src/providers/openai.rs
+++ b/crates/webclaw-llm/src/providers/openai.rs
@@ -91,7 +91,8 @@ impl LlmProvider for OpenAiProvider {
             )));
         }
 
-        let json: serde_json::Value = resp.json().await?;
+        // Cap response body size to defend against adversarial payloads.
+        let json = super::response_json_capped(resp).await?;
 
         let raw = json["choices"][0]["message"]["content"]
             .as_str()
diff --git a/packages/create-webclaw/server.json b/packages/create-webclaw/server.json
new file mode 100644
index 0000000..0cfc140
--- /dev/null
+++ b/packages/create-webclaw/server.json
@@ -0,0 +1,17 @@
+{
+  "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
+  "name": "io.github.0xMassi/webclaw",
+  "title": "webclaw",
+  "description": "Web extraction MCP server. Scrape, crawl, extract, summarize any URL to clean markdown.",
+  "version": "0.1.4",
+  "packages": [
+    {
+      "registryType": "npm",
+      "identifier": "create-webclaw",
+      "version": "0.1.4",
+      "transport": {
+        "type": "stdio"
+      }
+    }
+  ]
+}