diff --git a/CHANGELOG.md b/CHANGELOG.md index 5079bbc..b923073 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,17 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.3.17] — 2026-04-16 + +### Changed +- **`webclaw-fetch::sitemap::parse_robots_txt` now does proper directive parsing.** The previous `trimmed[..8].eq_ignore_ascii_case("sitemap:")` slice couldn't handle "Sitemap :" (space before colon) from bad generators, didn't strip inline `# ...` comments, and would have returned empty/garbage values if a directive line had no URL. Now splits on the first colon, matches any-case `sitemap` as the directive name, strips comments, and requires the value to contain `://` before accepting it. Seven new unit tests cover case variants, space-before-colon, inline comments, non-URL values, and non-sitemap directives. +- **`webclaw-fetch::crawler::is_cancelled` uses `Ordering::Acquire`** (was `Relaxed`). Technically equivalent on x86/arm64 for single-word loads, but the explicit ordering documents the synchronization intent for readers and the compiler. + +### Added +- **`webclaw-mcp` caches the Firefox FetchClient lazily.** Tool calls that repeatedly request the Firefox profile without cookies used to build a fresh reqwest pool + TLS stack per call; a single `OnceLock` keeps the client alive for the life of the server. Chrome (default) and Random (by design per-call) are unaffected. 
+ +--- + ## [0.3.16] — 2026-04-16 ### Hardened diff --git a/Cargo.lock b/Cargo.lock index 09bec62..a57de1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3102,7 +3102,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.3.16" +version = "0.3.17" dependencies = [ "clap", "dotenvy", @@ -3123,7 +3123,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.3.16" +version = "0.3.17" dependencies = [ "ego-tree", "once_cell", @@ -3141,7 +3141,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.3.16" +version = "0.3.17" dependencies = [ "bytes", "calamine", @@ -3163,7 +3163,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.3.16" +version = "0.3.17" dependencies = [ "async-trait", "reqwest", @@ -3176,7 +3176,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.3.16" +version = "0.3.17" dependencies = [ "dirs", "dotenvy", @@ -3197,7 +3197,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.3.16" +version = "0.3.17" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index f8587ca..e757628 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.3.16" +version = "0.3.17" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-fetch/src/crawler.rs b/crates/webclaw-fetch/src/crawler.rs index 740c479..1403256 100644 --- a/crates/webclaw-fetch/src/crawler.rs +++ b/crates/webclaw-fetch/src/crawler.rs @@ -190,11 +190,17 @@ impl Crawler { } /// Returns true if the cancel flag has been set. + /// + /// Uses `Acquire` load to pair with a `Release` store on the cancel + /// path. `Relaxed` was technically fine in practice (x86/arm64 give + /// release semantics for free on single-word stores) but `Acquire` + /// makes the ordering explicit so the compiler and future readers + /// don't need to reason about the memory model. 
fn is_cancelled(&self) -> bool { self.config .cancel_flag .as_ref() - .is_some_and(|f| f.load(Ordering::Relaxed)) + .is_some_and(|f| f.load(Ordering::Acquire)) } /// Crawl starting from `start_url`, returning results for every page visited. diff --git a/crates/webclaw-fetch/src/sitemap.rs b/crates/webclaw-fetch/src/sitemap.rs index 05ee394..931db32 100644 --- a/crates/webclaw-fetch/src/sitemap.rs +++ b/crates/webclaw-fetch/src/sitemap.rs @@ -152,18 +152,34 @@ async fn fetch_sitemaps( // --------------------------------------------------------------------------- /// Extract `Sitemap:` directive URLs from robots.txt content. +/// +/// Handles case-insensitive directive names, optional whitespace before +/// the colon, and strips inline `# ...` comments. Rejects values without +/// a URL scheme (`://`) so a malformed directive doesn't turn an empty +/// or garbage string into a "sitemap URL". pub fn parse_robots_txt(text: &str) -> Vec { text.lines() .filter_map(|line| { + // Strip inline `#...` comments (robots.txt convention). + let line = match line.split_once('#') { + Some((before, _)) => before, + None => line, + }; let trimmed = line.trim(); - // Case-insensitive match for "Sitemap:" prefix - if trimmed.len() > 8 && trimmed[..8].eq_ignore_ascii_case("sitemap:") { - let url = trimmed[8..].trim(); - if !url.is_empty() { - return Some(url.to_string()); - } + // Find the colon that terminates the directive name; reject + // lines that don't have one. Anything between the start and + // the colon that matches "sitemap" case-insensitively is a hit. + let colon = trimmed.find(':')?; + let (name, rest) = trimmed.split_at(colon); + if !name.trim().eq_ignore_ascii_case("sitemap") { + return None; } - None + // Skip the colon itself, then trim. 
+ let url = rest[1..].trim(); + if url.is_empty() || !url.contains("://") { + return None; + } + Some(url.to_string()) }) .collect() } @@ -363,6 +379,62 @@ fn parse_sitemap_index(xml: &str) -> Vec { mod tests { use super::*; + #[test] + fn robots_txt_basic() { + let t = "User-agent: *\nSitemap: https://example.com/sitemap.xml\n"; + assert_eq!( + parse_robots_txt(t), + vec!["https://example.com/sitemap.xml".to_string()] + ); + } + + #[test] + fn robots_txt_case_insensitive() { + let t = "SITEMAP: https://a.example.com/s.xml\nsitemap: https://b.example.com/s.xml\n"; + let got = parse_robots_txt(t); + assert_eq!(got.len(), 2); + assert!(got.contains(&"https://a.example.com/s.xml".to_string())); + assert!(got.contains(&"https://b.example.com/s.xml".to_string())); + } + + #[test] + fn robots_txt_tolerates_space_before_colon() { + // Some malformed generators emit `Sitemap :` with a space. + let t = "Sitemap : https://example.com/sitemap.xml\n"; + assert_eq!( + parse_robots_txt(t), + vec!["https://example.com/sitemap.xml".to_string()] + ); + } + + #[test] + fn robots_txt_strips_inline_comments() { + let t = "Sitemap: https://example.com/s.xml # main sitemap\n"; + assert_eq!( + parse_robots_txt(t), + vec!["https://example.com/s.xml".to_string()] + ); + } + + #[test] + fn robots_txt_rejects_empty_value() { + let t = "Sitemap:\nSitemap: \n"; + assert!(parse_robots_txt(t).is_empty()); + } + + #[test] + fn robots_txt_rejects_non_url_value() { + // "Sitemap: /relative/path" has no scheme; don't blindly accept. 
+ let t = "Sitemap: /sitemap.xml\nSitemap: junk text\n"; + assert!(parse_robots_txt(t).is_empty()); + } + + #[test] + fn robots_txt_ignores_non_sitemap_directives() { + let t = "User-agent: *\nDisallow: /admin\nAllow: /\n"; + assert!(parse_robots_txt(t).is_empty()); + } + #[test] fn test_parse_urlset() { let xml = r#" diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 12514bf..cdb79f0 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -4,7 +4,7 @@ /// Uses a local-first architecture: fetches pages directly, then falls back /// to the webclaw cloud API (api.webclaw.io) when bot protection or /// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback. -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use std::time::Duration; use rmcp::handler::server::router::tool::ToolRouter; @@ -21,6 +21,17 @@ use crate::tools::*; pub struct WebclawMcp { tool_router: ToolRouter, fetch_client: Arc, + /// Lazily-initialized Firefox client, reused across all tool calls that + /// request the Firefox profile with no cookies. Building a FetchClient + /// allocates a reqwest pool + TLS state, so tools that repeatedly hit + /// the same profile used to pay that cost per call. + /// + /// We don't cache the default Chrome client (already lives at + /// `fetch_client`), we don't cache Random (by design rotates per call), + /// and we don't cache cookie-bearing clients (the cookie header is + /// part of the shape and varies per call). So a single OnceLock for + /// Firefox covers the one real cacheable case without a full LRU. + firefox_client: OnceLock>, llm_chain: Option, cloud: Option, } @@ -109,11 +120,33 @@ impl WebclawMcp { Self { tool_router: Self::tool_router(), fetch_client: Arc::new(fetch_client), + firefox_client: OnceLock::new(), llm_chain, cloud, } } + /// Return the cached Firefox client, building it on first use. 
Any + /// subsequent call with the same profile (and no cookies) reuses the + /// pool instead of allocating a fresh reqwest + TLS stack. + fn firefox_or_build(&self) -> Result, String> { + if let Some(c) = self.firefox_client.get() { + return Ok(c.clone()); + } + let config = webclaw_fetch::FetchConfig { + browser: webclaw_fetch::BrowserProfile::Firefox, + ..Default::default() + }; + let client = Arc::new( + webclaw_fetch::FetchClient::new(config) + .map_err(|e| format!("Failed to build firefox client: {e}"))?, + ); + // If a concurrent call raced us here, OnceLock keeps the first + // winner and drops ours — that's fine, the ref-count just cleans up. + let _ = self.firefox_client.set(client.clone()); + Ok(self.firefox_client.get().cloned().unwrap_or(client)) + } + /// Helper: smart fetch with LLM format for extract/summarize tools. async fn smart_fetch_llm(&self, url: &str) -> Result { cloud::smart_fetch( @@ -147,10 +180,23 @@ impl WebclawMcp { .map(|c| c.join("; ")); // Use a custom client if non-default browser or cookies are provided + // Pick the cheapest route to a FetchClient for this request: + // 1. Default Chrome + no cookies → reuse the long-lived self.fetch_client + // 2. Firefox + no cookies → reuse the lazily-cached self.firefox_client + // 3. Anything with cookies, or Random → build ad-hoc let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome); - let needs_custom = !is_default_browser || cookie_header.is_some(); let custom_client; - let client: &webclaw_fetch::FetchClient = if needs_custom { + let cached_firefox; + let client: &webclaw_fetch::FetchClient = if cookie_header.is_none() && is_default_browser { + &self.fetch_client + } else if cookie_header.is_none() + && matches!(browser, webclaw_fetch::BrowserProfile::Firefox) + { + cached_firefox = self.firefox_or_build()?; + // cached_firefox lives to end of function; auto-deref gives + // us &FetchClient here. 
+ &cached_firefox + } else { let mut headers = std::collections::HashMap::new(); headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string()); if let Some(ref cookies) = cookie_header { @@ -164,8 +210,6 @@ impl WebclawMcp { custom_client = webclaw_fetch::FetchClient::new(config) .map_err(|e| format!("Failed to build client: {e}"))?; &custom_client - } else { - &self.fetch_client }; let formats = [format];