mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
polish(fetch,mcp): robots parser + firefox client cache + Acquire ordering (P3) (#23)
Three P3 items from the 2026-04-16 audit. Bump to 0.3.17. webclaw-fetch/sitemap.rs: parse_robots_txt used trimmed[..8] slice plus eq_ignore_ascii_case for the directive test. That was fragile: "Sitemap :" (space before colon) fell through silently, inline "# ..." comments leaked into the URL, and a line with no URL at all returned an empty string. Rewritten to split on the first colon, match any-case "sitemap" as the directive name, strip comments, and require `://` in the value. +7 unit tests cover case variants, space-before-colon, comments, empty values, non-URL values, and non-sitemap directives. webclaw-fetch/crawler.rs: is_cancelled uses Ordering::Acquire instead of Relaxed. Behaviourally equivalent on current hardware for single-word atomic loads, but the explicit ordering documents intent for readers + compilers. webclaw-mcp/server.rs: add lazy OnceLock cache for the Firefox FetchClient. Tool calls that repeatedly request the firefox profile without cookies used to build a fresh reqwest pool + TLS stack per call. Chrome (default) already used the long-lived field; Random is per-call by design; cookie-bearing requests still build ad-hoc since the cookie header is part of the client shape. Tests: 85 webclaw-fetch (was 78, +7 new sitemap), 272 webclaw-core, 43 webclaw-llm, 11 CLI — all green. Clippy clean across workspace. Refs: docs/AUDIT-2026-04-16.md P3 section Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d69c50a31d
commit
095ae5d4b1
6 changed files with 153 additions and 20 deletions
11
CHANGELOG.md
11
CHANGELOG.md
|
|
@ -3,6 +3,17 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.3.17] — 2026-04-16
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- **`webclaw-fetch::sitemap::parse_robots_txt` now does proper directive parsing.** The previous `trimmed[..8].eq_ignore_ascii_case("sitemap:")` slice couldn't handle "Sitemap :" (space before colon) from bad generators, didn't strip inline `# ...` comments, and would have returned empty/garbage values if a directive line had no URL. Now splits on the first colon, matches any-case `sitemap` as the directive name, strips comments, and requires the value to contain `://` before accepting it. Seven new unit tests cover case variants, space-before-colon, inline comments, empty values, non-URL values, and non-sitemap directives.
|
||||||
|
- **`webclaw-fetch::crawler::is_cancelled` uses `Ordering::Acquire`** (was `Relaxed`). Behaviourally equivalent for this single-flag check in practice (x86's TSO gives plain loads acquire semantics anyway; on arm64 `Acquire` compiles to an explicit LDAR), but the explicit ordering documents the synchronization intent for readers and the compiler.
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **`webclaw-mcp` caches the Firefox FetchClient lazily.** Tool calls that repeatedly request the Firefox profile without cookies used to build a fresh reqwest pool + TLS stack per call; a single `OnceLock` keeps the client alive for the life of the server. Chrome (default) and Random (by design per-call) are unaffected.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.3.16] — 2026-04-16
|
## [0.3.16] — 2026-04-16
|
||||||
|
|
||||||
### Hardened
|
### Hardened
|
||||||
|
|
|
||||||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3102,7 +3102,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.3.16"
|
version = "0.3.17"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3123,7 +3123,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.3.16"
|
version = "0.3.17"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3141,7 +3141,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.3.16"
|
version = "0.3.17"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"calamine",
|
"calamine",
|
||||||
|
|
@ -3163,7 +3163,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.3.16"
|
version = "0.3.17"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3176,7 +3176,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.3.16"
|
version = "0.3.17"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3197,7 +3197,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.3.16"
|
version = "0.3.17"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.3.16"
|
version = "0.3.17"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -190,11 +190,17 @@ impl Crawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns true if the cancel flag has been set.
|
/// Returns true if the cancel flag has been set.
|
||||||
|
///
|
||||||
|
/// Uses `Acquire` load to pair with a `Release` store on the cancel
|
||||||
|
/// path. `Relaxed` was fine in practice for this flag (x86's TSO gives
|
||||||
|
/// plain loads acquire semantics; arm64 needs an LDAR) but `Acquire`
|
||||||
|
/// makes the ordering explicit so the compiler and future readers
|
||||||
|
/// don't need to reason about the memory model.
|
||||||
fn is_cancelled(&self) -> bool {
|
fn is_cancelled(&self) -> bool {
|
||||||
self.config
|
self.config
|
||||||
.cancel_flag
|
.cancel_flag
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.is_some_and(|f| f.load(Ordering::Relaxed))
|
.is_some_and(|f| f.load(Ordering::Acquire))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Crawl starting from `start_url`, returning results for every page visited.
|
/// Crawl starting from `start_url`, returning results for every page visited.
|
||||||
|
|
|
||||||
|
|
@ -152,18 +152,34 @@ async fn fetch_sitemaps(
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
/// Extract `Sitemap:` directive URLs from robots.txt content.
|
/// Extract `Sitemap:` directive URLs from robots.txt content.
|
||||||
|
///
|
||||||
|
/// Handles case-insensitive directive names, optional whitespace before
|
||||||
|
/// the colon, and strips inline `# ...` comments. Rejects values without
|
||||||
|
/// a URL scheme (`://`) so a malformed directive doesn't turn an empty
|
||||||
|
/// or garbage string into a "sitemap URL".
|
||||||
pub fn parse_robots_txt(text: &str) -> Vec<String> {
|
pub fn parse_robots_txt(text: &str) -> Vec<String> {
|
||||||
text.lines()
|
text.lines()
|
||||||
.filter_map(|line| {
|
.filter_map(|line| {
|
||||||
|
// Strip inline `#...` comments (robots.txt convention).
|
||||||
|
let line = match line.split_once('#') {
|
||||||
|
Some((before, _)) => before,
|
||||||
|
None => line,
|
||||||
|
};
|
||||||
let trimmed = line.trim();
|
let trimmed = line.trim();
|
||||||
// Case-insensitive match for "Sitemap:" prefix
|
// Find the colon that terminates the directive name; reject
|
||||||
if trimmed.len() > 8 && trimmed[..8].eq_ignore_ascii_case("sitemap:") {
|
// lines that don't have one. Anything between the start and
|
||||||
let url = trimmed[8..].trim();
|
// the colon that matches "sitemap" case-insensitively is a hit.
|
||||||
if !url.is_empty() {
|
let colon = trimmed.find(':')?;
|
||||||
return Some(url.to_string());
|
let (name, rest) = trimmed.split_at(colon);
|
||||||
|
if !name.trim().eq_ignore_ascii_case("sitemap") {
|
||||||
|
return None;
|
||||||
}
|
}
|
||||||
|
// Skip the colon itself, then trim.
|
||||||
|
let url = rest[1..].trim();
|
||||||
|
if url.is_empty() || !url.contains("://") {
|
||||||
|
return None;
|
||||||
}
|
}
|
||||||
None
|
Some(url.to_string())
|
||||||
})
|
})
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
@ -363,6 +379,62 @@ fn parse_sitemap_index(xml: &str) -> Vec<String> {
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn robots_txt_basic() {
|
||||||
|
let t = "User-agent: *\nSitemap: https://example.com/sitemap.xml\n";
|
||||||
|
assert_eq!(
|
||||||
|
parse_robots_txt(t),
|
||||||
|
vec!["https://example.com/sitemap.xml".to_string()]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn robots_txt_case_insensitive() {
|
||||||
|
let t = "SITEMAP: https://a.example.com/s.xml\nsitemap: https://b.example.com/s.xml\n";
|
||||||
|
let got = parse_robots_txt(t);
|
||||||
|
assert_eq!(got.len(), 2);
|
||||||
|
assert!(got.contains(&"https://a.example.com/s.xml".to_string()));
|
||||||
|
assert!(got.contains(&"https://b.example.com/s.xml".to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn robots_txt_tolerates_space_before_colon() {
|
||||||
|
// Some malformed generators emit `Sitemap :` with a space.
|
||||||
|
let t = "Sitemap : https://example.com/sitemap.xml\n";
|
||||||
|
assert_eq!(
|
||||||
|
parse_robots_txt(t),
|
||||||
|
vec!["https://example.com/sitemap.xml".to_string()]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn robots_txt_strips_inline_comments() {
|
||||||
|
let t = "Sitemap: https://example.com/s.xml # main sitemap\n";
|
||||||
|
assert_eq!(
|
||||||
|
parse_robots_txt(t),
|
||||||
|
vec!["https://example.com/s.xml".to_string()]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn robots_txt_rejects_empty_value() {
|
||||||
|
let t = "Sitemap:\nSitemap: \n";
|
||||||
|
assert!(parse_robots_txt(t).is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn robots_txt_rejects_non_url_value() {
|
||||||
|
// "Sitemap: /relative/path" has no scheme; don't blindly accept.
|
||||||
|
let t = "Sitemap: /sitemap.xml\nSitemap: junk text\n";
|
||||||
|
assert!(parse_robots_txt(t).is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn robots_txt_ignores_non_sitemap_directives() {
|
||||||
|
let t = "User-agent: *\nDisallow: /admin\nAllow: /\n";
|
||||||
|
assert!(parse_robots_txt(t).is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_parse_urlset() {
|
fn test_parse_urlset() {
|
||||||
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@
|
||||||
/// Uses a local-first architecture: fetches pages directly, then falls back
|
/// Uses a local-first architecture: fetches pages directly, then falls back
|
||||||
/// to the webclaw cloud API (api.webclaw.io) when bot protection or
|
/// to the webclaw cloud API (api.webclaw.io) when bot protection or
|
||||||
/// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback.
|
/// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback.
|
||||||
use std::sync::Arc;
|
use std::sync::{Arc, OnceLock};
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use rmcp::handler::server::router::tool::ToolRouter;
|
use rmcp::handler::server::router::tool::ToolRouter;
|
||||||
|
|
@ -21,6 +21,17 @@ use crate::tools::*;
|
||||||
pub struct WebclawMcp {
|
pub struct WebclawMcp {
|
||||||
tool_router: ToolRouter<Self>,
|
tool_router: ToolRouter<Self>,
|
||||||
fetch_client: Arc<webclaw_fetch::FetchClient>,
|
fetch_client: Arc<webclaw_fetch::FetchClient>,
|
||||||
|
/// Lazily-initialized Firefox client, reused across all tool calls that
|
||||||
|
/// request the Firefox profile with no cookies. Building a FetchClient
|
||||||
|
/// allocates a reqwest pool + TLS state, so tools that repeatedly hit
|
||||||
|
/// the same profile used to pay that cost per call.
|
||||||
|
///
|
||||||
|
/// We don't cache the default Chrome client (already lives at
|
||||||
|
/// `fetch_client`), we don't cache Random (by design rotates per call),
|
||||||
|
/// and we don't cache cookie-bearing clients (the cookie header is
|
||||||
|
/// part of the shape and varies per call). So a single OnceLock for
|
||||||
|
/// Firefox covers the one real cacheable case without a full LRU.
|
||||||
|
firefox_client: OnceLock<Arc<webclaw_fetch::FetchClient>>,
|
||||||
llm_chain: Option<webclaw_llm::ProviderChain>,
|
llm_chain: Option<webclaw_llm::ProviderChain>,
|
||||||
cloud: Option<CloudClient>,
|
cloud: Option<CloudClient>,
|
||||||
}
|
}
|
||||||
|
|
@ -109,11 +120,33 @@ impl WebclawMcp {
|
||||||
Self {
|
Self {
|
||||||
tool_router: Self::tool_router(),
|
tool_router: Self::tool_router(),
|
||||||
fetch_client: Arc::new(fetch_client),
|
fetch_client: Arc::new(fetch_client),
|
||||||
|
firefox_client: OnceLock::new(),
|
||||||
llm_chain,
|
llm_chain,
|
||||||
cloud,
|
cloud,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the cached Firefox client, building it on first use. Any
|
||||||
|
/// subsequent call with the same profile (and no cookies) reuses the
|
||||||
|
/// pool instead of allocating a fresh reqwest + TLS stack.
|
||||||
|
fn firefox_or_build(&self) -> Result<Arc<webclaw_fetch::FetchClient>, String> {
|
||||||
|
if let Some(c) = self.firefox_client.get() {
|
||||||
|
return Ok(c.clone());
|
||||||
|
}
|
||||||
|
let config = webclaw_fetch::FetchConfig {
|
||||||
|
browser: webclaw_fetch::BrowserProfile::Firefox,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let client = Arc::new(
|
||||||
|
webclaw_fetch::FetchClient::new(config)
|
||||||
|
.map_err(|e| format!("Failed to build firefox client: {e}"))?,
|
||||||
|
);
|
||||||
|
// If a concurrent call raced us here, OnceLock keeps the first
|
||||||
|
// winner and drops ours — that's fine, the ref-count just cleans up.
|
||||||
|
let _ = self.firefox_client.set(client.clone());
|
||||||
|
Ok(self.firefox_client.get().cloned().unwrap_or(client))
|
||||||
|
}
|
||||||
|
|
||||||
/// Helper: smart fetch with LLM format for extract/summarize tools.
|
/// Helper: smart fetch with LLM format for extract/summarize tools.
|
||||||
async fn smart_fetch_llm(&self, url: &str) -> Result<SmartFetchResult, String> {
|
async fn smart_fetch_llm(&self, url: &str) -> Result<SmartFetchResult, String> {
|
||||||
cloud::smart_fetch(
|
cloud::smart_fetch(
|
||||||
|
|
@ -147,10 +180,23 @@ impl WebclawMcp {
|
||||||
.map(|c| c.join("; "));
|
.map(|c| c.join("; "));
|
||||||
|
|
||||||
// Use a custom client if non-default browser or cookies are provided
|
// Use a custom client if non-default browser or cookies are provided
|
||||||
|
// Pick the cheapest route to a FetchClient for this request:
|
||||||
|
// 1. Default Chrome + no cookies → reuse the long-lived self.fetch_client
|
||||||
|
// 2. Firefox + no cookies → reuse the lazily-cached self.firefox_client
|
||||||
|
// 3. Anything with cookies, or Random → build ad-hoc
|
||||||
let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome);
|
let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome);
|
||||||
let needs_custom = !is_default_browser || cookie_header.is_some();
|
|
||||||
let custom_client;
|
let custom_client;
|
||||||
let client: &webclaw_fetch::FetchClient = if needs_custom {
|
let cached_firefox;
|
||||||
|
let client: &webclaw_fetch::FetchClient = if cookie_header.is_none() && is_default_browser {
|
||||||
|
&self.fetch_client
|
||||||
|
} else if cookie_header.is_none()
|
||||||
|
&& matches!(browser, webclaw_fetch::BrowserProfile::Firefox)
|
||||||
|
{
|
||||||
|
cached_firefox = self.firefox_or_build()?;
|
||||||
|
// cached_firefox lives to end of function; auto-deref gives
|
||||||
|
// us &FetchClient here.
|
||||||
|
&cached_firefox
|
||||||
|
} else {
|
||||||
let mut headers = std::collections::HashMap::new();
|
let mut headers = std::collections::HashMap::new();
|
||||||
headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());
|
headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());
|
||||||
if let Some(ref cookies) = cookie_header {
|
if let Some(ref cookies) = cookie_header {
|
||||||
|
|
@ -164,8 +210,6 @@ impl WebclawMcp {
|
||||||
custom_client = webclaw_fetch::FetchClient::new(config)
|
custom_client = webclaw_fetch::FetchClient::new(config)
|
||||||
.map_err(|e| format!("Failed to build client: {e}"))?;
|
.map_err(|e| format!("Failed to build client: {e}"))?;
|
||||||
&custom_client
|
&custom_client
|
||||||
} else {
|
|
||||||
&self.fetch_client
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let formats = [format];
|
let formats = [format];
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue