polish(fetch,mcp): robots parser + firefox client cache + Acquire ordering (P3) (#23)

Three P3 items from the 2026-04-16 audit. Bump to 0.3.17.

webclaw-fetch/sitemap.rs: parse_robots_txt used trimmed[..8] slice plus eq_ignore_ascii_case for the directive test. That was fragile: "Sitemap :" (space before colon) fell through silently, inline "# ..." comments leaked into the URL, and a line with no URL at all returned an empty string. Rewritten to split on the first colon, match any-case "sitemap" as the directive name, strip comments, and require `://` in the value. +7 unit tests cover case variants, space-before-colon, comments, empty values, non-URL values, and non-sitemap directives.

webclaw-fetch/crawler.rs: is_cancelled uses Ordering::Acquire instead of Relaxed. Behaviourally equivalent on current hardware for single-word atomic loads, but the explicit ordering documents intent for readers + compilers.

webclaw-mcp/server.rs: add lazy OnceLock cache for the Firefox FetchClient. Tool calls that repeatedly request the firefox profile without cookies used to build a fresh reqwest pool + TLS stack per call. Chrome (default) already used the long-lived field; Random is per-call by design; cookie-bearing requests still build ad-hoc since the cookie header is part of the client shape.

Tests: 85 webclaw-fetch (was 78, +7 new sitemap), 272 webclaw-core, 43 webclaw-llm, 11 CLI — all green. Clippy clean across workspace.

Refs: docs/AUDIT-2026-04-16.md P3 section

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-06 22:05:13 +02:00 · 2026-04-16 20:21:32 +02:00 · 2026-04-16 20:21:32 +02:00 · 095ae5d4b1
commit 095ae5d4b1
parent d69c50a31d
6 changed files with 153 additions and 20 deletions
--- a/crates/webclaw-fetch/src/crawler.rs
+++ b/crates/webclaw-fetch/src/crawler.rs
@@ -190,11 +190,17 @@ impl Crawler {
    }

    /// Returns true if the cancel flag has been set.
+    ///
+    /// Uses an `Acquire` load, intended to pair with a `Release` store
+    /// on the cancel path. `Relaxed` is enough when the flag itself is
+    /// the only thing being communicated (atomicity does not depend on
+    /// the ordering), but `Acquire` makes the intended ordering explicit
+    /// so future readers don't need to reason about the memory model.
    fn is_cancelled(&self) -> bool {
        self.config
            .cancel_flag
            .as_ref()
-            .is_some_and(|f| f.load(Ordering::Relaxed))
+            .is_some_and(|f| f.load(Ordering::Acquire))
    }

    /// Crawl starting from `start_url`, returning results for every page visited.
--- a/crates/webclaw-fetch/src/sitemap.rs
+++ b/crates/webclaw-fetch/src/sitemap.rs
@@ -152,18 +152,34 @@ async fn fetch_sitemaps(
 // ---------------------------------------------------------------------------

 /// Extract `Sitemap:` directive URLs from robots.txt content.
+///
+/// Handles case-insensitive directive names, optional whitespace before
+/// the colon, and strips inline `# ...` comments. Rejects values without
+/// a URL scheme (`://`) so a malformed directive doesn't turn an empty
+/// or garbage string into a "sitemap URL".
 pub fn parse_robots_txt(text: &str) -> Vec<String> {
    text.lines()
        .filter_map(|line| {
+            // Strip inline `#...` comments (robots.txt convention).
+            let line = match line.split_once('#') {
+                Some((before, _)) => before,
+                None => line,
+            };
            let trimmed = line.trim();
-            // Case-insensitive match for "Sitemap:" prefix
-            if trimmed.len() > 8 && trimmed[..8].eq_ignore_ascii_case("sitemap:") {
-                let url = trimmed[8..].trim();
-                if !url.is_empty() {
-                    return Some(url.to_string());
-                }
+            // Find the colon that terminates the directive name; reject
+            // lines that don't have one. Anything between the start and
+            // the colon that matches "sitemap" case-insensitively is a hit.
+            let colon = trimmed.find(':')?;
+            let (name, rest) = trimmed.split_at(colon);
+            if !name.trim().eq_ignore_ascii_case("sitemap") {
+                return None;
            }
-            None
+            // Skip the colon itself, then trim.
+            let url = rest[1..].trim();
+            if url.is_empty() || !url.contains("://") {
+                return None;
+            }
+            Some(url.to_string())
        })
        .collect()
 }
@@ -363,6 +379,62 @@ fn parse_sitemap_index(xml: &str) -> Vec<String> {
 mod tests {
    use super::*;

+    #[test]
+    fn robots_txt_basic() {
+        let t = "User-agent: *\nSitemap: https://example.com/sitemap.xml\n";
+        assert_eq!(
+            parse_robots_txt(t),
+            vec!["https://example.com/sitemap.xml".to_string()]
+        );
+    }
+
+    #[test]
+    fn robots_txt_case_insensitive() {
+        let t = "SITEMAP: https://a.example.com/s.xml\nsitemap: https://b.example.com/s.xml\n";
+        let got = parse_robots_txt(t);
+        assert_eq!(got.len(), 2);
+        assert!(got.contains(&"https://a.example.com/s.xml".to_string()));
+        assert!(got.contains(&"https://b.example.com/s.xml".to_string()));
+    }
+
+    #[test]
+    fn robots_txt_tolerates_space_before_colon() {
+        // Some malformed generators emit `Sitemap :` with a space.
+        let t = "Sitemap : https://example.com/sitemap.xml\n";
+        assert_eq!(
+            parse_robots_txt(t),
+            vec!["https://example.com/sitemap.xml".to_string()]
+        );
+    }
+
+    #[test]
+    fn robots_txt_strips_inline_comments() {
+        let t = "Sitemap: https://example.com/s.xml # main sitemap\n";
+        assert_eq!(
+            parse_robots_txt(t),
+            vec!["https://example.com/s.xml".to_string()]
+        );
+    }
+
+    #[test]
+    fn robots_txt_rejects_empty_value() {
+        let t = "Sitemap:\nSitemap:   \n";
+        assert!(parse_robots_txt(t).is_empty());
+    }
+
+    #[test]
+    fn robots_txt_rejects_non_url_value() {
+        // "Sitemap: /relative/path" has no scheme; don't blindly accept.
+        let t = "Sitemap: /sitemap.xml\nSitemap: junk text\n";
+        assert!(parse_robots_txt(t).is_empty());
+    }
+
+    #[test]
+    fn robots_txt_ignores_non_sitemap_directives() {
+        let t = "User-agent: *\nDisallow: /admin\nAllow: /\n";
+        assert!(parse_robots_txt(t).is_empty());
+    }
+
    #[test]
    fn test_parse_urlset() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>