mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
polish(fetch,mcp): robots parser + firefox client cache + Acquire ordering (P3) (#23)
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run
Three P3 items from the 2026-04-16 audit. Bump to 0.3.17.

webclaw-fetch/sitemap.rs: parse_robots_txt used trimmed[..8] slice plus eq_ignore_ascii_case for the directive test. That was fragile: "Sitemap :" (space before colon) fell through silently, inline "# ..." comments leaked into the URL, and a line with no URL at all returned an empty string. Rewritten to split on the first colon, match any-case "sitemap" as the directive name, strip comments, and require `://` in the value. +7 unit tests cover case variants, space-before-colon, comments, empty values, non-URL values, and non-sitemap directives.

webclaw-fetch/crawler.rs: is_cancelled uses Ordering::Acquire instead of Relaxed. Behaviourally equivalent on current hardware for single-word atomic loads, but the explicit ordering documents intent for readers + compilers.

webclaw-mcp/server.rs: add lazy OnceLock cache for the Firefox FetchClient. Tool calls that repeatedly request the firefox profile without cookies used to build a fresh reqwest pool + TLS stack per call. Chrome (default) already used the long-lived field; Random is per-call by design; cookie-bearing requests still build ad-hoc since the cookie header is part of the client shape.

Tests: 85 webclaw-fetch (was 78, +7 new sitemap), 272 webclaw-core, 43 webclaw-llm, 11 CLI — all green. Clippy clean across workspace.

Refs: docs/AUDIT-2026-04-16.md P3 section

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d69c50a31d
commit
095ae5d4b1
6 changed files with 153 additions and 20 deletions
|
|
@ -190,11 +190,17 @@ impl Crawler {
|
|||
}
|
||||
|
||||
/// Returns true if the cancel flag has been set.
|
||||
///
|
||||
/// Uses `Acquire` load to pair with a `Release` store on the cancel
|
||||
/// path. `Relaxed` was technically fine in practice (x86/arm64 give
|
||||
/// release semantics for free on single-word stores) but `Acquire`
|
||||
/// makes the ordering explicit so the compiler and future readers
|
||||
/// don't need to reason about the memory model.
|
||||
fn is_cancelled(&self) -> bool {
|
||||
self.config
|
||||
.cancel_flag
|
||||
.as_ref()
|
||||
.is_some_and(|f| f.load(Ordering::Relaxed))
|
||||
.is_some_and(|f| f.load(Ordering::Acquire))
|
||||
}
|
||||
|
||||
/// Crawl starting from `start_url`, returning results for every page visited.
|
||||
|
|
|
|||
|
|
@ -152,18 +152,34 @@ async fn fetch_sitemaps(
|
|||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Extract `Sitemap:` directive URLs from robots.txt content.
///
/// Each line is processed independently:
///
/// 1. Everything from the first `#` onward is dropped as a comment.
/// 2. The line is split on its first `:`; lines without one are skipped.
/// 3. The part before the colon must equal `sitemap`, ignoring ASCII case
///    and surrounding whitespace (so `Sitemap :` is accepted).
/// 4. The trimmed value must contain a URL scheme separator (`://`), so
///    empty values, relative paths, and free text are rejected.
///
/// A leading UTF-8 byte-order mark is skipped so that a `Sitemap:`
/// directive on the very first line of a BOM-prefixed file is still found.
///
/// Returns the accepted URLs in the order they appear in `text`; the
/// result is empty when no valid directive is present.
pub fn parse_robots_txt(text: &str) -> Vec<String> {
    // U+FEFF is not whitespace, so `trim` would leave it glued to the
    // first directive name and the comparison with "sitemap" would fail.
    let text = text.strip_prefix('\u{feff}').unwrap_or(text);

    text.lines()
        .filter_map(|line| {
            // Strip inline `#...` comments (robots.txt convention).
            let directive = line.split_once('#').map_or(line, |(before, _)| before);

            // Split on the FIRST colon only: the URL itself contains colons.
            let (name, value) = directive.split_once(':')?;
            if !name.trim().eq_ignore_ascii_case("sitemap") {
                return None;
            }

            // An empty value cannot contain "://", so this single check
            // rejects both missing and scheme-less values.
            let url = value.trim();
            url.contains("://").then(|| url.to_string())
        })
        .collect()
}
|
||||
|
|
@ -363,6 +379,62 @@ fn parse_sitemap_index(xml: &str) -> Vec<String> {
|
|||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
fn robots_txt_basic() {
    // One well-formed directive sitting next to an unrelated one.
    let body = "User-agent: *\nSitemap: https://example.com/sitemap.xml\n";
    let expected = vec!["https://example.com/sitemap.xml".to_string()];
    assert_eq!(parse_robots_txt(body), expected);
}
|
||||
|
||||
#[test]
fn robots_txt_case_insensitive() {
    // Upper-case and lower-case directive names must both be recognised.
    let body = "SITEMAP: https://a.example.com/s.xml\nsitemap: https://b.example.com/s.xml\n";
    let found = parse_robots_txt(body);

    assert_eq!(found.len(), 2);
    for wanted in ["https://a.example.com/s.xml", "https://b.example.com/s.xml"] {
        assert!(found.contains(&wanted.to_string()));
    }
}
|
||||
|
||||
#[test]
fn robots_txt_tolerates_space_before_colon() {
    // Some malformed generators put a space between the name and the colon.
    let body = "Sitemap : https://example.com/sitemap.xml\n";
    let expected = vec!["https://example.com/sitemap.xml".to_string()];
    assert_eq!(parse_robots_txt(body), expected);
}
|
||||
|
||||
#[test]
fn robots_txt_strips_inline_comments() {
    // The trailing `# ...` comment must not end up inside the URL.
    let body = "Sitemap: https://example.com/s.xml # main sitemap\n";
    let expected = vec!["https://example.com/s.xml".to_string()];
    assert_eq!(parse_robots_txt(body), expected);
}
|
||||
|
||||
#[test]
fn robots_txt_rejects_empty_value() {
    // A directive with nothing (or only whitespace) after the colon is dropped.
    let found = parse_robots_txt("Sitemap:\nSitemap: \n");
    assert!(found.is_empty());
}
|
||||
|
||||
#[test]
fn robots_txt_rejects_non_url_value() {
    // Neither a relative path nor free text carries a scheme, so both are dropped.
    let found = parse_robots_txt("Sitemap: /sitemap.xml\nSitemap: junk text\n");
    assert!(found.is_empty());
}
|
||||
|
||||
#[test]
fn robots_txt_ignores_non_sitemap_directives() {
    // Directives other than `Sitemap` must never contribute a URL.
    let found = parse_robots_txt("User-agent: *\nDisallow: /admin\nAllow: /\n");
    assert!(found.is_empty());
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_urlset() {
|
||||
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue