diff --git a/Cargo.lock b/Cargo.lock
index 942d841..4acefe2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3263,6 +3263,7 @@ dependencies = [
  "async-trait",
  "bytes",
  "calamine",
+ "flate2",
  "http",
  "quick-xml 0.37.5",
  "rand 0.8.5",
diff --git a/crates/webclaw-cli/src/cli.rs b/crates/webclaw-cli/src/cli.rs
index 403e8cf..1221cdf 100644
--- a/crates/webclaw-cli/src/cli.rs
+++ b/crates/webclaw-cli/src/cli.rs
@@ -162,6 +162,18 @@ pub struct Cli {
     #[arg(long)]
     pub map: bool,
 
+    /// Max pages for --map's crawl fallback when the sitemap is thin [default: 150]
+    #[arg(long)]
+    pub map_pages: Option<usize>,
+
+    /// Disable --map's crawl fallback (sitemap-only discovery)
+    #[arg(long)]
+    pub no_map_crawl: bool,
+
+    /// Cap the number of URLs --map returns (default: uncapped)
+    #[arg(long)]
+    pub map_limit: Option<usize>,
+
     // -- LLM options --
     /// Extract structured JSON using LLM (pass a JSON schema string or @file)
     #[arg(long)]
diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index 39088be..1a834e4 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -35,7 +35,13 @@ fn init_logging(verbose: bool) {
         EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
     };
 
-    tracing_subscriber::fmt().with_env_filter(filter).init();
+    // Logs go to stderr, never stdout: stdout carries the actual result
+    // (markdown / JSON / URL list). A stray WARN on stdout corrupts
+    // machine-readable output — e.g. `--map --format json` piped to a parser.
+    tracing_subscriber::fmt()
+        .with_env_filter(filter)
+        .with_writer(std::io::stderr)
+        .init();
 }
 
 #[tokio::main]
diff --git a/crates/webclaw-cli/src/run.rs b/crates/webclaw-cli/src/run.rs
index 2305f64..e5a0bf3 100644
--- a/crates/webclaw-cli/src/run.rs
+++ b/crates/webclaw-cli/src/run.rs
@@ -205,12 +205,22 @@ pub async fn run_map(cli: &Cli) -> Result<(), String> {
     let client =
         FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
 
-    let entries = webclaw_fetch::sitemap::discover(&client, url)
-        .await
-        .map_err(|e| format!("sitemap discovery failed: {e}"))?;
+    // Layered discovery: sitemaps first, bounded crawl fallback when thin.
+    let mut opts = webclaw_fetch::MapOptions::default();
+    if let Some(pages) = cli.map_pages {
+        opts.max_crawl_pages = pages;
+    }
+    if cli.no_map_crawl {
+        opts.crawl_fallback = false;
+    }
+    if let Some(limit) = cli.map_limit {
+        opts.max_urls = Some(limit);
+    }
+
+    let entries = webclaw_fetch::discover_urls(&client, url, &opts).await;
 
     if entries.is_empty() {
-        eprintln!("no sitemap URLs found for {url}");
+        eprintln!("no URLs found for {url}");
     } else {
         eprintln!("discovered {} URLs", entries.len());
     }
diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml
index cc7ead2..dc2011f 100644
--- a/crates/webclaw-fetch/Cargo.toml
+++ b/crates/webclaw-fetch/Cargo.toml
@@ -29,6 +29,7 @@ reqwest = { version = "0.12", default-features = false, features = ["json", "rus
 serde_json.workspace = true
 calamine = "0.34"
 zip = "2"
+flate2 = "1"
 
 [dev-dependencies]
 tempfile = "3"
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index 035c8c5..0726cab 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -437,6 +437,27 @@ impl FetchClient {
             .await
     }
 
+    /// Fetch a URL and return the raw, undecoded response body as bytes.
+    ///
+    /// Unlike [`fetch`](Self::fetch), this does **not** run the body through
+    /// `String::from_utf8_lossy`, so binary payloads survive intact. This is
+    /// required for gzipped sitemaps (`.xml.gz`): such files are served with
+    /// `Content-Type: application/gzip` and *no* `Content-Encoding`, so wreq
+    /// never auto-inflates them — the bytes arrive as raw gzip and the lossy
+    /// String path would mangle them. Callers detect the gzip magic
+    /// (`0x1f 0x8b`) and gunzip before parsing.
+    ///
+    /// No retry wrapper: callers (sitemap discovery) already tolerate
+    /// per-URL failures by skipping. Returns `(status, body)`.
+    pub async fn fetch_raw(&self, url: &str) -> Result<(u16, bytes::Bytes), FetchError> {
+        let parsed_url = crate::url_security::validate_public_http_url(url).await?;
+        let url = parsed_url.as_str();
+        let client = self.pick_client(url);
+        let resp = client.get(url).send().await?;
+        let response = Response::from_wreq(resp).await?;
+        Ok((response.status(), response.into_body()))
+    }
+
     /// Fetch a URL then extract structured content.
     #[instrument(skip(self), fields(url = %url))]
     pub async fn fetch_and_extract(
diff --git a/crates/webclaw-fetch/src/crawler.rs b/crates/webclaw-fetch/src/crawler.rs
index 1403256..581e2f7 100644
--- a/crates/webclaw-fetch/src/crawler.rs
+++ b/crates/webclaw-fetch/src/crawler.rs
@@ -528,7 +528,7 @@ impl Crawler {
 }
 
 /// Canonical origin string for comparing same-origin: "scheme://host[:port]".
-fn origin_key(url: &Url) -> String {
+pub(crate) fn origin_key(url: &Url) -> String {
     let port_suffix = match url.port() {
         Some(p) => format!(":{p}"),
         None => String::new(),
@@ -563,7 +563,7 @@ fn root_domain(url: &Url) -> String {
 
 /// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
 /// lowercase scheme + host. Preserves query params and path case.
-fn normalize(url: &Url) -> String {
+pub(crate) fn normalize(url: &Url) -> String {
     let scheme = url.scheme();
     let host = url.host_str().unwrap_or("").to_ascii_lowercase();
     let port_suffix = match url.port() {
diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs
index 029a7b6..9fb702a 100644
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@@ -11,6 +11,7 @@ pub mod extractors;
 pub mod fetcher;
 pub mod linkedin;
 pub mod locale;
+pub mod map;
 pub mod proxy;
 pub mod reddit;
 pub mod sitemap;
@@ -24,6 +25,7 @@ pub use error::FetchError;
 pub use fetcher::Fetcher;
 pub use http::HeaderMap;
 pub use locale::{accept_language_for_tld, accept_language_for_url};
+pub use map::{MapOptions, discover_urls};
 pub use proxy::{parse_proxy_file, parse_proxy_line};
 pub use sitemap::SitemapEntry;
 pub use webclaw_pdf::PdfMode;
diff --git a/crates/webclaw-fetch/src/map.rs b/crates/webclaw-fetch/src/map.rs
new file mode 100644
index 0000000..97219fc
--- /dev/null
+++ b/crates/webclaw-fetch/src/map.rs
@@ -0,0 +1,367 @@
+//! Layered URL discovery for the `map` command.
+//!
+//! `sitemap::discover` only finds URLs a site explicitly advertises in its
+//! `sitemap.xml`. Plenty of sites have no sitemap (news.ycombinator.com), a
+//! stale one, or a thin one that lists a handful of section roots. For those,
+//! a sitemap-only map returns almost nothing.
+//!
+//! This module adds a second layer: when the sitemap yields fewer than a
+//! threshold of URLs, run a *bounded* same-origin crawl and harvest every URL
+//! it touches — fetched pages, the visited set, **and** the remaining frontier
+//! (links queued but never fetched because the page cap was hit). That last
+//! bucket is the gold: a 150-page crawl of a link-dense site surfaces several
+//! thousand frontier URLs, turning a useless map into a real one.
+//!
+//! Strategy (layered, sitemap-first):
+//! 1. Sitemaps via [`sitemap::discover`] — authoritative, carries metadata
+//!    (lastmod / priority / changefreq).
+//! 2. If sitemaps are thin (`< min_sitemap_urls`) and the fallback is enabled,
+//!    a bounded crawl fills in the rest. Crawl-discovered URLs carry no
+//!    metadata (`None` everywhere) since they come from link harvesting, not a
+//!    sitemap.
+//!
+//! Sitemap entries always come first in the returned vec; crawl-discovered
+//! URLs are appended, deduplicated against the sitemap set using the *same*
+//! normalization the crawler uses ([`crawler::normalize`]) so map output stays
+//! internally consistent.
+
+use std::collections::HashSet;
+use std::time::Duration;
+
+use url::Url;
+
+use crate::client::{FetchClient, FetchConfig};
+use crate::crawler::{self, CrawlConfig, Crawler};
+use crate::sitemap::{self, SitemapEntry};
+
+/// Tuning knobs for [`discover_urls`].
+#[derive(Debug, Clone)]
+pub struct MapOptions {
+    /// Hard cap on pages the fallback crawl will fetch. The crawl surfaces far
+    /// more URLs than this via the unfetched frontier, so a small number still
+    /// yields a large map while keeping the crawl fast and polite.
+    pub max_crawl_pages: usize,
+    /// How deep the fallback crawl follows links (1 = links off the seed only).
+    pub crawl_depth: usize,
+    /// Sitemap-URL count below which the crawl fallback kicks in. A site with a
+    /// rich sitemap (≥ this many URLs) skips the crawl entirely.
+    pub min_sitemap_urls: usize,
+    /// Master switch for the crawl fallback. When `false`, behaves exactly like
+    /// the old sitemap-only `discover`.
+    pub crawl_fallback: bool,
+    /// Optional cap on URLs returned. `None` (default) = uncapped: return every
+    /// URL discovered (the crawl is already bounded by `max_crawl_pages`, so the
+    /// uncapped set is the links harvested from the fetched pages). Set `Some(n)`
+    /// to truncate; the cap applies whether or not the crawl fallback ran.
+    pub max_urls: Option<usize>,
+}
+
+impl Default for MapOptions {
+    fn default() -> Self {
+        Self {
+            max_crawl_pages: 150,
+            crawl_depth: 2,
+            min_sitemap_urls: 200,
+            crawl_fallback: true,
+            max_urls: None,
+        }
+    }
+}
+
+/// Discover URLs for a site using the layered strategy described in the module
+/// docs: sitemaps first, then a bounded crawl fallback when the sitemap is
+/// thin.
+///
+/// Never errors — sitemap and crawl failures are swallowed and simply yield
+/// fewer URLs (an empty vec in the worst case), matching `sitemap::discover`'s
+/// "absence is not an error" contract.
+///
+/// `opts.max_urls` is applied on every return path, including the sitemap-only
+/// ones (fallback disabled, sitemap already rich, crawl could not start), so a
+/// caller-supplied limit is always honoured.
+pub async fn discover_urls(
+    client: &FetchClient,
+    base_url: &str,
+    opts: &MapOptions,
+) -> Vec<SitemapEntry> {
+    // Layer 1: sitemaps.
+    let mut entries = sitemap::discover(client, base_url)
+        .await
+        .unwrap_or_default();
+
+    // Layer 2: bounded crawl fallback, only when the sitemap is thin.
+    if opts.crawl_fallback && entries.len() < opts.min_sitemap_urls {
+        crawl_into(&mut entries, base_url, opts).await;
+    }
+
+    // Uncapped by default; only truncate if the caller set an explicit limit
+    // (sitemap entries added first keep priority).
+    apply_cap(&mut entries, opts.max_urls);
+    entries
+}
+
+/// Truncate `entries` to at most `max_urls` items; `None` leaves them untouched.
+fn apply_cap(entries: &mut Vec<SitemapEntry>, max_urls: Option<usize>) {
+    if let Some(cap) = max_urls {
+        entries.truncate(cap);
+    }
+}
+
+/// Run the bounded same-origin crawl and append every new URL it surfaces to
+/// `entries`.
+///
+/// Returns without touching `entries` when `base_url` does not parse or the
+/// crawler cannot be constructed.
+///
+/// NOTE(review): the crawl is configured with `FetchConfig::default()` rather
+/// than the settings of the `FetchClient` used for sitemap discovery — confirm
+/// whether caller-supplied fetch options are meant to carry over.
+async fn crawl_into(entries: &mut Vec<SitemapEntry>, base_url: &str, opts: &MapOptions) {
+    let Some(base_origin) = Url::parse(base_url).ok().map(|u| crawler::origin_key(&u)) else {
+        // Unparseable base URL — nothing sensible to crawl against.
+        return;
+    };
+
+    let config = CrawlConfig {
+        fetch: FetchConfig::default(),
+        max_depth: opts.crawl_depth,
+        max_pages: opts.max_crawl_pages,
+        // Politeness + scope: same-origin only (crawler default), modest delay.
+        delay: Duration::from_millis(50),
+        ..CrawlConfig::default()
+    };
+
+    let Ok(crawler) = Crawler::new(base_url, config) else {
+        return;
+    };
+
+    let result = crawler.crawl(base_url, None).await;
+
+    // Richest source first: every link harvested from each fetched page. A
+    // directory/index page holds hundreds of same-origin links, and this set is
+    // NOT bound by the crawler's internal frontier cap. Then the URLs the crawl
+    // itself touched (fetched, visited, queued-but-unfetched frontier).
+    let mut discovered: Vec<String> = Vec::new();
+    for p in &result.pages {
+        discovered.push(p.url.clone());
+        if let Some(ex) = p.extraction.as_ref() {
+            let page_base = Url::parse(&p.url).ok();
+            for link in &ex.content.links {
+                // Resolve relative/protocol-relative hrefs against the page URL
+                // so the same-origin filter and dedup see absolute URLs.
+                let abs = match &page_base {
+                    Some(b) => b.join(&link.href).ok(),
+                    None => Url::parse(&link.href).ok(),
+                };
+                if let Some(u) = abs {
+                    discovered.push(u.to_string());
+                }
+            }
+        }
+    }
+    discovered.extend(result.visited);
+    discovered.extend(result.remaining_frontier.into_iter().map(|(url, _)| url));
+
+    // Track normalized URLs we've already emitted, for cross-layer dedup.
+    let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
+    append_crawled(entries, &mut seen, discovered, &base_origin);
+}
+
+/// Normalize a raw URL string to the crawler's canonical form, returning `None`
+/// if it doesn't parse.
+fn normalize_url(raw: &str) -> Option<String> {
+    Url::parse(raw).ok().map(|u| crawler::normalize(&u))
+}
+
+/// Normalize a [`SitemapEntry`]'s URL for the dedup set.
+fn normalize_str(entry: &SitemapEntry) -> Option<String> {
+    normalize_url(&entry.url)
+}
+
+/// Append crawl-discovered URLs to `entries`, skipping any that are off-origin,
+/// unparseable, or already present (by normalized form).
+///
+/// Split out from [`discover_urls`] so the union/dedup/same-origin logic is
+/// unit-testable without touching the network. Mutates `entries` and `seen` in
+/// place; crawl URLs get empty metadata.
+fn append_crawled(
+    entries: &mut Vec<SitemapEntry>,
+    seen: &mut HashSet<String>,
+    discovered: impl IntoIterator<Item = String>,
+    base_origin: &str,
+) {
+    for raw in discovered {
+        let Ok(parsed) = Url::parse(&raw) else {
+            continue;
+        };
+        // Same-origin filter: drop anything whose origin differs from the seed.
+        if crawler::origin_key(&parsed) != base_origin {
+            continue;
+        }
+        let norm = crawler::normalize(&parsed);
+        if seen.insert(norm.clone()) {
+            entries.push(SitemapEntry {
+                url: norm,
+                last_modified: None,
+                priority: None,
+                change_freq: None,
+            });
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn entry(url: &str) -> SitemapEntry {
+        SitemapEntry {
+            url: url.to_string(),
+            last_modified: None,
+            priority: None,
+            change_freq: None,
+        }
+    }
+
+    fn origin_of(url: &str) -> String {
+        crawler::origin_key(&Url::parse(url).unwrap())
+    }
+
+    #[test]
+    fn append_adds_new_same_origin_urls() {
+        let mut entries = vec![entry("https://example.com/")];
+        let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec![
+                "https://example.com/about".to_string(),
+                "https://example.com/contact".to_string(),
+            ],
+            &origin_of("https://example.com"),
+        );
+
+        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
+        assert_eq!(
+            urls,
+            vec![
+                "https://example.com/",
+                "https://example.com/about",
+                "https://example.com/contact",
+            ]
+        );
+    }
+
+    #[test]
+    fn append_dedups_against_sitemap_and_self() {
+        let mut entries = vec![entry("https://example.com/about")];
+        let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec![
+                // Same as sitemap entry (trailing slash normalizes away).
+                "https://example.com/about/".to_string(),
+                // Fragment + duplicate -> only one new entry survives.
+                "https://example.com/new#frag".to_string(),
+                "https://example.com/new".to_string(),
+            ],
+            &origin_of("https://example.com"),
+        );
+
+        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
+        assert_eq!(
+            urls,
+            vec!["https://example.com/about", "https://example.com/new"]
+        );
+    }
+
+    #[test]
+    fn append_filters_off_origin() {
+        let mut entries = Vec::new();
+        let mut seen = HashSet::new();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec![
+                "https://example.com/keep".to_string(),
+                "https://evil.com/drop".to_string(),
+                "https://sub.example.com/drop".to_string(), // different origin
+                "ftp://example.com/drop".to_string(), // parses, but the scheme differs
+            ],
+            &origin_of("https://example.com"),
+        );
+
+        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
+        assert_eq!(urls, vec!["https://example.com/keep"]);
+    }
+
+    #[test]
+    fn append_treats_www_as_same_origin() {
+        // origin_key strips a leading `www.`, so www and apex collapse.
+        let mut entries = Vec::new();
+        let mut seen = HashSet::new();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec!["https://www.example.com/page".to_string()],
+            &origin_of("https://example.com"),
+        );
+
+        assert_eq!(entries.len(), 1);
+    }
+
+    #[test]
+    fn crawl_urls_carry_no_metadata() {
+        let mut entries = Vec::new();
+        let mut seen = HashSet::new();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec!["https://example.com/x".to_string()],
+            &origin_of("https://example.com"),
+        );
+
+        assert_eq!(entries.len(), 1);
+        assert!(entries[0].last_modified.is_none());
+        assert!(entries[0].priority.is_none());
+        assert!(entries[0].change_freq.is_none());
+    }
+
+    #[test]
+    fn map_options_defaults() {
+        let o = MapOptions::default();
+        assert_eq!(o.max_crawl_pages, 150);
+        assert_eq!(o.crawl_depth, 2);
+        assert_eq!(o.min_sitemap_urls, 200);
+        assert!(o.crawl_fallback);
+        assert!(o.max_urls.is_none());
+    }
+
+    #[test]
+    fn apply_cap_truncates_only_when_set() {
+        let mut entries = vec![
+            entry("https://example.com/a"),
+            entry("https://example.com/b"),
+            entry("https://example.com/c"),
+        ];
+
+        // `None` means uncapped.
+        apply_cap(&mut entries, None);
+        assert_eq!(entries.len(), 3);
+
+        // A cap keeps the leading entries (sitemap entries come first).
+        apply_cap(&mut entries, Some(2));
+        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
+        assert_eq!(urls, vec!["https://example.com/a", "https://example.com/b"]);
+
+        // A cap larger than the list is a no-op.
+        apply_cap(&mut entries, Some(10));
+        assert_eq!(entries.len(), 2);
+    }
+}
diff --git a/crates/webclaw-fetch/src/sitemap.rs b/crates/webclaw-fetch/src/sitemap.rs
index 374892d..10616ec
100644
--- a/crates/webclaw-fetch/src/sitemap.rs
+++ b/crates/webclaw-fetch/src/sitemap.rs
@@ -18,12 +18,20 @@ use crate::error::FetchError;
 
 /// Maximum depth when recursively fetching sitemap index files.
 /// Prevents infinite loops from circular sitemap references.
-const MAX_RECURSION_DEPTH: usize = 3;
+///
+/// Raised 3→5: large sites (gov.uk, news publishers) nest sitemap indexes
+/// more than three levels deep — a top index → per-section index →
+/// per-month index → urlset is already four hops. Three cut those off.
+const MAX_RECURSION_DEPTH: usize = 5;
 
 /// Common sitemap paths to try when robots.txt doesn't list any.
 const FALLBACK_SITEMAP_PATHS: &[&str] = &[
     "/sitemap.xml",
     "/sitemap_index.xml",
+    "/sitemap-index.xml",
+    "/sitemap1.xml",
+    "/sitemaps.xml",
+    "/sitemap/index.xml",
     "/wp-sitemap.xml",
     "/sitemap/sitemap-index.xml",
 ];
@@ -105,10 +113,12 @@ async fn fetch_sitemaps(
     for sitemap_url in urls {
         debug!(url = %sitemap_url, depth, "fetching sitemap");
 
-        let xml = match client.fetch(sitemap_url).await {
-            Ok(result) if result.status == 200 => result.html,
-            Ok(result) => {
-                debug!(url = %sitemap_url, status = result.status, "sitemap not found");
+        // Fetch raw bytes so gzipped sitemaps survive intact. `fetch` runs
+        // the body through `from_utf8_lossy`, which corrupts binary gzip.
+        let body = match client.fetch_raw(sitemap_url).await {
+            Ok((200, body)) => body,
+            Ok((status, _)) => {
+                debug!(url = %sitemap_url, status, "sitemap not found");
                 continue;
             }
             Err(e) => {
@@ -117,6 +127,14 @@ async fn fetch_sitemaps(
             }
         };
 
+        let xml = match decode_sitemap_body(&body) {
+            Some(xml) => xml,
+            None => {
+                debug!(url = %sitemap_url, "failed to decode sitemap body, skipping");
+                continue;
+            }
+        };
+
         match detect_sitemap_type(&xml) {
             SitemapType::UrlSet => {
                 let parsed = parse_urlset(&xml);
@@ -147,6 +165,57 @@ async fn fetch_sitemaps(
     }
 }
 
+/// Upper bound on the inflated size of a gzipped sitemap.
+///
+/// The sitemaps.org protocol caps a single sitemap file at 50 MiB
+/// (52,428,800 bytes) uncompressed, and the body comes from a server we do not
+/// control, so inflation is bounded to keep a hostile or broken `.gz` from
+/// exhausting memory.
+const MAX_DECOMPRESSED_SITEMAP_BYTES: u64 = 50 * 1024 * 1024;
+
+/// Decode a raw sitemap body into a UTF-8 XML string.
+///
+/// Sitemaps are commonly served gzipped (`.xml.gz`) with
+/// `Content-Type: application/gzip` and *no* `Content-Encoding`, so the HTTP
+/// layer never inflates them. We detect the gzip magic bytes (`0x1f 0x8b`)
+/// and gunzip in-process; otherwise the body is treated as plain XML.
+///
+/// Both paths decode the resulting bytes lossily, so a stray invalid UTF-8
+/// sequence costs a replacement character rather than the whole sitemap.
+///
+/// Returns `None` if a gzip stream fails to inflate or inflates past
+/// [`MAX_DECOMPRESSED_SITEMAP_BYTES`]. Plain (non-gzip) bodies always succeed.
+pub(crate) fn decode_sitemap_body(body: &[u8]) -> Option<String> {
+    use std::io::Read;
+
+    if !body.starts_with(&[0x1f, 0x8b]) {
+        return Some(String::from_utf8_lossy(body).into_owned());
+    }
+
+    // Read one byte past the cap so "exactly at the cap" and "over the cap" can
+    // be told apart without inflating the rest of the stream.
+    let mut inflated = Vec::new();
+    let outcome = flate2::read::GzDecoder::new(body)
+        .take(MAX_DECOMPRESSED_SITEMAP_BYTES + 1)
+        .read_to_end(&mut inflated);
+
+    match outcome {
+        // usize -> u64 is a widening cast on every supported target.
+        Ok(_) if inflated.len() as u64 > MAX_DECOMPRESSED_SITEMAP_BYTES => {
+            warn!(
+                limit = MAX_DECOMPRESSED_SITEMAP_BYTES,
+                "gunzipped sitemap body exceeds size cap"
+            );
+            None
+        }
+        Ok(_) => Some(String::from_utf8_lossy(&inflated).into_owned()),
+        Err(e) => {
+            warn!(error = %e, "failed to gunzip sitemap body");
+            None
+        }
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Pure parsing functions (no I/O, fully testable)
 // ---------------------------------------------------------------------------
@@ -669,5 +738,61 @@ mod tests {
         assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
         assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
         assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
+        // Additional fallback paths added for robustness.
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap-index.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap1.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemaps.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/index.xml"));
+    }
+
+    #[test]
+    fn decode_plain_xml_body() {
+        let xml = r#"<urlset><url><loc>https://example.com/</loc></url></urlset>"#;
+        let got = decode_sitemap_body(xml.as_bytes()).expect("plain body decodes");
+        assert_eq!(got, xml);
+    }
+
+    #[test]
+    fn decode_gzipped_body() {
+        use std::io::Write;
+
+        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://example.com/gz-page</loc></url>
+</urlset>"#;
+
+        // Gzip-compress the XML, then confirm decode_sitemap_body inflates it
+        // and the parser finds the URL.
+        let mut encoder = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
+        encoder.write_all(xml.as_bytes()).unwrap();
+        let gz = encoder.finish().unwrap();
+
+        assert_eq!(&gz[..2], &[0x1f, 0x8b], "gzip magic present");
+
+        let decoded = decode_sitemap_body(&gz).expect("gzip body inflates");
+        let entries = parse_urlset(&decoded);
+        assert_eq!(entries.len(), 1);
+        assert_eq!(entries[0].url, "https://example.com/gz-page");
+    }
+
+    #[test]
+    fn decode_corrupt_gzip_returns_none() {
+        // Starts with gzip magic but the rest is garbage -> inflate fails.
+        let bad = [0x1f, 0x8b, 0x08, 0x00, 0xde, 0xad, 0xbe, 0xef];
+        assert!(decode_sitemap_body(&bad).is_none());
+    }
+
+    #[test]
+    fn decode_gzipped_body_with_invalid_utf8_is_lossy() {
+        use std::io::Write;
+
+        // 0xff is never valid UTF-8; the gzip path must degrade like the plain one.
+        let raw = b"<loc>https://example.com/\xff</loc>";
+        let mut encoder = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
+        encoder.write_all(raw).unwrap();
+        let gz = encoder.finish().unwrap();
+
+        let decoded = decode_sitemap_body(&gz).expect("invalid UTF-8 is replaced, not fatal");
+        assert_eq!(decoded, String::from_utf8_lossy(raw));
     }
 }