feat(map): layered URL discovery with crawl fallback

map falls back to a bounded same-origin crawl when a site has no sitemap
or a thin one, harvesting links from each fetched page (the rich source).
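Adds gzip (.xml.gz) sitemap support, deeper sitemap-index recursion, and more
fallback sitemap paths. Results are uncapped by default: --map-limit optionally
caps the number of URLs returned, --map-pages sets the page budget of the crawl
fallback, and --no-map-crawl disables the fallback entirely. Logs are routed to
stderr so --map -f json stays machine-parseable.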
This commit is contained in:
webclaw 2026-06-06 12:08:26 +02:00
parent 02302e7a1d
commit b7bd1155c6
10 changed files with 478 additions and 12 deletions

1
Cargo.lock generated
View file

@ -3263,6 +3263,7 @@ dependencies = [
"async-trait",
"bytes",
"calamine",
"flate2",
"http",
"quick-xml 0.37.5",
"rand 0.8.5",

View file

@ -162,6 +162,18 @@ pub struct Cli {
#[arg(long)]
pub map: bool,
/// Max pages for --map's crawl fallback when the sitemap is thin [default: 150]
#[arg(long)]
pub map_pages: Option<usize>,
/// Disable --map's crawl fallback (sitemap-only discovery)
#[arg(long)]
pub no_map_crawl: bool,
/// Cap the number of URLs --map returns (default: uncapped)
#[arg(long)]
pub map_limit: Option<usize>,
// -- LLM options --
/// Extract structured JSON using LLM (pass a JSON schema string or @file)
#[arg(long)]

View file

@ -35,7 +35,13 @@ fn init_logging(verbose: bool) {
EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
};
tracing_subscriber::fmt().with_env_filter(filter).init();
// Logs go to stderr, never stdout: stdout carries the actual result
// (markdown / JSON / URL list). A stray WARN on stdout corrupts
// machine-readable output — e.g. `--map --format json` piped to a parser.
tracing_subscriber::fmt()
.with_env_filter(filter)
.with_writer(std::io::stderr)
.init();
}
#[tokio::main]

View file

@ -205,12 +205,22 @@ pub async fn run_map(cli: &Cli) -> Result<(), String> {
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let entries = webclaw_fetch::sitemap::discover(&client, url)
.await
.map_err(|e| format!("sitemap discovery failed: {e}"))?;
// Layered discovery: sitemaps first, bounded crawl fallback when thin.
let mut opts = webclaw_fetch::MapOptions::default();
if let Some(pages) = cli.map_pages {
opts.max_crawl_pages = pages;
}
if cli.no_map_crawl {
opts.crawl_fallback = false;
}
if let Some(limit) = cli.map_limit {
opts.max_urls = Some(limit);
}
let entries = webclaw_fetch::discover_urls(&client, url, &opts).await;
if entries.is_empty() {
eprintln!("no sitemap URLs found for {url}");
eprintln!("no URLs found for {url}");
} else {
eprintln!("discovered {} URLs", entries.len());
}

View file

@ -29,6 +29,7 @@ reqwest = { version = "0.12", default-features = false, features = ["json", "rus
serde_json.workspace = true
calamine = "0.34"
zip = "2"
flate2 = "1"
[dev-dependencies]
tempfile = "3"

View file

@ -437,6 +437,27 @@ impl FetchClient {
.await
}
/// Fetch `url` and hand back the response body as untouched bytes.
///
/// [`fetch`](Self::fetch) decodes the body with `String::from_utf8_lossy`,
/// which destroys binary payloads. This variant skips that step so callers
/// can handle non-text bodies themselves. The motivating case is gzipped
/// sitemaps (`.xml.gz`): they are served with
/// `Content-Type: application/gzip` and *no* `Content-Encoding`, so wreq does
/// not inflate them and the raw gzip stream reaches the caller, who is
/// expected to spot the gzip magic (`0x1f 0x8b`) and gunzip before parsing.
///
/// The URL is first checked by `validate_public_http_url`; the request is
/// then issued against the validated form.
///
/// There is deliberately no retry wrapper: sitemap discovery already skips
/// URLs that fail.
///
/// # Returns
/// `(status, body)` — the HTTP status code and the raw body bytes.
///
/// # Errors
/// Propagates a [`FetchError`] from URL validation, from sending the
/// request, or from reading the response.
pub async fn fetch_raw(&self, url: &str) -> Result<(u16, bytes::Bytes), FetchError> {
    let validated = crate::url_security::validate_public_http_url(url).await?;
    let target = validated.as_str();

    let client = self.pick_client(target);
    let raw_response = client.get(target).send().await?;
    let response = Response::from_wreq(raw_response).await?;

    let status = response.status();
    let body = response.into_body();
    Ok((status, body))
}
/// Fetch a URL then extract structured content.
#[instrument(skip(self), fields(url = %url))]
pub async fn fetch_and_extract(

View file

@ -528,7 +528,7 @@ impl Crawler {
}
/// Canonical origin string for comparing same-origin: "scheme://host[:port]".
fn origin_key(url: &Url) -> String {
pub(crate) fn origin_key(url: &Url) -> String {
let port_suffix = match url.port() {
Some(p) => format!(":{p}"),
None => String::new(),
@ -563,7 +563,7 @@ fn root_domain(url: &Url) -> String {
/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
/// lowercase scheme + host. Preserves query params and path case.
fn normalize(url: &Url) -> String {
pub(crate) fn normalize(url: &Url) -> String {
let scheme = url.scheme();
let host = url.host_str().unwrap_or("").to_ascii_lowercase();
let port_suffix = match url.port() {

View file

@ -11,6 +11,7 @@ pub mod extractors;
pub mod fetcher;
pub mod linkedin;
pub mod locale;
pub mod map;
pub mod proxy;
pub mod reddit;
pub mod sitemap;
@ -24,6 +25,7 @@ pub use error::FetchError;
pub use fetcher::Fetcher;
pub use http::HeaderMap;
pub use locale::{accept_language_for_tld, accept_language_for_url};
pub use map::{MapOptions, discover_urls};
pub use proxy::{parse_proxy_file, parse_proxy_line};
pub use sitemap::SitemapEntry;
pub use webclaw_pdf::PdfMode;

View file

@ -0,0 +1,326 @@
//! Layered URL discovery for the `map` command.
//!
//! `sitemap::discover` only finds URLs a site explicitly advertises in its
//! `sitemap.xml`. Plenty of sites have no sitemap (news.ycombinator.com), a
//! stale one, or a thin one that lists a handful of section roots. For those,
//! a sitemap-only map returns almost nothing.
//!
//! This module adds a second layer: when the sitemap yields fewer than a
//! threshold of URLs, run a *bounded* same-origin crawl and harvest every URL
//! it surfaces — the links extracted from each fetched page, the fetched pages
//! themselves, the visited set, **and** the remaining frontier (links queued
//! but never fetched because the page cap was hit). The harvested links and
//! the frontier are the gold: a 150-page crawl of a link-dense site surfaces
//! several thousand URLs, turning a useless map into a real one.
//!
//! Strategy (layered, sitemap-first):
//! 1. Sitemaps via [`sitemap::discover`] — authoritative, carries metadata
//! (lastmod / priority / changefreq).
//! 2. If sitemaps are thin (`< min_sitemap_urls`) and the fallback is enabled,
//! a bounded crawl fills in the rest. Crawl-discovered URLs carry no
//! metadata (`None` everywhere) since they come from link harvesting, not a
//! sitemap.
//!
//! Sitemap entries always come first in the returned vec; crawl-discovered
//! URLs are appended, deduplicated against the sitemap set using the *same*
//! normalization the crawler uses ([`crawler::normalize`]) so map output stays
//! internally consistent.
use std::collections::HashSet;
use std::time::Duration;
use url::Url;
use crate::client::{FetchClient, FetchConfig};
use crate::crawler::{self, CrawlConfig, Crawler};
use crate::sitemap::{self, SitemapEntry};
/// Configuration for [`discover_urls`]: how aggressively the crawl fallback
/// runs and how many URLs are returned.
#[derive(Debug, Clone)]
pub struct MapOptions {
    /// Upper bound on the number of pages the fallback crawl fetches. The
    /// crawl reports far more URLs than it fetches (unfetched frontier), so a
    /// small budget still produces a large map while staying fast and polite.
    pub max_crawl_pages: usize,
    /// Link-following depth of the fallback crawl (1 = links off the seed
    /// only).
    pub crawl_depth: usize,
    /// Sitemap size threshold: the crawl fallback runs only when the sitemap
    /// produced fewer URLs than this. A sitemap with at least this many URLs
    /// skips the crawl entirely.
    pub min_sitemap_urls: usize,
    /// Enables the crawl fallback. With `false`, discovery is sitemap-only,
    /// like the old `discover`.
    pub crawl_fallback: bool,
    /// Optional limit on the number of URLs returned. `None` (the default)
    /// means uncapped: every discovered URL is returned (the crawl itself is
    /// still bounded by `max_crawl_pages`). `Some(n)` truncates the result to
    /// `n` entries.
    pub max_urls: Option<usize>,
}

impl Default for MapOptions {
    /// Defaults: 150 crawl pages, depth 2, crawl when the sitemap has fewer
    /// than 200 URLs, fallback enabled, no cap on returned URLs.
    fn default() -> Self {
        let max_crawl_pages = 150;
        let crawl_depth = 2;
        let min_sitemap_urls = 200;

        MapOptions {
            max_urls: None,
            crawl_fallback: true,
            min_sitemap_urls,
            crawl_depth,
            max_crawl_pages,
        }
    }
}
/// Discover URLs for a site with a layered strategy: sitemaps first, then a
/// bounded same-origin crawl when the sitemap is thin.
///
/// # Parameters
/// - `client`: used for sitemap discovery.
/// - `base_url`: the site to map; also the crawl seed.
/// - `opts`: crawl budget, thin-sitemap threshold, fallback switch, and the
///   optional result cap.
///
/// # Returns
/// Sitemap entries first (with whatever metadata the sitemap carried),
/// followed by crawl-discovered URLs with empty metadata, deduplicated by
/// normalized URL. When `opts.max_urls` is `Some(n)` the result holds at most
/// `n` entries **regardless of which layer produced them** — the cap is
/// applied on every path, including the sitemap-only ones.
///
/// Never errors — sitemap and crawl failures are swallowed and simply yield
/// fewer URLs (an empty vec in the worst case), matching `sitemap::discover`'s
/// "absence is not an error" contract.
pub async fn discover_urls(
    client: &FetchClient,
    base_url: &str,
    opts: &MapOptions,
) -> Vec<SitemapEntry> {
    // Layer 1: sitemaps. A discovery error is treated the same as "no sitemap".
    let mut entries = sitemap::discover(client, base_url)
        .await
        .unwrap_or_default();

    // Layer 2: bounded crawl fallback, only when the sitemap is thin.
    let sitemap_is_thin = entries.len() < opts.min_sitemap_urls;
    // Sitemap entries keep priority under truncation, so once they alone fill
    // the cap a crawl cannot contribute anything to the result — skip it.
    let cap_already_met = matches!(opts.max_urls, Some(cap) if entries.len() >= cap);
    if opts.crawl_fallback && sitemap_is_thin && !cap_already_met {
        extend_with_crawl(&mut entries, base_url, opts).await;
    }

    // Uncapped by default; an explicit limit applies no matter which layers
    // ran (sitemap entries were added first, so they survive truncation).
    if let Some(cap) = opts.max_urls {
        entries.truncate(cap);
    }
    entries
}

/// Run the bounded fallback crawl from `base_url` and append every new
/// same-origin URL it surfaces to `entries`.
///
/// Leaves `entries` untouched when `base_url` does not parse or the crawler
/// cannot be constructed.
async fn extend_with_crawl(entries: &mut Vec<SitemapEntry>, base_url: &str, opts: &MapOptions) {
    let Ok(base) = Url::parse(base_url) else {
        // Unparseable base URL — nothing sensible to crawl against.
        return;
    };
    let base_origin = crawler::origin_key(&base);

    // NOTE(review): the crawl uses `FetchConfig::default()`, not the
    // configuration behind the caller's `FetchClient` — confirm that
    // caller-supplied fetch settings are intentionally not applied here.
    let config = CrawlConfig {
        fetch: FetchConfig::default(),
        max_depth: opts.crawl_depth,
        max_pages: opts.max_crawl_pages,
        // Politeness + scope: same-origin only (crawler default), modest delay.
        delay: Duration::from_millis(50),
        ..CrawlConfig::default()
    };
    let Ok(site_crawler) = Crawler::new(base_url, config) else {
        return;
    };
    let result = site_crawler.crawl(base_url, None).await;

    // Richest source first: every link harvested from each fetched page. A
    // directory/index page holds hundreds of same-origin links, and this set
    // is NOT bound by the crawler's internal frontier cap. Then the URLs the
    // crawl itself touched (fetched, visited, queued-but-unfetched frontier).
    let mut discovered: Vec<String> = Vec::new();
    for page in &result.pages {
        discovered.push(page.url.clone());
        let Some(extraction) = page.extraction.as_ref() else {
            continue;
        };
        let page_base = Url::parse(&page.url).ok();
        for link in &extraction.content.links {
            // Resolve relative/protocol-relative hrefs against the page URL so
            // the same-origin filter and dedup see absolute URLs.
            let absolute = match &page_base {
                Some(b) => b.join(&link.href).ok(),
                None => Url::parse(&link.href).ok(),
            };
            if let Some(u) = absolute {
                discovered.push(u.to_string());
            }
        }
    }
    discovered.extend(result.visited);
    discovered.extend(result.remaining_frontier.into_iter().map(|(url, _)| url));

    // Cross-layer dedup: seed the seen-set with the normalized sitemap URLs.
    let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
    append_crawled(entries, &mut seen, discovered, &base_origin);
}
/// Parse `raw` and return its canonical form as produced by
/// [`crawler::normalize`]; yields `None` when `raw` is not a parseable URL.
fn normalize_url(raw: &str) -> Option<String> {
    let parsed = Url::parse(raw).ok()?;
    Some(crawler::normalize(&parsed))
}
/// Canonical (normalized) form of a [`SitemapEntry`]'s URL, used as its key in
/// the dedup set. `None` when the entry's URL does not parse.
fn normalize_str(entry: &SitemapEntry) -> Option<String> {
    let raw: &str = entry.url.as_str();
    normalize_url(raw)
}
/// Merge crawl-discovered URLs into `entries`.
///
/// A candidate is dropped when it does not parse, when its origin key differs
/// from `base_origin`, or when its normalized form is already in `seen`.
/// Survivors are pushed in input order as metadata-free [`SitemapEntry`]s whose
/// `url` is the normalized form, and that form is recorded in `seen`.
///
/// Kept separate from [`discover_urls`] so the union / dedup / same-origin
/// rules can be unit-tested without any network access.
fn append_crawled(
    entries: &mut Vec<SitemapEntry>,
    seen: &mut HashSet<String>,
    discovered: impl IntoIterator<Item = String>,
    base_origin: &str,
) {
    let fresh = discovered
        .into_iter()
        // Unparseable candidates are silently skipped.
        .filter_map(|raw| Url::parse(&raw).ok())
        // Same-origin filter: keep only URLs whose origin matches the seed.
        .filter(|candidate| crawler::origin_key(candidate) == base_origin)
        .map(|candidate| crawler::normalize(&candidate))
        // `insert` returns false for a form we have already emitted.
        .filter(|canonical| seen.insert(canonical.clone()))
        .map(|canonical| SitemapEntry {
            url: canonical,
            last_modified: None,
            priority: None,
            change_freq: None,
        });

    entries.extend(fresh);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Seed origin shared by every `append_crawled` test.
    const SEED: &str = "https://example.com";

    /// Build a metadata-free sitemap entry for `url`.
    fn entry(url: &str) -> SitemapEntry {
        SitemapEntry {
            url: url.to_string(),
            last_modified: None,
            priority: None,
            change_freq: None,
        }
    }

    /// Origin key of `url`, as the crawler computes it.
    fn origin_of(url: &str) -> String {
        let parsed = Url::parse(url).unwrap();
        crawler::origin_key(&parsed)
    }

    /// Start from `sitemap_urls`, feed `crawled` through `append_crawled`
    /// against the `SEED` origin, and return the resulting entries.
    fn run_append(sitemap_urls: &[&str], crawled: &[&str]) -> Vec<SitemapEntry> {
        let mut entries: Vec<SitemapEntry> = sitemap_urls.iter().map(|u| entry(u)).collect();
        let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
        let discovered: Vec<String> = crawled.iter().map(|u| u.to_string()).collect();
        append_crawled(&mut entries, &mut seen, discovered, &origin_of(SEED));
        entries
    }

    /// The URLs of `entries`, in order.
    fn urls(entries: &[SitemapEntry]) -> Vec<&str> {
        entries.iter().map(|e| e.url.as_str()).collect()
    }

    #[test]
    fn append_adds_new_same_origin_urls() {
        let entries = run_append(
            &["https://example.com/"],
            &["https://example.com/about", "https://example.com/contact"],
        );
        assert_eq!(
            urls(&entries),
            vec![
                "https://example.com/",
                "https://example.com/about",
                "https://example.com/contact",
            ]
        );
    }

    #[test]
    fn append_dedups_against_sitemap_and_self() {
        let entries = run_append(
            &["https://example.com/about"],
            &[
                // Same as the sitemap entry (trailing slash normalizes away).
                "https://example.com/about/",
                // Fragment + duplicate -> only one new entry survives.
                "https://example.com/new#frag",
                "https://example.com/new",
            ],
        );
        assert_eq!(
            urls(&entries),
            vec!["https://example.com/about", "https://example.com/new"]
        );
    }

    #[test]
    fn append_filters_off_origin() {
        let entries = run_append(
            &[],
            &[
                "https://example.com/keep",
                "https://evil.com/drop",
                // Subdomain: a different origin.
                "https://sub.example.com/drop",
                // Parses fine, but the scheme differs so the origin does not match.
                "ftp://example.com/drop",
            ],
        );
        assert_eq!(urls(&entries), vec!["https://example.com/keep"]);
    }

    #[test]
    fn append_treats_www_as_same_origin() {
        // The `www.` host is expected to share an origin key with the apex
        // host, so the URL must be kept.
        let entries = run_append(&[], &["https://www.example.com/page"]);
        assert_eq!(entries.len(), 1);
    }

    #[test]
    fn crawl_urls_carry_no_metadata() {
        let entries = run_append(&[], &["https://example.com/x"]);
        assert_eq!(entries.len(), 1);

        let only = &entries[0];
        assert!(only.last_modified.is_none());
        assert!(only.priority.is_none());
        assert!(only.change_freq.is_none());
    }

    #[test]
    fn map_options_defaults() {
        let MapOptions {
            max_crawl_pages,
            crawl_depth,
            min_sitemap_urls,
            crawl_fallback,
            ..
        } = MapOptions::default();

        assert_eq!(max_crawl_pages, 150);
        assert_eq!(crawl_depth, 2);
        assert_eq!(min_sitemap_urls, 200);
        assert!(crawl_fallback);
    }
}

View file

@ -18,12 +18,20 @@ use crate::error::FetchError;
/// Maximum depth when recursively fetching sitemap index files.
/// Prevents infinite loops from circular sitemap references.
const MAX_RECURSION_DEPTH: usize = 3;
///
/// Raised 3→5: large sites (gov.uk, news publishers) nest sitemap indexes
/// more than three levels deep — a top index → per-section index →
/// per-month index → urlset is already four hops. Three cut those off.
const MAX_RECURSION_DEPTH: usize = 5;
/// Common sitemap paths to try when robots.txt doesn't list any.
///
/// Every entry is an absolute path (leading `/`). NOTE(review): presumably
/// each one is joined onto the site origin and probed by sitemap discovery —
/// the consuming code is not visible here, confirm against the caller.
const FALLBACK_SITEMAP_PATHS: &[&str] = &[
"/sitemap.xml",
"/sitemap_index.xml",
"/sitemap-index.xml",
"/sitemap1.xml",
"/sitemaps.xml",
"/sitemap/index.xml",
"/wp-sitemap.xml",
"/sitemap/sitemap-index.xml",
];
@ -105,10 +113,12 @@ async fn fetch_sitemaps(
for sitemap_url in urls {
debug!(url = %sitemap_url, depth, "fetching sitemap");
let xml = match client.fetch(sitemap_url).await {
Ok(result) if result.status == 200 => result.html,
Ok(result) => {
debug!(url = %sitemap_url, status = result.status, "sitemap not found");
// Fetch raw bytes so gzipped sitemaps survive intact. `fetch` runs
// the body through `from_utf8_lossy`, which corrupts binary gzip.
let body = match client.fetch_raw(sitemap_url).await {
Ok((200, body)) => body,
Ok((status, _)) => {
debug!(url = %sitemap_url, status, "sitemap not found");
continue;
}
Err(e) => {
@ -117,6 +127,14 @@ async fn fetch_sitemaps(
}
};
let xml = match decode_sitemap_body(&body) {
Some(xml) => xml,
None => {
debug!(url = %sitemap_url, "failed to decode sitemap body, skipping");
continue;
}
};
match detect_sitemap_type(&xml) {
SitemapType::UrlSet => {
let parsed = parse_urlset(&xml);
@ -147,6 +165,33 @@ async fn fetch_sitemaps(
}
}
/// Decode a raw sitemap body into an XML string.
///
/// Sitemaps are commonly served gzipped (`.xml.gz`) with
/// `Content-Type: application/gzip` and *no* `Content-Encoding`, so the HTTP
/// layer never inflates them. A body starting with the gzip magic bytes
/// (`0x1f 0x8b`) is gunzipped in-process; anything else is treated as plain
/// XML.
///
/// Both paths decode the resulting bytes with lossy UTF-8, so a stray invalid
/// byte is handled the same way whether or not the sitemap was compressed
/// (it becomes U+FFFD instead of discarding the whole sitemap).
///
/// The body comes from a remote server, so inflation is bounded: a stream
/// that expands beyond the internal limit is rejected rather than buffered
/// without limit.
///
/// # Returns
/// - `Some(xml)` for a plain body (always) or a gzip body that inflates
///   within the size limit.
/// - `None` when a gzip stream fails to inflate or exceeds the size limit.
pub(crate) fn decode_sitemap_body(body: &[u8]) -> Option<String> {
    use std::io::Read;

    /// Cap on inflated size: twice the 50 MiB uncompressed maximum the
    /// sitemap protocol allows, leaving headroom for slightly oversized files
    /// while still stopping decompression bombs.
    const MAX_INFLATED_BYTES: u64 = 100 * 1024 * 1024;

    if !body.starts_with(&[0x1f, 0x8b]) {
        return Some(String::from_utf8_lossy(body).into_owned());
    }

    // Read one byte past the limit so "exactly at the limit" and "over the
    // limit" can be told apart.
    let mut limited = flate2::read::GzDecoder::new(body).take(MAX_INFLATED_BYTES + 1);
    let mut inflated = Vec::new();
    if let Err(e) = limited.read_to_end(&mut inflated) {
        warn!(error = %e, "failed to gunzip sitemap body");
        return None;
    }

    // `usize` -> `u64` is a widening conversion on supported targets.
    if inflated.len() as u64 > MAX_INFLATED_BYTES {
        warn!(
            limit = MAX_INFLATED_BYTES,
            "gunzipped sitemap body exceeds size limit, skipping"
        );
        return None;
    }

    Some(String::from_utf8_lossy(&inflated).into_owned())
}
// ---------------------------------------------------------------------------
// Pure parsing functions (no I/O, fully testable)
// ---------------------------------------------------------------------------
@ -669,5 +714,47 @@ mod tests {
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
// Paths added for robustness (item 3).
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap-index.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap1.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemaps.xml"));
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/index.xml"));
}
#[test]
fn decode_plain_xml_body() {
    // A body without the gzip magic must come back unchanged.
    let plain = r#"<?xml version="1.0"?><urlset></urlset>"#;
    let decoded = decode_sitemap_body(plain.as_bytes());
    assert_eq!(decoded.expect("plain body decodes"), plain);
}
#[test]
fn decode_gzipped_body() {
    use std::io::Write;

    let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/gz-page</loc></url>
</urlset>"#;

    // Compress the XML in memory to get a realistic `.xml.gz` payload.
    let gz = {
        let mut enc = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
        enc.write_all(xml.as_bytes()).unwrap();
        enc.finish().unwrap()
    };
    assert!(gz.starts_with(&[0x1f, 0x8b]), "gzip magic present");

    // The decoder must inflate it, and the parser must then find the URL.
    let inflated = decode_sitemap_body(&gz).expect("gzip body inflates");
    let parsed = parse_urlset(&inflated);
    let found: Vec<&str> = parsed.iter().map(|e| e.url.as_str()).collect();
    assert_eq!(found, vec!["https://example.com/gz-page"]);
}
#[test]
fn decode_corrupt_gzip_returns_none() {
    // Gzip magic followed by garbage: inflation must fail and yield `None`.
    let corrupt: [u8; 8] = [0x1f, 0x8b, 0x08, 0x00, 0xde, 0xad, 0xbe, 0xef];
    assert_eq!(decode_sitemap_body(&corrupt), None::<String>);
}
}