feat(map): layered URL discovery with crawl fallback

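--map now falls back to a bounded same-origin crawl when a site has no sitemap or only a thin one, harvesting links from each fetched page (the rich source). Also adds gzip (.xml.gz) sitemap support, deeper sitemap-index recursion plus more fallback paths, and uncapped-by-default results with an optional --map-limit cap on the number of URLs returned; --map-pages bounds how many pages the crawl fallback fetches, and --no-map-crawl disables the fallback entirely (sitemap-only discovery). All logs, including the crawler's, are now routed to stderr so --map -f json stays machine-parseable on stdout.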
2026-06-22 02:38:06 +02:00 · 2026-06-06 12:08:26 +02:00 · 2026-06-06 12:08:26 +02:00 · b7bd1155c6
commit b7bd1155c6
parent 02302e7a1d
10 changed files with 478 additions and 12 deletions
--- a/crates/webclaw-cli/src/cli.rs
+++ b/crates/webclaw-cli/src/cli.rs
@ -162,6 +162,18 @@ pub struct Cli {
    #[arg(long)]
    pub map: bool,

+    /// Max pages for --map's crawl fallback when the sitemap is thin [default: 150]
+    #[arg(long)]
+    pub map_pages: Option<usize>,
+
+    /// Disable --map's crawl fallback (sitemap-only discovery)
+    #[arg(long)]
+    pub no_map_crawl: bool,
+
+    /// Cap the number of URLs --map returns (default: uncapped)
+    #[arg(long)]
+    pub map_limit: Option<usize>,
+
    // -- LLM options --
    /// Extract structured JSON using LLM (pass a JSON schema string or @file)
    #[arg(long)]
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -35,7 +35,13 @@ fn init_logging(verbose: bool) {
        EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
    };

-    tracing_subscriber::fmt().with_env_filter(filter).init();
+    // Logs go to stderr, never stdout: stdout carries the actual result
+    // (markdown / JSON / URL list). A stray WARN on stdout corrupts
+    // machine-readable output — e.g. `--map --format json` piped to a parser.
+    tracing_subscriber::fmt()
+        .with_env_filter(filter)
+        .with_writer(std::io::stderr)
+        .init();
 }

 #[tokio::main]
--- a/crates/webclaw-cli/src/run.rs
+++ b/crates/webclaw-cli/src/run.rs
@ -205,12 +205,22 @@ pub async fn run_map(cli: &Cli) -> Result<(), String> {
    let client =
        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;

-    let entries = webclaw_fetch::sitemap::discover(&client, url)
-        .await
-        .map_err(|e| format!("sitemap discovery failed: {e}"))?;
+    // Layered discovery: sitemaps first, bounded crawl fallback when thin.
+    let mut opts = webclaw_fetch::MapOptions::default();
+    if let Some(pages) = cli.map_pages {
+        opts.max_crawl_pages = pages;
+    }
+    if cli.no_map_crawl {
+        opts.crawl_fallback = false;
+    }
+    if let Some(limit) = cli.map_limit {
+        opts.max_urls = Some(limit);
+    }
+
+    let entries = webclaw_fetch::discover_urls(&client, url, &opts).await;

    if entries.is_empty() {
-        eprintln!("no sitemap URLs found for {url}");
+        eprintln!("no URLs found for {url}");
    } else {
        eprintln!("discovered {} URLs", entries.len());
    }