feat(map): layered URL discovery with crawl fallback

map falls back to a bounded same-origin crawl when a site has no sitemap or only a thin one, harvesting the links from each fetched page (the richest source). Adds gzip (.xml.gz) sitemap support, deeper sitemap-index recursion and more fallback paths, and uncapped-by-default results with an optional --map-limit cap; --map-pages bounds the fallback crawl's page budget and --no-map-crawl disables the fallback entirely. Crawler logs are routed to stderr so --map -f json stays machine-parseable.
2026-06-13 23:15:13 +02:00 · 2026-06-06 12:08:26 +02:00 · 2026-06-06 12:08:26 +02:00 · b7bd1155c6
commit b7bd1155c6
parent 02302e7a1d
10 changed files with 478 additions and 12 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3263,6 +3263,7 @@ dependencies = [
 "async-trait",
 "bytes",
 "calamine",
+ "flate2",
 "http",
 "quick-xml 0.37.5",
 "rand 0.8.5",
--- a/crates/webclaw-cli/src/cli.rs
+++ b/crates/webclaw-cli/src/cli.rs
@ -162,6 +162,18 @@ pub struct Cli {
    #[arg(long)]
    pub map: bool,

+    /// Max pages for --map's crawl fallback when the sitemap is thin [default: 150]
+    #[arg(long)]
+    pub map_pages: Option<usize>,
+
+    /// Disable --map's crawl fallback (sitemap-only discovery)
+    #[arg(long)]
+    pub no_map_crawl: bool,
+
+    /// Cap the number of URLs --map returns (default: uncapped)
+    #[arg(long)]
+    pub map_limit: Option<usize>,
+
    // -- LLM options --
    /// Extract structured JSON using LLM (pass a JSON schema string or @file)
    #[arg(long)]
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -35,7 +35,13 @@ fn init_logging(verbose: bool) {
        EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
    };

-    tracing_subscriber::fmt().with_env_filter(filter).init();
+    // Logs go to stderr, never stdout: stdout carries the actual result
+    // (markdown / JSON / URL list). A stray WARN on stdout corrupts
+    // machine-readable output — e.g. `--map --format json` piped to a parser.
+    tracing_subscriber::fmt()
+        .with_env_filter(filter)
+        .with_writer(std::io::stderr)
+        .init();
 }

 #[tokio::main]
--- a/crates/webclaw-cli/src/run.rs
+++ b/crates/webclaw-cli/src/run.rs
@ -205,12 +205,22 @@ pub async fn run_map(cli: &Cli) -> Result<(), String> {
    let client =
        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;

-    let entries = webclaw_fetch::sitemap::discover(&client, url)
-        .await
-        .map_err(|e| format!("sitemap discovery failed: {e}"))?;
+    // Layered discovery: sitemaps first, bounded crawl fallback when thin.
+    let mut opts = webclaw_fetch::MapOptions::default();
+    if let Some(pages) = cli.map_pages {
+        opts.max_crawl_pages = pages;
+    }
+    if cli.no_map_crawl {
+        opts.crawl_fallback = false;
+    }
+    if let Some(limit) = cli.map_limit {
+        opts.max_urls = Some(limit);
+    }
+
+    let entries = webclaw_fetch::discover_urls(&client, url, &opts).await;

    if entries.is_empty() {
-        eprintln!("no sitemap URLs found for {url}");
+        eprintln!("no URLs found for {url}");
    } else {
        eprintln!("discovered {} URLs", entries.len());
    }
--- a/crates/webclaw-fetch/Cargo.toml
+++ b/crates/webclaw-fetch/Cargo.toml
@ -29,6 +29,7 @@ reqwest = { version = "0.12", default-features = false, features = ["json", "rus
 serde_json.workspace = true
 calamine = "0.34"
 zip = "2"
+flate2 = "1"

 [dev-dependencies]
 tempfile = "3"
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -437,6 +437,27 @@ impl FetchClient {
            .await
    }

+    /// Fetch `url` and hand back the response body as untouched bytes.
+    ///
+    /// [`fetch`](Self::fetch) pushes the body through
+    /// `String::from_utf8_lossy`, which destroys binary payloads. Gzipped
+    /// sitemaps (`.xml.gz`) are exactly such a payload: they are served with
+    /// `Content-Type: application/gzip` and *no* `Content-Encoding`, so wreq
+    /// never inflates them and the raw gzip stream reaches us as-is. Callers
+    /// are expected to sniff the gzip magic (`0x1f 0x8b`) and gunzip before
+    /// parsing.
+    ///
+    /// The URL is validated as a public HTTP(S) target before any request is
+    /// made. There is deliberately no retry wrapper: sitemap discovery already
+    /// tolerates per-URL failures by skipping them.
+    ///
+    /// Returns `(status, body)`.
+    pub async fn fetch_raw(&self, url: &str) -> Result<(u16, bytes::Bytes), FetchError> {
+        let validated = crate::url_security::validate_public_http_url(url).await?;
+        let target = validated.as_str();
+        let raw = self.pick_client(target).get(target).send().await?;
+        let response = Response::from_wreq(raw).await?;
+        let status = response.status();
+        Ok((status, response.into_body()))
+    }
+
    /// Fetch a URL then extract structured content.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch_and_extract(
--- a/crates/webclaw-fetch/src/crawler.rs
+++ b/crates/webclaw-fetch/src/crawler.rs
@ -528,7 +528,7 @@ impl Crawler {
 }

 /// Canonical origin string for comparing same-origin: "scheme://host[:port]".
-fn origin_key(url: &Url) -> String {
+pub(crate) fn origin_key(url: &Url) -> String {
    let port_suffix = match url.port() {
        Some(p) => format!(":{p}"),
        None => String::new(),
@ -563,7 +563,7 @@ fn root_domain(url: &Url) -> String {

 /// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
 /// lowercase scheme + host. Preserves query params and path case.
-fn normalize(url: &Url) -> String {
+pub(crate) fn normalize(url: &Url) -> String {
    let scheme = url.scheme();
    let host = url.host_str().unwrap_or("").to_ascii_lowercase();
    let port_suffix = match url.port() {
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@ -11,6 +11,7 @@ pub mod extractors;
 pub mod fetcher;
 pub mod linkedin;
 pub mod locale;
+pub mod map;
 pub mod proxy;
 pub mod reddit;
 pub mod sitemap;
@ -24,6 +25,7 @@ pub use error::FetchError;
 pub use fetcher::Fetcher;
 pub use http::HeaderMap;
 pub use locale::{accept_language_for_tld, accept_language_for_url};
+pub use map::{MapOptions, discover_urls};
 pub use proxy::{parse_proxy_file, parse_proxy_line};
 pub use sitemap::SitemapEntry;
 pub use webclaw_pdf::PdfMode;
--- a/crates/webclaw-fetch/src/map.rs
+++ b/crates/webclaw-fetch/src/map.rs
@ -0,0 +1,326 @@
+//! Layered URL discovery for the `map` command.
+//!
+//! `sitemap::discover` only finds URLs a site explicitly advertises in its
+//! `sitemap.xml`. Plenty of sites have no sitemap (news.ycombinator.com), a
+//! stale one, or a thin one that lists a handful of section roots. For those,
+//! a sitemap-only map returns almost nothing.
+//!
+//! This module adds a second layer: when the sitemap yields fewer than a
+//! threshold of URLs, run a *bounded* same-origin crawl and harvest every URL
+//! it surfaces — the links extracted from each fetched page, the visited set,
+//! **and** the remaining frontier (links queued but never fetched because the
+//! page cap was hit). The harvested links and the frontier are the gold: a
+//! 150-page crawl of a link-dense site surfaces several thousand URLs it never
+//! fetched, turning a useless map into a real one.
+//!
+//! Strategy (layered, sitemap-first):
+//! 1. Sitemaps via [`sitemap::discover`] — authoritative, carries metadata
+//!    (lastmod / priority / changefreq).
+//! 2. If sitemaps are thin (`< min_sitemap_urls`) and the fallback is enabled,
+//!    a bounded crawl fills in the rest. Crawl-discovered URLs carry no
+//!    metadata (`None` everywhere) since they come from link harvesting, not a
+//!    sitemap.
+//!
+//! Sitemap entries always come first in the returned vec; crawl-discovered
+//! URLs are appended, deduplicated against the sitemap set using the *same*
+//! normalization the crawler uses ([`crawler::normalize`]) so map output stays
+//! internally consistent.
+
+use std::collections::HashSet;
+use std::time::Duration;
+
+use url::Url;
+
+use crate::client::{FetchClient, FetchConfig};
+use crate::crawler::{self, CrawlConfig, Crawler};
+use crate::sitemap::{self, SitemapEntry};
+
+/// Tuning knobs for [`discover_urls`].
+///
+/// Construct with [`MapOptions::default`] and override individual fields; the
+/// defaults are listed on each field below.
+#[derive(Debug, Clone)]
+pub struct MapOptions {
+    /// Hard cap on pages the fallback crawl will fetch. The crawl surfaces far
+    /// more URLs than this via the unfetched frontier, so a small number still
+    /// yields a large map while keeping the crawl fast and polite.
+    ///
+    /// Default: `150`.
+    pub max_crawl_pages: usize,
+    /// How deep the fallback crawl follows links (1 = links off the seed only).
+    ///
+    /// Default: `2`.
+    pub crawl_depth: usize,
+    /// Sitemap-URL count below which the crawl fallback kicks in. A site with a
+    /// rich sitemap (≥ this many URLs) skips the crawl entirely.
+    ///
+    /// Default: `200`.
+    pub min_sitemap_urls: usize,
+    /// Master switch for the crawl fallback. When `false`, behaves exactly like
+    /// the old sitemap-only `discover`.
+    ///
+    /// Default: `true`.
+    pub crawl_fallback: bool,
+    /// Optional cap on URLs returned. `None` (default) = uncapped: return every
+    /// URL discovered (the crawl is already bounded by `max_crawl_pages`, so the
+    /// uncapped set is the links harvested from the fetched pages). Set `Some(n)`
+    /// to truncate.
+    pub max_urls: Option<usize>,
+}
+
+/// Defaults: crawl fallback enabled, 150 pages at depth 2, triggered when the
+/// sitemap lists fewer than 200 URLs, with no cap on the returned URL count.
+impl Default for MapOptions {
+    fn default() -> Self {
+        Self {
+            max_crawl_pages: 150,
+            crawl_depth: 2,
+            min_sitemap_urls: 200,
+            crawl_fallback: true,
+            max_urls: None,
+        }
+    }
+}
+
+/// Discover URLs for a site using the layered strategy described in the module
+/// docs: sitemaps first, then a bounded crawl fallback when the sitemap is
+/// thin.
+///
+/// `opts.max_urls`, when set, caps the returned vec on *every* path — including
+/// sitemap-only results (fallback disabled, rich sitemap, or a crawl that could
+/// not be started). Sitemap entries always come first, so they are the ones
+/// kept when the cap truncates.
+///
+/// Never errors — sitemap and crawl failures are swallowed and simply yield
+/// fewer URLs (an empty vec in the worst case), matching `sitemap::discover`'s
+/// "absence is not an error" contract.
+pub async fn discover_urls(
+    client: &FetchClient,
+    base_url: &str,
+    opts: &MapOptions,
+) -> Vec<SitemapEntry> {
+    // Layer 1: sitemaps.
+    let mut entries = sitemap::discover(client, base_url)
+        .await
+        .unwrap_or_default();
+
+    // Layer 2: bounded crawl fallback, only when the sitemap is thin. Skipped
+    // when the sitemap alone already fills the cap: crawl URLs are appended
+    // after the sitemap entries, so truncation would discard all of them.
+    let sitemap_is_thin = entries.len() < opts.min_sitemap_urls;
+    let cap_already_met = opts.max_urls.is_some_and(|cap| entries.len() >= cap);
+    if opts.crawl_fallback && sitemap_is_thin && !cap_already_met {
+        extend_with_crawl(&mut entries, base_url, opts).await;
+    }
+
+    // Uncapped by default; only truncate if the caller set an explicit limit
+    // (sitemap entries added first keep priority). Applied here, after both
+    // layers, so no return path can bypass the limit.
+    if let Some(cap) = opts.max_urls {
+        entries.truncate(cap);
+    }
+    entries
+}
+
+/// Run the bounded same-origin crawl seeded at `base_url` and append every
+/// newly discovered same-origin URL to `entries`.
+///
+/// Leaves `entries` untouched when `base_url` does not parse or the crawler
+/// cannot be constructed.
+async fn extend_with_crawl(entries: &mut Vec<SitemapEntry>, base_url: &str, opts: &MapOptions) {
+    let Ok(base) = Url::parse(base_url) else {
+        // Unparseable base URL — nothing sensible to crawl against.
+        return;
+    };
+    let base_origin = crawler::origin_key(&base);
+
+    // NOTE(review): the crawl runs with `FetchConfig::default()`, not with the
+    // settings of the `FetchClient` used for the sitemap layer — confirm that
+    // caller-supplied fetch options are intentionally not applied here.
+    let config = CrawlConfig {
+        fetch: FetchConfig::default(),
+        max_depth: opts.crawl_depth,
+        max_pages: opts.max_crawl_pages,
+        // Politeness + scope: same-origin only (crawler default), modest delay.
+        delay: Duration::from_millis(50),
+        ..CrawlConfig::default()
+    };
+
+    let Ok(fallback) = Crawler::new(base_url, config) else {
+        return;
+    };
+    let result = fallback.crawl(base_url, None).await;
+
+    // Richest source first: every link harvested from each fetched page. A
+    // directory/index page holds hundreds of same-origin links, and this set is
+    // NOT bound by the crawler's internal frontier cap. Then the URLs the crawl
+    // itself touched (fetched, visited, queued-but-unfetched frontier).
+    let mut discovered: Vec<String> = Vec::new();
+    for page in &result.pages {
+        discovered.push(page.url.clone());
+        let Some(extraction) = page.extraction.as_ref() else {
+            continue;
+        };
+        let page_base = Url::parse(&page.url).ok();
+        for link in &extraction.content.links {
+            // Resolve relative/protocol-relative hrefs against the page URL so
+            // the same-origin filter and dedup see absolute URLs.
+            let absolute = match &page_base {
+                Some(b) => b.join(&link.href).ok(),
+                None => Url::parse(&link.href).ok(),
+            };
+            if let Some(u) = absolute {
+                discovered.push(u.to_string());
+            }
+        }
+    }
+    discovered.extend(result.visited);
+    discovered.extend(result.remaining_frontier.into_iter().map(|(url, _)| url));
+
+    // Normalized URLs already emitted by the sitemap layer, for cross-layer
+    // dedup.
+    let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
+    append_crawled(entries, &mut seen, discovered, &base_origin);
+}
+
+/// Parse `raw` and return it in the crawler's canonical form
+/// ([`crawler::normalize`]), or `None` when it is not a valid URL.
+fn normalize_url(raw: &str) -> Option<String> {
+    let parsed = Url::parse(raw).ok()?;
+    Some(crawler::normalize(&parsed))
+}
+
+/// Canonical dedup key for a [`SitemapEntry`]: its URL run through
+/// [`normalize_url`]. `None` when the entry's URL does not parse.
+fn normalize_str(entry: &SitemapEntry) -> Option<String> {
+    let raw = entry.url.as_str();
+    normalize_url(raw)
+}
+
+/// Merge crawl-discovered URLs into `entries`.
+///
+/// A candidate is dropped when it fails to parse, when its origin differs from
+/// `base_origin`, or when its normalized form is already in `seen`. Survivors
+/// are appended in discovery order, stored in normalized form, with every
+/// metadata field set to `None`. `seen` is updated as URLs are accepted.
+///
+/// Kept separate from [`discover_urls`] so the union/dedup/same-origin logic
+/// can be unit-tested without touching the network.
+fn append_crawled(
+    entries: &mut Vec<SitemapEntry>,
+    seen: &mut HashSet<String>,
+    discovered: impl IntoIterator<Item = String>,
+    base_origin: &str,
+) {
+    let fresh = discovered
+        .into_iter()
+        .filter_map(|raw| Url::parse(&raw).ok())
+        // Same-origin filter: keep only URLs whose origin matches the seed.
+        .filter(|candidate| crawler::origin_key(candidate) == base_origin)
+        .map(|candidate| crawler::normalize(&candidate))
+        // `insert` returns false for a key that is already present.
+        .filter(|canonical| seen.insert(canonical.clone()))
+        .map(|url| SitemapEntry {
+            url,
+            last_modified: None,
+            priority: None,
+            change_freq: None,
+        });
+    entries.extend(fresh);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build a metadata-free sitemap entry for `url`.
+    fn entry(url: &str) -> SitemapEntry {
+        SitemapEntry {
+            url: url.to_string(),
+            last_modified: None,
+            priority: None,
+            change_freq: None,
+        }
+    }
+
+    /// Origin key of `url`, as the crawler computes it.
+    fn origin_of(url: &str) -> String {
+        crawler::origin_key(&Url::parse(url).unwrap())
+    }
+
+    #[test]
+    fn append_adds_new_same_origin_urls() {
+        let mut entries = vec![entry("https://example.com/")];
+        let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec![
+                "https://example.com/about".to_string(),
+                "https://example.com/contact".to_string(),
+            ],
+            &origin_of("https://example.com"),
+        );
+
+        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
+        assert_eq!(
+            urls,
+            vec![
+                "https://example.com/",
+                "https://example.com/about",
+                "https://example.com/contact",
+            ]
+        );
+    }
+
+    #[test]
+    fn append_dedups_against_sitemap_and_self() {
+        let mut entries = vec![entry("https://example.com/about")];
+        let mut seen: HashSet<String> = entries.iter().filter_map(normalize_str).collect();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec![
+                // Same as sitemap entry (trailing slash normalizes away).
+                "https://example.com/about/".to_string(),
+                // Fragment + duplicate -> only one new entry survives.
+                "https://example.com/new#frag".to_string(),
+                "https://example.com/new".to_string(),
+            ],
+            &origin_of("https://example.com"),
+        );
+
+        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
+        assert_eq!(
+            urls,
+            vec!["https://example.com/about", "https://example.com/new"]
+        );
+    }
+
+    #[test]
+    fn append_filters_off_origin() {
+        let mut entries = Vec::new();
+        let mut seen = HashSet::new();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec![
+                "https://example.com/keep".to_string(),
+                // Different host.
+                "https://evil.com/drop".to_string(),
+                // Subdomain -> different origin.
+                "https://sub.example.com/drop".to_string(),
+                // Parses fine, but the scheme is part of the origin key, so a
+                // same-host ftp URL is still off-origin.
+                "ftp://example.com/drop".to_string(),
+            ],
+            &origin_of("https://example.com"),
+        );
+
+        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
+        assert_eq!(urls, vec!["https://example.com/keep"]);
+    }
+
+    #[test]
+    fn append_treats_www_as_same_origin() {
+        // origin_key strips a leading `www.`, so www and apex collapse.
+        let mut entries = Vec::new();
+        let mut seen = HashSet::new();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec!["https://www.example.com/page".to_string()],
+            &origin_of("https://example.com"),
+        );
+
+        assert_eq!(entries.len(), 1);
+    }
+
+    #[test]
+    fn crawl_urls_carry_no_metadata() {
+        let mut entries = Vec::new();
+        let mut seen = HashSet::new();
+
+        append_crawled(
+            &mut entries,
+            &mut seen,
+            vec!["https://example.com/x".to_string()],
+            &origin_of("https://example.com"),
+        );
+
+        assert_eq!(entries.len(), 1);
+        assert!(entries[0].last_modified.is_none());
+        assert!(entries[0].priority.is_none());
+        assert!(entries[0].change_freq.is_none());
+    }
+
+    #[test]
+    fn map_options_defaults() {
+        let o = MapOptions::default();
+        assert_eq!(o.max_crawl_pages, 150);
+        assert_eq!(o.crawl_depth, 2);
+        assert_eq!(o.min_sitemap_urls, 200);
+        assert!(o.crawl_fallback);
+        // "Uncapped by default" is part of the documented contract; pin it.
+        assert!(o.max_urls.is_none());
+    }
+}
--- a/crates/webclaw-fetch/src/sitemap.rs
+++ b/crates/webclaw-fetch/src/sitemap.rs
@ -18,12 +18,20 @@ use crate::error::FetchError;

 /// Maximum depth when recursively fetching sitemap index files.
 /// Prevents infinite loops from circular sitemap references.
-const MAX_RECURSION_DEPTH: usize = 3;
+///
+/// Raised 3→5: large sites (gov.uk, news publishers) nest sitemap indexes
+/// more than three levels deep — a top index → per-section index →
+/// per-month index → urlset is already four hops. Three cut those off.
+const MAX_RECURSION_DEPTH: usize = 5;

 /// Common sitemap paths to try when robots.txt doesn't list any.
 const FALLBACK_SITEMAP_PATHS: &[&str] = &[
    "/sitemap.xml",
    "/sitemap_index.xml",
+    "/sitemap-index.xml",
+    "/sitemap1.xml",
+    "/sitemaps.xml",
+    "/sitemap/index.xml",
    "/wp-sitemap.xml",
    "/sitemap/sitemap-index.xml",
 ];
@ -105,10 +113,12 @@ async fn fetch_sitemaps(
    for sitemap_url in urls {
        debug!(url = %sitemap_url, depth, "fetching sitemap");

-        let xml = match client.fetch(sitemap_url).await {
-            Ok(result) if result.status == 200 => result.html,
-            Ok(result) => {
-                debug!(url = %sitemap_url, status = result.status, "sitemap not found");
+        // Fetch raw bytes so gzipped sitemaps survive intact. `fetch` runs
+        // the body through `from_utf8_lossy`, which corrupts binary gzip.
+        let body = match client.fetch_raw(sitemap_url).await {
+            Ok((200, body)) => body,
+            Ok((status, _)) => {
+                debug!(url = %sitemap_url, status, "sitemap not found");
                continue;
            }
            Err(e) => {
@ -117,6 +127,14 @@ async fn fetch_sitemaps(
            }
        };

+        let xml = match decode_sitemap_body(&body) {
+            Some(xml) => xml,
+            None => {
+                debug!(url = %sitemap_url, "failed to decode sitemap body, skipping");
+                continue;
+            }
+        };
+
        match detect_sitemap_type(&xml) {
            SitemapType::UrlSet => {
                let parsed = parse_urlset(&xml);
@ -147,6 +165,33 @@ async fn fetch_sitemaps(
    }
 }

+/// Decode a raw sitemap body into a UTF-8 XML string.
+///
+/// Sitemaps are commonly served gzipped (`.xml.gz`) with
+/// `Content-Type: application/gzip` and *no* `Content-Encoding`, so the HTTP
+/// layer never inflates them. We detect the gzip magic bytes (`0x1f 0x8b`)
+/// and gunzip in-process; otherwise the body is treated as plain XML.
+///
+/// The body comes from a remote server, so inflation is bounded: a stream that
+/// expands past `MAX_INFLATED_BYTES` is rejected instead of being allowed to
+/// exhaust memory (decompression bomb).
+///
+/// Returns `None` if a gzip stream fails to inflate or exceeds the size
+/// bound. Text decoding is lossy on both paths — invalid UTF-8 becomes
+/// U+FFFD rather than an error — so gzipped and plain sitemaps with the same
+/// content decode identically.
+pub(crate) fn decode_sitemap_body(body: &[u8]) -> Option<String> {
+    /// Upper bound on the inflated size of one sitemap. The sitemap protocol
+    /// limits a file to 50 MiB uncompressed; this leaves generous headroom for
+    /// non-conforming sites while still stopping a bomb.
+    const MAX_INFLATED_BYTES: usize = 256 * 1024 * 1024;
+
+    if !body.starts_with(&[0x1f, 0x8b]) {
+        return Some(String::from_utf8_lossy(body).into_owned());
+    }
+
+    use std::io::Read;
+
+    // Read one byte past the limit so "exactly at the limit" and "over the
+    // limit" can be told apart.
+    let mut limited = flate2::read::GzDecoder::new(body).take(MAX_INFLATED_BYTES as u64 + 1);
+    let mut inflated = Vec::new();
+    if let Err(e) = limited.read_to_end(&mut inflated) {
+        warn!(error = %e, "failed to gunzip sitemap body");
+        return None;
+    }
+    if inflated.len() > MAX_INFLATED_BYTES {
+        warn!(
+            limit = MAX_INFLATED_BYTES,
+            "gunzipped sitemap exceeds size limit, skipping"
+        );
+        return None;
+    }
+
+    // Valid UTF-8 reuses the buffer without copying; anything else falls back
+    // to the same lossy decode the plain-XML path uses.
+    let xml = String::from_utf8(inflated)
+        .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned());
+    Some(xml)
+}
+
 // ---------------------------------------------------------------------------
 // Pure parsing functions (no I/O, fully testable)
 // ---------------------------------------------------------------------------
@ -669,5 +714,47 @@ mod tests {
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
+        // Paths added for robustness (item 3).
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap-index.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap1.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemaps.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/index.xml"));
+    }
+
+    #[test]
+    fn decode_plain_xml_body() {
+        // A body without the gzip magic must come back unchanged.
+        let plain = r#"<?xml version="1.0"?><urlset></urlset>"#;
+        let decoded = decode_sitemap_body(plain.as_bytes());
+        assert_eq!(decoded.expect("plain body decodes"), plain);
+    }
+
+    #[test]
+    fn decode_gzipped_body() {
+        use std::io::Write;
+
+        let xml = r#"<?xml version="1.0"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://example.com/gz-page</loc></url>
+</urlset>"#;
+
+        // Round trip: compress the XML, decode it again, then make sure the
+        // parser still sees the single URL.
+        let level = flate2::Compression::default();
+        let mut gz_writer = flate2::write::GzEncoder::new(Vec::new(), level);
+        gz_writer.write_all(xml.as_bytes()).unwrap();
+        let compressed = gz_writer.finish().unwrap();
+
+        assert_eq!(&compressed[..2], &[0x1f, 0x8b], "gzip magic present");
+
+        let inflated = decode_sitemap_body(&compressed).expect("gzip body inflates");
+        let parsed = parse_urlset(&inflated);
+        assert_eq!(parsed.len(), 1);
+        assert_eq!(parsed[0].url, "https://example.com/gz-page");
+    }
+
+    #[test]
+    fn decode_corrupt_gzip_returns_none() {
+        // Gzip magic followed by garbage: inflating must fail, not panic.
+        let corrupt: &[u8] = &[0x1f, 0x8b, 0x08, 0x00, 0xde, 0xad, 0xbe, 0xef];
+        assert_eq!(decode_sitemap_body(corrupt), None);
     }
 }