feat: add fallback sitemap paths for broader discovery

Try /sitemap_index.xml, /wp-sitemap.xml, and /sitemap/sitemap-index.xml after the standard /sitemap.xml. WordPress 5.5+ and many CMS platforms use non-standard paths that were previously missed. Paths found via robots.txt are deduplicated to avoid double-fetching. Bump to 0.3.11.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-06-08 22:25:12 +02:00 · 2026-04-10 18:22:57 +02:00 · 2026-04-10 18:22:57 +02:00 · a4c351d5ae
commit a4c351d5ae
parent 25b6282d5f
4 changed files with 39 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,13 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).

+## [0.3.11] — 2026-04-10
+
+### Added
+- **Sitemap fallback paths**: discovery now tries `/sitemap_index.xml`, `/wp-sitemap.xml`, and `/sitemap/sitemap-index.xml` in addition to the standard `/sitemap.xml`. Sitemaps on WordPress sites and on other sites that use non-standard sitemap locations are now found without needing external search.
+
+---
+
 ## [0.3.10] — 2026-04-10

 ### Changed
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3102,7 +3102,7 @@ dependencies = [

 [[package]]
 name = "webclaw-cli"
-version = "0.3.9"
+version = "0.3.11"
 dependencies = [
 "clap",
 "dotenvy",
@ -3122,7 +3122,7 @@ dependencies = [

 [[package]]
 name = "webclaw-core"
-version = "0.3.9"
+version = "0.3.11"
 dependencies = [
 "ego-tree",
 "once_cell",
@ -3140,7 +3140,7 @@ dependencies = [

 [[package]]
 name = "webclaw-fetch"
-version = "0.3.9"
+version = "0.3.11"
 dependencies = [
 "bytes",
 "calamine",
@ -3162,7 +3162,7 @@ dependencies = [

 [[package]]
 name = "webclaw-llm"
-version = "0.3.9"
+version = "0.3.11"
 dependencies = [
 "async-trait",
 "reqwest",
@ -3175,7 +3175,7 @@ dependencies = [

 [[package]]
 name = "webclaw-mcp"
-version = "0.3.9"
+version = "0.3.11"
 dependencies = [
 "dirs",
 "dotenvy",
@ -3196,7 +3196,7 @@ dependencies = [

 [[package]]
 name = "webclaw-pdf"
-version = "0.3.9"
+version = "0.3.11"
 dependencies = [
 "pdf-extract",
 "thiserror",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]

 [workspace.package]
-version = "0.3.10"
+version = "0.3.11"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
--- a/crates/webclaw-fetch/src/sitemap.rs
+++ b/crates/webclaw-fetch/src/sitemap.rs
@ -2,7 +2,7 @@
 ///
 /// Discovers URLs from a site's sitemaps using a 3-step process:
 /// 1. Parse robots.txt for `Sitemap:` directives
-/// 2. Try /sitemap.xml as fallback
+/// 2. Try common sitemap paths as fallback
 /// 3. Recursively resolve sitemap index files
 ///
 /// All HTTP requests go through FetchClient to inherit TLS fingerprinting.
@ -20,6 +20,14 @@ use crate::error::FetchError;
 /// Prevents infinite loops from circular sitemap references.
 const MAX_RECURSION_DEPTH: usize = 3;

+/// Common sitemap paths that `discover` appends to its candidate list after any robots.txt `Sitemap:` entries; a path is skipped only when that exact URL is already in the list.
+const FALLBACK_SITEMAP_PATHS: &[&str] = &[
+    "/sitemap.xml",
+    "/sitemap_index.xml",
+    "/wp-sitemap.xml",
+    "/sitemap/sitemap-index.xml",
+];
+
 /// A single URL discovered from a sitemap.
 #[derive(Debug, Clone, Serialize)]
 pub struct SitemapEntry {
@ -33,7 +41,7 @@ pub struct SitemapEntry {
 ///
 /// Discovery order:
 /// 1. Fetch /robots.txt, parse `Sitemap:` directives
-/// 2. Fetch /sitemap.xml directly
+/// 2. Try common sitemap paths as fallback (skipping any already found)
 /// 3. If sitemap index, recursively fetch child sitemaps
 /// 4. Deduplicate by URL
 ///
@ -63,10 +71,12 @@ pub async fn discover(
        }
    }

-    // Step 2: Always try /sitemap.xml as well (may not be listed in robots.txt)
-    let default_sitemap = format!("{base}/sitemap.xml");
-    if !sitemap_urls.iter().any(|u| u == &default_sitemap) {
-        sitemap_urls.push(default_sitemap);
+    // Step 2: Try common sitemap paths (skipping any already discovered via robots.txt)
+    for path in FALLBACK_SITEMAP_PATHS {
+        let candidate = format!("{base}{path}");
+        if !sitemap_urls.iter().any(|u| u == &candidate) {
+            sitemap_urls.push(candidate);
+        }
    }

    // Step 3: Fetch and parse each sitemap, handling indexes recursively
@ -579,4 +589,13 @@ mod tests {
        assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown);
        assert_eq!(detect_sitemap_type(""), SitemapType::Unknown);
    }
+
+    #[test]
+    fn test_fallback_paths_constant() {
+        // Guards against accidentally dropping a known path; checks membership only, not order or the absence of extra entries.
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
+        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
+    }
 }