From a4c351d5ae3e53767a6e701d4e7d5eb041181067 Mon Sep 17 00:00:00 2001 From: Valerio Date: Fri, 10 Apr 2026 18:22:57 +0200 Subject: [PATCH] feat: add fallback sitemap paths for broader discovery Try /sitemap_index.xml, /wp-sitemap.xml, and /sitemap/sitemap-index.xml after the standard /sitemap.xml. WordPress 5.5+ and many CMS platforms use non-standard paths that were previously missed. Paths found via robots.txt are deduplicated to avoid double-fetching. Bump to 0.3.11. Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 7 +++++++ Cargo.lock | 12 +++++------ Cargo.toml | 2 +- crates/webclaw-fetch/src/sitemap.rs | 31 +++++++++++++++++++++++------ 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ca4c66..cbe6897 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.3.11] — 2026-04-10 + +### Added +- **Sitemap fallback paths**: discovery now tries `/sitemap_index.xml`, `/wp-sitemap.xml`, and `/sitemap/sitemap-index.xml` in addition to the standard `/sitemap.xml`. Sites using WordPress or non-standard sitemap locations are now discovered without needing external search. 
+ +--- + ## [0.3.10] — 2026-04-10 ### Changed diff --git a/Cargo.lock b/Cargo.lock index 3407fac..0ac74d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3102,7 +3102,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.3.9" +version = "0.3.11" dependencies = [ "clap", "dotenvy", @@ -3122,7 +3122,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.3.9" +version = "0.3.11" dependencies = [ "ego-tree", "once_cell", @@ -3140,7 +3140,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.3.9" +version = "0.3.11" dependencies = [ "bytes", "calamine", @@ -3162,7 +3162,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.3.9" +version = "0.3.11" dependencies = [ "async-trait", "reqwest", @@ -3175,7 +3175,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.3.9" +version = "0.3.11" dependencies = [ "dirs", "dotenvy", @@ -3196,7 +3196,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.3.9" +version = "0.3.11" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index f7835b8..bc29fd7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.3.10" +version = "0.3.11" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-fetch/src/sitemap.rs b/crates/webclaw-fetch/src/sitemap.rs index a4cdf15..05ee394 100644 --- a/crates/webclaw-fetch/src/sitemap.rs +++ b/crates/webclaw-fetch/src/sitemap.rs @@ -2,7 +2,7 @@ /// /// Discovers URLs from a site's sitemaps using a 3-step process: /// 1. Parse robots.txt for `Sitemap:` directives -/// 2. Try /sitemap.xml as fallback +/// 2. Try common sitemap paths as fallback /// 3. Recursively resolve sitemap index files /// /// All HTTP requests go through FetchClient to inherit TLS fingerprinting. 
@@ -20,6 +20,14 @@ use crate::error::FetchError; /// Prevents infinite loops from circular sitemap references. const MAX_RECURSION_DEPTH: usize = 3; +/// Common sitemap paths to try in addition to any listed in robots.txt. +const FALLBACK_SITEMAP_PATHS: &[&str] = &[ + "/sitemap.xml", + "/sitemap_index.xml", + "/wp-sitemap.xml", + "/sitemap/sitemap-index.xml", +]; + /// A single URL discovered from a sitemap. #[derive(Debug, Clone, Serialize)] pub struct SitemapEntry { @@ -33,7 +41,7 @@ pub struct SitemapEntry { /// /// Discovery order: /// 1. Fetch /robots.txt, parse `Sitemap:` directives -/// 2. Fetch /sitemap.xml directly +/// 2. Try common sitemap paths as fallback (skipping any already found) /// 3. If sitemap index, recursively fetch child sitemaps /// 4. Deduplicate by URL /// @@ -63,10 +71,12 @@ pub async fn discover( } } - // Step 2: Always try /sitemap.xml as well (may not be listed in robots.txt) - let default_sitemap = format!("{base}/sitemap.xml"); - if !sitemap_urls.iter().any(|u| u == &default_sitemap) { - sitemap_urls.push(default_sitemap); + // Step 2: Try common sitemap paths (skipping any already discovered via robots.txt) + for path in FALLBACK_SITEMAP_PATHS { + let candidate = format!("{base}{path}"); + if !sitemap_urls.iter().any(|u| u == &candidate) { + sitemap_urls.push(candidate); + } } // Step 3: Fetch and parse each sitemap, handling indexes recursively @@ -579,4 +589,13 @@ mod tests { assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown); assert_eq!(detect_sitemap_type(""), SitemapType::Unknown); } + + #[test] + fn test_fallback_paths_constant() { + // Verify the constant has the expected paths + assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap.xml")); + assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml")); + assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml")); + assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml")); + } }