From a4c351d5ae3e53767a6e701d4e7d5eb041181067 Mon Sep 17 00:00:00 2001 From: Valerio Date: Fri, 10 Apr 2026 18:22:57 +0200 Subject: [PATCH] feat: add fallback sitemap paths for broader discovery Try /sitemap_index.xml, /wp-sitemap.xml, and /sitemap/sitemap-index.xml after the standard /sitemap.xml. WordPress 5.5+ and many CMS platforms use non-standard paths that were previously missed. Paths found via robots.txt are deduplicated to avoid double-fetching. Bump to 0.3.11. Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 7 +++++++ Cargo.lock | 12 +++++------ Cargo.toml | 2 +- crates/webclaw-fetch/src/sitemap.rs | 31 +++++++++++++++++++++++------ 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ca4c66..cbe6897 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.3.11] — 2026-04-10 + +### Added +- **Sitemap fallback paths**: discovery now tries `/sitemap_index.xml`, `/wp-sitemap.xml`, and `/sitemap/sitemap-index.xml` in addition to the standard `/sitemap.xml`. Sites using WordPress or non-standard sitemap locations are now discovered without needing external search. 
+ +--- + ## [0.3.10] — 2026-04-10 ### Changed diff --git a/Cargo.lock b/Cargo.lock index 3407fac..0ac74d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3102,7 +3102,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.3.9" +version = "0.3.11" dependencies = [ "clap", "dotenvy", @@ -3122,7 +3122,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.3.9" +version = "0.3.11" dependencies = [ "ego-tree", "once_cell", @@ -3140,7 +3140,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.3.9" +version = "0.3.11" dependencies = [ "bytes", "calamine", @@ -3162,7 +3162,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.3.9" +version = "0.3.11" dependencies = [ "async-trait", "reqwest", @@ -3175,7 +3175,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.3.9" +version = "0.3.11" dependencies = [ "dirs", "dotenvy", @@ -3196,7 +3196,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.3.9" +version = "0.3.11" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index f7835b8..bc29fd7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.3.10" +version = "0.3.11" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-fetch/src/sitemap.rs b/crates/webclaw-fetch/src/sitemap.rs index a4cdf15..05ee394 100644 --- a/crates/webclaw-fetch/src/sitemap.rs +++ b/crates/webclaw-fetch/src/sitemap.rs @@ -2,7 +2,7 @@ /// /// Discovers URLs from a site's sitemaps using a 3-step process: /// 1. Parse robots.txt for `Sitemap:` directives -/// 2. Try /sitemap.xml as fallback +/// 2. Try common sitemap paths as fallback /// 3. Recursively resolve sitemap index files /// /// All HTTP requests go through FetchClient to inherit TLS fingerprinting. 
@@ -20,6 +20,14 @@ use crate::error::FetchError; /// Prevents infinite loops from circular sitemap references. const MAX_RECURSION_DEPTH: usize = 3; +/// Common sitemap paths to try in addition to any listed in robots.txt. +const FALLBACK_SITEMAP_PATHS: &[&str] = &[ + "/sitemap.xml", + "/sitemap_index.xml", + "/wp-sitemap.xml", + "/sitemap/sitemap-index.xml", +]; + /// A single URL discovered from a sitemap. #[derive(Debug, Clone, Serialize)] pub struct SitemapEntry { @@ -33,7 +41,7 @@ pub struct SitemapEntry { /// /// Discovery order: /// 1. Fetch /robots.txt, parse `Sitemap:` directives -/// 2. Fetch /sitemap.xml directly +/// 2. Try common sitemap paths as fallback (skipping any already found) /// 3. If sitemap index, recursively fetch child sitemaps /// 4. Deduplicate by URL /// @@ -63,10 +71,12 @@ pub async fn discover( } } - // Step 2: Always try /sitemap.xml as well (may not be listed in robots.txt) - let default_sitemap = format!("{base}/sitemap.xml"); - if !sitemap_urls.iter().any(|u| u == &default_sitemap) { - sitemap_urls.push(default_sitemap); + // Step 2: Try common sitemap paths (skipping any already discovered via robots.txt) + for path in FALLBACK_SITEMAP_PATHS { + let candidate = format!("{base}{path}"); + if !sitemap_urls.iter().any(|u| u == &candidate) { + sitemap_urls.push(candidate); + } } // Step 3: Fetch and parse each sitemap, handling indexes recursively @@ -579,4 +589,13 @@ mod tests { assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown); assert_eq!(detect_sitemap_type(""), SitemapType::Unknown); } + + #[test] + fn test_fallback_paths_constant() { + // Verify the constant has the expected paths + assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap.xml")); + assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml")); + assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml")); + assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml")); + } }