mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: add fallback sitemap paths for broader discovery
Try /sitemap_index.xml, /wp-sitemap.xml, and /sitemap/sitemap-index.xml after the standard /sitemap.xml. WordPress 5.5+ and many CMS platforms use non-standard paths that were previously missed. Paths found via robots.txt are deduplicated to avoid double-fetching. Bump to 0.3.11. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
25b6282d5f
commit
a4c351d5ae
4 changed files with 39 additions and 13 deletions
|
|
@ -3,6 +3,13 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.3.11] — 2026-04-10
|
||||
|
||||
### Added
|
||||
- **Sitemap fallback paths**: discovery now tries `/sitemap_index.xml`, `/wp-sitemap.xml`, and `/sitemap/sitemap-index.xml` in addition to the standard `/sitemap.xml`. Sites using WordPress or non-standard sitemap locations are now discovered without needing external search.
|
||||
|
||||
---
|
||||
|
||||
## [0.3.10] — 2026-04-10
|
||||
|
||||
### Changed
|
||||
|
|
|
|||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3102,7 +3102,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.3.9"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3122,7 +3122,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.3.9"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3140,7 +3140,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.3.9"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"calamine",
|
||||
|
|
@ -3162,7 +3162,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.3.9"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3175,7 +3175,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.3.9"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3196,7 +3196,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.3.9"
|
||||
version = "0.3.11"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.3.10"
|
||||
version = "0.3.11"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
///
|
||||
/// Discovers URLs from a site's sitemaps using a 3-step process:
|
||||
/// 1. Parse robots.txt for `Sitemap:` directives
|
||||
/// 2. Try /sitemap.xml as fallback
|
||||
/// 2. Try common sitemap paths as fallback
|
||||
/// 3. Recursively resolve sitemap index files
|
||||
///
|
||||
/// All HTTP requests go through FetchClient to inherit TLS fingerprinting.
|
||||
|
|
@ -20,6 +20,14 @@ use crate::error::FetchError;
|
|||
/// Prevents infinite loops from circular sitemap references.
|
||||
const MAX_RECURSION_DEPTH: usize = 3;
|
||||
|
||||
/// Well-known sitemap paths appended to the base URL and tried in addition to any `Sitemap:` URLs from robots.txt; candidates already in the list are skipped.
|
||||
const FALLBACK_SITEMAP_PATHS: &[&str] = &[
|
||||
"/sitemap.xml",
|
||||
"/sitemap_index.xml",
|
||||
"/wp-sitemap.xml",
|
||||
"/sitemap/sitemap-index.xml",
|
||||
];
|
||||
|
||||
/// A single URL discovered from a sitemap.
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct SitemapEntry {
|
||||
|
|
@ -33,7 +41,7 @@ pub struct SitemapEntry {
|
|||
///
|
||||
/// Discovery order:
|
||||
/// 1. Fetch /robots.txt, parse `Sitemap:` directives
|
||||
/// 2. Fetch /sitemap.xml directly
|
||||
/// 2. Try common sitemap paths as fallback (skipping any already found)
|
||||
/// 3. If sitemap index, recursively fetch child sitemaps
|
||||
/// 4. Deduplicate by URL
|
||||
///
|
||||
|
|
@ -63,10 +71,12 @@ pub async fn discover(
|
|||
}
|
||||
}
|
||||
|
||||
// Step 2: Always try /sitemap.xml as well (may not be listed in robots.txt)
|
||||
let default_sitemap = format!("{base}/sitemap.xml");
|
||||
if !sitemap_urls.iter().any(|u| u == &default_sitemap) {
|
||||
sitemap_urls.push(default_sitemap);
|
||||
// Step 2: Try common sitemap paths (skipping any already discovered via robots.txt)
|
||||
for path in FALLBACK_SITEMAP_PATHS {
|
||||
let candidate = format!("{base}{path}");
|
||||
if !sitemap_urls.iter().any(|u| u == &candidate) {
|
||||
sitemap_urls.push(candidate);
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3: Fetch and parse each sitemap, handling indexes recursively
|
||||
|
|
@ -579,4 +589,13 @@ mod tests {
|
|||
assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown);
|
||||
assert_eq!(detect_sitemap_type(""), SitemapType::Unknown);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fallback_paths_constant() {
|
||||
// Guard against accidental removal: every expected fallback path must be present in the constant (order is not checked).
|
||||
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap.xml"));
|
||||
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
|
||||
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
|
||||
assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue