/// Sitemap parsing and URL discovery. /// /// Discovers URLs from a site's sitemaps using a 3-step process: /// 1. Parse robots.txt for `Sitemap:` directives /// 2. Try common sitemap paths as fallback /// 3. Recursively resolve sitemap index files /// /// All HTTP requests go through FetchClient to inherit TLS fingerprinting. use std::collections::HashSet; use quick_xml::Reader; use quick_xml::events::Event; use serde::Serialize; use tracing::{debug, warn}; use crate::client::FetchClient; use crate::error::FetchError; /// Maximum depth when recursively fetching sitemap index files. /// Prevents infinite loops from circular sitemap references. const MAX_RECURSION_DEPTH: usize = 3; /// Common sitemap paths to try when robots.txt doesn't list any. const FALLBACK_SITEMAP_PATHS: &[&str] = &[ "/sitemap.xml", "/sitemap_index.xml", "/wp-sitemap.xml", "/sitemap/sitemap-index.xml", ]; /// A single URL discovered from a sitemap. #[derive(Debug, Clone, Serialize)] pub struct SitemapEntry { pub url: String, pub last_modified: Option, pub priority: Option, pub change_freq: Option, } /// Discover all URLs from a site's sitemaps. /// /// Discovery order: /// 1. Fetch /robots.txt, parse `Sitemap:` directives /// 2. Try common sitemap paths as fallback (skipping any already found) /// 3. If sitemap index, recursively fetch child sitemaps /// 4. Deduplicate by URL /// /// Returns an empty vec (not an error) if no sitemaps are found. pub async fn discover( client: &FetchClient, base_url: &str, ) -> Result, FetchError> { let base = base_url.trim_end_matches('/'); let mut sitemap_urls: Vec = Vec::new(); // Step 1: Try robots.txt let robots_url = format!("{base}/robots.txt"); debug!(url = %robots_url, "fetching robots.txt"); match client.fetch(&robots_url).await { Ok(result) if result.status == 200 => { let found = parse_robots_txt(&result.html); debug!(count = found.len(), "sitemap URLs from robots.txt"); sitemap_urls.extend(found); } Ok(result) => { debug!(status = result.status, "robots.txt not found"); } Err(e) => { debug!(error = %e, "failed to fetch robots.txt"); } } // Step 2: Try common sitemap paths (skipping any already discovered via robots.txt) for path in FALLBACK_SITEMAP_PATHS { let candidate = format!("{base}{path}"); if !sitemap_urls.iter().any(|u| u == &candidate) { sitemap_urls.push(candidate); } } // Step 3: Fetch and parse each sitemap, handling indexes recursively let mut seen_urls: HashSet = HashSet::new(); let mut entries: Vec = Vec::new(); fetch_sitemaps(client, &sitemap_urls, &mut entries, &mut seen_urls, 0).await; debug!(total = entries.len(), "sitemap discovery complete"); Ok(entries) } /// Recursively fetch and parse sitemap URLs, handling both urlsets and indexes. async fn fetch_sitemaps( client: &FetchClient, urls: &[String], entries: &mut Vec, seen_urls: &mut HashSet, depth: usize, ) { if depth > MAX_RECURSION_DEPTH { warn!(depth, "sitemap recursion limit reached, stopping"); return; } for sitemap_url in urls { debug!(url = %sitemap_url, depth, "fetching sitemap"); let xml = match client.fetch(sitemap_url).await { Ok(result) if result.status == 200 => result.html, Ok(result) => { debug!(url = %sitemap_url, status = result.status, "sitemap not found"); continue; } Err(e) => { debug!(url = %sitemap_url, error = %e, "failed to fetch sitemap"); continue; } }; match detect_sitemap_type(&xml) { SitemapType::UrlSet => { let parsed = parse_urlset(&xml); for entry in parsed { if seen_urls.insert(entry.url.clone()) { entries.push(entry); } } } SitemapType::Index => { let child_urls = parse_sitemap_index(&xml); debug!(count = child_urls.len(), "found child sitemaps in index"); // Box the recursive call to avoid large future sizes Box::pin(fetch_sitemaps( client, &child_urls, entries, seen_urls, depth + 1, )) .await; } SitemapType::Unknown => { debug!(url = %sitemap_url, "unrecognized sitemap format, skipping"); } } } } // --------------------------------------------------------------------------- // Pure parsing functions (no I/O, fully testable) // --------------------------------------------------------------------------- /// Extract `Sitemap:` directive URLs from robots.txt content. /// /// Handles case-insensitive directive names, optional whitespace before /// the colon, and strips inline `# ...` comments. Rejects values without /// a URL scheme (`://`) so a malformed directive doesn't turn an empty /// or garbage string into a "sitemap URL". pub fn parse_robots_txt(text: &str) -> Vec { text.lines() .filter_map(|line| { // Strip inline `#...` comments (robots.txt convention). let line = match line.split_once('#') { Some((before, _)) => before, None => line, }; let trimmed = line.trim(); // Find the colon that terminates the directive name; reject // lines that don't have one. Anything between the start and // the colon that matches "sitemap" case-insensitively is a hit. let colon = trimmed.find(':')?; let (name, rest) = trimmed.split_at(colon); if !name.trim().eq_ignore_ascii_case("sitemap") { return None; } // Skip the colon itself, then trim. let url = rest[1..].trim(); if url.is_empty() || !url.contains("://") { return None; } Some(url.to_string()) }) .collect() } /// Parse a sitemap XML string. Handles both `` and ``. /// Returns entries from urlsets and recursion targets from indexes. pub fn parse_sitemap_xml(xml: &str) -> Vec { match detect_sitemap_type(xml) { SitemapType::UrlSet => parse_urlset(xml), SitemapType::Index => { // For the public parsing API, convert index entries into // SitemapEntry with just the URL. The async `discover` function // handles actual recursive fetching. parse_sitemap_index(xml) .into_iter() .map(|url| SitemapEntry { url, last_modified: None, priority: None, change_freq: None, }) .collect() } SitemapType::Unknown => Vec::new(), } } #[derive(Debug, PartialEq)] enum SitemapType { UrlSet, Index, Unknown, } /// Peek at the first element to determine if this is a urlset or sitemapindex. fn detect_sitemap_type(xml: &str) -> SitemapType { let mut reader = Reader::from_str(xml); let mut buf = Vec::new(); loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => { let name = e.local_name(); return match name.as_ref() { b"urlset" => SitemapType::UrlSet, b"sitemapindex" => SitemapType::Index, _ => continue, // skip processing instructions, comments }; } Ok(Event::Eof) => return SitemapType::Unknown, Err(_) => return SitemapType::Unknown, _ => continue, } } } /// Parse `` entries from a `` sitemap. fn parse_urlset(xml: &str) -> Vec { let mut reader = Reader::from_str(xml); let mut buf = Vec::new(); let mut entries = Vec::new(); // State for current element being parsed let mut in_url = false; let mut current_tag: Option = None; let mut loc: Option = None; let mut lastmod: Option = None; let mut priority: Option = None; let mut changefreq: Option = None; loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(ref e)) => { let name = e.local_name(); match name.as_ref() { b"url" => { in_url = true; loc = None; lastmod = None; priority = None; changefreq = None; } b"loc" if in_url => current_tag = Some(UrlTag::Loc), b"lastmod" if in_url => current_tag = Some(UrlTag::LastMod), b"priority" if in_url => current_tag = Some(UrlTag::Priority), b"changefreq" if in_url => current_tag = Some(UrlTag::ChangeFreq), _ => current_tag = None, } } Ok(Event::Text(ref e)) => { if let Some(ref tag) = current_tag && let Ok(text) = e.unescape() { let text = text.trim().to_string(); if !text.is_empty() { match tag { UrlTag::Loc => loc = Some(text), UrlTag::LastMod => lastmod = Some(text), UrlTag::Priority => priority = text.parse().ok(), UrlTag::ChangeFreq => changefreq = Some(text), } } } } Ok(Event::End(ref e)) => { let name = e.local_name(); if name.as_ref() == b"url" && in_url { if let Some(url) = loc.take() { entries.push(SitemapEntry { url, last_modified: lastmod.take(), priority: priority.take(), change_freq: changefreq.take(), }); } in_url = false; } current_tag = None; } Ok(Event::Eof) => break, Err(e) => { warn!(error = %e, "XML parse error in sitemap, returning partial results"); break; } _ => {} } buf.clear(); } entries } #[derive(Debug)] enum UrlTag { Loc, LastMod, Priority, ChangeFreq, } /// Parse `` entries from a ``, returning child sitemap URLs. fn parse_sitemap_index(xml: &str) -> Vec { let mut reader = Reader::from_str(xml); let mut buf = Vec::new(); let mut urls = Vec::new(); let mut in_sitemap = false; let mut in_loc = false; loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(ref e)) => { let name = e.local_name(); match name.as_ref() { b"sitemap" => in_sitemap = true, b"loc" if in_sitemap => in_loc = true, _ => {} } } Ok(Event::Text(ref e)) => { if in_loc && let Ok(text) = e.unescape() { let text = text.trim().to_string(); if !text.is_empty() { urls.push(text); } } } Ok(Event::End(ref e)) => { let name = e.local_name(); match name.as_ref() { b"sitemap" => { in_sitemap = false; in_loc = false; } b"loc" => in_loc = false, _ => {} } } Ok(Event::Eof) => break, Err(e) => { warn!(error = %e, "XML parse error in sitemap index, returning partial results"); break; } _ => {} } buf.clear(); } urls } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; #[test] fn robots_txt_basic() { let t = "User-agent: *\nSitemap: https://example.com/sitemap.xml\n"; assert_eq!( parse_robots_txt(t), vec!["https://example.com/sitemap.xml".to_string()] ); } #[test] fn robots_txt_case_insensitive() { let t = "SITEMAP: https://a.example.com/s.xml\nsitemap: https://b.example.com/s.xml\n"; let got = parse_robots_txt(t); assert_eq!(got.len(), 2); assert!(got.contains(&"https://a.example.com/s.xml".to_string())); assert!(got.contains(&"https://b.example.com/s.xml".to_string())); } #[test] fn robots_txt_tolerates_space_before_colon() { // Some malformed generators emit `Sitemap :` with a space. let t = "Sitemap : https://example.com/sitemap.xml\n"; assert_eq!( parse_robots_txt(t), vec!["https://example.com/sitemap.xml".to_string()] ); } #[test] fn robots_txt_strips_inline_comments() { let t = "Sitemap: https://example.com/s.xml # main sitemap\n"; assert_eq!( parse_robots_txt(t), vec!["https://example.com/s.xml".to_string()] ); } #[test] fn robots_txt_rejects_empty_value() { let t = "Sitemap:\nSitemap: \n"; assert!(parse_robots_txt(t).is_empty()); } #[test] fn robots_txt_rejects_non_url_value() { // "Sitemap: /relative/path" has no scheme; don't blindly accept. let t = "Sitemap: /sitemap.xml\nSitemap: junk text\n"; assert!(parse_robots_txt(t).is_empty()); } #[test] fn robots_txt_ignores_non_sitemap_directives() { let t = "User-agent: *\nDisallow: /admin\nAllow: /\n"; assert!(parse_robots_txt(t).is_empty()); } #[test] fn test_parse_urlset() { let xml = r#" https://example.com/ 2026-01-15 daily 1.0 https://example.com/about 2026-01-10 monthly 0.8 https://example.com/blog/post-1 "#; let entries = parse_urlset(xml); assert_eq!(entries.len(), 3); assert_eq!(entries[0].url, "https://example.com/"); assert_eq!(entries[0].last_modified.as_deref(), Some("2026-01-15")); assert_eq!(entries[0].change_freq.as_deref(), Some("daily")); assert_eq!(entries[0].priority, Some(1.0)); assert_eq!(entries[1].url, "https://example.com/about"); assert_eq!(entries[1].priority, Some(0.8)); assert_eq!(entries[2].url, "https://example.com/blog/post-1"); assert_eq!(entries[2].last_modified, None); assert_eq!(entries[2].priority, None); assert_eq!(entries[2].change_freq, None); } #[test] fn test_parse_sitemap_index() { let xml = r#" https://example.com/sitemap-posts.xml 2026-03-01 https://example.com/sitemap-pages.xml "#; let urls = parse_sitemap_index(xml); assert_eq!(urls.len(), 2); assert_eq!(urls[0], "https://example.com/sitemap-posts.xml"); assert_eq!(urls[1], "https://example.com/sitemap-pages.xml"); } #[test] fn test_parse_sitemap_xml_dispatches_urlset() { let xml = r#" https://example.com/page "#; let entries = parse_sitemap_xml(xml); assert_eq!(entries.len(), 1); assert_eq!(entries[0].url, "https://example.com/page"); } #[test] fn test_parse_sitemap_xml_dispatches_index() { let xml = r#" https://example.com/sitemap-1.xml "#; let entries = parse_sitemap_xml(xml); assert_eq!(entries.len(), 1); assert_eq!(entries[0].url, "https://example.com/sitemap-1.xml"); // Index entries have no metadata when parsed through the public API assert_eq!(entries[0].priority, None); } #[test] fn test_parse_robots_txt() { let robots = "User-agent: *\n\ Disallow: /admin/\n\ \n\ Sitemap: https://example.com/sitemap.xml\n\ sitemap: https://example.com/sitemap-news.xml\n\ SITEMAP: https://example.com/sitemap-images.xml\n\ \n\ User-agent: Googlebot\n\ Allow: /\n"; let urls = parse_robots_txt(robots); assert_eq!(urls.len(), 3); assert_eq!(urls[0], "https://example.com/sitemap.xml"); assert_eq!(urls[1], "https://example.com/sitemap-news.xml"); assert_eq!(urls[2], "https://example.com/sitemap-images.xml"); } #[test] fn test_parse_robots_txt_empty_value() { // "Sitemap:" with no URL should be skipped let robots = "Sitemap:\nSitemap: \nSitemap: https://example.com/s.xml\n"; let urls = parse_robots_txt(robots); assert_eq!(urls.len(), 1); assert_eq!(urls[0], "https://example.com/s.xml"); } #[test] fn test_deduplicate() { // parse_sitemap_xml deduplicates via the discover() path, but // we can verify that parsing the same URL twice produces entries // that the HashSet in discover() would collapse. let xml = r#" https://example.com/page https://example.com/page https://example.com/other "#; let entries = parse_urlset(xml); assert_eq!(entries.len(), 3, "parser returns all entries"); // Simulate the dedup that discover() does let mut seen = HashSet::new(); let deduped: Vec<_> = entries .into_iter() .filter(|e| seen.insert(e.url.clone())) .collect(); assert_eq!(deduped.len(), 2, "dedup collapses duplicates"); } #[test] fn test_empty_sitemap() { let xml = r#" "#; let entries = parse_urlset(xml); assert!(entries.is_empty()); } #[test] fn test_malformed_xml() { let xml = "this is not xml at all <><><"; let entries = parse_sitemap_xml(xml); assert!(entries.is_empty(), "malformed XML returns empty vec"); } #[test] fn test_malformed_xml_partial() { // Partial XML that starts valid but breaks mid-stream let xml = r#" https://example.com/good broken "#; let entries = parse_sitemap_xml(xml); // Should return at least the successfully parsed entry assert!(entries.len() >= 1); assert_eq!(entries[0].url, "https://example.com/good"); } #[test] fn test_missing_loc() { let xml = r#" 2026-01-01 0.5 https://example.com/valid "#; let entries = parse_urlset(xml); assert_eq!(entries.len(), 1, "entry without is skipped"); assert_eq!(entries[0].url, "https://example.com/valid"); } #[test] fn test_priority_parsing() { let xml = r#" https://example.com/high 1.0 https://example.com/mid 0.5 https://example.com/low 0.1 https://example.com/invalid not-a-number "#; let entries = parse_urlset(xml); assert_eq!(entries.len(), 4); assert_eq!(entries[0].priority, Some(1.0)); assert_eq!(entries[1].priority, Some(0.5)); assert_eq!(entries[2].priority, Some(0.1)); assert_eq!(entries[3].priority, None, "invalid priority parses as None"); } #[test] fn test_detect_sitemap_type() { let urlset = r#""#; assert_eq!(detect_sitemap_type(urlset), SitemapType::UrlSet); let index = r#""#; assert_eq!(detect_sitemap_type(index), SitemapType::Index); assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown); assert_eq!(detect_sitemap_type(""), SitemapType::Unknown); } #[test] fn test_fallback_paths_constant() { // Verify the constant has the expected paths assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap.xml")); assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml")); assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml")); assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml")); } }