mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run
Three P3 items from the 2026-04-16 audit. Bump to 0.3.17. webclaw-fetch/sitemap.rs: parse_robots_txt used trimmed[..8] slice plus eq_ignore_ascii_case for the directive test. That was fragile: "Sitemap :" (space before colon) fell through silently, inline "# ..." comments leaked into the URL, and a line with no URL at all returned an empty string. Rewritten to split on the first colon, match any-case "sitemap" as the directive name, strip comments, and require `://` in the value. +7 unit tests cover case variants, space-before-colon, comments, empty values, non-URL values, and non-sitemap directives. webclaw-fetch/crawler.rs: is_cancelled uses Ordering::Acquire instead of Relaxed. Behaviourally equivalent on current hardware for single-word atomic loads, but the explicit ordering documents intent for readers + compilers. webclaw-mcp/server.rs: add lazy OnceLock cache for the Firefox FetchClient. Tool calls that repeatedly request the firefox profile without cookies used to build a fresh reqwest pool + TLS stack per call. Chrome (default) already used the long-lived field; Random is per-call by design; cookie-bearing requests still build ad-hoc since the cookie header is part of the client shape. Tests: 85 webclaw-fetch (was 78, +7 new sitemap), 272 webclaw-core, 43 webclaw-llm, 11 CLI — all green. Clippy clean across workspace. Refs: docs/AUDIT-2026-04-16.md P3 section Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
673 lines
22 KiB
Rust
673 lines
22 KiB
Rust
//! Sitemap parsing and URL discovery.
//!
//! Discovers URLs from a site's sitemaps using a 3-step process:
//! 1. Parse robots.txt for `Sitemap:` directives
//! 2. Try common sitemap paths as fallback
//! 3. Recursively resolve sitemap index files
//!
//! All HTTP requests go through FetchClient to inherit TLS fingerprinting.
|
|
use std::collections::HashSet;
|
|
|
|
use quick_xml::Reader;
|
|
use quick_xml::events::Event;
|
|
use serde::Serialize;
|
|
use tracing::{debug, warn};
|
|
|
|
use crate::client::FetchClient;
|
|
use crate::error::FetchError;
|
|
|
|
/// Deepest nesting level of sitemap index files that is still fetched.
/// The cap guarantees termination when sitemap indexes reference each other
/// in a cycle.
const MAX_RECURSION_DEPTH: usize = 3;
|
|
|
|
/// Well-known sitemap locations, relative to the site root, that are probed
/// in addition to whatever robots.txt advertises.
const FALLBACK_SITEMAP_PATHS: &[&str] = &[
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/wp-sitemap.xml",
    "/sitemap/sitemap-index.xml",
];
|
|
|
|
/// A single URL discovered from a sitemap.
|
|
#[derive(Debug, Clone, Serialize)]
|
|
pub struct SitemapEntry {
|
|
pub url: String,
|
|
pub last_modified: Option<String>,
|
|
pub priority: Option<f64>,
|
|
pub change_freq: Option<String>,
|
|
}
|
|
|
|
/// Discover all URLs from a site's sitemaps.
|
|
///
|
|
/// Discovery order:
|
|
/// 1. Fetch /robots.txt, parse `Sitemap:` directives
|
|
/// 2. Try common sitemap paths as fallback (skipping any already found)
|
|
/// 3. If sitemap index, recursively fetch child sitemaps
|
|
/// 4. Deduplicate by URL
|
|
///
|
|
/// Returns an empty vec (not an error) if no sitemaps are found.
|
|
pub async fn discover(
|
|
client: &FetchClient,
|
|
base_url: &str,
|
|
) -> Result<Vec<SitemapEntry>, FetchError> {
|
|
let base = base_url.trim_end_matches('/');
|
|
let mut sitemap_urls: Vec<String> = Vec::new();
|
|
|
|
// Step 1: Try robots.txt
|
|
let robots_url = format!("{base}/robots.txt");
|
|
debug!(url = %robots_url, "fetching robots.txt");
|
|
|
|
match client.fetch(&robots_url).await {
|
|
Ok(result) if result.status == 200 => {
|
|
let found = parse_robots_txt(&result.html);
|
|
debug!(count = found.len(), "sitemap URLs from robots.txt");
|
|
sitemap_urls.extend(found);
|
|
}
|
|
Ok(result) => {
|
|
debug!(status = result.status, "robots.txt not found");
|
|
}
|
|
Err(e) => {
|
|
debug!(error = %e, "failed to fetch robots.txt");
|
|
}
|
|
}
|
|
|
|
// Step 2: Try common sitemap paths (skipping any already discovered via robots.txt)
|
|
for path in FALLBACK_SITEMAP_PATHS {
|
|
let candidate = format!("{base}{path}");
|
|
if !sitemap_urls.iter().any(|u| u == &candidate) {
|
|
sitemap_urls.push(candidate);
|
|
}
|
|
}
|
|
|
|
// Step 3: Fetch and parse each sitemap, handling indexes recursively
|
|
let mut seen_urls: HashSet<String> = HashSet::new();
|
|
let mut entries: Vec<SitemapEntry> = Vec::new();
|
|
|
|
fetch_sitemaps(client, &sitemap_urls, &mut entries, &mut seen_urls, 0).await;
|
|
|
|
debug!(total = entries.len(), "sitemap discovery complete");
|
|
Ok(entries)
|
|
}
|
|
|
|
/// Recursively fetch and parse sitemap URLs, handling both urlsets and indexes.
|
|
async fn fetch_sitemaps(
|
|
client: &FetchClient,
|
|
urls: &[String],
|
|
entries: &mut Vec<SitemapEntry>,
|
|
seen_urls: &mut HashSet<String>,
|
|
depth: usize,
|
|
) {
|
|
if depth > MAX_RECURSION_DEPTH {
|
|
warn!(depth, "sitemap recursion limit reached, stopping");
|
|
return;
|
|
}
|
|
|
|
for sitemap_url in urls {
|
|
debug!(url = %sitemap_url, depth, "fetching sitemap");
|
|
|
|
let xml = match client.fetch(sitemap_url).await {
|
|
Ok(result) if result.status == 200 => result.html,
|
|
Ok(result) => {
|
|
debug!(url = %sitemap_url, status = result.status, "sitemap not found");
|
|
continue;
|
|
}
|
|
Err(e) => {
|
|
debug!(url = %sitemap_url, error = %e, "failed to fetch sitemap");
|
|
continue;
|
|
}
|
|
};
|
|
|
|
match detect_sitemap_type(&xml) {
|
|
SitemapType::UrlSet => {
|
|
let parsed = parse_urlset(&xml);
|
|
for entry in parsed {
|
|
if seen_urls.insert(entry.url.clone()) {
|
|
entries.push(entry);
|
|
}
|
|
}
|
|
}
|
|
SitemapType::Index => {
|
|
let child_urls = parse_sitemap_index(&xml);
|
|
debug!(count = child_urls.len(), "found child sitemaps in index");
|
|
|
|
// Box the recursive call to avoid large future sizes
|
|
Box::pin(fetch_sitemaps(
|
|
client,
|
|
&child_urls,
|
|
entries,
|
|
seen_urls,
|
|
depth + 1,
|
|
))
|
|
.await;
|
|
}
|
|
SitemapType::Unknown => {
|
|
debug!(url = %sitemap_url, "unrecognized sitemap format, skipping");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Pure parsing functions (no I/O, fully testable)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Extract `Sitemap:` directive URLs from robots.txt content.
///
/// Parsing rules, applied line by line:
/// * a UTF-8 byte-order mark at the very start of the text is ignored, so a
///   directive on the first line of a BOM-prefixed file is still recognised;
/// * everything from the first `#` on is treated as a comment and dropped;
/// * the directive name is the text before the first `:`, compared
///   case-insensitively and with surrounding whitespace ignored (so
///   `Sitemap :` is accepted);
/// * the value must contain a URL scheme separator (`://`), which rejects
///   empty values, relative paths and other junk.
///
/// Returns the accepted URLs in file order; duplicates are not removed.
pub fn parse_robots_txt(text: &str) -> Vec<String> {
    // `str::trim` does not treat U+FEFF as whitespace, so without this the
    // BOM would stay attached to the first directive name and hide it.
    let text = text.strip_prefix('\u{feff}').unwrap_or(text);

    text.lines()
        .filter_map(|line| {
            // Strip inline `#...` comments (robots.txt convention).
            let directive = line.split_once('#').map_or(line, |(before, _)| before);

            // The first colon ends the directive name; later colons belong
            // to the value (e.g. the one in `https://`).
            let (name, value) = directive.split_once(':')?;
            if !name.trim().eq_ignore_ascii_case("sitemap") {
                return None;
            }

            let url = value.trim();
            // An empty value cannot contain "://", so this also rejects it.
            url.contains("://").then(|| url.to_string())
        })
        .collect()
}
|
|
|
|
/// Parse a sitemap XML string. Handles both `<urlset>` and `<sitemapindex>`.
|
|
/// Returns entries from urlsets and recursion targets from indexes.
|
|
pub fn parse_sitemap_xml(xml: &str) -> Vec<SitemapEntry> {
|
|
match detect_sitemap_type(xml) {
|
|
SitemapType::UrlSet => parse_urlset(xml),
|
|
SitemapType::Index => {
|
|
// For the public parsing API, convert index <loc> entries into
|
|
// SitemapEntry with just the URL. The async `discover` function
|
|
// handles actual recursive fetching.
|
|
parse_sitemap_index(xml)
|
|
.into_iter()
|
|
.map(|url| SitemapEntry {
|
|
url,
|
|
last_modified: None,
|
|
priority: None,
|
|
change_freq: None,
|
|
})
|
|
.collect()
|
|
}
|
|
SitemapType::Unknown => Vec::new(),
|
|
}
|
|
}
|
|
|
|
/// Kind of sitemap document, decided by which known root element it contains.
#[derive(Debug, PartialEq)]
enum SitemapType {
    /// A `<urlset>` document listing page URLs.
    UrlSet,
    /// A `<sitemapindex>` document listing further sitemaps.
    Index,
    /// Neither root element was found, or the XML could not be read.
    Unknown,
}
|
|
|
|
/// Peek at the first element to determine if this is a urlset or sitemapindex.
|
|
fn detect_sitemap_type(xml: &str) -> SitemapType {
|
|
let mut reader = Reader::from_str(xml);
|
|
let mut buf = Vec::new();
|
|
|
|
loop {
|
|
match reader.read_event_into(&mut buf) {
|
|
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
|
|
let name = e.local_name();
|
|
return match name.as_ref() {
|
|
b"urlset" => SitemapType::UrlSet,
|
|
b"sitemapindex" => SitemapType::Index,
|
|
_ => continue, // skip processing instructions, comments
|
|
};
|
|
}
|
|
Ok(Event::Eof) => return SitemapType::Unknown,
|
|
Err(_) => return SitemapType::Unknown,
|
|
_ => continue,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parse `<url>` entries from a `<urlset>` sitemap.
|
|
fn parse_urlset(xml: &str) -> Vec<SitemapEntry> {
|
|
let mut reader = Reader::from_str(xml);
|
|
let mut buf = Vec::new();
|
|
let mut entries = Vec::new();
|
|
|
|
// State for current <url> element being parsed
|
|
let mut in_url = false;
|
|
let mut current_tag: Option<UrlTag> = None;
|
|
let mut loc: Option<String> = None;
|
|
let mut lastmod: Option<String> = None;
|
|
let mut priority: Option<f64> = None;
|
|
let mut changefreq: Option<String> = None;
|
|
|
|
loop {
|
|
match reader.read_event_into(&mut buf) {
|
|
Ok(Event::Start(ref e)) => {
|
|
let name = e.local_name();
|
|
match name.as_ref() {
|
|
b"url" => {
|
|
in_url = true;
|
|
loc = None;
|
|
lastmod = None;
|
|
priority = None;
|
|
changefreq = None;
|
|
}
|
|
b"loc" if in_url => current_tag = Some(UrlTag::Loc),
|
|
b"lastmod" if in_url => current_tag = Some(UrlTag::LastMod),
|
|
b"priority" if in_url => current_tag = Some(UrlTag::Priority),
|
|
b"changefreq" if in_url => current_tag = Some(UrlTag::ChangeFreq),
|
|
_ => current_tag = None,
|
|
}
|
|
}
|
|
Ok(Event::Text(ref e)) => {
|
|
if let Some(ref tag) = current_tag
|
|
&& let Ok(text) = e.unescape()
|
|
{
|
|
let text = text.trim().to_string();
|
|
if !text.is_empty() {
|
|
match tag {
|
|
UrlTag::Loc => loc = Some(text),
|
|
UrlTag::LastMod => lastmod = Some(text),
|
|
UrlTag::Priority => priority = text.parse().ok(),
|
|
UrlTag::ChangeFreq => changefreq = Some(text),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
Ok(Event::End(ref e)) => {
|
|
let name = e.local_name();
|
|
if name.as_ref() == b"url" && in_url {
|
|
if let Some(url) = loc.take() {
|
|
entries.push(SitemapEntry {
|
|
url,
|
|
last_modified: lastmod.take(),
|
|
priority: priority.take(),
|
|
change_freq: changefreq.take(),
|
|
});
|
|
}
|
|
in_url = false;
|
|
}
|
|
current_tag = None;
|
|
}
|
|
Ok(Event::Eof) => break,
|
|
Err(e) => {
|
|
warn!(error = %e, "XML parse error in sitemap, returning partial results");
|
|
break;
|
|
}
|
|
_ => {}
|
|
}
|
|
buf.clear();
|
|
}
|
|
|
|
entries
|
|
}
|
|
|
|
/// Child element of `<url>` whose text content is currently being read.
#[derive(Debug)]
enum UrlTag {
    /// `<loc>`
    Loc,
    /// `<lastmod>`
    LastMod,
    /// `<priority>`
    Priority,
    /// `<changefreq>`
    ChangeFreq,
}
|
|
|
|
/// Parse `<sitemap>` entries from a `<sitemapindex>`, returning child sitemap URLs.
|
|
fn parse_sitemap_index(xml: &str) -> Vec<String> {
|
|
let mut reader = Reader::from_str(xml);
|
|
let mut buf = Vec::new();
|
|
let mut urls = Vec::new();
|
|
|
|
let mut in_sitemap = false;
|
|
let mut in_loc = false;
|
|
|
|
loop {
|
|
match reader.read_event_into(&mut buf) {
|
|
Ok(Event::Start(ref e)) => {
|
|
let name = e.local_name();
|
|
match name.as_ref() {
|
|
b"sitemap" => in_sitemap = true,
|
|
b"loc" if in_sitemap => in_loc = true,
|
|
_ => {}
|
|
}
|
|
}
|
|
Ok(Event::Text(ref e)) => {
|
|
if in_loc && let Ok(text) = e.unescape() {
|
|
let text = text.trim().to_string();
|
|
if !text.is_empty() {
|
|
urls.push(text);
|
|
}
|
|
}
|
|
}
|
|
Ok(Event::End(ref e)) => {
|
|
let name = e.local_name();
|
|
match name.as_ref() {
|
|
b"sitemap" => {
|
|
in_sitemap = false;
|
|
in_loc = false;
|
|
}
|
|
b"loc" => in_loc = false,
|
|
_ => {}
|
|
}
|
|
}
|
|
Ok(Event::Eof) => break,
|
|
Err(e) => {
|
|
warn!(error = %e, "XML parse error in sitemap index, returning partial results");
|
|
break;
|
|
}
|
|
_ => {}
|
|
}
|
|
buf.clear();
|
|
}
|
|
|
|
urls
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Tests
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the owned URL list that the robots.txt parser returns.
    fn owned(urls: &[&str]) -> Vec<String> {
        urls.iter().map(|u| u.to_string()).collect()
    }

    // --- robots.txt -------------------------------------------------------

    #[test]
    fn robots_txt_basic() {
        let robots = "User-agent: *\nSitemap: https://example.com/sitemap.xml\n";
        assert_eq!(
            parse_robots_txt(robots),
            owned(&["https://example.com/sitemap.xml"])
        );
    }

    #[test]
    fn robots_txt_case_insensitive() {
        let robots = "SITEMAP: https://a.example.com/s.xml\nsitemap: https://b.example.com/s.xml\n";
        let found = parse_robots_txt(robots);
        assert_eq!(found.len(), 2);
        for expected in ["https://a.example.com/s.xml", "https://b.example.com/s.xml"] {
            assert!(found.iter().any(|url| url == expected));
        }
    }

    #[test]
    fn robots_txt_tolerates_space_before_colon() {
        // Some malformed generators put a space between the name and the colon.
        let robots = "Sitemap : https://example.com/sitemap.xml\n";
        assert_eq!(
            parse_robots_txt(robots),
            owned(&["https://example.com/sitemap.xml"])
        );
    }

    #[test]
    fn robots_txt_strips_inline_comments() {
        let robots = "Sitemap: https://example.com/s.xml # main sitemap\n";
        assert_eq!(
            parse_robots_txt(robots),
            owned(&["https://example.com/s.xml"])
        );
    }

    #[test]
    fn robots_txt_rejects_empty_value() {
        let found = parse_robots_txt("Sitemap:\nSitemap: \n");
        assert!(found.is_empty());
    }

    #[test]
    fn robots_txt_rejects_non_url_value() {
        // Values without a scheme (relative paths, free text) are not accepted.
        let found = parse_robots_txt("Sitemap: /sitemap.xml\nSitemap: junk text\n");
        assert!(found.is_empty());
    }

    #[test]
    fn robots_txt_ignores_non_sitemap_directives() {
        let found = parse_robots_txt("User-agent: *\nDisallow: /admin\nAllow: /\n");
        assert!(found.is_empty());
    }

    #[test]
    fn test_parse_robots_txt() {
        let robots = concat!(
            "User-agent: *\n",
            "Disallow: /admin/\n",
            "\n",
            "Sitemap: https://example.com/sitemap.xml\n",
            "sitemap: https://example.com/sitemap-news.xml\n",
            "SITEMAP: https://example.com/sitemap-images.xml\n",
            "\n",
            "User-agent: Googlebot\n",
            "Allow: /\n",
        );

        let found = parse_robots_txt(robots);
        assert_eq!(found.len(), 3);
        assert_eq!(found[0], "https://example.com/sitemap.xml");
        assert_eq!(found[1], "https://example.com/sitemap-news.xml");
        assert_eq!(found[2], "https://example.com/sitemap-images.xml");
    }

    #[test]
    fn test_parse_robots_txt_empty_value() {
        // Directives without a URL are skipped; the valid one survives.
        let found = parse_robots_txt("Sitemap:\nSitemap: \nSitemap: https://example.com/s.xml\n");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0], "https://example.com/s.xml");
    }

    // --- urlset / index parsing -------------------------------------------

    #[test]
    fn test_parse_urlset() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/</loc>
<lastmod>2026-01-15</lastmod>
<changefreq>daily</changefreq>
<priority>1.0</priority>
</url>
<url>
<loc>https://example.com/about</loc>
<lastmod>2026-01-10</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://example.com/blog/post-1</loc>
</url>
</urlset>"#;

        let parsed = parse_urlset(xml);
        assert_eq!(parsed.len(), 3);

        let home = &parsed[0];
        assert_eq!(home.url, "https://example.com/");
        assert_eq!(home.last_modified.as_deref(), Some("2026-01-15"));
        assert_eq!(home.change_freq.as_deref(), Some("daily"));
        assert_eq!(home.priority, Some(1.0));

        let about = &parsed[1];
        assert_eq!(about.url, "https://example.com/about");
        assert_eq!(about.priority, Some(0.8));

        let post = &parsed[2];
        assert_eq!(post.url, "https://example.com/blog/post-1");
        assert_eq!(post.last_modified, None);
        assert_eq!(post.priority, None);
        assert_eq!(post.change_freq, None);
    }

    #[test]
    fn test_parse_sitemap_index() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>https://example.com/sitemap-posts.xml</loc>
<lastmod>2026-03-01</lastmod>
</sitemap>
<sitemap>
<loc>https://example.com/sitemap-pages.xml</loc>
</sitemap>
</sitemapindex>"#;

        let children = parse_sitemap_index(xml);
        assert_eq!(children.len(), 2);
        assert_eq!(children[0], "https://example.com/sitemap-posts.xml");
        assert_eq!(children[1], "https://example.com/sitemap-pages.xml");
    }

    #[test]
    fn test_parse_sitemap_xml_dispatches_urlset() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page</loc></url>
</urlset>"#;

        let parsed = parse_sitemap_xml(xml);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].url, "https://example.com/page");
    }

    #[test]
    fn test_parse_sitemap_xml_dispatches_index() {
        let xml = r#"<?xml version="1.0"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>
</sitemapindex>"#;

        let parsed = parse_sitemap_xml(xml);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].url, "https://example.com/sitemap-1.xml");
        // Entries produced from an index carry no metadata.
        assert_eq!(parsed[0].priority, None);
    }

    #[test]
    fn test_deduplicate() {
        // The parser itself keeps duplicates; the HashSet used by the
        // discovery path is what collapses them. Reproduce that step here.
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/page</loc></url>
<url><loc>https://example.com/page</loc></url>
<url><loc>https://example.com/other</loc></url>
</urlset>"#;

        let parsed = parse_urlset(xml);
        assert_eq!(parsed.len(), 3, "parser returns all entries");

        let mut seen = HashSet::new();
        let mut unique = Vec::new();
        for entry in parsed {
            if seen.insert(entry.url.clone()) {
                unique.push(entry);
            }
        }
        assert_eq!(unique.len(), 2, "dedup collapses duplicates");
    }

    #[test]
    fn test_empty_sitemap() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>"#;

        assert!(parse_urlset(xml).is_empty());
    }

    // --- malformed input --------------------------------------------------

    #[test]
    fn test_malformed_xml() {
        let parsed = parse_sitemap_xml("this is not xml at all <><><");
        assert!(parsed.is_empty(), "malformed XML returns empty vec");
    }

    #[test]
    fn test_malformed_xml_partial() {
        // Starts out valid, then the document is cut off mid-element.
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://example.com/good</loc></url>
<url><loc>broken
"#;
        let parsed = parse_sitemap_xml(xml);
        // The entry completed before the break must still be returned.
        assert!(!parsed.is_empty());
        assert_eq!(parsed[0].url, "https://example.com/good");
    }

    #[test]
    fn test_missing_loc() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<lastmod>2026-01-01</lastmod>
<priority>0.5</priority>
</url>
<url>
<loc>https://example.com/valid</loc>
</url>
</urlset>"#;

        let parsed = parse_urlset(xml);
        assert_eq!(parsed.len(), 1, "entry without <loc> is skipped");
        assert_eq!(parsed[0].url, "https://example.com/valid");
    }

    #[test]
    fn test_priority_parsing() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/high</loc>
<priority>1.0</priority>
</url>
<url>
<loc>https://example.com/mid</loc>
<priority>0.5</priority>
</url>
<url>
<loc>https://example.com/low</loc>
<priority>0.1</priority>
</url>
<url>
<loc>https://example.com/invalid</loc>
<priority>not-a-number</priority>
</url>
</urlset>"#;

        let parsed = parse_urlset(xml);
        assert_eq!(parsed.len(), 4);

        let valid = [Some(1.0), Some(0.5), Some(0.1)];
        for (entry, expected) in parsed.iter().zip(valid) {
            assert_eq!(entry.priority, expected);
        }
        assert_eq!(parsed[3].priority, None, "invalid priority parses as None");
    }

    // --- type detection and constants -------------------------------------

    #[test]
    fn test_detect_sitemap_type() {
        let urlset = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        let index = r#"<?xml version="1.0"?><sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></sitemapindex>"#;

        let cases = [
            (urlset, SitemapType::UrlSet),
            (index, SitemapType::Index),
            ("garbage", SitemapType::Unknown),
            ("", SitemapType::Unknown),
        ];
        for (input, expected) in cases {
            assert_eq!(detect_sitemap_type(input), expected);
        }
    }

    #[test]
    fn test_fallback_paths_constant() {
        // Every conventional location must stay in the fallback list.
        for path in [
            "/sitemap.xml",
            "/sitemap_index.xml",
            "/wp-sitemap.xml",
            "/sitemap/sitemap-index.xml",
        ] {
            assert!(FALLBACK_SITEMAP_PATHS.contains(&path));
        }
    }
}
|