Mirror of https://github.com/0xMassi/webclaw.git, synced 2026-05-15 18:25:24 +02:00
Initial release: webclaw v0.1.0 — web content extraction for LLMs
CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
Commit c99ec684fa
79 changed files with 24074 additions and 0 deletions
crates/webclaw-fetch/src/browser.rs (new file, 96 lines)
@@ -0,0 +1,96 @@
/// Browser fingerprint selection and rotation.
/// Maps our simple `BrowserProfile` enum to primp's impersonation profiles.
use primp::{Impersonate, ImpersonateOS};

/// Which browser identity to present at the TLS/HTTP layer.
#[derive(Debug, Clone, Default)]
pub enum BrowserProfile {
    #[default]
    Chrome,
    Firefox,
    /// Randomly pick from all available profiles on each request.
    Random,
}

/// A complete impersonation profile: browser + OS.
#[derive(Debug, Clone)]
pub struct ImpersonateProfile {
    pub browser: Impersonate,
    pub os: ImpersonateOS,
}

/// All Chrome profiles we ship, newest first.
pub fn chrome_profiles() -> Vec<ImpersonateProfile> {
    vec![
        ImpersonateProfile {
            browser: Impersonate::ChromeV145,
            os: ImpersonateOS::Windows,
        },
        ImpersonateProfile {
            browser: Impersonate::ChromeV145,
            os: ImpersonateOS::MacOS,
        },
        ImpersonateProfile {
            browser: Impersonate::ChromeV144,
            os: ImpersonateOS::Windows,
        },
        ImpersonateProfile {
            browser: Impersonate::ChromeV144,
            os: ImpersonateOS::Linux,
        },
    ]
}

/// All Firefox profiles we ship, newest first.
pub fn firefox_profiles() -> Vec<ImpersonateProfile> {
    vec![
        ImpersonateProfile {
            browser: Impersonate::FirefoxV146,
            os: ImpersonateOS::Windows,
        },
        ImpersonateProfile {
            browser: Impersonate::FirefoxV146,
            os: ImpersonateOS::Linux,
        },
        ImpersonateProfile {
            browser: Impersonate::FirefoxV140,
            os: ImpersonateOS::Windows,
        },
    ]
}

/// Safari + Edge + Opera profiles for maximum diversity in Random mode.
pub fn extra_profiles() -> Vec<ImpersonateProfile> {
    vec![
        ImpersonateProfile {
            browser: Impersonate::SafariV18_5,
            os: ImpersonateOS::MacOS,
        },
        ImpersonateProfile {
            browser: Impersonate::SafariV26,
            os: ImpersonateOS::MacOS,
        },
        ImpersonateProfile {
            browser: Impersonate::EdgeV145,
            os: ImpersonateOS::Windows,
        },
        ImpersonateProfile {
            browser: Impersonate::OperaV127,
            os: ImpersonateOS::Windows,
        },
    ]
}

pub fn latest_chrome() -> ImpersonateProfile {
    ImpersonateProfile {
        browser: Impersonate::ChromeV145,
        os: ImpersonateOS::Windows,
    }
}

pub fn latest_firefox() -> ImpersonateProfile {
    ImpersonateProfile {
        browser: Impersonate::FirefoxV146,
        os: ImpersonateOS::Windows,
    }
}
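For orientation, here is a minimal sketch (not part of the commit) of how these profile lists combine under `BrowserProfile::Random`; it mirrors what `collect_profiles` in client.rs below does, and assumes the same `rand` crate API the workspace already uses.

use rand::seq::SliceRandom;

/// Hypothetical helper: gather every shipped profile and choose one uniformly.
fn pick_any_profile() -> ImpersonateProfile {
    let mut all = chrome_profiles();
    all.extend(firefox_profiles());
    all.extend(extra_profiles());
    // The lists above are non-empty by construction, so `choose` cannot fail.
    all.choose(&mut rand::thread_rng()).unwrap().clone()
}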
crates/webclaw-fetch/src/client.rs (new file, 798 lines)
@@ -0,0 +1,798 @@
/// HTTP client with browser TLS fingerprint impersonation.
/// Wraps primp to provide a simple fetch interface with optional
/// content extraction via webclaw-core. Supports single and batch operations.
/// Automatically detects PDF responses and extracts text via webclaw-pdf.
///
/// Two proxy modes:
/// - **Static**: single proxy (or none) baked into pre-built clients at construction.
/// - **Rotating**: pre-built pool of clients, each with a different proxy + fingerprint.
///   Same-host URLs are routed to the same client for HTTP/2 connection reuse.
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::time::{Duration, Instant};

use rand::seq::SliceRandom;
use tokio::sync::Semaphore;
use tracing::{debug, instrument, warn};
use webclaw_pdf::PdfMode;

use crate::browser::{self, BrowserProfile, ImpersonateProfile};
use crate::error::FetchError;

/// Configuration for building a [`FetchClient`].
#[derive(Debug, Clone)]
pub struct FetchConfig {
    pub browser: BrowserProfile,
    /// Single proxy URL. Used when `proxy_pool` is empty.
    pub proxy: Option<String>,
    /// Pool of proxy URLs to rotate through.
    /// When non-empty, each proxy gets a pre-built client with a
    /// random browser fingerprint. Same-host URLs reuse the same client
    /// for HTTP/2 connection multiplexing.
    pub proxy_pool: Vec<String>,
    pub timeout: Duration,
    pub follow_redirects: bool,
    pub max_redirects: u32,
    pub headers: HashMap<String, String>,
    pub pdf_mode: PdfMode,
}

impl Default for FetchConfig {
    fn default() -> Self {
        Self {
            browser: BrowserProfile::Chrome,
            proxy: None,
            proxy_pool: Vec::new(),
            timeout: Duration::from_secs(30),
            follow_redirects: true,
            max_redirects: 10,
            headers: HashMap::from([("Accept-Language".to_string(), "en-US,en;q=0.9".to_string())]),
            pdf_mode: PdfMode::default(),
        }
    }
}
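
// Added note (not in the original file): a typical override keeps the defaults
// and changes only a few fields via struct-update syntax, e.g.
//
//     let config = FetchConfig {
//         browser: BrowserProfile::Random,
//         timeout: Duration::from_secs(10),
//         ..FetchConfig::default()
//     };
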
/// Result of a successful fetch.
#[derive(Debug, Clone)]
pub struct FetchResult {
    pub html: String,
    pub status: u16,
    /// Final URL after any redirects.
    pub url: String,
    pub headers: HashMap<String, String>,
    pub elapsed: Duration,
}

/// Result for a single URL in a batch fetch operation.
#[derive(Debug)]
pub struct BatchResult {
    pub url: String,
    pub result: Result<FetchResult, FetchError>,
}

/// Result for a single URL in a batch fetch-and-extract operation.
#[derive(Debug)]
pub struct BatchExtractResult {
    pub url: String,
    pub result: Result<webclaw_core::ExtractionResult, FetchError>,
}

/// Internal representation of the client pool strategy.
enum ClientPool {
    /// Pre-built clients with a fixed proxy (or no proxy).
    /// Fingerprint rotation still works via the pool when `random` is true.
    Static {
        clients: Vec<primp::Client>,
        random: bool,
    },
    /// Pre-built pool of clients, each with a different proxy + fingerprint.
    /// Requests pick a client deterministically by host for HTTP/2 connection reuse.
    Rotating { clients: Vec<primp::Client> },
}

/// HTTP client that impersonates browser TLS fingerprints via primp.
///
/// Operates in two modes:
/// - **Static pool**: pre-built primp clients, optionally with fingerprint rotation.
///   Used when no `proxy_pool` is configured. Fast (no per-request construction).
/// - **Rotating pool**: pre-built primp clients, one per proxy in the pool.
///   Same-host URLs are routed to the same client for HTTP/2 multiplexing.
pub struct FetchClient {
    pool: ClientPool,
    pdf_mode: PdfMode,
}

impl FetchClient {
    /// Build a new client from config.
    ///
    /// When `config.proxy_pool` is non-empty, pre-builds one primp client per proxy,
    /// each with a randomly assigned fingerprint. Same-host URLs get routed to the
    /// same client for HTTP/2 connection reuse.
    ///
    /// When `proxy_pool` is empty, pre-builds primp clients at construction time
    /// (one per fingerprint for `Random` profiles, one for fixed profiles).
    pub fn new(config: FetchConfig) -> Result<Self, FetchError> {
        let profiles = collect_profiles(&config.browser);
        let pdf_mode = config.pdf_mode.clone();

        let pool = if config.proxy_pool.is_empty() {
            let clients = profiles
                .into_iter()
                .map(|p| build_primp_client(&config, &p, config.proxy.as_deref()))
                .collect::<Result<Vec<_>, _>>()?;

            let random = matches!(config.browser, BrowserProfile::Random);
            debug!(
                count = clients.len(),
                random, "fetch client ready (static pool)"
            );

            ClientPool::Static { clients, random }
        } else {
            let mut rng = rand::thread_rng();

            let clients = config
                .proxy_pool
                .iter()
                .map(|proxy| {
                    let p = profiles.choose(&mut rng).unwrap().clone();
                    build_primp_client(&config, &p, Some(proxy))
                })
                .collect::<Result<Vec<_>, _>>()?;

            debug!(
                clients = clients.len(),
                profiles = profiles.len(),
                "fetch client ready (pre-built rotating pool)"
            );

            ClientPool::Rotating { clients }
        };

        Ok(Self { pool, pdf_mode })
    }

    /// Fetch a URL and return the raw HTML + response metadata.
    ///
    /// Automatically retries on transient failures (network errors, 5xx, 429)
    /// with increasing backoff: 0s, 1s, 3s (3 attempts total).
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
        let delays = [
            Duration::ZERO,
            Duration::from_secs(1),
            Duration::from_secs(3),
        ];
        let mut last_err = None;

        for (attempt, delay) in delays.iter().enumerate() {
            if attempt > 0 {
                tokio::time::sleep(*delay).await;
            }

            match self.fetch_once(url).await {
                Ok(result) => {
                    if is_retryable_status(result.status) && attempt < delays.len() - 1 {
                        warn!(
                            url,
                            status = result.status,
                            attempt = attempt + 1,
                            "retryable status, will retry"
                        );
                        last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
                        continue;
                    }
                    if attempt > 0 {
                        debug!(url, attempt = attempt + 1, "retry succeeded");
                    }
                    return Ok(result);
                }
                Err(e) => {
                    if !is_retryable_error(&e) || attempt == delays.len() - 1 {
                        return Err(e);
                    }
                    warn!(
                        url,
                        error = %e,
                        attempt = attempt + 1,
                        "transient error, will retry"
                    );
                    last_err = Some(e);
                }
            }
        }

        Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
    }

    /// Single fetch attempt (no retry).
    async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
        let start = Instant::now();

        let client = match &self.pool {
            ClientPool::Static { clients, random } => {
                if *random {
                    let host = extract_host(url);
                    pick_for_host(clients, &host)
                } else {
                    &clients[0]
                }
            }
            ClientPool::Rotating { clients } => pick_random(clients),
        };

        let response = client.get(url).send().await?;

        let status = response.status().as_u16();
        let final_url = response.url().to_string();

        let headers: HashMap<String, String> = response
            .headers()
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
            .collect();

        let html = response
            .text()
            .await
            .map_err(|e| FetchError::BodyDecode(e.to_string()))?;

        let elapsed = start.elapsed();
        debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");

        Ok(FetchResult {
            html,
            status,
            url: final_url,
            headers,
            elapsed,
        })
    }

    /// Fetch a URL then extract structured content.
    ///
    /// Automatically detects PDF responses via Content-Type header and routes
    /// to webclaw-pdf for text extraction. HTML responses go through webclaw-core.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch_and_extract(
        &self,
        url: &str,
    ) -> Result<webclaw_core::ExtractionResult, FetchError> {
        self.fetch_and_extract_with_options(url, &webclaw_core::ExtractionOptions::default())
            .await
    }

    /// Fetch a URL then extract structured content with custom extraction options.
    ///
    /// Same as [`fetch_and_extract`] but accepts `ExtractionOptions` for CSS selector
    /// filtering, main-content-only mode, etc. Options only apply to HTML responses;
    /// PDF extraction ignores them (no DOM to filter).
    #[instrument(skip(self, options), fields(url = %url))]
    pub async fn fetch_and_extract_with_options(
        &self,
        url: &str,
        options: &webclaw_core::ExtractionOptions,
    ) -> Result<webclaw_core::ExtractionResult, FetchError> {
        // Reddit fallback: use their JSON API to get post + full comment tree
        if crate::reddit::is_reddit_url(url) {
            let json_url = crate::reddit::json_url(url);
            debug!("reddit detected, fetching {json_url}");

            let client = self.pick_client(&json_url);
            let response = client.get(&json_url).send().await?;
            if response.status().is_success() {
                let bytes = response
                    .bytes()
                    .await
                    .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
                match crate::reddit::parse_reddit_json(&bytes, url) {
                    Ok(result) => return Ok(result),
                    Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
                }
            }
        }

        let start = Instant::now();
        let client = self.pick_client(url);
        let response = client.get(url).send().await?;

        let status = response.status().as_u16();
        let final_url = response.url().to_string();

        let headers: HashMap<String, String> = response
            .headers()
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
            .collect();

        let is_pdf = is_pdf_content_type(&headers);

        if is_pdf {
            debug!(status, "detected PDF response, using pdf extraction");

            let bytes = response
                .bytes()
                .await
                .map_err(|e| FetchError::BodyDecode(e.to_string()))?;

            let elapsed = start.elapsed();
            debug!(
                status,
                bytes = bytes.len(),
                elapsed_ms = %elapsed.as_millis(),
                "PDF fetch complete"
            );

            let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
            Ok(pdf_to_extraction_result(&pdf_result, &final_url))
        } else {
            let html = response
                .text()
                .await
                .map_err(|e| FetchError::BodyDecode(e.to_string()))?;

            let elapsed = start.elapsed();
            debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");

            // LinkedIn: extract from embedded <code> JSON blobs
            if crate::linkedin::is_linkedin_post(&final_url) {
                if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
                    debug!("linkedin extraction succeeded");
                    return Ok(result);
                }
                debug!("linkedin extraction failed, falling back to standard");
            }

            let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
            Ok(extraction)
        }
    }

    /// Fetch multiple URLs concurrently with bounded parallelism.
    ///
    /// Spawns one task per URL, bounded by a semaphore. Results are returned
    /// in the same order as the input URLs, regardless of completion order.
    pub async fn fetch_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());

        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();

            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch(&url).await;
                (idx, BatchResult { url, result })
            }));
        }

        collect_ordered(handles, urls.len()).await
    }

    /// Fetch and extract multiple URLs concurrently with bounded parallelism.
    ///
    /// Same semantics as [`fetch_batch`] but runs extraction on each response.
    /// Results preserve input URL order.
    pub async fn fetch_and_extract_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchExtractResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());

        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();

            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch_and_extract(&url).await;
                (idx, BatchExtractResult { url, result })
            }));
        }

        collect_ordered(handles, urls.len()).await
    }

    /// Returns the number of proxies in the rotation pool, or 0 if static mode.
    pub fn proxy_pool_size(&self) -> usize {
        match &self.pool {
            ClientPool::Static { .. } => 0,
            ClientPool::Rotating { clients } => clients.len(),
        }
    }

    /// Pick a client from the pool for a given URL.
    fn pick_client(&self, url: &str) -> &primp::Client {
        match &self.pool {
            ClientPool::Static { clients, random } => {
                if *random {
                    let host = extract_host(url);
                    pick_for_host(clients, &host)
                } else {
                    &clients[0]
                }
            }
            ClientPool::Rotating { clients } => pick_random(clients),
        }
    }
}
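
// Added note (not in the original file): `fetch_batch` takes `self: &Arc<Self>`
// because each spawned task needs an owned handle to the client, e.g.
//
//     let client = Arc::new(FetchClient::new(FetchConfig::default())?);
//     let results = client.fetch_batch(&["https://example.com"], 4).await;
//     assert_eq!(results[0].url, "https://example.com");
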
/// Collect the impersonation profiles to use based on the browser profile.
fn collect_profiles(profile: &BrowserProfile) -> Vec<ImpersonateProfile> {
    match profile {
        BrowserProfile::Random => {
            let mut profiles = Vec::new();
            profiles.extend(browser::chrome_profiles());
            profiles.extend(browser::firefox_profiles());
            profiles.extend(browser::extra_profiles());
            profiles
        }
        BrowserProfile::Chrome => vec![browser::latest_chrome()],
        BrowserProfile::Firefox => vec![browser::latest_firefox()],
    }
}

/// Extract the host from a URL, returning empty string on parse failure.
fn extract_host(url: &str) -> String {
    url::Url::parse(url)
        .ok()
        .and_then(|u| u.host_str().map(String::from))
        .unwrap_or_default()
}

/// Pick a client deterministically based on a host string.
/// Same host always gets the same client, enabling HTTP/2 connection reuse.
fn pick_for_host<'a>(clients: &'a [primp::Client], host: &str) -> &'a primp::Client {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    host.hash(&mut hasher);
    let idx = (hasher.finish() as usize) % clients.len();
    &clients[idx]
}

/// Pick a random client from the pool for per-request rotation.
fn pick_random(clients: &[primp::Client]) -> &primp::Client {
    use rand::Rng;
    let idx = rand::thread_rng().gen_range(0..clients.len());
    &clients[idx]
}

/// Status codes worth retrying: server errors + rate limiting.
fn is_retryable_status(status: u16) -> bool {
    status == 429
        || status == 502
        || status == 503
        || status == 504
        || status == 520
        || status == 521
        || status == 522
        || status == 523
        || status == 524
}
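
// Added note (not in the original file): 502/503/504 are standard gateway
// errors; 520-524 are Cloudflare's origin-error family, which sits in front
// of a large share of crawl targets.
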
/// Errors worth retrying: network/connection failures (not client errors).
fn is_retryable_error(err: &FetchError) -> bool {
    matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_))
}

fn is_pdf_content_type(headers: &HashMap<String, String>) -> bool {
    headers
        .get("content-type")
        .map(|ct| {
            let mime = ct.split(';').next().unwrap_or("").trim();
            mime.eq_ignore_ascii_case("application/pdf")
        })
        .unwrap_or(false)
}

/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
fn pdf_to_extraction_result(
    pdf: &webclaw_pdf::PdfResult,
    url: &str,
) -> webclaw_core::ExtractionResult {
    let markdown = webclaw_pdf::to_markdown(pdf);
    let word_count = markdown.split_whitespace().count();

    webclaw_core::ExtractionResult {
        metadata: webclaw_core::Metadata {
            title: pdf.metadata.title.clone(),
            description: pdf.metadata.subject.clone(),
            author: pdf.metadata.author.clone(),
            published_date: None,
            language: None,
            url: Some(url.to_string()),
            site_name: None,
            image: None,
            favicon: None,
            word_count,
        },
        content: webclaw_core::Content {
            markdown,
            plain_text: pdf.text.clone(),
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    }
}

/// Collect spawned tasks and reorder results to match input order.
async fn collect_ordered<T>(
    handles: Vec<tokio::task::JoinHandle<(usize, T)>>,
    len: usize,
) -> Vec<T> {
    let mut slots: Vec<Option<T>> = (0..len).map(|_| None).collect();

    for handle in handles {
        match handle.await {
            Ok((idx, result)) => {
                slots[idx] = Some(result);
            }
            Err(e) => {
                warn!(error = %e, "batch task panicked");
            }
        }
    }

    slots.into_iter().flatten().collect()
}

/// Build a single primp Client from config + impersonation profile + optional proxy.
fn build_primp_client(
    config: &FetchConfig,
    profile: &ImpersonateProfile,
    proxy: Option<&str>,
) -> Result<primp::Client, FetchError> {
    let redirect_policy = if config.follow_redirects {
        primp::redirect::Policy::limited(config.max_redirects as usize)
    } else {
        primp::redirect::Policy::none()
    };

    let mut headers = primp::header::HeaderMap::new();
    for (k, v) in &config.headers {
        if let (Ok(name), Ok(val)) = (
            primp::header::HeaderName::from_bytes(k.as_bytes()),
            primp::header::HeaderValue::from_str(v),
        ) {
            headers.insert(name, val);
        }
    }

    let mut builder = primp::Client::builder()
        .impersonate(profile.browser)
        .impersonate_os(profile.os)
        .cookie_store(true)
        .timeout(config.timeout)
        .redirect(redirect_policy)
        .default_headers(headers);

    if let Some(proxy_url) = proxy {
        builder = builder
            .proxy(primp::Proxy::all(proxy_url).map_err(|e| FetchError::Build(e.to_string()))?);
    }

    builder
        .build()
        .map_err(|e| FetchError::Build(e.to_string()))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_batch_result_struct() {
        let ok = BatchResult {
            url: "https://example.com".to_string(),
            result: Ok(FetchResult {
                html: "<html></html>".to_string(),
                status: 200,
                url: "https://example.com".to_string(),
                headers: HashMap::new(),
                elapsed: Duration::from_millis(42),
            }),
        };
        assert_eq!(ok.url, "https://example.com");
        assert!(ok.result.is_ok());
        assert_eq!(ok.result.unwrap().status, 200);

        let err = BatchResult {
            url: "https://bad.example".to_string(),
            result: Err(FetchError::InvalidUrl("bad url".into())),
        };
        assert!(err.result.is_err());
    }

    #[test]
    fn test_batch_extract_result_struct() {
        let err = BatchExtractResult {
            url: "https://example.com".to_string(),
            result: Err(FetchError::BodyDecode("timeout".into())),
        };
        assert_eq!(err.url, "https://example.com");
        assert!(err.result.is_err());
    }

    #[tokio::test]
    async fn test_batch_preserves_order() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (2, "c".to_string()) }),
            tokio::spawn(async { (0, "a".to_string()) }),
            tokio::spawn(async { (1, "b".to_string()) }),
        ];

        let results = collect_ordered(handles, 3).await;
        assert_eq!(results, vec!["a", "b", "c"]);
    }

    #[tokio::test]
    async fn test_collect_ordered_handles_gaps() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (0, "first".to_string()) }),
            tokio::spawn(async { (2, "third".to_string()) }),
        ];

        let results = collect_ordered(handles, 3).await;
        assert_eq!(results.len(), 2);
        assert_eq!(results[0], "first");
        assert_eq!(results[1], "third");
    }

    #[test]
    fn test_is_pdf_content_type() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "application/pdf".to_string());
        assert!(is_pdf_content_type(&headers));

        headers.insert(
            "content-type".to_string(),
            "application/pdf; charset=utf-8".to_string(),
        );
        assert!(is_pdf_content_type(&headers));

        headers.insert("content-type".to_string(), "Application/PDF".to_string());
        assert!(is_pdf_content_type(&headers));

        headers.insert("content-type".to_string(), "text/html".to_string());
        assert!(!is_pdf_content_type(&headers));

        let empty: HashMap<String, String> = HashMap::new();
        assert!(!is_pdf_content_type(&empty));
    }

    #[test]
    fn test_pdf_to_extraction_result() {
        let pdf = webclaw_pdf::PdfResult {
            text: "Hello from PDF.".into(),
            page_count: 2,
            metadata: webclaw_pdf::PdfMetadata {
                title: Some("My Doc".into()),
                author: Some("Author".into()),
                subject: Some("Testing".into()),
                creator: None,
            },
        };

        let result = pdf_to_extraction_result(&pdf, "https://example.com/doc.pdf");

        assert_eq!(result.metadata.title.as_deref(), Some("My Doc"));
        assert_eq!(result.metadata.author.as_deref(), Some("Author"));
        assert_eq!(result.metadata.description.as_deref(), Some("Testing"));
        assert_eq!(
            result.metadata.url.as_deref(),
            Some("https://example.com/doc.pdf")
        );
        assert!(result.content.markdown.contains("# My Doc"));
        assert!(result.content.markdown.contains("Hello from PDF."));
        assert_eq!(result.content.plain_text, "Hello from PDF.");
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
        assert!(result.metadata.word_count > 0);
    }

    #[test]
    fn test_static_pool_no_proxy() {
        let config = FetchConfig::default();
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 0);
    }

    #[test]
    fn test_rotating_pool_prebuilds_clients() {
        let config = FetchConfig {
            proxy_pool: vec![
                "http://proxy1:8080".into(),
                "http://proxy2:8080".into(),
                "http://proxy3:8080".into(),
            ],
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 3);
    }

    #[test]
    fn test_pick_for_host_deterministic() {
        let config = FetchConfig {
            browser: BrowserProfile::Random,
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();

        let clients = match &client.pool {
            ClientPool::Static { clients, .. } => clients,
            ClientPool::Rotating { clients } => clients,
        };

        let a1 = pick_for_host(clients, "example.com") as *const _;
        let a2 = pick_for_host(clients, "example.com") as *const _;
        let a3 = pick_for_host(clients, "example.com") as *const _;
        assert_eq!(a1, a2);
        assert_eq!(a2, a3);
    }

    #[test]
    fn test_pick_for_host_distributes() {
        let config = FetchConfig {
            proxy_pool: (0..10).map(|i| format!("http://proxy{i}:8080")).collect(),
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();

        let clients = match &client.pool {
            ClientPool::Static { clients, .. } | ClientPool::Rotating { clients } => clients,
        };

        let hosts = [
            "example.com",
            "google.com",
            "github.com",
            "rust-lang.org",
            "crates.io",
        ];

        let indices: Vec<usize> = hosts
            .iter()
            .map(|h| {
                let ptr = pick_for_host(clients, h) as *const _;
                clients.iter().position(|c| std::ptr::eq(c, ptr)).unwrap()
            })
            .collect();

        let unique: std::collections::HashSet<_> = indices.iter().collect();
        assert!(
            unique.len() >= 2,
            "expected host distribution across clients, got indices: {indices:?}"
        );
    }

    #[test]
    fn test_extract_host() {
        assert_eq!(extract_host("https://example.com/path"), "example.com");
        assert_eq!(
            extract_host("https://sub.example.com:8080/foo"),
            "sub.example.com"
        );
        assert_eq!(extract_host("not-a-url"), "");
    }

    #[test]
    fn test_default_config_has_empty_proxy_pool() {
        let config = FetchConfig::default();
        assert!(config.proxy_pool.is_empty());
        assert!(config.proxy.is_none());
    }
}
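Pulling the pieces of client.rs together, a minimal end-to-end sketch (illustrative only; the `#[tokio::main]` wrapper and the printed fields are assumptions, not part of the commit). It uses the crate-root re-exports defined in lib.rs below.

use std::sync::Arc;
use webclaw_fetch::{FetchClient, FetchConfig};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build one client up front; pools are pre-built at construction.
    let client = Arc::new(FetchClient::new(FetchConfig::default())?);

    // Single fetch with the built-in retry policy.
    let page = client.fetch("https://example.com").await?;
    println!("{} ({} bytes)", page.status, page.html.len());

    // Structured extraction; PDF, Reddit, and LinkedIn URLs are special-cased.
    let doc = client.fetch_and_extract("https://example.com").await?;
    println!("{:?}", doc.metadata.title);
    Ok(())
}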
crates/webclaw-fetch/src/crawler.rs (new file, 557 lines)
@@ -0,0 +1,557 @@
/// Recursive same-origin web crawler built on top of [`FetchClient`].
///
/// Starts from a seed URL, extracts content, discovers links, and follows
/// them breadth-first up to a configurable depth/page limit. Uses a semaphore
/// for bounded concurrency and per-request delays for politeness.
///
/// When `use_sitemap` is enabled, the crawler first discovers URLs from the
/// site's sitemaps and seeds the BFS frontier before crawling.
use std::collections::HashSet;
use std::sync::Arc;
use std::time::{Duration, Instant};

use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tracing::{debug, info, warn};
use url::Url;

use crate::client::{FetchClient, FetchConfig};
use crate::error::FetchError;
use crate::sitemap;

/// Controls crawl scope, depth, concurrency, and politeness.
#[derive(Debug, Clone)]
pub struct CrawlConfig {
    /// Fetch configuration (browser profile, proxy, timeout, etc.)
    pub fetch: FetchConfig,
    /// How deep to follow links. 1 = only immediate links from seed page.
    pub max_depth: usize,
    /// Hard cap on total pages fetched (including the seed).
    pub max_pages: usize,
    /// Max concurrent in-flight requests.
    pub concurrency: usize,
    /// Minimum delay before each request (politeness).
    pub delay: Duration,
    /// Only follow URLs whose path starts with this prefix (e.g. "/docs/").
    pub path_prefix: Option<String>,
    /// Seed BFS frontier from sitemap discovery before crawling.
    pub use_sitemap: bool,
    /// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
    /// E.g. `["/api/*", "/guides/*"]`, matched against the URL path.
    pub include_patterns: Vec<String>,
    /// Glob patterns for paths to exclude. Checked after include_patterns.
    /// E.g. `["/changelog/*", "/blog/*"]`; matching URLs are skipped.
    pub exclude_patterns: Vec<String>,
    /// Optional channel sender for streaming per-page results as they complete.
    /// When set, each `PageResult` is sent on this channel immediately after extraction.
    pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
}

impl Default for CrawlConfig {
    fn default() -> Self {
        Self {
            fetch: FetchConfig::default(),
            max_depth: 1,
            max_pages: 50,
            concurrency: 5,
            delay: Duration::from_millis(100),
            path_prefix: None,
            use_sitemap: false,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            progress_tx: None,
        }
    }
}
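
// Added note (not in the original file): a docs-only crawl is the typical
// override, e.g.
//
//     let config = CrawlConfig {
//         max_depth: 2,
//         max_pages: 100,
//         path_prefix: Some("/docs/".into()),
//         ..CrawlConfig::default()
//     };
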
/// Aggregated results from a crawl run.
#[derive(Debug, Serialize, Deserialize)]
pub struct CrawlResult {
    pub pages: Vec<PageResult>,
    pub total: usize,
    pub ok: usize,
    pub errors: usize,
    pub elapsed_secs: f64,
}

/// Outcome of extracting a single page during the crawl.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageResult {
    pub url: String,
    pub depth: usize,
    pub extraction: Option<webclaw_core::ExtractionResult>,
    pub error: Option<String>,
    #[serde(skip)]
    pub elapsed: Duration,
}

/// Recursive crawler that wraps a shared [`FetchClient`].
pub struct Crawler {
    client: Arc<FetchClient>,
    config: CrawlConfig,
    seed_origin: String,
}

impl Crawler {
    /// Build a new crawler from a seed URL and config.
    /// Constructs the underlying `FetchClient` from `config.fetch`.
    pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
        let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
        let seed_origin = origin_key(&seed);

        let client = FetchClient::new(config.fetch.clone())?;

        Ok(Self {
            client: Arc::new(client),
            config,
            seed_origin,
        })
    }

    /// Crawl starting from `start_url`, returning results for every page visited.
    ///
    /// Uses breadth-first traversal: all pages at depth N are fetched (concurrently,
    /// bounded by `config.concurrency`) before moving to depth N+1.
    ///
    /// When `config.use_sitemap` is true, sitemap URLs are discovered first and
    /// added to the initial frontier at depth 0 alongside the seed URL.
    pub async fn crawl(&self, start_url: &str) -> CrawlResult {
        let start = Instant::now();

        let seed = match Url::parse(start_url) {
            Ok(u) => u,
            Err(_) => {
                return CrawlResult {
                    pages: vec![PageResult {
                        url: start_url.to_string(),
                        depth: 0,
                        extraction: None,
                        error: Some(format!("invalid URL: {start_url}")),
                        elapsed: Duration::ZERO,
                    }],
                    total: 1,
                    ok: 0,
                    errors: 1,
                    elapsed_secs: 0.0,
                };
            }
        };

        let semaphore = Arc::new(Semaphore::new(self.config.concurrency));
        let mut visited: HashSet<String> = HashSet::new();
        let mut pages: Vec<PageResult> = Vec::new();

        // BFS frontier: vec of (normalized_url, depth) for the current level
        let mut frontier: Vec<(String, usize)> = vec![(normalize(&seed), 0)];

        // Seed frontier from sitemap if enabled
        if self.config.use_sitemap {
            let base_url = format!("{}://{}", seed.scheme(), seed.host_str().unwrap_or(""));
            match sitemap::discover(&self.client, &base_url).await {
                Ok(entries) => {
                    let before = frontier.len();
                    for entry in entries {
                        if self.qualify_link(&entry.url, &visited).is_some() {
                            let parsed = match Url::parse(&entry.url) {
                                Ok(u) => u,
                                Err(_) => continue,
                            };
                            let norm = normalize(&parsed);
                            frontier.push((norm, 0));
                        }
                    }
                    let added = frontier.len() - before;
                    info!(
                        sitemap_urls = added,
                        "seeded frontier from sitemap discovery"
                    );
                }
                Err(e) => {
                    warn!(error = %e, "sitemap discovery failed, continuing with seed URL only");
                }
            }
        }

        while !frontier.is_empty() && pages.len() < self.config.max_pages {
            // Dedup this level's frontier against the visited set and page cap
            let batch: Vec<(String, usize)> = frontier
                .drain(..)
                .filter(|(url, _)| visited.insert(url.clone()))
                .take(self.config.max_pages.saturating_sub(pages.len()))
                .collect();

            if batch.is_empty() {
                break;
            }

            // Spawn one task per URL, bounded by semaphore
            let mut handles = Vec::with_capacity(batch.len());

            for (url, depth) in &batch {
                let permit = Arc::clone(&semaphore);
                let client = Arc::clone(&self.client);
                let url = url.clone();
                let depth = *depth;
                let delay = self.config.delay;

                handles.push(tokio::spawn(async move {
                    // Acquire permit — blocks if concurrency limit reached
                    let _permit = permit.acquire().await.expect("semaphore closed");
                    tokio::time::sleep(delay).await;

                    let page_start = Instant::now();
                    let result = client.fetch_and_extract(&url).await;
                    let elapsed = page_start.elapsed();

                    match result {
                        Ok(extraction) => {
                            debug!(
                                url = %url, depth,
                                elapsed_ms = %elapsed.as_millis(),
                                "page extracted"
                            );
                            PageResult {
                                url,
                                depth,
                                extraction: Some(extraction),
                                error: None,
                                elapsed,
                            }
                        }
                        Err(e) => {
                            warn!(url = %url, depth, error = %e, "page failed");
                            PageResult {
                                url,
                                depth,
                                extraction: None,
                                error: Some(e.to_string()),
                                elapsed,
                            }
                        }
                    }
                }));
            }

            // Collect results and harvest links for the next depth level
            let mut next_frontier: Vec<(String, usize)> = Vec::new();

            for handle in handles {
                let page = match handle.await {
                    Ok(page) => page,
                    Err(e) => {
                        warn!(error = %e, "crawl task panicked");
                        continue;
                    }
                };
                let depth = page.depth;

                if depth < self.config.max_depth
                    && let Some(ref extraction) = page.extraction
                {
                    for link in &extraction.content.links {
                        if let Some(candidate) = self.qualify_link(&link.href, &visited) {
                            next_frontier.push((candidate, depth + 1));
                        }
                    }
                }

                // Stream progress if a channel is configured
                if let Some(tx) = &self.config.progress_tx {
                    let _ = tx.send(page.clone());
                }

                pages.push(page);

                if pages.len() >= self.config.max_pages {
                    break;
                }
            }

            frontier = next_frontier;
        }

        let total_elapsed = start.elapsed();
        let ok_count = pages.iter().filter(|p| p.extraction.is_some()).count();
        let err_count = pages.len() - ok_count;
        info!(
            total = pages.len(),
            ok = ok_count,
            errors = err_count,
            elapsed_ms = %total_elapsed.as_millis(),
            "crawl complete"
        );

        CrawlResult {
            total: pages.len(),
            ok: ok_count,
            errors: err_count,
            elapsed_secs: total_elapsed.as_secs_f64(),
            pages,
        }
    }

    /// Check if a discovered link should be added to the frontier.
    /// Returns `Some(normalized_url)` if it passes all filters, `None` otherwise.
    fn qualify_link(&self, href: &str, visited: &HashSet<String>) -> Option<String> {
        let parsed = Url::parse(href).ok()?;

        // Only http(s) schemes
        match parsed.scheme() {
            "http" | "https" => {}
            _ => return None,
        }

        // Same-origin check (scheme + host + port)
        if origin_key(&parsed) != self.seed_origin {
            return None;
        }

        // Path prefix filter
        if let Some(ref prefix) = self.config.path_prefix
            && !parsed.path().starts_with(prefix.as_str())
        {
            return None;
        }

        // Include patterns: if any are set, path must match at least one
        let path = parsed.path();
        if !self.config.include_patterns.is_empty()
            && !self
                .config
                .include_patterns
                .iter()
                .any(|pat| glob_match(pat, path))
        {
            return None;
        }

        // Exclude patterns: if path matches any, skip
        if self
            .config
            .exclude_patterns
            .iter()
            .any(|pat| glob_match(pat, path))
        {
            return None;
        }

        // Skip common non-page file extensions
        const SKIP_EXTENSIONS: &[&str] = &[
            ".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".css", ".js",
            ".zip", ".tar", ".gz", ".xml", ".rss", ".mp3", ".mp4", ".avi", ".mov", ".woff",
            ".woff2", ".ttf", ".eot",
        ];
        if SKIP_EXTENSIONS.iter().any(|ext| path.ends_with(ext)) {
            return None;
        }

        let normalized = normalize(&parsed);

        if visited.contains(&normalized) {
            return None;
        }

        Some(normalized)
    }
}

/// Canonical origin string for comparing same-origin: "scheme://host[:port]".
fn origin_key(url: &Url) -> String {
    let port_suffix = match url.port() {
        Some(p) => format!(":{p}"),
        None => String::new(),
    };
    let host = url.host_str().unwrap_or("");
    let host = host.strip_prefix("www.").unwrap_or(host);
    format!("{}://{}{}", url.scheme(), host, port_suffix)
}

/// Normalize a URL for dedup: strip fragment, remove trailing slash (except root "/"),
/// lowercase scheme + host. Preserves query params and path case.
fn normalize(url: &Url) -> String {
    let scheme = url.scheme();
    let host = url.host_str().unwrap_or("").to_ascii_lowercase();
    let port_suffix = match url.port() {
        Some(p) => format!(":{p}"),
        None => String::new(),
    };

    let mut path = url.path().to_string();
    if path.len() > 1 && path.ends_with('/') {
        path.pop();
    }

    let query = match url.query() {
        Some(q) => format!("?{q}"),
        None => String::new(),
    };

    // Fragment intentionally omitted
    format!("{scheme}://{host}{port_suffix}{path}{query}")
}

/// Simple glob matching for URL paths. Supports:
/// - `*` matches any characters within a single path segment (no `/`)
/// - `**` matches any characters including `/` (any number of segments)
/// - Literal characters match exactly
///
/// Examples:
/// - `/api/*` matches `/api/users` but not `/api/users/123`
/// - `/api/**` matches `/api/users`, `/api/users/123`, `/api/a/b/c`
/// - `/docs/*/intro` matches `/docs/v2/intro`
fn glob_match(pattern: &str, path: &str) -> bool {
    glob_match_inner(pattern.as_bytes(), path.as_bytes())
}

fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
    let mut pi = 0;
    let mut ti = 0;
    let mut star_pi = usize::MAX;
    let mut star_ti = 0;

    while ti < text.len() {
        if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
            // `**` — match everything including slashes
            // Skip all consecutive `*`
            while pi < pat.len() && pat[pi] == b'*' {
                pi += 1;
            }
            // Skip trailing `/` after `**`
            if pi < pat.len() && pat[pi] == b'/' {
                pi += 1;
            }
            if pi >= pat.len() {
                return true; // `**` at end matches everything
            }
            // Try matching the rest of pattern against every suffix of text
            for start in ti..=text.len() {
                if glob_match_inner(&pat[pi..], &text[start..]) {
                    return true;
                }
            }
            return false;
        } else if pi < pat.len() && pat[pi] == b'*' {
            // `*` — match any chars except `/`
            star_pi = pi;
            star_ti = ti;
            pi += 1;
        } else if pi < pat.len() && (pat[pi] == text[ti] || pat[pi] == b'?') {
            pi += 1;
            ti += 1;
        } else if star_pi != usize::MAX {
            // Backtrack: `*` absorbs one more char (but not `/`)
            if text[star_ti] == b'/' {
                return false;
            }
            star_ti += 1;
            ti = star_ti;
            pi = star_pi + 1;
        } else {
            return false;
        }
    }

    // Consume trailing `*` or `**` in pattern
    while pi < pat.len() && pat[pi] == b'*' {
        pi += 1;
    }

    pi >= pat.len()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize_strips_fragment() {
        let url = Url::parse("https://example.com/page#section").unwrap();
        assert_eq!(normalize(&url), "https://example.com/page");
    }

    #[test]
    fn normalize_strips_trailing_slash() {
        let url = Url::parse("https://example.com/docs/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/docs");
    }

    #[test]
    fn normalize_keeps_root_slash() {
        let url = Url::parse("https://example.com/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/");
    }

    #[test]
    fn normalize_preserves_query() {
        let url = Url::parse("https://example.com/search?q=rust&page=2").unwrap();
        assert_eq!(normalize(&url), "https://example.com/search?q=rust&page=2");
    }

    #[test]
    fn normalize_lowercases_host() {
        let url = Url::parse("https://Example.COM/Path").unwrap();
        assert_eq!(normalize(&url), "https://example.com/Path");
    }

    #[test]
    fn origin_includes_explicit_port() {
        let url = Url::parse("https://example.com:8443/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com:8443");
    }

    #[test]
    fn origin_omits_default_port() {
        let url = Url::parse("https://example.com/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com");
    }

    #[test]
    fn different_schemes_are_different_origins() {
        let http = Url::parse("http://example.com/").unwrap();
        let https = Url::parse("https://example.com/").unwrap();
        assert_ne!(origin_key(&http), origin_key(&https));
    }

    // -- glob_match tests --

    #[test]
    fn glob_star_matches_single_segment() {
        assert!(glob_match("/api/*", "/api/users"));
        assert!(glob_match("/api/*", "/api/products"));
        assert!(!glob_match("/api/*", "/api/users/123"));
    }

    #[test]
    fn glob_doublestar_matches_multiple_segments() {
        assert!(glob_match("/api/**", "/api/users"));
        assert!(glob_match("/api/**", "/api/users/123"));
        assert!(glob_match("/api/**", "/api/a/b/c/d"));
        assert!(!glob_match("/api/**", "/docs/intro"));
    }

    #[test]
    fn glob_exact_match() {
        assert!(glob_match("/about", "/about"));
        assert!(!glob_match("/about", "/about/team"));
    }

    #[test]
    fn glob_middle_wildcard() {
        assert!(glob_match("/docs/*/intro", "/docs/v2/intro"));
        assert!(!glob_match("/docs/*/intro", "/docs/v2/v3/intro"));
    }

    #[test]
    fn glob_no_pattern_matches_nothing() {
        // Empty pattern only matches empty string
        assert!(glob_match("", ""));
        assert!(!glob_match("", "/foo"));
    }

    #[test]
    fn glob_trailing_star() {
        assert!(glob_match("/blog*", "/blog"));
        assert!(glob_match("/blog*", "/blog-post"));
        assert!(!glob_match("/blog*", "/blog/post")); // * doesn't cross /
    }
}
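As with the fetch client, a short usage sketch for the crawler (illustrative only; the seed URL is an assumption):

use webclaw_fetch::{CrawlConfig, Crawler};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let crawler = Crawler::new("https://example.com/docs/", CrawlConfig::default())?;
    let result = crawler.crawl("https://example.com/docs/").await;
    // `pages` preserves BFS order; failed pages carry `error` instead of `extraction`.
    println!("{} ok, {} errors in {:.1}s", result.ok, result.errors, result.elapsed_secs);
    Ok(())
}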
crates/webclaw-fetch/src/error.rs (new file, 24 lines)
@@ -0,0 +1,24 @@
/// Fetch-layer errors. Wraps primp/network failures into a single type
/// that callers can match on without leaking transport details.
use thiserror::Error;

#[derive(Debug, Error)]
pub enum FetchError {
    #[error("request failed: {0}")]
    Request(#[from] primp::Error),

    #[error("invalid url: {0}")]
    InvalidUrl(String),

    #[error("response body decode failed: {0}")]
    BodyDecode(String),

    #[error("extraction failed: {0}")]
    Extraction(#[from] webclaw_core::ExtractError),

    #[error("PDF extraction failed: {0}")]
    Pdf(#[from] webclaw_pdf::PdfError),

    #[error("client build failed: {0}")]
    Build(String),
}
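Since `FetchError` is the crate's single error surface, callers can branch on transport versus extraction failures without knowing about primp. A small sketch (the category strings are assumptions, not part of the commit):

use webclaw_fetch::FetchError;

fn describe(err: &FetchError) -> &'static str {
    match err {
        FetchError::Request(_) => "network/transport failure",
        FetchError::InvalidUrl(_) => "caller passed a bad URL",
        FetchError::BodyDecode(_) => "body arrived but could not be decoded",
        FetchError::Extraction(_) | FetchError::Pdf(_) => "content-processing failure",
        FetchError::Build(_) => "client construction failure or retry exhaustion",
    }
}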
crates/webclaw-fetch/src/lib.rs (new file, 20 lines)
@@ -0,0 +1,20 @@
/// webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation.
/// Uses primp under the hood to make requests that look like real
/// browsers at the TLS, HTTP/2, and header levels.
/// Automatically detects PDF responses and delegates to webclaw-pdf.
pub mod browser;
pub mod client;
pub mod crawler;
pub mod error;
pub mod linkedin;
pub mod proxy;
pub mod reddit;
pub mod sitemap;

pub use browser::BrowserProfile;
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
pub use crawler::{CrawlConfig, CrawlResult, Crawler, PageResult};
pub use error::FetchError;
pub use proxy::{parse_proxy_file, parse_proxy_line};
pub use sitemap::SitemapEntry;
pub use webclaw_pdf::PdfMode;
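These re-exports mean downstream code only needs the crate root, for example:

use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig, PdfMode};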
crates/webclaw-fetch/src/linkedin.rs (new file, 279 lines)
@@ -0,0 +1,279 @@
/// LinkedIn post extraction from authenticated HTML.
|
||||
///
|
||||
/// LinkedIn's SPA stores all data in `<code>` tags as HTML-escaped JSON.
|
||||
/// The `included` array contains typed entities: Update (post), Comment,
|
||||
/// Profile, etc. We parse these to reconstruct post + comments as markdown.
|
||||
use serde_json::Value;
|
||||
use tracing::debug;
|
||||
use webclaw_core::{Content, ExtractionResult, Metadata};
|
||||
|
||||
/// Check if a URL is a LinkedIn post/activity.
|
||||
pub fn is_linkedin_post(url: &str) -> bool {
|
||||
let host = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("");
|
||||
(host == "www.linkedin.com" || host == "linkedin.com")
|
||||
&& (url.contains("/feed/update/") || url.contains("/posts/"))
|
||||
}
|
||||
|
||||
/// Extract `<code>` block contents from HTML using simple string scanning.
|
||||
/// LinkedIn wraps JSON data in `<code>` tags with HTML-escaped content.
|
||||
fn extract_code_blocks(html: &str) -> Vec<String> {
|
||||
let mut blocks = Vec::new();
|
||||
let mut search_from = 0;
|
||||
while let Some(start) = html[search_from..].find("<code") {
|
||||
let abs_start = search_from + start;
|
||||
// Find end of opening tag
|
||||
let Some(tag_end) = html[abs_start..].find('>') else {
|
||||
break;
|
||||
};
|
||||
let content_start = abs_start + tag_end + 1;
|
||||
let Some(end) = html[content_start..].find("</code>") else {
|
||||
break;
|
||||
};
|
||||
let content = &html[content_start..content_start + end];
|
||||
if content.len() > 1000 {
|
||||
blocks.push(html_unescape(content));
|
||||
}
|
||||
search_from = content_start + end + 7;
|
||||
}
|
||||
blocks
|
||||
}
|
||||
|
||||
/// Extract post + comments from LinkedIn's SSR HTML (requires auth cookies).
|
||||
pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult> {
|
||||
let code_blocks = extract_code_blocks(html);
|
||||
|
||||
// Find the largest <code> block with "included" — that's the main data payload
|
||||
let mut best_included: Option<Vec<Value>> = None;
|
||||
for raw in &code_blocks {
|
||||
if let Ok(obj) = serde_json::from_str::<Value>(raw)
|
||||
&& let Some(arr) = obj.get("included").and_then(|v| v.as_array())
|
||||
{
|
||||
let current_len = best_included.as_ref().map(|a| a.len()).unwrap_or(0);
|
||||
if arr.len() > current_len {
|
||||
                best_included = Some(arr.clone());
            }
        }
    }

    let included = best_included?;
    debug!(entities = included.len(), "linkedin: found included array");

    // Collect profiles (entityUrn → "First Last")
    let mut profiles = std::collections::HashMap::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if t.contains("Profile") {
            let urn = item.get("entityUrn").and_then(|v| v.as_str()).unwrap_or("");
            let first = item.get("firstName").and_then(|v| v.as_str()).unwrap_or("");
            let last = item.get("lastName").and_then(|v| v.as_str()).unwrap_or("");
            let headline = item.get("headline").and_then(|v| v.as_str()).unwrap_or("");
            if !first.is_empty() {
                profiles.insert(
                    urn.to_string(),
                    (
                        format!("{first} {last}").trim().to_string(),
                        headline.to_string(),
                    ),
                );
            }
        }
    }

    // Find the main post (Update type)
    let mut markdown = String::new();
    let mut post_author = String::new();
    let mut post_headline = String::new();

    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Update") {
            continue;
        }

        // Get author from actor profile
        if let Some(actor) = item.get("actor") {
            // actor can have a nested profile reference or inline data
            let author_urn = actor
                .get("*author")
                .or(actor.get("author"))
                .and_then(|v| v.as_str())
                .unwrap_or("");
            if let Some((name, headline)) = profiles.get(author_urn) {
                post_author = name.clone();
                post_headline = headline.clone();
            }
            // Or inline name
            if post_author.is_empty()
                && let Some(name) = actor.get("name").and_then(|v| v.as_object())
            {
                let text = name.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_author = text.to_string();
                }
            }
            if post_headline.is_empty()
                && let Some(desc) = actor.get("description").and_then(|v| v.as_object())
            {
                let text = desc.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_headline = text.to_string();
                }
            }
        }

        // Get post body from commentary
        if let Some(commentary) = item.get("commentary")
            && let Some(text) = commentary
                .get("text")
                .and_then(|v| v.as_object())
                .and_then(|o| o.get("text"))
                .and_then(|v| v.as_str())
        {
            if !post_author.is_empty() {
                markdown.push_str(&format!("# {post_author}\n\n"));
            }
            if !post_headline.is_empty() {
                markdown.push_str(&format!("*{post_headline}*\n\n"));
            }
            markdown.push_str("---\n\n");
            // Unescape literal \n from JSON
            markdown.push_str(&text.replace("\\n", "\n"));
            markdown.push_str("\n\n");
        }
    }

    if markdown.is_empty() {
        return None;
    }

    // Collect comments — LinkedIn stores comment text in `commentary.text`
    // and commenter name in `commenter.title.text`
    let mut comments: Vec<(String, String)> = Vec::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Comment") {
            continue;
        }

        // Get comment text from commentary.text
        let text = item
            .get("commentary")
            .and_then(|c| c.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        if text.is_empty() {
            continue;
        }

        // Get commenter name from commenter.title.text
        let name = item
            .get("commenter")
            .and_then(|c| c.get("title"))
            .and_then(|n| n.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("Someone");

        comments.push((name.to_string(), text.to_string()));
    }

    if !comments.is_empty() {
        markdown.push_str("---\n\n## Comments\n\n");
        for (name, text) in &comments {
            markdown.push_str(&format!("- **{name}**: {text}\n\n"));
        }
    }

    let word_count = markdown.split_whitespace().count();
    debug!(
        word_count,
        comments = comments.len(),
        "linkedin extraction done"
    );

    Some(ExtractionResult {
        metadata: Metadata {
            title: if post_author.is_empty() {
                None
            } else {
                Some(format!("{post_author}'s LinkedIn Post"))
            },
            description: None,
            author: if post_author.is_empty() {
                None
            } else {
                Some(post_author)
            },
            published_date: None,
            language: None,
            url: Some(url.to_string()),
            site_name: Some("LinkedIn".into()),
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
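
// Illustrative sketch (added commentary, not in the original commit): the rough
// shape of the LinkedIn `included` entities consumed above. The values are
// invented; only the keys the code actually reads ($type, entityUrn, firstName,
// lastName, headline, actor, commentary, commenter.title.text) come from the code.
//
//   { "$type": "...Profile", "entityUrn": "urn:li:fsd_profile:123",
//     "firstName": "Ada", "lastName": "Lovelace", "headline": "Engineer" }
//   { "$type": "...Update", "actor": { "*author": "urn:li:fsd_profile:123" },
//     "commentary": { "text": { "text": "Post body..." } } }
//   { "$type": "...Comment", "commentary": { "text": "Nice post!" },
//     "commenter": { "title": { "text": "Grace Hopper" } } }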

/// Unescape HTML entities (named + numeric decimal).
fn html_unescape(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '&' {
            out.push(c);
            continue;
        }
        // Collect until ';'
        let mut entity = String::new();
        for c2 in chars.by_ref() {
            if c2 == ';' {
                break;
            }
            entity.push(c2);
            if entity.len() > 10 {
                break;
            }
        }
        match entity.as_str() {
            "quot" => out.push('"'),
            "amp" => out.push('&'),
            "lt" => out.push('<'),
            "gt" => out.push('>'),
            "apos" => out.push('\''),
            s if s.starts_with('#') => {
                let num = &s[1..];
                if let Ok(n) = num.parse::<u32>()
                    && let Some(ch) = char::from_u32(n)
                {
                    out.push(ch);
                    continue;
                }
                out.push('&');
                out.push_str(&entity);
                out.push(';');
            }
            _ => {
                out.push('&');
                out.push_str(&entity);
                out.push(';');
            }
        }
    }
    out
}
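
// Usage sketch (illustrative, not part of the original file), following the
// behavior implemented above:
//
//   assert_eq!(html_unescape("Fish &amp; Chips"), "Fish & Chips");
//   assert_eq!(html_unescape("&#233;clair"), "éclair"); // numeric decimal
//   assert_eq!(html_unescape("&unknown; stays"), "&unknown; stays"); // unknown entities pass through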

122
crates/webclaw-fetch/src/proxy.rs
Normal file
@ -0,0 +1,122 @@
/// Proxy file parsing utilities.
///
/// Format: `host:port:user:pass` (one per line).
/// Lines starting with `#` and blank lines are skipped.
/// Also accepts `host:port` (no auth).
use crate::error::FetchError;

/// Parse a single proxy line into an HTTP proxy URL.
///
/// Accepts two formats:
/// - `host:port:user:pass` -> `http://user:pass@host:port`
/// - `host:port` -> `http://host:port`
pub fn parse_proxy_line(line: &str) -> Option<String> {
    let parts: Vec<&str> = line.trim().splitn(4, ':').collect();
    match parts.len() {
        4 => Some(format!(
            "http://{}:{}@{}:{}",
            parts[2], parts[3], parts[0], parts[1]
        )),
        2 => Some(format!("http://{}:{}", parts[0], parts[1])),
        _ => None,
    }
}
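
// Behavior note (illustrative sketch, not in the original file): `splitn(4, ':')`
// folds any extra colons into the fourth field, so passwords containing ':'
// survive intact:
//
//   assert_eq!(
//       parse_proxy_line("host:8080:user:pa:ss").as_deref(),
//       Some("http://user:pa:ss@host:8080"),
//   );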

/// Load proxies from a file, returning parsed HTTP proxy URLs.
///
/// Skips blank lines and `#` comments. Returns an error if the file
/// can't be read or contains no valid entries.
pub fn parse_proxy_file(path: &str) -> Result<Vec<String>, FetchError> {
    let content = std::fs::read_to_string(path)
        .map_err(|e| FetchError::Build(format!("failed to read proxy file: {e}")))?;

    let proxies: Vec<String> = content
        .lines()
        .filter_map(|line| {
            let trimmed = line.trim();
            if trimmed.is_empty() || trimmed.starts_with('#') {
                None
            } else {
                parse_proxy_line(trimmed)
            }
        })
        .collect();

    if proxies.is_empty() {
        return Err(FetchError::Build(
            "proxy file is empty or has no valid entries".into(),
        ));
    }

    Ok(proxies)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    #[test]
    fn parse_host_port_user_pass() {
        let result = parse_proxy_line("proxy.example.com:8080:alice:s3cret");
        assert_eq!(
            result.as_deref(),
            Some("http://alice:s3cret@proxy.example.com:8080")
        );
    }

    #[test]
    fn parse_host_port_only() {
        let result = parse_proxy_line("10.0.0.1:3128");
        assert_eq!(result.as_deref(), Some("http://10.0.0.1:3128"));
    }

    #[test]
    fn parse_trims_whitespace() {
        let result = parse_proxy_line(" host:9999:user:pass ");
        assert_eq!(result.as_deref(), Some("http://user:pass@host:9999"));
    }

    #[test]
    fn parse_invalid_returns_none() {
        assert!(parse_proxy_line("just-a-hostname").is_none());
        assert!(parse_proxy_line("a:b:c").is_none()); // 3 parts is invalid
        assert!(parse_proxy_line("").is_none());
    }

    #[test]
    fn parse_file_happy_path() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("proxies.txt");
        let mut f = std::fs::File::create(&path).unwrap();
        writeln!(f, "# residential pool").unwrap();
        writeln!(f, "host1:8080:user1:pass1").unwrap();
        writeln!(f).unwrap(); // blank line
        writeln!(f, "host2:3128").unwrap();
        writeln!(f, "# datacenter").unwrap();
        writeln!(f, "host3:9999:u:p").unwrap();
        drop(f);

        let proxies = parse_proxy_file(path.to_str().unwrap()).unwrap();
        assert_eq!(proxies.len(), 3);
        assert_eq!(proxies[0], "http://user1:pass1@host1:8080");
        assert_eq!(proxies[1], "http://host2:3128");
        assert_eq!(proxies[2], "http://u:p@host3:9999");
    }

    #[test]
    fn parse_file_empty_errors() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("empty.txt");
        std::fs::write(&path, "# only comments\n\n").unwrap();

        let err = parse_proxy_file(path.to_str().unwrap());
        assert!(err.is_err());
    }

    #[test]
    fn parse_file_missing_errors() {
        let err = parse_proxy_file("/nonexistent/proxies.txt");
        assert!(err.is_err());
    }
}

172
crates/webclaw-fetch/src/reddit.rs
Normal file
@ -0,0 +1,172 @@
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
use webclaw_core::{Content, ExtractionResult, Metadata};

/// Check if a URL points to a Reddit post/comment page.
pub fn is_reddit_url(url: &str) -> bool {
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    matches!(
        host,
        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
    )
}

/// Build the `.json` URL from a Reddit page URL.
pub fn json_url(url: &str) -> String {
    let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
    format!("{clean}.json")
}
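
// Usage sketch (illustrative, not part of the original file): query strings and
// trailing slashes are stripped before `.json` is appended.
//
//   assert!(is_reddit_url("https://old.reddit.com/r/rust/comments/abc/post/"));
//   assert_eq!(
//       json_url("https://www.reddit.com/r/rust/comments/abc/post/?share_id=x"),
//       "https://www.reddit.com/r/rust/comments/abc/post.json",
//   );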

/// Convert Reddit JSON API response into an ExtractionResult.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
    let listings: Vec<Listing> =
        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;

    let mut markdown = String::new();
    let mut title = None;
    let mut author = None;
    let mut subreddit = None;

    // First listing = the post itself
    if let Some(post_listing) = listings.first() {
        for child in &post_listing.data.children {
            if child.kind == "t3" {
                let d = &child.data;
                title = d.title.clone();
                author = d.author.clone();
                subreddit = d.subreddit_name_prefixed.clone();

                if let Some(ref t) = title {
                    markdown.push_str(&format!("# {t}\n\n"));
                }
                if let (Some(a), Some(sr)) = (&author, &subreddit) {
                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
                }
                if let Some(ref body) = d.selftext
                    && !body.is_empty()
                {
                    markdown.push_str(body);
                    markdown.push_str("\n\n");
                }
                if let Some(ref url_field) = d.url_overridden_by_dest
                    && !url_field.is_empty()
                {
                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
                }
                markdown.push_str("---\n\n");
            }
        }
    }

    // Second listing = comment tree
    if let Some(comment_listing) = listings.get(1) {
        markdown.push_str("## Comments\n\n");
        for child in &comment_listing.data.children {
            render_comment(child, 0, &mut markdown);
        }
    }

    let word_count = markdown.split_whitespace().count();
    debug!(word_count, "reddit json extracted");

    Ok(ExtractionResult {
        metadata: Metadata {
            title,
            description: None,
            author,
            published_date: None,
            language: Some("en".into()),
            url: Some(url.to_string()),
            site_name: subreddit,
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}

fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
    if thing.kind != "t1" {
        return;
    }
    let d = &thing.data;
    let indent = "  ".repeat(depth);
    let author = d.author.as_deref().unwrap_or("[deleted]");
    let body = d.body.as_deref().unwrap_or("[removed]");
    let score = d.score.unwrap_or(0);

    out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
    for line in body.lines() {
        out.push_str(&format!("{indent}  {line}\n"));
    }
    out.push('\n');

    // Recurse into replies
    if let Some(Replies::Listing(listing)) = &d.replies {
        for child in &listing.data.children {
            render_comment(child, depth + 1, out);
        }
    }
}
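
// Output shape sketch (illustrative, not in the original file): a `t1` comment
// with one nested reply renders as an indented markdown list. Usernames and
// scores here are invented examples:
//
//   - **u/alice** (42 pts)
//     Great writeup!
//
//     - **u/bob** (7 pts)
//       Agreed.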

// --- Reddit JSON types (minimal) ---

#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}

#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}

#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}

#[derive(Deserialize)]
struct ThingData {
    // Post fields (t3)
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    // Comment fields (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    replies: Option<Replies>,
}

/// Reddit replies can be either a nested Listing or an empty string.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    #[allow(dead_code)]
    Empty(String),
}
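
// Deserialization sketch (illustrative, not part of the original file): the
// untagged enum accepts both shapes Reddit emits for `replies`.
//
//   let nested: Replies = serde_json::from_str(r#"{"data":{"children":[]}}"#).unwrap();
//   let empty: Replies = serde_json::from_str(r#""""#).unwrap(); // replies == ""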

582
crates/webclaw-fetch/src/sitemap.rs
Normal file
@ -0,0 +1,582 @@
/// Sitemap parsing and URL discovery.
///
/// Discovers URLs from a site's sitemaps using a 3-step process:
/// 1. Parse robots.txt for `Sitemap:` directives
/// 2. Try /sitemap.xml as fallback
/// 3. Recursively resolve sitemap index files
///
/// All HTTP requests go through FetchClient to inherit TLS fingerprinting.
use std::collections::HashSet;

use quick_xml::Reader;
use quick_xml::events::Event;
use serde::Serialize;
use tracing::{debug, warn};

use crate::client::FetchClient;
use crate::error::FetchError;

/// Maximum depth when recursively fetching sitemap index files.
/// Prevents infinite loops from circular sitemap references.
const MAX_RECURSION_DEPTH: usize = 3;

/// A single URL discovered from a sitemap.
#[derive(Debug, Clone, Serialize)]
pub struct SitemapEntry {
    pub url: String,
    pub last_modified: Option<String>,
    pub priority: Option<f64>,
    pub change_freq: Option<String>,
}

/// Discover all URLs from a site's sitemaps.
///
/// Discovery order:
/// 1. Fetch /robots.txt, parse `Sitemap:` directives
/// 2. Fetch /sitemap.xml directly
/// 3. If sitemap index, recursively fetch child sitemaps
/// 4. Deduplicate by URL
///
/// Returns an empty vec (not an error) if no sitemaps are found.
pub async fn discover(
    client: &FetchClient,
    base_url: &str,
) -> Result<Vec<SitemapEntry>, FetchError> {
    let base = base_url.trim_end_matches('/');
    let mut sitemap_urls: Vec<String> = Vec::new();

    // Step 1: Try robots.txt
    let robots_url = format!("{base}/robots.txt");
    debug!(url = %robots_url, "fetching robots.txt");

    match client.fetch(&robots_url).await {
        Ok(result) if result.status == 200 => {
            let found = parse_robots_txt(&result.html);
            debug!(count = found.len(), "sitemap URLs from robots.txt");
            sitemap_urls.extend(found);
        }
        Ok(result) => {
            debug!(status = result.status, "robots.txt not found");
        }
        Err(e) => {
            debug!(error = %e, "failed to fetch robots.txt");
        }
    }

    // Step 2: Always try /sitemap.xml as well (may not be listed in robots.txt)
    let default_sitemap = format!("{base}/sitemap.xml");
    if !sitemap_urls.iter().any(|u| u == &default_sitemap) {
        sitemap_urls.push(default_sitemap);
    }

    // Step 3: Fetch and parse each sitemap, handling indexes recursively
    let mut seen_urls: HashSet<String> = HashSet::new();
    let mut entries: Vec<SitemapEntry> = Vec::new();

    fetch_sitemaps(client, &sitemap_urls, &mut entries, &mut seen_urls, 0).await;

    debug!(total = entries.len(), "sitemap discovery complete");
    Ok(entries)
}
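
// Usage sketch (illustrative; how `FetchClient` is constructed is assumed here,
// not taken from this file):
//
//   let client = FetchClient::default();
//   let entries = discover(&client, "https://example.com").await?;
//   for e in entries.iter().take(10) {
//       println!("{} (lastmod: {:?})", e.url, e.last_modified);
//   }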

/// Recursively fetch and parse sitemap URLs, handling both urlsets and indexes.
async fn fetch_sitemaps(
    client: &FetchClient,
    urls: &[String],
    entries: &mut Vec<SitemapEntry>,
    seen_urls: &mut HashSet<String>,
    depth: usize,
) {
    if depth > MAX_RECURSION_DEPTH {
        warn!(depth, "sitemap recursion limit reached, stopping");
        return;
    }

    for sitemap_url in urls {
        debug!(url = %sitemap_url, depth, "fetching sitemap");

        let xml = match client.fetch(sitemap_url).await {
            Ok(result) if result.status == 200 => result.html,
            Ok(result) => {
                debug!(url = %sitemap_url, status = result.status, "sitemap not found");
                continue;
            }
            Err(e) => {
                debug!(url = %sitemap_url, error = %e, "failed to fetch sitemap");
                continue;
            }
        };

        match detect_sitemap_type(&xml) {
            SitemapType::UrlSet => {
                let parsed = parse_urlset(&xml);
                for entry in parsed {
                    if seen_urls.insert(entry.url.clone()) {
                        entries.push(entry);
                    }
                }
            }
            SitemapType::Index => {
                let child_urls = parse_sitemap_index(&xml);
                debug!(count = child_urls.len(), "found child sitemaps in index");

                // Box the recursive call to avoid large future sizes
                Box::pin(fetch_sitemaps(
                    client,
                    &child_urls,
                    entries,
                    seen_urls,
                    depth + 1,
                ))
                .await;
            }
            SitemapType::Unknown => {
                debug!(url = %sitemap_url, "unrecognized sitemap format, skipping");
            }
        }
    }
}

// ---------------------------------------------------------------------------
// Pure parsing functions (no I/O, fully testable)
// ---------------------------------------------------------------------------

/// Extract `Sitemap:` directive URLs from robots.txt content.
pub fn parse_robots_txt(text: &str) -> Vec<String> {
    text.lines()
        .filter_map(|line| {
            let trimmed = line.trim();
            // Case-insensitive match for "Sitemap:" prefix
            if trimmed.len() > 8 && trimmed[..8].eq_ignore_ascii_case("sitemap:") {
                let url = trimmed[8..].trim();
                if !url.is_empty() {
                    return Some(url.to_string());
                }
            }
            None
        })
        .collect()
}

/// Parse a sitemap XML string. Handles both `<urlset>` and `<sitemapindex>`.
/// Returns entries from urlsets and recursion targets from indexes.
pub fn parse_sitemap_xml(xml: &str) -> Vec<SitemapEntry> {
    match detect_sitemap_type(xml) {
        SitemapType::UrlSet => parse_urlset(xml),
        SitemapType::Index => {
            // For the public parsing API, convert index <loc> entries into
            // SitemapEntry with just the URL. The async `discover` function
            // handles actual recursive fetching.
            parse_sitemap_index(xml)
                .into_iter()
                .map(|url| SitemapEntry {
                    url,
                    last_modified: None,
                    priority: None,
                    change_freq: None,
                })
                .collect()
        }
        SitemapType::Unknown => Vec::new(),
    }
}

#[derive(Debug, PartialEq)]
enum SitemapType {
    UrlSet,
    Index,
    Unknown,
}

/// Peek at the first element to determine if this is a urlset or sitemapindex.
fn detect_sitemap_type(xml: &str) -> SitemapType {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                let name = e.local_name();
                return match name.as_ref() {
                    b"urlset" => SitemapType::UrlSet,
                    b"sitemapindex" => SitemapType::Index,
                    _ => continue, // skip processing instructions, comments
                };
            }
            Ok(Event::Eof) => return SitemapType::Unknown,
            Err(_) => return SitemapType::Unknown,
            _ => continue,
        }
    }
}

/// Parse `<url>` entries from a `<urlset>` sitemap.
fn parse_urlset(xml: &str) -> Vec<SitemapEntry> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut entries = Vec::new();

    // State for current <url> element being parsed
    let mut in_url = false;
    let mut current_tag: Option<UrlTag> = None;
    let mut loc: Option<String> = None;
    let mut lastmod: Option<String> = None;
    let mut priority: Option<f64> = None;
    let mut changefreq: Option<String> = None;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"url" => {
                        in_url = true;
                        loc = None;
                        lastmod = None;
                        priority = None;
                        changefreq = None;
                    }
                    b"loc" if in_url => current_tag = Some(UrlTag::Loc),
                    b"lastmod" if in_url => current_tag = Some(UrlTag::LastMod),
                    b"priority" if in_url => current_tag = Some(UrlTag::Priority),
                    b"changefreq" if in_url => current_tag = Some(UrlTag::ChangeFreq),
                    _ => current_tag = None,
                }
            }
            Ok(Event::Text(ref e)) => {
                if let Some(ref tag) = current_tag
                    && let Ok(text) = e.unescape()
                {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        match tag {
                            UrlTag::Loc => loc = Some(text),
                            UrlTag::LastMod => lastmod = Some(text),
                            UrlTag::Priority => priority = text.parse().ok(),
                            UrlTag::ChangeFreq => changefreq = Some(text),
                        }
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                if name.as_ref() == b"url" && in_url {
                    if let Some(url) = loc.take() {
                        entries.push(SitemapEntry {
                            url,
                            last_modified: lastmod.take(),
                            priority: priority.take(),
                            change_freq: changefreq.take(),
                        });
                    }
                    in_url = false;
                }
                current_tag = None;
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    entries
}

#[derive(Debug)]
enum UrlTag {
    Loc,
    LastMod,
    Priority,
    ChangeFreq,
}

/// Parse `<sitemap>` entries from a `<sitemapindex>`, returning child sitemap URLs.
fn parse_sitemap_index(xml: &str) -> Vec<String> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut urls = Vec::new();

    let mut in_sitemap = false;
    let mut in_loc = false;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => in_sitemap = true,
                    b"loc" if in_sitemap => in_loc = true,
                    _ => {}
                }
            }
            Ok(Event::Text(ref e)) => {
                if in_loc && let Ok(text) = e.unescape() {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        urls.push(text);
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => {
                        in_sitemap = false;
                        in_loc = false;
                    }
                    b"loc" => in_loc = false,
                    _ => {}
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap index, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    urls
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_urlset() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/</loc>
    <lastmod>2026-01-15</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/about</loc>
    <lastmod>2026-01-10</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://example.com/blog/post-1</loc>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3);

        assert_eq!(entries[0].url, "https://example.com/");
        assert_eq!(entries[0].last_modified.as_deref(), Some("2026-01-15"));
        assert_eq!(entries[0].change_freq.as_deref(), Some("daily"));
        assert_eq!(entries[0].priority, Some(1.0));

        assert_eq!(entries[1].url, "https://example.com/about");
        assert_eq!(entries[1].priority, Some(0.8));

        assert_eq!(entries[2].url, "https://example.com/blog/post-1");
        assert_eq!(entries[2].last_modified, None);
        assert_eq!(entries[2].priority, None);
        assert_eq!(entries[2].change_freq, None);
    }

    #[test]
    fn test_parse_sitemap_index() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>https://example.com/sitemap-posts.xml</loc>
    <lastmod>2026-03-01</lastmod>
  </sitemap>
  <sitemap>
    <loc>https://example.com/sitemap-pages.xml</loc>
  </sitemap>
</sitemapindex>"#;

        let urls = parse_sitemap_index(xml);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://example.com/sitemap-posts.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-pages.xml");
    }

    #[test]
    fn test_parse_sitemap_xml_dispatches_urlset() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page</loc></url>
</urlset>"#;

        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/page");
    }

    #[test]
    fn test_parse_sitemap_xml_dispatches_index() {
        let xml = r#"<?xml version="1.0"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>
</sitemapindex>"#;

        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/sitemap-1.xml");
        // Index entries have no metadata when parsed through the public API
        assert_eq!(entries[0].priority, None);
    }

    #[test]
    fn test_parse_robots_txt() {
        let robots = "User-agent: *\n\
                      Disallow: /admin/\n\
                      \n\
                      Sitemap: https://example.com/sitemap.xml\n\
                      sitemap: https://example.com/sitemap-news.xml\n\
                      SITEMAP: https://example.com/sitemap-images.xml\n\
                      \n\
                      User-agent: Googlebot\n\
                      Allow: /\n";

        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 3);
        assert_eq!(urls[0], "https://example.com/sitemap.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-news.xml");
        assert_eq!(urls[2], "https://example.com/sitemap-images.xml");
    }

    #[test]
    fn test_parse_robots_txt_empty_value() {
        // "Sitemap:" with no URL should be skipped
        let robots = "Sitemap:\nSitemap: \nSitemap: https://example.com/s.xml\n";
        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0], "https://example.com/s.xml");
    }

    #[test]
    fn test_deduplicate() {
        // parse_sitemap_xml deduplicates via the discover() path, but
        // we can verify that parsing the same URL twice produces entries
        // that the HashSet in discover() would collapse.
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page</loc></url>
  <url><loc>https://example.com/page</loc></url>
  <url><loc>https://example.com/other</loc></url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3, "parser returns all entries");

        // Simulate the dedup that discover() does
        let mut seen = HashSet::new();
        let deduped: Vec<_> = entries
            .into_iter()
            .filter(|e| seen.insert(e.url.clone()))
            .collect();
        assert_eq!(deduped.len(), 2, "dedup collapses duplicates");
    }

    #[test]
    fn test_empty_sitemap() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>"#;

        let entries = parse_urlset(xml);
        assert!(entries.is_empty());
    }

    #[test]
    fn test_malformed_xml() {
        let xml = "this is not xml at all <><><";
        let entries = parse_sitemap_xml(xml);
        assert!(entries.is_empty(), "malformed XML returns empty vec");
    }

    #[test]
    fn test_malformed_xml_partial() {
        // Partial XML that starts valid but breaks mid-stream
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/good</loc></url>
  <url><loc>broken
"#;
        let entries = parse_sitemap_xml(xml);
        // Should return at least the successfully parsed entry
        assert!(!entries.is_empty());
        assert_eq!(entries[0].url, "https://example.com/good");
    }

    #[test]
    fn test_missing_loc() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <lastmod>2026-01-01</lastmod>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/valid</loc>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 1, "entry without <loc> is skipped");
        assert_eq!(entries[0].url, "https://example.com/valid");
    }

    #[test]
    fn test_priority_parsing() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/high</loc>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/mid</loc>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://example.com/low</loc>
    <priority>0.1</priority>
  </url>
  <url>
    <loc>https://example.com/invalid</loc>
    <priority>not-a-number</priority>
  </url>
</urlset>"#;

        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 4);

        assert_eq!(entries[0].priority, Some(1.0));
        assert_eq!(entries[1].priority, Some(0.5));
        assert_eq!(entries[2].priority, Some(0.1));
        assert_eq!(entries[3].priority, None, "invalid priority parses as None");
    }

    #[test]
    fn test_detect_sitemap_type() {
        let urlset = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        assert_eq!(detect_sitemap_type(urlset), SitemapType::UrlSet);

        let index = r#"<?xml version="1.0"?><sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></sitemapindex>"#;
        assert_eq!(detect_sitemap_type(index), SitemapType::Index);

        assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown);
        assert_eq!(detect_sitemap_type(""), SitemapType::Unknown);
    }
}