chore: rebrand webclaw to noxa

This commit is contained in:
Jacob Magar 2026-04-11 00:10:38 -04:00
parent a4c351d5ae
commit 8674b60b4e
86 changed files with 781 additions and 2121 deletions

View file

@ -0,0 +1,26 @@
[package]
name = "noxa-fetch"
description = "HTTP client with browser TLS fingerprint impersonation via wreq"
# Version/edition/license are inherited from the workspace root.
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
# Workspace-internal crates.
noxa-core = { workspace = true }
noxa-pdf = { path = "../noxa-pdf" }
# Shared workspace dependencies.
serde = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
# wreq (BoringSSL-based) provides browser-grade TLS + HTTP/2 fingerprinting;
# the feature list enables cookie storage and all common response encodings.
wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
http = "1"
bytes = "1"
url = "2"
rand = "0.8"
# quick-xml: sitemap parsing; calamine + zip: office-document extraction.
quick-xml = { version = "0.37", features = ["serde"] }
serde_json.workspace = true
calamine = "0.34"
zip = "2"
[dev-dependencies]
tempfile = "3"

View file

@ -0,0 +1,51 @@
//! Browser fingerprint selection and rotation.
//! Maps our BrowserProfile enum to noxa-http client builder methods.
/// Which browser identity to present at the TLS/HTTP layer.
///
/// Derives `Copy`/`PartialEq`/`Eq` for consistency with [`BrowserVariant`]
/// (all variants are unit variants, so the derives are free).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BrowserProfile {
    /// Impersonate Chrome (the default).
    #[default]
    Chrome,
    /// Impersonate Firefox.
    Firefox,
    /// Randomly pick from all available profiles on each request.
    Random,
}
/// A browser variant for building noxa-http clients.
///
/// Derives `PartialEq`/`Eq`/`Hash` so variants can be compared and used
/// as map/set keys (derive eagerly on small unit-variant enums).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BrowserVariant {
    Chrome,
    ChromeMacos,
    Firefox,
    Safari,
    Edge,
}
/// All Chrome variants we ship.
pub fn chrome_variants() -> Vec<BrowserVariant> {
    [BrowserVariant::Chrome, BrowserVariant::ChromeMacos].to_vec()
}
/// All Firefox variants we ship.
pub fn firefox_variants() -> Vec<BrowserVariant> {
    [BrowserVariant::Firefox].to_vec()
}
/// All variants for maximum diversity in Random mode.
pub fn all_variants() -> Vec<BrowserVariant> {
    [
        BrowserVariant::Chrome,
        BrowserVariant::ChromeMacos,
        BrowserVariant::Firefox,
        BrowserVariant::Safari,
        BrowserVariant::Edge,
    ]
    .to_vec()
}
/// The Chrome variant used when `BrowserProfile::Chrome` is selected.
pub fn latest_chrome() -> BrowserVariant {
    BrowserVariant::Chrome
}
/// The Firefox variant used when `BrowserProfile::Firefox` is selected.
pub fn latest_firefox() -> BrowserVariant {
    BrowserVariant::Firefox
}

View file

@ -0,0 +1,836 @@
//! HTTP client with browser TLS fingerprint impersonation.
//! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
//! Supports single and batch operations with proxy rotation.
//! Automatically detects PDF responses and extracts text via noxa-pdf.
//!
//! Two proxy modes:
//! - **Static**: single proxy (or none) baked into pre-built clients at construction.
//! - **Rotating**: pre-built pool of clients, each with a different proxy + fingerprint.
//!   Same-host URLs are routed to the same client for HTTP/2 connection reuse.
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::time::{Duration, Instant};
use rand::seq::SliceRandom;
use tokio::sync::Semaphore;
use tracing::{debug, instrument, warn};
use noxa_pdf::PdfMode;
use crate::browser::{self, BrowserProfile, BrowserVariant};
use crate::error::FetchError;
/// Configuration for building a [`FetchClient`].
#[derive(Debug, Clone)]
pub struct FetchConfig {
    /// Browser fingerprint profile to impersonate.
    pub browser: BrowserProfile,
    /// Single proxy URL. Used when `proxy_pool` is empty.
    pub proxy: Option<String>,
    /// Pool of proxy URLs to rotate through.
    /// When non-empty, each proxy gets a pre-built client with a
    /// random browser fingerprint. Same-host URLs reuse the same client
    /// for HTTP/2 connection multiplexing.
    pub proxy_pool: Vec<String>,
    /// Per-request timeout (passed to `tls::build_client`).
    pub timeout: Duration,
    /// Whether redirects are followed.
    pub follow_redirects: bool,
    /// Redirect cap when `follow_redirects` is true.
    pub max_redirects: u32,
    /// Header name/value pairs passed to client construction
    /// (presumably applied to every request — see `tls::build_client`).
    pub headers: HashMap<String, String>,
    /// How PDF responses are processed (forwarded to noxa-pdf).
    pub pdf_mode: PdfMode,
}
impl Default for FetchConfig {
    /// Defaults: Chrome fingerprint, no proxy, 12 s timeout, redirects on
    /// (max 10), an `Accept-Language` header, and default PDF handling.
    fn default() -> Self {
        let mut headers = HashMap::new();
        headers.insert("Accept-Language".to_owned(), "en-US,en;q=0.9".to_owned());
        Self {
            browser: BrowserProfile::Chrome,
            proxy: None,
            proxy_pool: Vec::new(),
            timeout: Duration::from_secs(12),
            follow_redirects: true,
            max_redirects: 10,
            headers,
            pdf_mode: PdfMode::default(),
        }
    }
}
/// Result of a successful fetch.
#[derive(Debug, Clone)]
pub struct FetchResult {
    /// Response body, decoded lossily as UTF-8.
    pub html: String,
    /// HTTP status code of the final response.
    pub status: u16,
    /// Final URL after any redirects.
    pub url: String,
    /// Headers of the final response.
    pub headers: http::HeaderMap,
    /// Wall-clock time for the whole fetch.
    pub elapsed: Duration,
}
/// Result for a single URL in a batch fetch operation.
#[derive(Debug)]
pub struct BatchResult {
    /// The URL this entry corresponds to (input order is preserved by the batch API).
    pub url: String,
    /// Per-URL fetch outcome; errors are kept per-entry rather than failing the batch.
    pub result: Result<FetchResult, FetchError>,
}
/// Result for a single URL in a batch fetch-and-extract operation.
#[derive(Debug)]
pub struct BatchExtractResult {
    /// The URL this entry corresponds to.
    pub url: String,
    /// Per-URL extraction outcome; errors are kept per-entry rather than failing the batch.
    pub result: Result<noxa_core::ExtractionResult, FetchError>,
}
/// Buffered response that owns its body. Provides the same sync API
/// that noxa-http::Response used to provide.
struct Response {
    // HTTP status code.
    status: u16,
    // Response URI as reported by wreq (stringified).
    url: String,
    headers: http::HeaderMap,
    // Fully-buffered response body.
    body: bytes::Bytes,
}
impl Response {
    /// Drain a wreq response into a fully-owned `Response`.
    ///
    /// Status, URI, and headers are captured before the body is read,
    /// since `bytes()` consumes the response.
    async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
        let (status, url, headers) = (
            resp.status().as_u16(),
            resp.uri().to_string(),
            resp.headers().clone(),
        );
        let body = resp
            .bytes()
            .await
            .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
        Ok(Self { status, url, headers, body })
    }
    /// HTTP status code.
    fn status(&self) -> u16 {
        self.status
    }
    /// Response URL.
    fn url(&self) -> &str {
        &self.url
    }
    /// Response headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
    /// Raw body bytes.
    fn body(&self) -> &[u8] {
        &self.body
    }
    /// True for 2xx status codes.
    fn is_success(&self) -> bool {
        self.status >= 200 && self.status < 300
    }
    /// Body as lossy UTF-8, borrowing when the body is valid UTF-8.
    fn text(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(&self.body)
    }
    /// Consume the response, returning the body as an owned lossy-UTF-8 string.
    fn into_text(self) -> String {
        String::from_utf8_lossy(&self.body).into_owned()
    }
}
/// Internal representation of the client pool strategy.
/// Chosen once in [`FetchClient::new`] based on whether a proxy pool is configured.
enum ClientPool {
    /// Pre-built clients with a fixed proxy (or no proxy).
    /// Fingerprint rotation still works via the pool when `random` is true.
    Static {
        // One client per browser variant.
        clients: Vec<wreq::Client>,
        // True when the profile is `BrowserProfile::Random`.
        random: bool,
    },
    /// Pre-built pool of clients, each with a different proxy + fingerprint.
    /// Requests pick a client deterministically by host for HTTP/2 connection reuse.
    Rotating { clients: Vec<wreq::Client> },
}
/// HTTP client with browser TLS + HTTP/2 fingerprinting via wreq.
///
/// Operates in two modes:
/// - **Static pool**: pre-built clients, optionally with fingerprint rotation.
///   Used when no `proxy_pool` is configured. Fast (no per-request construction).
/// - **Rotating pool**: pre-built clients, one per proxy in the pool.
///   Same-host URLs are routed to the same client for HTTP/2 multiplexing.
pub struct FetchClient {
    // Pool strategy chosen at construction time.
    pool: ClientPool,
    // PDF handling mode, forwarded to noxa-pdf when a PDF response is detected.
    pdf_mode: PdfMode,
}
impl FetchClient {
    /// Build a new client from config.
    ///
    /// With an empty `proxy_pool`, builds one client per browser variant
    /// (static pool). Otherwise builds one client per proxy URL, each with
    /// a randomly chosen fingerprint (rotating pool).
    pub fn new(config: FetchConfig) -> Result<Self, FetchError> {
        let variants = collect_variants(&config.browser);
        let pdf_mode = config.pdf_mode.clone();
        let pool = if config.proxy_pool.is_empty() {
            // Static mode: one client per fingerprint variant, all sharing
            // the same (optional) single proxy.
            let clients = variants
                .into_iter()
                .map(|v| {
                    crate::tls::build_client(
                        v,
                        config.timeout,
                        &config.headers,
                        config.proxy.as_deref(),
                    )
                })
                .collect::<Result<Vec<_>, _>>()?;
            let random = matches!(config.browser, BrowserProfile::Random);
            debug!(
                count = clients.len(),
                random, "fetch client ready (static pool)"
            );
            ClientPool::Static { clients, random }
        } else {
            // Rotating mode: one client per proxy, each with a fingerprint
            // drawn at random from the profile's variants.
            let mut rng = rand::thread_rng();
            let clients = config
                .proxy_pool
                .iter()
                .map(|proxy| {
                    let v = *variants
                        .choose(&mut rng)
                        .expect("collect_variants always returns at least one variant");
                    crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
                })
                .collect::<Result<Vec<_>, _>>()?;
            debug!(
                clients = clients.len(),
                "fetch client ready (pre-built rotating pool)"
            );
            ClientPool::Rotating { clients }
        };
        Ok(Self { pool, pdf_mode })
    }
    /// Fetch a URL and return the raw HTML + response metadata.
    ///
    /// Automatically retries on transient failures (network errors, 5xx, 429)
    /// with exponential backoff: 0s, 1s (2 attempts total). A retryable
    /// status on the final attempt is still returned as `Ok` so callers
    /// can inspect the response.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
        let delays = [Duration::ZERO, Duration::from_secs(1)];
        let mut last_err = None;
        for (attempt, delay) in delays.iter().enumerate() {
            if attempt > 0 {
                tokio::time::sleep(*delay).await;
            }
            match self.fetch_once(url).await {
                Ok(result) => {
                    if is_retryable_status(result.status) && attempt < delays.len() - 1 {
                        warn!(
                            url,
                            status = result.status,
                            attempt = attempt + 1,
                            "retryable status, will retry"
                        );
                        last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
                        continue;
                    }
                    if attempt > 0 {
                        debug!(url, attempt = attempt + 1, "retry succeeded");
                    }
                    return Ok(result);
                }
                Err(e) => {
                    // Non-transient errors (or the final attempt) propagate immediately.
                    if !is_retryable_error(&e) || attempt == delays.len() - 1 {
                        return Err(e);
                    }
                    warn!(
                        url,
                        error = %e,
                        attempt = attempt + 1,
                        "transient error, will retry"
                    );
                    last_err = Some(e);
                }
            }
        }
        Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
    }
    /// Single fetch attempt (no retries).
    async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
        let start = Instant::now();
        let client = self.pick_client(url);
        let resp = client.get(url).send().await?;
        let response = Response::from_wreq(resp).await?;
        response_to_result(response, start)
    }
    /// Fetch a URL then extract structured content.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch_and_extract(
        &self,
        url: &str,
    ) -> Result<noxa_core::ExtractionResult, FetchError> {
        self.fetch_and_extract_with_options(url, &noxa_core::ExtractionOptions::default())
            .await
    }
    /// Fetch a URL then extract structured content with custom extraction options.
    ///
    /// Special cases, in order:
    /// - Reddit URLs are served from Reddit's JSON API when possible.
    /// - A detected challenge page triggers one cookie-warmup retry via the homepage.
    /// - PDF and office-document responses are routed to dedicated extractors.
    /// - LinkedIn posts are parsed from embedded JSON before the standard path.
    #[instrument(skip(self, options), fields(url = %url))]
    pub async fn fetch_and_extract_with_options(
        &self,
        url: &str,
        options: &noxa_core::ExtractionOptions,
    ) -> Result<noxa_core::ExtractionResult, FetchError> {
        // Reddit fallback: use their JSON API to get post + full comment tree.
        if crate::reddit::is_reddit_url(url) {
            let json_url = crate::reddit::json_url(url);
            debug!("reddit detected, fetching {json_url}");
            let client = self.pick_client(url);
            let resp = client.get(&json_url).send().await?;
            let response = Response::from_wreq(resp).await?;
            if response.is_success() {
                let bytes = response.body();
                match crate::reddit::parse_reddit_json(bytes, url) {
                    Ok(result) => return Ok(result),
                    Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
                }
            }
        }
        let start = Instant::now();
        let client = self.pick_client(url);
        let resp = client.get(url).send().await?;
        let mut response = Response::from_wreq(resp).await?;
        // Cookie warmup: if we get a challenge page, visit the homepage first
        // to collect Akamai cookies (_abck, bm_sz, etc.), then retry.
        if is_challenge_response(&response)
            && let Some(homepage) = extract_homepage(url)
        {
            debug!("challenge detected, warming cookies via {homepage}");
            // Best-effort: warmup failures are ignored, the retry decides.
            let _ = client.get(&homepage).send().await;
            let resp = client.get(url).send().await?;
            response = Response::from_wreq(resp).await?;
            debug!("retried after cookie warmup: status={}", response.status());
        }
        let status = response.status();
        let final_url = response.url().to_string();
        let headers = response.headers().clone();
        let is_pdf = is_pdf_content_type(&headers);
        if is_pdf {
            debug!(status, "detected PDF response, using pdf extraction");
            let bytes = response.body();
            let elapsed = start.elapsed();
            debug!(
                status,
                bytes = bytes.len(),
                elapsed_ms = %elapsed.as_millis(),
                "PDF fetch complete"
            );
            let pdf_result = noxa_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
            Ok(pdf_to_extraction_result(&pdf_result, &final_url))
        } else if let Some(doc_type) =
            crate::document::is_document_content_type(&headers, &final_url)
        {
            debug!(status, doc_type = ?doc_type, "detected document response, extracting");
            let bytes = response.body();
            let elapsed = start.elapsed();
            debug!(
                status,
                bytes = bytes.len(),
                elapsed_ms = %elapsed.as_millis(),
                "document fetch complete"
            );
            let mut result = crate::document::extract_document(bytes, doc_type)?;
            result.metadata.url = Some(final_url);
            Ok(result)
        } else {
            let html = response.into_text();
            let elapsed = start.elapsed();
            debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
            // LinkedIn: extract from embedded <code> JSON blobs
            if crate::linkedin::is_linkedin_post(&final_url) {
                if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
                    debug!("linkedin extraction succeeded");
                    return Ok(result);
                }
                debug!("linkedin extraction failed, falling back to standard");
            }
            let extraction = noxa_core::extract_with_options(&html, Some(&final_url), options)?;
            Ok(extraction)
        }
    }
    /// Fetch multiple URLs concurrently with bounded parallelism.
    /// Results are returned in input order; panicked tasks are dropped.
    pub async fn fetch_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());
        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();
            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch(&url).await;
                (idx, BatchResult { url, result })
            }));
        }
        collect_ordered(handles, urls.len()).await
    }
    /// Fetch and extract multiple URLs concurrently with bounded parallelism.
    pub async fn fetch_and_extract_batch(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
    ) -> Vec<BatchExtractResult> {
        self.fetch_and_extract_batch_with_options(
            urls,
            concurrency,
            &noxa_core::ExtractionOptions::default(),
        )
        .await
    }
    /// Fetch and extract multiple URLs concurrently with custom extraction options.
    /// Results are returned in input order; panicked tasks are dropped.
    pub async fn fetch_and_extract_batch_with_options(
        self: &Arc<Self>,
        urls: &[&str],
        concurrency: usize,
        options: &noxa_core::ExtractionOptions,
    ) -> Vec<BatchExtractResult> {
        let semaphore = Arc::new(Semaphore::new(concurrency));
        let mut handles = Vec::with_capacity(urls.len());
        for (idx, url) in urls.iter().enumerate() {
            let permit = Arc::clone(&semaphore);
            let client = Arc::clone(self);
            let url = url.to_string();
            let opts = options.clone();
            handles.push(tokio::spawn(async move {
                let _permit = permit.acquire().await.expect("semaphore closed");
                let result = client.fetch_and_extract_with_options(&url, &opts).await;
                (idx, BatchExtractResult { url, result })
            }));
        }
        collect_ordered(handles, urls.len()).await
    }
    /// Returns the number of proxies in the rotation pool, or 0 if static mode.
    pub fn proxy_pool_size(&self) -> usize {
        match &self.pool {
            ClientPool::Static { .. } => 0,
            ClientPool::Rotating { clients } => clients.len(),
        }
    }
    /// Pick a client from the pool for a given URL.
    ///
    /// Both the random static pool and the rotating pool route by host so
    /// that repeated requests to one host share a client — and therefore its
    /// HTTP/2 connection — as documented on [`ClientPool::Rotating`].
    fn pick_client(&self, url: &str) -> &wreq::Client {
        match &self.pool {
            ClientPool::Static { clients, random } => {
                if *random {
                    let host = extract_host(url);
                    pick_for_host(clients, &host)
                } else {
                    &clients[0]
                }
            }
            // Fix: previously picked randomly, contradicting the documented
            // deterministic same-host routing contract.
            ClientPool::Rotating { clients } => {
                let host = extract_host(url);
                pick_for_host(clients, &host)
            }
        }
    }
}
/// Map a browser profile to the set of fingerprint variants it may use.
fn collect_variants(profile: &BrowserProfile) -> Vec<BrowserVariant> {
    match profile {
        BrowserProfile::Chrome => vec![browser::latest_chrome()],
        BrowserProfile::Firefox => vec![browser::latest_firefox()],
        BrowserProfile::Random => browser::all_variants(),
    }
}
/// Convert a buffered Response into a FetchResult, logging timing.
///
/// Metadata is read before `into_text` consumes the response.
fn response_to_result(response: Response, start: Instant) -> Result<FetchResult, FetchError> {
    let status = response.status();
    let url = response.url().to_string();
    let headers = response.headers().clone();
    let html = response.into_text();
    let elapsed = start.elapsed();
    debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
    Ok(FetchResult { html, status, url, headers, elapsed })
}
/// Host component of `url`, or `""` when the URL fails to parse or has no host.
fn extract_host(url: &str) -> String {
    match url::Url::parse(url) {
        Ok(parsed) => parsed.host_str().unwrap_or("").to_string(),
        Err(_) => String::new(),
    }
}
/// Pick a client deterministically based on a host string.
/// Same host always gets the same client, enabling HTTP/2 connection reuse.
fn pick_for_host<'a>(clients: &'a [wreq::Client], host: &str) -> &'a wreq::Client {
    use std::collections::hash_map::DefaultHasher;
    let mut hasher = DefaultHasher::new();
    host.hash(&mut hasher);
    // Hash-mod indexing: stable for a given host, spread across the pool.
    &clients[hasher.finish() as usize % clients.len()]
}
/// Uniformly random pool member, for per-request fingerprint rotation.
fn pick_random(clients: &[wreq::Client]) -> &wreq::Client {
    use rand::Rng;
    let mut rng = rand::thread_rng();
    &clients[rng.gen_range(0..clients.len())]
}
/// Status codes worth retrying: server errors + rate limiting.
/// 429 (rate limit), 502-504 (gateway errors), 520-524 (Cloudflare-range
/// origin errors). Plain 500 is deliberately not retried.
fn is_retryable_status(status: u16) -> bool {
    matches!(status, 429 | 502..=504 | 520..=524)
}
/// Errors worth retrying: network/connection failures (not client errors).
/// `Request` and `BodyDecode` cover transport-level and body-buffering
/// failures; all other variants are treated as permanent and fail fast.
fn is_retryable_error(err: &FetchError) -> bool {
    matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_))
}
/// True when the Content-Type header's MIME type — ignoring parameters
/// such as `; charset=...` — is `application/pdf`, case-insensitively.
fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
    let Some(raw) = headers.get("content-type").and_then(|ct| ct.to_str().ok()) else {
        return false;
    };
    let mime = raw.split(';').next().unwrap_or("").trim();
    mime.eq_ignore_ascii_case("application/pdf")
}
/// Detect if a response looks like a bot protection challenge page.
///
/// Challenge pages are small: empty bodies and anything over ~15 KB are
/// assumed to be real content. Matches the generic challenge title, or the
/// Akamai sensor marker on very small (< 5 KB) bodies.
fn is_challenge_response(response: &Response) -> bool {
    let len = response.body().len();
    if len == 0 || len > 15_000 {
        return false;
    }
    let lower = response.text().to_lowercase();
    lower.contains("<title>challenge page</title>")
        || (len < 5_000 && lower.contains("bazadebezolkohpepadr"))
}
/// Extract the homepage URL (scheme + authority) from a full URL.
///
/// Preserves an explicit port so the cookie-warmup request hits the same
/// origin as the target URL (previously `host:8080` URLs warmed the wrong
/// origin). Returns `None` when the URL cannot be parsed or has no host
/// (previously a host-less URL produced the nonsensical `"scheme:///"`).
fn extract_homepage(url: &str) -> Option<String> {
    let parsed = url::Url::parse(url).ok()?;
    let host = parsed.host_str()?;
    Some(match parsed.port() {
        Some(port) => format!("{}://{}:{}/", parsed.scheme(), host, port),
        None => format!("{}://{}/", parsed.scheme(), host),
    })
}
/// Bridge a noxa-pdf `PdfResult` into the shared noxa-core extraction shape.
/// PDF subject maps to `description`; fields with no PDF equivalent are `None`.
fn pdf_to_extraction_result(
    pdf: &noxa_pdf::PdfResult,
    url: &str,
) -> noxa_core::ExtractionResult {
    let markdown = noxa_pdf::to_markdown(pdf);
    // Word count is computed from the rendered markdown, before it moves
    // into the Content struct below.
    let metadata = noxa_core::Metadata {
        title: pdf.metadata.title.clone(),
        description: pdf.metadata.subject.clone(),
        author: pdf.metadata.author.clone(),
        published_date: None,
        language: None,
        url: Some(url.to_string()),
        site_name: None,
        image: None,
        favicon: None,
        word_count: markdown.split_whitespace().count(),
    };
    noxa_core::ExtractionResult {
        metadata,
        content: noxa_core::Content {
            plain_text: pdf.text.clone(),
            markdown,
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: Vec::new(),
    }
}
/// Collect spawned tasks and reorder results to match input order.
async fn collect_ordered<T>(
handles: Vec<tokio::task::JoinHandle<(usize, T)>>,
len: usize,
) -> Vec<T> {
let mut slots: Vec<Option<T>> = (0..len).map(|_| None).collect();
for handle in handles {
match handle.await {
Ok((idx, result)) => {
slots[idx] = Some(result);
}
Err(e) => {
warn!(error = %e, "batch task panicked");
}
}
}
slots.into_iter().flatten().collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    // BatchResult is a plain data carrier; verify both Ok and Err shapes.
    #[test]
    fn test_batch_result_struct() {
        let ok = BatchResult {
            url: "https://example.com".to_string(),
            result: Ok(FetchResult {
                html: "<html></html>".to_string(),
                status: 200,
                url: "https://example.com".to_string(),
                headers: http::HeaderMap::new(),
                elapsed: Duration::from_millis(42),
            }),
        };
        assert_eq!(ok.url, "https://example.com");
        assert!(ok.result.is_ok());
        assert_eq!(ok.result.unwrap().status, 200);
        let err = BatchResult {
            url: "https://bad.example".to_string(),
            result: Err(FetchError::InvalidUrl("bad url".into())),
        };
        assert!(err.result.is_err());
    }
    // Same shape check for the extract variant.
    #[test]
    fn test_batch_extract_result_struct() {
        let err = BatchExtractResult {
            url: "https://example.com".to_string(),
            result: Err(FetchError::BodyDecode("timeout".into())),
        };
        assert_eq!(err.url, "https://example.com");
        assert!(err.result.is_err());
    }
    // collect_ordered must reorder out-of-order task completions by index.
    #[tokio::test]
    async fn test_batch_preserves_order() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (2, "c".to_string()) }),
            tokio::spawn(async { (0, "a".to_string()) }),
            tokio::spawn(async { (1, "b".to_string()) }),
        ];
        let results = collect_ordered(handles, 3).await;
        assert_eq!(results, vec!["a", "b", "c"]);
    }
    // A missing index (e.g. a panicked task) leaves a gap that is dropped,
    // while surviving results keep their relative order.
    #[tokio::test]
    async fn test_collect_ordered_handles_gaps() {
        let handles: Vec<tokio::task::JoinHandle<(usize, String)>> = vec![
            tokio::spawn(async { (0, "first".to_string()) }),
            tokio::spawn(async { (2, "third".to_string()) }),
        ];
        let results = collect_ordered(handles, 3).await;
        assert_eq!(results.len(), 2);
        assert_eq!(results[0], "first");
        assert_eq!(results[1], "third");
    }
    // PDF detection must ignore MIME parameters and be case-insensitive.
    #[test]
    fn test_is_pdf_content_type() {
        let mut headers = http::HeaderMap::new();
        headers.insert("content-type", "application/pdf".parse().unwrap());
        assert!(is_pdf_content_type(&headers));
        headers.insert(
            "content-type",
            "application/pdf; charset=utf-8".parse().unwrap(),
        );
        assert!(is_pdf_content_type(&headers));
        headers.insert("content-type", "Application/PDF".parse().unwrap());
        assert!(is_pdf_content_type(&headers));
        headers.insert("content-type", "text/html".parse().unwrap());
        assert!(!is_pdf_content_type(&headers));
        let empty = http::HeaderMap::new();
        assert!(!is_pdf_content_type(&empty));
    }
    // PDF metadata should map onto the core extraction shape
    // (subject -> description, markdown gets a title heading).
    #[test]
    fn test_pdf_to_extraction_result() {
        let pdf = noxa_pdf::PdfResult {
            text: "Hello from PDF.".into(),
            page_count: 2,
            metadata: noxa_pdf::PdfMetadata {
                title: Some("My Doc".into()),
                author: Some("Author".into()),
                subject: Some("Testing".into()),
                creator: None,
            },
        };
        let result = pdf_to_extraction_result(&pdf, "https://example.com/doc.pdf");
        assert_eq!(result.metadata.title.as_deref(), Some("My Doc"));
        assert_eq!(result.metadata.author.as_deref(), Some("Author"));
        assert_eq!(result.metadata.description.as_deref(), Some("Testing"));
        assert_eq!(
            result.metadata.url.as_deref(),
            Some("https://example.com/doc.pdf")
        );
        assert!(result.content.markdown.contains("# My Doc"));
        assert!(result.content.markdown.contains("Hello from PDF."));
        assert_eq!(result.content.plain_text, "Hello from PDF.");
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
        assert!(result.metadata.word_count > 0);
    }
    // Default config (no proxy_pool) must select the static pool.
    #[test]
    fn test_static_pool_no_proxy() {
        let config = FetchConfig::default();
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 0);
    }
    // A non-empty proxy_pool must pre-build exactly one client per proxy.
    #[test]
    fn test_rotating_pool_prebuilds_clients() {
        let config = FetchConfig {
            proxy_pool: vec![
                "http://proxy1:8080".into(),
                "http://proxy2:8080".into(),
                "http://proxy3:8080".into(),
            ],
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        assert_eq!(client.proxy_pool_size(), 3);
    }
    // Host-based picking must be stable: same host, same client pointer.
    #[test]
    fn test_pick_for_host_deterministic() {
        let config = FetchConfig {
            browser: BrowserProfile::Random,
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        let clients = match &client.pool {
            ClientPool::Static { clients, .. } => clients,
            ClientPool::Rotating { clients } => clients,
        };
        let a1 = pick_for_host(clients, "example.com") as *const _;
        let a2 = pick_for_host(clients, "example.com") as *const _;
        let a3 = pick_for_host(clients, "example.com") as *const _;
        assert_eq!(a1, a2);
        assert_eq!(a2, a3);
    }
    // With 10 clients and 5 distinct hosts, at least two different pool
    // slots should be hit (guards against a constant hash).
    #[test]
    fn test_pick_for_host_distributes() {
        let config = FetchConfig {
            proxy_pool: (0..10).map(|i| format!("http://proxy{i}:8080")).collect(),
            ..Default::default()
        };
        let client = FetchClient::new(config).unwrap();
        let clients = match &client.pool {
            ClientPool::Static { clients, .. } | ClientPool::Rotating { clients } => clients,
        };
        let hosts = [
            "example.com",
            "google.com",
            "github.com",
            "rust-lang.org",
            "crates.io",
        ];
        let indices: Vec<usize> = hosts
            .iter()
            .map(|h| {
                let ptr = pick_for_host(clients, h) as *const _;
                clients.iter().position(|c| std::ptr::eq(c, ptr)).unwrap()
            })
            .collect();
        let unique: std::collections::HashSet<_> = indices.iter().collect();
        assert!(
            unique.len() >= 2,
            "expected host distribution across clients, got indices: {indices:?}"
        );
    }
    // Host extraction: ports are stripped, unparseable input yields "".
    #[test]
    fn test_extract_host() {
        assert_eq!(extract_host("https://example.com/path"), "example.com");
        assert_eq!(
            extract_host("https://sub.example.com:8080/foo"),
            "sub.example.com"
        );
        assert_eq!(extract_host("not-a-url"), "");
    }
    // Defaults must not configure any proxying.
    #[test]
    fn test_default_config_has_empty_proxy_pool() {
        let config = FetchConfig::default();
        assert!(config.proxy_pool.is_empty());
        assert!(config.proxy.is_none());
    }
}

View file

@ -0,0 +1,648 @@
//! Recursive same-origin web crawler built on top of [`FetchClient`].
//!
//! Starts from a seed URL, extracts content, discovers links, and follows
//! them breadth-first up to a configurable depth/page limit. Uses a semaphore
//! for bounded concurrency and per-request delays for politeness.
//!
//! When `use_sitemap` is enabled, the crawler first discovers URLs from the
//! site's sitemaps and seeds the BFS frontier before crawling.
use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tracing::{debug, info, warn};
use url::Url;
use crate::client::{FetchClient, FetchConfig};
use crate::error::FetchError;
use crate::sitemap;
/// Controls crawl scope, depth, concurrency, and politeness.
#[derive(Debug, Clone)]
pub struct CrawlConfig {
    /// Fetch configuration (browser profile, proxy, timeout, etc.)
    pub fetch: FetchConfig,
    /// How deep to follow links. 1 = only immediate links from seed page.
    pub max_depth: usize,
    /// Hard cap on total pages fetched (including the seed).
    pub max_pages: usize,
    /// Max concurrent in-flight requests.
    pub concurrency: usize,
    /// Minimum delay before each request (politeness).
    /// Applied per task, just before its fetch.
    pub delay: Duration,
    /// Only follow URLs whose path starts with this prefix (e.g. "/docs/").
    pub path_prefix: Option<String>,
    /// Seed BFS frontier from sitemap discovery before crawling.
    pub use_sitemap: bool,
    /// Glob patterns for paths to include. If non-empty, only matching URLs are crawled.
    /// E.g. `["/api/*", "/guides/*"]` — matched against the URL path.
    pub include_patterns: Vec<String>,
    /// Glob patterns for paths to exclude. Checked after include_patterns.
    /// E.g. `["/changelog/*", "/blog/*"]` — matching URLs are skipped.
    pub exclude_patterns: Vec<String>,
    /// Optional channel sender for streaming per-page results as they complete.
    /// When set, each `PageResult` is sent on this channel immediately after extraction.
    /// Send failures (no receivers) are ignored.
    pub progress_tx: Option<tokio::sync::broadcast::Sender<PageResult>>,
    /// When set to `true`, the crawler breaks out of the main loop early.
    /// Callers (e.g. a Ctrl+C handler) can flip this to request graceful cancellation.
    pub cancel_flag: Option<Arc<AtomicBool>>,
}
impl Default for CrawlConfig {
    /// Conservative defaults: depth 1, 50 pages max, 5 concurrent requests,
    /// 100 ms politeness delay, no path filtering, no sitemap seeding,
    /// no progress streaming, no cancellation hook.
    fn default() -> Self {
        Self {
            fetch: FetchConfig::default(),
            max_depth: 1,
            max_pages: 50,
            concurrency: 5,
            delay: Duration::from_millis(100),
            path_prefix: None,
            use_sitemap: false,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            progress_tx: None,
            cancel_flag: None,
        }
    }
}
/// Aggregated results from a crawl run.
#[derive(Debug, Serialize, Deserialize)]
pub struct CrawlResult {
    /// Per-page outcomes, collected in BFS batch order.
    pub pages: Vec<PageResult>,
    /// Total pages attempted (equals `pages.len()`).
    pub total: usize,
    /// Pages whose extraction succeeded.
    pub ok: usize,
    /// Pages that failed (`total - ok`).
    pub errors: usize,
    /// Wall-clock duration of the whole crawl, in seconds.
    pub elapsed_secs: f64,
    /// URLs visited during this crawl (for resume state).
    #[serde(skip)]
    pub visited: HashSet<String>,
    /// Remaining frontier when crawl was cancelled (for resume state).
    #[serde(skip)]
    pub remaining_frontier: Vec<(String, usize)>,
}
/// Outcome of extracting a single page during the crawl.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageResult {
    /// Normalized URL that was fetched.
    pub url: String,
    /// BFS depth at which this page was discovered (seed and sitemap URLs = 0).
    pub depth: usize,
    /// Extraction output on success; `None` when `error` is set.
    pub extraction: Option<noxa_core::ExtractionResult>,
    /// Failure message on error; `None` on success.
    pub error: Option<String>,
    /// Wall-clock fetch+extract time. Not serialized (defaults to zero on load).
    #[serde(skip)]
    pub elapsed: Duration,
}
/// Serializable crawl state for resume after Ctrl+C cancellation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlState {
    /// URL the crawl originally started from.
    pub seed_url: String,
    /// URLs already fetched (restored into the visited set on resume).
    pub visited: Vec<String>,
    /// Pending `(url, depth)` pairs to continue from.
    pub frontier: Vec<(String, usize)>,
    /// Pages completed before the state was saved.
    pub completed_pages: usize,
    /// Original page cap, kept so a resumed crawl honors the same limit.
    pub max_pages: usize,
    /// Original depth cap.
    pub max_depth: usize,
}
/// Recursive crawler that wraps a shared [`FetchClient`].
pub struct Crawler {
    // Shared fetch client; Arc so per-page tasks can hold clones.
    client: Arc<FetchClient>,
    // Scope/politeness settings for this crawl.
    config: CrawlConfig,
    // Origin key of the seed URL (via `origin_key`); presumably used to keep
    // the crawl same-origin per the module docs — confirm in `qualify_link`.
    seed_origin: String,
}
impl Crawler {
/// Build a new crawler from a seed URL and config.
/// Constructs the underlying `FetchClient` from `config.fetch`.
pub fn new(seed_url: &str, config: CrawlConfig) -> Result<Self, FetchError> {
let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?;
let seed_origin = origin_key(&seed);
let client = FetchClient::new(config.fetch.clone())?;
Ok(Self {
client: Arc::new(client),
config,
seed_origin,
})
}
/// Save current crawl state to a JSON file for later resume.
pub fn save_state(
path: &Path,
seed_url: &str,
visited: &HashSet<String>,
frontier: &[(String, usize)],
completed_pages: usize,
max_pages: usize,
max_depth: usize,
) -> Result<(), String> {
let state = CrawlState {
seed_url: seed_url.to_string(),
visited: visited.iter().cloned().collect(),
frontier: frontier.to_vec(),
completed_pages,
max_pages,
max_depth,
};
let json =
serde_json::to_string_pretty(&state).map_err(|e| format!("serialize state: {e}"))?;
std::fs::write(path, json).map_err(|e| format!("write state to {}: {e}", path.display()))
}
/// Load crawl state from a JSON file. Returns `None` if file doesn't exist.
pub fn load_state(path: &Path) -> Option<CrawlState> {
let content = std::fs::read_to_string(path).ok()?;
serde_json::from_str(&content).ok()
}
/// Returns true if the cancel flag has been set.
fn is_cancelled(&self) -> bool {
self.config
.cancel_flag
.as_ref()
.is_some_and(|f| f.load(Ordering::Relaxed))
}
/// Crawl starting from `start_url`, returning results for every page visited.
///
/// Uses breadth-first traversal: all pages at depth N are fetched (concurrently,
/// bounded by `config.concurrency`) before moving to depth N+1.
///
/// When `config.use_sitemap` is true, sitemap URLs are discovered first and
/// added to the initial frontier at depth 0 alongside the seed URL.
///
/// If `resume_state` is provided, the crawl resumes from the saved state
/// (pre-populated visited set and frontier) instead of starting fresh.
pub async fn crawl(&self, start_url: &str, resume_state: Option<CrawlState>) -> CrawlResult {
let start = Instant::now();
let seed = match Url::parse(start_url) {
Ok(u) => u,
Err(_) => {
return CrawlResult {
pages: vec![PageResult {
url: start_url.to_string(),
depth: 0,
extraction: None,
error: Some(format!("invalid URL: {start_url}")),
elapsed: Duration::ZERO,
}],
total: 1,
ok: 0,
errors: 1,
elapsed_secs: 0.0,
visited: HashSet::new(),
remaining_frontier: Vec::new(),
};
}
};
let semaphore = Arc::new(Semaphore::new(self.config.concurrency));
let mut visited: HashSet<String>;
let mut pages: Vec<PageResult> = Vec::new();
let mut frontier: Vec<(String, usize)>;
// Resume from saved state or start fresh
if let Some(state) = resume_state {
visited = state.visited.into_iter().collect();
frontier = state.frontier;
info!(
visited = visited.len(),
frontier = frontier.len(),
"resuming crawl from saved state"
);
} else {
visited = HashSet::new();
frontier = vec![(normalize(&seed), 0)];
// Seed frontier from sitemap if enabled
if self.config.use_sitemap {
let base_url = format!("{}://{}", seed.scheme(), seed.host_str().unwrap_or(""));
match sitemap::discover(&self.client, &base_url).await {
Ok(entries) => {
let before = frontier.len();
for entry in entries {
if self.qualify_link(&entry.url, &visited).is_some() {
let parsed = match Url::parse(&entry.url) {
Ok(u) => u,
Err(_) => continue,
};
let norm = normalize(&parsed);
frontier.push((norm, 0));
}
}
let added = frontier.len() - before;
info!(
sitemap_urls = added,
"seeded frontier from sitemap discovery"
);
}
Err(e) => {
warn!(error = %e, "sitemap discovery failed, continuing with seed URL only");
}
}
}
}
while !frontier.is_empty() && pages.len() < self.config.max_pages {
// Check cancel flag before processing each batch
if self.is_cancelled() {
info!("crawl cancelled by user");
break;
}
// Dedup this level's frontier against the visited set and page cap
let batch: Vec<(String, usize)> = frontier
.drain(..)
.filter(|(url, _)| visited.insert(url.clone()))
.take(self.config.max_pages.saturating_sub(pages.len()))
.collect();
if batch.is_empty() {
break;
}
// Spawn one task per URL, bounded by semaphore
let mut handles = Vec::with_capacity(batch.len());
for (url, depth) in &batch {
let permit = Arc::clone(&semaphore);
let client = Arc::clone(&self.client);
let url = url.clone();
let depth = *depth;
let delay = self.config.delay;
handles.push(tokio::spawn(async move {
// Acquire permit — blocks if concurrency limit reached
let _permit = permit.acquire().await.expect("semaphore closed");
tokio::time::sleep(delay).await;
let page_start = Instant::now();
let result = client.fetch_and_extract(&url).await;
let elapsed = page_start.elapsed();
match result {
Ok(extraction) => {
debug!(
url = %url, depth,
elapsed_ms = %elapsed.as_millis(),
"page extracted"
);
PageResult {
url,
depth,
extraction: Some(extraction),
error: None,
elapsed,
}
}
Err(e) => {
warn!(url = %url, depth, error = %e, "page failed");
PageResult {
url,
depth,
extraction: None,
error: Some(e.to_string()),
elapsed,
}
}
}
}));
}
// Collect results and harvest links for the next depth level
let mut next_frontier: Vec<(String, usize)> = Vec::new();
for handle in handles {
let page = match handle.await {
Ok(page) => page,
Err(e) => {
warn!(error = %e, "crawl task panicked");
continue;
}
};
let depth = page.depth;
if depth < self.config.max_depth
&& let Some(ref extraction) = page.extraction
{
for link in &extraction.content.links {
if let Some(candidate) = self.qualify_link(&link.href, &visited) {
next_frontier.push((candidate, depth + 1));
}
}
}
// Stream progress if a channel is configured
if let Some(tx) = &self.config.progress_tx {
let _ = tx.send(page.clone());
}
pages.push(page);
if pages.len() >= self.config.max_pages {
break;
}
// Check cancel flag between page results
if self.is_cancelled() {
info!("crawl cancelled by user (mid-batch)");
break;
}
}
frontier = next_frontier;
}
let total_elapsed = start.elapsed();
let ok_count = pages.iter().filter(|p| p.extraction.is_some()).count();
let err_count = pages.len() - ok_count;
info!(
total = pages.len(),
ok = ok_count,
errors = err_count,
elapsed_ms = %total_elapsed.as_millis(),
"crawl complete"
);
CrawlResult {
total: pages.len(),
ok: ok_count,
errors: err_count,
elapsed_secs: total_elapsed.as_secs_f64(),
remaining_frontier: frontier,
visited,
pages,
}
}
/// Check if a discovered link should be added to the frontier.
/// Returns `Some(normalized_url)` if it passes all filters, `None` otherwise.
fn qualify_link(&self, href: &str, visited: &HashSet<String>) -> Option<String> {
let parsed = Url::parse(href).ok()?;
// Only http(s) schemes
match parsed.scheme() {
"http" | "https" => {}
_ => return None,
}
// Same-origin check (scheme + host + port)
if origin_key(&parsed) != self.seed_origin {
return None;
}
// Path prefix filter
if let Some(ref prefix) = self.config.path_prefix
&& !parsed.path().starts_with(prefix.as_str())
{
return None;
}
// Include patterns: if any are set, path must match at least one
let path = parsed.path();
if !self.config.include_patterns.is_empty()
&& !self
.config
.include_patterns
.iter()
.any(|pat| glob_match(pat, path))
{
return None;
}
// Exclude patterns: if path matches any, skip
if self
.config
.exclude_patterns
.iter()
.any(|pat| glob_match(pat, path))
{
return None;
}
// Skip common non-page file extensions
const SKIP_EXTENSIONS: &[&str] = &[
".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".css", ".js",
".zip", ".tar", ".gz", ".xml", ".rss", ".mp3", ".mp4", ".avi", ".mov", ".woff",
".woff2", ".ttf", ".eot",
];
if SKIP_EXTENSIONS.iter().any(|ext| path.ends_with(ext)) {
return None;
}
let normalized = normalize(&parsed);
if visited.contains(&normalized) {
return None;
}
Some(normalized)
}
}
/// Canonical origin string for same-origin comparison: "scheme://host[:port]".
/// A leading "www." is stripped from the host, so `www.example.com` and
/// `example.com` compare as the same origin.
fn origin_key(url: &Url) -> String {
    let raw_host = url.host_str().unwrap_or("");
    let host = raw_host.strip_prefix("www.").unwrap_or(raw_host);
    match url.port() {
        Some(port) => format!("{}://{}:{}", url.scheme(), host, port),
        None => format!("{}://{}", url.scheme(), host),
    }
}
/// Normalize a URL for dedup: strip the fragment, remove one trailing slash
/// (except for the root path "/"), lowercase the host. Query params and path
/// case are preserved.
fn normalize(url: &Url) -> String {
    let host = url.host_str().unwrap_or("").to_ascii_lowercase();
    let mut out = format!("{}://{host}", url.scheme());
    if let Some(port) = url.port() {
        out.push_str(&format!(":{port}"));
    }
    let path = url.path();
    // Trim exactly one trailing slash, but never reduce "/" to "".
    let path = if path.len() > 1 {
        path.strip_suffix('/').unwrap_or(path)
    } else {
        path
    };
    out.push_str(path);
    if let Some(query) = url.query() {
        out.push('?');
        out.push_str(query);
    }
    // The fragment is deliberately dropped.
    out
}
/// Simple glob matching for URL paths. Supports:
/// - `*` matches any characters within a single path segment (no `/`)
/// - `**` matches any characters including `/` (any number of segments)
/// - `?` matches any single character (including `/`)
/// - Literal characters match exactly
///
/// Examples:
/// - `/api/*` matches `/api/users` but not `/api/users/123`
/// - `/api/**` matches `/api/users`, `/api/users/123`, `/api/a/b/c`
/// - `/docs/*/intro` matches `/docs/v2/intro`
fn glob_match(pattern: &str, path: &str) -> bool {
    // Matching is done on raw bytes; URL paths are ASCII-safe for the
    // metacharacters involved.
    glob_match_inner(pattern.as_bytes(), path.as_bytes())
}
/// Byte-level glob matcher with single-state backtracking.
///
/// `star_pi`/`star_ti` remember the most recent `*` so that a failed literal
/// comparison can backtrack and let that `*` absorb one more byte (never a
/// `/`). `**` is handled by recursing on every remaining suffix of `text`.
fn glob_match_inner(pat: &[u8], text: &[u8]) -> bool {
    let mut pi = 0; // current index into pat
    let mut ti = 0; // current index into text
    let mut star_pi = usize::MAX; // pat index of the last `*` seen (MAX = none)
    let mut star_ti = 0; // text index where that `*` began matching
    while ti < text.len() {
        if pi < pat.len() && pat[pi] == b'*' && pi + 1 < pat.len() && pat[pi + 1] == b'*' {
            // `**` — match everything including slashes
            // Skip all consecutive `*`
            while pi < pat.len() && pat[pi] == b'*' {
                pi += 1;
            }
            // Skip trailing `/` after `**`
            if pi < pat.len() && pat[pi] == b'/' {
                pi += 1;
            }
            if pi >= pat.len() {
                return true; // `**` at end matches everything
            }
            // Try matching the rest of pattern against every suffix of text
            for start in ti..=text.len() {
                if glob_match_inner(&pat[pi..], &text[start..]) {
                    return true;
                }
            }
            return false;
        } else if pi < pat.len() && pat[pi] == b'*' {
            // `*` — match any chars except `/`
            star_pi = pi;
            star_ti = ti;
            pi += 1;
        } else if pi < pat.len() && (pat[pi] == text[ti] || pat[pi] == b'?') {
            // Literal byte match, or `?` which matches any single byte.
            pi += 1;
            ti += 1;
        } else if star_pi != usize::MAX {
            // Backtrack: `*` absorbs one more char (but not `/`)
            if text[star_ti] == b'/' {
                return false;
            }
            star_ti += 1;
            ti = star_ti;
            pi = star_pi + 1;
        } else {
            return false;
        }
    }
    // Consume trailing `*` or `**` in pattern
    while pi < pat.len() && pat[pi] == b'*' {
        pi += 1;
    }
    pi >= pat.len()
}
// Unit tests: URL normalization, origin comparison, and path glob matching.
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn normalize_strips_fragment() {
        let url = Url::parse("https://example.com/page#section").unwrap();
        assert_eq!(normalize(&url), "https://example.com/page");
    }
    #[test]
    fn normalize_strips_trailing_slash() {
        let url = Url::parse("https://example.com/docs/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/docs");
    }
    #[test]
    fn normalize_keeps_root_slash() {
        let url = Url::parse("https://example.com/").unwrap();
        assert_eq!(normalize(&url), "https://example.com/");
    }
    #[test]
    fn normalize_preserves_query() {
        let url = Url::parse("https://example.com/search?q=rust&page=2").unwrap();
        assert_eq!(normalize(&url), "https://example.com/search?q=rust&page=2");
    }
    #[test]
    fn normalize_lowercases_host() {
        let url = Url::parse("https://Example.COM/Path").unwrap();
        assert_eq!(normalize(&url), "https://example.com/Path");
    }
    #[test]
    fn origin_includes_explicit_port() {
        let url = Url::parse("https://example.com:8443/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com:8443");
    }
    #[test]
    fn origin_omits_default_port() {
        let url = Url::parse("https://example.com/foo").unwrap();
        assert_eq!(origin_key(&url), "https://example.com");
    }
    #[test]
    fn different_schemes_are_different_origins() {
        let http = Url::parse("http://example.com/").unwrap();
        let https = Url::parse("https://example.com/").unwrap();
        assert_ne!(origin_key(&http), origin_key(&https));
    }
    // -- glob_match tests --
    #[test]
    fn glob_star_matches_single_segment() {
        assert!(glob_match("/api/*", "/api/users"));
        assert!(glob_match("/api/*", "/api/products"));
        assert!(!glob_match("/api/*", "/api/users/123"));
    }
    #[test]
    fn glob_doublestar_matches_multiple_segments() {
        assert!(glob_match("/api/**", "/api/users"));
        assert!(glob_match("/api/**", "/api/users/123"));
        assert!(glob_match("/api/**", "/api/a/b/c/d"));
        assert!(!glob_match("/api/**", "/docs/intro"));
    }
    #[test]
    fn glob_exact_match() {
        assert!(glob_match("/about", "/about"));
        assert!(!glob_match("/about", "/about/team"));
    }
    #[test]
    fn glob_middle_wildcard() {
        assert!(glob_match("/docs/*/intro", "/docs/v2/intro"));
        assert!(!glob_match("/docs/*/intro", "/docs/v2/v3/intro"));
    }
    #[test]
    fn glob_no_pattern_matches_nothing() {
        // Empty pattern only matches empty string
        assert!(glob_match("", ""));
        assert!(!glob_match("", "/foo"));
    }
    #[test]
    fn glob_trailing_star() {
        assert!(glob_match("/blog*", "/blog"));
        assert!(glob_match("/blog*", "/blog-post"));
        assert!(!glob_match("/blog*", "/blog/post")); // * doesn't cross /
    }
}

View file

@ -0,0 +1,745 @@
//! Document extraction for DOCX, XLSX, XLS, and CSV files.
//! Auto-detects document type from Content-Type headers or URL extension,
//! then extracts text content as markdown — same pattern as PDF extraction.
use std::io::{Cursor, Read};
use tracing::debug;
use crate::error::FetchError;
/// Document formats this module can extract, detected from the Content-Type
/// header or the URL's file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocType {
    /// Word document (OOXML: a ZIP archive containing `word/document.xml`).
    Docx,
    /// Excel workbook (OOXML), read via calamine.
    Xlsx,
    /// Legacy Excel workbook, read via calamine.
    Xls,
    /// Plain-text comma-separated values.
    Csv,
}
impl DocType {
    /// Short uppercase label used in log and error messages.
    fn label(self) -> &'static str {
        match self {
            DocType::Docx => "DOCX",
            DocType::Xlsx => "XLSX",
            DocType::Xls => "XLS",
            DocType::Csv => "CSV",
        }
    }
}
impl std::fmt::Display for DocType {
    // Display mirrors the label ("DOCX", "XLSX", ...).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.label())
    }
}
/// Detect document type from response headers or URL extension.
/// Returns `None` for non-document responses (HTML, PDF, etc.).
///
/// The Content-Type header wins when it names a known document MIME type;
/// otherwise the URL's extension (query string stripped, case-insensitive)
/// is consulted.
pub fn is_document_content_type(headers: &http::HeaderMap, url: &str) -> Option<DocType> {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    // 1) Trust the Content-Type header when it matches a known MIME type.
    if let Some(ct) = headers.get("content-type").and_then(|v| v.to_str().ok()) {
        let mime = ct.split(';').next().unwrap_or("").trim();
        if mime.eq_ignore_ascii_case(DOCX_MIME) {
            return Some(DocType::Docx);
        }
        if mime.eq_ignore_ascii_case(XLSX_MIME) {
            return Some(DocType::Xlsx);
        }
        if mime.eq_ignore_ascii_case("application/vnd.ms-excel") {
            return Some(DocType::Xls);
        }
        if mime.eq_ignore_ascii_case("text/csv") {
            return Some(DocType::Csv);
        }
    }
    // 2) Fall back to the URL's extension, ignoring any query string.
    let path = url.split('?').next().unwrap_or(url).to_ascii_lowercase();
    let by_extension = [
        (".docx", DocType::Docx),
        (".xlsx", DocType::Xlsx),
        (".xls", DocType::Xls),
        (".csv", DocType::Csv),
    ];
    by_extension
        .into_iter()
        .find_map(|(ext, ty)| path.ends_with(ext).then_some(ty))
}
/// Extract text content from document bytes, returning an ExtractionResult.
///
/// Dispatches on `doc_type` to the matching format parser, then derives a
/// plain-text view and a word count from the generated markdown.
///
/// # Errors
/// Propagates the format parser's `FetchError` (bad archive, XML parse
/// failure, unreadable bytes).
pub fn extract_document(
    bytes: &[u8],
    doc_type: DocType,
) -> Result<noxa_core::ExtractionResult, FetchError> {
    debug!(
        doc_type = doc_type.label(),
        bytes = bytes.len(),
        "extracting document"
    );
    let markdown = match doc_type {
        DocType::Docx => extract_docx(bytes)?,
        DocType::Xlsx => extract_xlsx(bytes)?,
        DocType::Xls => extract_xls(bytes)?,
        DocType::Csv => extract_csv(bytes)?,
    };
    // Word count is computed on the de-formatted text, not the raw markdown.
    let plain_text = strip_markdown_formatting(&markdown);
    let word_count = plain_text.split_whitespace().count();
    Ok(noxa_core::ExtractionResult {
        metadata: noxa_core::Metadata {
            // Documents carry no HTML-style metadata; only word_count is set.
            title: None,
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: None,
            site_name: None,
            image: None,
            favicon: None,
            word_count,
        },
        content: noxa_core::Content {
            markdown,
            plain_text,
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
/// Extract text from a DOCX file (ZIP of XML).
/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
    let mut archive = zip::ZipArchive::new(Cursor::new(bytes))
        .map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
    // The document body lives in a single well-known archive entry.
    let mut xml = String::new();
    archive
        .by_name("word/document.xml")
        .map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?
        .read_to_string(&mut xml)
        .map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
    parse_docx_xml(&xml)
}
/// Parse DOCX XML (word/document.xml) into markdown.
///
/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
/// `<w:br>` becomes a newline and `<w:tab>` a tab. Empty paragraphs are
/// dropped; headings are rendered as `#`…`######` lines.
/// NOTE(review): only the `p` tag is namespace-checked via `is_w_namespace`;
/// other tags match on local name alone.
fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
    use quick_xml::Reader;
    use quick_xml::events::Event;
    let mut reader = Reader::from_str(xml);
    let mut paragraphs: Vec<String> = Vec::new();
    // State tracking for the current paragraph
    let mut in_paragraph = false;
    let mut in_run = false; // inside <w:r> (run)
    let mut in_text = false; // inside <w:t>
    let mut current_text = String::new();
    let mut heading_level: Option<u8> = None; // None = normal paragraph
    let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
    loop {
        match reader.read_event() {
            // Empty elements (<w:br/>, <w:pStyle .../>) are handled like Start.
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                let name_bytes = e.name().as_ref().to_vec();
                let local = local_name(&name_bytes);
                match local {
                    b"p" if is_w_namespace(&name_bytes) => {
                        // New paragraph: reset accumulated text and style.
                        in_paragraph = true;
                        current_text.clear();
                        heading_level = None;
                    }
                    b"pPr" if in_paragraph => in_ppr = true,
                    b"pStyle" if in_ppr => {
                        heading_level = extract_heading_level(e);
                    }
                    b"r" if in_paragraph => in_run = true,
                    b"t" if in_run => in_text = true,
                    b"br" if in_paragraph => {
                        current_text.push('\n');
                    }
                    b"tab" if in_paragraph => {
                        current_text.push('\t');
                    }
                    _ => {}
                }
            }
            Ok(Event::End(ref e)) => {
                let name_bytes = e.name().as_ref().to_vec();
                let local = local_name(&name_bytes);
                match local {
                    b"p" if in_paragraph => {
                        // Paragraph finished: emit it (with heading prefix) if non-empty.
                        let text = current_text.trim().to_string();
                        if !text.is_empty() {
                            let formatted = match heading_level {
                                Some(1) => format!("# {text}"),
                                Some(2) => format!("## {text}"),
                                Some(3) => format!("### {text}"),
                                Some(4) => format!("#### {text}"),
                                Some(5) => format!("##### {text}"),
                                Some(6) => format!("###### {text}"),
                                _ => text,
                            };
                            paragraphs.push(formatted);
                        }
                        in_paragraph = false;
                    }
                    b"pPr" => in_ppr = false,
                    b"r" => {
                        in_run = false;
                        in_text = false;
                    }
                    b"t" => in_text = false,
                    _ => {}
                }
            }
            // Only text inside <w:t> counts; everything else is markup noise.
            Ok(Event::Text(ref e)) if in_text => {
                if let Ok(text) = e.unescape() {
                    current_text.push_str(&text);
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
            }
            _ => {}
        }
    }
    Ok(paragraphs.join("\n\n"))
}
/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms;
/// any other prefix is rejected.
fn is_w_namespace(name: &[u8]) -> bool {
    // quick-xml hands us the raw qualified-name bytes.
    matches!(name, b"w:p" | b"p")
}
/// Extract the local name from a possibly namespaced XML tag.
/// `w:p` -> `p`, `p` -> `p` (split happens at the FIRST colon).
fn local_name(name: &[u8]) -> &[u8] {
    let mut parts = name.splitn(2, |&b| b == b':');
    let head = parts.next().unwrap_or(name);
    // With no colon there is no second part; the whole name is local.
    parts.next().unwrap_or(head)
}
/// Extract heading level from a `<w:pStyle w:val="Heading1"/>` element.
/// Recognizes "Title" (→ level 1) and "HeadingN" (clamped to 6), matched
/// case-insensitively. Returns `None` for any other style.
fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
    for attr in e.attributes().flatten() {
        // Only the (possibly namespaced) "val" attribute is interesting.
        if local_name(attr.key.as_ref()) != b"val" {
            continue;
        }
        let value = String::from_utf8_lossy(&attr.value).to_ascii_lowercase();
        if value == "title" {
            return Some(1);
        }
        if let Some(digits) = value.strip_prefix("heading")
            && let Ok(level) = digits.parse::<u8>()
        {
            return Some(level.min(6));
        }
    }
    None
}
/// Extract spreadsheet content using calamine (XLSX format).
/// Thin wrapper over `extract_spreadsheet` with an "XLSX" error label.
fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLSX")
}
/// Extract spreadsheet content using calamine (XLS format).
/// Thin wrapper over `extract_spreadsheet` with an "XLS" error label.
fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLS")
}
/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
/// Every non-empty sheet becomes a "## Sheet: name" section containing a
/// markdown table; `label` is only used in error messages.
fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
    use calamine::Reader;
    let mut workbook: calamine::Sheets<_> =
        calamine::open_workbook_auto_from_rs(Cursor::new(bytes))
            .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
    let names: Vec<String> = workbook.sheet_names().to_vec();
    let mut sections: Vec<String> = Vec::new();
    for name in &names {
        let range = workbook
            .worksheet_range(name)
            .map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
        // Stringify every cell up front so table rendering is uniform.
        let rows: Vec<Vec<String>> = range
            .rows()
            .map(|row| row.iter().map(cell_to_string).collect())
            .collect();
        if !rows.is_empty() {
            let mut section = format!("## Sheet: {name}\n\n");
            section.push_str(&rows_to_markdown_table(&rows));
            sections.push(section);
        }
    }
    if sections.is_empty() {
        Ok("(empty spreadsheet)".to_string())
    } else {
        Ok(sections.join("\n\n"))
    }
}
/// Convert a calamine cell value to a display string.
fn cell_to_string(cell: &calamine::Data) -> String {
    use calamine::Data;
    match cell {
        Data::Empty => String::new(),
        Data::String(s) => s.clone(),
        Data::Int(n) => n.to_string(),
        // Floats go through format_float so whole numbers drop the ".0".
        Data::Float(f) => format_float(*f),
        Data::Bool(b) => b.to_string(),
        // Error cells render via their Debug representation, prefixed with '#'.
        Data::Error(e) => format!("#{e:?}"),
        // Date/time values use calamine's Display/ISO string forms as-is.
        Data::DateTime(dt) => format!("{dt}"),
        Data::DateTimeIso(s) => s.clone(),
        Data::DurationIso(s) => s.clone(),
    }
}
/// Format a float, dropping the trailing `.0` when the value is a whole
/// number that fits in an i64; everything else uses the default Display form.
fn format_float(f: f64) -> String {
    let is_whole = f.fract() == 0.0 && f.abs() < i64::MAX as f64;
    if is_whole {
        (f as i64).to_string()
    } else {
        f.to_string()
    }
}
/// Decode CSV bytes (lossy UTF-8) and render them as a markdown table.
fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
    let text = String::from_utf8_lossy(bytes);
    let rows = parse_csv_rows(&text);
    if rows.is_empty() {
        Ok("(empty CSV)".to_string())
    } else {
        Ok(rows_to_markdown_table(&rows))
    }
}
/// Parse CSV text into rows of fields, handling quoted fields that contain
/// commas and newlines, and `""` as an escaped quote. Fields are trimmed;
/// rows whose fields are all empty are dropped.
fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut row: Vec<String> = Vec::new();
    let mut field = String::new();
    let mut quoted = false;
    let mut iter = text.chars().peekable();
    while let Some(ch) = iter.next() {
        match (quoted, ch) {
            (true, '"') => {
                // `""` inside quotes is an escaped quote; a lone `"` closes the field.
                if iter.next_if_eq(&'"').is_some() {
                    field.push('"');
                } else {
                    quoted = false;
                }
            }
            (true, other) => field.push(other),
            (false, '"') => quoted = true,
            (false, ',') => {
                row.push(std::mem::take(&mut field).trim().to_string());
            }
            (false, '\n') => {
                row.push(std::mem::take(&mut field).trim().to_string());
                let finished = std::mem::take(&mut row);
                // Drop rows that are entirely empty.
                if finished.iter().any(|f| !f.is_empty()) {
                    rows.push(finished);
                }
            }
            // Carriage returns are skipped; the following \n ends the row.
            (false, '\r') => {}
            (false, other) => field.push(other),
        }
    }
    // Flush a trailing field/row that was not newline-terminated.
    if !field.is_empty() || !row.is_empty() {
        row.push(field.trim().to_string());
        if row.iter().any(|f| !f.is_empty()) {
            rows.push(row);
        }
    }
    rows
}
/// Convert rows (first row = header) into a markdown table.
/// Short rows are right-padded with empty cells up to the widest row.
fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
    let width = rows.iter().map(Vec::len).max().unwrap_or(0);
    if rows.is_empty() || width == 0 {
        return String::new();
    }
    // Render one row, padding missing cells with "".
    let render = |row: &Vec<String>| {
        let cells: Vec<&str> = (0..width)
            .map(|i| row.get(i).map_or("", String::as_str))
            .collect();
        format!("| {} |", cells.join(" | "))
    };
    let separator = format!("| {} |", vec!["---"; width].join(" | "));
    let mut lines = vec![render(&rows[0]), separator];
    lines.extend(rows[1..].iter().map(render));
    lines.join("\n")
}
/// Strip markdown formatting to get plain text: heading hashes are removed,
/// table separator rows are dropped, and table rows collapse to their cell
/// text joined by single spaces.
fn strip_markdown_formatting(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    for raw in markdown.lines() {
        let line = raw.trim_start_matches('#').trim();
        // Skip "| --- | --- |"-style separator rows entirely.
        if line.starts_with("| ---") || line == "|---|" {
            continue;
        }
        match line.strip_prefix('|').and_then(|s| s.strip_suffix('|')) {
            Some(inner) => {
                // Table row: join trimmed cells with spaces.
                let cells: Vec<&str> = inner.split('|').map(str::trim).collect();
                out.push_str(&cells.join(" "));
            }
            None => out.push_str(line),
        }
        out.push('\n');
    }
    out.trim().to_string()
}
// Unit tests: content-type detection, CSV/DOCX parsing, markdown rendering.
#[cfg(test)]
mod tests {
    use super::*;
    use http::HeaderMap;
    fn headers_with(name: &str, value: &str) -> HeaderMap {
        let mut h = HeaderMap::new();
        h.insert(
            name.parse::<http::header::HeaderName>().unwrap(),
            value.parse().unwrap(),
        );
        h
    }
    // --- Content-type detection ---
    #[test]
    fn test_detect_docx_content_type() {
        let headers = headers_with(
            "content-type",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Docx)
        );
    }
    #[test]
    fn test_detect_xlsx_content_type() {
        let headers = headers_with(
            "content-type",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xlsx)
        );
    }
    #[test]
    fn test_detect_xls_content_type() {
        let headers = headers_with("content-type", "application/vnd.ms-excel");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xls)
        );
    }
    #[test]
    fn test_detect_csv_content_type() {
        let headers = headers_with("content-type", "text/csv");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }
    #[test]
    fn test_detect_csv_content_type_with_charset() {
        let headers = headers_with("content-type", "text/csv; charset=utf-8");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }
    #[test]
    fn test_detect_by_url_extension() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx"),
            Some(DocType::Docx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.xlsx"),
            Some(DocType::Xlsx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/old.xls"),
            Some(DocType::Xls)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.csv"),
            Some(DocType::Csv)
        );
    }
    #[test]
    fn test_detect_url_extension_with_query() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
            Some(DocType::Docx)
        );
    }
    #[test]
    fn test_detect_url_extension_case_insensitive() {
        let empty = HeaderMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
            Some(DocType::Xlsx)
        );
    }
    #[test]
    fn test_detect_none_for_html() {
        let headers = headers_with("content-type", "text/html");
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/page"),
            None
        );
    }
    #[test]
    fn test_content_type_takes_precedence_over_url() {
        let headers = headers_with("content-type", "text/csv");
        // URL says .xlsx but Content-Type says CSV — header wins
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/data.xlsx"),
            Some(DocType::Csv)
        );
    }
    // --- CSV parsing ---
    #[test]
    fn test_csv_simple() {
        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| Name | Age | City |"));
        assert!(result.contains("| --- | --- | --- |"));
        assert!(result.contains("| Alice | 30 | NYC |"));
        assert!(result.contains("| Bob | 25 | LA |"));
    }
    #[test]
    fn test_csv_quoted_fields() {
        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("Has a, comma"));
        assert!(result.contains("Said \"hello\""));
    }
    #[test]
    fn test_csv_empty() {
        let result = extract_csv(b"").unwrap();
        assert_eq!(result, "(empty CSV)");
    }
    #[test]
    fn test_csv_windows_line_endings() {
        let csv = "A,B\r\n1,2\r\n3,4\r\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| A | B |"));
        assert!(result.contains("| 1 | 2 |"));
    }
    // --- DOCX XML parsing ---
    #[test]
    fn test_docx_xml_simple_paragraphs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world\n\nSecond paragraph");
    }
    #[test]
    fn test_docx_xml_headings() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
<w:r><w:t>Title</w:t></w:r>
</w:p>
<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>Subtitle</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert!(result.contains("# Title"));
        assert!(result.contains("Body text"));
        assert!(result.contains("## Subtitle"));
    }
    #[test]
    fn test_docx_xml_multiple_runs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Hello </w:t></w:r>
<w:r><w:t>world</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world");
    }
    #[test]
    fn test_docx_xml_empty_paragraphs_skipped() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p></w:p>
<w:p><w:r><w:t>Content</w:t></w:r></w:p>
<w:p><w:r><w:t>   </w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Content");
    }
    // --- Markdown table ---
    #[test]
    fn test_rows_to_markdown_table() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string()],
            vec!["1".to_string(), "2".to_string()],
            vec!["3".to_string(), "4".to_string()],
        ];
        let table = rows_to_markdown_table(&rows);
        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
    }
    #[test]
    fn test_rows_to_markdown_table_ragged() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string()], // fewer columns
        ];
        let table = rows_to_markdown_table(&rows);
        // Padded empty cells render as "|  |" (two spaces: the " | " joiner on
        // both sides of an empty string). The previous expectation
        // "| 1 | | |" (single spaces) could never match the actual output.
        assert!(table.contains("| 1 |  |  |"));
    }
    // --- Extract result ---
    #[test]
    fn test_extract_csv_result() {
        let csv = "Name,Score\nAlice,100\n";
        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
        assert!(result.content.markdown.contains("| Name | Score |"));
        assert!(result.metadata.word_count > 0);
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
    }
    // --- Strip markdown ---
    #[test]
    fn test_strip_markdown() {
        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
        let plain = strip_markdown_formatting(md);
        assert!(plain.contains("Title"));
        assert!(plain.contains("Some text"));
        assert!(plain.contains("A B"));
        assert!(!plain.contains("---"));
    }
}

View file

@ -0,0 +1,24 @@
//! Fetch-layer errors. Wraps HTTP/network failures into a single type
//! that callers can match on without leaking transport details.
use thiserror::Error;
/// Unified error type for the fetch layer.
#[derive(Debug, Error)]
pub enum FetchError {
    /// The underlying wreq HTTP request failed at the transport level.
    #[error("request failed: {0}")]
    Request(#[from] wreq::Error),
    /// The caller-supplied URL could not be parsed or used.
    #[error("invalid url: {0}")]
    InvalidUrl(String),
    /// The response body could not be read or decoded as expected.
    #[error("response body decode failed: {0}")]
    BodyDecode(String),
    /// Content extraction failed (propagated from noxa-core).
    #[error("extraction failed: {0}")]
    Extraction(#[from] noxa_core::ExtractError),
    /// PDF extraction failed (propagated from noxa-pdf).
    #[error("PDF extraction failed: {0}")]
    Pdf(#[from] noxa_pdf::PdfError),
    /// Construction/parsing failure — also used by document extraction for
    /// malformed DOCX/XLSX payloads, not just client building.
    #[error("client build failed: {0}")]
    Build(String),
}

View file

@ -0,0 +1,22 @@
//! noxa-fetch: HTTP client layer with browser TLS fingerprint impersonation.
//! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
//! Automatically detects PDF responses and delegates to noxa-pdf.
// Module tree.
pub mod browser;
pub mod client;
pub mod crawler;
pub mod document;
pub mod error;
pub mod linkedin;
pub mod proxy;
pub mod reddit;
pub mod sitemap;
pub mod tls;
// Flattened public API re-exports.
pub use browser::BrowserProfile;
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult};
pub use error::FetchError;
pub use http::HeaderMap;
pub use proxy::{parse_proxy_file, parse_proxy_line};
pub use sitemap::SitemapEntry;
// Re-exported from a sibling crate so callers need only depend on noxa-fetch.
pub use noxa_pdf::PdfMode;

View file

@ -0,0 +1,279 @@
//! LinkedIn post extraction from authenticated HTML.
//!
//! LinkedIn's SPA stores all data in `<code>` tags as HTML-escaped JSON.
//! The `included` array contains typed entities: Update (post), Comment,
//! Profile, etc. We parse these to reconstruct post + comments as markdown.
use serde_json::Value;
use tracing::debug;
use noxa_core::{Content, ExtractionResult, Metadata};
/// Check if a URL is a LinkedIn post/activity.
///
/// The host must be exactly `linkedin.com` or `www.linkedin.com`, and the
/// URL must contain a post path marker (`/feed/update/` or `/posts/`).
pub fn is_linkedin_post(url: &str) -> bool {
    let without_scheme = url.split("://").nth(1).unwrap_or(url);
    let host = without_scheme.split('/').next().unwrap_or("");
    let on_linkedin = matches!(host, "www.linkedin.com" | "linkedin.com");
    on_linkedin && (url.contains("/feed/update/") || url.contains("/posts/"))
}
/// Extract `<code>` block contents from HTML using simple string scanning.
/// LinkedIn wraps JSON data in `<code>` tags with HTML-escaped content;
/// only blocks larger than 1000 bytes are kept (smaller ones are UI noise).
fn extract_code_blocks(html: &str) -> Vec<String> {
    let mut blocks = Vec::new();
    let mut cursor = 0;
    while let Some(rel) = html[cursor..].find("<code") {
        let open_at = cursor + rel;
        // Locate the end of the opening tag, then the matching close tag.
        let Some(gt) = html[open_at..].find('>') else {
            break;
        };
        let body_start = open_at + gt + 1;
        let Some(close) = html[body_start..].find("</code>") else {
            break;
        };
        let body = &html[body_start..body_start + close];
        if body.len() > 1000 {
            blocks.push(html_unescape(body));
        }
        cursor = body_start + close + "</code>".len();
    }
    blocks
}
/// Extract post + comments from LinkedIn's SSR HTML (requires auth cookies).
///
/// LinkedIn server-renders page data as JSON blobs inside `<code>` elements.
/// This picks the blob with the largest `included` entity array, then walks
/// those entities to recover the post author, body text, and comments,
/// assembling everything into a single markdown document.
///
/// Returns `None` if no JSON payload with a post body is found.
pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult> {
    let code_blocks = extract_code_blocks(html);
    // Find the largest <code> block with "included" — that's the main data payload
    let mut best_included: Option<Vec<Value>> = None;
    for raw in &code_blocks {
        if let Ok(obj) = serde_json::from_str::<Value>(raw)
            && let Some(arr) = obj.get("included").and_then(|v| v.as_array())
        {
            let current_len = best_included.as_ref().map(|a| a.len()).unwrap_or(0);
            if arr.len() > current_len {
                best_included = Some(arr.clone());
            }
        }
    }
    let included = best_included?;
    debug!(entities = included.len(), "linkedin: found included array");
    // Collect profiles (entityUrn → "First Last")
    // Map value is (display name, headline); trim handles a missing last name.
    let mut profiles = std::collections::HashMap::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if t.contains("Profile") {
            let urn = item.get("entityUrn").and_then(|v| v.as_str()).unwrap_or("");
            let first = item.get("firstName").and_then(|v| v.as_str()).unwrap_or("");
            let last = item.get("lastName").and_then(|v| v.as_str()).unwrap_or("");
            let headline = item.get("headline").and_then(|v| v.as_str()).unwrap_or("");
            if !first.is_empty() {
                profiles.insert(
                    urn.to_string(),
                    (
                        format!("{first} {last}").trim().to_string(),
                        headline.to_string(),
                    ),
                );
            }
        }
    }
    // Find the main post (Update type)
    let mut markdown = String::new();
    let mut post_author = String::new();
    let mut post_headline = String::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Update") {
            continue;
        }
        // Get author from actor profile
        if let Some(actor) = item.get("actor") {
            // actor can have a nested profile reference or inline data.
            // "*author" (reference-field spelling) is tried first, then
            // the plain "author" key.
            let author_urn = actor
                .get("*author")
                .or(actor.get("author"))
                .and_then(|v| v.as_str())
                .unwrap_or("");
            if let Some((name, headline)) = profiles.get(author_urn) {
                post_author = name.clone();
                post_headline = headline.clone();
            }
            // Or inline name
            if post_author.is_empty()
                && let Some(name) = actor.get("name").and_then(|v| v.as_object())
            {
                let text = name.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_author = text.to_string();
                }
            }
            if post_headline.is_empty()
                && let Some(desc) = actor.get("description").and_then(|v| v.as_object())
            {
                let text = desc.get("text").and_then(|v| v.as_str()).unwrap_or("");
                if !text.is_empty() {
                    post_headline = text.to_string();
                }
            }
        }
        // Get post body from commentary
        if let Some(commentary) = item.get("commentary")
            && let Some(text) = commentary
                .get("text")
                .and_then(|v| v.as_object())
                .and_then(|o| o.get("text"))
                .and_then(|v| v.as_str())
        {
            // Header lines are emitted only when the author/headline were found.
            if !post_author.is_empty() {
                markdown.push_str(&format!("# {post_author}\n\n"));
            }
            if !post_headline.is_empty() {
                markdown.push_str(&format!("*{post_headline}*\n\n"));
            }
            markdown.push_str("---\n\n");
            // Unescape literal \n from JSON
            markdown.push_str(&text.replace("\\n", "\n"));
            markdown.push_str("\n\n");
        }
    }
    // No post body found in any Update entity — give up on this page.
    if markdown.is_empty() {
        return None;
    }
    // Collect comments — LinkedIn stores comment text in `commentary.text`
    // and commenter name in `commenter.name.text`
    let mut comments: Vec<(String, String)> = Vec::new();
    for item in &included {
        let t = item.get("$type").and_then(|v| v.as_str()).unwrap_or("");
        if !t.contains("Comment") {
            continue;
        }
        // Get comment text from commentary.text
        let text = item
            .get("commentary")
            .and_then(|c| c.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        if text.is_empty() {
            continue;
        }
        // Get commenter name from commenter.title.text
        let name = item
            .get("commenter")
            .and_then(|c| c.get("title"))
            .and_then(|n| n.get("text"))
            .and_then(|v| v.as_str())
            .unwrap_or("Someone");
        comments.push((name.to_string(), text.to_string()));
    }
    if !comments.is_empty() {
        markdown.push_str("---\n\n## Comments\n\n");
        for (name, text) in &comments {
            markdown.push_str(&format!("- **{name}**: {text}\n\n"));
        }
    }
    let word_count = markdown.split_whitespace().count();
    debug!(
        word_count,
        comments = comments.len(),
        "linkedin extraction done"
    );
    Some(ExtractionResult {
        metadata: Metadata {
            title: if post_author.is_empty() {
                None
            } else {
                Some(format!("{post_author}'s LinkedIn Post"))
            },
            description: None,
            author: if post_author.is_empty() {
                None
            } else {
                Some(post_author)
            },
            published_date: None,
            language: None,
            url: Some(url.to_string()),
            site_name: Some("LinkedIn".into()),
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            // Plain text / links / images are not extracted on this path.
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
/// Unescape HTML entities (named + numeric decimal/hex).
///
/// Handles the five XML-predefined named entities (`&quot;` `&amp;` `&lt;`
/// `&gt;` `&apos;`) plus numeric character references in decimal (`&#65;`)
/// and hexadecimal (`&#x41;`) form. Anything unrecognized — including an
/// `&` that is never terminated by `;` — is passed through verbatim.
fn html_unescape(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c != '&' {
            out.push(c);
            continue;
        }
        // Collect a candidate entity name up to ';'. Real entities are
        // short, so bail after 10 chars instead of scanning to the end.
        let mut entity = String::new();
        let mut terminated = false;
        for c2 in chars.by_ref() {
            if c2 == ';' {
                terminated = true;
                break;
            }
            entity.push(c2);
            if entity.len() > 10 {
                break;
            }
        }
        // Only a properly ';'-terminated sequence can be an entity.
        let decoded = if terminated {
            match entity.as_str() {
                "quot" => Some('"'),
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "apos" => Some('\''),
                num if num.starts_with("#x") || num.starts_with("#X") => {
                    u32::from_str_radix(&num[2..], 16)
                        .ok()
                        .and_then(char::from_u32)
                }
                num if num.starts_with('#') => {
                    num[1..].parse::<u32>().ok().and_then(char::from_u32)
                }
                _ => None,
            }
        } else {
            None
        };
        match decoded {
            Some(ch) => out.push(ch),
            None => {
                // Not an entity: emit the original text unchanged. The ';'
                // is re-emitted only if one was actually consumed — the old
                // code appended a spurious ';' after unterminated ampersands
                // (e.g. "Tom & Jerry" became "Tom & Jerry;").
                out.push('&');
                out.push_str(&entity);
                if terminated {
                    out.push(';');
                }
            }
        }
    }
    out
}

View file

@ -0,0 +1,122 @@
/// Proxy file parsing utilities.
///
/// Format: `host:port:user:pass` (one per line).
/// Lines starting with `#` and blank lines are skipped.
/// Also accepts `host:port` (no auth).
use crate::error::FetchError;
/// Parse a single proxy line into an HTTP proxy URL.
///
/// Two shapes are accepted:
/// - `host:port:user:pass` -> `http://user:pass@host:port`
/// - `host:port`           -> `http://host:port`
///
/// Any other field count yields `None`.
pub fn parse_proxy_line(line: &str) -> Option<String> {
    let fields: Vec<&str> = line.trim().splitn(4, ':').collect();
    match fields.as_slice() {
        // Authenticated: credentials move into the URL userinfo section.
        [host, port, user, pass] => Some(format!("http://{user}:{pass}@{host}:{port}")),
        // Unauthenticated: plain host:port.
        [host, port] => Some(format!("http://{host}:{port}")),
        _ => None,
    }
}
/// Load proxies from a file, returning parsed HTTP proxy URLs.
///
/// Blank lines and `#` comment lines are ignored; every remaining line is
/// run through [`parse_proxy_line`] and silently dropped if malformed.
/// Returns an error if the file can't be read or yields no valid entries.
pub fn parse_proxy_file(path: &str) -> Result<Vec<String>, FetchError> {
    let content = std::fs::read_to_string(path)
        .map_err(|e| FetchError::Build(format!("failed to read proxy file: {e}")))?;
    let mut proxies = Vec::new();
    for raw in content.lines() {
        let line = raw.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        if let Some(proxy_url) = parse_proxy_line(line) {
            proxies.push(proxy_url);
        }
    }
    if proxies.is_empty() {
        return Err(FetchError::Build(
            "proxy file is empty or has no valid entries".into(),
        ));
    }
    Ok(proxies)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    // parse_proxy_line: 4-field authenticated form.
    #[test]
    fn parse_host_port_user_pass() {
        let result = parse_proxy_line("proxy.example.com:8080:alice:s3cret");
        assert_eq!(
            result.as_deref(),
            Some("http://alice:s3cret@proxy.example.com:8080")
        );
    }
    // parse_proxy_line: 2-field bare host:port form (no credentials).
    #[test]
    fn parse_host_port_only() {
        let result = parse_proxy_line("10.0.0.1:3128");
        assert_eq!(result.as_deref(), Some("http://10.0.0.1:3128"));
    }
    // Surrounding whitespace must not affect the parse.
    #[test]
    fn parse_trims_whitespace() {
        let result = parse_proxy_line("  host:9999:user:pass  ");
        assert_eq!(result.as_deref(), Some("http://user:pass@host:9999"));
    }
    // Anything that isn't exactly 2 or 4 colon-separated fields is rejected.
    #[test]
    fn parse_invalid_returns_none() {
        assert!(parse_proxy_line("just-a-hostname").is_none());
        assert!(parse_proxy_line("a:b:c").is_none()); // 3 parts is invalid
        assert!(parse_proxy_line("").is_none());
    }
    // End-to-end file parse: comments and blank lines are skipped and
    // valid entries come back in file order.
    #[test]
    fn parse_file_happy_path() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("proxies.txt");
        let mut f = std::fs::File::create(&path).unwrap();
        writeln!(f, "# residential pool").unwrap();
        writeln!(f, "host1:8080:user1:pass1").unwrap();
        writeln!(f).unwrap(); // blank line
        writeln!(f, "host2:3128").unwrap();
        writeln!(f, "# datacenter").unwrap();
        writeln!(f, "host3:9999:u:p").unwrap();
        drop(f);
        let proxies = parse_proxy_file(path.to_str().unwrap()).unwrap();
        assert_eq!(proxies.len(), 3);
        assert_eq!(proxies[0], "http://user1:pass1@host1:8080");
        assert_eq!(proxies[1], "http://host2:3128");
        assert_eq!(proxies[2], "http://u:p@host3:9999");
    }
    // A file containing only comments/blank lines is an error, not Ok(vec![]).
    #[test]
    fn parse_file_empty_errors() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("empty.txt");
        std::fs::write(&path, "# only comments\n\n").unwrap();
        let err = parse_proxy_file(path.to_str().unwrap());
        assert!(err.is_err());
    }
    // An unreadable path surfaces as an error (no panic).
    #[test]
    fn parse_file_missing_errors() {
        let err = parse_proxy_file("/nonexistent/proxies.txt");
        assert!(err.is_err());
    }
}

View file

@ -0,0 +1,172 @@
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
use noxa_core::{Content, ExtractionResult, Metadata};
/// Check if a URL points to a Reddit post/comment page.
///
/// Matches `reddit.com` and any subdomain of it (`www.`, `old.`, `np.`,
/// `new.`, `m.`, …) case-insensitively, ignoring an explicit port. A scheme
/// is optional — bare `reddit.com/...` strings also match. The previous
/// fixed host list missed valid hosts like `m.reddit.com`.
pub fn is_reddit_url(url: &str) -> bool {
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("")
        // Drop an explicit port such as `reddit.com:443`.
        .split(':')
        .next()
        .unwrap_or("")
        .to_ascii_lowercase();
    // Suffix check requires the dot, so `notreddit.com` does NOT match.
    host == "reddit.com" || host.ends_with(".reddit.com")
}
/// Build the `.json` URL from a Reddit page URL.
///
/// Strips the query string and the fragment (the previous version left
/// `#...` in place, producing broken `...#frag.json` URLs), drops any
/// trailing slash, then appends `.json`.
pub fn json_url(url: &str) -> String {
    let clean = url
        .split(['?', '#'])
        .next()
        .unwrap_or(url)
        .trim_end_matches('/');
    format!("{clean}.json")
}
/// Convert Reddit JSON API response into an ExtractionResult.
///
/// The `.json` endpoint returns an array of two listings: the first holds
/// the post itself (a `t3` thing), the second holds the comment tree
/// (`t1` things). Both are rendered into one markdown document.
///
/// # Errors
/// Returns a descriptive message if the payload doesn't deserialize as a
/// listing array.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
    let listings: Vec<Listing> =
        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
    let mut markdown = String::new();
    let mut title = None;
    let mut author = None;
    let mut subreddit = None;
    // First listing = the post itself
    if let Some(post_listing) = listings.first() {
        for child in &post_listing.data.children {
            // "t3" is Reddit's kind tag for a link/self post.
            if child.kind == "t3" {
                let d = &child.data;
                title = d.title.clone();
                author = d.author.clone();
                subreddit = d.subreddit_name_prefixed.clone();
                if let Some(ref t) = title {
                    markdown.push_str(&format!("# {t}\n\n"));
                }
                if let (Some(a), Some(sr)) = (&author, &subreddit) {
                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
                }
                // Self-post body text, when present.
                if let Some(ref body) = d.selftext
                    && !body.is_empty()
                {
                    markdown.push_str(body);
                    markdown.push_str("\n\n");
                }
                // Outbound link for link posts.
                if let Some(ref url_field) = d.url_overridden_by_dest
                    && !url_field.is_empty()
                {
                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
                }
                markdown.push_str("---\n\n");
            }
        }
    }
    // Second listing = comment tree
    if let Some(comment_listing) = listings.get(1) {
        markdown.push_str("## Comments\n\n");
        for child in &comment_listing.data.children {
            render_comment(child, 0, &mut markdown);
        }
    }
    let word_count = markdown.split_whitespace().count();
    debug!(word_count, "reddit json extracted");
    Ok(ExtractionResult {
        metadata: Metadata {
            title,
            description: None,
            author,
            published_date: None,
            language: Some("en".into()),
            url: Some(url.to_string()),
            site_name: subreddit,
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            // Plain text / links / images are not extracted on this path.
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
/// Render one comment (and, recursively, its replies) as an indented
/// markdown list item appended to `out`. Non-comment things are ignored.
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
    // Only "t1" things are comments; skip anything else (e.g. "more" stubs).
    if thing.kind != "t1" {
        return;
    }
    let data = &thing.data;
    let pad = "  ".repeat(depth);
    let who = data.author.as_deref().unwrap_or("[deleted]");
    let text = data.body.as_deref().unwrap_or("[removed]");
    let pts = data.score.unwrap_or(0);
    out.push_str(&format!("{pad}- **u/{who}** ({pts} pts)\n"));
    // Keep multi-line bodies aligned under the bullet.
    for line in text.lines() {
        out.push_str(&format!("{pad}  {line}\n"));
    }
    out.push('\n');
    // Walk the reply tree one level deeper (replies may also be "").
    if let Some(Replies::Listing(replies)) = &data.replies {
        for reply in &replies.data.children {
            render_comment(reply, depth + 1, out);
        }
    }
}
// --- Reddit JSON types (minimal) ---
// Only the fields we actually read are modeled; serde ignores the rest of
// Reddit's (large) payload.
/// Outer wrapper: every API page is a "Listing" object.
#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}
/// Inner object of a listing; `children` are the actual "things".
#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}
/// A Reddit "thing": `kind` is a type tag ("t3" = post, "t1" = comment).
#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}
/// Union of the post and comment fields we care about; fields missing from
/// the payload simply deserialize to None.
#[derive(Deserialize)]
struct ThingData {
    // Post fields (t3)
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    // Comment fields (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    replies: Option<Replies>,
}
/// Reddit replies can be either a nested Listing or an empty string.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    // The empty-string form ("replies": ""); its value is never read.
    #[allow(dead_code)]
    Empty(String),
}

View file

@ -0,0 +1,601 @@
/// Sitemap parsing and URL discovery.
///
/// Discovers URLs from a site's sitemaps using a 3-step process:
/// 1. Parse robots.txt for `Sitemap:` directives
/// 2. Try common sitemap paths as fallback
/// 3. Recursively resolve sitemap index files
///
/// All HTTP requests go through FetchClient to inherit TLS fingerprinting.
use std::collections::HashSet;
use quick_xml::Reader;
use quick_xml::events::Event;
use serde::Serialize;
use tracing::{debug, warn};
use crate::client::FetchClient;
use crate::error::FetchError;
/// Maximum depth when recursively fetching sitemap index files.
/// Prevents infinite loops from circular sitemap references.
const MAX_RECURSION_DEPTH: usize = 3;
/// Common sitemap paths to try when robots.txt doesn't list any.
/// Covers the standard locations plus the WordPress default.
const FALLBACK_SITEMAP_PATHS: &[&str] = &[
    "/sitemap.xml",
    "/sitemap_index.xml",
    "/wp-sitemap.xml",
    "/sitemap/sitemap-index.xml",
];
/// A single URL discovered from a sitemap.
#[derive(Debug, Clone, Serialize)]
pub struct SitemapEntry {
    /// Absolute URL from the `<loc>` element.
    pub url: String,
    /// Raw `<lastmod>` value, if present (kept as a string, not parsed).
    pub last_modified: Option<String>,
    /// `<priority>` parsed as a float; None when absent or non-numeric.
    pub priority: Option<f64>,
    /// Raw `<changefreq>` value, if present.
    pub change_freq: Option<String>,
}
/// Discover all URLs from a site's sitemaps.
///
/// Discovery order:
/// 1. Fetch /robots.txt and parse its `Sitemap:` directives
/// 2. Append the common fallback paths (skipping any already found)
/// 3. Recursively resolve sitemap index files
/// 4. Deduplicate entries by URL
///
/// Returns an empty vec (not an error) if no sitemaps are found.
pub async fn discover(
    client: &FetchClient,
    base_url: &str,
) -> Result<Vec<SitemapEntry>, FetchError> {
    let base = base_url.trim_end_matches('/');

    // Step 1: robots.txt `Sitemap:` directives seed the candidate list.
    let robots_url = format!("{base}/robots.txt");
    debug!(url = %robots_url, "fetching robots.txt");
    let mut sitemap_urls: Vec<String> = match client.fetch(&robots_url).await {
        Ok(result) if result.status == 200 => {
            let found = parse_robots_txt(&result.html);
            debug!(count = found.len(), "sitemap URLs from robots.txt");
            found
        }
        Ok(result) => {
            debug!(status = result.status, "robots.txt not found");
            Vec::new()
        }
        Err(e) => {
            debug!(error = %e, "failed to fetch robots.txt");
            Vec::new()
        }
    };

    // Step 2: add the well-known fallback paths not already listed.
    for path in FALLBACK_SITEMAP_PATHS {
        let candidate = format!("{base}{path}");
        if !sitemap_urls.contains(&candidate) {
            sitemap_urls.push(candidate);
        }
    }

    // Step 3: fetch everything, recursing through index files and
    // deduplicating page URLs as we go.
    let mut seen_urls: HashSet<String> = HashSet::new();
    let mut entries: Vec<SitemapEntry> = Vec::new();
    fetch_sitemaps(client, &sitemap_urls, &mut entries, &mut seen_urls, 0).await;
    debug!(total = entries.len(), "sitemap discovery complete");
    Ok(entries)
}
/// Recursively fetch and parse sitemap URLs, handling both urlsets and indexes.
///
/// Fetch failures and non-200 responses are logged and skipped, never fatal.
/// `seen_urls` deduplicates page URLs across all sitemaps; `depth` guards
/// against circular index references via MAX_RECURSION_DEPTH.
async fn fetch_sitemaps(
    client: &FetchClient,
    urls: &[String],
    entries: &mut Vec<SitemapEntry>,
    seen_urls: &mut HashSet<String>,
    depth: usize,
) {
    if depth > MAX_RECURSION_DEPTH {
        warn!(depth, "sitemap recursion limit reached, stopping");
        return;
    }
    for sitemap_url in urls {
        debug!(url = %sitemap_url, depth, "fetching sitemap");
        let xml = match client.fetch(sitemap_url).await {
            Ok(result) if result.status == 200 => result.html,
            Ok(result) => {
                debug!(url = %sitemap_url, status = result.status, "sitemap not found");
                continue;
            }
            Err(e) => {
                debug!(url = %sitemap_url, error = %e, "failed to fetch sitemap");
                continue;
            }
        };
        match detect_sitemap_type(&xml) {
            SitemapType::UrlSet => {
                let parsed = parse_urlset(&xml);
                for entry in parsed {
                    // insert() is false for duplicates — keep only new URLs.
                    if seen_urls.insert(entry.url.clone()) {
                        entries.push(entry);
                    }
                }
            }
            SitemapType::Index => {
                let child_urls = parse_sitemap_index(&xml);
                debug!(count = child_urls.len(), "found child sitemaps in index");
                // Box the recursive call to avoid large future sizes
                Box::pin(fetch_sitemaps(
                    client,
                    &child_urls,
                    entries,
                    seen_urls,
                    depth + 1,
                ))
                .await;
            }
            SitemapType::Unknown => {
                debug!(url = %sitemap_url, "unrecognized sitemap format, skipping");
            }
        }
    }
}
// ---------------------------------------------------------------------------
// Pure parsing functions (no I/O, fully testable)
// ---------------------------------------------------------------------------
/// Extract `Sitemap:` directive URLs from robots.txt content.
///
/// The directive name is matched case-insensitively (`Sitemap:`, `sitemap:`,
/// `SITEMAP:` all count) and the value is trimmed; directives with an empty
/// value are skipped. Splitting on the first `:` (instead of byte-slicing a
/// fixed `[..8]` prefix) keeps this panic-free when a line contains
/// multi-byte UTF-8 characters around the old slice boundary — robots.txt
/// is untrusted input.
pub fn parse_robots_txt(text: &str) -> Vec<String> {
    text.lines()
        .filter_map(|line| {
            // Lines without any ':' can't be directives.
            let (key, value) = line.trim().split_once(':')?;
            if !key.eq_ignore_ascii_case("sitemap") {
                return None;
            }
            let url = value.trim();
            (!url.is_empty()).then(|| url.to_string())
        })
        .collect()
}
/// Parse a sitemap XML string. Handles both `<urlset>` and `<sitemapindex>`.
/// Returns entries from urlsets and recursion targets from indexes.
pub fn parse_sitemap_xml(xml: &str) -> Vec<SitemapEntry> {
    match detect_sitemap_type(xml) {
        SitemapType::UrlSet => parse_urlset(xml),
        SitemapType::Index => {
            // For the public parsing API an index's <loc> values become
            // URL-only entries; actual recursive fetching happens in the
            // async `discover` path.
            let mut entries = Vec::new();
            for url in parse_sitemap_index(xml) {
                entries.push(SitemapEntry {
                    url,
                    last_modified: None,
                    priority: None,
                    change_freq: None,
                });
            }
            entries
        }
        SitemapType::Unknown => Vec::new(),
    }
}
/// Root-element classification for a fetched sitemap document.
#[derive(Debug, PartialEq)]
enum SitemapType {
    /// `<urlset>` root: a leaf sitemap listing page URLs.
    UrlSet,
    /// `<sitemapindex>` root: lists child sitemaps to recurse into.
    Index,
    /// No recognizable root element (or unparsable XML).
    Unknown,
}
/// Peek at the document's elements to decide whether this is a urlset or a
/// sitemapindex; anything else (or a parse error / EOF) is Unknown.
fn detect_sitemap_type(xml: &str) -> SitemapType {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                match e.local_name().as_ref() {
                    b"urlset" => return SitemapType::UrlSet,
                    b"sitemapindex" => return SitemapType::Index,
                    // Skip other elements (processing instructions,
                    // comments, unexpected wrappers) and keep scanning.
                    _ => {}
                }
            }
            Ok(Event::Eof) | Err(_) => return SitemapType::Unknown,
            _ => {}
        }
    }
}
/// Parse `<url>` entries from a `<urlset>` sitemap.
///
/// Streams the XML with quick-xml, accumulating one `SitemapEntry` per
/// `<url>` element. An entry without a `<loc>` is dropped. On a parse
/// error, the entries collected so far are returned (partial results).
fn parse_urlset(xml: &str) -> Vec<SitemapEntry> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut entries = Vec::new();
    // State for current <url> element being parsed
    let mut in_url = false;
    let mut current_tag: Option<UrlTag> = None;
    let mut loc: Option<String> = None;
    let mut lastmod: Option<String> = None;
    let mut priority: Option<f64> = None;
    let mut changefreq: Option<String> = None;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"url" => {
                        // Opening a new <url>: reset all per-entry state.
                        in_url = true;
                        loc = None;
                        lastmod = None;
                        priority = None;
                        changefreq = None;
                    }
                    b"loc" if in_url => current_tag = Some(UrlTag::Loc),
                    b"lastmod" if in_url => current_tag = Some(UrlTag::LastMod),
                    b"priority" if in_url => current_tag = Some(UrlTag::Priority),
                    b"changefreq" if in_url => current_tag = Some(UrlTag::ChangeFreq),
                    _ => current_tag = None,
                }
            }
            Ok(Event::Text(ref e)) => {
                // Text only matters while inside a recognized child tag.
                if let Some(ref tag) = current_tag
                    && let Ok(text) = e.unescape()
                {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        match tag {
                            UrlTag::Loc => loc = Some(text),
                            UrlTag::LastMod => lastmod = Some(text),
                            // A non-numeric <priority> silently becomes None.
                            UrlTag::Priority => priority = text.parse().ok(),
                            UrlTag::ChangeFreq => changefreq = Some(text),
                        }
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                if name.as_ref() == b"url" && in_url {
                    // </url>: emit the entry only if it carried a <loc>.
                    if let Some(url) = loc.take() {
                        entries.push(SitemapEntry {
                            url,
                            last_modified: lastmod.take(),
                            priority: priority.take(),
                            change_freq: changefreq.take(),
                        });
                    }
                    in_url = false;
                }
                current_tag = None;
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }
    entries
}
/// Which child element of `<url>` the parser is currently reading text for.
#[derive(Debug)]
enum UrlTag {
    Loc,
    LastMod,
    Priority,
    ChangeFreq,
}
/// Parse `<sitemap>` entries from a `<sitemapindex>`, returning child sitemap URLs.
///
/// Collects the text of every `<loc>` nested inside a `<sitemap>` element.
/// On a parse error, the URLs collected so far are returned.
fn parse_sitemap_index(xml: &str) -> Vec<String> {
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut urls = Vec::new();
    // Two-flag state: only <loc> text inside a <sitemap> counts.
    let mut in_sitemap = false;
    let mut in_loc = false;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => in_sitemap = true,
                    b"loc" if in_sitemap => in_loc = true,
                    _ => {}
                }
            }
            Ok(Event::Text(ref e)) => {
                if in_loc && let Ok(text) = e.unescape() {
                    let text = text.trim().to_string();
                    if !text.is_empty() {
                        urls.push(text);
                    }
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"sitemap" => {
                        // Closing </sitemap> resets both flags defensively.
                        in_sitemap = false;
                        in_loc = false;
                    }
                    b"loc" => in_loc = false,
                    _ => {}
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                warn!(error = %e, "XML parse error in sitemap index, returning partial results");
                break;
            }
            _ => {}
        }
        buf.clear();
    }
    urls
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    // Full urlset parse: all four child elements, plus a minimal entry.
    #[test]
    fn test_parse_urlset() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url>
        <loc>https://example.com/</loc>
        <lastmod>2026-01-15</lastmod>
        <changefreq>daily</changefreq>
        <priority>1.0</priority>
        </url>
        <url>
        <loc>https://example.com/about</loc>
        <lastmod>2026-01-10</lastmod>
        <changefreq>monthly</changefreq>
        <priority>0.8</priority>
        </url>
        <url>
        <loc>https://example.com/blog/post-1</loc>
        </url>
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].url, "https://example.com/");
        assert_eq!(entries[0].last_modified.as_deref(), Some("2026-01-15"));
        assert_eq!(entries[0].change_freq.as_deref(), Some("daily"));
        assert_eq!(entries[0].priority, Some(1.0));
        assert_eq!(entries[1].url, "https://example.com/about");
        assert_eq!(entries[1].priority, Some(0.8));
        assert_eq!(entries[2].url, "https://example.com/blog/post-1");
        assert_eq!(entries[2].last_modified, None);
        assert_eq!(entries[2].priority, None);
        assert_eq!(entries[2].change_freq, None);
    }
    // Index parse returns child sitemap URLs in document order.
    #[test]
    fn test_parse_sitemap_index() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <sitemap>
        <loc>https://example.com/sitemap-posts.xml</loc>
        <lastmod>2026-03-01</lastmod>
        </sitemap>
        <sitemap>
        <loc>https://example.com/sitemap-pages.xml</loc>
        </sitemap>
        </sitemapindex>"#;
        let urls = parse_sitemap_index(xml);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://example.com/sitemap-posts.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-pages.xml");
    }
    // The public entry point routes <urlset> documents to parse_urlset.
    #[test]
    fn test_parse_sitemap_xml_dispatches_urlset() {
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://example.com/page</loc></url>
        </urlset>"#;
        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/page");
    }
    // ...and routes <sitemapindex> documents to URL-only entries.
    #[test]
    fn test_parse_sitemap_xml_dispatches_index() {
        let xml = r#"<?xml version="1.0"?>
        <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>
        </sitemapindex>"#;
        let entries = parse_sitemap_xml(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].url, "https://example.com/sitemap-1.xml");
        // Index entries have no metadata when parsed through the public API
        assert_eq!(entries[0].priority, None);
    }
    // Sitemap directives are matched case-insensitively.
    #[test]
    fn test_parse_robots_txt() {
        let robots = "User-agent: *\n\
                      Disallow: /admin/\n\
                      \n\
                      Sitemap: https://example.com/sitemap.xml\n\
                      sitemap: https://example.com/sitemap-news.xml\n\
                      SITEMAP: https://example.com/sitemap-images.xml\n\
                      \n\
                      User-agent: Googlebot\n\
                      Allow: /\n";
        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 3);
        assert_eq!(urls[0], "https://example.com/sitemap.xml");
        assert_eq!(urls[1], "https://example.com/sitemap-news.xml");
        assert_eq!(urls[2], "https://example.com/sitemap-images.xml");
    }
    #[test]
    fn test_parse_robots_txt_empty_value() {
        // "Sitemap:" with no URL should be skipped
        let robots = "Sitemap:\nSitemap: \nSitemap: https://example.com/s.xml\n";
        let urls = parse_robots_txt(robots);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0], "https://example.com/s.xml");
    }
    #[test]
    fn test_deduplicate() {
        // parse_sitemap_xml deduplicates via the discover() path, but
        // we can verify that parsing the same URL twice produces entries
        // that the HashSet in discover() would collapse.
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://example.com/page</loc></url>
        <url><loc>https://example.com/page</loc></url>
        <url><loc>https://example.com/other</loc></url>
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 3, "parser returns all entries");
        // Simulate the dedup that discover() does
        let mut seen = HashSet::new();
        let deduped: Vec<_> = entries
            .into_iter()
            .filter(|e| seen.insert(e.url.clone()))
            .collect();
        assert_eq!(deduped.len(), 2, "dedup collapses duplicates");
    }
    // A urlset with no <url> children yields no entries.
    #[test]
    fn test_empty_sitemap() {
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert!(entries.is_empty());
    }
    // Garbage input must not panic — it just yields nothing.
    #[test]
    fn test_malformed_xml() {
        let xml = "this is not xml at all <><><";
        let entries = parse_sitemap_xml(xml);
        assert!(entries.is_empty(), "malformed XML returns empty vec");
    }
    #[test]
    fn test_malformed_xml_partial() {
        // Partial XML that starts valid but breaks mid-stream
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://example.com/good</loc></url>
        <url><loc>broken
        "#;
        let entries = parse_sitemap_xml(xml);
        // Should return at least the successfully parsed entry
        assert!(entries.len() >= 1);
        assert_eq!(entries[0].url, "https://example.com/good");
    }
    // Entries without a <loc> are dropped; the rest survive.
    #[test]
    fn test_missing_loc() {
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url>
        <lastmod>2026-01-01</lastmod>
        <priority>0.5</priority>
        </url>
        <url>
        <loc>https://example.com/valid</loc>
        </url>
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 1, "entry without <loc> is skipped");
        assert_eq!(entries[0].url, "https://example.com/valid");
    }
    // <priority> values parse as floats; non-numeric values become None.
    #[test]
    fn test_priority_parsing() {
        let xml = r#"<?xml version="1.0"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url>
        <loc>https://example.com/high</loc>
        <priority>1.0</priority>
        </url>
        <url>
        <loc>https://example.com/mid</loc>
        <priority>0.5</priority>
        </url>
        <url>
        <loc>https://example.com/low</loc>
        <priority>0.1</priority>
        </url>
        <url>
        <loc>https://example.com/invalid</loc>
        <priority>not-a-number</priority>
        </url>
        </urlset>"#;
        let entries = parse_urlset(xml);
        assert_eq!(entries.len(), 4);
        assert_eq!(entries[0].priority, Some(1.0));
        assert_eq!(entries[1].priority, Some(0.5));
        assert_eq!(entries[2].priority, Some(0.1));
        assert_eq!(entries[3].priority, None, "invalid priority parses as None");
    }
    // Root-element detection for both known roots plus garbage/empty input.
    #[test]
    fn test_detect_sitemap_type() {
        let urlset = r#"<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>"#;
        assert_eq!(detect_sitemap_type(urlset), SitemapType::UrlSet);
        let index = r#"<?xml version="1.0"?><sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></sitemapindex>"#;
        assert_eq!(detect_sitemap_type(index), SitemapType::Index);
        assert_eq!(detect_sitemap_type("garbage"), SitemapType::Unknown);
        assert_eq!(detect_sitemap_type(""), SitemapType::Unknown);
    }
    #[test]
    fn test_fallback_paths_constant() {
        // Verify the constant has the expected paths
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap_index.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/wp-sitemap.xml"));
        assert!(FALLBACK_SITEMAP_PATHS.contains(&"/sitemap/sitemap-index.xml"));
    }
}

View file

@ -0,0 +1,372 @@
//! Browser TLS + HTTP/2 fingerprint profiles built on wreq (BoringSSL).
//!
//! Replaces the old noxa-http/noxa-tls patched rustls stack.
//! Each profile configures TLS options (cipher suites, curves, extensions,
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
//! stream dependency, priorities) to match real browser fingerprints.
use std::time::Duration;
use wreq::http2::{
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
};
use wreq::tls::{AlpsProtocol, CertificateCompressionAlgorithm, TlsOptions, TlsVersion};
use wreq::{Client, Emulation};
use crate::browser::BrowserVariant;
use crate::error::FetchError;
// NOTE(review): per the module docs, these strings feed wreq's TLS options
// to mimic real browser ClientHellos — the ordering appears to be part of
// the fingerprint, so verify against fresh browser captures before editing.
/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
/// Chrome signature algorithms.
const CHROME_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:rsa_pkcs1_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha512";
/// Chrome curves (post-quantum ML-KEM + X25519 + P-256 + P-384).
const CHROME_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384";
/// Firefox cipher list.
const FIREFOX_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_CHACHA20_POLY1305_SHA256:TLS_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
/// Firefox signature algorithms.
const FIREFOX_SIGALGS: &str = "ecdsa_secp256r1_sha256:ecdsa_secp384r1_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha256:rsa_pss_rsae_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha256:rsa_pkcs1_sha384:rsa_pkcs1_sha512:ecdsa_sha1:rsa_pkcs1_sha1";
/// Firefox curves.
const FIREFOX_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384:P-521";
/// Safari cipher list.
const SAFARI_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_CBC_SHA";
/// Safari signature algorithms.
const SAFARI_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha512:rsa_pkcs1_sha384:rsa_pkcs1_sha512";
/// Safari curves (no post-quantum group, unlike Chrome/Firefox).
const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521";
// --- Chrome HTTP headers in correct wire order ---
const CHROME_HEADERS: &[(&str, &str)] = &[
(
"sec-ch-ua",
r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
),
("sec-ch-ua-mobile", "?0"),
("sec-ch-ua-platform", "\"Windows\""),
("upgrade-insecure-requests", "1"),
(
"user-agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
),
(
"accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
),
("sec-fetch-site", "none"),
("sec-fetch-mode", "navigate"),
("sec-fetch-user", "?1"),
("sec-fetch-dest", "document"),
("accept-encoding", "gzip, deflate, br, zstd"),
("accept-language", "en-US,en;q=0.9"),
("priority", "u=0, i"),
];
const CHROME_MACOS_HEADERS: &[(&str, &str)] = &[
(
"sec-ch-ua",
r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
),
("sec-ch-ua-mobile", "?0"),
("sec-ch-ua-platform", "\"macOS\""),
("upgrade-insecure-requests", "1"),
(
"user-agent",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
),
(
"accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
),
("sec-fetch-site", "none"),
("sec-fetch-mode", "navigate"),
("sec-fetch-user", "?1"),
("sec-fetch-dest", "document"),
("accept-encoding", "gzip, deflate, br, zstd"),
("accept-language", "en-US,en;q=0.9"),
("priority", "u=0, i"),
];
// Firefox-on-Windows profile. Note Firefox sends no sec-ch-ua client hints
// and uses q=0.5 for the language fallback. Slice order is the wire order —
// do not reorder entries.
// NOTE(review): UA claims Firefox 135; keep in sync with `firefox_tls`/
// `firefox_h2` when bumping.
const FIREFOX_HEADERS: &[(&str, &str)] = &[
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    ),
    ("accept-language", "en-US,en;q=0.5"),
    ("accept-encoding", "gzip, deflate, br, zstd"),
    ("upgrade-insecure-requests", "1"),
    ("sec-fetch-dest", "document"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-site", "none"),
    ("sec-fetch-user", "?1"),
    ("priority", "u=0, i"),
];
// Safari-on-macOS profile. No client hints, no zstd in accept-encoding, no
// sec-fetch-user/priority. The interleaved ordering (sec-fetch-* mixed with
// accept-*) looks odd but is presumably Safari's real emission order —
// verify against a live capture before changing. Slice order is the wire
// order — do not reorder entries.
const SAFARI_HEADERS: &[(&str, &str)] = &[
    (
        "user-agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Safari/605.1.15",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    ),
    ("sec-fetch-site", "none"),
    ("accept-language", "en-US,en;q=0.9"),
    ("sec-fetch-mode", "navigate"),
    ("accept-encoding", "gzip, deflate, br"),
    ("sec-fetch-dest", "document"),
];
// Edge-on-Windows profile: Chromium-based, so it mirrors `CHROME_HEADERS`
// except for the sec-ch-ua brand list and the `Edg/…` UA suffix. Edge shares
// Chrome's TLS/H2 fingerprint (see `build_client`). Slice order is the wire
// order — do not reorder entries.
const EDGE_HEADERS: &[(&str, &str)] = &[
    (
        "sec-ch-ua",
        r#""Microsoft Edge";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
    ),
    ("sec-ch-ua-mobile", "?0"),
    ("sec-ch-ua-platform", "\"Windows\""),
    ("upgrade-insecure-requests", "1"),
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36 Edg/145.0.0.0",
    ),
    (
        "accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    ),
    ("sec-fetch-site", "none"),
    ("sec-fetch-mode", "navigate"),
    ("sec-fetch-user", "?1"),
    ("sec-fetch-dest", "document"),
    ("accept-encoding", "gzip, deflate, br, zstd"),
    ("accept-language", "en-US,en;q=0.9"),
    ("priority", "u=0, i"),
];
/// TLS options presenting Chrome's handshake fingerprint.
///
/// Preference lists come from the `CHROME_*` constants; the remaining
/// toggles mirror Chrome-specific ClientHello behavior (GREASE, extension
/// permutation, ALPS, Brotli cert compression).
fn chrome_tls() -> TlsOptions {
    // Cipher / sigalg / curve preference lists.
    let opts = TlsOptions::builder()
        .cipher_list(CHROME_CIPHERS)
        .sigalgs_list(CHROME_SIGALGS)
        .curves_list(CHROME_CURVES);
    // Negotiable protocol window: TLS 1.2 through 1.3.
    let opts = opts
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3);
    // Chrome-specific extension behavior.
    opts.grease_enabled(true)
        .permute_extensions(true)
        .enable_ech_grease(true)
        .pre_shared_key(true)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .alps_protocols([AlpsProtocol::HTTP2])
        .alps_use_new_codepoint(true)
        .aes_hw_override(true)
        .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
        .build()
}
/// TLS options presenting Firefox's handshake fingerprint.
///
/// Unlike Chrome, Firefox does not permute extension order and offers both
/// zlib and Brotli certificate compression.
fn firefox_tls() -> TlsOptions {
    // Cipher / sigalg / curve preference lists.
    let opts = TlsOptions::builder()
        .cipher_list(FIREFOX_CIPHERS)
        .sigalgs_list(FIREFOX_SIGALGS)
        .curves_list(FIREFOX_CURVES);
    // Negotiable protocol window: TLS 1.2 through 1.3.
    let opts = opts
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3);
    // Firefox-specific extension behavior.
    opts.grease_enabled(true)
        .permute_extensions(false)
        .enable_ech_grease(true)
        .pre_shared_key(true)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .certificate_compression_algorithms(&[
            CertificateCompressionAlgorithm::ZLIB,
            CertificateCompressionAlgorithm::BROTLI,
        ])
        .build()
}
/// TLS options presenting Safari's handshake fingerprint.
///
/// Safari sends no ECH GREASE and no pre-shared-key extension, and offers
/// only zlib certificate compression.
fn safari_tls() -> TlsOptions {
    // Cipher / sigalg / curve preference lists.
    let opts = TlsOptions::builder()
        .cipher_list(SAFARI_CIPHERS)
        .sigalgs_list(SAFARI_SIGALGS)
        .curves_list(SAFARI_CURVES);
    // Negotiable protocol window: TLS 1.2 through 1.3.
    let opts = opts
        .min_tls_version(TlsVersion::TLS_1_2)
        .max_tls_version(TlsVersion::TLS_1_3);
    // Safari-specific extension behavior.
    opts.grease_enabled(true)
        .permute_extensions(false)
        .enable_ech_grease(false)
        .pre_shared_key(false)
        .enable_ocsp_stapling(true)
        .enable_signed_cert_timestamps(true)
        .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB])
        .build()
}
/// HTTP/2 options matching Chrome's SETTINGS frame and frame ordering.
///
/// The numeric values, SETTINGS emission order, pseudo-header order, and the
/// HEADERS stream-dependency weight are all part of the HTTP/2 fingerprint
/// (Akamai-style); change any of them only against a live Chrome capture.
fn chrome_h2() -> Http2Options {
    Http2Options::builder()
        // 6 MiB per-stream window, 15 MiB connection window.
        .initial_window_size(6_291_456)
        .initial_connection_window_size(15_728_640)
        .max_header_list_size(262_144)
        .header_table_size(65_536)
        // NOTE(review): advertising MAX_CONCURRENT_STREAMS=1000 — confirm
        // current Chrome actually sends this setting.
        .max_concurrent_streams(1000u32)
        .enable_push(false)
        // Order in which SETTINGS identifiers appear in the frame.
        .settings_order(
            SettingsOrder::builder()
                .extend([
                    SettingId::HeaderTableSize,
                    SettingId::EnablePush,
                    SettingId::MaxConcurrentStreams,
                    SettingId::InitialWindowSize,
                    SettingId::MaxFrameSize,
                    SettingId::MaxHeaderListSize,
                    SettingId::EnableConnectProtocol,
                    SettingId::NoRfc7540Priorities,
                ])
                .build(),
        )
        // Chrome's pseudo-header order: m,a,s,p.
        .headers_pseudo_order(
            PseudoOrder::builder()
                .extend([
                    PseudoId::Method,
                    PseudoId::Authority,
                    PseudoId::Scheme,
                    PseudoId::Path,
                ])
                .build(),
        )
        // HEADERS frame priority: depends on stream 0, weight 219, exclusive.
        .headers_stream_dependency(StreamDependency::new(StreamId::zero(), 219, true))
        .build()
}
/// HTTP/2 options matching Firefox's SETTINGS frame and frame ordering.
///
/// Values, SETTINGS order, and pseudo-header order are fingerprint-critical;
/// verify against a live Firefox capture before changing.
fn firefox_h2() -> Http2Options {
    Http2Options::builder()
        // 128 KiB per-stream window, ~12 MiB connection window.
        .initial_window_size(131_072)
        .initial_connection_window_size(12_517_377)
        .max_header_list_size(65_536)
        .header_table_size(65_536)
        // Firefox emits only these three SETTINGS identifiers.
        .settings_order(
            SettingsOrder::builder()
                .extend([
                    SettingId::HeaderTableSize,
                    SettingId::InitialWindowSize,
                    SettingId::MaxFrameSize,
                ])
                .build(),
        )
        // Firefox's pseudo-header order: m,p,a,s.
        .headers_pseudo_order(
            PseudoOrder::builder()
                .extend([
                    PseudoId::Method,
                    PseudoId::Path,
                    PseudoId::Authority,
                    PseudoId::Scheme,
                ])
                .build(),
        )
        .build()
}
/// HTTP/2 options matching Safari's SETTINGS frame and frame ordering.
///
/// Values, SETTINGS order, pseudo-header order, and the non-exclusive
/// weight-255 stream dependency are fingerprint-critical; verify against a
/// live Safari capture before changing.
fn safari_h2() -> Http2Options {
    Http2Options::builder()
        // 2 MiB per-stream window, ~10 MiB connection window.
        .initial_window_size(2_097_152)
        .initial_connection_window_size(10_420_225)
        // NOTE(review): max_header_list_size(0) presumably means "advertise
        // 0"/unlimited per wreq's semantics — confirm against wreq docs.
        .max_header_list_size(0)
        .header_table_size(4_096)
        .enable_push(false)
        .max_concurrent_streams(100u32)
        .settings_order(
            SettingsOrder::builder()
                .extend([
                    SettingId::EnablePush,
                    SettingId::MaxConcurrentStreams,
                    SettingId::InitialWindowSize,
                    SettingId::MaxFrameSize,
                ])
                .build(),
        )
        // Safari's pseudo-header order: m,s,a,p.
        .headers_pseudo_order(
            PseudoOrder::builder()
                .extend([
                    PseudoId::Method,
                    PseudoId::Scheme,
                    PseudoId::Authority,
                    PseudoId::Path,
                ])
                .build(),
        )
        // HEADERS frame priority: stream 0, weight 255, non-exclusive.
        .headers_stream_dependency(StreamDependency::new(StreamId::zero(), 255, false))
        .build()
}
/// Convert a static `(name, value)` table into an [`http::HeaderMap`],
/// preserving table order (which is the wire order for these profiles).
///
/// Entries whose name or value fails parsing are skipped; the tables are
/// compile-time constants, so in practice nothing is dropped.
fn build_headers(pairs: &[(&str, &str)]) -> http::HeaderMap {
    let mut map = http::HeaderMap::with_capacity(pairs.len());
    for &(raw_name, raw_value) in pairs {
        let Ok(name) = http::header::HeaderName::from_bytes(raw_name.as_bytes()) else {
            continue;
        };
        let Ok(value) = http::header::HeaderValue::from_str(raw_value) else {
            continue;
        };
        map.insert(name, value);
    }
    map
}
/// Build a wreq Client for a specific browser variant.
pub fn build_client(
variant: BrowserVariant,
timeout: Duration,
extra_headers: &std::collections::HashMap<String, String>,
proxy: Option<&str>,
) -> Result<Client, FetchError> {
let (tls, h2, headers) = match variant {
BrowserVariant::Chrome => (chrome_tls(), chrome_h2(), CHROME_HEADERS),
BrowserVariant::ChromeMacos => (chrome_tls(), chrome_h2(), CHROME_MACOS_HEADERS),
BrowserVariant::Firefox => (firefox_tls(), firefox_h2(), FIREFOX_HEADERS),
BrowserVariant::Safari => (safari_tls(), safari_h2(), SAFARI_HEADERS),
BrowserVariant::Edge => (chrome_tls(), chrome_h2(), EDGE_HEADERS),
};
let mut header_map = build_headers(headers);
// Append extra headers after profile defaults
for (k, v) in extra_headers {
if let (Ok(n), Ok(val)) = (
http::header::HeaderName::from_bytes(k.as_bytes()),
http::header::HeaderValue::from_str(v),
) {
header_map.insert(n, val);
}
}
let emulation = Emulation::builder()
.tls_options(tls)
.http2_options(h2)
.headers(header_map)
.build();
let mut builder = Client::builder()
.emulation(emulation)
.redirect(wreq::redirect::Policy::limited(10))
.cookie_store(true)
.timeout(timeout);
if let Some(proxy_url) = proxy {
let proxy =
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
builder = builder.proxy(proxy);
}
builder
.build()
.map_err(|e| FetchError::Build(e.to_string()))
}