diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bf03cee..4d5625e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,7 +12,11 @@ env:
 jobs:
   test:
     name: Test
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
     steps:
       - uses: actions/checkout@v5
       - uses: dtolnay/rust-toolchain@stable
@@ -29,7 +33,16 @@ jobs:
           components: clippy, rustfmt
       - uses: Swatinem/rust-cache@v2
       - run: cargo fmt --check --all
-      - run: cargo clippy --all -- -D warnings
+      - run: cargo clippy --all --all-targets -- -D warnings
+
+  deny:
+    name: Supply chain
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - uses: EmbarkStudios/cargo-deny-action@v2
+        with:
+          command: check advisories bans licenses sources  # cargo-deny subcommand + checks to run
 
   wasm:
     name: WASM
diff --git a/Cargo.lock b/Cargo.lock
index 78e7e77..942d841 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3331,11 +3331,13 @@ dependencies = [
  "anyhow",
  "axum",
  "clap",
+ "http-body-util",
  "serde",
  "serde_json",
  "subtle",
  "thiserror",
  "tokio",
+ "tower",
  "tower-http",
  "tracing",
  "tracing-subscriber",
diff --git a/Cargo.toml b/Cargo.toml
index 124c620..1a80438 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,9 +5,31 @@ members = ["crates/*"]
 [workspace.package]
 version = "0.6.5"
 edition = "2024"
+rust-version = "1.85"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
 
+# Hardened release profile: thin LTO + a single codegen unit enable
+# cross-crate inlining on the hot extraction path and shrink the binaries,
+# and stripping symbols trims the shipped artifact. We deliberately do NOT
+# set `panic = "abort"`: webclaw-pdf relies on std::panic::catch_unwind to
+# recover from panics inside the pdf-extract parser, and abort would turn
+# those recoverable panics into hard process kills.
+[profile.release]
+lto = "thin"
+codegen-units = 1
+strip = true
+
+# Conservative, high-value hardening lints shared across the workspace. A
+# crate picks them up only if it opts in via `[lints] workspace = true`. Kept
+# deliberately narrow so `clippy -D warnings` stays green — the goal is
+# hardening, not a cleanup sweep that would break the build.
+[workspace.lints.rust]
+unsafe_op_in_unsafe_fn = "warn"
+
+[workspace.lints.clippy]
+mem_forget = "warn"
+
 [workspace.dependencies]
 webclaw-core = { path = "crates/webclaw-core" }
 webclaw-fetch = { path = "crates/webclaw-fetch" }
diff --git a/crates/webclaw-cli/Cargo.toml b/crates/webclaw-cli/Cargo.toml
index adce50f..a073ce2 100644
--- a/crates/webclaw-cli/Cargo.toml
+++ b/crates/webclaw-cli/Cargo.toml
@@ -3,8 +3,12 @@ name = "webclaw-cli"
 description = "CLI for extracting web content into LLM-optimized formats"
 version.workspace = true
 edition.workspace = true
+rust-version.workspace = true
 license.workspace = true
 
+[lints]
+workspace = true
+
 [[bin]]
 name = "webclaw"
 path = "src/main.rs"
diff --git a/crates/webclaw-cli/examples/perf_corpus.rs b/crates/webclaw-cli/examples/perf_corpus.rs
index aaa4c02..1198bf3 100644
--- a/crates/webclaw-cli/examples/perf_corpus.rs
+++ b/crates/webclaw-cli/examples/perf_corpus.rs
@@ -48,7 +48,10 @@ async fn main() {
     match mode.as_str() {
         "capture" => capture().await,
         "bench" => {
-            let iters: usize = std::env::args().nth(2).and_then(|s| s.parse().ok()).unwrap_or(60);
+            let iters: usize = std::env::args()
+                .nth(2)
+                .and_then(|s| s.parse().ok())
+                .unwrap_or(60);
             bench(iters);
         }
         "snapshot" => {
@@ -64,14 +67,21 @@ async fn main() {
 
 async fn capture() {
     fs::create_dir_all(CORPUS).unwrap();
-    let config = FetchConfig { browser: BrowserProfile::Chrome, ..FetchConfig::default() };
+    let config = FetchConfig {
+        browser: BrowserProfile::Chrome,
+        ..FetchConfig::default()
+    };
     let client = FetchClient::new(config).expect("build client");
     let mut ok = 0;
     for (i, u) in URLS.iter().enumerate() {
         let name = format!(
             "{:02}_{}.html",
             i + 1,
-            u.replace("https://", "").chars().map(|c| if c.is_alphanumeric() { c } else { '_' }).take(40).collect::<String>()
+            u.replace("https://", "")
+                .chars()
+                .map(|c| if c.is_alphanumeric() { c } else { '_' })
+                .take(40)
+                .collect::<String>()
         );
         match client.fetch(u).await {
             Ok(f) if f.html.len() > 1000 => {
@@ -99,7 +109,9 @@ fn snapshot(label: &str) {
     let mut n = 0;
     for path in &files {
         let html = fs::read_to_string(path).unwrap_or_default();
-        if html.is_empty() { continue; }
+        if html.is_empty() {
+            continue;
+        }
         let stem = path.file_stem().unwrap().to_string_lossy().to_string();
         let url = format!("https://corpus/{stem}");
         match extract(&html, Some(&url)) {
@@ -117,7 +129,9 @@ fn snapshot(label: &str) {
 }
 
 fn percentile(sorted: &[u128], p: f64) -> u128 {
-    if sorted.is_empty() { return 0; }
+    if sorted.is_empty() {
+        return 0;
+    }
     let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize;
     sorted[idx]
 }
@@ -135,7 +149,10 @@ fn bench(iters: usize) {
     }
 
     println!("# perf_corpus bench  docs={}  iters={}", files.len(), iters);
-    println!("{:<42} {:>10} {:>10} {:>10} {:>10}", "doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us");
+    println!(
+        "{:<42} {:>10} {:>10} {:>10} {:>10}",
+        "doc(KB)", "extract_us", "llm_us", "p50_us", "p90_us"
+    );
 
     let mut grand_extract = 0u128;
     let mut grand_llm = 0u128;
@@ -143,8 +160,13 @@ fn bench(iters: usize) {
 
     for path in &files {
         let html = fs::read_to_string(path).unwrap_or_default();
-        if html.is_empty() { continue; }
-        let url = format!("https://corpus/{}", path.file_name().unwrap().to_string_lossy());
+        if html.is_empty() {
+            continue;
+        }
+        let url = format!(
+            "https://corpus/{}",
+            path.file_name().unwrap().to_string_lossy()
+        );
 
         // warmup
         for _ in 0..5 {
@@ -158,7 +180,10 @@ fn bench(iters: usize) {
         let mut total_times = Vec::with_capacity(iters);
         for _ in 0..iters {
             let t0 = Instant::now();
-            let ex = match extract(&html, Some(&url)) { Ok(e) => e, Err(_) => continue };
+            let ex = match extract(&html, Some(&url)) {
+                Ok(e) => e,
+                Err(_) => continue,
+            };
             let t1 = Instant::now();
             let txt = to_llm_text(&ex, Some(&url));
             let t2 = Instant::now();
@@ -178,11 +203,24 @@ fn bench(iters: usize) {
         grand_llm += llm_p50;
         grand_total_p50 += tot_p50;
 
-        let label = format!("{} ({}KB)", path.file_stem().unwrap().to_string_lossy(), html.len() / 1024);
-        println!("{:<42} {:>10} {:>10} {:>10} {:>10}", label.chars().take(42).collect::<String>(), ex_p50, llm_p50, tot_p50, tot_p90);
+        let label = format!(
+            "{} ({}KB)",
+            path.file_stem().unwrap().to_string_lossy(),
+            html.len() / 1024
+        );
+        println!(
+            "{:<42} {:>10} {:>10} {:>10} {:>10}",
+            label.chars().take(42).collect::<String>(),
+            ex_p50,
+            llm_p50,
+            tot_p50,
+            tot_p90
+        );
     }
 
     println!("---");
-    println!("CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}");
+    println!(
+        "CORPUS_PASS_P50_SUM_US extract={grand_extract} llm={grand_llm} total={grand_total_p50}"
+    );
     println!("(lower is better; total = one full extract+llm pass over the whole corpus at p50)");
 }
diff --git a/crates/webclaw-cli/src/bench.rs b/crates/webclaw-cli/src/bench.rs
index 3e45da4..4c9dbb1 100644
--- a/crates/webclaw-cli/src/bench.rs
+++ b/crates/webclaw-cli/src/bench.rs
@@ -198,7 +198,7 @@ fn fmt_int(n: usize) -> String {
     let bytes = s.as_bytes();
     let mut out = String::with_capacity(s.len() + s.len() / 3);
     for (i, b) in bytes.iter().enumerate() {
-        if i > 0 && (bytes.len() - i).is_multiple_of(3) {
+        if i > 0 && (bytes.len() - i) % 3 == 0 {
             out.push(',');
         }
         out.push(*b as char);
diff --git a/crates/webclaw-cli/src/cli.rs b/crates/webclaw-cli/src/cli.rs
new file mode 100644
index 0000000..403e8cf
--- /dev/null
+++ b/crates/webclaw-cli/src/cli.rs
@@ -0,0 +1,324 @@
+//! CLI argument definitions: clap structs/enums and their conversions.
+
+use std::path::PathBuf;
+
+use clap::{Parser, Subcommand, ValueEnum};
+use webclaw_fetch::BrowserProfile;
+use webclaw_pdf::PdfMode;
+
+#[derive(Parser)]
+#[command(name = "webclaw", about = "Extract web content for LLMs", version)]
+pub struct Cli {
+    /// Optional subcommand. When omitted, the CLI falls back to the
+    /// traditional flag-based flow (URL + --format, --crawl, etc.).
+    /// Subcommands are used for flows that don't fit that model.
+    #[command(subcommand)]
+    pub command: Option<Commands>,
+
+    /// URLs to fetch (multiple allowed)
+    #[arg()]
+    pub urls: Vec<String>,
+
+    /// File with URLs (one per line)
+    #[arg(long)]
+    pub urls_file: Option<String>,
+
+    /// Output format (markdown, json, text, llm, html)
+    #[arg(short, long, default_value = "markdown")]
+    pub format: OutputFormat,
+
+    /// Browser to impersonate
+    #[arg(short, long, default_value = "chrome")]
+    pub browser: Browser,
+
+    /// Proxy URL (http://user:pass@host:port or socks5://host:port)
+    #[arg(short, long, env = "WEBCLAW_PROXY")]
+    pub proxy: Option<String>,
+
+    /// File with proxies (host:port:user:pass, one per line). Rotates per request.
+    #[arg(long, env = "WEBCLAW_PROXY_FILE")]
+    pub proxy_file: Option<String>,
+
+    /// Request timeout in seconds
+    #[arg(short, long, default_value = "30")]
+    pub timeout: u64,
+
+    /// Extract from local HTML file instead of fetching
+    #[arg(long)]
+    pub file: Option<String>,
+
+    /// Read HTML from stdin
+    #[arg(long)]
+    pub stdin: bool,
+
+    /// Include metadata in output (always included in JSON)
+    #[arg(long)]
+    pub metadata: bool,
+
+    /// Output raw fetched HTML instead of extracting
+    #[arg(long)]
+    pub raw_html: bool,
+
+    /// CSS selectors to include (comma-separated, e.g. "article,.content")
+    #[arg(long)]
+    pub include: Option<String>,
+
+    /// CSS selectors to exclude (comma-separated, e.g. "nav,.sidebar,footer")
+    #[arg(long)]
+    pub exclude: Option<String>,
+
+    /// Only extract main content (article/main element)
+    #[arg(long)]
+    pub only_main_content: bool,
+
+    /// Custom headers (repeatable, e.g. -H "Cookie: foo=bar")
+    #[arg(short = 'H', long = "header")]
+    pub headers: Vec<String>,
+
+    /// Cookie string (shorthand for -H "Cookie: ...")
+    #[arg(long)]
+    pub cookie: Option<String>,
+
+    /// JSON cookie file (Chrome extension format: [{name, value, domain, ...}])
+    #[arg(long)]
+    pub cookie_file: Option<String>,
+
+    /// Enable verbose logging
+    #[arg(short, long)]
+    pub verbose: bool,
+
+    /// Compare against a previous JSON snapshot
+    #[arg(long)]
+    pub diff_with: Option<String>,
+
+    /// Watch a URL for changes. Checks at the specified interval and reports diffs.
+    #[arg(long)]
+    pub watch: bool,
+
+    /// Watch interval in seconds
+    #[arg(long, default_value = "300")]
+    pub watch_interval: u64,
+
+    /// Command to run when changes are detected (receives diff JSON on stdin)
+    #[arg(long)]
+    pub on_change: Option<String>,
+
+    /// Webhook URL: POST a JSON payload when an operation completes.
+    /// Works with crawl, batch, watch (on change), and single URL modes.
+    #[arg(long, env = "WEBCLAW_WEBHOOK_URL")]
+    pub webhook: Option<String>,
+
+    /// Extract brand identity (colors, fonts, logo)
+    #[arg(long)]
+    pub brand: bool,
+
+    // -- PDF options --
+    /// PDF extraction mode: auto (error on empty) or fast (return whatever text is found)
+    #[arg(long, default_value = "auto")]
+    pub pdf_mode: PdfModeArg,
+
+    // -- Crawl options --
+    /// Enable recursive crawling of same-domain links
+    #[arg(long)]
+    pub crawl: bool,
+
+    /// Max crawl depth
+    #[arg(long, default_value = "1")]
+    pub depth: usize,
+
+    /// Max pages to crawl
+    #[arg(long, default_value = "20")]
+    pub max_pages: usize,
+
+    /// Max concurrent requests
+    #[arg(long, default_value = "5")]
+    pub concurrency: usize,
+
+    /// Delay between requests in ms
+    #[arg(long, default_value = "100")]
+    pub delay: u64,
+
+    /// Only crawl URLs matching this path prefix
+    #[arg(long)]
+    pub path_prefix: Option<String>,
+
+    /// Glob patterns for crawl URL paths to include (comma-separated, e.g. "/api/*,/guides/**")
+    #[arg(long)]
+    pub include_paths: Option<String>,
+
+    /// Glob patterns for crawl URL paths to exclude (comma-separated, e.g. "/changelog/*,/blog/*")
+    #[arg(long)]
+    pub exclude_paths: Option<String>,
+
+    /// Path to save/resume crawl state. On Ctrl+C: saves progress. On start: resumes if file exists.
+    #[arg(long)]
+    pub crawl_state: Option<PathBuf>,
+
+    /// Seed crawl frontier from sitemap discovery (robots.txt + /sitemap.xml)
+    #[arg(long)]
+    pub sitemap: bool,
+
+    /// Discover URLs from sitemap and print them (one per line; JSON array with --format json)
+    #[arg(long)]
+    pub map: bool,
+
+    // -- LLM options --
+    /// Extract structured JSON using LLM (pass a JSON schema string or @file)
+    #[arg(long)]
+    pub extract_json: Option<String>,
+
+    /// Extract using natural language prompt
+    #[arg(long)]
+    pub extract_prompt: Option<String>,
+
+    /// Summarize content using LLM (optional: number of sentences, default 3)
+    #[arg(long, num_args = 0..=1, default_missing_value = "3")]
+    pub summarize: Option<usize>,
+
+    /// Force a specific LLM provider (ollama, openai, anthropic)
+    #[arg(long, env = "WEBCLAW_LLM_PROVIDER")]
+    pub llm_provider: Option<String>,
+
+    /// Override the LLM model name
+    #[arg(long, env = "WEBCLAW_LLM_MODEL")]
+    pub llm_model: Option<String>,
+
+    /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible)
+    #[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
+    pub llm_base_url: Option<String>,
+
+    // -- Cloud API options --
+    /// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites
+    #[arg(long, env = "WEBCLAW_API_KEY")]
+    pub api_key: Option<String>,
+
+    /// Force all requests through the cloud API (skip local extraction)
+    #[arg(long)]
+    pub cloud: bool,
+
+    /// Run deep research on a topic via the cloud API. Requires --api-key.
+    /// Saves full result (report + sources + findings) to a JSON file.
+    #[arg(long)]
+    pub research: Option<String>,
+
+    /// Enable deep research mode (longer, more thorough report). Used with --research.
+    #[arg(long)]
+    pub deep: bool,
+
+    /// Output directory: save each page to a separate file instead of stdout.
+    /// Works with --crawl, batch (multiple URLs), and single URL mode.
+    /// Filenames are derived from URL paths (e.g. /docs/api -> docs/api.md).
+    #[arg(long)]
+    pub output_dir: Option<PathBuf>,
+}
+
+#[derive(Subcommand)]
+pub enum Commands {
+    /// Per-URL extraction micro-benchmark: compares raw HTML vs. the
+    /// webclaw --format llm output on token count, bytes, and
+    /// extraction time. Uses an approximate tokenizer (see `--help`).
+    Bench {
+        /// URL to benchmark.
+        url: String,
+
+        /// Emit a single JSON line instead of the ASCII table.
+        /// Machine-readable shape stable across releases.
+        #[arg(long)]
+        json: bool,
+
+        /// Optional path to a facts.json (same schema as the repo's
+        /// benchmarks/facts.json) for a fidelity column.
+        #[arg(long)]
+        facts: Option<PathBuf>,
+    },
+
+    /// List all vertical extractors in the catalog.
+    ///
+    /// Each entry has a stable `name` (usable with `webclaw vertical <name>`),
+    /// a human-friendly label, a one-line description, and the URL
+    /// patterns it claims. The same data is served by `/v1/extractors`
+    /// when running the REST API.
+    Extractors {
+        /// Emit JSON instead of a human-friendly table.
+        #[arg(long)]
+        json: bool,
+    },
+
+    /// Run a vertical extractor by name. Returns typed JSON with fields
+    /// specific to the target site (title, price, author, rating, etc.)
+    /// rather than generic markdown.
+    ///
+    /// Use `webclaw extractors` to see the full list. Example:
+    /// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`.
+    Vertical {
+        /// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`).
+        name: String,
+        /// URL to extract.
+        url: String,
+        /// Emit compact JSON (single line). Default is pretty-printed.
+        #[arg(long)]
+        raw: bool,
+    },
+}
+
+#[derive(Clone, ValueEnum)]
+pub enum OutputFormat {
+    Markdown,
+    Json,
+    Text,
+    Llm,
+    Html,
+}
+
+impl OutputFormat {
+    /// Name of this format in the cloud API's `formats` field. Keep this the
+    /// only place that spells out the strings the REST API expects.
+    pub fn as_api_str(&self) -> &'static str {
+        match self {
+            Self::Markdown => "markdown",
+            Self::Json => "json",
+            Self::Text => "text",
+            Self::Llm => "llm",
+            Self::Html => "html",
+        }
+    }
+}
+
+#[derive(Clone, ValueEnum)]
+pub enum Browser {
+    Chrome,
+    Firefox,
+    /// Safari iOS 26. Pair with a country-matched residential proxy for sites
+    /// that reject non-mobile profiles.
+    SafariIos,
+    Random,
+}
+
+#[derive(Clone, ValueEnum, Default)]
+pub enum PdfModeArg {
+    /// Error if PDF has no extractable text (catches scanned PDFs)
+    #[default]
+    Auto,
+    /// Return whatever text is found, even if empty
+    Fast,
+}
+
+impl From<PdfModeArg> for PdfMode {
+    fn from(mode: PdfModeArg) -> Self {
+        match mode {
+            PdfModeArg::Auto => Self::Auto,
+            PdfModeArg::Fast => Self::Fast,
+        }
+    }
+}
+
+impl From<Browser> for BrowserProfile {
+    fn from(choice: Browser) -> Self {
+        match choice {
+            Browser::Chrome => Self::Chrome,
+            Browser::Firefox => Self::Firefox,
+            Browser::SafariIos => Self::SafariIos,
+            Browser::Random => Self::Random,
+        }
+    }
+}
diff --git a/crates/webclaw-cli/src/fetch.rs b/crates/webclaw-cli/src/fetch.rs
new file mode 100644
index 0000000..df7b006
--- /dev/null
+++ b/crates/webclaw-cli/src/fetch.rs
@@ -0,0 +1,823 @@
+//! Input handling and fetching: config building, URL/cookie parsing, empty-page
+//! detection, output-file writing, and the fetch+extract entry points (local,
+//! remote, and cloud fallback).
+
+use std::io::{self, Read as _};
+use std::path::{Path, PathBuf};
+use std::process;
+
+use webclaw_core::{ExtractionOptions, ExtractionResult, extract_with_options};
+use webclaw_fetch::{FetchClient, FetchConfig, FetchResult};
+
+use crate::cli::Cli;
+
+/// Known anti-bot challenge page titles (case-insensitive prefix match).
+const ANTIBOT_TITLES: &[&str] = &[
+    "just a moment",
+    "attention required",
+    "access denied",
+    "checking your browser",
+    "please wait",
+    "one more step",
+    "verify you are human",
+    "bot verification",
+    "security check",
+    "ddos protection",
+];
+
+/// URL host/path fragments that indicate a GDPR/cookie consent redirect.
+const CONSENT_URL_FRAGMENTS: &[&str] = &[
+    "://consent.",
+    "/consent?",
+    "/consent/",
+    "collectconsent",
+    "consentcheck",
+    "/cmp/",
+    "guce.advertising.com",
+];
+
+/// English consent-wall title prefixes. Many providers localize this page, so
+/// this is a best-effort secondary signal. URL shape is the primary signal.
+const CONSENT_TITLES: &[&str] = &[
+    "before you continue",
+    "your privacy choices",
+    "we value your privacy",
+    "we care about your privacy",
+    "cookie consent",
+    "consent required",
+];
+
+/// Why a page returned empty or near-empty content, as classified by `detect_empty`.
+#[derive(Debug, PartialEq, Eq)]
+pub enum EmptyReason {
+    /// Anti-bot challenge page (Cloudflare, Akamai, etc.).
+    Antibot,
+    /// GDPR/cookie consent redirect.
+    ConsentWall,
+    /// JS-only SPA that returns an empty shell without a browser.
+    JsRequired,
+    /// Page has content, or none of the emptiness signals matched.
+    None,
+}
+
+pub fn detect_empty(result: &ExtractionResult) -> EmptyReason {
+    // Consent walls can have a tiny body: check before the content short-circuit.
+    if is_consent_wall(result) {
+        return EmptyReason::ConsentWall;
+    }
+
+    let meta = &result.metadata;
+    if meta.word_count > 50 || !result.content.markdown.is_empty() {
+        return EmptyReason::None;
+    }
+
+    // Near-empty page whose title starts with a known challenge phrase.
+    let challenge = meta.title.as_ref().is_some_and(|title| {
+        let lower = title.to_lowercase();
+        ANTIBOT_TITLES.iter().any(|t| lower.starts_with(t))
+    });
+    if challenge {
+        return EmptyReason::Antibot;
+    }
+
+    // No words and no links: treated as a shell that needs JS rendering.
+    if meta.word_count == 0 && result.content.links.is_empty() {
+        EmptyReason::JsRequired
+    } else {
+        EmptyReason::None
+    }
+}
+
+/// A consent wall is identified by either:
+/// 1. The final URL pointing at a known consent host/path, or
+/// 2. A consent-wall title prefix with a very small body.
+fn is_consent_wall(result: &ExtractionResult) -> bool {
+    if let Some(ref url) = result.metadata.url {
+        let lower = url.to_ascii_lowercase();
+        if CONSENT_URL_FRAGMENTS
+            .iter()
+            .any(|fragment| lower.contains(fragment))
+        {
+            return true;
+        }
+    }
+
+    // No `if … && let …` chain here: let chains need Rust 1.88, but the
+    // workspace `rust-version` is 1.85.
+    if result.metadata.word_count > 50 {
+        return false;
+    }
+
+    let Some(ref title) = result.metadata.title else {
+        return false;
+    };
+    let lower = title.to_lowercase();
+    CONSENT_TITLES
+        .iter()
+        .any(|prefix| lower.starts_with(prefix))
+}
+
+pub fn warn_empty(url: &str, reason: &EmptyReason) {
+    match reason {
+        EmptyReason::Antibot => eprintln!(
+            "\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\
+             This site requires CAPTCHA solving or browser rendering.\n\
+             Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
+        ),
+        EmptyReason::ConsentWall => eprintln!(
+            "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\
+             The site redirected to a consent page and returned no usable content.\n\
+             Try a different region via --proxy, or pass a pre-accepted consent cookie\n\
+             via --cookie / --cookie-file."
+        ),
+        EmptyReason::JsRequired => eprintln!(
+            "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
+             This site requires JavaScript rendering (SPA).\n\
+             Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing"
+        ),
+        EmptyReason::None => {} // nothing to warn about: print nothing
+    }
+}
+
+/// Build FetchConfig from CLI flags (proxy, headers, cookies, timeout, PDF mode).
+///
+/// Proxy precedence: `--proxy` (one static proxy) wins over `--proxy-file` (a
+/// pool); with neither set, `./proxies.txt` is auto-loaded when it exists.
+/// Exits the process with status 1 if `--cookie-file` cannot be parsed.
+pub fn build_fetch_config(cli: &Cli) -> FetchConfig {
+    let (proxy, proxy_pool) = if cli.proxy.is_some() {
+        (cli.proxy.clone(), Vec::new())
+    } else if let Some(ref path) = cli.proxy_file {
+        match webclaw_fetch::parse_proxy_file(path) {
+            Ok(pool) => (None, pool),
+            Err(e) => {
+                eprintln!("warning: {e}");
+                (None, Vec::new())
+            }
+        }
+    } else if std::path::Path::new("proxies.txt").exists() {
+        // Auto-load proxies.txt from the working directory; errors are ignored
+        match webclaw_fetch::parse_proxy_file("proxies.txt") {
+            Ok(pool) if !pool.is_empty() => {
+                eprintln!("loaded {} proxies from proxies.txt", pool.len());
+                (None, pool)
+            }
+            _ => (None, Vec::new()),
+        }
+    } else {
+        (None, Vec::new())
+    };
+
+    let mut headers = std::collections::HashMap::from([(
+        "Accept-Language".to_string(),
+        "en-US,en;q=0.9".to_string(),
+    )]);
+
+    // Parse -H "Key: Value" flags; entries without a ':' are skipped
+    for h in &cli.headers {
+        if let Some((key, val)) = h.split_once(':') {
+            headers.insert(key.trim().to_string(), val.trim().to_string());
+        }
+    }
+
+    // --cookie shorthand: overwrites a "Cookie" entry set via -H
+    if let Some(ref cookie) = cli.cookie {
+        headers.insert("Cookie".to_string(), cookie.clone());
+    }
+
+    // --cookie-file: parse JSON array of {name, value, domain, ...}
+    if let Some(ref path) = cli.cookie_file {
+        match parse_cookie_file(path) {
+            Ok(cookie_str) => {
+                // Append to an existing "Cookie" entry (from --cookie or -H)
+                if let Some(existing) = headers.get("Cookie") {
+                    headers.insert("Cookie".to_string(), format!("{existing}; {cookie_str}"));
+                } else {
+                    headers.insert("Cookie".to_string(), cookie_str);
+                }
+            }
+            Err(e) => {
+                eprintln!("error: failed to parse cookie file: {e}");
+                process::exit(1);
+            }
+        }
+    }
+
+    FetchConfig {
+        browser: cli.browser.clone().into(),
+        proxy,
+        proxy_pool,
+        timeout: std::time::Duration::from_secs(cli.timeout),
+        pdf_mode: cli.pdf_mode.clone().into(),
+        headers,
+        ..Default::default()
+    }
+}
+
+/// Turn a JSON cookie export (Chrome extension format, an array of objects such
+/// as {name, value, domain, path, ...}) into a `name=value; ...` header string.
+fn parse_cookie_file(path: &str) -> Result<String, String> {
+    let raw = std::fs::read_to_string(path).map_err(|e| format!("cannot read {path}: {e}"))?;
+    let entries: Vec<serde_json::Value> =
+        serde_json::from_str(&raw).map_err(|e| format!("invalid JSON: {e}"))?;
+
+    let mut pairs: Vec<String> = Vec::new();
+    for entry in &entries {
+        let name = entry.get("name").and_then(|v| v.as_str());
+        let value = entry.get("value").and_then(|v| v.as_str());
+        if let (Some(name), Some(value)) = (name, value) {
+            pairs.push(format!("{name}={value}"));
+        }
+    }
+
+    if pairs.is_empty() {
+        Err("no cookies found in file".to_string())
+    } else {
+        Ok(pairs.join("; "))
+    }
+}
+
+pub fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
+    // Selector flags are comma-separated; each entry is trimmed. An absent
+    // flag yields the field's default value.
+    ExtractionOptions {
+        include_selectors: match cli.include.as_deref() {
+            Some(raw) => raw.split(',').map(|s| s.trim().to_string()).collect(),
+            None => Default::default(),
+        },
+        exclude_selectors: match cli.exclude.as_deref() {
+            Some(raw) => raw.split(',').map(|s| s.trim().to_string()).collect(),
+            None => Default::default(),
+        },
+        only_main_content: cli.only_main_content,
+        include_raw_html: cli.raw_html || matches!(cli.format, crate::cli::OutputFormat::Html),
+    }
+}
+
+/// Normalize a URL: trim surrounding whitespace and default to `https://`
+/// when the input carries no `://` scheme separator.
+pub fn normalize_url(url: &str) -> String {
+    let candidate = url.trim();
+    if !candidate.contains("://") {
+        return format!("https://{candidate}");
+    }
+    candidate.to_string()
+}
+
+/// Derive a relative filename from a URL for `--output-dir`.
+///
+/// Strips the scheme/host, maps the path to a filesystem path, and appends an
+/// extension matching the output format. Unparsable URLs map to `unknown/index`.
+pub fn url_to_filename(raw_url: &str, format: &crate::cli::OutputFormat) -> String {
+    use crate::cli::OutputFormat;
+    let ext = match format {
+        OutputFormat::Markdown | OutputFormat::Llm => "md",
+        OutputFormat::Json => "json",
+        OutputFormat::Text => "txt",
+        OutputFormat::Html => "html",
+    };
+
+    let parsed = url::Url::parse(raw_url);
+    let (host, path, query) = match &parsed {
+        Ok(u) => (
+            u.host_str().unwrap_or("unknown").to_string(),
+            u.path().to_string(),
+            u.query().map(String::from),
+        ),
+        Err(_) => ("unknown".to_string(), String::new(), None),
+    };
+
+    // Drop empty / "." / ".." path segments so a URL path like
+    // `/../../etc/passwd` can't climb out of the output directory.
+    let cleaned_path: String = path
+        .split('/')
+        .filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
+        .collect::<Vec<_>>()
+        .join("/");
+
+    let mut stem = cleaned_path;
+    if stem.is_empty() {
+        // Use hostname for root URLs to avoid collisions in batch mode. The host
+        // is never empty here, so the stem cannot start with '/' (absolute path).
+        let clean_host = host.strip_prefix("www.").unwrap_or(&host);
+        stem = format!("{}/index", clean_host.replace('.', "_"));
+    }
+
+    // Append query params so /p?id=123 doesn't collide with /p?id=456
+    if let Some(q) = query {
+        stem = format!("{stem}_{q}");
+    }
+
+    // Sanitize: keep alphanumeric, dash, underscore, dot, slash
+    let sanitized: String = stem
+        .chars()
+        .map(|c| {
+            if c.is_alphanumeric() || matches!(c, '-' | '_' | '.' | '/') {
+                c
+            } else {
+                '_'
+            }
+        })
+        .collect();
+
+    format!("{sanitized}.{ext}")
+}
+
+/// Validate a caller-supplied (CSV `url,filename`) name so it cannot escape the
+/// output directory. Empty names, absolute paths, drive prefixes, root, and any
+/// `..` component are refused; otherwise the relative path is returned as-is.
+fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
+    use std::path::Component;
+
+    let candidate = Path::new(filename);
+    if candidate.as_os_str().is_empty() {
+        return Err("empty output filename".to_string());
+    }
+    let rejection = candidate.components().find_map(|comp| match comp {
+        Component::ParentDir => Some(format!("refusing path with '..' component: {filename}")),
+        Component::RootDir | Component::Prefix(_) => {
+            Some(format!("refusing absolute output path: {filename}"))
+        }
+        Component::Normal(_) | Component::CurDir => None,
+    });
+    match rejection {
+        Some(msg) => Err(msg),
+        None => Ok(candidate.to_path_buf()),
+    }
+}
+
+/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
+///
+/// `filename` may originate from an attacker-controlled `--urls-file`
+/// (`url,filename` CSV). It is rejected if absolute or containing `..`; the
+/// canonicalized parent of the destination must then stay under canonical `dir`
+/// before the file is written. Logs the saved path and word count to stderr.
+pub fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
+    let rel = safe_relative_filename(filename)?;
+    let dest = dir.join(&rel);
+
+    std::fs::create_dir_all(dir)
+        .map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?;
+    let base = dir
+        .canonicalize()
+        .map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;
+
+    if let Some(parent) = dest.parent() {
+        std::fs::create_dir_all(parent)
+            .map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
+        let canon_parent = parent
+            .canonicalize()
+            .map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
+        if !canon_parent.starts_with(&base) {
+            return Err(format!(
+                "refusing to write outside output dir: {}",
+                dest.display()
+            ));
+        }
+    }
+
+    std::fs::write(&dest, content)
+        .map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
+    let word_count = content.split_whitespace().count();
+    eprintln!("Saved: {} ({word_count} words)", dest.display());
+    Ok(())
+}
+
+/// Gather every URL to process: positional arguments first, then the entries
+/// of `--urls-file`, with each URL passed through `normalize_url`.
+///
+/// Each item is `(url, optional_custom_filename)`. A `--urls-file` line of the
+/// form `url,filename` (split at the first comma) yields `Some(filename)`
+/// unless the filename part is blank; positional URLs and lines without a
+/// comma yield `None` so the caller derives the filename from the URL. Blank
+/// lines and lines starting with `#` are skipped.
+///
+/// Returns an error message if the file cannot be read.
+pub fn collect_urls(cli: &Cli) -> Result<Vec<(String, Option<String>)>, String> {
+    let mut entries: Vec<(String, Option<String>)> = Vec::with_capacity(cli.urls.len());
+    for positional in &cli.urls {
+        entries.push((normalize_url(positional), None));
+    }
+
+    let Some(path) = cli.urls_file.as_ref() else {
+        return Ok(entries);
+    };
+
+    let listing =
+        std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
+
+    let from_file = listing
+        .lines()
+        .map(str::trim)
+        .filter(|row| !row.is_empty() && !row.starts_with('#'))
+        .map(|row| match row.split_once(',') {
+            // `url,filename`: a blank filename means "auto-generate".
+            Some((url_part, name_part)) => {
+                let custom = Some(name_part.trim())
+                    .filter(|name| !name.is_empty())
+                    .map(str::to_string);
+                (normalize_url(url_part.trim()), custom)
+            }
+            None => (normalize_url(row), None),
+        });
+    entries.extend(from_file);
+
+    Ok(entries)
+}
+
+/// Outcome of a fetch: either an extraction produced locally or the raw JSON
+/// body returned by the cloud API.
+pub enum FetchOutput {
+    Local(Box<ExtractionResult>),
+    Cloud(serde_json::Value),
+}
+
+impl FetchOutput {
+    /// Convert into an `ExtractionResult`.
+    ///
+    /// A local result is unboxed as-is. For a cloud response, the value under
+    /// the `"extraction"` key is deserialized first; if that key is missing or
+    /// does not deserialize, the whole response is tried instead. An error
+    /// message is returned when neither attempt succeeds.
+    pub fn into_extraction(self) -> Result<ExtractionResult, String> {
+        let resp = match self {
+            FetchOutput::Local(boxed) => return Ok(*boxed),
+            FetchOutput::Cloud(resp) => resp,
+        };
+
+        // Preferred shape: the extraction nested under "extraction".
+        if let Some(nested) = resp.get("extraction")
+            && let Ok(parsed) = serde_json::from_value::<ExtractionResult>(nested.clone())
+        {
+            return Ok(parsed);
+        }
+
+        // Fallback shape: the response body is the extraction itself.
+        serde_json::from_value::<ExtractionResult>(resp)
+            .map_err(|_| "could not parse extraction from cloud response".to_string())
+    }
+}
+
+/// Produce an extraction for the input selected on the command line.
+///
+/// Input is resolved in this order:
+/// 1. `--stdin`: HTML read from standard input and extracted locally;
+/// 2. `--file`: HTML read from the given path and extracted locally;
+/// 3. the first positional URL (normalized with `normalize_url`).
+///
+/// For a URL, `--cloud` sends the request straight to the cloud API and fails
+/// when no cloud client is available. Otherwise the page is fetched and
+/// extracted locally; if `detect_empty` reports a reason and a cloud client is
+/// available, the cloud API is tried as a fallback. When there is no cloud
+/// client, or the fallback fails, a warning is printed and the local result is
+/// returned.
+///
+/// Errors are returned as human-readable messages.
+pub async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
+    // Both local sources (stdin, file) run the same extraction on raw HTML.
+    let extract_local = |html: &str| -> Result<FetchOutput, String> {
+        let options = build_extraction_options(cli);
+        extract_with_options(html, None, &options)
+            .map(|extracted| FetchOutput::Local(Box::new(extracted)))
+            .map_err(|e| format!("extraction error: {e}"))
+    };
+
+    if cli.stdin {
+        let mut piped = String::new();
+        io::stdin()
+            .read_to_string(&mut piped)
+            .map_err(|e| format!("failed to read stdin: {e}"))?;
+        return extract_local(piped.as_str());
+    }
+
+    if let Some(path) = cli.file.as_ref() {
+        let html =
+            std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
+        return extract_local(html.as_str());
+    }
+
+    let raw_url = cli
+        .urls
+        .first()
+        .ok_or("no input provided -- pass a URL, --file, or --stdin")?;
+    let normalized = normalize_url(raw_url);
+    let url = normalized.as_str();
+
+    let cloud_client = webclaw_fetch::cloud::CloudClient::new(cli.api_key.as_deref());
+
+    // --cloud bypasses the local fetch entirely.
+    if cli.cloud {
+        let cloud =
+            cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?;
+        let options = build_extraction_options(cli);
+        let resp = cloud
+            .scrape(
+                url,
+                &[cli.format.as_api_str()],
+                &options.include_selectors,
+                &options.exclude_selectors,
+                options.only_main_content,
+            )
+            .await?;
+        return Ok(FetchOutput::Cloud(resp));
+    }
+
+    // Default path: local fetch + extraction.
+    let client =
+        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    let options = build_extraction_options(cli);
+    let result = client
+        .fetch_and_extract_with_options(url, &options)
+        .await
+        .map_err(|e| format!("fetch error: {e}"))?;
+
+    // Nothing suspicious about the local result: done.
+    let reason = detect_empty(&result);
+    if matches!(reason, EmptyReason::None) {
+        return Ok(FetchOutput::Local(Box::new(result)));
+    }
+
+    // The local result looks empty; try the cloud API when one is configured.
+    if let Some(ref cloud) = cloud_client {
+        eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
+        let attempt = cloud
+            .scrape(
+                url,
+                &[cli.format.as_api_str()],
+                &options.include_selectors,
+                &options.exclude_selectors,
+                options.only_main_content,
+            )
+            .await;
+        match attempt {
+            Ok(resp) => return Ok(FetchOutput::Cloud(resp)),
+            // Keep going: the local result is still returned, with a warning.
+            Err(e) => eprintln!("\x1b[33mwarning:\x1b[0m cloud fallback failed: {e}"),
+        }
+    }
+
+    warn_empty(url, &reason);
+    Ok(FetchOutput::Local(Box::new(result)))
+}
+
+/// Return the raw HTML for the selected input without running extraction.
+/// Used for --raw-html and brand extraction.
+///
+/// `--stdin` and `--file` are read directly and wrapped in a synthetic
+/// `FetchResult` (empty URL, status 200, default headers and elapsed time).
+/// Otherwise the first positional URL is normalized and fetched with a
+/// `FetchClient` built from the CLI flags.
+///
+/// Errors are returned as human-readable messages.
+pub async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
+    // Local sources involve no HTTP exchange, so the response fields are
+    // filled with placeholders.
+    let local_result = |html: String| FetchResult {
+        html,
+        url: String::new(),
+        status: 200,
+        headers: Default::default(),
+        elapsed: Default::default(),
+    };
+
+    if cli.stdin {
+        let mut piped = String::new();
+        io::stdin()
+            .read_to_string(&mut piped)
+            .map_err(|e| format!("failed to read stdin: {e}"))?;
+        return Ok(local_result(piped));
+    }
+
+    if let Some(path) = cli.file.as_ref() {
+        return std::fs::read_to_string(path)
+            .map(local_result)
+            .map_err(|e| format!("failed to read {path}: {e}"));
+    }
+
+    let Some(raw_url) = cli.urls.first() else {
+        return Err("no input provided -- pass a URL, --file, or --stdin".to_string());
+    };
+    let target = normalize_url(raw_url);
+
+    let client =
+        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    match client.fetch(&target).await {
+        Ok(page) => Ok(page),
+        Err(e) => Err(format!("fetch error: {e}")),
+    }
+}
+
+/// Fetch external stylesheets referenced in HTML and inject them as `<style>` blocks.
+/// This allows brand extraction to see colors/fonts from external CSS files.
+///
+/// Behavior:
+/// - `base_url` is used to resolve relative `href`s; if it does not parse, the
+///   HTML is returned unchanged.
+/// - At most the first 10 `<link rel="stylesheet">` hrefs are considered.
+/// - Each href must pass `validate_public_http_url` (presumably an SSRF guard
+///   -- confirm against `webclaw_fetch::url_security`) and is fetched with a
+///   5 second timeout and redirects disabled.
+/// - A response is used only if it has a success status, its body does not
+///   start with `<!` (an HTML page rather than CSS), and the body is smaller
+///   than 2,000,000 bytes.
+/// - The collected CSS is inserted just before `</head>` (matched ASCII
+///   case-insensitively), or prepended when there is no `</head>`.
+///
+/// This function never fails: on any problem the input HTML is returned as-is.
+pub async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {
+    let base = match url::Url::parse(base_url) {
+        Ok(u) => u,
+        Err(_) => return html.to_string(),
+    };
+
+    // Extract stylesheet hrefs from <link rel="stylesheet" href="...">,
+    // accepting either attribute order.
+    let re = regex::Regex::new(
+        r#"<link[^>]+rel=["']stylesheet["'][^>]+href=["']([^"']+)["']|<link[^>]+href=["']([^"']+)["'][^>]+rel=["']stylesheet["']"#
+    ).unwrap();
+
+    let hrefs: Vec<String> = re
+        .captures_iter(html)
+        .filter_map(|cap| {
+            let href = cap.get(1).or(cap.get(2))?;
+            Some(
+                base.join(href.as_str())
+                    .map(|u| u.to_string())
+                    .unwrap_or_else(|_| href.as_str().to_string()),
+            )
+        })
+        .take(10)
+        .collect();
+
+    if hrefs.is_empty() {
+        return html.to_string();
+    }
+
+    // Do not fall back to a default client if the hardened one cannot be
+    // built: the default client follows redirects, which would let a validated
+    // URL bounce to a target that was never validated.
+    let Ok(client) = reqwest::Client::builder()
+        .timeout(std::time::Duration::from_secs(5))
+        .redirect(reqwest::redirect::Policy::none())
+        .build()
+    else {
+        return html.to_string();
+    };
+
+    let mut extra_css = String::new();
+    for href in &hrefs {
+        if webclaw_fetch::url_security::validate_public_http_url(href)
+            .await
+            .is_err()
+        {
+            continue;
+        }
+        if let Ok(resp) = client.get(href).send().await
+            && resp.status().is_success()
+            && let Ok(body) = resp.text().await
+            && !body.trim_start().starts_with("<!")
+            && body.len() < 2_000_000
+        {
+            extra_css.push_str("\n<style>\n");
+            extra_css.push_str(&body);
+            extra_css.push_str("\n</style>\n");
+        }
+    }
+
+    if extra_css.is_empty() {
+        return html.to_string();
+    }
+
+    // ASCII-only lowercasing keeps every byte offset identical to `html`.
+    // Full Unicode `to_lowercase()` can change the byte length of some
+    // characters, which would make `pos` point at the wrong place in `html`
+    // (or at a non-char boundary, panicking on the slice below).
+    match html.to_ascii_lowercase().find("</head>") {
+        Some(pos) => {
+            let mut enriched = String::with_capacity(html.len() + extra_css.len());
+            enriched.push_str(&html[..pos]);
+            enriched.push_str(&extra_css);
+            enriched.push_str(&html[pos..]);
+            enriched
+        }
+        None => format!("{extra_css}{html}"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::cli::OutputFormat;
+    use webclaw_core::{Content, Metadata};
+
+    /// Build a minimal `ExtractionResult` whose markdown and plain text are
+    /// both `markdown` and whose word count is derived from it.
+    fn empty_result(title: Option<&str>, url: Option<&str>, markdown: &str) -> ExtractionResult {
+        let metadata = Metadata::default()
+            .with_title(title.map(str::to_string))
+            .with_url(url.map(str::to_string))
+            .with_word_count(markdown.split_whitespace().count());
+        let content = Content::default()
+            .with_markdown(markdown.to_string())
+            .with_plain_text(markdown.to_string());
+        ExtractionResult::new(metadata, content)
+    }
+
+    #[test]
+    fn detect_empty_identifies_consent_redirect_url() {
+        let result = empty_result(
+            Some("Yahoo"),
+            Some("https://guce.advertising.com/collectIdentifiers?sessionId=abc"),
+            "Continue",
+        );
+        assert_eq!(detect_empty(&result), EmptyReason::ConsentWall);
+    }
+
+    #[test]
+    fn detect_empty_identifies_short_consent_title() {
+        let result = empty_result(
+            Some("Before you continue"),
+            Some("https://www.google.com/"),
+            "Review privacy options",
+        );
+        assert_eq!(detect_empty(&result), EmptyReason::ConsentWall);
+    }
+
+    #[test]
+    fn detect_empty_does_not_flag_real_content_with_consent_words() {
+        let result = empty_result(
+            Some("Cookie consent patterns explained"),
+            Some("https://example.com/blog"),
+            "This article explains cookie consent patterns for product teams with enough real body text to be useful. It covers consent banners, privacy controls, analytics configuration, regional requirements, product tradeoffs, implementation details, testing flows, debugging notes, accessibility needs, and operational lessons from real teams shipping public websites across multiple markets. It also explains measurement, rollout planning, copy review, support workflows, design constraints, release notes, and how to keep privacy choices understandable for users.",
+        );
+        assert_eq!(detect_empty(&result), EmptyReason::None);
+    }
+
+    #[test]
+    fn url_to_filename_root() {
+        assert_eq!(
+            url_to_filename("https://example.com/", &OutputFormat::Markdown),
+            "example_com/index.md"
+        );
+        assert_eq!(
+            url_to_filename("https://example.com", &OutputFormat::Markdown),
+            "example_com/index.md"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_path() {
+        assert_eq!(
+            url_to_filename("https://example.com/docs/api", &OutputFormat::Markdown),
+            "docs/api.md"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_trailing_slash() {
+        assert_eq!(
+            url_to_filename("https://example.com/docs/api/", &OutputFormat::Markdown),
+            "docs/api.md"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_nested_path() {
+        assert_eq!(
+            url_to_filename("https://example.com/blog/my-post", &OutputFormat::Markdown),
+            "blog/my-post.md"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_query_params() {
+        assert_eq!(
+            url_to_filename("https://example.com/p?id=123", &OutputFormat::Markdown),
+            "p_id_123.md"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_json_format() {
+        assert_eq!(
+            url_to_filename("https://example.com/docs/api", &OutputFormat::Json),
+            "docs/api.json"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_text_format() {
+        assert_eq!(
+            url_to_filename("https://example.com/docs/api", &OutputFormat::Text),
+            "docs/api.txt"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_llm_format() {
+        assert_eq!(
+            url_to_filename("https://example.com/docs/api", &OutputFormat::Llm),
+            "docs/api.md"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_html_format() {
+        assert_eq!(
+            url_to_filename("https://example.com/docs/api", &OutputFormat::Html),
+            "docs/api.html"
+        );
+    }
+
+    #[test]
+    fn url_to_filename_special_chars() {
+        // Spaces and special chars get replaced with underscores
+        assert_eq!(
+            url_to_filename(
+                "https://example.com/path%20with%20spaces",
+                &OutputFormat::Markdown
+            ),
+            "path_20with_20spaces.md"
+        );
+    }
+
+    #[test]
+    fn write_to_file_creates_dirs() {
+        let dir = std::env::temp_dir().join("webclaw_test_output_dir");
+        let _ = std::fs::remove_dir_all(&dir);
+        write_to_file(&dir, "nested/deep/file.md", "hello").unwrap();
+        let content = std::fs::read_to_string(dir.join("nested/deep/file.md")).unwrap();
+        assert_eq!(content, "hello");
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn url_to_filename_strips_traversal_segments() {
+        // `..` / `.` / empty path segments must not survive into the path.
+        let out = url_to_filename(
+            "https://example.com/../../etc/passwd",
+            &OutputFormat::Markdown,
+        );
+        assert!(!out.contains(".."), "traversal leaked: {out}");
+        assert_eq!(out, "etc/passwd.md");
+        let out2 = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json);
+        assert_eq!(out2, "a/b/c.json");
+    }
+
+    #[test]
+    fn safe_relative_filename_rejects_escapes() {
+        assert!(safe_relative_filename("../escape.md").is_err());
+        assert!(safe_relative_filename("a/../../b.md").is_err());
+        assert!(safe_relative_filename("/etc/passwd").is_err());
+        assert!(safe_relative_filename("").is_err());
+        // Normal nested relative names stay allowed.
+        assert!(safe_relative_filename("nested/deep/file.md").is_ok());
+        assert!(safe_relative_filename("./ok.md").is_ok());
+    }
+
+    #[test]
+    fn write_to_file_refuses_traversal_filename() {
+        let dir = std::env::temp_dir().join("webclaw_test_traversal_dir");
+        let _ = std::fs::remove_dir_all(&dir);
+
+        // Where `../../tmp/webclaw_pwned.md` would land relative to `dir`,
+        // computed lexically. `dir` does not exist at this point, so asking
+        // the OS to resolve the `..` segments would always report "missing",
+        // and a hard-coded `/tmp/...` is only right where temp_dir() is `/tmp`
+        // (it is not on macOS).
+        let escaped = dir
+            .parent()
+            .and_then(|tmp| tmp.parent())
+            .map(|above_tmp| above_tmp.join("tmp/webclaw_pwned.md"));
+        // Remove leftovers from an earlier failing run so they cannot cause a
+        // false failure below.
+        if let Some(ref target) = escaped {
+            let _ = std::fs::remove_file(target);
+        }
+
+        // CSV-supplied `url,filename` traversal attempt.
+        let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err();
+        assert!(err.contains("refusing"), "unexpected error: {err}");
+        if let Some(ref target) = escaped {
+            assert!(
+                !target.exists(),
+                "traversal write escaped the output dir"
+            );
+        }
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+}
diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index 1348824..39088be 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -1,456 +1,28 @@
 /// CLI entry point -- wires webclaw-core and webclaw-fetch into a single command.
-/// All extraction and fetching logic lives in sibling crates; this is pure plumbing.
+/// All extraction and fetching logic lives in sibling crates and modules; this
+/// file is the argument parser plus dispatch.
 mod bench;
+mod cli;
+mod fetch;
+mod output;
+mod run;
+mod webhook;
 
-use std::io::{self, Read as _};
-use std::path::{Path, PathBuf};
 use std::process;
-use std::sync::Arc;
-use std::sync::atomic::{AtomicBool, Ordering};
 
-use clap::{Parser, Subcommand, ValueEnum};
+use clap::Parser;
 use tracing_subscriber::EnvFilter;
-use webclaw_core::{
-    ChangeStatus, ContentDiff, ExtractionOptions, ExtractionResult, Metadata, extract_with_options,
-    to_llm_text,
+
+use cli::{Cli, Commands};
+use fetch::{
+    FetchOutput, collect_urls, fetch_and_extract, fetch_html, normalize_url, url_to_filename,
+    write_to_file,
 };
-use webclaw_fetch::{
-    BatchExtractResult, BrowserProfile, CrawlConfig, CrawlResult, Crawler, FetchClient,
-    FetchConfig, FetchResult, PageResult, SitemapEntry,
+use output::{format_output, print_cloud_output, print_output};
+use run::{
+    has_llm_flags, run_batch, run_batch_llm, run_brand, run_crawl, run_diff, run_llm, run_map,
+    run_research, run_watch,
 };
-use webclaw_llm::LlmProvider;
-use webclaw_pdf::PdfMode;
-
-/// Known anti-bot challenge page titles (case-insensitive prefix match).
-const ANTIBOT_TITLES: &[&str] = &[
-    "just a moment",
-    "attention required",
-    "access denied",
-    "checking your browser",
-    "please wait",
-    "one more step",
-    "verify you are human",
-    "bot verification",
-    "security check",
-    "ddos protection",
-];
-
-/// URL host/path fragments that indicate a GDPR/cookie consent redirect.
-const CONSENT_URL_FRAGMENTS: &[&str] = &[
-    "://consent.",
-    "/consent?",
-    "/consent/",
-    "collectconsent",
-    "consentcheck",
-    "/cmp/",
-    "guce.advertising.com",
-];
-
-/// English consent-wall title prefixes. Many providers localize this page, so
-/// this is a best-effort secondary signal. URL shape is the primary signal.
-const CONSENT_TITLES: &[&str] = &[
-    "before you continue",
-    "your privacy choices",
-    "we value your privacy",
-    "we care about your privacy",
-    "cookie consent",
-    "consent required",
-];
-
-/// Detect why a page returned empty or near-empty content.
-#[derive(Debug, PartialEq, Eq)]
-enum EmptyReason {
-    /// Anti-bot challenge page (Cloudflare, Akamai, etc.)
-    Antibot,
-    /// GDPR/cookie consent redirect.
-    ConsentWall,
-    /// JS-only SPA that returns an empty shell without a browser
-    JsRequired,
-    /// Page has content.
-    None,
-}
-
-fn detect_empty(result: &ExtractionResult) -> EmptyReason {
-    // Consent walls can have a tiny body, so check before the content
-    // short-circuit.
-    if is_consent_wall(result) {
-        return EmptyReason::ConsentWall;
-    }
-
-    // Has real content. Nothing to warn about.
-    if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
-        return EmptyReason::None;
-    }
-
-    // Check for known anti-bot challenge titles
-    if let Some(ref title) = result.metadata.title {
-        let lower = title.to_lowercase();
-        if ANTIBOT_TITLES.iter().any(|t| lower.starts_with(t)) {
-            return EmptyReason::Antibot;
-        }
-    }
-
-    // Empty content with no title or a generic SPA shell = JS-only site
-    if result.metadata.word_count == 0 && result.content.links.is_empty() {
-        return EmptyReason::JsRequired;
-    }
-
-    EmptyReason::None
-}
-
-/// A consent wall is identified by either:
-/// 1. The final URL pointing at a known consent host/path, or
-/// 2. A consent-wall title prefix with a very small body.
-fn is_consent_wall(result: &ExtractionResult) -> bool {
-    if let Some(ref url) = result.metadata.url {
-        let lower = url.to_ascii_lowercase();
-        if CONSENT_URL_FRAGMENTS
-            .iter()
-            .any(|fragment| lower.contains(fragment))
-        {
-            return true;
-        }
-    }
-
-    if result.metadata.word_count <= 50
-        && let Some(ref title) = result.metadata.title
-    {
-        let lower = title.to_lowercase();
-        if CONSENT_TITLES
-            .iter()
-            .any(|prefix| lower.starts_with(prefix))
-        {
-            return true;
-        }
-    }
-
-    false
-}
-
-fn warn_empty(url: &str, reason: &EmptyReason) {
-    match reason {
-        EmptyReason::Antibot => eprintln!(
-            "\x1b[33mwarning:\x1b[0m Anti-bot protection detected on {url}\n\
-             This site requires CAPTCHA solving or browser rendering.\n\
-             Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
-        ),
-        EmptyReason::ConsentWall => eprintln!(
-            "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\
-             The site redirected to a consent page and returned no usable content.\n\
-             Try a different region via --proxy, or pass a pre-accepted consent cookie\n\
-             via --cookie / --cookie-file."
-        ),
-        EmptyReason::JsRequired => eprintln!(
-            "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
-             This site requires JavaScript rendering (SPA).\n\
-             Use the webclaw Cloud API for JS rendering: https://webclaw.io/pricing"
-        ),
-        EmptyReason::None => {}
-    }
-}
-
-#[derive(Parser)]
-#[command(name = "webclaw", about = "Extract web content for LLMs", version)]
-struct Cli {
-    /// Optional subcommand. When omitted, the CLI falls back to the
-    /// traditional flag-based flow (URL + --format, --crawl, etc.).
-    /// Subcommands are used for flows that don't fit that model.
-    #[command(subcommand)]
-    command: Option<Commands>,
-
-    /// URLs to fetch (multiple allowed)
-    #[arg()]
-    urls: Vec<String>,
-
-    /// File with URLs (one per line)
-    #[arg(long)]
-    urls_file: Option<String>,
-
-    /// Output format (markdown, json, text, llm, html)
-    #[arg(short, long, default_value = "markdown")]
-    format: OutputFormat,
-
-    /// Browser to impersonate
-    #[arg(short, long, default_value = "chrome")]
-    browser: Browser,
-
-    /// Proxy URL (http://user:pass@host:port or socks5://host:port)
-    #[arg(short, long, env = "WEBCLAW_PROXY")]
-    proxy: Option<String>,
-
-    /// File with proxies (host:port:user:pass, one per line). Rotates per request.
-    #[arg(long, env = "WEBCLAW_PROXY_FILE")]
-    proxy_file: Option<String>,
-
-    /// Request timeout in seconds
-    #[arg(short, long, default_value = "30")]
-    timeout: u64,
-
-    /// Extract from local HTML file instead of fetching
-    #[arg(long)]
-    file: Option<String>,
-
-    /// Read HTML from stdin
-    #[arg(long)]
-    stdin: bool,
-
-    /// Include metadata in output (always included in JSON)
-    #[arg(long)]
-    metadata: bool,
-
-    /// Output raw fetched HTML instead of extracting
-    #[arg(long)]
-    raw_html: bool,
-
-    /// CSS selectors to include (comma-separated, e.g. "article,.content")
-    #[arg(long)]
-    include: Option<String>,
-
-    /// CSS selectors to exclude (comma-separated, e.g. "nav,.sidebar,footer")
-    #[arg(long)]
-    exclude: Option<String>,
-
-    /// Only extract main content (article/main element)
-    #[arg(long)]
-    only_main_content: bool,
-
-    /// Custom headers (repeatable, e.g. -H "Cookie: foo=bar")
-    #[arg(short = 'H', long = "header")]
-    headers: Vec<String>,
-
-    /// Cookie string (shorthand for -H "Cookie: ...")
-    #[arg(long)]
-    cookie: Option<String>,
-
-    /// JSON cookie file (Chrome extension format: [{name, value, domain, ...}])
-    #[arg(long)]
-    cookie_file: Option<String>,
-
-    /// Enable verbose logging
-    #[arg(short, long)]
-    verbose: bool,
-
-    /// Compare against a previous JSON snapshot
-    #[arg(long)]
-    diff_with: Option<String>,
-
-    /// Watch a URL for changes. Checks at the specified interval and reports diffs.
-    #[arg(long)]
-    watch: bool,
-
-    /// Watch interval in seconds [default: 300]
-    #[arg(long, default_value = "300")]
-    watch_interval: u64,
-
-    /// Command to run when changes are detected (receives diff JSON on stdin)
-    #[arg(long)]
-    on_change: Option<String>,
-
-    /// Webhook URL: POST a JSON payload when an operation completes.
-    /// Works with crawl, batch, watch (on change), and single URL modes.
-    #[arg(long, env = "WEBCLAW_WEBHOOK_URL")]
-    webhook: Option<String>,
-
-    /// Extract brand identity (colors, fonts, logo)
-    #[arg(long)]
-    brand: bool,
-
-    // -- PDF options --
-    /// PDF extraction mode: auto (error on empty) or fast (return whatever text is found)
-    #[arg(long, default_value = "auto")]
-    pdf_mode: PdfModeArg,
-
-    // -- Crawl options --
-    /// Enable recursive crawling of same-domain links
-    #[arg(long)]
-    crawl: bool,
-
-    /// Max crawl depth [default: 1]
-    #[arg(long, default_value = "1")]
-    depth: usize,
-
-    /// Max pages to crawl [default: 20]
-    #[arg(long, default_value = "20")]
-    max_pages: usize,
-
-    /// Max concurrent requests [default: 5]
-    #[arg(long, default_value = "5")]
-    concurrency: usize,
-
-    /// Delay between requests in ms [default: 100]
-    #[arg(long, default_value = "100")]
-    delay: u64,
-
-    /// Only crawl URLs matching this path prefix
-    #[arg(long)]
-    path_prefix: Option<String>,
-
-    /// Glob patterns for crawl URL paths to include (comma-separated, e.g. "/api/*,/guides/**")
-    #[arg(long)]
-    include_paths: Option<String>,
-
-    /// Glob patterns for crawl URL paths to exclude (comma-separated, e.g. "/changelog/*,/blog/*")
-    #[arg(long)]
-    exclude_paths: Option<String>,
-
-    /// Path to save/resume crawl state. On Ctrl+C: saves progress. On start: resumes if file exists.
-    #[arg(long)]
-    crawl_state: Option<PathBuf>,
-
-    /// Seed crawl frontier from sitemap discovery (robots.txt + /sitemap.xml)
-    #[arg(long)]
-    sitemap: bool,
-
-    /// Discover URLs from sitemap and print them (one per line; JSON array with --format json)
-    #[arg(long)]
-    map: bool,
-
-    // -- LLM options --
-    /// Extract structured JSON using LLM (pass a JSON schema string or @file)
-    #[arg(long)]
-    extract_json: Option<String>,
-
-    /// Extract using natural language prompt
-    #[arg(long)]
-    extract_prompt: Option<String>,
-
-    /// Summarize content using LLM (optional: number of sentences, default 3)
-    #[arg(long, num_args = 0..=1, default_missing_value = "3")]
-    summarize: Option<usize>,
-
-    /// Force a specific LLM provider (ollama, openai, anthropic)
-    #[arg(long, env = "WEBCLAW_LLM_PROVIDER")]
-    llm_provider: Option<String>,
-
-    /// Override the LLM model name
-    #[arg(long, env = "WEBCLAW_LLM_MODEL")]
-    llm_model: Option<String>,
-
-    /// Override the LLM base URL (Ollama, OpenAI-compatible, or Anthropic-compatible)
-    #[arg(long, env = "WEBCLAW_LLM_BASE_URL")]
-    llm_base_url: Option<String>,
-
-    // -- Cloud API options --
-    /// Webclaw Cloud API key for automatic fallback on bot-protected or JS-rendered sites
-    #[arg(long, env = "WEBCLAW_API_KEY")]
-    api_key: Option<String>,
-
-    /// Force all requests through the cloud API (skip local extraction)
-    #[arg(long)]
-    cloud: bool,
-
-    /// Run deep research on a topic via the cloud API. Requires --api-key.
-    /// Saves full result (report + sources + findings) to a JSON file.
-    #[arg(long)]
-    research: Option<String>,
-
-    /// Enable deep research mode (longer, more thorough report). Used with --research.
-    #[arg(long)]
-    deep: bool,
-
-    /// Output directory: save each page to a separate file instead of stdout.
-    /// Works with --crawl, batch (multiple URLs), and single URL mode.
-    /// Filenames are derived from URL paths (e.g. /docs/api -> docs/api.md).
-    #[arg(long)]
-    output_dir: Option<PathBuf>,
-}
-
-#[derive(Subcommand)]
-enum Commands {
-    /// Per-URL extraction micro-benchmark: compares raw HTML vs. the
-    /// webclaw --format llm output on token count, bytes, and
-    /// extraction time. Uses an approximate tokenizer (see `--help`).
-    Bench {
-        /// URL to benchmark.
-        url: String,
-
-        /// Emit a single JSON line instead of the ASCII table.
-        /// Machine-readable shape stable across releases.
-        #[arg(long)]
-        json: bool,
-
-        /// Optional path to a facts.json (same schema as the repo's
-        /// benchmarks/facts.json) for a fidelity column.
-        #[arg(long)]
-        facts: Option<PathBuf>,
-    },
-
-    /// List all vertical extractors in the catalog.
-    ///
-    /// Each entry has a stable `name` (usable with `webclaw vertical <name>`),
-    /// a human-friendly label, a one-line description, and the URL
-    /// patterns it claims. The same data is served by `/v1/extractors`
-    /// when running the REST API.
-    Extractors {
-        /// Emit JSON instead of a human-friendly table.
-        #[arg(long)]
-        json: bool,
-    },
-
-    /// Run a vertical extractor by name. Returns typed JSON with fields
-    /// specific to the target site (title, price, author, rating, etc.)
-    /// rather than generic markdown.
-    ///
-    /// Use `webclaw extractors` to see the full list. Example:
-    /// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`.
-    Vertical {
-        /// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`).
-        name: String,
-        /// URL to extract.
-        url: String,
-        /// Emit compact JSON (single line). Default is pretty-printed.
-        #[arg(long)]
-        raw: bool,
-    },
-}
-
-#[derive(Clone, ValueEnum)]
-enum OutputFormat {
-    Markdown,
-    Json,
-    Text,
-    Llm,
-    Html,
-}
-
-#[derive(Clone, ValueEnum)]
-enum Browser {
-    Chrome,
-    Firefox,
-    /// Safari iOS 26. Pair with a country-matched residential proxy for sites
-    /// that reject non-mobile profiles.
-    SafariIos,
-    Random,
-}
-
-#[derive(Clone, ValueEnum, Default)]
-enum PdfModeArg {
-    /// Error if PDF has no extractable text (catches scanned PDFs)
-    #[default]
-    Auto,
-    /// Return whatever text is found, even if empty
-    Fast,
-}
-
-impl From<PdfModeArg> for PdfMode {
-    fn from(arg: PdfModeArg) -> Self {
-        match arg {
-            PdfModeArg::Auto => PdfMode::Auto,
-            PdfModeArg::Fast => PdfMode::Fast,
-        }
-    }
-}
-
-impl From<Browser> for BrowserProfile {
-    fn from(b: Browser) -> Self {
-        match b {
-            Browser::Chrome => BrowserProfile::Chrome,
-            Browser::Firefox => BrowserProfile::Firefox,
-            Browser::SafariIos => BrowserProfile::SafariIos,
-            Browser::Random => BrowserProfile::Random,
-        }
-    }
-}
 
 fn init_logging(verbose: bool) {
     // html5ever / markup5ever / selectors emit WARN on common real-world HTML
@@ -466,1981 +38,6 @@ fn init_logging(verbose: bool) {
     tracing_subscriber::fmt().with_env_filter(filter).init();
 }
 
-/// Build FetchConfig from CLI flags.
-///
-/// Proxy selection, in priority order:
-/// 1. `--proxy` sets a single static proxy (no rotation).
-/// 2. `--proxy-file` loads a pool of proxies and rotates per-request.
-/// 3. A `proxies.txt` in the working directory is auto-loaded as a pool.
-///
-/// Headers start from a default `Accept-Language`, then `-H "Name: Value"`
-/// flags are applied, then `--cookie`, then cookies from `--cookie-file`
-/// (appended to any existing `Cookie` header).
-///
-/// A proxy file that cannot be parsed only produces a warning (the fetch
-/// proceeds without proxies). A cookie file that cannot be parsed is fatal:
-/// the process exits with status 1.
-fn build_fetch_config(cli: &Cli) -> FetchConfig {
-    let (proxy, proxy_pool) = if cli.proxy.is_some() {
-        (cli.proxy.clone(), Vec::new())
-    } else if let Some(ref path) = cli.proxy_file {
-        match webclaw_fetch::parse_proxy_file(path) {
-            Ok(pool) => (None, pool),
-            Err(e) => {
-                eprintln!("warning: {e}");
-                (None, Vec::new())
-            }
-        }
-    } else if std::path::Path::new("proxies.txt").exists() {
-        // Auto-load proxies.txt from working directory if present
-        match webclaw_fetch::parse_proxy_file("proxies.txt") {
-            Ok(pool) if !pool.is_empty() => {
-                eprintln!("loaded {} proxies from proxies.txt", pool.len());
-                (None, pool)
-            }
-            _ => (None, Vec::new()),
-        }
-    } else {
-        (None, Vec::new())
-    };
-
-    let mut headers = std::collections::HashMap::from([(
-        "Accept-Language".to_string(),
-        "en-US,en;q=0.9".to_string(),
-    )]);
-
-    // Parse -H "Key: Value" flags. A value without a colon, or with an empty
-    // header name, cannot be sent; tell the user instead of silently
-    // dropping it (or inserting a header with an empty name).
-    for h in &cli.headers {
-        match h.split_once(':') {
-            Some((key, val)) if !key.trim().is_empty() => {
-                headers.insert(key.trim().to_string(), val.trim().to_string());
-            }
-            _ => {
-                eprintln!("warning: ignoring malformed header {h:?} (expected \"Name: Value\")");
-            }
-        }
-    }
-
-    // --cookie shorthand
-    if let Some(ref cookie) = cli.cookie {
-        headers.insert("Cookie".to_string(), cookie.clone());
-    }
-
-    // --cookie-file: parse JSON array of {name, value, domain, ...}
-    if let Some(ref path) = cli.cookie_file {
-        match parse_cookie_file(path) {
-            Ok(cookie_str) => {
-                // Merge with existing cookies if --cookie (or -H Cookie) was also provided
-                let merged = match headers.get("Cookie") {
-                    Some(existing) => format!("{existing}; {cookie_str}"),
-                    None => cookie_str,
-                };
-                headers.insert("Cookie".to_string(), merged);
-            }
-            Err(e) => {
-                eprintln!("error: failed to parse cookie file: {e}");
-                process::exit(1);
-            }
-        }
-    }
-
-    FetchConfig {
-        browser: cli.browser.clone().into(),
-        proxy,
-        proxy_pool,
-        timeout: std::time::Duration::from_secs(cli.timeout),
-        pdf_mode: cli.pdf_mode.clone().into(),
-        headers,
-        ..Default::default()
-    }
-}
-
-/// Turn a JSON cookie export (Chrome extension format) into a `Cookie`
-/// header value.
-///
-/// The file must hold a JSON array of objects. Only the string fields `name`
-/// and `value` are read; entries lacking either are skipped.
-///
-/// Returns the `name=value` pairs joined with `"; "`, or an error message
-/// when the file is unreadable, is not a JSON array, or yields no pairs.
-fn parse_cookie_file(path: &str) -> Result<String, String> {
-    let raw = match std::fs::read_to_string(path) {
-        Ok(text) => text,
-        Err(e) => return Err(format!("cannot read {path}: {e}")),
-    };
-    let entries: Vec<serde_json::Value> = match serde_json::from_str(&raw) {
-        Ok(list) => list,
-        Err(e) => return Err(format!("invalid JSON: {e}")),
-    };
-
-    let mut pairs: Vec<String> = Vec::with_capacity(entries.len());
-    for entry in &entries {
-        let name = entry.get("name").and_then(|v| v.as_str());
-        let value = entry.get("value").and_then(|v| v.as_str());
-        if let (Some(name), Some(value)) = (name, value) {
-            pairs.push(format!("{name}={value}"));
-        }
-    }
-
-    if pairs.is_empty() {
-        Err("no cookies found in file".to_string())
-    } else {
-        Ok(pairs.join("; "))
-    }
-}
-
-/// Build `ExtractionOptions` from the CLI flags.
-///
-/// `--include` / `--exclude` are comma-separated selector lists. Each entry
-/// is trimmed, and empty entries (from a trailing or doubled comma, or an
-/// all-whitespace item) are dropped so no empty selector is passed on.
-/// Raw HTML is requested when `--raw-html` is set or the output format is
-/// `Html`.
-fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
-    ExtractionOptions {
-        include_selectors: cli
-            .include
-            .as_deref()
-            .map(|raw| {
-                raw.split(',')
-                    .map(str::trim)
-                    .filter(|sel| !sel.is_empty())
-                    .map(String::from)
-                    .collect()
-            })
-            .unwrap_or_default(),
-        exclude_selectors: cli
-            .exclude
-            .as_deref()
-            .map(|raw| {
-                raw.split(',')
-                    .map(str::trim)
-                    .filter(|sel| !sel.is_empty())
-                    .map(String::from)
-                    .collect()
-            })
-            .unwrap_or_default(),
-        only_main_content: cli.only_main_content,
-        include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
-    }
-}
-
-/// Normalize a URL: prepend `https://` if no scheme is present.
-///
-/// Surrounding whitespace is trimmed first. A scheme counts as present only
-/// when the text before the first `://` is a syntactically valid scheme (an
-/// ASCII letter followed by ASCII letters, digits, `+`, `-` or `.`). This
-/// keeps a bare host whose query string embeds another URL, such as
-/// `example.com/r?u=http://x`, from being mistaken for an absolute URL.
-fn normalize_url(url: &str) -> String {
-    let trimmed = url.trim();
-    let has_scheme = trimmed.split_once("://").is_some_and(|(scheme, _)| {
-        let mut chars = scheme.chars();
-        chars.next().is_some_and(|c| c.is_ascii_alphabetic())
-            && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
-    });
-
-    if has_scheme {
-        trimmed.to_string()
-    } else {
-        format!("https://{trimmed}")
-    }
-}
-
-/// Derive a filename from a URL for `--output-dir`.
-///
-/// Strips the scheme/host, maps the path to a filesystem path, and appends
-/// an extension matching the output format. The result is always a relative
-/// path with no `.` / `..` components:
-///
-/// * empty, `.` and `..` path segments are dropped;
-/// * a root URL becomes `<host_with_underscores>/index`;
-/// * the query string is appended after `_`, with every character outside
-///   `[alphanumeric - _ .]` (including `/`) replaced by `_`, so it can never
-///   add directory levels or a `..` component;
-/// * input that does not parse as a URL falls back to the host `unknown`
-///   rather than producing a name that starts with `/`.
-fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
-    let ext = match format {
-        OutputFormat::Markdown | OutputFormat::Llm => "md",
-        OutputFormat::Json => "json",
-        OutputFormat::Text => "txt",
-        OutputFormat::Html => "html",
-    };
-
-    let (host, path, query) = match url::Url::parse(raw_url) {
-        Ok(u) => (
-            u.host_str().unwrap_or("unknown").to_string(),
-            u.path().to_string(),
-            u.query().map(String::from),
-        ),
-        // An empty host here would yield "/index", i.e. an absolute path.
-        Err(_) => ("unknown".to_string(), String::new(), None),
-    };
-
-    // Drop empty / "." / ".." path segments so a URL path like
-    // `/../../etc/passwd` can't climb out of the output directory.
-    let cleaned_path: String = path
-        .split('/')
-        .filter(|seg| !seg.is_empty() && *seg != "." && *seg != "..")
-        .collect::<Vec<_>>()
-        .join("/");
-
-    let mut stem = cleaned_path;
-    if stem.is_empty() {
-        // Use hostname for root URLs to avoid collisions in batch mode
-        let clean_host = host.strip_prefix("www.").unwrap_or(&host);
-        stem = format!("{}/index", clean_host.replace('.', "_"));
-    }
-
-    // Keep alphanumeric, dash, underscore, dot; `/` only where it is a real
-    // path separator (never inside the query part).
-    let sanitize = |text: &str, keep_slash: bool| -> String {
-        text.chars()
-            .map(|c| {
-                let allowed = c.is_alphanumeric()
-                    || matches!(c, '-' | '_' | '.')
-                    || (keep_slash && c == '/');
-                if allowed { c } else { '_' }
-            })
-            .collect()
-    };
-
-    let mut name = sanitize(&stem, true);
-
-    // Append query params so /p?id=123 doesn't collide with /p?id=456
-    if let Some(q) = query {
-        name.push('_');
-        name.push_str(&sanitize(&q, false));
-    }
-
-    format!("{name}.{ext}")
-}
-
-/// Reject a caller-supplied (CSV `url,filename`) name that could escape the
-/// output directory: absolute paths, drive prefixes, root, or any `..`
-/// component. Returns the validated relative path on success.
-///
-/// A name with no normal component at all (empty, `.`, `./`) is rejected as
-/// well: it names the output directory itself, not a file inside it.
-///
-/// # Errors
-/// Returns a human-readable message describing why the name was refused.
-fn safe_relative_filename(filename: &str) -> Result<PathBuf, String> {
-    use std::path::Component;
-
-    let candidate = Path::new(filename);
-    let mut names_a_file = false;
-    for comp in candidate.components() {
-        match comp {
-            Component::Normal(_) => names_a_file = true,
-            Component::CurDir => {}
-            Component::ParentDir => {
-                return Err(format!("refusing path with '..' component: {filename}"));
-            }
-            Component::RootDir | Component::Prefix(_) => {
-                return Err(format!("refusing absolute output path: {filename}"));
-            }
-        }
-    }
-    if !names_a_file {
-        return Err("empty output filename".to_string());
-    }
-    Ok(candidate.to_path_buf())
-}
-
-/// Write extraction output to a file inside `dir`, creating parent dirs as needed.
-///
-/// `filename` may originate from an attacker-controlled `--urls-file`
-/// (`url,filename` CSV). It is validated for traversal, and the canonical
-/// destination directory is asserted to stay under the canonical output
-/// directory before any write.
-///
-/// On success a `Saved: <path> (<n> words)` line is printed to stderr.
-///
-/// # Errors
-/// Returns a message when the filename is rejected, a directory cannot be
-/// created or resolved, the destination's parent resolves outside `dir`, or
-/// the write itself fails.
-fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
-    // Lexical check first: no `..`, no absolute path, no drive prefix.
-    let rel = safe_relative_filename(filename)?;
-    let dest = dir.join(&rel);
-
-    // The output dir must exist before it can be canonicalized.
-    std::fs::create_dir_all(dir)
-        .map_err(|e| format!("failed to create directory {}: {e}", dir.display()))?;
-    let base = dir
-        .canonicalize()
-        .map_err(|e| format!("failed to resolve output dir {}: {e}", dir.display()))?;
-
-    if let Some(parent) = dest.parent() {
-        std::fs::create_dir_all(parent)
-            .map_err(|e| format!("failed to create directory {}: {e}", parent.display()))?;
-        // Compare canonical paths so a symlinked intermediate directory that
-        // points outside the output dir is caught.
-        let canon_parent = parent
-            .canonicalize()
-            .map_err(|e| format!("failed to resolve {}: {e}", parent.display()))?;
-        if !canon_parent.starts_with(&base) {
-            return Err(format!(
-                "refusing to write outside output dir: {}",
-                dest.display()
-            ));
-        }
-    }
-
-    // NOTE(review): only the parent directory is canonicalized; if `dest`
-    // itself already exists as a symlink the write presumably follows it.
-    // Confirm whether that case needs guarding.
-    std::fs::write(&dest, content)
-        .map_err(|e| format!("failed to write {}: {e}", dest.display()))?;
-    let word_count = content.split_whitespace().count();
-    eprintln!("Saved: {} ({word_count} words)", dest.display());
-    Ok(())
-}
-
-/// Return the raw HTML captured for `result`, or its markdown rendering when
-/// no raw HTML was stored.
-fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
-    match result.content.raw_html.as_deref() {
-        Some(html) => html,
-        None => result.content.markdown.as_str(),
-    }
-}
-
-/// Render an `ExtractionResult` as a string in the requested output format.
-///
-/// `show_metadata` only affects `Markdown`, where it prepends the YAML
-/// front matter. Markdown output also gets a fenced "Structured Data" JSON
-/// section when the result carries structured data.
-///
-/// Panics if JSON serialization of the result fails.
-fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
-    match format {
-        OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
-        OutputFormat::Text => result.content.plain_text.clone(),
-        OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
-        OutputFormat::Html => raw_html_or_markdown(result).to_string(),
-        OutputFormat::Markdown => {
-            let mut rendered = if show_metadata {
-                format_frontmatter(&result.metadata)
-            } else {
-                String::new()
-            };
-            rendered.push_str(&result.content.markdown);
-
-            if !result.structured_data.is_empty() {
-                let structured =
-                    serde_json::to_string_pretty(&result.structured_data).unwrap_or_default();
-                rendered.push_str("\n\n## Structured Data\n\n```json\n");
-                rendered.push_str(&structured);
-                rendered.push_str("\n```");
-            }
-            rendered
-        }
-    }
-}
-
-/// Gather every URL to process: positional arguments first, then the lines
-/// of `--urls-file`, each normalized with `normalize_url`.
-///
-/// Returns `(url, optional_custom_filename)` pairs. In the URLs file, blank
-/// lines and lines starting with `#` are ignored; a `url,filename` line
-/// supplies a custom filename (split at the first comma), while a plain line
-/// or an empty name yields `None` so the caller derives the name from the URL.
-///
-/// Fails only when the URLs file cannot be read.
-fn collect_urls(cli: &Cli) -> Result<Vec<(String, Option<String>)>, String> {
-    let mut entries: Vec<(String, Option<String>)> = Vec::new();
-    for positional in &cli.urls {
-        entries.push((normalize_url(positional), None));
-    }
-
-    let Some(path) = cli.urls_file.as_ref() else {
-        return Ok(entries);
-    };
-    let content =
-        std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
-
-    let meaningful = content
-        .lines()
-        .map(str::trim)
-        .filter(|line| !line.is_empty() && !line.starts_with('#'));
-
-    for line in meaningful {
-        let entry = match line.split_once(',') {
-            Some((url_part, name_part)) => {
-                let name = name_part.trim();
-                let custom = (!name.is_empty()).then(|| name.to_string());
-                (normalize_url(url_part.trim()), custom)
-            }
-            None => (normalize_url(line), None),
-        };
-        entries.push(entry);
-    }
-
-    Ok(entries)
-}
-
-/// Result that can be either a local extraction or a cloud API JSON response.
-enum FetchOutput {
-    /// Extraction produced locally (boxed `ExtractionResult`).
-    Local(Box<ExtractionResult>),
-    /// Raw JSON body returned by the cloud API, not yet parsed.
-    Cloud(serde_json::Value),
-}
-
-impl FetchOutput {
-    /// Consume the output and produce an `ExtractionResult`.
-    ///
-    /// A local result is unboxed as-is. A cloud response is parsed from its
-    /// `"extraction"` field when that deserializes, otherwise from the whole
-    /// response body; if neither works an error message is returned.
-    fn into_extraction(self) -> Result<ExtractionResult, String> {
-        let resp = match self {
-            FetchOutput::Local(boxed) => return Ok(*boxed),
-            FetchOutput::Cloud(resp) => resp,
-        };
-
-        // Preferred shape: the full ExtractionResult nested under "extraction".
-        if let Some(nested) = resp.get("extraction")
-            && let Ok(parsed) = serde_json::from_value(nested.clone())
-        {
-            return Ok(parsed);
-        }
-
-        // Otherwise the response body itself may be the ExtractionResult.
-        serde_json::from_value(resp)
-            .map_err(|_| "could not parse extraction from cloud response".to_string())
-    }
-}
-
-/// Obtain content from the configured input and extract it.
-///
-/// Input priority: `--stdin`, then `--file` (both extracted locally as HTML),
-/// then the first positional URL. With `--cloud` the URL goes straight to the
-/// cloud API (an API key is required). Otherwise the URL is fetched and
-/// extracted locally; if `detect_empty` reports a reason and a cloud client
-/// is available, the cloud API is tried as a fallback. When the fallback is
-/// unavailable or fails, the local result is returned after a warning.
-async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
-    /// Format name sent to the cloud API for each output format.
-    fn cloud_format_name(format: &OutputFormat) -> &'static str {
-        match format {
-            OutputFormat::Markdown => "markdown",
-            OutputFormat::Json => "json",
-            OutputFormat::Text => "text",
-            OutputFormat::Llm => "llm",
-            OutputFormat::Html => "html",
-        }
-    }
-
-    /// Extract already-loaded HTML that has no source URL.
-    fn extract_local(html: &str, options: &ExtractionOptions) -> Result<FetchOutput, String> {
-        extract_with_options(html, None, options)
-            .map(|r| FetchOutput::Local(Box::new(r)))
-            .map_err(|e| format!("extraction error: {e}"))
-    }
-
-    let options = build_extraction_options(cli);
-
-    // Local sources: read and extract as HTML
-    if cli.stdin {
-        let mut piped = String::new();
-        io::stdin()
-            .read_to_string(&mut piped)
-            .map_err(|e| format!("failed to read stdin: {e}"))?;
-        return extract_local(&piped, &options);
-    }
-
-    if let Some(path) = cli.file.as_ref() {
-        let html =
-            std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
-        return extract_local(&html, &options);
-    }
-
-    let raw_url = cli
-        .urls
-        .first()
-        .ok_or("no input provided -- pass a URL, --file, or --stdin")?;
-    let normalized = normalize_url(raw_url);
-    let url = normalized.as_str();
-
-    let cloud_client = webclaw_fetch::cloud::CloudClient::new(cli.api_key.as_deref());
-    let format_str = cloud_format_name(&cli.format);
-
-    // --cloud: skip local, go straight to cloud API
-    if cli.cloud {
-        let c =
-            cloud_client.ok_or("--cloud requires WEBCLAW_API_KEY (set via env or --api-key)")?;
-        let resp = c
-            .scrape(
-                url,
-                &[format_str],
-                &options.include_selectors,
-                &options.exclude_selectors,
-                options.only_main_content,
-            )
-            .await?;
-        return Ok(FetchOutput::Cloud(resp));
-    }
-
-    // Normal path: try local first
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
-    let result = client
-        .fetch_and_extract_with_options(url, &options)
-        .await
-        .map_err(|e| format!("fetch error: {e}"))?;
-
-    // A usable local result needs no fallback.
-    let reason = detect_empty(&result);
-    if matches!(reason, EmptyReason::None) {
-        return Ok(FetchOutput::Local(Box::new(result)));
-    }
-
-    if let Some(c) = cloud_client.as_ref() {
-        eprintln!("\x1b[36minfo:\x1b[0m falling back to cloud API...");
-        let outcome = c
-            .scrape(
-                url,
-                &[format_str],
-                &options.include_selectors,
-                &options.exclude_selectors,
-                options.only_main_content,
-            )
-            .await;
-        match outcome {
-            Ok(resp) => return Ok(FetchOutput::Cloud(resp)),
-            // Keep going: the local result is still returned, with a warning.
-            Err(e) => eprintln!("\x1b[33mwarning:\x1b[0m cloud fallback failed: {e}"),
-        }
-    }
-
-    warn_empty(url, &reason);
-    Ok(FetchOutput::Local(Box::new(result)))
-}
-
-/// Load raw HTML without running extraction. Used for --raw-html and brand
-/// extraction.
-///
-/// Input priority: `--stdin`, then `--file`, then the first positional URL.
-/// Stdin and file input are wrapped in a synthetic `FetchResult` with an
-/// empty URL, status 200 and default headers/elapsed time.
-async fn fetch_html(cli: &Cli) -> Result<FetchResult, String> {
-    /// Wrap locally supplied HTML in a `FetchResult`.
-    fn offline_result(html: String) -> FetchResult {
-        FetchResult {
-            html,
-            url: String::new(),
-            status: 200,
-            headers: Default::default(),
-            elapsed: Default::default(),
-        }
-    }
-
-    if cli.stdin {
-        let mut piped = String::new();
-        io::stdin()
-            .read_to_string(&mut piped)
-            .map_err(|e| format!("failed to read stdin: {e}"))?;
-        return Ok(offline_result(piped));
-    }
-
-    if let Some(path) = cli.file.as_ref() {
-        let html =
-            std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?;
-        return Ok(offline_result(html));
-    }
-
-    let raw_url = cli
-        .urls
-        .first()
-        .ok_or("no input provided -- pass a URL, --file, or --stdin")?;
-    let target = normalize_url(raw_url);
-
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
-    let fetched = client.fetch(&target).await;
-    fetched.map_err(|e| format!("fetch error: {e}"))
-}
-
-/// Fetch external stylesheets referenced in HTML and inject them as `<style>` blocks.
-/// This allows brand extraction to see colors/fonts from external CSS files.
-///
-/// At most 10 `<link rel="stylesheet">` hrefs are followed, resolved against
-/// `base_url`. Each is validated with `validate_public_http_url` and fetched
-/// with a 5 second timeout and redirects disabled. Responses that are not
-/// successful, look like HTML (start with `<!`), or are 2,000,000 bytes or
-/// larger are skipped.
-///
-/// The collected CSS is inserted just before `</head>` (matched
-/// case-insensitively), or prepended when there is no `</head>`. The input
-/// is returned unchanged when `base_url` does not parse or nothing was
-/// fetched.
-async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {
-    let base = match url::Url::parse(base_url) {
-        Ok(u) => u,
-        Err(_) => return html.to_string(),
-    };
-
-    // Extract stylesheet hrefs from <link rel="stylesheet" href="...">
-    // (either attribute order).
-    let re = regex::Regex::new(
-        r#"<link[^>]+rel=["']stylesheet["'][^>]+href=["']([^"']+)["']|<link[^>]+href=["']([^"']+)["'][^>]+rel=["']stylesheet["']"#
-    ).unwrap();
-
-    let hrefs: Vec<String> = re
-        .captures_iter(html)
-        .filter_map(|cap| {
-            let href = cap.get(1).or(cap.get(2))?;
-            Some(
-                base.join(href.as_str())
-                    .map(|u| u.to_string())
-                    .unwrap_or_else(|_| href.as_str().to_string()),
-            )
-        })
-        .take(10)
-        .collect();
-
-    if hrefs.is_empty() {
-        return html.to_string();
-    }
-
-    let client = reqwest::Client::builder()
-        .timeout(std::time::Duration::from_secs(5))
-        .redirect(reqwest::redirect::Policy::none())
-        .build()
-        .unwrap_or_default();
-
-    let mut extra_css = String::new();
-    for href in &hrefs {
-        // Skip anything the URL security check rejects.
-        if webclaw_fetch::url_security::validate_public_http_url(href)
-            .await
-            .is_err()
-        {
-            continue;
-        }
-        if let Ok(resp) = client.get(href).send().await
-            && resp.status().is_success()
-            && let Ok(body) = resp.text().await
-            && !body.trim_start().starts_with("<!")
-            && body.len() < 2_000_000
-        {
-            extra_css.push_str("\n<style>\n");
-            extra_css.push_str(&body);
-            extra_css.push_str("\n</style>\n");
-        }
-    }
-
-    if extra_css.is_empty() {
-        return html.to_string();
-    }
-
-    // ASCII-only lowercasing keeps every byte offset identical to `html`.
-    // Full Unicode `to_lowercase()` can change the byte length of some
-    // characters, which would make `pos` point at the wrong place (or into
-    // the middle of a character) when slicing the original string.
-    if let Some(pos) = html.to_ascii_lowercase().find("</head>") {
-        let mut enriched = String::with_capacity(html.len() + extra_css.len());
-        enriched.push_str(&html[..pos]);
-        enriched.push_str(&extra_css);
-        enriched.push_str(&html[pos..]);
-        enriched
-    } else {
-        format!("{extra_css}{html}")
-    }
-}
-
-/// Render page metadata as a YAML front matter block.
-///
-/// Emits `title`, `author`, `date` and `source` (when present) as
-/// double-quoted YAML strings and `word_count` (when non-zero) as a number,
-/// wrapped in `---` fences and followed by a blank line.
-///
-/// Backslashes, double quotes and line breaks inside values are escaped so a
-/// title such as `He said "hi"` cannot terminate the scalar early and
-/// corrupt the front matter. Values without those characters render exactly
-/// as before.
-fn format_frontmatter(meta: &Metadata) -> String {
-    /// Quote `value` as a YAML double-quoted scalar.
-    fn yaml_quoted(value: &impl std::fmt::Display) -> String {
-        let raw = value.to_string();
-        let mut quoted = String::with_capacity(raw.len() + 2);
-        quoted.push('"');
-        for c in raw.chars() {
-            match c {
-                '\\' => quoted.push_str("\\\\"),
-                '"' => quoted.push_str("\\\""),
-                '\n' => quoted.push_str("\\n"),
-                '\r' => quoted.push_str("\\r"),
-                other => quoted.push(other),
-            }
-        }
-        quoted.push('"');
-        quoted
-    }
-
-    let mut lines = vec!["---".to_string()];
-
-    if let Some(title) = &meta.title {
-        lines.push(format!("title: {}", yaml_quoted(title)));
-    }
-    if let Some(author) = &meta.author {
-        lines.push(format!("author: {}", yaml_quoted(author)));
-    }
-    if let Some(date) = &meta.published_date {
-        lines.push(format!("date: {}", yaml_quoted(date)));
-    }
-    if let Some(url) = &meta.url {
-        lines.push(format!("source: {}", yaml_quoted(url)));
-    }
-    if meta.word_count > 0 {
-        lines.push(format!("word_count: {}", meta.word_count));
-    }
-
-    lines.push("---".to_string());
-    lines.push(String::new()); // blank line after frontmatter
-    lines.join("\n")
-}
-
-/// Print an `ExtractionResult` to stdout in the requested format, followed
-/// by a newline.
-///
-/// Rendering is delegated to `format_output` so the stdout path and the
-/// `--output-dir` path cannot drift apart; the printed bytes are the same as
-/// the previous hand-written per-format printing.
-///
-/// Panics if JSON serialization of the result fails.
-fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) {
-    println!("{}", format_output(result, format, show_metadata));
-}
-
-/// Print a cloud API response to stdout in the requested format.
-///
-/// * `Json` pretty-prints the whole response.
-/// * `Markdown` prints `content.markdown`, else top-level `markdown`, else
-///   the whole response as pretty JSON.
-/// * `Text`, `Llm` and `Html` print `content.plain_text`,
-///   `content.llm_text` and `content.raw_html` respectively, falling back to
-///   the `Markdown` behavior when that field is missing.
-fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
-    /// String stored at `content.<key>`, if any.
-    fn content_str<'a>(resp: &'a serde_json::Value, key: &str) -> Option<&'a str> {
-        resp.get("content")
-            .and_then(|c| c.get(key))
-            .and_then(|v| v.as_str())
-    }
-
-    /// Print `content.<key>`, or fall back to the markdown rendering.
-    fn print_field_or_markdown(resp: &serde_json::Value, key: &str) {
-        match content_str(resp, key) {
-            Some(body) => println!("{body}"),
-            None => print_cloud_output(resp, &OutputFormat::Markdown),
-        }
-    }
-
-    /// Pretty-print the entire response.
-    fn dump_json(resp: &serde_json::Value) {
-        println!(
-            "{}",
-            serde_json::to_string_pretty(resp).expect("serialization failed")
-        );
-    }
-
-    match format {
-        OutputFormat::Json => dump_json(resp),
-        OutputFormat::Markdown => {
-            let markdown = content_str(resp, "markdown")
-                .or_else(|| resp.get("markdown").and_then(|m| m.as_str()));
-            match markdown {
-                Some(md) => println!("{md}"),
-                None => dump_json(resp),
-            }
-        }
-        OutputFormat::Text => print_field_or_markdown(resp, "plain_text"),
-        OutputFormat::Llm => print_field_or_markdown(resp, "llm_text"),
-        OutputFormat::Html => print_field_or_markdown(resp, "raw_html"),
-    }
-}
-
-/// Print a `ContentDiff` to stdout.
-///
-/// `Json` pretty-prints the diff. Every other format gets a human-readable
-/// summary: status, word-count delta, then (each only when non-empty)
-/// metadata changes, added links, removed links and the text diff.
-fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) {
-    if let OutputFormat::Json = format {
-        let json = serde_json::to_string_pretty(diff).expect("serialization failed");
-        println!("{}", json);
-        return;
-    }
-
-    println!("Status: {:?}", diff.status);
-    println!("Word count delta: {:+}", diff.word_count_delta);
-
-    if !diff.metadata_changes.is_empty() {
-        println!("\nMetadata changes:");
-        diff.metadata_changes.iter().for_each(|change| {
-            let before = change.old.as_deref().unwrap_or("(none)");
-            let after = change.new.as_deref().unwrap_or("(none)");
-            println!("  {}: {} -> {}", change.field, before, after);
-        });
-    }
-
-    if !diff.links_added.is_empty() {
-        println!("\nLinks added:");
-        diff.links_added
-            .iter()
-            .for_each(|link| println!("  + {} ({})", link.href, link.text));
-    }
-
-    if !diff.links_removed.is_empty() {
-        println!("\nLinks removed:");
-        diff.links_removed
-            .iter()
-            .for_each(|link| println!("  - {} ({})", link.href, link.text));
-    }
-
-    if let Some(text_diff) = diff.text_diff.as_ref() {
-        println!("\n{text_diff}");
-    }
-}
-
-/// Print a crawl result to stdout.
-///
-/// `Json` pretty-prints the whole `CrawlResult`. The other formats print one
-/// section per page that has an extraction (pages without one are skipped):
-/// a `---` separator, a format-specific header and body, then a blank line.
-/// `show_metadata` adds front matter in `Markdown` only.
-fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata: bool) {
-    if let OutputFormat::Json = format {
-        println!(
-            "{}",
-            serde_json::to_string_pretty(result).expect("serialization failed")
-        );
-        return;
-    }
-
-    for page in &result.pages {
-        let Some(extraction) = page.extraction.as_ref() else {
-            continue;
-        };
-
-        println!("---");
-        match format {
-            OutputFormat::Markdown => {
-                println!("# Page: {}\n", page.url);
-                if show_metadata {
-                    print!("{}", format_frontmatter(&extraction.metadata));
-                }
-                println!("{}", extraction.content.markdown);
-            }
-            OutputFormat::Text => {
-                println!("# Page: {}\n", page.url);
-                println!("{}", extraction.content.plain_text);
-            }
-            OutputFormat::Llm => {
-                println!("{}", to_llm_text(extraction, Some(page.url.as_str())));
-            }
-            OutputFormat::Html => {
-                println!("<!-- Page: {} -->\n", page.url);
-                println!("{}", raw_html_or_markdown(extraction));
-            }
-            // Handled by the early return above.
-            OutputFormat::Json => unreachable!(),
-        }
-        println!();
-    }
-}
-
-/// Print batch extraction results.
-///
-/// `Json` prints one array of `{url, result}` / `{url, error}` objects to
-/// stdout. The other formats print a section per successful URL to stdout
-/// (a `---` separator, a format-specific header and body, a blank line) and
-/// report each failed URL on stderr as `error: <url> -- <message>`.
-/// `show_metadata` adds front matter in `Markdown` only.
-fn print_batch_output(results: &[BatchExtractResult], format: &OutputFormat, show_metadata: bool) {
-    if let OutputFormat::Json = format {
-        // Build a JSON array of {url, result?, error?} objects
-        let mut entries: Vec<serde_json::Value> = Vec::with_capacity(results.len());
-        for r in results {
-            let entry = match &r.result {
-                Ok(extraction) => serde_json::json!({
-                    "url": r.url,
-                    "result": extraction,
-                }),
-                Err(e) => serde_json::json!({
-                    "url": r.url,
-                    "error": e.to_string(),
-                }),
-            };
-            entries.push(entry);
-        }
-        println!(
-            "{}",
-            serde_json::to_string_pretty(&entries).expect("serialization failed")
-        );
-        return;
-    }
-
-    for r in results {
-        let extraction = match &r.result {
-            Ok(extraction) => extraction,
-            Err(e) => {
-                // Failures go to stderr and produce no stdout section.
-                eprintln!("error: {} -- {}", r.url, e);
-                continue;
-            }
-        };
-
-        println!("---");
-        match format {
-            OutputFormat::Markdown => {
-                println!("# {}\n", r.url);
-                if show_metadata {
-                    print!("{}", format_frontmatter(&extraction.metadata));
-                }
-                println!("{}", extraction.content.markdown);
-            }
-            OutputFormat::Text => {
-                println!("# {}\n", r.url);
-                println!("{}", extraction.content.plain_text);
-            }
-            OutputFormat::Llm => {
-                println!("{}", to_llm_text(extraction, Some(r.url.as_str())));
-            }
-            OutputFormat::Html => {
-                println!("<!-- {} -->\n", r.url);
-                println!("{}", raw_html_or_markdown(extraction));
-            }
-            // Handled by the early return above.
-            OutputFormat::Json => unreachable!(),
-        }
-        println!();
-    }
-}
-
-/// Print sitemap entries to stdout: pretty JSON for `Json`, otherwise one
-/// URL per line.
-fn print_map_output(entries: &[SitemapEntry], format: &OutputFormat) {
-    if matches!(format, OutputFormat::Json) {
-        let json = serde_json::to_string_pretty(entries).expect("serialization failed");
-        println!("{}", json);
-        return;
-    }
-
-    entries.iter().for_each(|entry| println!("{}", entry.url));
-}
-
-/// Build the one-line progress message for a finished page:
-/// `[index/max_pages] STATUS url (Nms, W words)` on success, or
-/// `[index/max_pages] STATUS url (Nms (error))` on failure.
-///
-/// The status is `ERR` when the page carries an error and `OK ` (padded to
-/// three characters) otherwise. An extraction, when present, takes
-/// precedence over the error for the detail part.
-fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String {
-    let status = match page.error {
-        Some(_) => "ERR",
-        None => "OK ",
-    };
-    let timing = format!("{}ms", page.elapsed.as_millis());
-    let detail = match (&page.extraction, &page.error) {
-        (Some(extraction), _) => format!(", {} words", extraction.metadata.word_count),
-        (None, Some(err)) => format!(" ({err})"),
-        (None, None) => String::new(),
-    };
-    format!(
-        "[{index}/{max_pages}] {status} {} ({timing}{detail})",
-        page.url
-    )
-}
-
-async fn run_crawl(cli: &Cli) -> Result<(), String> {
-    let url = cli
-        .urls
-        .first()
-        .ok_or("--crawl requires a URL argument")
-        .map(|u| normalize_url(u))?;
-    let url = url.as_str();
-
-    if cli.file.is_some() || cli.stdin {
-        return Err("--crawl cannot be used with --file or --stdin".into());
-    }
-
-    let include_patterns: Vec<String> = cli
-        .include_paths
-        .as_deref()
-        .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
-        .unwrap_or_default();
-    let exclude_patterns: Vec<String> = cli
-        .exclude_paths
-        .as_deref()
-        .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
-        .unwrap_or_default();
-
-    // Set up streaming progress channel
-    let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::<PageResult>(100);
-
-    // Set up cancel flag for Ctrl+C handling
-    let cancel_flag = Arc::new(AtomicBool::new(false));
-
-    // Register Ctrl+C handler when --crawl-state is set
-    let state_path = cli.crawl_state.clone();
-    if state_path.is_some() {
-        let flag = Arc::clone(&cancel_flag);
-        tokio::spawn(async move {
-            tokio::signal::ctrl_c().await.ok();
-            flag.store(true, Ordering::Relaxed);
-            eprintln!("\nCtrl+C received, saving crawl state...");
-        });
-    }
-
-    let config = CrawlConfig {
-        fetch: build_fetch_config(cli),
-        max_depth: cli.depth,
-        max_pages: cli.max_pages,
-        concurrency: cli.concurrency,
-        delay: std::time::Duration::from_millis(cli.delay),
-        path_prefix: cli.path_prefix.clone(),
-        use_sitemap: cli.sitemap,
-        include_patterns,
-        exclude_patterns,
-        progress_tx: Some(progress_tx),
-        cancel_flag: Some(Arc::clone(&cancel_flag)),
-        allow_subdomains: false,
-        allow_external_links: false,
-    };
-
-    // Load resume state if --crawl-state file exists
-    let resume_state = state_path
-        .as_ref()
-        .and_then(|p| Crawler::load_state(p))
-        .inspect(|s| {
-            eprintln!(
-                "Resuming crawl: {} pages already visited, {} URLs in frontier",
-                s.visited.len(),
-                s.frontier.len(),
-            );
-        });
-
-    let max_pages = cli.max_pages;
-    let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages);
-
-    // Spawn background task to print streaming progress to stderr
-    let progress_handle = tokio::spawn(async move {
-        let mut count = completed_offset;
-        while let Ok(page) = progress_rx.recv().await {
-            count += 1;
-            eprintln!("{}", format_progress(&page, count, max_pages));
-        }
-    });
-
-    let crawler = Crawler::new(url, config).map_err(|e| format!("crawler error: {e}"))?;
-    let result = crawler.crawl(url, resume_state).await;
-
-    // Drop the crawler (and its progress_tx clone) so the progress task finishes
-    drop(crawler);
-    let _ = progress_handle.await;
-
-    // If cancelled via Ctrl+C and --crawl-state is set, save state for resume
-    let was_cancelled = cancel_flag.load(Ordering::Relaxed);
-    if was_cancelled {
-        if let Some(ref path) = state_path {
-            Crawler::save_state(
-                path,
-                url,
-                &result.visited,
-                &result.remaining_frontier,
-                completed_offset + result.pages.len(),
-                cli.max_pages,
-                cli.depth,
-            )?;
-            eprintln!(
-                "Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}",
-                path.display(),
-                completed_offset + result.pages.len(),
-                path.display(),
-            );
-        }
-    } else if let Some(ref path) = state_path {
-        // Crawl completed normally — clean up state file
-        if path.exists() {
-            let _ = std::fs::remove_file(path);
-        }
-    }
-
-    // Log per-page errors and extraction warnings to stderr
-    for page in &result.pages {
-        if let Some(ref err) = page.error {
-            eprintln!("error: {} -- {}", page.url, err);
-        } else if let Some(ref extraction) = page.extraction {
-            let reason = detect_empty(extraction);
-            if !matches!(reason, EmptyReason::None) {
-                warn_empty(&page.url, &reason);
-            }
-        }
-    }
-
-    if let Some(ref dir) = cli.output_dir {
-        let mut saved = 0usize;
-        for page in &result.pages {
-            if let Some(ref extraction) = page.extraction {
-                let filename = url_to_filename(&page.url, &cli.format);
-                let content = format_output(extraction, &cli.format, cli.metadata);
-                write_to_file(dir, &filename, &content)?;
-                saved += 1;
-            }
-        }
-        eprintln!("Saved {saved} files to {}", dir.display());
-    } else {
-        print_crawl_output(&result, &cli.format, cli.metadata);
-    }
-
-    eprintln!(
-        "Crawled {} pages ({} ok, {} errors) in {:.1}s",
-        result.total, result.ok, result.errors, result.elapsed_secs,
-    );
-
-    // Fire webhook on crawl complete
-    if let Some(ref webhook_url) = cli.webhook {
-        let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
-        fire_webhook(
-            webhook_url,
-            &serde_json::json!({
-                "event": "crawl_complete",
-                "total": result.total,
-                "ok": result.ok,
-                "errors": result.errors,
-                "elapsed_secs": result.elapsed_secs,
-                "urls": urls,
-            }),
-        );
-        // Brief pause so the async webhook has time to fire
-        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
-    }
-
-    if result.errors > 0 {
-        Err(format!(
-            "{} of {} pages failed",
-            result.errors, result.total
-        ))
-    } else {
-        Ok(())
-    }
-}
-
-async fn run_map(cli: &Cli) -> Result<(), String> {
-    let url = cli
-        .urls
-        .first()
-        .ok_or("--map requires a URL argument")
-        .map(|u| normalize_url(u))?;
-    let url = url.as_str();
-
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
-
-    let entries = webclaw_fetch::sitemap::discover(&client, url)
-        .await
-        .map_err(|e| format!("sitemap discovery failed: {e}"))?;
-
-    if entries.is_empty() {
-        eprintln!("no sitemap URLs found for {url}");
-    } else {
-        eprintln!("discovered {} URLs", entries.len());
-    }
-
-    print_map_output(&entries, &cli.format);
-    Ok(())
-}
-
-async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
-    let client = Arc::new(
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
-    );
-
-    let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect();
-    let options = build_extraction_options(cli);
-    let results = client
-        .fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options)
-        .await;
-
-    let ok = results.iter().filter(|r| r.result.is_ok()).count();
-    let errors = results.len() - ok;
-
-    // Log errors and extraction warnings to stderr
-    for r in &results {
-        if let Err(ref e) = r.result {
-            eprintln!("error: {} -- {}", r.url, e);
-        } else if let Ok(ref extraction) = r.result {
-            let reason = detect_empty(extraction);
-            if !matches!(reason, EmptyReason::None) {
-                warn_empty(&r.url, &reason);
-            }
-        }
-    }
-
-    // Build a lookup of custom filenames by URL
-    let custom_names: std::collections::HashMap<&str, &str> = entries
-        .iter()
-        .filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n)))
-        .collect();
-
-    if let Some(ref dir) = cli.output_dir {
-        let mut saved = 0usize;
-        for r in &results {
-            if let Ok(ref extraction) = r.result {
-                let filename = custom_names
-                    .get(r.url.as_str())
-                    .map(|s| s.to_string())
-                    .unwrap_or_else(|| url_to_filename(&r.url, &cli.format));
-                let content = format_output(extraction, &cli.format, cli.metadata);
-                write_to_file(dir, &filename, &content)?;
-                saved += 1;
-            }
-        }
-        eprintln!("Saved {saved} files to {}", dir.display());
-    } else {
-        print_batch_output(&results, &cli.format, cli.metadata);
-    }
-
-    eprintln!(
-        "Fetched {} URLs ({} ok, {} errors)",
-        results.len(),
-        ok,
-        errors
-    );
-
-    // Fire webhook on batch complete
-    if let Some(ref webhook_url) = cli.webhook {
-        let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
-        fire_webhook(
-            webhook_url,
-            &serde_json::json!({
-                "event": "batch_complete",
-                "total": results.len(),
-                "ok": ok,
-                "errors": errors,
-                "urls": urls,
-            }),
-        );
-        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
-    }
-
-    if errors > 0 {
-        Err(format!("{errors} of {} URLs failed", results.len()))
-    } else {
-        Ok(())
-    }
-}
-
-fn timestamp() -> String {
-    let now = std::time::SystemTime::now()
-        .duration_since(std::time::UNIX_EPOCH)
-        .unwrap_or_default()
-        .as_secs();
-    let hours = (now % 86400) / 3600;
-    let minutes = (now % 3600) / 60;
-    let seconds = now % 60;
-    format!("{hours:02}:{minutes:02}:{seconds:02}")
-}
-
-/// Spawn the `--on-change` command with `payload` on stdin.
-///
-/// Previously this passed the entire user-provided string to `sh -c`, which
-/// made `--on-change 'notify "$URL"; rm -rf /'` a plausible disaster the
-/// moment an untrusted config file or MCP-driven agent fed us a command.
-/// The MCP surface specifically is prompt-injection-exposed: an LLM that
-/// controls CLI args can escalate into arbitrary shell on the host.
-///
-/// We now parse the command with `shlex` (POSIX-ish tokenization with proper
-/// quoting) and exec the program directly without an intermediate shell, so
-/// metacharacters like `;`, `&&`, `|`, `$()`, and env expansion can't fire.
-/// Users who genuinely need a pipeline can set the whole chain behind a
-/// script they've written, or opt in per-call via `WEBCLAW_ALLOW_SHELL=1`
-/// (documented escape hatch, noisy by design).
-async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
-    eprintln!("[watch] Running: {cmd}");
-
-    let allow_shell = std::env::var("WEBCLAW_ALLOW_SHELL")
-        .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
-        .unwrap_or(false);
-
-    let mut command = if allow_shell {
-        eprintln!("[watch] WEBCLAW_ALLOW_SHELL=1 — executing via sh -c (unsafe)");
-        let mut c = tokio::process::Command::new("sh");
-        c.arg("-c").arg(cmd);
-        c
-    } else {
-        let Some(argv) = shlex::split(cmd) else {
-            eprintln!("[watch] Failed to parse --on-change command (unbalanced quotes?)");
-            return;
-        };
-        let Some((program, args)) = argv.split_first() else {
-            eprintln!("[watch] --on-change command is empty");
-            return;
-        };
-        let mut c = tokio::process::Command::new(program);
-        c.args(args);
-        c
-    };
-
-    command.stdin(std::process::Stdio::piped());
-
-    match command.spawn() {
-        Ok(mut child) => {
-            if let Some(mut stdin) = child.stdin.take() {
-                use tokio::io::AsyncWriteExt;
-                let _ = stdin.write_all(stdin_payload).await;
-            }
-        }
-        Err(e) => eprintln!("[watch] Failed to run command: {e}"),
-    }
-}
-
-/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
-/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
-fn fire_webhook(url: &str, payload: &serde_json::Value) {
-    let url = url.to_string();
-    let is_discord = url.contains("discord.com/api/webhooks");
-    let is_slack = url.contains("hooks.slack.com");
-
-    let body = if is_discord {
-        let event = payload
-            .get("event")
-            .and_then(|v| v.as_str())
-            .unwrap_or("notification");
-        let details = serde_json::to_string_pretty(payload).unwrap_or_default();
-        serde_json::json!({
-            "embeds": [{
-                "title": format!("webclaw: {event}"),
-                "description": format!("```json\n{details}\n```"),
-                "color": 5814783
-            }]
-        })
-        .to_string()
-    } else if is_slack {
-        let event = payload
-            .get("event")
-            .and_then(|v| v.as_str())
-            .unwrap_or("notification");
-        let details = serde_json::to_string_pretty(payload).unwrap_or_default();
-        serde_json::json!({
-            "text": format!("*webclaw: {event}*\n```{details}```")
-        })
-        .to_string()
-    } else {
-        serde_json::to_string(payload).unwrap_or_default()
-    };
-    tokio::spawn(async move {
-        // SSRF guard: a webhook URL is user-supplied and otherwise bypasses
-        // the fetch-layer protections, so resolve + reject private/internal
-        // destinations before sending the payload.
-        if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
-            eprintln!("[webhook] refusing unsafe URL: {e}");
-            return;
-        }
-        match reqwest::Client::builder()
-            .timeout(std::time::Duration::from_secs(10))
-            .build()
-        {
-            Ok(c) => match c
-                .post(&url)
-                .header("Content-Type", "application/json")
-                .body(body)
-                .send()
-                .await
-            {
-                Ok(resp) => {
-                    eprintln!(
-                        "[webhook] POST {} -> {}",
-                        &url[..url.len().min(60)],
-                        resp.status()
-                    );
-                }
-                Err(e) => eprintln!("[webhook] POST failed: {e}"),
-            },
-            Err(e) => eprintln!("[webhook] client error: {e}"),
-        }
-    });
-}
-
-async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
-    if urls.is_empty() {
-        return Err("--watch requires at least one URL".into());
-    }
-
-    let client = Arc::new(
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
-    );
-    let options = build_extraction_options(cli);
-
-    // Ctrl+C handler
-    let cancelled = Arc::new(AtomicBool::new(false));
-    let flag = Arc::clone(&cancelled);
-    tokio::spawn(async move {
-        tokio::signal::ctrl_c().await.ok();
-        flag.store(true, Ordering::Relaxed);
-    });
-
-    // Single-URL mode: preserve original behavior exactly
-    if urls.len() == 1 {
-        return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
-    }
-
-    // Multi-URL mode: batch fetch, diff each, report aggregate
-    run_watch_multi(cli, &client, &options, urls, &cancelled).await
-}
-
-/// Original single-URL watch loop -- backward compatible.
-async fn run_watch_single(
-    cli: &Cli,
-    client: &Arc<FetchClient>,
-    options: &ExtractionOptions,
-    url: &str,
-    cancelled: &Arc<AtomicBool>,
-) -> Result<(), String> {
-    let mut previous = client
-        .fetch_and_extract_with_options(url, options)
-        .await
-        .map_err(|e| format!("initial fetch failed: {e}"))?;
-
-    eprintln!(
-        "[watch] Initial snapshot: {url} ({} words)",
-        previous.metadata.word_count
-    );
-
-    loop {
-        // Clamp to >=1s: `--watch-interval 0` would otherwise spin the
-        // fetch loop with zero delay and hammer the target.
-        tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
-
-        if cancelled.load(Ordering::Relaxed) {
-            eprintln!("[watch] Stopped");
-            break;
-        }
-
-        let current = match client.fetch_and_extract_with_options(url, options).await {
-            Ok(result) => result,
-            Err(e) => {
-                eprintln!("[watch] Fetch error ({}): {e}", timestamp());
-                continue;
-            }
-        };
-
-        let diff = webclaw_core::diff::diff(&previous, &current);
-
-        if diff.status == ChangeStatus::Same {
-            eprintln!("[watch] No changes ({})", timestamp());
-        } else {
-            print_diff_output(&diff, &cli.format);
-            eprintln!("[watch] Changes detected! ({})", timestamp());
-
-            if let Some(ref cmd) = cli.on_change {
-                let diff_json = serde_json::to_string(&diff).unwrap_or_default();
-                spawn_on_change(cmd, diff_json.as_bytes()).await;
-            }
-
-            if let Some(ref webhook_url) = cli.webhook {
-                fire_webhook(
-                    webhook_url,
-                    &serde_json::json!({
-                        "event": "watch_change",
-                        "url": url,
-                        "status": format!("{:?}", diff.status),
-                        "word_count_delta": diff.word_count_delta,
-                        "metadata_changes": diff.metadata_changes.len(),
-                        "links_added": diff.links_added.len(),
-                        "links_removed": diff.links_removed.len(),
-                    }),
-                );
-            }
-
-            previous = current;
-        }
-    }
-
-    Ok(())
-}
-
-/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
-async fn run_watch_multi(
-    cli: &Cli,
-    client: &Arc<FetchClient>,
-    options: &ExtractionOptions,
-    urls: &[String],
-    cancelled: &Arc<AtomicBool>,
-) -> Result<(), String> {
-    let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect();
-
-    // Initial pass: fetch all URLs in parallel
-    let initial_results = client
-        .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
-        .await;
-
-    let mut snapshots = std::collections::HashMap::new();
-    let mut ok_count = 0usize;
-    let mut err_count = 0usize;
-
-    for r in initial_results {
-        match r.result {
-            Ok(extraction) => {
-                snapshots.insert(r.url, extraction);
-                ok_count += 1;
-            }
-            Err(e) => {
-                eprintln!("[watch] Initial fetch error: {} -- {e}", r.url);
-                err_count += 1;
-            }
-        }
-    }
-
-    eprintln!(
-        "[watch] Watching {} URLs (interval: {}s)",
-        urls.len(),
-        cli.watch_interval
-    );
-    eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors");
-
-    let mut check_number = 0u64;
-
-    loop {
-        // Clamp to >=1s: `--watch-interval 0` would otherwise spin the
-        // fetch loop with zero delay and hammer the target.
-        tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval.max(1))).await;
-
-        if cancelled.load(Ordering::Relaxed) {
-            eprintln!("[watch] Stopped");
-            break;
-        }
-
-        check_number += 1;
-
-        let current_results = client
-            .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
-            .await;
-
-        let mut changed: Vec<serde_json::Value> = Vec::new();
-        let mut same_count = 0usize;
-        let mut fetch_errors = 0usize;
-
-        for r in current_results {
-            match r.result {
-                Ok(current) => {
-                    if let Some(previous) = snapshots.get(&r.url) {
-                        let diff = webclaw_core::diff::diff(previous, &current);
-                        if diff.status == ChangeStatus::Same {
-                            same_count += 1;
-                        } else {
-                            changed.push(serde_json::json!({
-                                "url": r.url,
-                                "word_count_delta": diff.word_count_delta,
-                            }));
-                            snapshots.insert(r.url, current);
-                        }
-                    } else {
-                        // URL failed initially, first successful fetch -- store as baseline
-                        snapshots.insert(r.url, current);
-                        same_count += 1;
-                    }
-                }
-                Err(e) => {
-                    eprintln!("[watch] Fetch error: {} -- {e}", r.url);
-                    fetch_errors += 1;
-                }
-            }
-        }
-
-        let ts = timestamp();
-        let err_suffix = if fetch_errors > 0 {
-            format!(", {fetch_errors} errors")
-        } else {
-            String::new()
-        };
-
-        if changed.is_empty() {
-            eprintln!(
-                "[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}"
-            );
-        } else {
-            eprintln!(
-                "[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}",
-                changed.len(),
-            );
-            for entry in &changed {
-                let url = entry["url"].as_str().unwrap_or("?");
-                let delta = entry["word_count_delta"].as_i64().unwrap_or(0);
-                eprintln!("  -> {url} (word delta: {delta:+})");
-            }
-
-            // Fire --on-change once with all changes
-            if let Some(ref cmd) = cli.on_change {
-                let payload = serde_json::json!({
-                    "event": "watch_changes",
-                    "check_number": check_number,
-                    "total_urls": urls.len(),
-                    "changed": changed.len(),
-                    "same": same_count,
-                    "changes": changed,
-                });
-                let payload_json = serde_json::to_string(&payload).unwrap_or_default();
-                spawn_on_change(cmd, payload_json.as_bytes()).await;
-            }
-
-            // Fire webhook once with aggregate payload
-            if let Some(ref webhook_url) = cli.webhook {
-                fire_webhook(
-                    webhook_url,
-                    &serde_json::json!({
-                        "event": "watch_changes",
-                        "check_number": check_number,
-                        "total_urls": urls.len(),
-                        "changed": changed.len(),
-                        "same": same_count,
-                        "changes": changed,
-                    }),
-                );
-            }
-        }
-    }
-
-    Ok(())
-}
-
-async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
-    // Load previous snapshot
-    let snapshot_json = std::fs::read_to_string(snapshot_path)
-        .map_err(|e| format!("failed to read snapshot {snapshot_path}: {e}"))?;
-    let old: ExtractionResult = serde_json::from_str(&snapshot_json)
-        .map_err(|e| format!("failed to parse snapshot JSON: {e}"))?;
-
-    // Extract current version (handles PDF detection for URLs)
-    let new_result = fetch_and_extract(cli).await?.into_extraction()?;
-
-    let diff = webclaw_core::diff::diff(&old, &new_result);
-    print_diff_output(&diff, &cli.format);
-
-    Ok(())
-}
-
-async fn run_brand(cli: &Cli) -> Result<(), String> {
-    let result = fetch_html(cli).await?;
-    let enriched = enrich_html_with_stylesheets(&result.html, &result.url).await;
-    let brand = webclaw_core::brand::extract_brand(
-        &enriched,
-        Some(result.url.as_str()).filter(|s| !s.is_empty()),
-    );
-    println!(
-        "{}",
-        serde_json::to_string_pretty(&brand).expect("serialization failed")
-    );
-    Ok(())
-}
-
-/// Build an LLM provider based on CLI flags, or fall back to the default chain.
-async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
-    if let Some(ref name) = cli.llm_provider {
-        match name.as_str() {
-            "ollama" => {
-                let provider = webclaw_llm::providers::ollama::OllamaProvider::new(
-                    cli.llm_base_url.clone(),
-                    cli.llm_model.clone(),
-                );
-                if !provider.is_available().await {
-                    return Err("ollama is not running or unreachable".into());
-                }
-                Ok(Box::new(provider))
-            }
-            "openai" => {
-                let provider = webclaw_llm::providers::openai::OpenAiProvider::new(
-                    None,
-                    cli.llm_base_url.clone(),
-                    cli.llm_model.clone(),
-                )
-                .ok_or("OPENAI_API_KEY not set")?;
-                Ok(Box::new(provider))
-            }
-            "anthropic" => {
-                let provider = webclaw_llm::providers::anthropic::AnthropicProvider::with_base_url(
-                    None,
-                    cli.llm_base_url.clone(),
-                    cli.llm_model.clone(),
-                )
-                .ok_or("ANTHROPIC_API_KEY not set")?;
-                Ok(Box::new(provider))
-            }
-            other => Err(format!(
-                "unknown LLM provider: {other} (use ollama, openai, or anthropic)"
-            )),
-        }
-    } else {
-        let chain = webclaw_llm::ProviderChain::default().await;
-        if chain.is_empty() {
-            return Err(
-                "no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
-                    .into(),
-            );
-        }
-        Ok(Box::new(chain))
-    }
-}
-
-async fn run_llm(cli: &Cli) -> Result<(), String> {
-    // Extract content from source first (handles PDF detection for URLs)
-    let result = fetch_and_extract(cli).await?.into_extraction()?;
-
-    let provider = build_llm_provider(cli).await?;
-    let model = cli.llm_model.as_deref();
-
-    if let Some(ref schema_input) = cli.extract_json {
-        // Support @file syntax for loading schema from file
-        let schema_str = if let Some(path) = schema_input.strip_prefix('@') {
-            std::fs::read_to_string(path)
-                .map_err(|e| format!("failed to read schema file {path}: {e}"))?
-        } else {
-            schema_input.clone()
-        };
-
-        let schema: serde_json::Value =
-            serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?;
-
-        let extracted = webclaw_llm::extract::extract_json(
-            &result.content.plain_text,
-            &schema,
-            provider.as_ref(),
-            model,
-        )
-        .await
-        .map_err(|e| format!("LLM extraction failed: {e}"))?;
-
-        println!(
-            "{}",
-            serde_json::to_string_pretty(&extracted).expect("serialization failed")
-        );
-    } else if let Some(ref prompt) = cli.extract_prompt {
-        let extracted = webclaw_llm::extract::extract_with_prompt(
-            &result.content.plain_text,
-            prompt,
-            provider.as_ref(),
-            model,
-        )
-        .await
-        .map_err(|e| format!("LLM extraction failed: {e}"))?;
-
-        println!(
-            "{}",
-            serde_json::to_string_pretty(&extracted).expect("serialization failed")
-        );
-    } else if let Some(sentences) = cli.summarize {
-        let summary = webclaw_llm::summarize::summarize(
-            &result.content.plain_text,
-            Some(sentences),
-            provider.as_ref(),
-            model,
-        )
-        .await
-        .map_err(|e| format!("LLM summarization failed: {e}"))?;
-
-        println!("{summary}");
-    }
-
-    Ok(())
-}
-
-/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
-/// URLs are processed sequentially to respect LLM provider rate limits.
-async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
-    let options = build_extraction_options(cli);
-    let provider = build_llm_provider(cli).await?;
-    let model = cli.llm_model.as_deref();
-
-    // Pre-parse schema once if --extract-json is used
-    let schema = if let Some(ref schema_input) = cli.extract_json {
-        let schema_str = if let Some(path) = schema_input.strip_prefix('@') {
-            std::fs::read_to_string(path)
-                .map_err(|e| format!("failed to read schema file {path}: {e}"))?
-        } else {
-            schema_input.clone()
-        };
-        Some(
-            serde_json::from_str::<serde_json::Value>(&schema_str)
-                .map_err(|e| format!("invalid JSON schema: {e}"))?,
-        )
-    } else {
-        None
-    };
-
-    // Build custom filename lookup from entries
-    let custom_names: std::collections::HashMap<&str, &str> = entries
-        .iter()
-        .filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n)))
-        .collect();
-
-    let total = entries.len();
-    let mut ok = 0usize;
-    let mut errors = 0usize;
-    let mut all_results: Vec<serde_json::Value> = Vec::with_capacity(total);
-
-    for (i, (url, _)) in entries.iter().enumerate() {
-        let idx = i + 1;
-        eprint!("[{idx}/{total}] {url} ");
-
-        // Fetch and extract page content
-        let extraction = match client.fetch_and_extract_with_options(url, &options).await {
-            Ok(r) => r,
-            Err(e) => {
-                errors += 1;
-                let msg = format!("fetch failed: {e}");
-                eprintln!("-> error: {msg}");
-                all_results.push(serde_json::json!({ "url": url, "error": msg }));
-                continue;
-            }
-        };
-
-        let text = &extraction.content.plain_text;
-
-        // Run the appropriate LLM operation
-        let llm_result = if let Some(ref schema) = schema {
-            webclaw_llm::extract::extract_json(text, schema, provider.as_ref(), model)
-                .await
-                .map(LlmOutput::Json)
-        } else if let Some(ref prompt) = cli.extract_prompt {
-            webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
-                .await
-                .map(LlmOutput::Json)
-        } else if let Some(sentences) = cli.summarize {
-            webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
-                .await
-                .map(LlmOutput::Text)
-        } else {
-            unreachable!("run_batch_llm called without LLM flags")
-        };
-
-        match llm_result {
-            Ok(output) => {
-                ok += 1;
-
-                let (output_str, result_json) = match &output {
-                    LlmOutput::Json(v) => {
-                        let s = serde_json::to_string_pretty(v).expect("serialization failed");
-                        let j = serde_json::json!({ "url": url, "result": v });
-                        (s, j)
-                    }
-                    LlmOutput::Text(s) => {
-                        let j = serde_json::json!({ "url": url, "result": s });
-                        (s.clone(), j)
-                    }
-                };
-
-                // Count top-level fields/items for progress display
-                let detail = match &output {
-                    LlmOutput::Json(v) => match v {
-                        serde_json::Value::Object(m) => format!("{} fields", m.len()),
-                        serde_json::Value::Array(a) => format!("{} items", a.len()),
-                        _ => "done".to_string(),
-                    },
-                    LlmOutput::Text(s) => {
-                        let words = s.split_whitespace().count();
-                        format!("{words} words")
-                    }
-                };
-                eprintln!("-> extracted {detail}");
-
-                if let Some(ref dir) = cli.output_dir {
-                    let filename = custom_names
-                        .get(url.as_str())
-                        .map(|s| s.to_string())
-                        .unwrap_or_else(|| url_to_filename(url, &OutputFormat::Json));
-                    write_to_file(dir, &filename, &output_str)?;
-                } else {
-                    println!("--- {url}");
-                    println!("{output_str}");
-                    println!();
-                }
-
-                all_results.push(result_json);
-            }
-            Err(e) => {
-                errors += 1;
-                let msg = format!("LLM extraction failed: {e}");
-                eprintln!("-> error: {msg}");
-                all_results.push(serde_json::json!({ "url": url, "error": msg }));
-            }
-        }
-    }
-
-    eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
-
-    if let Some(ref webhook_url) = cli.webhook {
-        fire_webhook(
-            webhook_url,
-            &serde_json::json!({
-                "event": "batch_llm_complete",
-                "total": total,
-                "ok": ok,
-                "errors": errors,
-            }),
-        );
-        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
-    }
-
-    if errors > 0 {
-        Err(format!("{errors} of {total} URLs failed"))
-    } else {
-        Ok(())
-    }
-}
-
-/// Intermediate type to hold LLM output before formatting.
-enum LlmOutput {
-    Json(serde_json::Value),
-    Text(String),
-}
-
-/// Returns true if any LLM flag is set.
-fn has_llm_flags(cli: &Cli) -> bool {
-    cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
-}
-
-async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
-    let api_key = cli
-        .api_key
-        .as_deref()
-        .ok_or("--research requires WEBCLAW_API_KEY (set via env or --api-key)")?;
-
-    let client = reqwest::Client::builder()
-        .timeout(std::time::Duration::from_secs(600))
-        .build()
-        .map_err(|e| format!("http client error: {e}"))?;
-
-    let mut body = serde_json::json!({ "query": query });
-    if cli.deep {
-        body["deep"] = serde_json::json!(true);
-    }
-
-    eprintln!("Starting research: {query}");
-    if cli.deep {
-        eprintln!("Deep mode enabled (longer, more thorough)");
-    }
-
-    // Start job
-    let resp = client
-        .post("https://api.webclaw.io/v1/research")
-        .header("Authorization", format!("Bearer {api_key}"))
-        .json(&body)
-        .send()
-        .await
-        .map_err(|e| format!("API error: {e}"))?
-        .json::<serde_json::Value>()
-        .await
-        .map_err(|e| format!("parse error: {e}"))?;
-
-    let job_id = resp
-        .get("id")
-        .and_then(|v| v.as_str())
-        .ok_or("API did not return a job ID")?
-        .to_string();
-
-    eprintln!("Job started: {job_id}");
-
-    // Poll
-    for poll in 0..200 {
-        tokio::time::sleep(std::time::Duration::from_secs(3)).await;
-
-        let status_resp = client
-            .get(format!("https://api.webclaw.io/v1/research/{job_id}"))
-            .header("Authorization", format!("Bearer {api_key}"))
-            .send()
-            .await
-            .map_err(|e| format!("poll error: {e}"))?
-            .json::<serde_json::Value>()
-            .await
-            .map_err(|e| format!("parse error: {e}"))?;
-
-        let status = status_resp
-            .get("status")
-            .and_then(|v| v.as_str())
-            .unwrap_or("unknown");
-
-        match status {
-            "completed" => {
-                let report = status_resp
-                    .get("report")
-                    .and_then(|v| v.as_str())
-                    .unwrap_or("");
-
-                // Save full result to JSON file
-                let slug: String = query
-                    .chars()
-                    .map(|c| {
-                        if c.is_alphanumeric() || c == ' ' {
-                            c
-                        } else {
-                            ' '
-                        }
-                    })
-                    .collect::<String>()
-                    .split_whitespace()
-                    .collect::<Vec<_>>()
-                    .join("-")
-                    .to_lowercase();
-                // char-safe truncation: byte slicing panics if char 50
-                // lands mid-codepoint (multibyte queries).
-                let slug: String = slug.chars().take(50).collect();
-                let filename = format!("research-{slug}.json");
-
-                let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
-                std::fs::write(&filename, &json)
-                    .map_err(|e| format!("failed to write {filename}: {e}"))?;
-
-                let elapsed = status_resp
-                    .get("elapsed_ms")
-                    .and_then(|v| v.as_i64())
-                    .unwrap_or(0);
-                let sources = status_resp
-                    .get("sources_count")
-                    .and_then(|v| v.as_i64())
-                    .unwrap_or(0);
-                let findings = status_resp
-                    .get("findings_count")
-                    .and_then(|v| v.as_i64())
-                    .unwrap_or(0);
-
-                eprintln!(
-                    "Research complete: {sources} sources, {findings} findings, {:.1}s",
-                    elapsed as f64 / 1000.0
-                );
-                eprintln!("Saved to: {filename}");
-
-                // Print report to stdout
-                if !report.is_empty() {
-                    println!("{report}");
-                }
-
-                return Ok(());
-            }
-            "failed" => {
-                let error = status_resp
-                    .get("error")
-                    .and_then(|v| v.as_str())
-                    .unwrap_or("unknown error");
-                return Err(format!("Research failed: {error}"));
-            }
-            _ => {
-                if poll % 10 == 9 {
-                    eprintln!("Still researching... ({:.0}s)", (poll + 1) as f64 * 3.0);
-                }
-            }
-        }
-    }
-
-    Err(format!(
-        "Research timed out after ~10 minutes. Check status: GET /v1/research/{job_id}"
-    ))
-}
-
 #[tokio::main]
 async fn main() {
     dotenvy::dotenv().ok();
@@ -2680,226 +277,3 @@ async fn main() {
         }
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use webclaw_core::Content;
-
-    fn empty_result(title: Option<&str>, url: Option<&str>, markdown: &str) -> ExtractionResult {
-        ExtractionResult {
-            metadata: Metadata {
-                title: title.map(str::to_string),
-                description: None,
-                author: None,
-                published_date: None,
-                language: None,
-                url: url.map(str::to_string),
-                site_name: None,
-                image: None,
-                favicon: None,
-                word_count: markdown.split_whitespace().count(),
-            },
-            content: Content {
-                markdown: markdown.to_string(),
-                plain_text: markdown.to_string(),
-                links: vec![],
-                images: vec![],
-                code_blocks: vec![],
-                raw_html: None,
-            },
-            domain_data: None,
-            structured_data: vec![],
-        }
-    }
-
-    #[test]
-    fn detect_empty_identifies_consent_redirect_url() {
-        let result = empty_result(
-            Some("Yahoo"),
-            Some("https://guce.advertising.com/collectIdentifiers?sessionId=abc"),
-            "Continue",
-        );
-        assert_eq!(detect_empty(&result), EmptyReason::ConsentWall);
-    }
-
-    #[test]
-    fn detect_empty_identifies_short_consent_title() {
-        let result = empty_result(
-            Some("Before you continue"),
-            Some("https://www.google.com/"),
-            "Review privacy options",
-        );
-        assert_eq!(detect_empty(&result), EmptyReason::ConsentWall);
-    }
-
-    #[test]
-    fn detect_empty_does_not_flag_real_content_with_consent_words() {
-        let result = empty_result(
-            Some("Cookie consent patterns explained"),
-            Some("https://example.com/blog"),
-            "This article explains cookie consent patterns for product teams with enough real body text to be useful. It covers consent banners, privacy controls, analytics configuration, regional requirements, product tradeoffs, implementation details, testing flows, debugging notes, accessibility needs, and operational lessons from real teams shipping public websites across multiple markets. It also explains measurement, rollout planning, copy review, support workflows, design constraints, release notes, and how to keep privacy choices understandable for users.",
-        );
-        assert_eq!(detect_empty(&result), EmptyReason::None);
-    }
-
-    #[test]
-    fn url_to_filename_root() {
-        assert_eq!(
-            url_to_filename("https://example.com/", &OutputFormat::Markdown),
-            "example_com/index.md"
-        );
-        assert_eq!(
-            url_to_filename("https://example.com", &OutputFormat::Markdown),
-            "example_com/index.md"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_path() {
-        assert_eq!(
-            url_to_filename("https://example.com/docs/api", &OutputFormat::Markdown),
-            "docs/api.md"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_trailing_slash() {
-        assert_eq!(
-            url_to_filename("https://example.com/docs/api/", &OutputFormat::Markdown),
-            "docs/api.md"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_nested_path() {
-        assert_eq!(
-            url_to_filename("https://example.com/blog/my-post", &OutputFormat::Markdown),
-            "blog/my-post.md"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_query_params() {
-        assert_eq!(
-            url_to_filename("https://example.com/p?id=123", &OutputFormat::Markdown),
-            "p_id_123.md"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_json_format() {
-        assert_eq!(
-            url_to_filename("https://example.com/docs/api", &OutputFormat::Json),
-            "docs/api.json"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_text_format() {
-        assert_eq!(
-            url_to_filename("https://example.com/docs/api", &OutputFormat::Text),
-            "docs/api.txt"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_llm_format() {
-        assert_eq!(
-            url_to_filename("https://example.com/docs/api", &OutputFormat::Llm),
-            "docs/api.md"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_html_format() {
-        assert_eq!(
-            url_to_filename("https://example.com/docs/api", &OutputFormat::Html),
-            "docs/api.html"
-        );
-    }
-
-    #[test]
-    fn url_to_filename_special_chars() {
-        // Spaces and special chars get replaced with underscores
-        assert_eq!(
-            url_to_filename(
-                "https://example.com/path%20with%20spaces",
-                &OutputFormat::Markdown
-            ),
-            "path_20with_20spaces.md"
-        );
-    }
-
-    #[test]
-    fn write_to_file_creates_dirs() {
-        let dir = std::env::temp_dir().join("webclaw_test_output_dir");
-        let _ = std::fs::remove_dir_all(&dir);
-        write_to_file(&dir, "nested/deep/file.md", "hello").unwrap();
-        let content = std::fs::read_to_string(dir.join("nested/deep/file.md")).unwrap();
-        assert_eq!(content, "hello");
-        let _ = std::fs::remove_dir_all(&dir);
-    }
-
-    #[test]
-    fn url_to_filename_strips_traversal_segments() {
-        // `..` / `.` / empty path segments must not survive into the path.
-        let out = url_to_filename(
-            "https://example.com/../../etc/passwd",
-            &OutputFormat::Markdown,
-        );
-        assert!(!out.contains(".."), "traversal leaked: {out}");
-        assert_eq!(out, "etc/passwd.md");
-        let out2 = url_to_filename("https://example.com/a/./b//c", &OutputFormat::Json);
-        assert_eq!(out2, "a/b/c.json");
-    }
-
-    #[test]
-    fn safe_relative_filename_rejects_escapes() {
-        assert!(safe_relative_filename("../escape.md").is_err());
-        assert!(safe_relative_filename("a/../../b.md").is_err());
-        assert!(safe_relative_filename("/etc/passwd").is_err());
-        assert!(safe_relative_filename("").is_err());
-        // Normal nested relative names stay allowed.
-        assert!(safe_relative_filename("nested/deep/file.md").is_ok());
-        assert!(safe_relative_filename("./ok.md").is_ok());
-    }
-
-    #[test]
-    fn write_to_file_refuses_traversal_filename() {
-        let dir = std::env::temp_dir().join("webclaw_test_traversal_dir");
-        let _ = std::fs::remove_dir_all(&dir);
-        // CSV-supplied `url,filename` traversal attempt.
-        let err = write_to_file(&dir, "../../tmp/webclaw_pwned.md", "x").unwrap_err();
-        assert!(err.contains("refusing"), "unexpected error: {err}");
-        assert!(
-            !std::path::Path::new("/tmp/webclaw_pwned.md").exists(),
-            "traversal write escaped the output dir"
-        );
-        let _ = std::fs::remove_dir_all(&dir);
-    }
-
-    #[test]
-    fn research_slug_truncation_is_char_safe() {
-        // Multibyte query: byte-slicing at 50 would panic mid-codepoint.
-        let query = "日本語".repeat(40); // 120 chars, 3 bytes each
-        let slug: String = query
-            .chars()
-            .map(|c| {
-                if c.is_alphanumeric() || c == ' ' {
-                    c
-                } else {
-                    ' '
-                }
-            })
-            .collect::<String>()
-            .split_whitespace()
-            .collect::<Vec<_>>()
-            .join("-")
-            .to_lowercase();
-        let slug: String = slug.chars().take(50).collect();
-        assert!(slug.chars().count() <= 50);
-        // Round-trips through formatting without panicking.
-        let _ = format!("research-{slug}.json");
-    }
-}
diff --git a/crates/webclaw-cli/src/output.rs b/crates/webclaw-cli/src/output.rs
new file mode 100644
index 0000000..69349ee
--- /dev/null
+++ b/crates/webclaw-cli/src/output.rs
@@ -0,0 +1,376 @@
+//! Output formatting and rendering for every CLI mode.
+//!
+//! `render_one` is the single source of truth for turning one
+//! `ExtractionResult` into a standalone document for a given format;
+//! `format_output` and `print_output` delegate to it. The crawl and batch
+//! printers own their iteration and separator framing and render pages inline.
+
+use webclaw_core::{ContentDiff, ExtractionResult, Metadata, to_llm_text};
+use webclaw_fetch::{BatchExtractResult, CrawlResult, PageResult, SitemapEntry};
+
+use crate::cli::OutputFormat;
+
+/// Return the text to emit for the `Html` output format.
+///
+/// This is `content.raw_html` when it is `Some`, and `content.markdown`
+/// otherwise.
+pub fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
+    let content = &result.content;
+    content.raw_html.as_deref().unwrap_or(&content.markdown)
+}
+
+/// Build the `---`-fenced YAML frontmatter for `meta`, newline-terminated.
+/// Strings are emitted as JSON strings so quotes or newlines stay escaped.
+pub fn format_frontmatter(meta: &Metadata) -> String {
+    let mut lines = vec!["---".to_string()];
+    if let Some(title) = &meta.title {
+        lines.push(format!("title: {}", serde_json::json!(title)));
+    }
+    if let Some(author) = &meta.author {
+        lines.push(format!("author: {}", serde_json::json!(author)));
+    }
+    if let Some(date) = &meta.published_date {
+        lines.push(format!("date: {}", serde_json::json!(date)));
+    }
+    if let Some(url) = &meta.url {
+        lines.push(format!("source: {}", serde_json::json!(url)));
+    }
+    if meta.word_count > 0 {
+        lines.push(format!("word_count: {}", meta.word_count));
+    }
+    lines.push("---".to_string());
+    lines.push(String::new()); // blank line after frontmatter
+    lines.join("\n")
+}
+
+/// Turn one `ExtractionResult` into a complete document in the requested
+/// format. For `Llm`, the source URL handed to `to_llm_text` is `metadata.url`.
+///
+/// `format_output` and `print_output` both delegate here; looping over pages
+/// and separator framing are left to the caller.
+pub fn render_one(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
+    match format {
+        OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
+        OutputFormat::Text => result.content.plain_text.clone(),
+        OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
+        OutputFormat::Html => raw_html_or_markdown(result).to_string(),
+        OutputFormat::Markdown => {
+            let mut doc = if show_metadata {
+                format_frontmatter(&result.metadata)
+            } else {
+                String::new()
+            };
+            doc.push_str(&result.content.markdown);
+            if !result.structured_data.is_empty() {
+                let data = serde_json::to_string_pretty(&result.structured_data);
+                doc.push_str("\n\n## Structured Data\n\n```json\n");
+                doc.push_str(&data.unwrap_or_default());
+                doc.push_str("\n```");
+            }
+            doc
+        }
+    }
+}
+
+/// Format an `ExtractionResult` as a string; forwards straight to `render_one`.
+pub fn format_output(
+    result: &ExtractionResult,
+    format: &OutputFormat,
+    show_metadata: bool,
+) -> String {
+    render_one(result, format, show_metadata)
+}
+
+pub fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) {
+    println!("{}", render_one(result, format, show_metadata)); // rendered page to stdout
+}
+
+/// Print a cloud API response to stdout in the requested format.
+///
+/// * `Json` pretty-prints the whole response.
+/// * `Markdown` prints `content.markdown`; if that is missing it prints a
+///   top-level `markdown` string, and failing that the pretty-printed
+///   response.
+/// * `Text`, `Llm` and `Html` print `content.plain_text`,
+///   `content.llm_text` and `content.raw_html` respectively, and behave
+///   like `Markdown` when that field is absent or not a string.
+pub fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
+    match format {
+        OutputFormat::Json => {
+            let json = serde_json::to_string_pretty(resp).expect("serialization failed");
+            println!("{json}");
+        }
+        OutputFormat::Markdown => {
+            // Cloud response has content.markdown
+            let nested = resp
+                .get("content")
+                .and_then(|c| c.get("markdown"))
+                .and_then(|m| m.as_str());
+            let top_level = resp.get("markdown").and_then(|m| m.as_str());
+            match nested.or(top_level) {
+                Some(md) => println!("{md}"),
+                None => print_cloud_output(resp, &OutputFormat::Json),
+            }
+        }
+        OutputFormat::Text => {
+            let Some(txt) = resp
+                .get("content")
+                .and_then(|c| c.get("plain_text"))
+                .and_then(|t| t.as_str())
+            else {
+                // Fallback to markdown or raw JSON
+                return print_cloud_output(resp, &OutputFormat::Markdown);
+            };
+            println!("{txt}");
+        }
+        OutputFormat::Llm => {
+            let Some(llm) = resp
+                .get("content")
+                .and_then(|c| c.get("llm_text"))
+                .and_then(|t| t.as_str())
+            else {
+                // No llm_text: use the markdown fallback chain
+                return print_cloud_output(resp, &OutputFormat::Markdown);
+            };
+            println!("{llm}");
+        }
+        OutputFormat::Html => {
+            let Some(html) = resp
+                .get("content")
+                .and_then(|c| c.get("raw_html"))
+                .and_then(|h| h.as_str())
+            else {
+                // No raw_html: use the markdown fallback chain
+                return print_cloud_output(resp, &OutputFormat::Markdown);
+            };
+            println!("{html}");
+        }
+    }
+}
+
+/// Print a content diff to stdout.
+///
+/// `Json` serializes the whole `ContentDiff`. Every other format (markdown,
+/// text, llm, html) prints the same human-readable summary: status, word
+/// count delta, then metadata changes, added links, removed links and the
+/// text diff, each section only when it has something to show.
+pub fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) {
+    if matches!(format, OutputFormat::Json) {
+        let json = serde_json::to_string_pretty(diff).expect("serialization failed");
+        println!("{json}");
+        return;
+    }
+
+    println!("Status: {:?}", diff.status);
+    println!("Word count delta: {:+}", diff.word_count_delta);
+
+    // Section headings are printed only for non-empty sections.
+    if !diff.metadata_changes.is_empty() {
+        println!("\nMetadata changes:");
+    }
+    for change in &diff.metadata_changes {
+        let before = change.old.as_deref().unwrap_or("(none)");
+        let after = change.new.as_deref().unwrap_or("(none)");
+        println!("  {}: {} -> {}", change.field, before, after);
+    }
+
+    if !diff.links_added.is_empty() {
+        println!("\nLinks added:");
+    }
+    for link in &diff.links_added {
+        println!("  + {} ({})", link.href, link.text);
+    }
+
+    if !diff.links_removed.is_empty() {
+        println!("\nLinks removed:");
+    }
+    for link in &diff.links_removed {
+        println!("  - {} ({})", link.href, link.text);
+    }
+
+    // The textual diff, when present, goes last.
+    if let Some(text_diff) = &diff.text_diff {
+        println!("\n{text_diff}");
+    }
+}
+
+/// Print a crawl result to stdout.
+///
+/// `Json` serializes the whole `CrawlResult`. Every other format walks
+/// `result.pages`, skips pages whose `extraction` is `None`, and prints a
+/// `---` separator, the page body and a blank line for each remaining page.
+/// `show_metadata` only affects `Markdown`, where it prepends frontmatter.
+pub fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata: bool) {
+    match format {
+        OutputFormat::Json => {
+            let json = serde_json::to_string_pretty(result).expect("serialization failed");
+            println!("{json}");
+        }
+        OutputFormat::Markdown => {
+            for page in &result.pages {
+                if let Some(doc) = &page.extraction {
+                    println!("---");
+                    println!("# Page: {}\n", page.url);
+                    if show_metadata {
+                        print!("{}", format_frontmatter(&doc.metadata));
+                    }
+                    println!("{}", doc.content.markdown);
+                    println!();
+                }
+            }
+        }
+        OutputFormat::Text => {
+            for page in &result.pages {
+                if let Some(doc) = &page.extraction {
+                    println!("---");
+                    println!("# Page: {}\n", page.url);
+                    println!("{}", doc.content.plain_text);
+                    println!();
+                }
+            }
+        }
+        OutputFormat::Llm => {
+            for page in &result.pages {
+                if let Some(doc) = &page.extraction {
+                    println!("---");
+                    println!("{}", to_llm_text(doc, Some(page.url.as_str())));
+                    println!();
+                }
+            }
+        }
+        OutputFormat::Html => {
+            for page in &result.pages {
+                if let Some(doc) = &page.extraction {
+                    println!("---");
+                    println!("<!-- Page: {} -->\n", page.url);
+                    println!("{}", raw_html_or_markdown(doc));
+                    println!();
+                }
+            }
+        }
+    }
+}
+
+/// Print batch extraction results to stdout.
+///
+/// `Json` emits one pretty-printed array holding a `{url, result}` or
+/// `{url, error}` object per entry, so failures stay in the document. Every
+/// other format prints each successful extraction as a `---` separator, its
+/// body (preceded by a URL header except for `Llm`) and a blank line, and
+/// reports each failure on stderr instead. `show_metadata` only affects
+/// `Markdown`, where it prepends frontmatter.
+pub fn print_batch_output(
+    results: &[BatchExtractResult],
+    format: &OutputFormat,
+    show_metadata: bool,
+) {
+    match format {
+        OutputFormat::Json => {
+            // Build a JSON array of {url, result?, error?} objects
+            let mut entries = Vec::with_capacity(results.len());
+            for item in results {
+                entries.push(match &item.result {
+                    Ok(page) => serde_json::json!({ "url": item.url, "result": page }),
+                    Err(e) => serde_json::json!({ "url": item.url, "error": e.to_string() }),
+                });
+            }
+            let json = serde_json::to_string_pretty(&entries).expect("serialization failed");
+            println!("{json}");
+        }
+        OutputFormat::Markdown => {
+            for item in results {
+                let page = match &item.result {
+                    Ok(page) => page,
+                    Err(e) => {
+                        eprintln!("error: {} -- {}", item.url, e);
+                        continue;
+                    }
+                };
+                println!("---");
+                println!("# {}\n", item.url);
+                if show_metadata {
+                    print!("{}", format_frontmatter(&page.metadata));
+                }
+                println!("{}", page.content.markdown);
+                println!();
+            }
+        }
+        OutputFormat::Text => {
+            for item in results {
+                let page = match &item.result {
+                    Ok(page) => page,
+                    Err(e) => {
+                        eprintln!("error: {} -- {}", item.url, e);
+                        continue;
+                    }
+                };
+                println!("---");
+                println!("# {}\n", item.url);
+                println!("{}", page.content.plain_text);
+                println!();
+            }
+        }
+        OutputFormat::Llm => {
+            for item in results {
+                let page = match &item.result {
+                    Ok(page) => page,
+                    Err(e) => {
+                        eprintln!("error: {} -- {}", item.url, e);
+                        continue;
+                    }
+                };
+                println!("---");
+                println!("{}", to_llm_text(page, Some(item.url.as_str())));
+                println!();
+            }
+        }
+        OutputFormat::Html => {
+            for item in results {
+                let page = match &item.result {
+                    Ok(page) => page,
+                    Err(e) => {
+                        eprintln!("error: {} -- {}", item.url, e);
+                        continue;
+                    }
+                };
+                println!("---");
+                println!("<!-- {} -->\n", item.url);
+                println!("{}", raw_html_or_markdown(page));
+                println!();
+            }
+        }
+    }
+}
+
+/// Print sitemap entries to stdout.
+///
+/// `Json` emits the whole slice as one pretty-printed JSON document; every
+/// other format prints one entry URL per line.
+pub fn print_map_output(entries: &[SitemapEntry], format: &OutputFormat) {
+    if matches!(format, OutputFormat::Json) {
+        let json = serde_json::to_string_pretty(entries).expect("serialization failed");
+        println!("{json}");
+        return;
+    }
+    // Non-JSON formats share the same plain listing.
+    for entry in entries {
+        println!("{}", entry.url);
+    }
+}
+
+/// Build the one-line progress message for a finished page, shaped as
+/// `[index/max_pages] STATUS url (Nms…)` plus a word count or the error.
+pub fn format_progress(page: &PageResult, index: usize, max_pages: usize) -> String {
+    let elapsed_ms = page.elapsed.as_millis();
+    let status = match &page.error {
+        Some(_) => "ERR",
+        None => "OK ",
+    };
+    let detail = match (&page.extraction, &page.error) {
+        (Some(extraction), _) => format!(", {} words", extraction.metadata.word_count),
+        (None, Some(err)) => format!(" ({err})"),
+        (None, None) => String::new(),
+    };
+    let url = &page.url;
+    format!("[{index}/{max_pages}] {status} {url} ({elapsed_ms}ms{detail})")
+}
diff --git a/crates/webclaw-cli/src/run.rs b/crates/webclaw-cli/src/run.rs
new file mode 100644
index 0000000..2305f64
--- /dev/null
+++ b/crates/webclaw-cli/src/run.rs
@@ -0,0 +1,1014 @@
+//! Async run handlers for every CLI mode: crawl, map, batch, watch, diff,
+//! brand, LLM extraction/summarization, and cloud research.
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use webclaw_core::{ChangeStatus, ExtractionOptions, ExtractionResult};
+use webclaw_fetch::{CrawlConfig, Crawler, FetchClient, PageResult};
+use webclaw_llm::LlmProvider;
+
+use crate::cli::{Cli, OutputFormat};
+use crate::fetch::{
+    EmptyReason, build_extraction_options, build_fetch_config, detect_empty,
+    enrich_html_with_stylesheets, fetch_and_extract, fetch_html, normalize_url, url_to_filename,
+    warn_empty, write_to_file,
+};
+use crate::output::{
+    format_output, format_progress, print_batch_output, print_crawl_output, print_diff_output,
+    print_map_output,
+};
+use crate::webhook::{fire_webhook, spawn_on_change};
+
+/// Crawl the site rooted at the first CLI URL and emit every extracted page.
+///
+/// Progress lines stream to stderr while the crawl runs. When `--crawl-state`
+/// is set, Ctrl+C raises the cancel flag handed to the crawler and the visited
+/// set plus remaining frontier are saved to that path; a crawl that ends
+/// without cancellation removes the state file. Pages are written under
+/// `--output-dir` when given, otherwise printed to stdout, and the configured
+/// webhook (if any) is fired at the end.
+///
+/// # Errors
+///
+/// Returns an error when no URL is supplied, when `--file`/`--stdin` is
+/// combined with `--crawl`, when the crawler cannot be built, when saving
+/// state or writing an output file fails, or when at least one page failed.
+pub async fn run_crawl(cli: &Cli) -> Result<(), String> {
+    let url = cli
+        .urls
+        .first()
+        .ok_or("--crawl requires a URL argument")
+        .map(|u| normalize_url(u))?;
+    let url = url.as_str();
+
+    if cli.file.is_some() || cli.stdin {
+        return Err("--crawl cannot be used with --file or --stdin".into());
+    }
+
+    let include_patterns: Vec<String> = cli
+        .include_paths
+        .as_deref()
+        .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+        .unwrap_or_default();
+    let exclude_patterns: Vec<String> = cli
+        .exclude_paths
+        .as_deref()
+        .map(|s| s.split(',').map(|p| p.trim().to_string()).collect())
+        .unwrap_or_default();
+
+    // Set up streaming progress channel
+    let (progress_tx, mut progress_rx) = tokio::sync::broadcast::channel::<PageResult>(100);
+
+    // Set up cancel flag for Ctrl+C handling
+    let cancel_flag = Arc::new(AtomicBool::new(false));
+
+    // Register Ctrl+C handler when --crawl-state is set
+    let state_path = cli.crawl_state.clone();
+    if state_path.is_some() {
+        let flag = Arc::clone(&cancel_flag);
+        tokio::spawn(async move {
+            tokio::signal::ctrl_c().await.ok();
+            flag.store(true, Ordering::Relaxed);
+            eprintln!("\nCtrl+C received, saving crawl state...");
+        });
+    }
+
+    let config = CrawlConfig {
+        fetch: build_fetch_config(cli),
+        max_depth: cli.depth,
+        max_pages: cli.max_pages,
+        concurrency: cli.concurrency,
+        delay: std::time::Duration::from_millis(cli.delay),
+        path_prefix: cli.path_prefix.clone(),
+        use_sitemap: cli.sitemap,
+        include_patterns,
+        exclude_patterns,
+        progress_tx: Some(progress_tx),
+        cancel_flag: Some(Arc::clone(&cancel_flag)),
+        allow_subdomains: false,
+        allow_external_links: false,
+    };
+
+    // Load resume state if --crawl-state file exists
+    let resume_state = state_path
+        .as_ref()
+        .and_then(|p| Crawler::load_state(p))
+        .inspect(|s| {
+            eprintln!(
+                "Resuming crawl: {} pages already visited, {} URLs in frontier",
+                s.visited.len(),
+                s.frontier.len(),
+            );
+        });
+
+    let max_pages = cli.max_pages;
+    let completed_offset = resume_state.as_ref().map_or(0, |s| s.completed_pages);
+
+    // Spawn background task to print streaming progress to stderr
+    let progress_handle = tokio::spawn(async move {
+        use tokio::sync::broadcast::error::RecvError;
+
+        let mut count = completed_offset;
+        loop {
+            match progress_rx.recv().await {
+                Ok(page) => {
+                    count += 1;
+                    eprintln!("{}", format_progress(&page, count, max_pages));
+                }
+                // The channel is bounded: if this task falls behind, the
+                // oldest messages are dropped and `recv` reports how many.
+                // Keep the counter in step and keep listening instead of
+                // silently ending progress output for the rest of the crawl.
+                Err(RecvError::Lagged(skipped)) => {
+                    count = count.saturating_add(skipped as usize);
+                }
+                // Every sender is gone: the crawl has finished.
+                Err(RecvError::Closed) => break,
+            }
+        }
+    });
+
+    let crawler = Crawler::new(url, config).map_err(|e| format!("crawler error: {e}"))?;
+    let result = crawler.crawl(url, resume_state).await;
+
+    // Drop the crawler (and its progress_tx clone) so the progress task finishes
+    drop(crawler);
+    let _ = progress_handle.await;
+
+    // If cancelled via Ctrl+C and --crawl-state is set, save state for resume
+    let was_cancelled = cancel_flag.load(Ordering::Relaxed);
+    if was_cancelled {
+        if let Some(ref path) = state_path {
+            let completed_pages = completed_offset + result.pages.len();
+            Crawler::save_state(
+                path,
+                url,
+                &result.visited,
+                &result.remaining_frontier,
+                completed_pages,
+                cli.max_pages,
+                cli.depth,
+            )?;
+            eprintln!(
+                "Crawl state saved to {} ({} pages completed). Resume with --crawl-state {}",
+                path.display(),
+                completed_pages,
+                path.display(),
+            );
+        }
+    } else if let Some(ref path) = state_path {
+        // Crawl completed normally — clean up state file
+        if path.exists() {
+            let _ = std::fs::remove_file(path);
+        }
+    }
+
+    // Log per-page errors and extraction warnings to stderr
+    for page in &result.pages {
+        if let Some(ref err) = page.error {
+            eprintln!("error: {} -- {}", page.url, err);
+        } else if let Some(ref extraction) = page.extraction {
+            let reason = detect_empty(extraction);
+            if !matches!(reason, EmptyReason::None) {
+                warn_empty(&page.url, &reason);
+            }
+        }
+    }
+
+    if let Some(ref dir) = cli.output_dir {
+        let mut saved = 0usize;
+        for page in &result.pages {
+            if let Some(ref extraction) = page.extraction {
+                let filename = url_to_filename(&page.url, &cli.format);
+                let content = format_output(extraction, &cli.format, cli.metadata);
+                write_to_file(dir, &filename, &content)?;
+                saved += 1;
+            }
+        }
+        eprintln!("Saved {saved} files to {}", dir.display());
+    } else {
+        print_crawl_output(&result, &cli.format, cli.metadata);
+    }
+
+    eprintln!(
+        "Crawled {} pages ({} ok, {} errors) in {:.1}s",
+        result.total, result.ok, result.errors, result.elapsed_secs,
+    );
+
+    // Fire webhook on crawl complete
+    if let Some(ref webhook_url) = cli.webhook {
+        let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
+        fire_webhook(
+            webhook_url,
+            &serde_json::json!({
+                "event": "crawl_complete",
+                "total": result.total,
+                "ok": result.ok,
+                "errors": result.errors,
+                "elapsed_secs": result.elapsed_secs,
+                "urls": urls,
+            }),
+        );
+        // Brief pause so the async webhook has time to fire
+        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+    }
+
+    if result.errors > 0 {
+        Err(format!(
+            "{} of {} pages failed",
+            result.errors, result.total
+        ))
+    } else {
+        Ok(())
+    }
+}
+
+/// Discover sitemap URLs for the first CLI URL and print them.
+///
+/// The count (or a "none found" note) goes to stderr; the entries themselves
+/// are printed in the requested output format.
+///
+/// # Errors
+///
+/// Fails when no URL is supplied, when the fetch client cannot be built, or
+/// when sitemap discovery reports an error.
+pub async fn run_map(cli: &Cli) -> Result<(), String> {
+    let Some(raw_url) = cli.urls.first() else {
+        return Err("--map requires a URL argument".into());
+    };
+    let target = normalize_url(raw_url);
+    let url = target.as_str();
+
+    let fetch_client = match FetchClient::new(build_fetch_config(cli)) {
+        Ok(built) => built,
+        Err(e) => return Err(format!("client error: {e}")),
+    };
+
+    let entries = match webclaw_fetch::sitemap::discover(&fetch_client, url).await {
+        Ok(found) => found,
+        Err(e) => return Err(format!("sitemap discovery failed: {e}")),
+    };
+
+    if !entries.is_empty() {
+        eprintln!("discovered {} URLs", entries.len());
+    } else {
+        eprintln!("no sitemap URLs found for {url}");
+    }
+
+    print_map_output(&entries, &cli.format);
+    Ok(())
+}
+
+/// Fetch and extract a batch of URLs, then save or print every result.
+///
+/// `entries` pairs each URL with an optional custom output filename that is
+/// used instead of the URL-derived name when `--output-dir` is set. Fetch
+/// errors and empty-extraction warnings are logged to stderr, and the
+/// configured webhook (if any) is fired once the batch is done.
+///
+/// # Errors
+///
+/// Fails when the fetch client cannot be built, when writing an output file
+/// fails, or when at least one URL failed.
+pub async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
+    let fetch_client =
+        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    let client = Arc::new(fetch_client);
+
+    let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect();
+    let options = build_extraction_options(cli);
+    let results = client
+        .fetch_and_extract_batch_with_options(&urls, cli.concurrency, &options)
+        .await;
+
+    let total = results.len();
+    let ok = results.iter().filter(|r| r.result.is_ok()).count();
+    let errors = total - ok;
+
+    // Report failures and suspiciously empty extractions on stderr.
+    for r in &results {
+        match &r.result {
+            Err(e) => eprintln!("error: {} -- {}", r.url, e),
+            Ok(extraction) => {
+                let reason = detect_empty(extraction);
+                if !matches!(reason, EmptyReason::None) {
+                    warn_empty(&r.url, &reason);
+                }
+            }
+        }
+    }
+
+    // URL -> caller-supplied filename, only for entries that have one.
+    let custom_names: std::collections::HashMap<&str, &str> = entries
+        .iter()
+        .filter_map(|(url, name)| Some((url.as_str(), name.as_deref()?)))
+        .collect();
+
+    match &cli.output_dir {
+        Some(dir) => {
+            let mut saved = 0usize;
+            for r in &results {
+                let Ok(extraction) = &r.result else {
+                    continue;
+                };
+                let filename = match custom_names.get(r.url.as_str()) {
+                    Some(name) => name.to_string(),
+                    None => url_to_filename(&r.url, &cli.format),
+                };
+                let content = format_output(extraction, &cli.format, cli.metadata);
+                write_to_file(dir, &filename, &content)?;
+                saved += 1;
+            }
+            eprintln!("Saved {saved} files to {}", dir.display());
+        }
+        None => print_batch_output(&results, &cli.format, cli.metadata),
+    }
+
+    eprintln!("Fetched {} URLs ({} ok, {} errors)", total, ok, errors);
+
+    // Notify the webhook once the whole batch is done.
+    if let Some(webhook_url) = &cli.webhook {
+        let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
+        let payload = serde_json::json!({
+            "event": "batch_complete",
+            "total": total,
+            "ok": ok,
+            "errors": errors,
+            "urls": urls,
+        });
+        fire_webhook(webhook_url, &payload);
+        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+    }
+
+    if errors == 0 {
+        return Ok(());
+    }
+    Err(format!("{errors} of {} URLs failed", total))
+}
+
+/// Current time of day formatted as `HH:MM:SS`.
+///
+/// Derived from the seconds elapsed since the Unix epoch, so the value is in
+/// UTC. A system clock set before the epoch yields `00:00:00`.
+fn timestamp() -> String {
+    let since_epoch = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_default();
+    let secs_of_day = since_epoch.as_secs() % 86400;
+    let (hours, minutes, seconds) = (
+        secs_of_day / 3600,
+        (secs_of_day % 3600) / 60,
+        secs_of_day % 60,
+    );
+    format!("{hours:02}:{minutes:02}:{seconds:02}")
+}
+
+/// Entry point for `--watch`: poll one or more URLs for content changes.
+///
+/// Builds the shared fetch client and extraction options, installs a Ctrl+C
+/// task that raises a shared cancel flag, and then dispatches to the
+/// single-URL or multi-URL loop depending on how many URLs were given.
+///
+/// # Errors
+///
+/// Fails when `urls` is empty, when the fetch client cannot be built, or when
+/// the selected watch loop returns an error.
+pub async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
+    if urls.is_empty() {
+        return Err("--watch requires at least one URL".into());
+    }
+
+    let fetch_client =
+        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    let client = Arc::new(fetch_client);
+    let options = build_extraction_options(cli);
+
+    // Ctrl+C only raises the flag; the watch loops poll it.
+    let cancelled = Arc::new(AtomicBool::new(false));
+    {
+        let flag = Arc::clone(&cancelled);
+        tokio::spawn(async move {
+            tokio::signal::ctrl_c().await.ok();
+            flag.store(true, Ordering::Relaxed);
+        });
+    }
+
+    match urls {
+        // Exactly one URL: keep the original single-URL behavior.
+        [only] => run_watch_single(cli, &client, &options, only, &cancelled).await,
+        // Several URLs: batch fetch, diff each, report the aggregate.
+        _ => run_watch_multi(cli, &client, &options, urls, &cancelled).await,
+    }
+}
+
+/// Original single-URL watch loop -- backward compatible.
+///
+/// Takes one baseline snapshot of `url`, then re-fetches it every
+/// `cli.watch_interval` seconds (minimum 1s) and diffs it against the stored
+/// snapshot, which is replaced whenever a change is detected. On a change it
+/// prints the diff, runs `--on-change`, and fires the webhook. A failed
+/// re-fetch is logged and retried on the next tick.
+///
+/// Returns `Ok(())` once the `cancelled` flag is observed. The wait between
+/// checks polls that flag, so Ctrl+C stops the loop promptly rather than
+/// after the remainder of the interval.
+///
+/// # Errors
+///
+/// Fails only when the initial fetch fails.
+async fn run_watch_single(
+    cli: &Cli,
+    client: &Arc<FetchClient>,
+    options: &ExtractionOptions,
+    url: &str,
+    cancelled: &Arc<AtomicBool>,
+) -> Result<(), String> {
+    // How often the cancel flag is re-checked while waiting between fetches.
+    const CANCEL_POLL: std::time::Duration = std::time::Duration::from_millis(200);
+
+    // Clamp to >=1s: `--watch-interval 0` would otherwise spin the
+    // fetch loop with zero delay and hammer the target.
+    let interval = std::time::Duration::from_secs(cli.watch_interval.max(1));
+
+    let mut previous = client
+        .fetch_and_extract_with_options(url, options)
+        .await
+        .map_err(|e| format!("initial fetch failed: {e}"))?;
+
+    eprintln!(
+        "[watch] Initial snapshot: {url} ({} words)",
+        previous.metadata.word_count
+    );
+
+    loop {
+        // Wait out the interval in short slices. The Ctrl+C task only sets a
+        // flag, so one long sleep would leave the process unresponsive to
+        // Ctrl+C until the whole interval had elapsed.
+        let wait_started = std::time::Instant::now();
+        while !cancelled.load(Ordering::Relaxed) {
+            let remaining = interval.saturating_sub(wait_started.elapsed());
+            if remaining.is_zero() {
+                break;
+            }
+            tokio::time::sleep(remaining.min(CANCEL_POLL)).await;
+        }
+
+        if cancelled.load(Ordering::Relaxed) {
+            eprintln!("[watch] Stopped");
+            break;
+        }
+
+        let current = match client.fetch_and_extract_with_options(url, options).await {
+            Ok(result) => result,
+            Err(e) => {
+                // Transient failure: keep the old snapshot and try again later.
+                eprintln!("[watch] Fetch error ({}): {e}", timestamp());
+                continue;
+            }
+        };
+
+        let diff = webclaw_core::diff::diff(&previous, &current);
+
+        if diff.status == ChangeStatus::Same {
+            eprintln!("[watch] No changes ({})", timestamp());
+        } else {
+            print_diff_output(&diff, &cli.format);
+            eprintln!("[watch] Changes detected! ({})", timestamp());
+
+            if let Some(ref cmd) = cli.on_change {
+                let diff_json = serde_json::to_string(&diff).unwrap_or_default();
+                spawn_on_change(cmd, diff_json.as_bytes()).await;
+            }
+
+            if let Some(ref webhook_url) = cli.webhook {
+                fire_webhook(
+                    webhook_url,
+                    &serde_json::json!({
+                        "event": "watch_change",
+                        "url": url,
+                        "status": format!("{:?}", diff.status),
+                        "word_count_delta": diff.word_count_delta,
+                        "metadata_changes": diff.metadata_changes.len(),
+                        "links_added": diff.links_added.len(),
+                        "links_removed": diff.links_removed.len(),
+                    }),
+                );
+            }
+
+            // The changed page becomes the baseline for the next comparison.
+            previous = current;
+        }
+    }
+
+    Ok(())
+}
+
+/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
+///
+/// Snapshots every URL once, then re-fetches the whole set every
+/// `cli.watch_interval` seconds (minimum 1s). Each pass diffs every page
+/// against its stored snapshot, logs a summary line to stderr, and -- when at
+/// least one page changed -- runs `--on-change` and fires the webhook once
+/// with an aggregate payload. A URL whose initial fetch failed gets its
+/// baseline from its first successful fetch and is counted as unchanged for
+/// that pass.
+///
+/// Returns `Ok(())` once the `cancelled` flag is observed. The wait between
+/// passes polls that flag, so Ctrl+C stops the loop promptly rather than
+/// after the remainder of the interval.
+async fn run_watch_multi(
+    cli: &Cli,
+    client: &Arc<FetchClient>,
+    options: &ExtractionOptions,
+    urls: &[String],
+    cancelled: &Arc<AtomicBool>,
+) -> Result<(), String> {
+    // How often the cancel flag is re-checked while waiting between passes.
+    const CANCEL_POLL: std::time::Duration = std::time::Duration::from_millis(200);
+
+    // Clamp to >=1s: `--watch-interval 0` would otherwise spin the
+    // fetch loop with zero delay and hammer the target.
+    let interval_secs = cli.watch_interval.max(1);
+    let interval = std::time::Duration::from_secs(interval_secs);
+
+    let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect();
+
+    // Initial pass: fetch all URLs in parallel
+    let initial_results = client
+        .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+        .await;
+
+    let mut snapshots = std::collections::HashMap::new();
+    let mut ok_count = 0usize;
+    let mut err_count = 0usize;
+
+    for r in initial_results {
+        match r.result {
+            Ok(extraction) => {
+                snapshots.insert(r.url, extraction);
+                ok_count += 1;
+            }
+            Err(e) => {
+                eprintln!("[watch] Initial fetch error: {} -- {e}", r.url);
+                err_count += 1;
+            }
+        }
+    }
+
+    // Report the interval actually used, not the raw (possibly zero) flag.
+    eprintln!(
+        "[watch] Watching {} URLs (interval: {}s)",
+        urls.len(),
+        interval_secs
+    );
+    eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors");
+
+    let mut check_number = 0u64;
+
+    loop {
+        // Wait out the interval in short slices. The Ctrl+C task only sets a
+        // flag, so one long sleep would leave the process unresponsive to
+        // Ctrl+C until the whole interval had elapsed.
+        let wait_started = std::time::Instant::now();
+        while !cancelled.load(Ordering::Relaxed) {
+            let remaining = interval.saturating_sub(wait_started.elapsed());
+            if remaining.is_zero() {
+                break;
+            }
+            tokio::time::sleep(remaining.min(CANCEL_POLL)).await;
+        }
+
+        if cancelled.load(Ordering::Relaxed) {
+            eprintln!("[watch] Stopped");
+            break;
+        }
+
+        check_number += 1;
+
+        let current_results = client
+            .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+            .await;
+
+        let mut changed: Vec<serde_json::Value> = Vec::new();
+        let mut same_count = 0usize;
+        let mut fetch_errors = 0usize;
+
+        for r in current_results {
+            match r.result {
+                Ok(current) => {
+                    if let Some(previous) = snapshots.get(&r.url) {
+                        let diff = webclaw_core::diff::diff(previous, &current);
+                        if diff.status == ChangeStatus::Same {
+                            same_count += 1;
+                        } else {
+                            changed.push(serde_json::json!({
+                                "url": r.url,
+                                "word_count_delta": diff.word_count_delta,
+                            }));
+                            snapshots.insert(r.url, current);
+                        }
+                    } else {
+                        // URL failed initially, first successful fetch -- store as baseline
+                        snapshots.insert(r.url, current);
+                        same_count += 1;
+                    }
+                }
+                Err(e) => {
+                    eprintln!("[watch] Fetch error: {} -- {e}", r.url);
+                    fetch_errors += 1;
+                }
+            }
+        }
+
+        let ts = timestamp();
+        let err_suffix = if fetch_errors > 0 {
+            format!(", {fetch_errors} errors")
+        } else {
+            String::new()
+        };
+
+        if changed.is_empty() {
+            eprintln!(
+                "[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}"
+            );
+            continue;
+        }
+
+        eprintln!(
+            "[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}",
+            changed.len(),
+        );
+        for entry in &changed {
+            let url = entry["url"].as_str().unwrap_or("?");
+            let delta = entry["word_count_delta"].as_i64().unwrap_or(0);
+            eprintln!("  -> {url} (word delta: {delta:+})");
+        }
+
+        // One aggregate payload, shared by --on-change and the webhook.
+        let payload = serde_json::json!({
+            "event": "watch_changes",
+            "check_number": check_number,
+            "total_urls": urls.len(),
+            "changed": changed.len(),
+            "same": same_count,
+            "changes": changed,
+        });
+
+        // Fire --on-change once with all changes
+        if let Some(ref cmd) = cli.on_change {
+            let payload_json = serde_json::to_string(&payload).unwrap_or_default();
+            spawn_on_change(cmd, payload_json.as_bytes()).await;
+        }
+
+        // Fire webhook once with aggregate payload
+        if let Some(ref webhook_url) = cli.webhook {
+            fire_webhook(webhook_url, &payload);
+        }
+    }
+
+    Ok(())
+}
+
+/// Diff the current extraction of the CLI source against a saved snapshot.
+///
+/// `snapshot_path` must point at a JSON-serialized `ExtractionResult`. The
+/// resulting diff is printed in the requested output format.
+///
+/// # Errors
+///
+/// Fails when the snapshot cannot be read or parsed, or when fetching or
+/// extracting the current version fails.
+pub async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
+    // Baseline: the previously saved extraction.
+    let raw = match std::fs::read_to_string(snapshot_path) {
+        Ok(text) => text,
+        Err(e) => return Err(format!("failed to read snapshot {snapshot_path}: {e}")),
+    };
+    let baseline: ExtractionResult = match serde_json::from_str(&raw) {
+        Ok(parsed) => parsed,
+        Err(e) => return Err(format!("failed to parse snapshot JSON: {e}")),
+    };
+
+    // Current version (handles PDF detection for URLs).
+    let fetched = fetch_and_extract(cli).await?;
+    let latest = fetched.into_extraction()?;
+
+    let changes = webclaw_core::diff::diff(&baseline, &latest);
+    print_diff_output(&changes, &cli.format);
+
+    Ok(())
+}
+
+/// Fetch the CLI source, extract its brand information, and print it as
+/// pretty-printed JSON.
+///
+/// The fetched HTML is first passed through `enrich_html_with_stylesheets`.
+/// The page URL is forwarded to the extractor only when it is non-empty.
+///
+/// # Errors
+///
+/// Fails when fetching the HTML fails.
+pub async fn run_brand(cli: &Cli) -> Result<(), String> {
+    let page = fetch_html(cli).await?;
+    let enriched = enrich_html_with_stylesheets(&page.html, &page.url).await;
+
+    let page_url = page.url.as_str();
+    let base_url = if page_url.is_empty() {
+        None
+    } else {
+        Some(page_url)
+    };
+
+    let brand = webclaw_core::brand::extract_brand(&enriched, base_url);
+    let rendered = serde_json::to_string_pretty(&brand).expect("serialization failed");
+    println!("{}", rendered);
+    Ok(())
+}
+
+/// Resolve the LLM provider to use for this invocation.
+///
+/// With `--llm-provider` set, builds exactly that provider (`ollama`,
+/// `openai`, or `anthropic`), passing along the CLI base URL and model.
+/// Without it, falls back to the default provider chain.
+///
+/// # Errors
+///
+/// Fails when the named provider is unknown, when Ollama reports itself
+/// unavailable, when the OpenAI/Anthropic provider cannot be constructed, or
+/// when the default chain is empty.
+async fn build_llm_provider(cli: &Cli) -> Result<Box<dyn LlmProvider>, String> {
+    // No explicit provider requested: use whatever the default chain offers.
+    let Some(name) = &cli.llm_provider else {
+        let chain = webclaw_llm::ProviderChain::default().await;
+        if chain.is_empty() {
+            return Err(
+                "no LLM providers available -- start Ollama or set OPENAI_API_KEY / ANTHROPIC_API_KEY"
+                    .into(),
+            );
+        }
+        return Ok(Box::new(chain));
+    };
+
+    let provider: Box<dyn LlmProvider> = match name.as_str() {
+        "ollama" => {
+            let ollama = webclaw_llm::providers::ollama::OllamaProvider::new(
+                cli.llm_base_url.clone(),
+                cli.llm_model.clone(),
+            );
+            if !ollama.is_available().await {
+                return Err("ollama is not running or unreachable".into());
+            }
+            Box::new(ollama)
+        }
+        "openai" => {
+            let openai = webclaw_llm::providers::openai::OpenAiProvider::new(
+                None,
+                cli.llm_base_url.clone(),
+                cli.llm_model.clone(),
+            )
+            .ok_or("OPENAI_API_KEY not set")?;
+            Box::new(openai)
+        }
+        "anthropic" => {
+            let anthropic = webclaw_llm::providers::anthropic::AnthropicProvider::with_base_url(
+                None,
+                cli.llm_base_url.clone(),
+                cli.llm_model.clone(),
+            )
+            .ok_or("ANTHROPIC_API_KEY not set")?;
+            Box::new(anthropic)
+        }
+        other => {
+            return Err(format!(
+                "unknown LLM provider: {other} (use ollama, openai, or anthropic)"
+            ));
+        }
+    };
+
+    Ok(provider)
+}
+
+/// Run a single LLM operation over the content extracted from the CLI source.
+///
+/// At most one operation runs, checked in this order: `extract_json` (a JSON
+/// schema given inline or as `@path`), `extract_prompt`, then `summarize`.
+/// The result is printed to stdout. With none of the three set, the function
+/// still extracts the content and builds the provider, then returns `Ok(())`
+/// without printing anything.
+///
+/// # Errors
+///
+/// Fails when extraction fails, when no provider can be built, when the
+/// schema cannot be read or parsed, or when the LLM call fails.
+pub async fn run_llm(cli: &Cli) -> Result<(), String> {
+    // Extract content from the source first (handles PDF detection for URLs).
+    let page = fetch_and_extract(cli).await?.into_extraction()?;
+
+    let provider = build_llm_provider(cli).await?;
+    let model = cli.llm_model.as_deref();
+    let text = &page.content.plain_text;
+
+    if let Some(schema_input) = &cli.extract_json {
+        // A leading `@` means "load the schema from this file".
+        let schema_str = match schema_input.strip_prefix('@') {
+            Some(path) => std::fs::read_to_string(path)
+                .map_err(|e| format!("failed to read schema file {path}: {e}"))?,
+            None => schema_input.clone(),
+        };
+
+        let schema: serde_json::Value =
+            serde_json::from_str(&schema_str).map_err(|e| format!("invalid JSON schema: {e}"))?;
+
+        let extracted =
+            webclaw_llm::extract::extract_json(text, &schema, provider.as_ref(), model)
+                .await
+                .map_err(|e| format!("LLM extraction failed: {e}"))?;
+
+        let rendered = serde_json::to_string_pretty(&extracted).expect("serialization failed");
+        println!("{}", rendered);
+        return Ok(());
+    }
+
+    if let Some(prompt) = &cli.extract_prompt {
+        let extracted =
+            webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
+                .await
+                .map_err(|e| format!("LLM extraction failed: {e}"))?;
+
+        let rendered = serde_json::to_string_pretty(&extracted).expect("serialization failed");
+        println!("{}", rendered);
+        return Ok(());
+    }
+
+    if let Some(sentences) = cli.summarize {
+        let summary =
+            webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
+                .await
+                .map_err(|e| format!("LLM summarization failed: {e}"))?;
+
+        println!("{summary}");
+    }
+
+    Ok(())
+}
+
+/// Batch LLM run: fetch each URL, apply the requested LLM operation to its
+/// extracted text, and save or print the result.
+///
+/// URLs are handled one after another to respect LLM provider rate limits.
+/// `entries` pairs each URL with an optional custom output filename used when
+/// `--output-dir` is set. Per-URL progress goes to stderr, and the configured
+/// webhook (if any) is fired once at the end.
+///
+/// # Errors
+///
+/// Fails when the client or provider cannot be built, when the schema cannot
+/// be read or parsed, when writing an output file fails, or when at least one
+/// URL failed to fetch or to process.
+///
+/// # Panics
+///
+/// Panics if `entries` is non-empty and none of `extract_json`,
+/// `extract_prompt`, or `summarize` is set.
+pub async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
+    let client =
+        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    let options = build_extraction_options(cli);
+    let provider = build_llm_provider(cli).await?;
+    let model = cli.llm_model.as_deref();
+
+    // Parse the schema once up front when --extract-json is used.
+    let schema = match &cli.extract_json {
+        None => None,
+        Some(schema_input) => {
+            let schema_str = match schema_input.strip_prefix('@') {
+                Some(path) => std::fs::read_to_string(path)
+                    .map_err(|e| format!("failed to read schema file {path}: {e}"))?,
+                None => schema_input.clone(),
+            };
+            let parsed = serde_json::from_str::<serde_json::Value>(&schema_str)
+                .map_err(|e| format!("invalid JSON schema: {e}"))?;
+            Some(parsed)
+        }
+    };
+
+    // URL -> caller-supplied filename, only for entries that have one.
+    let custom_names: std::collections::HashMap<&str, &str> = entries
+        .iter()
+        .filter_map(|(url, name)| Some((url.as_str(), name.as_deref()?)))
+        .collect();
+
+    let total = entries.len();
+    let mut ok = 0usize;
+    let mut errors = 0usize;
+    let mut all_results: Vec<serde_json::Value> = Vec::with_capacity(total);
+
+    for (position, (url, _)) in entries.iter().enumerate() {
+        let idx = position + 1;
+        eprint!("[{idx}/{total}] {url} ");
+
+        // Step 1: fetch the page and extract its content.
+        let extraction = match client.fetch_and_extract_with_options(url, &options).await {
+            Ok(page) => page,
+            Err(e) => {
+                errors += 1;
+                let msg = format!("fetch failed: {e}");
+                eprintln!("-> error: {msg}");
+                all_results.push(serde_json::json!({ "url": url, "error": msg }));
+                continue;
+            }
+        };
+
+        let text = &extraction.content.plain_text;
+
+        // Step 2: run whichever LLM operation was requested. Schema wins over
+        // prompt, prompt wins over summarize.
+        let llm_result = match (&schema, &cli.extract_prompt, cli.summarize) {
+            (Some(json_schema), _, _) => {
+                webclaw_llm::extract::extract_json(text, json_schema, provider.as_ref(), model)
+                    .await
+                    .map(LlmOutput::Json)
+            }
+            (None, Some(prompt), _) => {
+                webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
+                    .await
+                    .map(LlmOutput::Json)
+            }
+            (None, None, Some(sentences)) => {
+                webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
+                    .await
+                    .map(LlmOutput::Text)
+            }
+            (None, None, None) => unreachable!("run_batch_llm called without LLM flags"),
+        };
+
+        // Step 3: record the outcome.
+        let output = match llm_result {
+            Ok(output) => output,
+            Err(e) => {
+                errors += 1;
+                let msg = format!("LLM extraction failed: {e}");
+                eprintln!("-> error: {msg}");
+                all_results.push(serde_json::json!({ "url": url, "error": msg }));
+                continue;
+            }
+        };
+        ok += 1;
+
+        // Rendered text, JSON record, and a short size note for the progress line.
+        let (output_str, result_json, detail) = match &output {
+            LlmOutput::Json(value) => {
+                let rendered = serde_json::to_string_pretty(value).expect("serialization failed");
+                let record = serde_json::json!({ "url": url, "result": value });
+                let size = match value {
+                    serde_json::Value::Object(map) => format!("{} fields", map.len()),
+                    serde_json::Value::Array(items) => format!("{} items", items.len()),
+                    _ => "done".to_string(),
+                };
+                (rendered, record, size)
+            }
+            LlmOutput::Text(summary) => {
+                let record = serde_json::json!({ "url": url, "result": summary });
+                let words = summary.split_whitespace().count();
+                (summary.clone(), record, format!("{words} words"))
+            }
+        };
+        eprintln!("-> extracted {detail}");
+
+        match &cli.output_dir {
+            Some(dir) => {
+                let filename = match custom_names.get(url.as_str()) {
+                    Some(name) => name.to_string(),
+                    None => url_to_filename(url, &OutputFormat::Json),
+                };
+                write_to_file(dir, &filename, &output_str)?;
+            }
+            None => {
+                println!("--- {url}");
+                println!("{output_str}");
+                println!();
+            }
+        }
+
+        all_results.push(result_json);
+    }
+
+    eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
+
+    if let Some(webhook_url) = &cli.webhook {
+        let payload = serde_json::json!({
+            "event": "batch_llm_complete",
+            "total": total,
+            "ok": ok,
+            "errors": errors,
+        });
+        fire_webhook(webhook_url, &payload);
+        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+    }
+
+    if errors == 0 {
+        return Ok(());
+    }
+    Err(format!("{errors} of {total} URLs failed"))
+}
+
+/// Intermediate type to hold LLM output before formatting.
+///
+/// Private to this module: `run_batch_llm` wraps each provider response in it
+/// so structured and plain-text results can be rendered by the same code.
+enum LlmOutput {
+    /// Structured value; `run_batch_llm` uses it for `extract_json` and
+    /// `extract_with_prompt` results.
+    Json(serde_json::Value),
+    /// Free-form text; `run_batch_llm` uses it for `summarize` results.
+    Text(String),
+}
+
+/// Report whether the invocation requested any LLM operation, i.e. whether
+/// `extract_json`, `extract_prompt`, or `summarize` is set.
+pub fn has_llm_flags(cli: &Cli) -> bool {
+    let wants_json = cli.extract_json.is_some();
+    let wants_prompt = cli.extract_prompt.is_some();
+    let wants_summary = cli.summarize.is_some();
+    wants_json || wants_prompt || wants_summary
+}
+
+/// Run a cloud research job for `query`, polling until it finishes.
+///
+/// Polls every 3 s for up to 200 rounds. On completion the full response is
+/// saved to `research-<slug>.json`, a summary goes to stderr, and the report
+/// goes to stdout. The report is printed even when saving fails; the write
+/// error is still returned. Any failure is reported as an `Err` message.
+pub async fn run_research(cli: &Cli, query: &str) -> Result<(), String> {
+    let api_key = cli
+        .api_key
+        .as_deref()
+        .ok_or("--research requires WEBCLAW_API_KEY (set via env or --api-key)")?;
+
+    let client = reqwest::Client::builder()
+        .timeout(std::time::Duration::from_secs(600))
+        .build()
+        .map_err(|e| format!("http client error: {e}"))?;
+
+    let mut body = serde_json::json!({ "query": query });
+    if cli.deep {
+        body["deep"] = serde_json::json!(true);
+    }
+
+    eprintln!("Starting research: {query}");
+    if cli.deep {
+        eprintln!("Deep mode enabled (longer, more thorough)");
+    }
+
+    // Start job
+    let resp = client
+        .post("https://api.webclaw.io/v1/research")
+        .header("Authorization", format!("Bearer {api_key}"))
+        .json(&body)
+        .send()
+        .await
+        .map_err(|e| format!("API error: {e}"))?
+        .json::<serde_json::Value>()
+        .await
+        .map_err(|e| format!("parse error: {e}"))?;
+
+    let job_id = resp
+        .get("id")
+        .and_then(|v| v.as_str())
+        .ok_or("API did not return a job ID")?
+        .to_string();
+
+    eprintln!("Job started: {job_id}");
+
+    // Poll
+    for poll in 0..200 {
+        tokio::time::sleep(std::time::Duration::from_secs(3)).await;
+
+        let status_resp = client
+            .get(format!("https://api.webclaw.io/v1/research/{job_id}"))
+            .header("Authorization", format!("Bearer {api_key}"))
+            .send()
+            .await
+            .map_err(|e| format!("poll error: {e}"))?
+            .json::<serde_json::Value>()
+            .await
+            .map_err(|e| format!("parse error: {e}"))?;
+
+        let status = status_resp
+            .get("status")
+            .and_then(|v| v.as_str())
+            .unwrap_or("unknown");
+
+        match status {
+            "completed" => {
+                let report = status_resp
+                    .get("report")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("");
+
+                // Save full result to JSON file
+                let filename = format!("research-{}.json", research_slug(query));
+                let json = serde_json::to_string_pretty(&status_resp).unwrap_or_default();
+                // Hold on to the write result instead of bailing with `?`, so
+                // the report below still reaches stdout if saving fails.
+                let saved = std::fs::write(&filename, &json)
+                    .map_err(|e| format!("failed to write {filename}: {e}"));
+
+                let elapsed = status_resp
+                    .get("elapsed_ms")
+                    .and_then(|v| v.as_i64())
+                    .unwrap_or(0);
+                let sources = status_resp
+                    .get("sources_count")
+                    .and_then(|v| v.as_i64())
+                    .unwrap_or(0);
+                let findings = status_resp
+                    .get("findings_count")
+                    .and_then(|v| v.as_i64())
+                    .unwrap_or(0);
+
+                eprintln!(
+                    "Research complete: {sources} sources, {findings} findings, {:.1}s",
+                    elapsed as f64 / 1000.0
+                );
+                if saved.is_ok() {
+                    eprintln!("Saved to: {filename}");
+                }
+
+                // Print report to stdout
+                if !report.is_empty() {
+                    println!("{report}");
+                }
+
+                return saved;
+            }
+            "failed" => {
+                let error = status_resp
+                    .get("error")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("unknown error");
+                return Err(format!("Research failed: {error}"));
+            }
+            _ => {
+                if poll % 10 == 9 {
+                    eprintln!("Still researching... ({:.0}s)", (poll + 1) as f64 * 3.0);
+                }
+            }
+        }
+    }
+
+    Err(format!(
+        "Research timed out after ~10 minutes. Check status: GET /v1/research/{job_id}"
+    ))
+}
+
+/// Filename slug for a research query: non-alphanumerics become separators,
+/// runs collapse to one `-`, the result is lowercased and capped at 50 chars
+/// (counted as chars, since byte slicing could panic mid-codepoint).
+fn research_slug(query: &str) -> String {
+    let slug = query
+        .chars()
+        .map(|c| if c.is_alphanumeric() { c } else { ' ' })
+        .collect::<String>()
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join("-")
+        .to_lowercase();
+    slug.chars().take(50).collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::research_slug;
+
+    #[test]
+    fn research_slug_truncation_is_char_safe() {
+        // Multibyte query: byte-slicing at 50 would panic mid-codepoint.
+        let query = "日本語".repeat(40); // 120 chars, 3 bytes each
+        let slug = research_slug(&query);
+        assert_eq!(slug.chars().count(), 50);
+        // Round-trips through formatting without panicking.
+        let _ = format!("research-{slug}.json");
+    }
+
+    #[test]
+    fn research_slug_collapses_separators_and_lowercases() {
+        assert_eq!(research_slug("Rust  vs. Go: 2024?"), "rust-vs-go-2024");
+    }
+}
diff --git a/crates/webclaw-cli/src/webhook.rs b/crates/webclaw-cli/src/webhook.rs
new file mode 100644
index 0000000..e3f69ac
--- /dev/null
+++ b/crates/webclaw-cli/src/webhook.rs
@@ -0,0 +1,121 @@
+//! Webhook delivery and `--on-change` command execution.
+
+/// Run the `--on-change` command, piping `stdin_payload` to its stdin.
+///
+/// `cmd` is tokenized with `shlex` (POSIX-ish splitting with proper quoting)
+/// and the program is exec'd directly, without an intermediate shell, so
+/// metacharacters like `;`, `&&`, `|`, `$()`, and env expansion can't fire.
+/// That matters because `cmd` may come from an untrusted config file or an
+/// MCP-driven agent, where `sh -c` would mean arbitrary shell on the host.
+/// Setting `WEBCLAW_ALLOW_SHELL` to `1` or `true` opts back into `sh -c`.
+///
+/// Best-effort: parse and spawn failures are only logged to stderr, a failed
+/// stdin write is ignored, and the child's exit is not awaited.
+pub async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
+    eprintln!("[watch] Running: {cmd}");
+
+    let allow_shell = matches!(
+        std::env::var("WEBCLAW_ALLOW_SHELL").as_deref(),
+        Ok(v) if v == "1" || v.eq_ignore_ascii_case("true")
+    );
+
+    let mut command = if allow_shell {
+        eprintln!("[watch] WEBCLAW_ALLOW_SHELL=1 — executing via sh -c (unsafe)");
+        let mut shell = tokio::process::Command::new("sh");
+        shell.arg("-c").arg(cmd);
+        shell
+    } else {
+        let Some(argv) = shlex::split(cmd) else {
+            eprintln!("[watch] Failed to parse --on-change command (unbalanced quotes?)");
+            return;
+        };
+        let Some((program, args)) = argv.split_first() else {
+            eprintln!("[watch] --on-change command is empty");
+            return;
+        };
+        let mut direct = tokio::process::Command::new(program);
+        direct.args(args);
+        direct
+    };
+
+    command.stdin(std::process::Stdio::piped());
+
+    let mut child = match command.spawn() {
+        Ok(child) => child,
+        Err(e) => {
+            eprintln!("[watch] Failed to run command: {e}");
+            return;
+        }
+    };
+    if let Some(mut stdin) = child.stdin.take() {
+        use tokio::io::AsyncWriteExt;
+        let _ = stdin.write_all(stdin_payload).await;
+    }
+}
+
+/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
+/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
+///
+/// The request is sent from a `tokio::spawn`ed task, so this must be called
+/// from within a Tokio runtime.
+pub fn fire_webhook(url: &str, payload: &serde_json::Value) {
+    let url = url.to_string();
+    let is_discord = url.contains("discord.com/api/webhooks");
+    let is_slack = url.contains("hooks.slack.com");
+
+    let body = if is_discord || is_slack {
+        // Discord and Slack share the same inputs (event name, pretty-printed
+        // payload); only the JSON envelope differs.
+        let event = payload
+            .get("event")
+            .and_then(|v| v.as_str())
+            .unwrap_or("notification");
+        let details = serde_json::to_string_pretty(payload).unwrap_or_default();
+        let wrapped = if is_discord {
+            serde_json::json!({
+                "embeds": [{
+                    "title": format!("webclaw: {event}"),
+                    "description": format!("```json\n{details}\n```"),
+                    "color": 5814783
+                }]
+            })
+        } else {
+            serde_json::json!({
+                "text": format!("*webclaw: {event}*\n```{details}```")
+            })
+        };
+        wrapped.to_string()
+    } else {
+        serde_json::to_string(payload).unwrap_or_default()
+    };
+    tokio::spawn(async move {
+        // SSRF guard: a webhook URL is user-supplied and otherwise bypasses
+        // the fetch-layer protections, so resolve + reject private/internal
+        // destinations before sending the payload.
+        if let Err(e) = webclaw_fetch::url_security::validate_public_http_url(&url).await {
+            eprintln!("[webhook] refusing unsafe URL: {e}");
+            return;
+        }
+        match reqwest::Client::builder()
+            .timeout(std::time::Duration::from_secs(10))
+            .build()
+        {
+            Ok(c) => match c
+                .post(&url)
+                .header("Content-Type", "application/json")
+                .body(body)
+                .send()
+                .await
+            {
+                Ok(resp) => {
+                    // Char-safe truncation: byte-slicing `url` at 60 would
+                    // panic if that offset fell inside a multibyte character.
+                    let shown: String = url.chars().take(60).collect();
+                    eprintln!("[webhook] POST {shown} -> {}", resp.status());
+                }
+                Err(e) => eprintln!("[webhook] POST failed: {e}"),
+            },
+            Err(e) => eprintln!("[webhook] client error: {e}"),
+        }
+    });
+}
diff --git a/crates/webclaw-core/Cargo.toml b/crates/webclaw-core/Cargo.toml
index 5c2743a..dbd505c 100644
--- a/crates/webclaw-core/Cargo.toml
+++ b/crates/webclaw-core/Cargo.toml
@@ -3,12 +3,16 @@ name = "webclaw-core"
 description = "Pure HTML content extraction engine for LLMs"
 version.workspace = true
 edition.workspace = true
+rust-version.workspace = true
 license.workspace = true
 # Reddit regression fixtures are real old.reddit.com pages read at test time;
 # they're large and only needed to run the test suite from the repo, so keep
 # them out of the published crate.
 exclude = ["testdata/reddit/*.html"]
 
+[lints]
+workspace = true
+
 [features]
 default = ["quickjs"]
 quickjs = ["rquickjs"]
diff --git a/crates/webclaw-core/src/domain.rs b/crates/webclaw-core/src/domain.rs
index 1b5d6eb..eaa5d19 100644
--- a/crates/webclaw-core/src/domain.rs
+++ b/crates/webclaw-core/src/domain.rs
@@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
+#[non_exhaustive]
 pub enum DomainType {
     Article,
     Documentation,
diff --git a/crates/webclaw-core/src/error.rs b/crates/webclaw-core/src/error.rs
index d6bb9dc..4f28e55 100644
--- a/crates/webclaw-core/src/error.rs
+++ b/crates/webclaw-core/src/error.rs
@@ -3,6 +3,7 @@
 use thiserror::Error;
 
 #[derive(Debug, Error)]
+#[non_exhaustive]
 pub enum ExtractError {
     #[error("failed to parse HTML")]
     ParseError,
diff --git a/crates/webclaw-core/src/js_eval.rs b/crates/webclaw-core/src/js_eval.rs
index e1fb2de..2f78246 100644
--- a/crates/webclaw-core/src/js_eval.rs
+++ b/crates/webclaw-core/src/js_eval.rs
@@ -16,6 +16,29 @@ static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").
 static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
 const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
 
+/// Substrings whose total absence from the HTML is taken to mean the QuickJS
+/// scan would surface no data blob, so the VM run can be skipped. The scan
+/// only reports `globalThis.__*` object/array properties (the seeded
+/// `__next_f` only emits when non-empty), so the list covers the usual ways an
+/// inline script creates one: `window.`/`window[`/`self.`/`globalThis` access
+/// and the `__NEXT_DATA__`/`__NUXT__`/`application/json` payload conventions.
+/// Matching non-script HTML only makes us skip *less* often, never more.
+const JS_CANDIDATE_MARKERS: [&str; 7] = [
+    "window.",
+    "window[",
+    "self.",
+    "globalThis",
+    "__NEXT_DATA__",
+    "__NUXT__",
+    "application/json",
+];
+
+/// Returns true if the HTML plausibly contains JS-assigned data the QuickJS
+/// scan could surface. When false, callers skip the VM run.
+pub fn has_js_candidate_data(html: &str) -> bool {
+    JS_CANDIDATE_MARKERS.iter().any(|m| html.contains(m))
+}
+
 /// A blob of data extracted from JS execution.
 pub struct JsDataBlob {
     pub name: String,
@@ -24,9 +47,17 @@ pub struct JsDataBlob {
 }
 
 /// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
+///
+/// Convenience wrapper that parses `html` first. Hot callers that already hold a
+/// parsed [`Html`] should use [`extract_js_data_from_doc`] to avoid a second parse.
 pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
     let doc = Html::parse_document(html);
+    extract_js_data_from_doc(&doc)
+}
 
+/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data,
+/// reusing an already-parsed [`Html`] document instead of re-parsing the HTML.
+pub fn extract_js_data_from_doc(doc: &Html) -> Vec<JsDataBlob> {
     let scripts: Vec<String> = doc
         .select(&SCRIPT_SELECTOR)
         .filter(|el| {
diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs
index 8cdfbbb..71b6c26 100644
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@@ -1,10 +1,12 @@
+//! webclaw-core: Pure HTML content extraction engine for LLMs.
+//!
+//! Takes raw HTML + optional URL, returns structured content
+//! (metadata, markdown, plain text, links, images, code blocks).
+//! Zero network dependencies — WASM-compatible by design.
+#![forbid(unsafe_code)]
+
 pub mod brand;
 pub(crate) mod data_island;
-/// webclaw-core: Pure HTML content extraction engine for LLMs.
-///
-/// Takes raw HTML + optional URL, returns structured content
-/// (metadata, markdown, plain text, links, images, code blocks).
-/// Zero network dependencies — WASM-compatible by design.
 pub mod diff;
 pub mod domain;
 pub mod endpoints;
@@ -38,6 +40,14 @@ use url::Url;
 ///
 /// `html` — raw HTML string to parse
 /// `url`  — optional source URL, used for resolving relative links and domain detection
+///
+/// # Example
+///
+/// ```rust
+/// let html = "<html><body><article><h1>Hello</h1><p>World</p></article></body></html>";
+/// let result = webclaw_core::extract(html, Some("https://example.com")).unwrap();
+/// assert!(result.content.markdown.contains("# Hello"));
+/// ```
 pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, ExtractError> {
     extract_with_options(html, url, &ExtractionOptions::default())
 }
@@ -221,9 +231,14 @@ fn extract_with_options_inner(
     // QuickJS: execute inline <script> tags to capture JS-assigned data blobs
     // (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
     // static JSON data island extraction above with runtime-evaluated data.
+    //
+    // Fast path: the QuickJS scan can only ever surface `globalThis.__*` data,
+    // so when the HTML contains none of the candidate markers the VM is assumed
+    // to have nothing to find and is skipped entirely. We also reuse the
+    // already-parsed `doc` instead of re-parsing the HTML a second time.
     #[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]
-    {
-        let blobs = js_eval::extract_js_data(html);
+    if js_eval::has_js_candidate_data(html) {
+        let blobs = js_eval::extract_js_data_from_doc(&doc);
         if !blobs.is_empty() {
             let js_text = js_eval::extract_readable_text(&blobs);
             if !js_text.is_empty() {
diff --git a/crates/webclaw-core/src/llm/body.rs b/crates/webclaw-core/src/llm/body.rs
index 1e3c939..5db3e40 100644
--- a/crates/webclaw-core/src/llm/body.rs
+++ b/crates/webclaw-core/src/llm/body.rs
@@ -184,7 +184,7 @@ fn detect_long_line_cycle(words: &[&str]) -> Option<String> {
 
         // Try exact N-copy cycles first
         for n_copies in (2..=5).rev() {
-            if !slice.len().is_multiple_of(n_copies) {
+            if slice.len() % n_copies != 0 {
                 continue;
             }
             let cycle_len = slice.len() / n_copies;
@@ -759,7 +759,7 @@ pub(crate) fn dedup_comma_lists(input: &str) -> String {
             // First: try full cycle dedup (a,b,c,a,b,c -> a,b,c)
             if items.len() >= 6 {
                 for cycle_len in 1..=items.len() / 2 {
-                    if !items.len().is_multiple_of(cycle_len) {
+                    if items.len() % cycle_len != 0 {
                         continue;
                     }
                     let pattern = &items[..cycle_len];
diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs
index 2699166..07e7744 100644
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@@ -13,6 +13,8 @@ use crate::noise;
 use crate::types::{CodeBlock, Image, Link};
 
 static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
+static IMG_ALT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("img[alt]").unwrap());
+static A_HREF_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a[href]").unwrap());
 
 /// Maximum recursion depth for DOM traversal.
 /// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
@@ -853,7 +855,7 @@ fn collect_assets_from_noise(
     assets: &mut ConvertedAssets,
 ) {
     // Collect images with alt text
-    for img in element.select(&Selector::parse("img[alt]").unwrap()) {
+    for img in element.select(&IMG_ALT_SELECTOR) {
         let alt = img.value().attr("alt").unwrap_or("").to_string();
         let src = img
             .value()
@@ -866,7 +868,7 @@ fn collect_assets_from_noise(
     }
 
     // Collect links
-    for link in element.select(&Selector::parse("a[href]").unwrap()) {
+    for link in element.select(&A_HREF_SELECTOR) {
         let href = link
             .value()
             .attr("href")
diff --git a/crates/webclaw-core/src/types.rs b/crates/webclaw-core/src/types.rs
index ebe7a92..ab3d1f7 100644
--- a/crates/webclaw-core/src/types.rs
+++ b/crates/webclaw-core/src/types.rs
@@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
 use crate::domain::DomainType;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct ExtractionResult {
     pub metadata: Metadata,
     pub content: Content,
@@ -15,7 +16,38 @@ pub struct ExtractionResult {
     pub structured_data: Vec<serde_json::Value>,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize)]
+impl ExtractionResult {
+    /// Construct a result from metadata and content, defaulting
+    /// `domain_data` to `None` and `structured_data` to empty.
+    ///
+    /// `ExtractionResult` is `#[non_exhaustive]`, so downstream crates must
+    /// build it through this constructor instead of a struct literal.
+    pub fn new(metadata: Metadata, content: Content) -> Self {
+        Self {
+            metadata,
+            content,
+            domain_data: None,
+            structured_data: Vec::new(),
+        }
+    }
+
+    /// Replace `domain_data` with the given value; `None` clears it.
+    #[must_use]
+    pub fn with_domain_data(mut self, domain_data: Option<DomainData>) -> Self {
+        self.domain_data = domain_data;
+        self
+    }
+
+    /// Replace `structured_data` with the given JSON-LD blocks.
+    #[must_use]
+    pub fn with_structured_data(mut self, structured_data: Vec<serde_json::Value>) -> Self {
+        self.structured_data = structured_data;
+        self
+    }
+}
+
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct Metadata {
     pub title: Option<String>,
     pub description: Option<String>,
@@ -29,7 +61,73 @@ pub struct Metadata {
     pub word_count: usize,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize)]
+impl Metadata {
+    /// Set `title`. `Metadata` is `#[non_exhaustive]`, so downstream crates
+    /// build it from `Metadata::default()` plus these `with_*` setters (each
+    /// overwrites one field and returns `self`) rather than a struct literal.
+    #[must_use]
+    pub fn with_title(mut self, title: Option<String>) -> Self {
+        self.title = title;
+        self
+    }
+
+    #[must_use]
+    pub fn with_description(mut self, description: Option<String>) -> Self {
+        self.description = description;
+        self
+    }
+
+    #[must_use]
+    pub fn with_author(mut self, author: Option<String>) -> Self {
+        self.author = author;
+        self
+    }
+
+    #[must_use]
+    pub fn with_published_date(mut self, published_date: Option<String>) -> Self {
+        self.published_date = published_date;
+        self
+    }
+
+    #[must_use]
+    pub fn with_language(mut self, language: Option<String>) -> Self {
+        self.language = language;
+        self
+    }
+
+    #[must_use]
+    pub fn with_url(mut self, url: Option<String>) -> Self {
+        self.url = url;
+        self
+    }
+
+    #[must_use]
+    pub fn with_site_name(mut self, site_name: Option<String>) -> Self {
+        self.site_name = site_name;
+        self
+    }
+
+    #[must_use]
+    pub fn with_image(mut self, image: Option<String>) -> Self {
+        self.image = image;
+        self
+    }
+
+    #[must_use]
+    pub fn with_favicon(mut self, favicon: Option<String>) -> Self {
+        self.favicon = favicon;
+        self
+    }
+
+    #[must_use]
+    pub fn with_word_count(mut self, word_count: usize) -> Self {
+        self.word_count = word_count;
+        self
+    }
+}
+
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[non_exhaustive]
 pub struct Content {
     pub markdown: String,
     pub plain_text: String,
@@ -40,6 +138,47 @@ pub struct Content {
     pub raw_html: Option<String>,
 }
 
+impl Content {
+    /// Set `markdown`. `Content` is `#[non_exhaustive]`, so downstream crates
+    /// build it from `Content::default()` plus these `with_*` setters (each
+    /// overwrites one field and returns `self`) rather than a struct literal.
+    #[must_use]
+    pub fn with_markdown(mut self, markdown: String) -> Self {
+        self.markdown = markdown;
+        self
+    }
+
+    #[must_use]
+    pub fn with_plain_text(mut self, plain_text: String) -> Self {
+        self.plain_text = plain_text;
+        self
+    }
+
+    #[must_use]
+    pub fn with_links(mut self, links: Vec<Link>) -> Self {
+        self.links = links;
+        self
+    }
+
+    #[must_use]
+    pub fn with_images(mut self, images: Vec<Image>) -> Self {
+        self.images = images;
+        self
+    }
+
+    #[must_use]
+    pub fn with_code_blocks(mut self, code_blocks: Vec<CodeBlock>) -> Self {
+        self.code_blocks = code_blocks;
+        self
+    }
+
+    #[must_use]
+    pub fn with_raw_html(mut self, raw_html: Option<String>) -> Self {
+        self.raw_html = raw_html;
+        self
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub struct Link {
     pub text: String,
diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml
index 3bf5401..cc7ead2 100644
--- a/crates/webclaw-fetch/Cargo.toml
+++ b/crates/webclaw-fetch/Cargo.toml
@@ -3,8 +3,12 @@ name = "webclaw-fetch"
 description = "HTTP client with browser TLS fingerprint impersonation via wreq"
 version.workspace = true
 edition.workspace = true
+rust-version.workspace = true
 license.workspace = true
 
+[lints]
+workspace = true
+
 [dependencies]
 webclaw-core = { workspace = true }
 webclaw-pdf = { path = "../webclaw-pdf" }
diff --git a/crates/webclaw-fetch/src/browser.rs b/crates/webclaw-fetch/src/browser.rs
index 05f2c54..baf077b 100644
--- a/crates/webclaw-fetch/src/browser.rs
+++ b/crates/webclaw-fetch/src/browser.rs
@@ -3,13 +3,13 @@
 
 /// Which browser identity to present at the TLS/HTTP layer.
 #[derive(Debug, Clone, Default)]
+#[non_exhaustive]
 pub enum BrowserProfile {
     #[default]
     Chrome,
     Firefox,
-    /// Safari iOS 26 (iPhone). The one profile proven to defeat
-    /// DataDome's immobiliare.it / idealista.it / target.com-class
-    /// rules when paired with a country-scoped residential proxy.
+    /// iOS Safari fingerprint. Useful for sites with stricter TLS
+    /// requirements that expect a mobile Safari client.
     SafariIos,
     /// Randomly pick from all available profiles on each request.
     Random,
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index 2bfd8c5..035c8c5 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -168,6 +168,13 @@ impl Response {
     fn into_text(self) -> String {
         String::from_utf8_lossy(&self.body).into_owned()
     }
+
+    /// Consume the response and return its owned body. The PDF path uses this to
+    /// move the bytes into `spawn_blocking` cheaply (`Bytes` is refcounted).
+    fn into_body(self) -> bytes::Bytes {
+        let Self { body, .. } = self;
+        body
+    }
 }
 
 /// Internal representation of the client pool strategy.
@@ -330,6 +337,18 @@ impl FetchClient {
     /// rescue logic; use [`Self::fetch_smart`] for that.
     #[instrument(skip(self), fields(url = %url))]
     pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
+        self.with_retry(url, || self.fetch_once(url)).await
+    }
+
+    /// Shared retry loop for the public `fetch` / `fetch_with_headers`
+    /// entry points. Runs `attempt` with exponential backoff (0s, 1s —
+    /// 2 attempts total), retrying on transient network errors and
+    /// retryable HTTP statuses (5xx, 429). `url` is for logging only.
+    async fn with_retry<F, Fut>(&self, url: &str, attempt_fn: F) -> Result<FetchResult, FetchError>
+    where
+        F: Fn() -> Fut,
+        Fut: std::future::Future<Output = Result<FetchResult, FetchError>>,
+    {
         let delays = [Duration::ZERO, Duration::from_secs(1)];
         let mut last_err = None;
 
@@ -338,7 +357,7 @@ impl FetchClient {
                 tokio::time::sleep(*delay).await;
             }
 
-            match self.fetch_once(url).await {
+            match attempt_fn().await {
                 Ok(result) => {
                     if is_retryable_status(result.status) && attempt < delays.len() - 1 {
                         warn!(
@@ -414,46 +433,8 @@ impl FetchClient {
         url: &str,
         extra: &[(&str, &str)],
     ) -> Result<FetchResult, FetchError> {
-        let delays = [Duration::ZERO, Duration::from_secs(1)];
-        let mut last_err = None;
-
-        for (attempt, delay) in delays.iter().enumerate() {
-            if attempt > 0 {
-                tokio::time::sleep(*delay).await;
-            }
-            match self.fetch_once_with_headers(url, extra).await {
-                Ok(result) => {
-                    if is_retryable_status(result.status) && attempt < delays.len() - 1 {
-                        warn!(
-                            url,
-                            status = result.status,
-                            attempt = attempt + 1,
-                            "retryable status, will retry"
-                        );
-                        last_err = Some(FetchError::Build(format!("HTTP {}", result.status)));
-                        continue;
-                    }
-                    if attempt > 0 {
-                        debug!(url, attempt = attempt + 1, "retry succeeded");
-                    }
-                    return Ok(result);
-                }
-                Err(e) => {
-                    if !is_retryable_error(&e) || attempt == delays.len() - 1 {
-                        return Err(e);
-                    }
-                    warn!(
-                        url,
-                        error = %e,
-                        attempt = attempt + 1,
-                        "transient error, will retry"
-                    );
-                    last_err = Some(e);
-                }
-            }
-        }
-
-        Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
+        self.with_retry(url, || self.fetch_once_with_headers(url, extra))
+            .await
     }
 
     /// Fetch a URL then extract structured content.
@@ -514,17 +495,24 @@ impl FetchClient {
         if is_pdf {
             debug!(status, "detected PDF response, using pdf extraction");
 
-            let bytes = response.body();
+            let bytes = response.into_body();
+            let byte_len = bytes.len();
 
             let elapsed = start.elapsed();
             debug!(
                 status,
-                bytes = bytes.len(),
+                bytes = byte_len,
                 elapsed_ms = %elapsed.as_millis(),
                 "PDF fetch complete"
             );
 
-            let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
+            // pdf-extract is synchronous and CPU-bound; run it off the async
+            // executor so a large PDF doesn't stall the reactor thread.
+            let pdf_mode = self.pdf_mode.clone();
+            let pdf_result =
+                tokio::task::spawn_blocking(move || webclaw_pdf::extract_pdf(&bytes, pdf_mode))
+                    .await
+                    .map_err(|e| FetchError::Build(format!("pdf extraction task failed: {e}")))??;
             Ok(pdf_to_extraction_result(&pdf_result, &final_url))
         } else if let Some(doc_type) =
             crate::document::is_document_content_type(&headers, &final_url)
@@ -814,30 +802,16 @@ fn pdf_to_extraction_result(
     let markdown = webclaw_pdf::to_markdown(pdf);
     let word_count = markdown.split_whitespace().count();
 
-    webclaw_core::ExtractionResult {
-        metadata: webclaw_core::Metadata {
-            title: pdf.metadata.title.clone(),
-            description: pdf.metadata.subject.clone(),
-            author: pdf.metadata.author.clone(),
-            published_date: None,
-            language: None,
-            url: Some(url.to_string()),
-            site_name: None,
-            image: None,
-            favicon: None,
-            word_count,
-        },
-        content: webclaw_core::Content {
-            markdown,
-            plain_text: pdf.text.clone(),
-            links: Vec::new(),
-            images: Vec::new(),
-            code_blocks: Vec::new(),
-            raw_html: None,
-        },
-        domain_data: None,
-        structured_data: vec![],
-    }
+    let metadata = webclaw_core::Metadata::default()
+        .with_title(pdf.metadata.title.clone())
+        .with_description(pdf.metadata.subject.clone())
+        .with_author(pdf.metadata.author.clone())
+        .with_url(Some(url.to_string()))
+        .with_word_count(word_count);
+    let content = webclaw_core::Content::default()
+        .with_markdown(markdown)
+        .with_plain_text(pdf.text.clone());
+    webclaw_core::ExtractionResult::new(metadata, content)
 }
 
 /// Collect spawned tasks and reorder results to match input order.
diff --git a/crates/webclaw-fetch/src/cloud.rs b/crates/webclaw-fetch/src/cloud.rs
index 7d4978e..f78468c 100644
--- a/crates/webclaw-fetch/src/cloud.rs
+++ b/crates/webclaw-fetch/src/cloud.rs
@@ -93,6 +93,7 @@ const KEYS_URL: &str = "https://webclaw.io/dashboard/api-keys";
 /// Display messages end with an actionable URL so API consumers can
 /// surface them to users verbatim.
 #[derive(Debug, Error)]
+#[non_exhaustive]
 pub enum CloudError {
     /// No `WEBCLAW_API_KEY` configured. Returned by [`smart_fetch_html`]
     /// and friends when they hit bot protection but have no client to
diff --git a/crates/webclaw-fetch/src/document.rs b/crates/webclaw-fetch/src/document.rs
index 3d7d89d..891f84b 100644
--- a/crates/webclaw-fetch/src/document.rs
+++ b/crates/webclaw-fetch/src/document.rs
@@ -98,30 +98,11 @@ pub fn extract_document(
     let plain_text = strip_markdown_formatting(&markdown);
     let word_count = plain_text.split_whitespace().count();
 
-    Ok(webclaw_core::ExtractionResult {
-        metadata: webclaw_core::Metadata {
-            title: None,
-            description: None,
-            author: None,
-            published_date: None,
-            language: None,
-            url: None,
-            site_name: None,
-            image: None,
-            favicon: None,
-            word_count,
-        },
-        content: webclaw_core::Content {
-            markdown,
-            plain_text,
-            links: Vec::new(),
-            images: Vec::new(),
-            code_blocks: Vec::new(),
-            raw_html: None,
-        },
-        domain_data: None,
-        structured_data: vec![],
-    })
+    let metadata = webclaw_core::Metadata::default().with_word_count(word_count);
+    let content = webclaw_core::Content::default()
+        .with_markdown(markdown)
+        .with_plain_text(plain_text);
+    Ok(webclaw_core::ExtractionResult::new(metadata, content))
 }
 
 /// Extract text from a DOCX file (ZIP of XML).
diff --git a/crates/webclaw-fetch/src/error.rs b/crates/webclaw-fetch/src/error.rs
index 37c011d..49b8520 100644
--- a/crates/webclaw-fetch/src/error.rs
+++ b/crates/webclaw-fetch/src/error.rs
@@ -3,6 +3,7 @@
 use thiserror::Error;
 
 #[derive(Debug, Error)]
+#[non_exhaustive]
 pub enum FetchError {
     #[error("request failed: {0}")]
     Request(#[from] wreq::Error),
diff --git a/crates/webclaw-fetch/src/extractors/amazon_product.rs b/crates/webclaw-fetch/src/extractors/amazon_product.rs
index e374b75..0fc6cd7 100644
--- a/crates/webclaw-fetch/src/extractors/amazon_product.rs
+++ b/crates/webclaw-fetch/src/extractors/amazon_product.rs
@@ -33,6 +33,7 @@ use serde_json::{Value, json};
 use url::Url;
 
 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@@ -115,23 +116,25 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
 /// without carrying webclaw_fetch types.
 pub fn parse(html: &str, url: &str, asin: &str) -> Value {
     let jsonld = find_product_jsonld(html);
+    // Single scan for the og:* fallbacks read below.
+    let og_meta = parse_og(html);
     // Three-tier title: JSON-LD `name` > Amazon's `#productTitle` span
     // (only present on real static HTML) > cloud-synthesized og:title.
     let title = jsonld
         .as_ref()
         .and_then(|v| get_text(v, "name"))
         .or_else(|| dom_title(html))
-        .or_else(|| og(html, "title"));
+        .or_else(|| og_meta.unescaped("title"));
     let image = jsonld
         .as_ref()
         .and_then(get_first_image)
         .or_else(|| dom_image(html))
-        .or_else(|| og(html, "image"));
+        .or_else(|| og_meta.unescaped("image"));
     let brand = jsonld.as_ref().and_then(get_brand);
     let description = jsonld
         .as_ref()
         .and_then(|v| get_text(v, "description"))
-        .or_else(|| og(html, "description"));
+        .or_else(|| og_meta.unescaped("description"));
     let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating);
     let offer = jsonld.as_ref().and_then(first_offer);
 
@@ -336,31 +339,6 @@ fn dom_image(html: &str) -> Option<String> {
         .map(|m| m.as_str().to_string())
 }
 
-/// OG meta tag lookup. Cloud-synthesized HTML ships these even when
-/// JSON-LD and Amazon-DOM-IDs are both absent, so they're the last
-/// line of defence for `title`, `image`, `description`.
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| html_unescape(m.as_str()));
-        }
-    }
-    None
-}
-
-/// Undo the synthesize_html attribute escaping for the few entities it
-/// emits. Keeps us off a heavier HTML-entity dep.
-fn html_unescape(s: &str) -> String {
-    s.replace("&quot;", "\"")
-        .replace("&amp;", "&")
-        .replace("&lt;", "<")
-        .replace("&gt;", ">")
-}
-
 fn cloud_to_fetch_err(e: CloudError) -> FetchError {
     FetchError::Build(e.to_string())
 }
@@ -477,7 +455,7 @@ mod tests {
     fn og_unescape_handles_quot_entity() {
         let html = r#"<meta property="og:title" content="Apple &quot;M2 Pro&quot; Laptop">"#;
         assert_eq!(
-            og(html, "title").as_deref(),
+            parse_og(html).unescaped("title").as_deref(),
             Some(r#"Apple "M2 Pro" Laptop"#)
         );
     }
diff --git a/crates/webclaw-fetch/src/extractors/ebay_listing.rs b/crates/webclaw-fetch/src/extractors/ebay_listing.rs
index 36f18e9..52a2cc3 100644
--- a/crates/webclaw-fetch/src/extractors/ebay_listing.rs
+++ b/crates/webclaw-fetch/src/extractors/ebay_listing.rs
@@ -15,6 +15,7 @@ use serde_json::{Value, json};
 use url::Url;
 
 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@@ -65,19 +66,21 @@ pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchErro
 
 pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
     let jsonld = find_product_jsonld(html);
+    // Single scan for the three og:* fields read as fallbacks below.
+    let og_meta = parse_og(html);
     let title = jsonld
         .as_ref()
         .and_then(|v| get_text(v, "name"))
-        .or_else(|| og(html, "title"));
+        .or_else(|| og_meta.raw("title"));
     let image = jsonld
         .as_ref()
         .and_then(get_first_image)
-        .or_else(|| og(html, "image"));
+        .or_else(|| og_meta.raw("image"));
     let brand = jsonld.as_ref().and_then(get_brand);
     let description = jsonld
         .as_ref()
         .and_then(|v| get_text(v, "description"))
-        .or_else(|| og(html, "description"));
+        .or_else(|| og_meta.raw("description"));
     let offer = jsonld.as_ref().and_then(first_offer);
 
     // eBay's AggregateOffer uses lowPrice/highPrice. Offer uses price.
@@ -268,19 +271,6 @@ fn get_aggregate_rating(v: &Value) -> Option<Value> {
     }))
 }
 
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 fn cloud_to_fetch_err(e: CloudError) -> FetchError {
     FetchError::Build(e.to_string())
 }
diff --git a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs
index 019fb68..7edfd41 100644
--- a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs
+++ b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs
@@ -42,6 +42,7 @@ use regex::Regex;
 use serde_json::{Value, json};
 
 use super::ExtractorInfo;
+use super::og::{og, parse_og};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
 
@@ -142,15 +143,17 @@ fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value {
 /// Build a minimal payload from OG / product meta tags. Used when a
 /// page has no Product JSON-LD at all.
 fn build_og_payload(html: &str, url: &str) -> Value {
+    // Single scan for the three og:* fields this fallback reads.
+    let og_meta = parse_og(html);
     let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default();
-    let image = og(html, "image");
+    let image = og_meta.raw("image");
     let images: Vec<Value> = image.map(|i| vec![Value::String(i)]).unwrap_or_default();
 
     json!({
         "url":                url,
         "data_source":        "og_fallback",
-        "name":               og(html, "title"),
-        "description":        og(html, "description"),
+        "name":               og_meta.raw("title"),
+        "description":        og_meta.raw("description"),
         "brand":              meta_property(html, "product:brand"),
         "sku":                None::<String>,
         "mpn":                None::<String>,
@@ -368,20 +371,6 @@ fn build_og_offer(html: &str) -> Option<Value> {
     }))
 }
 
-/// Pull the value of `<meta property="og:{prop}" content="...">`.
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 /// Pull the value of any `<meta property="..." content="...">` tag.
 /// Needed for namespaced OG variants like `product:price:amount` that
 /// the simple `og:*` matcher above doesn't cover.
diff --git a/crates/webclaw-fetch/src/extractors/etsy_listing.rs b/crates/webclaw-fetch/src/extractors/etsy_listing.rs
index ea9ed0b..b9bab07 100644
--- a/crates/webclaw-fetch/src/extractors/etsy_listing.rs
+++ b/crates/webclaw-fetch/src/extractors/etsy_listing.rs
@@ -26,6 +26,7 @@ use regex::Regex;
 use serde_json::{Value, json};
 
 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@@ -74,19 +75,26 @@ pub fn parse(html: &str, url: &str, listing_id: &str) -> Value {
     let jsonld = find_product_jsonld(html);
     let slug_title = humanise_slug(parse_slug(url).as_deref());
 
+    // Single scan for the three og:* fields used as fallbacks below.
+    let og_meta = parse_og(html);
+
     let title = jsonld
         .as_ref()
         .and_then(|v| get_text(v, "name"))
-        .or_else(|| og(html, "title").filter(|t| !is_generic_title(t)))
+        .or_else(|| og_meta.raw("title").filter(|t| !is_generic_title(t)))
         .or(slug_title);
     let description = jsonld
         .as_ref()
         .and_then(|v| get_text(v, "description"))
-        .or_else(|| og(html, "description").filter(|d| !is_generic_description(d)));
+        .or_else(|| {
+            og_meta
+                .raw("description")
+                .filter(|d| !is_generic_description(d))
+        });
     let image = jsonld
         .as_ref()
         .and_then(get_first_image)
-        .or_else(|| og(html, "image"));
+        .or_else(|| og_meta.raw("image"));
     let brand = jsonld.as_ref().and_then(get_brand);
 
     // Etsy listings often ship either a single Offer or an
@@ -359,19 +367,6 @@ fn strip_schema_prefix(s: String) -> String {
         .replace("https://schema.org/", "")
 }
 
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 /// Etsy links the owning shop with a canonical anchor like
 /// `<a href="/shop/ShopName" ...>`. Grab the first one after the
 /// breadcrumb boundary.
diff --git a/crates/webclaw-fetch/src/extractors/mod.rs b/crates/webclaw-fetch/src/extractors/mod.rs
index 91ef8d0..4e565cd 100644
--- a/crates/webclaw-fetch/src/extractors/mod.rs
+++ b/crates/webclaw-fetch/src/extractors/mod.rs
@@ -33,6 +33,7 @@ pub mod instagram_post;
 pub mod instagram_profile;
 pub mod linkedin_post;
 pub mod npm;
+pub(crate) mod og;
 pub mod pypi;
 pub mod reddit;
 pub mod shopify_collection;
diff --git a/crates/webclaw-fetch/src/extractors/og.rs b/crates/webclaw-fetch/src/extractors/og.rs
new file mode 100644
index 0000000..c9e58ed
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/og.rs
@@ -0,0 +1,79 @@
+//! Shared Open Graph (`og:*`) meta-tag parsing for the HTML vertical
+//! extractors.
+//!
+//! Several site extractors read a handful of `og:*` properties (title,
+//! description, image, ...) from the page `<head>`. Each used to carry a
+//! verbatim copy of the same regex + scan helper. This module centralises
+//! that logic and adds [`parse_og`], which collects every `og:*` pair in a
+//! single `captures_iter` pass so an extractor that needs multiple fields
+//! scans the document once instead of once per field.
+//!
+//! Values are stored raw. Callers that need HTML entity decoding read them
+//! through [`OgMeta::unescaped`]; [`OgMeta::raw`] returns the value verbatim,
+//! so decoding is opt-in per call site and each extractor's output is preserved.
+
+use std::collections::HashMap;
+use std::sync::OnceLock;
+
+use regex::Regex;
+
+/// Lazily compiled, shared matcher for `<meta property="og:<name>" content="<value>">`.
+/// Case-insensitive; `property` must precede `content`, both double-quoted.
+/// Group 1 holds the property suffix (after `og:`), group 2 the content value.
+fn og_regex() -> &'static Regex {
+    const PATTERN: &str = r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#;
+    static COMPILED: OnceLock<Regex> = OnceLock::new();
+    COMPILED.get_or_init(|| Regex::new(PATTERN).unwrap())
+}
+
+/// Look up a single `og:<prop>` meta tag and return its content verbatim
+/// (no entity decoding). The first tag whose suffix equals `prop` wins;
+/// the comparison is an exact, case-sensitive string match.
+///
+/// Every call rescans `html`. Extractors that read several properties
+/// should use [`parse_og`] so the document is scanned only once.
+pub(crate) fn og(html: &str, prop: &str) -> Option<String> {
+    og_regex()
+        .captures_iter(html)
+        .find(|caps| caps.get(1).is_some_and(|name| name.as_str() == prop))
+        .and_then(|caps| caps.get(2).map(|value| value.as_str().to_string()))
+}
+
+/// Collect all `og:*` meta tags in a single regex pass, keyed by the
+/// property suffix (the part after `og:`). When a property repeats, the
+/// earliest tag is kept, the same answer [`og`] gives. Values are stored
+/// without entity decoding; read via [`OgMeta::raw`] / [`OgMeta::unescaped`].
+pub(crate) fn parse_og(html: &str) -> OgMeta {
+    let mut props: HashMap<String, String> = HashMap::new();
+    for caps in og_regex().captures_iter(html) {
+        let (Some(key), Some(value)) = (caps.get(1), caps.get(2)) else {
+            continue;
+        };
+        let slot = props.entry(key.as_str().to_string());
+        slot.or_insert_with(|| value.as_str().to_string());
+    }
+    OgMeta(props)
+}
+
+/// `og:*` properties gathered by [`parse_og`], keyed by suffix (`og:` removed).
+pub(crate) struct OgMeta(HashMap<String, String>);
+
+impl OgMeta {
+    /// Owned copy of the `og:<prop>` content, byte-for-byte as captured.
+    pub(crate) fn raw(&self, prop: &str) -> Option<String> {
+        Some(self.0.get(prop)?.clone())
+    }
+
+    /// Owned copy of the `og:<prop>` content passed through [`html_unescape`].
+    pub(crate) fn unescaped(&self, prop: &str) -> Option<String> {
+        Some(html_unescape(self.0.get(prop)?))
+    }
+}
+
+/// Decode `&quot;`, `&lt;`, `&gt;`, then `&amp;` last so `&amp;lt;` stays `&lt;`.
+pub(crate) fn html_unescape(s: &str) -> String {
+    s.replace("&quot;", "\"")
+        .replace("&lt;", "<")
+        .replace("&gt;", ">")
+        .replace("&amp;", "&")
+}
diff --git a/crates/webclaw-fetch/src/extractors/substack_post.rs b/crates/webclaw-fetch/src/extractors/substack_post.rs
index c5b5019..1775393 100644
--- a/crates/webclaw-fetch/src/extractors/substack_post.rs
+++ b/crates/webclaw-fetch/src/extractors/substack_post.rs
@@ -28,6 +28,7 @@ use serde::Deserialize;
 use serde_json::{Value, json};
 
 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@@ -181,24 +182,27 @@ async fn html_fallback(
 pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
     let article = find_article_jsonld(html);
 
+    // Single scan for the four og:* fields read as fallbacks below.
+    let og_meta = parse_og(html);
+
     let title = article
         .as_ref()
         .and_then(|v| get_text(v, "headline"))
-        .or_else(|| og(html, "title"));
+        .or_else(|| og_meta.raw("title"));
     let description = article
         .as_ref()
         .and_then(|v| get_text(v, "description"))
-        .or_else(|| og(html, "description"));
+        .or_else(|| og_meta.raw("description"));
     let cover_image = article
         .as_ref()
         .and_then(get_first_image)
-        .or_else(|| og(html, "image"));
+        .or_else(|| og_meta.raw("image"));
     let post_date = article
         .as_ref()
         .and_then(|v| get_text(v, "datePublished"))
         .or_else(|| meta_property(html, "article:published_time"));
     let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
-    let publication_name = og(html, "site_name");
+    let publication_name = og_meta.raw("site_name");
     let authors = article.as_ref().map(extract_authors).unwrap_or_default();
 
     json!({
@@ -302,19 +306,6 @@ fn handle_from_author_url(u: &str) -> Option<String> {
 // HTML tag helpers
 // ---------------------------------------------------------------------------
 
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 /// Pull `<meta property="article:published_time" content="...">` and
 /// similar structured meta tags.
 fn meta_property(html: &str, prop: &str) -> Option<String> {
diff --git a/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs b/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs
index 8b77a29..39c9d6a 100644
--- a/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs
+++ b/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs
@@ -32,6 +32,7 @@ use regex::Regex;
 use serde_json::{Value, json};
 
 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::cloud::{self, CloudError};
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
@@ -87,11 +88,17 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
     // The aiSummary block: not typed (no `@type`), detect by key.
     let ai_block = find_ai_summary_block(&blocks);
 
+    // Single scan of the page's og:* meta tags; title + description feed
+    // the regex fallbacks below.
+    let og_meta = parse_og(html);
+    let og_title = og_meta.unescaped("title");
+    let og_description = og_meta.unescaped("description");
+
     // Business name: Dataset > metadata.title regex > URL domain.
     let business_name = dataset
         .as_ref()
         .and_then(|d| get_string(d, "name"))
-        .or_else(|| parse_name_from_og_title(html))
+        .or_else(|| parse_name_from_og_title(og_title.as_deref()))
         .or_else(|| Some(domain.clone()));
 
     // Rating distribution from the csvw:Table columns. Each column has
@@ -105,8 +112,8 @@ pub fn parse(html: &str, url: &str) -> Result<Value, FetchError> {
 
     // Page-title / page-description fallbacks. OG title format:
     // "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
-    let (rating_label, rating_from_og) = parse_rating_from_og_title(html);
-    let total_from_desc = parse_review_count_from_og_description(html);
+    let (rating_label, rating_from_og) = parse_rating_from_og_title(og_title.as_deref());
+    let total_from_desc = parse_review_count_from_og_description(og_description.as_deref());
 
     // Recent reviews carried by the aiSummary block.
     let recent_reviews: Vec<Value> = ai_block
@@ -336,20 +343,21 @@ fn compute_rating_stats(distribution: &Value) -> (Option<String>, Option<i64>) {
 
 /// Regex out the business name from the standard Trustpilot OG title
 /// shape: `"{name} is rated \"{label}\" with {rating} / 5 on Trustpilot"`.
-fn parse_name_from_og_title(html: &str) -> Option<String> {
-    let title = og(html, "title")?;
+/// `title` is the (entity-decoded) `og:title` content.
+fn parse_name_from_og_title(title: Option<&str>) -> Option<String> {
+    let title = title?;
     // "Anthropic is rated \"Bad\" with 1.5 / 5 on Trustpilot"
     static RE: OnceLock<Regex> = OnceLock::new();
     let re = RE.get_or_init(|| Regex::new(r"^(.+?)\s+is rated\b").unwrap());
-    re.captures(&title)
+    re.captures(title)
         .and_then(|c| c.get(1))
         .map(|m| m.as_str().to_string())
 }
 
 /// Pull the rating label (e.g. "Bad", "Excellent") and numeric value
-/// from the OG title.
-fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
-    let Some(title) = og(html, "title") else {
+/// from the (entity-decoded) `og:title` content.
+fn parse_rating_from_og_title(title: Option<&str>) -> (Option<String>, Option<String>) {
+    let Some(title) = title else {
         return (None, None);
     };
     static RE: OnceLock<Regex> = OnceLock::new();
@@ -357,7 +365,7 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
     let re = RE.get_or_init(|| {
         Regex::new(r#"is rated\s*[\\"]+([^"\\]+)[\\"]+\s*with\s*([\d.]+)\s*/\s*5"#).unwrap()
     });
-    let Some(caps) = re.captures(&title) else {
+    let Some(caps) = re.captures(title) else {
         return (None, None);
     };
     (
@@ -366,13 +374,13 @@ fn parse_rating_from_og_title(html: &str) -> (Option<String>, Option<String>) {
     )
 }
 
-/// Parse "hear what 226 customers have already said" from the OG
-/// description tag.
-fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
-    let desc = og(html, "description")?;
+/// Parse "hear what 226 customers have already said" from the
+/// (entity-decoded) `og:description` content.
+fn parse_review_count_from_og_description(desc: Option<&str>) -> Option<i64> {
+    let desc = desc?;
     static RE: OnceLock<Regex> = OnceLock::new();
     let re = RE.get_or_init(|| Regex::new(r"(\d[\d,]*)\s+customers").unwrap());
-    re.captures(&desc)?
+    re.captures(desc)?
         .get(1)?
         .as_str()
         .replace(',', "")
@@ -380,29 +388,6 @@ fn parse_review_count_from_og_description(html: &str) -> Option<i64> {
         .ok()
 }
 
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            let raw = c.get(2).map(|m| m.as_str())?;
-            return Some(html_unescape(raw));
-        }
-    }
-    None
-}
-
-/// Minimal HTML entity unescaping for the three entities the
-/// synthesize_html escaper might produce. Keeps us off a heavier dep.
-fn html_unescape(s: &str) -> String {
-    s.replace("&quot;", "\"")
-        .replace("&amp;", "&")
-        .replace("&lt;", "<")
-        .replace("&gt;", ">")
-}
-
 fn get_string(v: &Value, key: &str) -> Option<String> {
     v.get(key).and_then(|x| x.as_str().map(String::from))
 }
@@ -488,8 +473,12 @@ mod tests {
     #[test]
     fn parse_og_title_extracts_name_and_rating() {
         let html = r#"<meta property="og:title" content="Anthropic is rated &quot;Bad&quot; with 1.5 / 5 on Trustpilot">"#;
-        assert_eq!(parse_name_from_og_title(html), Some("Anthropic".into()));
-        let (label, rating) = parse_rating_from_og_title(html);
+        let title = parse_og(html).unescaped("title");
+        assert_eq!(
+            parse_name_from_og_title(title.as_deref()),
+            Some("Anthropic".into())
+        );
+        let (label, rating) = parse_rating_from_og_title(title.as_deref());
         assert_eq!(label.as_deref(), Some("Bad"));
         assert_eq!(rating.as_deref(), Some("1.5"));
     }
@@ -497,7 +486,11 @@ mod tests {
     #[test]
     fn parse_review_count_from_og_description_picks_number() {
         let html = r#"<meta property="og:description" content="Do you agree? Voice your opinion today and hear what 226 customers have already said.">"#;
-        assert_eq!(parse_review_count_from_og_description(html), Some(226));
+        let desc = parse_og(html).unescaped("description");
+        assert_eq!(
+            parse_review_count_from_og_description(desc.as_deref()),
+            Some(226)
+        );
     }
 
     #[test]
diff --git a/crates/webclaw-fetch/src/extractors/youtube_video.rs b/crates/webclaw-fetch/src/extractors/youtube_video.rs
index 2551ff8..5e4fa47 100644
--- a/crates/webclaw-fetch/src/extractors/youtube_video.rs
+++ b/crates/webclaw-fetch/src/extractors/youtube_video.rs
@@ -25,6 +25,7 @@ use regex::Regex;
 use serde_json::{Value, json};
 
 use super::ExtractorInfo;
+use super::og::parse_og;
 use crate::error::FetchError;
 use crate::fetcher::Fetcher;
 
@@ -143,9 +144,11 @@ fn build_player_payload(
 // ---------------------------------------------------------------------------
 
 fn build_og_fallback(html: &str, url: &str, canonical: &str, video_id: &str) -> Value {
-    let title = og(html, "title");
-    let description = og(html, "description");
-    let thumbnail = og(html, "image");
+    // Single scan for the three og:* fields read below.
+    let og_meta = parse_og(html);
+    let title = og_meta.raw("title");
+    let description = og_meta.raw("description");
+    let thumbnail = og_meta.raw("image");
     // YouTube sets `<meta name="channel_name" ...>` on some pages but
     // OG-only pages reliably carry `og:video:tag` and the channel in
     // `<link itemprop="name">`. We keep this lean: just what's stable.
@@ -248,19 +251,6 @@ fn extract_player_response(html: &str) -> Option<Value> {
 // Meta-tag helpers (for OG fallback)
 // ---------------------------------------------------------------------------
 
-fn og(html: &str, prop: &str) -> Option<String> {
-    static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| {
-        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
-    });
-    for c in re.captures_iter(html) {
-        if c.get(1).is_some_and(|m| m.as_str() == prop) {
-            return c.get(2).map(|m| m.as_str().to_string());
-        }
-    }
-    None
-}
-
 fn meta_name(html: &str, name: &str) -> Option<String> {
     static RE: OnceLock<Regex> = OnceLock::new();
     let re = RE.get_or_init(|| {
diff --git a/crates/webclaw-fetch/src/fetcher.rs b/crates/webclaw-fetch/src/fetcher.rs
index fabcf44..f922d37 100644
--- a/crates/webclaw-fetch/src/fetcher.rs
+++ b/crates/webclaw-fetch/src/fetcher.rs
@@ -1,13 +1,14 @@
 //! Pluggable fetcher abstraction for vertical extractors.
 //!
 //! Extractors call the network through this trait instead of hard-
-//! coding [`FetchClient`]. The OSS CLI / MCP / self-hosted server all
-//! pass `&FetchClient` (wreq-backed BoringSSL). The production API
-//! server, which must not use in-process TLS fingerprinting, provides
-//! its own implementation that routes through the Go tls-sidecar.
+//! coding [`FetchClient`]. The CLI / MCP / self-hosted server all pass
+//! `&FetchClient`, which fetches in-process via wreq (BoringSSL) with
+//! browser-grade TLS fingerprinting. Deployments that need different
+//! transport behaviour can supply an alternative [`Fetcher`]
+//! implementation instead.
 //!
-//! Both paths expose the same [`FetchResult`] shape and the same
-//! optional cloud-escalation client, so extractor logic stays
+//! Every implementation exposes the same [`FetchResult`] shape and the
+//! same optional cloud-escalation client, so extractor logic stays
 //! identical across environments.
 //!
 //! ## Choosing an implementation
@@ -15,9 +16,9 @@
 //! - CLI, MCP, self-hosted `webclaw-server`: build a [`FetchClient`]
 //!   with [`FetchClient::with_cloud`] to attach cloud fallback, pass
 //!   it to extractors as `&client`.
-//! - `api.webclaw.io` production server: build a `TlsSidecarFetcher`
-//!   (in `server/src/engine/`) that delegates to `engine::tls_client`
-//!   and wraps it in `Arc<dyn Fetcher>` for handler injection.
+//! - Custom deployments: provide any type implementing [`Fetcher`],
+//!   wrapped in `Arc<dyn Fetcher>` for handler injection, to layer in
+//!   environment-specific routing on top of the same extractor logic.
 //!
 //! ## Why a trait and not a free function
 //!
diff --git a/crates/webclaw-fetch/src/linkedin.rs b/crates/webclaw-fetch/src/linkedin.rs
index 0cabd1c..213401e 100644
--- a/crates/webclaw-fetch/src/linkedin.rs
+++ b/crates/webclaw-fetch/src/linkedin.rs
@@ -196,38 +196,24 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
         "linkedin extraction done"
     );
 
-    Some(ExtractionResult {
-        metadata: Metadata {
-            title: if post_author.is_empty() {
-                None
-            } else {
-                Some(format!("{post_author}'s LinkedIn Post"))
-            },
-            description: None,
-            author: if post_author.is_empty() {
-                None
-            } else {
-                Some(post_author)
-            },
-            published_date: None,
-            language: None,
-            url: Some(url.to_string()),
-            site_name: Some("LinkedIn".into()),
-            image: None,
-            favicon: None,
-            word_count,
-        },
-        content: Content {
-            markdown,
-            plain_text: String::new(),
-            links: vec![],
-            images: vec![],
-            code_blocks: vec![],
-            raw_html: None,
-        },
-        domain_data: None,
-        structured_data: vec![],
-    })
+    let title = if post_author.is_empty() {
+        None
+    } else {
+        Some(format!("{post_author}'s LinkedIn Post"))
+    };
+    let author = if post_author.is_empty() {
+        None
+    } else {
+        Some(post_author)
+    };
+    let metadata = Metadata::default()
+        .with_title(title)
+        .with_author(author)
+        .with_url(Some(url.to_string()))
+        .with_site_name(Some("LinkedIn".into()))
+        .with_word_count(word_count);
+    let content = Content::default().with_markdown(markdown);
+    Some(ExtractionResult::new(metadata, content))
 }
 
 /// Unescape HTML entities (named + numeric decimal).
diff --git a/crates/webclaw-fetch/src/locale.rs b/crates/webclaw-fetch/src/locale.rs
index 04079ec..199d547 100644
--- a/crates/webclaw-fetch/src/locale.rs
+++ b/crates/webclaw-fetch/src/locale.rs
@@ -1,9 +1,9 @@
 //! Derive an `Accept-Language` header from a URL.
 //!
-//! DataDome-class bot detection on country-specific sites (e.g. immobiliare.it,
-//! leboncoin.fr) does a geo-vs-locale sanity check: residential IP in the
-//! target country + a browser UA but the wrong `Accept-Language` is a bot
-//! signal. Matching the site's expected locale gets us through.
+//! Some bot-detection systems on country-specific sites do a geo-vs-locale
+//! sanity check: an IP in the target country + a browser UA but the wrong
+//! `Accept-Language` is a bot signal. Matching the site's expected locale
+//! avoids that mismatch.
 //!
 //! Default for unmapped TLDs is `en-US,en;q=0.9` — the global fallback.
 
@@ -53,15 +53,15 @@ mod tests {
     #[test]
     fn tld_dispatch() {
         assert_eq!(
-            accept_language_for_url("https://www.immobiliare.it/annunci/1"),
+            accept_language_for_url("https://www.example.it/page/1"),
             Some("it-IT,it;q=0.9")
         );
         assert_eq!(
-            accept_language_for_url("https://www.leboncoin.fr/"),
+            accept_language_for_url("https://www.example.fr/"),
             Some("fr-FR,fr;q=0.9")
         );
         assert_eq!(
-            accept_language_for_url("https://www.amazon.co.uk/"),
+            accept_language_for_url("https://www.example.co.uk/"),
             Some("en-GB,en;q=0.9")
         );
         assert_eq!(
diff --git a/crates/webclaw-fetch/src/sitemap.rs b/crates/webclaw-fetch/src/sitemap.rs
index 931db32..374892d 100644
--- a/crates/webclaw-fetch/src/sitemap.rs
+++ b/crates/webclaw-fetch/src/sitemap.rs
@@ -597,7 +597,7 @@ mod tests {
 "#;
         let entries = parse_sitemap_xml(xml);
         // Should return at least the successfully parsed entry
-        assert!(entries.len() >= 1);
+        assert!(!entries.is_empty());
         assert_eq!(entries[0].url, "https://example.com/good");
     }
 
diff --git a/crates/webclaw-fetch/src/tls.rs b/crates/webclaw-fetch/src/tls.rs
index c6c2955..2a4b8d6 100644
--- a/crates/webclaw-fetch/src/tls.rs
+++ b/crates/webclaw-fetch/src/tls.rs
@@ -81,10 +81,10 @@ const SAFARI_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkc
 /// Safari curves.
 const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521";
 
-/// Safari iOS 26 TLS extension order, matching bogdanfinn's
-/// `safari_ios_26_0` wire format. GREASE slots are omitted. wreq
-/// inserts them itself. Diverges from wreq-util's default SafariIos26
-/// extension order, which DataDome's immobiliare.it ruleset flags.
+/// Safari iOS 26 TLS extension order, matching a real Safari iOS 26
+/// handshake. GREASE slots are omitted; the TLS layer inserts them
+/// itself. Diverges from the library default extension order, which
+/// some strict TLS-fingerprinting WAFs flag.
 fn safari_ios_extensions() -> Vec<ExtensionType> {
     vec![
         ExtensionType::CERTIFICATE_TIMESTAMP,
@@ -103,12 +103,10 @@ fn safari_ios_extensions() -> Vec<ExtensionType> {
     ]
 }
 
-/// Chrome 133 TLS extension order, matching bogdanfinn's stable JA3
-/// (`43067709b025da334de1279a120f8e14`). Real Chrome permutes extensions
-/// per handshake, but indeed.com's WAF allowlists this specific wire order
-/// and rejects permuted ones. GREASE slots are inserted by wreq.
-///
-/// JA3 extension field from peet.ws: 18-5-35-51-10-45-11-27-17613-43-13-0-16-65037-65281-23
+/// Chrome 133 TLS extension order, matching a real Chrome 133 handshake.
+/// Real Chrome permutes extensions per handshake, but some WAFs allowlist
+/// one specific real-browser wire order and reject permuted ones. GREASE
+/// slots are inserted by the TLS layer.
 fn chrome_extensions() -> Vec<ExtensionType> {
     vec![
         ExtensionType::CERTIFICATE_TIMESTAMP,                  // 18
@@ -220,9 +218,8 @@ const SAFARI_HEADERS: &[(&str, &str)] = &[
 /// Safari iOS 26 headers, in the wire order real Safari emits. Critically:
 /// NO `sec-fetch-*`, NO `priority: u=0, i` (both Chromium-only leaks), but
 /// `upgrade-insecure-requests: 1` is present. `accept-encoding` does not
-/// include zstd (Safari can't decode it). Verified against bogdanfinn on
-/// 2026-04-22: this header set is what DataDome's immobiliare ruleset
-/// expects for a real iPhone.
+/// include zstd (Safari can't decode it). This header set matches what a
+/// real iPhone emits.
 const SAFARI_IOS_HEADERS: &[(&str, &str)] = &[
     (
         "accept",
@@ -264,8 +261,8 @@ const EDGE_HEADERS: &[(&str, &str)] = &[
 
 fn chrome_tls() -> TlsOptions {
     // permute_extensions is off so the explicit extension_permutation sticks.
-    // Real Chrome permutes, but indeed.com's WAF allowlists bogdanfinn's
-    // fixed order, so matching that gets us through.
+    // Real Chrome permutes, but some WAFs allowlist one fixed real-browser
+    // order, so matching that order is what passes.
     TlsOptions::builder()
         .cipher_list(CHROME_CIPHERS)
         .sigalgs_list(CHROME_SIGALGS)
@@ -330,18 +327,15 @@ fn safari_tls() -> TlsOptions {
 
 /// Safari iOS 26 emulation — composed on top of `wreq_util::Emulation::SafariIos26`
 /// with four targeted overrides. We don't hand-roll this one like Chrome/Firefox
-/// because the wire-level defaults from wreq-util are already correct for ciphers,
-/// sigalgs, curves, and GREASE — the four things wreq-util gets *wrong* for
-/// DataDome compatibility are overridden here:
+/// because the wire-level library defaults are already correct for ciphers,
+/// sigalgs, curves, and GREASE — the four things the library default gets
+/// *wrong* for strict-WAF compatibility are overridden here:
 ///
-///  1. TLS extension order: match bogdanfinn `safari_ios_26_0` exactly (JA3
-///     ends up `8d909525bd5bbb79f133d11cc05159fe`).
+///  1. TLS extension order: match a real Safari iOS 26 handshake exactly.
 ///  2. HTTP/2 HEADERS priority flag: weight=256, exclusive=1, depends_on=0.
-///     wreq-util omits this frame; real Safari and bogdanfinn include it.
-///     This flip is the thing DataDome actually reads — the akamai_fingerprint
-///     hash changes from `c52879e43202aeb92740be6e8c86ea96` to
-///     `d1294410a06522e37a5c5e3f0a45a705`, which is the winning signature.
-///  3. Headers: strip wreq-util's Chromium defaults (`sec-fetch-*`,
+///     The library default omits this frame; real Safari includes it. It is
+///     part of the HTTP/2 fingerprint that strict WAFs inspect.
+///  3. Headers: strip the library's Chromium defaults (`sec-fetch-*`,
 ///     `priority: u=0, i`, zstd), replace with the real iOS 26 set.
 ///  4. `accept-language` preserved from config.extra_headers for locale.
 fn safari_ios_emulation() -> wreq::Emulation {
@@ -354,7 +348,7 @@ fn safari_ios_emulation() -> wreq::Emulation {
 
     // Only override the priority flag — keep wreq-util's SETTINGS, WINDOW_UPDATE,
     // and pseudo-order intact. Replacing the whole Http2Options resets SETTINGS
-    // to defaults, which sends only INITIAL_WINDOW_SIZE and fails DataDome.
+    // to defaults, which sends only INITIAL_WINDOW_SIZE and fails strict WAFs.
     if let Some(h2) = em.http2_options_mut().as_mut() {
         h2.headers_stream_dependency = Some(StreamDependency::new(StreamId::zero(), 255, true));
     }
@@ -374,11 +368,11 @@ fn safari_ios_emulation() -> wreq::Emulation {
 }
 
 fn chrome_h2() -> Http2Options {
-    // SETTINGS frame matches bogdanfinn `chrome_133`: HEADER_TABLE_SIZE,
+    // SETTINGS frame matches real Chrome 133: HEADER_TABLE_SIZE,
     // ENABLE_PUSH=0, INITIAL_WINDOW_SIZE, MAX_HEADER_LIST_SIZE. No
-    // MAX_CONCURRENT_STREAMS — real Chrome 133 and bogdanfinn both omit it,
-    // and indeed.com's WAF reads this as a bot signal when present. Priority
-    // weight 256 (encoded as 255 + 1) matches bogdanfinn's HEADERS frame.
+    // MAX_CONCURRENT_STREAMS — real Chrome 133 omits it, and some WAFs
+    // read its presence as a bot signal. Priority weight 256 (encoded as
+    // 255 + 1) matches a real Chrome HEADERS frame.
     Http2Options::builder()
         .initial_window_size(6_291_456)
         .initial_connection_window_size(15_728_640)
@@ -530,7 +524,22 @@ pub fn build_client(
             max_redirects as usize,
         ))
         .cookie_store(true)
-        .timeout(timeout);
+        .timeout(timeout)
+        // Fail fast on a black-holed host: a stuck connect aborts in ~5s
+        // instead of consuming the full request `timeout`. The total
+        // timeout above still bounds the overall request.
+        .connect_timeout(Duration::from_secs(5))
+        // Keep warm connections around for reuse (HTTP/2 multiplexing,
+        // cookie-warmup retries) but bound idle sockets so a long-lived
+        // rotating-proxy pool doesn't accumulate dead connections.
+        .pool_idle_timeout(Duration::from_secs(90))
+        .pool_max_idle_per_host(8)
+        // SO_KEEPALIVE so half-open connections through a proxy get torn
+        // down rather than hanging until the request timeout fires.
+        .tcp_keepalive(Duration::from_secs(60));
+    // Note: HTTP/2 keep-alive (PING) interval/timeout are part of the
+    // emulated fingerprint via `Http2Options` and are intentionally not
+    // overridden here — changing them would alter the browser fingerprint.
 
     if let Some(proxy_url) = proxy {
         let proxy = wreq::Proxy::all(proxy_url).map_err(|_| {
diff --git a/crates/webclaw-fetch/src/url_security.rs b/crates/webclaw-fetch/src/url_security.rs
index 328879e..bf8f24c 100644
--- a/crates/webclaw-fetch/src/url_security.rs
+++ b/crates/webclaw-fetch/src/url_security.rs
@@ -193,7 +193,7 @@ mod tests {
                 .await
                 .is_ok()
         );
-        assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
+        assert!(!is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))));
     }
 
     #[tokio::test]
diff --git a/crates/webclaw-fetch/tests/bench_1k.rs b/crates/webclaw-fetch/tests/bench_1k.rs
index ffbbf0a..9858ef1 100644
--- a/crates/webclaw-fetch/tests/bench_1k.rs
+++ b/crates/webclaw-fetch/tests/bench_1k.rs
@@ -71,7 +71,7 @@ fn classify(body: &str, len: usize, status: u16, kw: &[String]) -> &'static str
         "CHALLENGE"
     } else if status == 403 || status == 429 {
         "BLOCKED"
-    } else if status >= 300 && status < 400 {
+    } else if (300..400).contains(&status) {
         "REDIRECT"
     } else if len < 1000 {
         "EMPTY"
diff --git a/crates/webclaw-llm/Cargo.toml b/crates/webclaw-llm/Cargo.toml
index 4796257..dd3ee80 100644
--- a/crates/webclaw-llm/Cargo.toml
+++ b/crates/webclaw-llm/Cargo.toml
@@ -3,8 +3,12 @@ name = "webclaw-llm"
 description = "LLM integration for webclaw — local-first hybrid architecture (Ollama -> OpenAI -> Anthropic)"
 version.workspace = true
 edition.workspace = true
+rust-version.workspace = true
 license.workspace = true
 
+[lints]
+workspace = true
+
 [dependencies]
 reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
 async-trait = "0.1"
diff --git a/crates/webclaw-llm/src/error.rs b/crates/webclaw-llm/src/error.rs
index 19f75f3..42d317f 100644
--- a/crates/webclaw-llm/src/error.rs
+++ b/crates/webclaw-llm/src/error.rs
@@ -1,5 +1,6 @@
 /// LLM-specific errors. Kept flat — one enum covers transport, provider, and parsing failures.
 #[derive(Debug, thiserror::Error)]
+#[non_exhaustive]
 pub enum LlmError {
     #[error("HTTP error: {0}")]
     Http(#[from] reqwest::Error),
@@ -16,3 +17,53 @@ pub enum LlmError {
     #[error("provider error: {0}")]
     ProviderError(String),
 }
+
+/// Clip `text` (typically a network-sourced error body) to at most `max`
+/// bytes. The cut backs up to the closest preceding UTF-8 char boundary, so
+/// slicing can never panic inside a multibyte character. Used by every provider.
+pub(crate) fn truncate_err(text: &str, max: usize) -> &str {
+    if max >= text.len() {
+        return text;
+    }
+    // Offset 0 is always a char boundary, so the reverse scan always finds one.
+    let cut = (0..=max)
+        .rev()
+        .find(|&idx| text.is_char_boundary(idx));
+    &text[..cut.unwrap_or(0)]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::truncate_err;
+
+    #[test]
+    fn short_text_unchanged() {
+        assert_eq!(truncate_err("hello", 500), "hello");
+    }
+
+    #[test]
+    fn exact_length_unchanged() {
+        assert_eq!(truncate_err("abcde", 5), "abcde");
+    }
+
+    #[test]
+    fn truncates_ascii() {
+        assert_eq!(truncate_err("abcdef", 3), "abc");
+    }
+
+    #[test]
+    fn never_splits_multibyte() {
+        // Char boundaries are 0, 1, 3, 5, so a cut at 4 lands inside the second "é".
+        let s = "aéé"; // bytes: a(1) é(2) é(2) = 5 bytes
+        let out = truncate_err(s, 4);
+        // Must step back from 4 to the boundary at 3 (after the first "é").
+        assert!(s.is_char_boundary(out.len()));
+        assert_eq!(out, "aé");
+    }
+
+    #[test]
+    fn boundary_step_back_to_zero_is_safe() {
+        let s = "😀"; // 4 bytes, single char
+        assert_eq!(truncate_err(s, 2), "");
+    }
+}
diff --git a/crates/webclaw-llm/src/lib.rs b/crates/webclaw-llm/src/lib.rs
index 61e2ae7..507b588 100644
--- a/crates/webclaw-llm/src/lib.rs
+++ b/crates/webclaw-llm/src/lib.rs
@@ -1,8 +1,31 @@
-/// webclaw-llm: LLM integration with local-first hybrid architecture.
-///
-/// Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
-/// Provides schema-based extraction, prompt extraction, and summarization
-/// on top of webclaw-core's content pipeline.
+//! webclaw-llm: LLM integration with local-first hybrid architecture.
+//!
+//! Provider chain tries Ollama (local) first, falls back to OpenAI, then Anthropic.
+//! Provides schema-based extraction, prompt extraction, and summarization
+//! on top of webclaw-core's content pipeline.
+//!
+//! ```no_run
+//! use webclaw_llm::{ProviderChain, LlmProvider, CompletionRequest, Message};
+//!
+//! # async fn run() -> Result<(), webclaw_llm::LlmError> {
+//! // Builds Ollama -> OpenAI -> Anthropic, including only configured providers.
+//! let chain = ProviderChain::default().await;
+//!
+//! let request = CompletionRequest {
+//!     model: String::new(), // empty = each provider's default model
+//!     messages: vec![Message { role: "user".into(), content: "Hello".into() }],
+//!     temperature: None,
+//!     max_tokens: None,
+//!     json_mode: false,
+//! };
+//!
+//! let answer = chain.complete(&request).await?;
+//! println!("{answer}");
+//! # Ok(())
+//! # }
+//! ```
+#![deny(unsafe_code)]
+
 pub mod chain;
 pub mod clean;
 pub mod error;
diff --git a/crates/webclaw-llm/src/providers/anthropic.rs b/crates/webclaw-llm/src/providers/anthropic.rs
index e6e43c8..a80d174 100644
--- a/crates/webclaw-llm/src/providers/anthropic.rs
+++ b/crates/webclaw-llm/src/providers/anthropic.rs
@@ -1,14 +1,16 @@
 /// Anthropic provider — Claude models via api.anthropic.com.
 /// Anthropic's API differs from OpenAI: system message is a top-level param,
 /// not part of the messages array.
+use std::time::Duration;
+
 use async_trait::async_trait;
 use serde_json::json;
 
 use crate::clean::strip_thinking_tags;
-use crate::error::LlmError;
+use crate::error::{LlmError, truncate_err};
 use crate::provider::{CompletionRequest, LlmProvider};
 
-use super::load_api_key;
+use super::{build_http_client, load_api_key};
 
 const DEFAULT_ANTHROPIC_BASE_URL: &str = "https://api.anthropic.com/v1";
 const ANTHROPIC_VERSION: &str = "2023-06-01";
@@ -35,7 +37,7 @@ impl AnthropicProvider {
         let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
 
         Some(Self {
-            client: reqwest::Client::new(),
+            client: build_http_client(Duration::from_secs(120)),
             key,
             base_url: base_url
                 .or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
@@ -108,11 +110,7 @@ impl LlmProvider for AnthropicProvider {
         if !resp.status().is_success() {
             let status = resp.status();
             let text = resp.text().await.unwrap_or_default();
-            let safe_text = if text.len() > 500 {
-                &text[..500]
-            } else {
-                &text
-            };
+            let safe_text = truncate_err(&text, 500);
             return Err(LlmError::ProviderError(format!(
                 "anthropic returned {status}: {safe_text}"
             )));
@@ -208,12 +206,17 @@ mod tests {
         );
     }
 
-    // Env var fallback tests mutate process-global state and race with parallel tests.
-    // The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
-    //   cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
+    // ANTHROPIC_API_KEY is process-global; cargo runs tests in parallel
+    // threads. Serialize the env-mutating tests so setting the key in one
+    // can't race another asserting its absence (poison-tolerant).
+    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
     #[test]
-    #[ignore = "mutates process env; run with --test-threads=1"]
+    #[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
     fn env_var_key_fallback() {
+        let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+        // SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
+        // are `unsafe fn`s in the Rust 2024 edition.
         unsafe { std::env::set_var("ANTHROPIC_API_KEY", "sk-ant-env") };
         let provider = AnthropicProvider::new(None, None).expect("should construct from env");
         assert_eq!(provider.key, "sk-ant-env");
@@ -221,8 +224,11 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "mutates process env; run with --test-threads=1"]
+    #[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
     fn no_key_returns_none() {
+        let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+        // SAFETY: env mutation is serialized by ENV_LOCK. Clear any ambient
+        // runner value so the absence assertion is deterministic.
         unsafe { std::env::remove_var("ANTHROPIC_API_KEY") };
         assert!(AnthropicProvider::new(None, None).is_none());
     }
diff --git a/crates/webclaw-llm/src/providers/mod.rs b/crates/webclaw-llm/src/providers/mod.rs
index 1e6412b..7bbfe11 100644
--- a/crates/webclaw-llm/src/providers/mod.rs
+++ b/crates/webclaw-llm/src/providers/mod.rs
@@ -2,8 +2,26 @@ pub mod anthropic;
 pub mod ollama;
 pub mod openai;
 
+use std::time::Duration;
+
 use crate::error::LlmError;
 
+/// Connect timeout shared by every provider. A dead or wrong host should fail
+/// fast (so the chain can move to the next provider) rather than hang on the
+/// OS default connect timeout.
+pub(crate) const CONNECT_TIMEOUT: Duration = Duration::from_secs(3);
+
+/// Construct a provider HTTP client: connect phase capped at [`CONNECT_TIMEOUT`],
+/// whole request capped at `request_timeout`. A builder error falls back to
+/// `reqwest::Client::new()`. NOTE(review): that fallback has no timeouts — confirm.
+pub(crate) fn build_http_client(request_timeout: Duration) -> reqwest::Client {
+    let built = reqwest::Client::builder()
+        .connect_timeout(CONNECT_TIMEOUT)
+        .timeout(request_timeout)
+        .build();
+    built.unwrap_or_else(|_| reqwest::Client::new())
+}
+
 /// Load an API key from an explicit override or an environment variable.
 /// Returns `None` if neither is set or the value is empty.
 pub(crate) fn load_api_key(override_key: Option<String>, env_var: &str) -> Option<String> {
diff --git a/crates/webclaw-llm/src/providers/ollama.rs b/crates/webclaw-llm/src/providers/ollama.rs
index 9ee66c9..9343ea8 100644
--- a/crates/webclaw-llm/src/providers/ollama.rs
+++ b/crates/webclaw-llm/src/providers/ollama.rs
@@ -1,12 +1,16 @@
 /// Ollama provider — talks to a local Ollama instance (default localhost:11434).
 /// First choice in the provider chain: free, private, fast on Apple Silicon.
+use std::time::Duration;
+
 use async_trait::async_trait;
 use serde_json::json;
 
 use crate::clean::strip_thinking_tags;
-use crate::error::LlmError;
+use crate::error::{LlmError, truncate_err};
 use crate::provider::{CompletionRequest, LlmProvider};
 
+use super::build_http_client;
+
 pub struct OllamaProvider {
     client: reqwest::Client,
     base_url: String,
@@ -23,8 +27,11 @@ impl OllamaProvider {
             .or_else(|| std::env::var("OLLAMA_MODEL").ok())
             .unwrap_or_else(|| "qwen3:8b".into());
 
+        // Ollama runs local models that can take a while to generate; keep the
+        // overall timeout generous, but cap connect time so an unreachable host
+        // fails fast and the chain can fall through to a cloud provider.
         Self {
-            client: reqwest::Client::new(),
+            client: build_http_client(Duration::from_secs(120)),
             base_url,
             default_model,
         }
@@ -70,11 +77,7 @@ impl LlmProvider for OllamaProvider {
         if !resp.status().is_success() {
             let status = resp.status();
             let text = resp.text().await.unwrap_or_default();
-            let safe_text = if text.len() > 500 {
-                &text[..500]
-            } else {
-                &text
-            };
+            let safe_text = truncate_err(&text, 500);
             return Err(LlmError::ProviderError(format!(
                 "ollama returned {status}: {safe_text}"
             )));
@@ -140,12 +143,17 @@ mod tests {
         assert_eq!(provider.default_model(), "phi3:mini");
     }
 
-    // Env var fallback is a trivial `env::var().ok()` -- not worth the flakiness
-    // of manipulating process-global state. Run in isolation if needed:
-    //   cargo test -p webclaw-llm env_var_fallback -- --ignored --test-threads=1
+    // OLLAMA_HOST / OLLAMA_MODEL are process-global; cargo runs tests in
+    // parallel threads. Serialize the env-mutating tests so one that sets a
+    // var can't race another asserting its absence (poison-tolerant).
+    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
     #[test]
-    #[ignore = "mutates process env; run with --test-threads=1"]
+    #[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
     fn env_var_fallback() {
+        let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+        // SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
+        // are `unsafe fn`s in the Rust 2024 edition.
         unsafe {
             std::env::set_var("OLLAMA_HOST", "http://remote:11434");
             std::env::set_var("OLLAMA_MODEL", "mistral:7b");
diff --git a/crates/webclaw-llm/src/providers/openai.rs b/crates/webclaw-llm/src/providers/openai.rs
index 3780d8f..e23a921 100644
--- a/crates/webclaw-llm/src/providers/openai.rs
+++ b/crates/webclaw-llm/src/providers/openai.rs
@@ -1,12 +1,14 @@
 /// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint.
+use std::time::Duration;
+
 use async_trait::async_trait;
 use serde_json::json;
 
 use crate::clean::strip_thinking_tags;
-use crate::error::LlmError;
+use crate::error::{LlmError, truncate_err};
 use crate::provider::{CompletionRequest, LlmProvider};
 
-use super::load_api_key;
+use super::{build_http_client, load_api_key};
 
 pub struct OpenAiProvider {
     client: reqwest::Client,
@@ -69,7 +71,7 @@ impl OpenAiProvider {
         let key = load_api_key(key_override, "OPENAI_API_KEY")?;
 
         Some(Self {
-            client: reqwest::Client::new(),
+            client: build_http_client(Duration::from_secs(120)),
             key,
             base_url: base_url
                 .or_else(|| std::env::var("OPENAI_BASE_URL").ok())
@@ -132,11 +134,7 @@ impl LlmProvider for OpenAiProvider {
         if !resp.status().is_success() {
             let status = resp.status();
             let text = resp.text().await.unwrap_or_default();
-            let safe_text = if text.len() > 500 {
-                &text[..500]
-            } else {
-                &text
-            };
+            let safe_text = truncate_err(&text, 500);
             return Err(LlmError::ProviderError(format!(
                 "openai returned {status}: {safe_text}"
             )));
@@ -276,12 +274,17 @@ mod tests {
         assert_eq!(body["response_format"], json!({ "type": "text" }));
     }
 
-    // Env var fallback tests mutate process-global state and race with parallel tests.
-    // The code path is trivial (load_api_key -> env::var().ok()). Run in isolation if needed:
-    //   cargo test -p webclaw-llm env_var -- --ignored --test-threads=1
+    // OPENAI_API_KEY is process-global; cargo runs tests in parallel threads.
+    // Serialize the env-mutating tests so setting the key in one can't race
+    // another asserting its absence (poison-tolerant).
+    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
     #[test]
-    #[ignore = "mutates process env; run with --test-threads=1"]
+    #[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
     fn env_var_key_fallback() {
+        let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+        // SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
+        // are `unsafe fn`s in the Rust 2024 edition.
         unsafe { std::env::set_var("OPENAI_API_KEY", "sk-env-key") };
         let provider = OpenAiProvider::new(None, None, None).expect("should construct from env");
         assert_eq!(provider.key, "sk-env-key");
@@ -289,8 +292,11 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "mutates process env; run with --test-threads=1"]
+    #[allow(unsafe_code)] // test-only env mutation, serialized by ENV_LOCK
     fn no_key_returns_none() {
+        let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+        // SAFETY: env mutation is serialized by ENV_LOCK. Clear any ambient
+        // runner value so the absence assertion is deterministic.
         unsafe { std::env::remove_var("OPENAI_API_KEY") };
         assert!(OpenAiProvider::new(None, None, None).is_none());
     }
diff --git a/crates/webclaw-mcp/Cargo.toml b/crates/webclaw-mcp/Cargo.toml
index ec3b2b4..5e90ea6 100644
--- a/crates/webclaw-mcp/Cargo.toml
+++ b/crates/webclaw-mcp/Cargo.toml
@@ -3,8 +3,12 @@ name = "webclaw-mcp"
 description = "MCP server for webclaw web extraction toolkit"
 version.workspace = true
 edition.workspace = true
+rust-version.workspace = true
 license.workspace = true
 
+[lints]
+workspace = true
+
 [[bin]]
 name = "webclaw-mcp"
 path = "src/main.rs"
diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs
index 497315f..9a469aa 100644
--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@@ -498,30 +498,13 @@ impl WebclawMcp {
                     );
                 }
 
-                let current = webclaw_core::ExtractionResult {
-                    content: webclaw_core::Content {
-                        markdown: markdown.to_string(),
-                        plain_text: markdown.to_string(),
-                        links: Vec::new(),
-                        images: Vec::new(),
-                        code_blocks: Vec::new(),
-                        raw_html: None,
-                    },
-                    metadata: webclaw_core::Metadata {
-                        title: None,
-                        description: None,
-                        author: None,
-                        published_date: None,
-                        language: None,
-                        url: Some(params.url.clone()),
-                        site_name: None,
-                        image: None,
-                        favicon: None,
-                        word_count: markdown.split_whitespace().count(),
-                    },
-                    domain_data: None,
-                    structured_data: Vec::new(),
-                };
+                let content = webclaw_core::Content::default()
+                    .with_markdown(markdown.to_string())
+                    .with_plain_text(markdown.to_string());
+                let metadata = webclaw_core::Metadata::default()
+                    .with_url(Some(params.url.clone()))
+                    .with_word_count(markdown.split_whitespace().count());
+                let current = webclaw_core::ExtractionResult::new(metadata, content);
 
                 let content_diff = webclaw_core::diff::diff(&previous, &current);
                 Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
diff --git a/crates/webclaw-pdf/Cargo.toml b/crates/webclaw-pdf/Cargo.toml
index 880c7f8..a7e171c 100644
--- a/crates/webclaw-pdf/Cargo.toml
+++ b/crates/webclaw-pdf/Cargo.toml
@@ -3,8 +3,12 @@ name = "webclaw-pdf"
 description = "PDF text extraction for webclaw"
 version.workspace = true
 edition.workspace = true
+rust-version.workspace = true
 license.workspace = true
 
+[lints]
+workspace = true
+
 [dependencies]
 pdf-extract = "0.7"
 thiserror = { workspace = true }
diff --git a/crates/webclaw-pdf/src/lib.rs b/crates/webclaw-pdf/src/lib.rs
index 206fe98..10aed59 100644
--- a/crates/webclaw-pdf/src/lib.rs
+++ b/crates/webclaw-pdf/src/lib.rs
@@ -1,7 +1,9 @@
-/// PDF text extraction for webclaw.
-///
-/// Uses pdf-extract (backed by lopdf) to pull text from PDF bytes.
-/// No OCR -- text-based PDFs only. Scanned PDFs return EmptyPdf in Auto mode.
+//! PDF text extraction for webclaw.
+//!
+//! Uses pdf-extract (backed by lopdf) to pull text from PDF bytes.
+//! No OCR -- text-based PDFs only. Scanned PDFs return EmptyPdf in Auto mode.
+#![forbid(unsafe_code)]
+
 pub mod error;
 
 pub use error::PdfError;
@@ -64,9 +66,18 @@ pub fn extract_pdf(bytes: &[u8], mode: PdfMode) -> Result<PdfResult, PdfError> {
 
     debug!(pages = page_count, "PDF document loaded");
 
-    // Extract text via pdf-extract (higher-level API over lopdf)
-    let text = pdf_extract::extract_text_from_mem(bytes)
-        .map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
+    // Extract text via pdf-extract (higher-level API over lopdf).
+    // pdf-extract has bare `panic!`/`unreachable!` sites on malformed input,
+    // so we isolate it in catch_unwind: a caught panic becomes a normal
+    // ExtractionFailed error instead of unwinding through our callers.
+    // AssertUnwindSafe is sound here: the closure only borrows `bytes` (a
+    // read-only slice) and we discard all closure state on a caught panic.
+    let extracted = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+        pdf_extract::extract_text_from_mem(bytes)
+    }))
+    .map_err(|_| PdfError::ExtractionFailed("pdf-extract panicked on malformed input".into()))?;
+
+    let text = extracted.map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
 
     let text = normalize_text(&text);
 
diff --git a/crates/webclaw-server/Cargo.toml b/crates/webclaw-server/Cargo.toml
index 3d4c075..f65ebe8 100644
--- a/crates/webclaw-server/Cargo.toml
+++ b/crates/webclaw-server/Cargo.toml
@@ -2,10 +2,14 @@
 name = "webclaw-server"
 version.workspace = true
 edition.workspace = true
+rust-version.workspace = true
 license.workspace = true
 repository.workspace = true
 description = "Minimal REST API server for self-hosting webclaw extraction. Wraps the OSS extraction crates with HTTP endpoints. NOT the production hosted API at api.webclaw.io — this is a stateless, single-binary reference server for local + self-hosted deployments."
 
+[lints]
+workspace = true
+
 [[bin]]
 name = "webclaw-server"
 path = "src/main.rs"
@@ -18,7 +22,7 @@ webclaw-pdf    = { workspace = true }
 
 axum           = { version = "0.8", features = ["macros"] }
 tokio          = { workspace = true }
-tower-http     = { version = "0.6", features = ["trace", "cors"] }
+tower-http     = { version = "0.6", features = ["trace", "cors", "timeout"] }
 clap           = { workspace = true, features = ["derive", "env"] }
 serde          = { workspace = true }
 serde_json     = { workspace = true }
@@ -27,3 +31,9 @@ tracing-subscriber = { workspace = true, features = ["env-filter"] }
 anyhow         = "1"
 thiserror      = { workspace = true }
 subtle         = "2.6"
+
+[dev-dependencies]
+# `ServiceExt::oneshot` drives the router in-process for hermetic handler
+# tests (no TCP listener, no network).
+tower          = { version = "0.5", features = ["util"] }
+http-body-util = "0.1"
diff --git a/crates/webclaw-server/src/main.rs b/crates/webclaw-server/src/main.rs
index 06f2451..0053db5 100644
--- a/crates/webclaw-server/src/main.rs
+++ b/crates/webclaw-server/src/main.rs
@@ -26,12 +26,20 @@ use axum::{
 };
 use clap::Parser;
 use tower_http::cors::{Any, CorsLayer};
+use tower_http::timeout::TimeoutLayer;
 use tower_http::trace::TraceLayer;
 use tracing::info;
 use tracing_subscriber::{EnvFilter, fmt};
 
 use crate::state::AppState;
 
+/// Hard ceiling on how long any single request may run before the server
+/// returns `408 Request Timeout` and drops the work. Generous enough for a
+/// cold scrape + LLM round-trip, but bounds the inline `/v1/crawl` handler
+/// (up to 500 pages, no job queue) so a slow crawl can't pin a connection
+/// and a worker indefinitely.
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(120);
+
 #[derive(Parser, Debug)]
 #[command(
     name = "webclaw-server",
@@ -84,8 +92,29 @@ async fn main() -> anyhow::Result<()> {
         );
     }
 
-    let state = AppState::new(args.api_key.clone())?;
+    let state = AppState::new(args.api_key.clone()).await?;
 
+    let app = build_app(state);
+
+    let addr = SocketAddr::from((args.host, args.port));
+    let listener = tokio::net::TcpListener::bind(addr).await?;
+    let auth_status = if args.api_key.is_some() {
+        "bearer auth required"
+    } else {
+        "open mode (no auth)"
+    };
+    info!(%addr, mode = auth_status, "webclaw-server listening");
+
+    axum::serve(listener, app).await?;
+    Ok(())
+}
+
+/// Build the fully-layered axum router for a given [`AppState`].
+///
+/// Split out from `main` so the handler tests can exercise the exact same
+/// routing + middleware stack (auth, timeout) in-process via
+/// `tower::ServiceExt::oneshot`, with no TCP listener.
+fn build_app(state: AppState) -> Router {
     let v1 = Router::new()
         .route("/scrape", post(routes::scrape::scrape))
         .route(
@@ -102,7 +131,7 @@ async fn main() -> anyhow::Result<()> {
         .route("/brand", post(routes::brand::brand))
         .layer(from_fn_with_state(state.clone(), auth::require_bearer));
 
-    let app = Router::new()
+    Router::new()
         .route("/health", get(routes::health::health))
         .nest("/v1", v1)
         .layer(
@@ -115,20 +144,14 @@ async fn main() -> anyhow::Result<()> {
                 .allow_headers(Any)
                 .max_age(Duration::from_secs(3600)),
         )
+        // Caps total request time; returns 408 if exceeded. Wraps the router
+        // and CORS (tracing sits outside); covers every route, inline crawl too.
+        .layer(TimeoutLayer::with_status_code(
+            axum::http::StatusCode::REQUEST_TIMEOUT,
+            REQUEST_TIMEOUT,
+        ))
         .layer(TraceLayer::new_for_http())
-        .with_state(state);
-
-    let addr = SocketAddr::from((args.host, args.port));
-    let listener = tokio::net::TcpListener::bind(addr).await?;
-    let auth_status = if args.api_key.is_some() {
-        "bearer auth required"
-    } else {
-        "open mode (no auth)"
-    };
-    info!(%addr, mode = auth_status, "webclaw-server listening");
-
-    axum::serve(listener, app).await?;
-    Ok(())
+        .with_state(state)
 }
 
 fn is_unspecified_addr(addr: IpAddr) -> bool {
@@ -137,3 +160,133 @@ fn is_unspecified_addr(addr: IpAddr) -> bool {
         IpAddr::V6(ip) => ip.is_unspecified(),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    //! Hermetic handler tests. Each builds the real router via
+    //! [`build_app`] and drives it in-process with
+    //! [`tower::ServiceExt::oneshot`] — no TCP listener, no outbound
+    //! network. Endpoints that would fetch a URL are reached only on paths
+    //! that short-circuit before any network call (auth rejection, format
+    //! validation, the static `/v1/extractors` catalog, `/health`).
+
+    use super::*;
+    use axum::body::Body;
+    use axum::http::{Request, StatusCode};
+    use http_body_util::BodyExt;
+    use tower::ServiceExt;
+
+    const TEST_KEY: &str = "test-secret-key";
+
+    async fn app_with_key(key: Option<&str>) -> Router {
+        // `AppState::new` probes Ollama once at startup. With no Ollama
+        // running the probe returns fast (connection refused) and the
+        // tests below never touch the chain, so they stay hermetic either
+        // way — no env juggling required.
+        let state = AppState::new(key.map(str::to_owned))
+            .await
+            .expect("build state");
+        build_app(state)
+    }
+
+    fn get(uri: &str) -> Request<Body> {
+        Request::builder()
+            .uri(uri)
+            .body(Body::empty())
+            .expect("request")
+    }
+
+    fn get_auth(uri: &str, header: &str) -> Request<Body> {
+        Request::builder()
+            .uri(uri)
+            .header("authorization", header)
+            .body(Body::empty())
+            .expect("request")
+    }
+
+    async fn json_body(resp: axum::response::Response) -> serde_json::Value {
+        let bytes = resp.into_body().collect().await.expect("body").to_bytes();
+        serde_json::from_slice(&bytes).expect("json")
+    }
+
+    #[tokio::test]
+    async fn health_returns_version() {
+        let app = app_with_key(None).await;
+        let resp = app.oneshot(get("/health")).await.expect("response");
+        assert_eq!(resp.status(), StatusCode::OK);
+        let body = json_body(resp).await;
+        assert_eq!(body["status"], "ok");
+        assert_eq!(body["service"], "webclaw-server");
+        assert_eq!(body["version"], env!("CARGO_PKG_VERSION"));
+    }
+
+    #[tokio::test]
+    async fn missing_key_is_unauthorized() {
+        let app = app_with_key(Some(TEST_KEY)).await;
+        let resp = app.oneshot(get("/v1/extractors")).await.expect("response");
+        assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
+    }
+
+    #[tokio::test]
+    async fn wrong_key_is_unauthorized() {
+        let app = app_with_key(Some(TEST_KEY)).await;
+        let resp = app
+            .oneshot(get_auth("/v1/extractors", "Bearer wrong-key"))
+            .await
+            .expect("response");
+        assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
+    }
+
+    #[tokio::test]
+    async fn correct_key_authorized() {
+        let app = app_with_key(Some(TEST_KEY)).await;
+        // `/v1/extractors` is a static catalog — passes auth, no network.
+        let resp = app
+            .oneshot(get_auth("/v1/extractors", &format!("Bearer {TEST_KEY}")))
+            .await
+            .expect("response");
+        assert_eq!(resp.status(), StatusCode::OK);
+    }
+
+    #[tokio::test]
+    async fn lowercase_bearer_accepted() {
+        let app = app_with_key(Some(TEST_KEY)).await;
+        let resp = app
+            .oneshot(get_auth("/v1/extractors", &format!("bearer {TEST_KEY}")))
+            .await
+            .expect("response");
+        assert_eq!(resp.status(), StatusCode::OK);
+    }
+
+    #[tokio::test]
+    async fn open_mode_allows_unauthenticated() {
+        // No api key configured => auth middleware passes everything.
+        let app = app_with_key(None).await;
+        let resp = app.oneshot(get("/v1/extractors")).await.expect("response");
+        assert_eq!(resp.status(), StatusCode::OK);
+    }
+
+    #[tokio::test]
+    async fn unknown_format_is_bad_request() {
+        // Format validation now runs before the fetch, so a bogus format
+        // returns 400 without any network call.
+        let app = app_with_key(None).await;
+        let req = Request::builder()
+            .method("POST")
+            .uri("/v1/scrape")
+            .header("content-type", "application/json")
+            .body(Body::from(
+                r#"{"url":"https://example.com","formats":["bogus"]}"#,
+            ))
+            .expect("request");
+        let resp = app.oneshot(req).await.expect("response");
+        assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
+        let body = json_body(resp).await;
+        assert!(
+            body["error"]
+                .as_str()
+                .is_some_and(|e| e.contains("unknown format")),
+            "expected unknown-format error, got {body:?}"
+        );
+    }
+}
diff --git a/crates/webclaw-server/src/routes/crawl.rs b/crates/webclaw-server/src/routes/crawl.rs
index 9ea484c..23b712c 100644
--- a/crates/webclaw-server/src/routes/crawl.rs
+++ b/crates/webclaw-server/src/routes/crawl.rs
@@ -9,7 +9,7 @@ use axum::{Json, extract::State};
 use serde::Deserialize;
 use serde_json::{Value, json};
 use std::time::Duration;
-use webclaw_fetch::{CrawlConfig, Crawler, FetchConfig};
+use webclaw_fetch::{CrawlConfig, Crawler};
 
 use crate::{error::ApiError, state::AppState};
 
@@ -30,7 +30,7 @@ pub struct CrawlRequest {
 }
 
 pub async fn crawl(
-    State(_state): State<AppState>,
+    State(state): State<AppState>,
     Json(req): Json<CrawlRequest>,
 ) -> Result<Json<Value>, ApiError> {
     if req.url.trim().is_empty() {
@@ -42,7 +42,10 @@ pub async fn crawl(
     let concurrency = req.concurrency.unwrap_or(5).min(20);
 
     let config = CrawlConfig {
-        fetch: FetchConfig::default(),
+        // Inherit the shared client's profile/proxy/timeout instead of
+        // `FetchConfig::default()` (which is Chrome). The rest of the
+        // server fetches as Firefox; crawl now matches.
+        fetch: state.fetch_config().clone(),
         max_depth,
         max_pages,
         concurrency,
diff --git a/crates/webclaw-server/src/routes/diff.rs b/crates/webclaw-server/src/routes/diff.rs
index b0706fb..42bf082 100644
--- a/crates/webclaw-server/src/routes/diff.rs
+++ b/crates/webclaw-server/src/routes/diff.rs
@@ -36,36 +36,16 @@ impl PreviousSnapshot {
     fn into_extraction(self) -> ExtractionResult {
         match self {
             Self::Full(r) => r,
-            Self::Minimal { markdown, metadata } => ExtractionResult {
-                metadata: metadata.unwrap_or_else(empty_metadata),
-                content: Content {
-                    markdown,
-                    plain_text: String::new(),
-                    links: Vec::new(),
-                    images: Vec::new(),
-                    code_blocks: Vec::new(),
-                    raw_html: None,
-                },
-                domain_data: None,
-                structured_data: Vec::new(),
-            },
+            Self::Minimal { markdown, metadata } => ExtractionResult::new(
+                metadata.unwrap_or_else(empty_metadata),
+                Content::default().with_markdown(markdown),
+            ),
         }
     }
 }
 
 fn empty_metadata() -> Metadata {
-    Metadata {
-        title: None,
-        description: None,
-        author: None,
-        published_date: None,
-        language: None,
-        url: None,
-        site_name: None,
-        image: None,
-        favicon: None,
-        word_count: 0,
-    }
+    Metadata::default()
 }
 
 pub async fn diff_route(
diff --git a/crates/webclaw-server/src/routes/extract.rs b/crates/webclaw-server/src/routes/extract.rs
index 55b34a0..467c927 100644
--- a/crates/webclaw-server/src/routes/extract.rs
+++ b/crates/webclaw-server/src/routes/extract.rs
@@ -4,14 +4,14 @@
 //! * `schema` — JSON Schema describing what to extract.
 //! * `prompt` — natural-language instructions.
 //!
-//! At least one must be provided. The provider chain is built per
-//! request from env (Ollama -> OpenAI -> Anthropic). Self-hosters
-//! get the same fallback behaviour as the CLI.
+//! At least one must be provided. The provider chain (Ollama -> OpenAI
+//! -> Anthropic) is built once at startup and shared via `AppState`.
+//! Self-hosters get the same fallback behaviour as the CLI.
 
 use axum::{Json, extract::State};
 use serde::Deserialize;
 use serde_json::{Value, json};
-use webclaw_llm::{ProviderChain, extract::extract_json, extract::extract_with_prompt};
+use webclaw_llm::{extract::extract_json, extract::extract_with_prompt};
 
 use crate::{error::ApiError, state::AppState};
 
@@ -59,7 +59,7 @@ pub async fn extract(
         ));
     }
 
-    let chain = ProviderChain::default().await;
+    let chain = state.llm_chain();
     if chain.is_empty() {
         return Err(ApiError::Llm(
             "no LLM providers configured (set OLLAMA_HOST, OPENAI_API_KEY, or ANTHROPIC_API_KEY)"
@@ -69,10 +69,10 @@ pub async fn extract(
 
     let model = req.model.as_deref();
     let data = if let Some(schema) = req.schema.as_ref() {
-        extract_json(&content, schema, &chain, model).await?
+        extract_json(&content, schema, chain, model).await?
     } else {
         let prompt = req.prompt.as_deref().unwrap_or_default();
-        extract_with_prompt(&content, prompt, &chain, model).await?
+        extract_with_prompt(&content, prompt, chain, model).await?
     };
 
     Ok(Json(json!({
diff --git a/crates/webclaw-server/src/routes/scrape.rs b/crates/webclaw-server/src/routes/scrape.rs
index 2f7e73f..35d7298 100644
--- a/crates/webclaw-server/src/routes/scrape.rs
+++ b/crates/webclaw-server/src/routes/scrape.rs
@@ -52,8 +52,18 @@ pub async fn scrape(
     if req.url.trim().is_empty() {
         return Err(ApiError::bad_request("`url` is required"));
     }
-    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
     let formats = req.formats.as_vec();
+    // Validate requested formats up front so a typo fails fast with a 400
+    // instead of after a full (wasted) fetch + extract.
+    if let Some(bad) = formats
+        .iter()
+        .find(|f| !matches!(f.as_str(), "markdown" | "text" | "llm" | "html" | "json"))
+    {
+        return Err(ApiError::bad_request(format!(
+            "unknown format: '{bad}' (allowed: markdown, text, llm, html, json)"
+        )));
+    }
+    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
 
     let options = ExtractionOptions {
         include_selectors: req.include_selectors,
diff --git a/crates/webclaw-server/src/routes/summarize.rs b/crates/webclaw-server/src/routes/summarize.rs
index 6b645ab..024f483 100644
--- a/crates/webclaw-server/src/routes/summarize.rs
+++ b/crates/webclaw-server/src/routes/summarize.rs
@@ -3,7 +3,7 @@
 use axum::{Json, extract::State};
 use serde::Deserialize;
 use serde_json::{Value, json};
-use webclaw_llm::{ProviderChain, summarize::summarize};
+use webclaw_llm::summarize::summarize;
 
 use crate::{error::ApiError, state::AppState};
 
@@ -36,7 +36,7 @@ pub async fn summarize_route(
         ));
     }
 
-    let chain = ProviderChain::default().await;
+    let chain = state.llm_chain();
     if chain.is_empty() {
         return Err(ApiError::Llm(
             "no LLM providers configured (set OLLAMA_HOST, OPENAI_API_KEY, or ANTHROPIC_API_KEY)"
@@ -44,7 +44,7 @@ pub async fn summarize_route(
         ));
     }
 
-    let summary = summarize(&content, req.max_sentences, &chain, req.model.as_deref()).await?;
+    let summary = summarize(&content, req.max_sentences, chain, req.model.as_deref()).await?;
 
     Ok(Json(json!({
         "url": req.url,
diff --git a/crates/webclaw-server/src/state.rs b/crates/webclaw-server/src/state.rs
index 6c2e8f7..9807a04 100644
--- a/crates/webclaw-server/src/state.rs
+++ b/crates/webclaw-server/src/state.rs
@@ -20,6 +20,7 @@ use std::sync::Arc;
 use tracing::info;
 use webclaw_fetch::cloud::CloudClient;
 use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig};
+use webclaw_llm::ProviderChain;
 
 /// Single-process state shared across all request handlers.
 #[derive(Clone)]
@@ -34,6 +35,16 @@ struct Inner {
     /// auto-deref `&Arc<FetchClient>` -> `&FetchClient`, so this costs
     /// them nothing.
     pub fetch: Arc<FetchClient>,
+    /// The exact [`FetchConfig`] the shared `fetch` client was built from.
+    /// Endpoints that spin up their own client (e.g. `/v1/crawl`, which
+    /// builds a `Crawler` with its own internal `FetchClient`) clone this
+    /// so they inherit the same browser profile / proxy / timeout instead
+    /// of silently falling back to `FetchConfig::default()` (Chrome).
+    pub fetch_config: FetchConfig,
+    /// LLM provider chain (Ollama -> OpenAI -> Anthropic), built once at
+    /// startup. `/v1/extract` and `/v1/summarize` borrow this instead of
+    /// rebuilding the chain (and re-probing Ollama) on every request.
+    pub llm_chain: Arc<ProviderChain>,
     /// Inbound bearer-auth token for this server's own `/v1/*` surface.
     pub api_key: Option<String>,
 }
@@ -45,12 +56,15 @@ impl AppState {
     ///
     /// `inbound_api_key` is the bearer token clients must present;
     /// cloud-fallback credentials come from the env (checked here).
-    pub fn new(inbound_api_key: Option<String>) -> anyhow::Result<Self> {
+    ///
+    /// Async because the LLM provider chain probes Ollama for availability
+    /// once at startup; doing it here keeps it off the per-request hot path.
+    pub async fn new(inbound_api_key: Option<String>) -> anyhow::Result<Self> {
         let config = FetchConfig {
             browser: BrowserProfile::Firefox,
             ..FetchConfig::default()
         };
-        let mut fetch = FetchClient::new(config)
+        let mut fetch = FetchClient::new(config.clone())
             .map_err(|e| anyhow::anyhow!("failed to build fetch client: {e}"))?;
 
         // Cloud fallback: only activates when the operator has provided
@@ -66,9 +80,13 @@ impl AppState {
             fetch = fetch.with_cloud(cloud);
         }
 
+        let llm_chain = Arc::new(ProviderChain::default().await);
+
         Ok(Self {
             inner: Arc::new(Inner {
                 fetch: Arc::new(fetch),
+                fetch_config: config,
+                llm_chain,
                 api_key: inbound_api_key,
             }),
         })
@@ -78,6 +96,19 @@ impl AppState {
         &self.inner.fetch
     }
 
+    /// The [`FetchConfig`] the shared client was built from. Cloned by
+    /// endpoints that need to construct their own client with identical
+    /// settings (currently `/v1/crawl`).
+    pub fn fetch_config(&self) -> &FetchConfig {
+        &self.inner.fetch_config
+    }
+
+    /// The shared LLM provider chain. Borrowed by `/v1/extract` and
+    /// `/v1/summarize`; `&ProviderChain` coerces to `&dyn LlmProvider`.
+    pub fn llm_chain(&self) -> &ProviderChain {
+        &self.inner.llm_chain
+    }
+
     pub fn api_key(&self) -> Option<&str> {
         self.inner.api_key.as_deref()
     }
diff --git a/deny.toml b/deny.toml
new file mode 100644
index 0000000..6cd59f3
--- /dev/null
+++ b/deny.toml
@@ -0,0 +1,59 @@
+# cargo-deny configuration — supply-chain gate for the webclaw workspace.
+# Run locally with `cargo deny check`; CI runs it via EmbarkStudios/cargo-deny-action.
+#
+# Scope of enforcement:
+#   advisories — fail on known RUSTSEC vulnerabilities / unmaintained crates
+#   bans       — keep the dep tree lean and free of disallowed crates
+#   licenses   — allow the AGPL-3.0 workspace plus permissive deps only
+#   sources    — only crates.io and our own GitHub org
+
+[graph]
+# Enable every Cargo feature when resolving the graph so deps behind optional
+# features are checked too. Platform filtering is a separate `targets` key.
+all-features = true
+
+[advisories]
+version = 2
+# Fail the build on any unfixed advisory by default (cargo-deny v2 errors on
+# `vulnerability`/`unmaintained`/`unsound`; `yanked` only warns by default).
+# Add specific RUSTSEC ids here with a justification only when a fix is not yet
+# available upstream.
+ignore = []
+
+[bans]
+# Warn (don't hard-fail) on duplicate versions of the same crate — common and
+# usually benign in a tree this size; revisit if a duplicate becomes a problem.
+multiple-versions = "warn"
+wildcard-dependencies = "deny"
+# Crates that must never enter the tree. Empty for now; this is where a banned
+# transitive dep (e.g. an unmaintained TLS or crypto crate) would be listed.
+deny = []
+
+[licenses]
+version = 2
+# Licenses we accept on dependencies (permissive, plus file-level copyleft
+# MPL-2.0) and AGPL-3.0 for the workspace crates themselves. SPDX identifiers.
+allow = [
+    "AGPL-3.0",
+    "MIT",
+    "Apache-2.0",
+    "Apache-2.0 WITH LLVM-exception",
+    "BSD-2-Clause",
+    "BSD-3-Clause",
+    "MPL-2.0",
+    "ISC",
+    "Unicode-3.0",
+    "Unicode-DFS-2016",
+    "Zlib",
+    "CC0-1.0",
+]
+# Minimum confidence for licenses detected from license text (no SPDX in manifest).
+confidence-threshold = 0.8
+
+[sources]
+unknown-registry = "deny"
+unknown-git = "deny"
+allow-registry = ["https://github.com/rust-lang/crates.io-index"]
+
+[sources.allow-org]
+github = ["0xMassi"]