feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM

Document extraction:
- DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml)
- XLSX/XLS: markdown tables with multi-sheet support (via calamine)
- CSV: quoted field handling, markdown table output
- All auto-detected by Content-Type header or URL extension

New features:
- -f html output format (sanitized HTML)
- Multi-URL watch: --urls-file + --watch monitors all URLs in parallel
- Batch + LLM: --extract-prompt/--extract-json works with multiple URLs
- Mixed batch: HTML pages + DOCX + XLSX + CSV in one command

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-03-26 15:28:23 +01:00
parent 0e4128782a
commit ea14848772
8 changed files with 1520 additions and 41 deletions

View file

@ -3,6 +3,19 @@
All notable changes to webclaw are documented here. All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/). Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.2.0] — 2026-03-26
### Added
- **DOCX extraction**: auto-detected by Content-Type or URL extension, outputs markdown with headings
- **XLSX/XLS extraction**: spreadsheets converted to markdown tables, multi-sheet support via calamine
- **CSV extraction**: parsed with quoted field handling, output as markdown table
- **HTML output format**: `-f html` returns sanitized HTML from the extracted content
- **Multi-URL watch**: `--watch` now works with `--urls-file` to monitor multiple URLs in parallel
- **Batch + LLM extraction**: `--extract-prompt` and `--extract-json` now work with multiple URLs
- **Scheduled batch watch**: watch multiple URLs with aggregate change reports and per-URL diffs
---
## [0.1.7] — 2026-03-26 ## [0.1.7] — 2026-03-26
### Fixed ### Fixed

316
Cargo.lock generated
View file

@ -17,6 +17,17 @@ dependencies = [
"pom", "pom",
] ]
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if",
"cipher",
"cpufeatures 0.2.17",
]
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "1.1.4" version = "1.1.4"
@ -106,6 +117,15 @@ version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
dependencies = [
"derive_arbitrary",
]
[[package]] [[package]]
name = "async-compression" name = "async-compression"
version = "0.4.41" version = "0.4.41"
@ -129,6 +149,15 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "atoi_simd"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ad17c7c205c2c28b527b9845eeb91cf1b4d008b438f98ce0e628227a822758e"
dependencies = [
"debug_unsafe",
]
[[package]] [[package]]
name = "atomic-waker" name = "atomic-waker"
version = "1.1.2" version = "1.1.2"
@ -224,6 +253,42 @@ version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
[[package]]
name = "bzip2"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
dependencies = [
"bzip2-sys",
]
[[package]]
name = "bzip2-sys"
version = "0.1.13+1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
dependencies = [
"cc",
"pkg-config",
]
[[package]]
name = "calamine"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20ae05a4e39297eecf9a994210d27501318c37a9318201f8e11050add82bb6f0"
dependencies = [
"atoi_simd",
"byteorder",
"codepage",
"encoding_rs",
"fast-float2",
"log",
"quick-xml 0.39.2",
"serde",
"zip 7.2.0",
]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.2.57" version = "1.2.57"
@ -255,7 +320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"cpufeatures", "cpufeatures 0.3.0",
"rand_core 0.10.0", "rand_core 0.10.0",
] ]
@ -273,6 +338,16 @@ dependencies = [
"windows-link", "windows-link",
] ]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
]
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.6.0" version = "4.6.0"
@ -322,6 +397,15 @@ dependencies = [
"cc", "cc",
] ]
[[package]]
name = "codepage"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
dependencies = [
"encoding_rs",
]
[[package]] [[package]]
name = "colorchoice" name = "colorchoice"
version = "1.0.5" version = "1.0.5"
@ -348,6 +432,12 @@ version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
[[package]]
name = "constant_time_eq"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
[[package]] [[package]]
name = "cookie" name = "cookie"
version = "0.18.1" version = "0.18.1"
@ -393,6 +483,15 @@ version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "cpufeatures"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "cpufeatures" name = "cpufeatures"
version = "0.3.0" version = "0.3.0"
@ -402,6 +501,21 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "crc"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d"
dependencies = [
"crc-catalog",
]
[[package]]
name = "crc-catalog"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
[[package]] [[package]]
name = "crc32fast" name = "crc32fast"
version = "1.5.0" version = "1.5.0"
@ -411,6 +525,12 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]] [[package]]
name = "crypto-common" name = "crypto-common"
version = "0.1.7" version = "0.1.7"
@ -478,6 +598,18 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "debug_unsafe"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2"
[[package]]
name = "deflate64"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
[[package]] [[package]]
name = "deranged" name = "deranged"
version = "0.5.8" version = "0.5.8"
@ -487,6 +619,17 @@ dependencies = [
"powerfmt", "powerfmt",
] ]
[[package]]
name = "derive_arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "derive_more" name = "derive_more"
version = "0.99.20" version = "0.99.20"
@ -506,6 +649,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [ dependencies = [
"block-buffer", "block-buffer",
"crypto-common", "crypto-common",
"subtle",
] ]
[[package]] [[package]]
@ -601,6 +745,12 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "fast-float2"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.3.0" version = "2.3.0"
@ -621,6 +771,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
dependencies = [ dependencies = [
"crc32fast", "crc32fast",
"miniz_oxide", "miniz_oxide",
"zlib-rs",
] ]
[[package]] [[package]]
@ -857,6 +1008,15 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hmac"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
dependencies = [
"digest",
]
[[package]] [[package]]
name = "html5ever" name = "html5ever"
version = "0.29.1" version = "0.29.1"
@ -1121,6 +1281,15 @@ dependencies = [
"serde_core", "serde_core",
] ]
[[package]]
name = "inout"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"generic-array",
]
[[package]] [[package]]
name = "ipnet" name = "ipnet"
version = "2.12.0" version = "2.12.0"
@ -1244,6 +1413,27 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "lzma-rs"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
dependencies = [
"byteorder",
"crc",
]
[[package]]
name = "lzma-sys"
version = "0.1.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]] [[package]]
name = "mac" name = "mac"
version = "0.1.1" version = "0.1.1"
@ -1414,6 +1604,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec"
[[package]]
name = "pbkdf2"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
dependencies = [
"digest",
"hmac",
]
[[package]] [[package]]
name = "pdf-extract" name = "pdf-extract"
version = "0.7.12" version = "0.7.12"
@ -1629,6 +1829,16 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "quick-xml"
version = "0.39.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
dependencies = [
"encoding_rs",
"memchr",
]
[[package]] [[package]]
name = "quinn" name = "quinn"
version = "0.11.9" version = "0.11.9"
@ -2220,6 +2430,17 @@ dependencies = [
"stable_deref_trait", "stable_deref_trait",
] ]
[[package]]
name = "sha1"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
dependencies = [
"cfg-if",
"cpufeatures 0.2.17",
"digest",
]
[[package]] [[package]]
name = "sharded-slab" name = "sharded-slab"
version = "0.1.7" version = "0.1.7"
@ -2645,6 +2866,12 @@ dependencies = [
"pom", "pom",
] ]
[[package]]
name = "typed-path"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
[[package]] [[package]]
name = "typenum" name = "typenum"
version = "1.19.0" version = "1.19.0"
@ -2881,7 +3108,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-cli" name = "webclaw-cli"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"clap", "clap",
"dotenvy", "dotenvy",
@ -2901,7 +3128,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-core" name = "webclaw-core"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"ego-tree", "ego-tree",
"once_cell", "once_cell",
@ -2919,10 +3146,11 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-fetch" name = "webclaw-fetch"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"calamine",
"primp", "primp",
"quick-xml", "quick-xml 0.37.5",
"rand 0.8.5", "rand 0.8.5",
"serde", "serde",
"serde_json", "serde_json",
@ -2933,11 +3161,12 @@ dependencies = [
"url", "url",
"webclaw-core", "webclaw-core",
"webclaw-pdf", "webclaw-pdf",
"zip 2.4.2",
] ]
[[package]] [[package]]
name = "webclaw-llm" name = "webclaw-llm"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"reqwest", "reqwest",
@ -2950,7 +3179,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-mcp" name = "webclaw-mcp"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"dotenvy", "dotenvy",
"reqwest", "reqwest",
@ -2970,7 +3199,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-pdf" name = "webclaw-pdf"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"pdf-extract", "pdf-extract",
"thiserror", "thiserror",
@ -3301,6 +3530,15 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "xz2"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
dependencies = [
"lzma-sys",
]
[[package]] [[package]]
name = "yoke" name = "yoke"
version = "0.8.1" version = "0.8.1"
@ -3418,12 +3656,74 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "zip"
version = "2.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
dependencies = [
"aes",
"arbitrary",
"bzip2",
"constant_time_eq",
"crc32fast",
"crossbeam-utils",
"deflate64",
"displaydoc",
"flate2",
"getrandom 0.3.4",
"hmac",
"indexmap",
"lzma-rs",
"memchr",
"pbkdf2",
"sha1",
"thiserror",
"time",
"xz2",
"zeroize",
"zopfli",
"zstd",
]
[[package]]
name = "zip"
version = "7.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
dependencies = [
"crc32fast",
"flate2",
"indexmap",
"memchr",
"typed-path",
"zopfli",
]
[[package]]
name = "zlib-rs"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
[[package]] [[package]]
name = "zmij" name = "zmij"
version = "1.0.21" version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
[[package]]
name = "zopfli"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249"
dependencies = [
"bumpalo",
"crc32fast",
"log",
"simd-adler32",
]
[[package]] [[package]]
name = "zstd" name = "zstd"
version = "0.13.3" version = "0.13.3"

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"] members = ["crates/*"]
[workspace.package] [workspace.package]
version = "0.1.7" version = "0.2.0"
edition = "2024" edition = "2024"
license = "MIT" license = "MIT"
repository = "https://github.com/0xMassi/webclaw" repository = "https://github.com/0xMassi/webclaw"

View file

@ -95,7 +95,7 @@ struct Cli {
#[arg(long)] #[arg(long)]
urls_file: Option<String>, urls_file: Option<String>,
/// Output format (markdown, json, text, llm) /// Output format (markdown, json, text, llm, html)
#[arg(short, long, default_value = "markdown")] #[arg(short, long, default_value = "markdown")]
format: OutputFormat, format: OutputFormat,
@ -277,6 +277,7 @@ enum OutputFormat {
Json, Json,
Text, Text,
Llm, Llm,
Html,
} }
#[derive(Clone, ValueEnum)] #[derive(Clone, ValueEnum)]
@ -394,7 +395,7 @@ fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) .map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
.unwrap_or_default(), .unwrap_or_default(),
only_main_content: cli.only_main_content, only_main_content: cli.only_main_content,
include_raw_html: cli.raw_html, include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
} }
} }
@ -417,6 +418,7 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
OutputFormat::Markdown | OutputFormat::Llm => "md", OutputFormat::Markdown | OutputFormat::Llm => "md",
OutputFormat::Json => "json", OutputFormat::Json => "json",
OutputFormat::Text => "txt", OutputFormat::Text => "txt",
OutputFormat::Html => "html",
}; };
let parsed = url::Url::parse(raw_url); let parsed = url::Url::parse(raw_url);
@ -470,6 +472,15 @@ fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String
Ok(()) Ok(())
} }
/// Get raw HTML from an extraction result, falling back to markdown if unavailable.
fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
result
.content
.raw_html
.as_deref()
.unwrap_or(&result.content.markdown)
}
/// Format an `ExtractionResult` into a string for the given output format. /// Format an `ExtractionResult` into a string for the given output format.
fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String { fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
match format { match format {
@ -484,6 +495,7 @@ fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata
OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"), OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
OutputFormat::Text => result.content.plain_text.clone(), OutputFormat::Text => result.content.plain_text.clone(),
OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()), OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
OutputFormat::Html => raw_html_or_markdown(result).to_string(),
} }
} }
@ -586,6 +598,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
OutputFormat::Json => "json", OutputFormat::Json => "json",
OutputFormat::Text => "text", OutputFormat::Text => "text",
OutputFormat::Llm => "llm", OutputFormat::Llm => "llm",
OutputFormat::Html => "html",
}; };
let resp = c let resp = c
.scrape( .scrape(
@ -618,6 +631,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
OutputFormat::Json => "json", OutputFormat::Json => "json",
OutputFormat::Text => "text", OutputFormat::Text => "text",
OutputFormat::Llm => "llm", OutputFormat::Llm => "llm",
OutputFormat::Html => "html",
}; };
match c match c
.scrape( .scrape(
@ -793,6 +807,9 @@ fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata:
OutputFormat::Llm => { OutputFormat::Llm => {
println!("{}", to_llm_text(result, result.metadata.url.as_deref())); println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
} }
OutputFormat::Html => {
println!("{}", raw_html_or_markdown(result));
}
} }
} }
@ -845,6 +862,17 @@ fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
print_cloud_output(resp, &OutputFormat::Markdown); print_cloud_output(resp, &OutputFormat::Markdown);
} }
} }
OutputFormat::Html => {
if let Some(html) = resp
.get("content")
.and_then(|c| c.get("raw_html"))
.and_then(|h| h.as_str())
{
println!("{html}");
} else {
print_cloud_output(resp, &OutputFormat::Markdown);
}
}
} }
} }
@ -937,6 +965,17 @@ fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata
println!(); println!();
} }
} }
OutputFormat::Html => {
for page in &result.pages {
let Some(ref extraction) = page.extraction else {
continue;
};
println!("---");
println!("<!-- Page: {} -->\n", page.url);
println!("{}", raw_html_or_markdown(extraction));
println!();
}
}
} }
} }
@ -1009,6 +1048,21 @@ fn print_batch_output(results: &[BatchExtractResult], format: &OutputFormat, sho
} }
} }
} }
OutputFormat::Html => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("<!-- {} -->\n", r.url);
println!("{}", raw_html_or_markdown(extraction));
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
} }
} }
@ -1393,24 +1447,15 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
}); });
} }
async fn run_watch(cli: &Cli) -> Result<(), String> { async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
let raw_url = cli.urls.first().ok_or("--watch requires a URL argument")?; if urls.is_empty() {
let url = normalize_url(raw_url); return Err("--watch requires at least one URL".into());
}
let client = let client = Arc::new(
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
let options = build_extraction_options(cli);
// Initial snapshot
let mut previous = client
.fetch_and_extract_with_options(&url, &options)
.await
.map_err(|e| format!("initial fetch failed: {e}"))?;
eprintln!(
"[watch] Initial snapshot: {url} ({} words)",
previous.metadata.word_count
); );
let options = build_extraction_options(cli);
// Ctrl+C handler // Ctrl+C handler
let cancelled = Arc::new(AtomicBool::new(false)); let cancelled = Arc::new(AtomicBool::new(false));
@ -1420,6 +1465,33 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
flag.store(true, Ordering::Relaxed); flag.store(true, Ordering::Relaxed);
}); });
// Single-URL mode: preserve original behavior exactly
if urls.len() == 1 {
return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
}
// Multi-URL mode: batch fetch, diff each, report aggregate
run_watch_multi(cli, &client, &options, urls, &cancelled).await
}
/// Original single-URL watch loop -- backward compatible.
async fn run_watch_single(
cli: &Cli,
client: &Arc<FetchClient>,
options: &ExtractionOptions,
url: &str,
cancelled: &Arc<AtomicBool>,
) -> Result<(), String> {
let mut previous = client
.fetch_and_extract_with_options(url, options)
.await
.map_err(|e| format!("initial fetch failed: {e}"))?;
eprintln!(
"[watch] Initial snapshot: {url} ({} words)",
previous.metadata.word_count
);
loop { loop {
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await; tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
@ -1428,7 +1500,7 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
break; break;
} }
let current = match client.fetch_and_extract_with_options(&url, &options).await { let current = match client.fetch_and_extract_with_options(url, options).await {
Ok(result) => result, Ok(result) => result,
Err(e) => { Err(e) => {
eprintln!("[watch] Fetch error ({}): {e}", timestamp()); eprintln!("[watch] Fetch error ({}): {e}", timestamp());
@ -1454,7 +1526,6 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
.spawn() .spawn()
{ {
Ok(mut child) => { Ok(mut child) => {
// Pipe diff JSON to stdin, then detach
if let Some(mut stdin) = child.stdin.take() { if let Some(mut stdin) = child.stdin.take() {
use tokio::io::AsyncWriteExt; use tokio::io::AsyncWriteExt;
let _ = stdin.write_all(diff_json.as_bytes()).await; let _ = stdin.write_all(diff_json.as_bytes()).await;
@ -1464,7 +1535,6 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
} }
} }
// Fire webhook on change
if let Some(ref webhook_url) = cli.webhook { if let Some(ref webhook_url) = cli.webhook {
fire_webhook( fire_webhook(
webhook_url, webhook_url,
@ -1487,6 +1557,162 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
Ok(()) Ok(())
} }
/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
///
/// Takes an initial snapshot of every URL (failures are reported and retried
/// as baselines on later checks), then on each interval re-fetches the whole
/// set in parallel, diffs each result against its stored snapshot, and prints
/// one aggregate line per check. When anything changed, `--on-change` and
/// `--webhook` each fire at most once per check with the combined change list.
/// Runs until Ctrl+C flips `cancelled`.
async fn run_watch_multi(
    cli: &Cli,
    client: &Arc<FetchClient>,
    options: &ExtractionOptions,
    urls: &[String],
    cancelled: &Arc<AtomicBool>,
) -> Result<(), String> {
    let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect();

    // Initial pass: fetch all URLs in parallel
    let initial_results = client
        .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
        .await;

    let mut snapshots = std::collections::HashMap::new();
    let mut ok_count = 0usize;
    let mut err_count = 0usize;
    for r in initial_results {
        match r.result {
            Ok(extraction) => {
                snapshots.insert(r.url, extraction);
                ok_count += 1;
            }
            Err(e) => {
                eprintln!("[watch] Initial fetch error: {} -- {e}", r.url);
                err_count += 1;
            }
        }
    }
    eprintln!(
        "[watch] Watching {} URLs (interval: {}s)",
        urls.len(),
        cli.watch_interval
    );
    eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors");

    let mut check_number = 0u64;
    loop {
        tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;

        if cancelled.load(Ordering::Relaxed) {
            eprintln!("[watch] Stopped");
            break;
        }
        check_number += 1;

        let current_results = client
            .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
            .await;

        let mut changed: Vec<serde_json::Value> = Vec::new();
        let mut same_count = 0usize;
        let mut fetch_errors = 0usize;

        for r in current_results {
            match r.result {
                Ok(current) => {
                    if let Some(previous) = snapshots.get(&r.url) {
                        let diff = webclaw_core::diff::diff(previous, &current);
                        if diff.status == ChangeStatus::Same {
                            same_count += 1;
                        } else {
                            changed.push(serde_json::json!({
                                "url": r.url,
                                "word_count_delta": diff.word_count_delta,
                            }));
                            // Changed page becomes the new baseline for the next check.
                            snapshots.insert(r.url, current);
                        }
                    } else {
                        // URL failed initially, first successful fetch -- store as baseline
                        snapshots.insert(r.url, current);
                        same_count += 1;
                    }
                }
                Err(e) => {
                    eprintln!("[watch] Fetch error: {} -- {e}", r.url);
                    fetch_errors += 1;
                }
            }
        }

        let ts = timestamp();
        let err_suffix = if fetch_errors > 0 {
            format!(", {fetch_errors} errors")
        } else {
            String::new()
        };
        if changed.is_empty() {
            eprintln!(
                "[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}"
            );
        } else {
            eprintln!(
                "[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}",
                changed.len(),
            );
            for entry in &changed {
                let url = entry["url"].as_str().unwrap_or("?");
                let delta = entry["word_count_delta"].as_i64().unwrap_or(0);
                eprintln!(" -> {url} (word delta: {delta:+})");
            }

            // Build the aggregate payload once -- --on-change and the webhook
            // previously each constructed an identical JSON object, which
            // risked the two drifting apart.
            let payload = serde_json::json!({
                "event": "watch_changes",
                "check_number": check_number,
                "total_urls": urls.len(),
                "changed": changed.len(),
                "same": same_count,
                "changes": changed,
            });

            // Fire --on-change once with all changes
            if let Some(ref cmd) = cli.on_change {
                let payload_json = serde_json::to_string(&payload).unwrap_or_default();
                eprintln!("[watch] Running: {cmd}");
                match tokio::process::Command::new("sh")
                    .arg("-c")
                    .arg(cmd)
                    .stdin(std::process::Stdio::piped())
                    .spawn()
                {
                    Ok(mut child) => {
                        // Pipe the aggregate JSON to the command's stdin, then
                        // detach -- the child is intentionally not awaited.
                        if let Some(mut stdin) = child.stdin.take() {
                            use tokio::io::AsyncWriteExt;
                            let _ = stdin.write_all(payload_json.as_bytes()).await;
                        }
                    }
                    Err(e) => eprintln!("[watch] Failed to run command: {e}"),
                }
            }

            // Fire webhook once with aggregate payload
            if let Some(ref webhook_url) = cli.webhook {
                fire_webhook(webhook_url, &payload);
            }
        }
    }

    Ok(())
}
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
// Load previous snapshot // Load previous snapshot
let snapshot_json = std::fs::read_to_string(snapshot_path) let snapshot_json = std::fs::read_to_string(snapshot_path)
@ -1626,6 +1852,158 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
Ok(()) Ok(())
} }
/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
/// URLs are processed sequentially to respect LLM provider rate limits.
async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let provider = build_llm_provider(cli).await?;
let model = cli.llm_model.as_deref();
// Pre-parse schema once if --extract-json is used
let schema = if let Some(ref schema_input) = cli.extract_json {
let schema_str = if let Some(path) = schema_input.strip_prefix('@') {
std::fs::read_to_string(path)
.map_err(|e| format!("failed to read schema file {path}: {e}"))?
} else {
schema_input.clone()
};
Some(
serde_json::from_str::<serde_json::Value>(&schema_str)
.map_err(|e| format!("invalid JSON schema: {e}"))?,
)
} else {
None
};
// Build custom filename lookup from entries
let custom_names: std::collections::HashMap<&str, &str> = entries
.iter()
.filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n)))
.collect();
let total = entries.len();
let mut ok = 0usize;
let mut errors = 0usize;
let mut all_results: Vec<serde_json::Value> = Vec::with_capacity(total);
for (i, (url, _)) in entries.iter().enumerate() {
let idx = i + 1;
eprint!("[{idx}/{total}] {url} ");
// Fetch and extract page content
let extraction = match client.fetch_and_extract_with_options(url, &options).await {
Ok(r) => r,
Err(e) => {
errors += 1;
let msg = format!("fetch failed: {e}");
eprintln!("-> error: {msg}");
all_results.push(serde_json::json!({ "url": url, "error": msg }));
continue;
}
};
let text = &extraction.content.plain_text;
// Run the appropriate LLM operation
let llm_result = if let Some(ref schema) = schema {
webclaw_llm::extract::extract_json(text, schema, provider.as_ref(), model)
.await
.map(LlmOutput::Json)
} else if let Some(ref prompt) = cli.extract_prompt {
webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
.await
.map(LlmOutput::Json)
} else if let Some(sentences) = cli.summarize {
webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
.await
.map(LlmOutput::Text)
} else {
unreachable!("run_batch_llm called without LLM flags")
};
match llm_result {
Ok(output) => {
ok += 1;
let (output_str, result_json) = match &output {
LlmOutput::Json(v) => {
let s = serde_json::to_string_pretty(v).expect("serialization failed");
let j = serde_json::json!({ "url": url, "result": v });
(s, j)
}
LlmOutput::Text(s) => {
let j = serde_json::json!({ "url": url, "result": s });
(s.clone(), j)
}
};
// Count top-level fields/items for progress display
let detail = match &output {
LlmOutput::Json(v) => match v {
serde_json::Value::Object(m) => format!("{} fields", m.len()),
serde_json::Value::Array(a) => format!("{} items", a.len()),
_ => "done".to_string(),
},
LlmOutput::Text(s) => {
let words = s.split_whitespace().count();
format!("{words} words")
}
};
eprintln!("-> extracted {detail}");
if let Some(ref dir) = cli.output_dir {
let filename = custom_names
.get(url.as_str())
.map(|s| s.to_string())
.unwrap_or_else(|| url_to_filename(url, &OutputFormat::Json));
write_to_file(dir, &filename, &output_str)?;
} else {
println!("--- {url}");
println!("{output_str}");
println!();
}
all_results.push(result_json);
}
Err(e) => {
errors += 1;
let msg = format!("LLM extraction failed: {e}");
eprintln!("-> error: {msg}");
all_results.push(serde_json::json!({ "url": url, "error": msg }));
}
}
}
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
if let Some(ref webhook_url) = cli.webhook {
fire_webhook(
webhook_url,
&serde_json::json!({
"event": "batch_llm_complete",
"total": total,
"ok": ok,
"errors": errors,
}),
);
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
if errors > 0 {
Err(format!("{errors} of {total} URLs failed"))
} else {
Ok(())
}
}
/// Intermediate type to hold LLM output before formatting.
enum LlmOutput {
    /// Structured result from `--extract-json` / `--extract-prompt`.
    Json(serde_json::Value),
    /// Free-text result from `--summarize`.
    Text(String),
}
/// Returns true if any LLM flag is set. /// Returns true if any LLM flag is set.
fn has_llm_flags(cli: &Cli) -> bool { fn has_llm_flags(cli: &Cli) -> bool {
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some() cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
@ -1656,9 +2034,16 @@ async fn main() {
return; return;
} }
// --watch: poll a URL for changes // --watch: poll URL(s) for changes
if cli.watch { if cli.watch {
if let Err(e) = run_watch(&cli).await { let watch_urls: Vec<String> = match collect_urls(&cli) {
Ok(entries) => entries.into_iter().map(|(url, _)| url).collect(),
Err(e) => {
eprintln!("error: {e}");
process::exit(1);
}
};
if let Err(e) = run_watch(&cli, &watch_urls).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -1683,15 +2068,6 @@ async fn main() {
return; return;
} }
// LLM modes: --extract-json, --extract-prompt, --summarize
if has_llm_flags(&cli) {
if let Err(e) = run_llm(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// Collect all URLs from args + --urls-file // Collect all URLs from args + --urls-file
let entries = match collect_urls(&cli) { let entries = match collect_urls(&cli) {
Ok(u) => u, Ok(u) => u,
@ -1701,6 +2077,21 @@ async fn main() {
} }
}; };
// LLM modes: --extract-json, --extract-prompt, --summarize
// When multiple URLs are provided, run batch LLM extraction over all of them.
if has_llm_flags(&cli) {
if entries.len() > 1 {
if let Err(e) = run_batch_llm(&cli, &entries).await {
eprintln!("error: {e}");
process::exit(1);
}
} else if let Err(e) = run_llm(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// Multi-URL batch mode // Multi-URL batch mode
if entries.len() > 1 { if entries.len() > 1 {
if let Err(e) = run_batch(&cli, &entries).await { if let Err(e) = run_batch(&cli, &entries).await {
@ -1824,6 +2215,14 @@ mod tests {
); );
} }
#[test]
fn url_to_filename_html_format() {
    // `-f html` output files get a `.html` extension derived from the URL path.
    assert_eq!(
        url_to_filename("https://example.com/docs/api", &OutputFormat::Html),
        "docs/api.html"
    );
}
#[test] #[test]
fn url_to_filename_special_chars() { fn url_to_filename_special_chars() {
// Spaces and special chars get replaced with underscores // Spaces and special chars get replaced with underscores

View file

@ -19,6 +19,8 @@ url = "2"
rand = "0.8" rand = "0.8"
quick-xml = { version = "0.37", features = ["serde"] } quick-xml = { version = "0.37", features = ["serde"] }
serde_json.workspace = true serde_json.workspace = true
calamine = "0.34"
zip = "2"
[dev-dependencies] [dev-dependencies]
tempfile = "3" tempfile = "3"

View file

@ -399,6 +399,27 @@ impl FetchClient {
let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?; let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
Ok(pdf_to_extraction_result(&pdf_result, &final_url)) Ok(pdf_to_extraction_result(&pdf_result, &final_url))
} else if let Some(doc_type) =
crate::document::is_document_content_type(&headers, &final_url)
{
debug!(status, doc_type = ?doc_type, "detected document response, extracting");
let bytes = response
.bytes()
.await
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
let elapsed = start.elapsed();
debug!(
status,
bytes = bytes.len(),
elapsed_ms = %elapsed.as_millis(),
"document fetch complete"
);
let mut result = crate::document::extract_document(&bytes, doc_type)?;
result.metadata.url = Some(final_url);
Ok(result)
} else { } else {
let html = response let html = response
.text() .text()

View file

@ -0,0 +1,743 @@
/// Document extraction for DOCX, XLSX, XLS, and CSV files.
/// Auto-detects document type from Content-Type headers or URL extension,
/// then extracts text content as markdown — same pattern as PDF extraction.
use std::collections::HashMap;
use std::io::{Cursor, Read};
use tracing::debug;
use crate::error::FetchError;
/// The kinds of non-HTML documents this module can extract.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocType {
    Docx,
    Xlsx,
    Xls,
    Csv,
}

impl DocType {
    /// Short uppercase label used in log and error messages.
    fn label(self) -> &'static str {
        match self {
            Self::Docx => "DOCX",
            Self::Xlsx => "XLSX",
            Self::Xls => "XLS",
            Self::Csv => "CSV",
        }
    }
}
/// Detect document type from response headers or URL extension.
/// Returns `None` for non-document responses (HTML, PDF, etc.).
pub fn is_document_content_type(headers: &HashMap<String, String>, url: &str) -> Option<DocType> {
    // The Content-Type header takes precedence over the URL extension.
    // NOTE(review): assumes header keys are stored lowercase — the fetch layer
    // appears to normalize them; confirm.
    if let Some(ct) = headers.get("content-type") {
        // Drop MIME parameters ("; charset=utf-8") and compare case-insensitively.
        let mime = ct.split(';').next().unwrap_or("").trim().to_ascii_lowercase();
        match mime.as_str() {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
                return Some(DocType::Docx);
            }
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
                return Some(DocType::Xlsx);
            }
            "application/vnd.ms-excel" => return Some(DocType::Xls),
            "text/csv" => return Some(DocType::Csv),
            _ => {}
        }
    }
    // Fall back to the URL path extension (query string stripped, case-insensitive).
    let path = url.split('?').next().unwrap_or(url).to_ascii_lowercase();
    [
        (".docx", DocType::Docx),
        (".xlsx", DocType::Xlsx),
        (".xls", DocType::Xls),
        (".csv", DocType::Csv),
    ]
    .into_iter()
    .find(|(ext, _)| path.ends_with(ext))
    .map(|(_, doc_type)| doc_type)
}
/// Extract text content from document bytes, returning an ExtractionResult.
///
/// Dispatches on `doc_type` to the format-specific extractor, then derives
/// plain text and a word count from the resulting markdown. All other metadata
/// is left empty; the fetch layer fills in `url` after extraction.
pub fn extract_document(
    bytes: &[u8],
    doc_type: DocType,
) -> Result<webclaw_core::ExtractionResult, FetchError> {
    debug!(
        doc_type = doc_type.label(),
        bytes = bytes.len(),
        "extracting document"
    );
    // Each extractor returns markdown (headings for DOCX, tables for tabular data).
    let markdown = match doc_type {
        DocType::Docx => extract_docx(bytes)?,
        DocType::Xlsx => extract_xlsx(bytes)?,
        DocType::Xls => extract_xls(bytes)?,
        DocType::Csv => extract_csv(bytes)?,
    };
    // Plain text is the markdown with heading markers and table pipes removed.
    let plain_text = strip_markdown_formatting(&markdown);
    let word_count = plain_text.split_whitespace().count();
    Ok(webclaw_core::ExtractionResult {
        metadata: webclaw_core::Metadata {
            // Office/CSV documents carry no HTML-style metadata; only the
            // word count is known here. `url` is set by the caller.
            title: None,
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: None,
            site_name: None,
            image: None,
            favicon: None,
            word_count,
        },
        content: webclaw_core::Content {
            markdown,
            plain_text,
            // Documents contribute no links/images/code blocks.
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
/// Extract text from a DOCX file (ZIP of XML).
/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
    let cursor = Cursor::new(bytes);
    let mut archive =
        zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
    // Block scope: the ZipFile borrows `archive`, so read it fully into a
    // String and drop the borrow before parsing.
    let xml = {
        let mut file = archive
            .by_name("word/document.xml")
            .map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?;
        let mut buf = String::new();
        file.read_to_string(&mut buf)
            .map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
        buf
    };
    parse_docx_xml(&xml)
}
/// Parse DOCX XML (word/document.xml) into markdown.
///
/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`,
/// mapping `Heading1`..`Heading6` (and `Title`) to markdown `#` headings.
/// `<w:br/>` and `<w:tab/>` become `\n` and `\t` inside the paragraph text.
fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
    use quick_xml::Reader;
    use quick_xml::events::Event;
    let mut reader = Reader::from_str(xml);
    let mut paragraphs: Vec<String> = Vec::new();
    // State tracking for the current paragraph
    let mut in_paragraph = false;
    let mut in_run = false; // inside <w:r> (run)
    let mut in_text = false; // inside <w:t>
    let mut current_text = String::new();
    let mut heading_level: Option<u8> = None; // None = normal paragraph
    let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
    loop {
        match reader.read_event() {
            // Opening tags: enter paragraph/run/text state.
            Ok(Event::Start(ref e)) => {
                let name = e.name();
                match local_name(name.as_ref()) {
                    b"p" if is_w_namespace(name.as_ref()) => {
                        in_paragraph = true;
                        current_text.clear();
                        heading_level = None;
                    }
                    b"pPr" if in_paragraph => in_ppr = true,
                    b"pStyle" if in_ppr => {
                        heading_level = extract_heading_level(e);
                    }
                    b"r" if in_paragraph => in_run = true,
                    b"t" if in_run => in_text = true,
                    b"br" if in_paragraph => current_text.push('\n'),
                    b"tab" if in_paragraph => current_text.push('\t'),
                    _ => {}
                }
            }
            // Self-closing tags have no matching End event, so only leaf
            // elements may update state here: a self-closing container like
            // <w:t/> or <w:p/> must NOT flip a state flag on that no End
            // event would ever turn off again.
            Ok(Event::Empty(ref e)) => {
                let name = e.name();
                match local_name(name.as_ref()) {
                    b"pStyle" if in_ppr => {
                        heading_level = extract_heading_level(e);
                    }
                    b"br" if in_paragraph => current_text.push('\n'),
                    b"tab" if in_paragraph => current_text.push('\t'),
                    _ => {}
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.name();
                match local_name(name.as_ref()) {
                    b"p" if in_paragraph => {
                        // Paragraph finished: emit it (with heading prefix) if non-empty.
                        let text = current_text.trim().to_string();
                        if !text.is_empty() {
                            let formatted = match heading_level {
                                Some(level @ 1..=6) => {
                                    format!("{} {text}", "#".repeat(level as usize))
                                }
                                _ => text,
                            };
                            paragraphs.push(formatted);
                        }
                        in_paragraph = false;
                    }
                    b"pPr" => in_ppr = false,
                    b"r" => {
                        in_run = false;
                        in_text = false;
                    }
                    b"t" => in_text = false,
                    _ => {}
                }
            }
            // Character data inside <w:t> is the actual document text.
            Ok(Event::Text(ref e)) if in_text => {
                if let Ok(text) = e.unescape() {
                    current_text.push_str(&text);
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
            }
            _ => {}
        }
    }
    Ok(paragraphs.join("\n\n"))
}
/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms.
fn is_w_namespace(name: &[u8]) -> bool {
    // quick-xml hands us the raw qualified-name bytes; accept both spellings.
    matches!(name, b"w:p" | b"p")
}
/// Extract the local name from a possibly namespaced XML tag.
/// `w:p` -> `p`, `p` -> `p`
fn local_name(name: &[u8]) -> &[u8] {
    // Everything after the first ':' is the local part; no ':' means the
    // name is already unprefixed.
    name.iter()
        .position(|&b| b == b':')
        .map_or(name, |pos| &name[pos + 1..])
}
/// Extract heading level from a `<w:pStyle w:val="Heading1"/>` element.
/// Returns `Some(1)` for the "Title" style, `Some(n)` (capped at 6) for
/// "HeadingN", and `None` for any other style.
fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
    e.attributes().flatten().find_map(|attr| {
        // Only the (possibly namespaced) `val` attribute carries the style name.
        if local_name(attr.key.as_ref()) != b"val" {
            return None;
        }
        let style = String::from_utf8_lossy(&attr.value).to_ascii_lowercase();
        if style == "title" {
            // Word's "Title" style maps to a top-level heading.
            return Some(1);
        }
        // "heading1", "heading2", … -> markdown depth, clamped to h6.
        style
            .strip_prefix("heading")
            .and_then(|rest| rest.parse::<u8>().ok())
            .map(|n| n.min(6))
    })
}
/// Extract spreadsheet content using calamine (XLSX format).
/// Thin wrapper over [`extract_spreadsheet`]; the label only affects error messages.
fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLSX")
}
/// Extract spreadsheet content using calamine (legacy binary XLS format).
/// Thin wrapper over [`extract_spreadsheet`]; the label only affects error messages.
fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLS")
}
/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
/// Reads every sheet and renders each non-empty one as a markdown table
/// under a `## Sheet: <name>` heading; `label` is only used in error messages.
fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
    use calamine::Reader;
    let mut workbook: calamine::Sheets<_> =
        calamine::open_workbook_auto_from_rs(Cursor::new(bytes))
            .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
    // Snapshot the names up front: reading ranges needs `&mut workbook`.
    let names: Vec<String> = workbook.sheet_names().to_vec();
    let mut sections: Vec<String> = Vec::new();
    for name in &names {
        let range = workbook
            .worksheet_range(name)
            .map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
        let rows: Vec<Vec<String>> = range
            .rows()
            .map(|row| row.iter().map(cell_to_string).collect())
            .collect();
        // Sheets with no cells are skipped entirely.
        if !rows.is_empty() {
            sections.push(format!(
                "## Sheet: {name}\n\n{}",
                rows_to_markdown_table(&rows)
            ));
        }
    }
    if sections.is_empty() {
        Ok("(empty spreadsheet)".to_string())
    } else {
        Ok(sections.join("\n\n"))
    }
}
/// Convert a calamine cell value to a display string.
///
/// Empty cells become `""`, integral floats are printed without a trailing
/// `.0`, and error cells are rendered as `#<Debug of the error kind>`.
fn cell_to_string(cell: &calamine::Data) -> String {
    use calamine::Data;
    match cell {
        Data::Empty => String::new(),
        Data::String(s) => s.clone(),
        Data::Int(n) => n.to_string(),
        // Whole-valued floats print as integers (3 rather than 3.0).
        Data::Float(f) => format_float(*f),
        Data::Bool(b) => b.to_string(),
        Data::Error(e) => format!("#{e:?}"),
        // NOTE(review): Display of calamine's DateTime — presumably the raw
        // serial value rather than a formatted date; confirm if dates matter.
        Data::DateTime(dt) => format!("{dt}"),
        Data::DateTimeIso(s) => s.clone(),
        Data::DurationIso(s) => s.clone(),
    }
}
/// Format a float, dropping trailing `.0` for clean integer display.
fn format_float(f: f64) -> String {
    // Only whole values that fit in i64 are printed as integers; everything
    // else (fractions, huge magnitudes, NaN/inf) uses the default rendering.
    let prints_as_integer = f.fract() == 0.0 && f.abs() < i64::MAX as f64;
    if prints_as_integer {
        (f as i64).to_string()
    } else {
        f.to_string()
    }
}
/// Extract CSV text and convert to markdown table.
fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
    // Lossy decode: invalid UTF-8 becomes U+FFFD instead of failing extraction.
    let text = String::from_utf8_lossy(bytes);
    let rows = parse_csv_rows(&text);
    if rows.is_empty() {
        Ok("(empty CSV)".to_string())
    } else {
        Ok(rows_to_markdown_table(&rows))
    }
}
/// Parse CSV text into rows of fields, handling quoted fields with commas/newlines.
/// Fields are trimmed; rows whose fields are all empty are dropped.
fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut row: Vec<String> = Vec::new();
    let mut field = String::new();
    let mut in_quotes = false;
    let mut iter = text.chars().peekable();
    while let Some(ch) = iter.next() {
        match (in_quotes, ch) {
            // Inside quotes: "" is an escaped quote, a lone " closes the field.
            (true, '"') => {
                if iter.peek() == Some(&'"') {
                    iter.next();
                    field.push('"');
                } else {
                    in_quotes = false;
                }
            }
            (true, other) => field.push(other),
            (false, '"') => in_quotes = true,
            (false, ',') => {
                row.push(std::mem::take(&mut field).trim().to_string());
            }
            (false, '\n') => {
                // End of record: keep the row only if it has any content.
                row.push(std::mem::take(&mut field).trim().to_string());
                if row.iter().any(|f| !f.is_empty()) {
                    rows.push(std::mem::take(&mut row));
                } else {
                    row.clear();
                }
            }
            // CRs are dropped; the following '\n' terminates the record.
            (false, '\r') => {}
            (false, other) => field.push(other),
        }
    }
    // Flush a trailing record that has no final newline.
    if !field.is_empty() || !row.is_empty() {
        row.push(field.trim().to_string());
        if row.iter().any(|f| !f.is_empty()) {
            rows.push(row);
        }
    }
    rows
}
/// Convert rows (first row = header) into a markdown table.
///
/// Ragged rows are padded with empty cells to the widest row. Cell text is
/// sanitized so it cannot break the table structure: `|` is escaped as `\|`
/// and newlines/CRs are flattened to spaces.
fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
    if rows.is_empty() {
        return String::new();
    }
    // Pad every row to the widest row so ragged data still forms a valid table.
    let col_count = rows.iter().map(|r| r.len()).max().unwrap_or(0);
    if col_count == 0 {
        return String::new();
    }
    // A raw '|' would start a new column and a raw newline a new row —
    // both would corrupt the table, so neutralize them per cell.
    let sanitize =
        |s: &str| s.replace('|', "\\|").replace('\n', " ").replace('\r', " ");
    let render = |row: &[String]| -> String {
        let cells: Vec<String> = (0..col_count)
            .map(|i| sanitize(row.get(i).map(String::as_str).unwrap_or("")))
            .collect();
        format!("| {} |", cells.join(" | "))
    };
    let mut lines: Vec<String> = Vec::with_capacity(rows.len() + 1);
    // Header row, then the separator, then the data rows.
    lines.push(render(&rows[0]));
    lines.push(format!("| {} |", vec!["---"; col_count].join(" | ")));
    for row in &rows[1..] {
        lines.push(render(row));
    }
    lines.join("\n")
}
/// Strip markdown formatting to get plain text.
/// Heading markers are removed, table separator rows dropped, and table rows
/// collapsed into space-joined cell text.
fn strip_markdown_formatting(markdown: &str) -> String {
    let mut plain = String::with_capacity(markdown.len());
    for raw in markdown.lines() {
        let line = raw.trim_start_matches('#').trim();
        // Table separator rows carry no content.
        if line.starts_with("| ---") || line == "|---|" {
            continue;
        }
        // A line wrapped in pipes is a table row: keep the cells, drop the pipes.
        match line.strip_prefix('|').and_then(|s| s.strip_suffix('|')) {
            Some(inner) => {
                let cells: Vec<&str> = inner.split('|').map(str::trim).collect();
                plain.push_str(&cells.join(" "));
            }
            None => plain.push_str(line),
        }
        plain.push('\n');
    }
    plain.trim().to_string()
}
#[cfg(test)]
mod tests {
    // Unit tests grouped by concern: content-type detection, CSV parsing,
    // DOCX XML parsing, markdown table rendering, and plain-text stripping.
    use super::*;

    // --- Content-type detection ---

    #[test]
    fn test_detect_docx_content_type() {
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Docx)
        );
    }

    #[test]
    fn test_detect_xlsx_content_type() {
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xlsx)
        );
    }

    #[test]
    fn test_detect_xls_content_type() {
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.ms-excel".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xls)
        );
    }

    #[test]
    fn test_detect_csv_content_type() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/csv".to_string());
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_csv_content_type_with_charset() {
        // MIME parameters ("; charset=utf-8") must not defeat detection.
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "text/csv; charset=utf-8".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_by_url_extension() {
        // No headers at all: detection falls back to the URL extension.
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx"),
            Some(DocType::Docx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.xlsx"),
            Some(DocType::Xlsx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/old.xls"),
            Some(DocType::Xls)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.csv"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_url_extension_with_query() {
        // The query string is stripped before checking the extension.
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
            Some(DocType::Docx)
        );
    }

    #[test]
    fn test_detect_url_extension_case_insensitive() {
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
            Some(DocType::Xlsx)
        );
    }

    #[test]
    fn test_detect_none_for_html() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/html".to_string());
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/page"),
            None
        );
    }

    #[test]
    fn test_content_type_takes_precedence_over_url() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/csv".to_string());
        // URL says .xlsx but Content-Type says CSV — header wins
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/data.xlsx"),
            Some(DocType::Csv)
        );
    }

    // --- CSV parsing ---

    #[test]
    fn test_csv_simple() {
        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| Name | Age | City |"));
        assert!(result.contains("| --- | --- | --- |"));
        assert!(result.contains("| Alice | 30 | NYC |"));
        assert!(result.contains("| Bob | 25 | LA |"));
    }

    #[test]
    fn test_csv_quoted_fields() {
        // Quoted fields may contain commas; "" is an escaped quote.
        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("Has a, comma"));
        assert!(result.contains("Said \"hello\""));
    }

    #[test]
    fn test_csv_empty() {
        let result = extract_csv(b"").unwrap();
        assert_eq!(result, "(empty CSV)");
    }

    #[test]
    fn test_csv_windows_line_endings() {
        let csv = "A,B\r\n1,2\r\n3,4\r\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| A | B |"));
        assert!(result.contains("| 1 | 2 |"));
    }

    // --- DOCX XML parsing ---

    #[test]
    fn test_docx_xml_simple_paragraphs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world\n\nSecond paragraph");
    }

    #[test]
    fn test_docx_xml_headings() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
<w:r><w:t>Title</w:t></w:r>
</w:p>
<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>Subtitle</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert!(result.contains("# Title"));
        assert!(result.contains("Body text"));
        assert!(result.contains("## Subtitle"));
    }

    #[test]
    fn test_docx_xml_multiple_runs() {
        // Multiple <w:r> runs within one paragraph concatenate into one line.
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Hello </w:t></w:r>
<w:r><w:t>world</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world");
    }

    #[test]
    fn test_docx_xml_empty_paragraphs_skipped() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p></w:p>
<w:p><w:r><w:t>Content</w:t></w:r></w:p>
<w:p><w:r><w:t> </w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Content");
    }

    // --- Markdown table ---

    #[test]
    fn test_rows_to_markdown_table() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string()],
            vec!["1".to_string(), "2".to_string()],
            vec!["3".to_string(), "4".to_string()],
        ];
        let table = rows_to_markdown_table(&rows);
        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
    }

    #[test]
    fn test_rows_to_markdown_table_ragged() {
        // Short rows are padded with empty cells to the widest row.
        let rows = vec![
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string()], // fewer columns
        ];
        let table = rows_to_markdown_table(&rows);
        assert!(table.contains("| 1 | | |"));
    }

    // --- Extract result ---

    #[test]
    fn test_extract_csv_result() {
        let csv = "Name,Score\nAlice,100\n";
        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
        assert!(result.content.markdown.contains("| Name | Score |"));
        assert!(result.metadata.word_count > 0);
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
    }

    // --- Strip markdown ---

    #[test]
    fn test_strip_markdown() {
        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
        let plain = strip_markdown_formatting(md);
        assert!(plain.contains("Title"));
        assert!(plain.contains("Some text"));
        assert!(plain.contains("A B"));
        assert!(!plain.contains("---"));
    }
}

View file

@ -5,6 +5,7 @@
pub mod browser; pub mod browser;
pub mod client; pub mod client;
pub mod crawler; pub mod crawler;
pub mod document;
pub mod error; pub mod error;
pub mod linkedin; pub mod linkedin;
pub mod proxy; pub mod proxy;