diff --git a/CHANGELOG.md b/CHANGELOG.md
index e263948..c6a0244 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,19 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.2.0] — 2026-03-26
+
+### Added
+- **DOCX extraction**: auto-detected by Content-Type or URL extension, outputs markdown with headings
+- **XLSX/XLS extraction**: spreadsheets converted to markdown tables, multi-sheet support via calamine
+- **CSV extraction**: parsed with quoted-field handling, output as a markdown table
+- **HTML output format**: `-f html` outputs the raw HTML captured during extraction, falling back to markdown when none is available
+- **Multi-URL watch**: `--watch` now works with `--urls-file` to monitor multiple URLs in parallel
+- **Batch + LLM extraction**: `--extract-prompt` and `--extract-json` now work with multiple URLs
+- **Scheduled batch watch**: watch multiple URLs with aggregate change reports and per-URL diffs
+
+---
+
 ## [0.1.7] — 2026-03-26
 
 ### Fixed
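To make the extraction entries concrete, here is a minimal sketch of the new pipeline (illustrative snippet, not part of the diff; it calls the public `webclaw_fetch::document` API added below, and the exact table string is derived from `rows_to_markdown_table`):

```rust
use webclaw_fetch::document::{DocType, extract_document};

// Hedged sketch: exercises the new public API end to end. DocType is normally
// auto-detected from the Content-Type header or URL extension by
// is_document_content_type(); here we pass it explicitly.
fn demo_csv_to_markdown() {
    let result = extract_document(b"Name,Age\nAlice,30\n", DocType::Csv)
        .expect("CSV extraction should not fail");
    assert_eq!(
        result.content.markdown,
        "| Name | Age |\n| --- | --- |\n| Alice | 30 |"
    );
}
```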
"cpufeatures 0.3.0", "rand_core 0.10.0", ] @@ -273,6 +338,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "clap" version = "4.6.0" @@ -322,6 +397,15 @@ dependencies = [ "cc", ] +[[package]] +name = "codepage" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4" +dependencies = [ + "encoding_rs", +] + [[package]] name = "colorchoice" version = "1.0.5" @@ -348,6 +432,12 @@ version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "cookie" version = "0.18.1" @@ -393,6 +483,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "cpufeatures" version = "0.3.0" @@ -402,6 +501,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.5.0" @@ -411,6 +525,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crypto-common" version = "0.1.7" @@ -478,6 +598,18 @@ dependencies = [ "syn", ] +[[package]] +name = "debug_unsafe" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2" + +[[package]] +name = "deflate64" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2" + [[package]] name = "deranged" version = "0.5.8" @@ -487,6 +619,17 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "derive_more" version = "0.99.20" @@ -506,6 +649,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", ] [[package]] @@ -601,6 +745,12 @@ dependencies = [ "num-traits", 
 ]
 
+[[package]]
+name = "fast-float2"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
+
 [[package]]
 name = "fastrand"
 version = "2.3.0"
@@ -621,6 +771,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
  "crc32fast",
  "miniz_oxide",
+ "zlib-rs",
 ]
 
@@ -857,6 +1008,15 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "hmac"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
+dependencies = [
+ "digest",
+]
+
 [[package]]
 name = "html5ever"
 version = "0.29.1"
@@ -1121,6 +1281,15 @@ dependencies = [
  "serde_core",
 ]
 
+[[package]]
+name = "inout"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
+dependencies = [
+ "generic-array",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.12.0"
@@ -1244,6 +1413,27 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
 
+[[package]]
+name = "lzma-rs"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
+dependencies = [
+ "byteorder",
+ "crc",
+]
+
+[[package]]
+name = "lzma-sys"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]
+
 [[package]]
 name = "mac"
 version = "0.1.1"
@@ -1414,6 +1604,16 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec"
 
+[[package]]
+name = "pbkdf2"
+version = "0.12.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
+dependencies = [
+ "digest",
+ "hmac",
+]
+
 [[package]]
 name = "pdf-extract"
 version = "0.7.12"
@@ -1629,6 +1829,16 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "quick-xml"
+version = "0.39.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
+dependencies = [
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "quinn"
 version = "0.11.9"
@@ -2220,6 +2430,17 @@ dependencies = [
  "stable_deref_trait",
 ]
 
+[[package]]
+name = "sha1"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.2.17",
+ "digest",
+]
+
 [[package]]
 name = "sharded-slab"
 version = "0.1.7"
@@ -2645,6 +2866,12 @@ dependencies = [
  "pom",
 ]
 
+[[package]]
+name = "typed-path"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
+
 [[package]]
 name = "typenum"
 version = "1.19.0"
@@ -2881,7 +3108,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-cli"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "clap",
  "dotenvy",
@@ -2901,7 +3128,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-core"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "ego-tree",
  "once_cell",
@@ -2919,10 +3146,11 @@
 
 [[package]]
 name = "webclaw-fetch"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
+ "calamine",
  "primp",
- "quick-xml",
+ "quick-xml 0.37.5",
  "rand 0.8.5",
  "serde",
  "serde_json",
@@ -2933,11 +3161,12 @@ dependencies = [
  "url",
  "webclaw-core",
  "webclaw-pdf",
+ "zip 2.4.2",
 ]
 
 [[package]]
 name = "webclaw-llm"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "async-trait",
  "reqwest",
@@ -2950,7 +3179,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-mcp"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "dotenvy",
  "reqwest",
@@ -2970,7 +3199,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-pdf"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "pdf-extract",
  "thiserror",
@@ -3301,6 +3530,15 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
 
+[[package]]
+name = "xz2"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
+dependencies = [
+ "lzma-sys",
+]
+
 [[package]]
 name = "yoke"
 version = "0.8.1"
@@ -3418,12 +3656,74 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "zip"
+version = "2.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
+dependencies = [
+ "aes",
+ "arbitrary",
+ "bzip2",
+ "constant_time_eq",
+ "crc32fast",
+ "crossbeam-utils",
+ "deflate64",
+ "displaydoc",
+ "flate2",
+ "getrandom 0.3.4",
+ "hmac",
+ "indexmap",
+ "lzma-rs",
+ "memchr",
+ "pbkdf2",
+ "sha1",
+ "thiserror",
+ "time",
+ "xz2",
+ "zeroize",
+ "zopfli",
+ "zstd",
+]
+
+[[package]]
+name = "zip"
+version = "7.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
+dependencies = [
+ "crc32fast",
+ "flate2",
+ "indexmap",
+ "memchr",
+ "typed-path",
+ "zopfli",
+]
+
+[[package]]
+name = "zlib-rs"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
+
 [[package]]
 name = "zmij"
 version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
 
+[[package]]
+name = "zopfli"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249"
+dependencies = [
+ "bumpalo",
+ "crc32fast",
+ "log",
+ "simd-adler32",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.3"
diff --git a/Cargo.toml b/Cargo.toml
index 40eada1..129f937 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]
 
 [workspace.package]
-version = "0.1.7"
+version = "0.2.0"
 edition = "2024"
 license = "MIT"
 repository = "https://github.com/0xMassi/webclaw"
diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index 4aa8a7f..f58c68b 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -95,7 +95,7 @@ struct Cli {
     #[arg(long)]
     urls_file: Option<String>,
 
-    /// Output format (markdown, json, text, llm)
+    /// Output format (markdown, json, text, llm, html)
     #[arg(short, long, default_value = "markdown")]
     format: OutputFormat,
 
@@ -277,6 +277,7 @@ enum OutputFormat {
     Json,
     Text,
     Llm,
+    Html,
 }
 
 #[derive(Clone, ValueEnum)]
@@ -394,7 +395,7 @@ fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
             .map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
             .unwrap_or_default(),
         only_main_content: cli.only_main_content,
-        include_raw_html: cli.raw_html,
+        include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
     }
 }
 
@@ -417,6 +418,7 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
         OutputFormat::Markdown | OutputFormat::Llm => "md",
         OutputFormat::Json => "json",
         OutputFormat::Text => "txt",
+        OutputFormat::Html => "html",
     };
 
     let parsed = url::Url::parse(raw_url);
@@ -470,6 +472,15 @@ fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
     Ok(())
 }
 
+/// Get raw HTML from an extraction result, falling back to markdown if unavailable.
+fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
+    result
+        .content
+        .raw_html
+        .as_deref()
+        .unwrap_or(&result.content.markdown)
+}
+
 /// Format an `ExtractionResult` into a string for the given output format.
 fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
     match format {
@@ -484,6 +495,7 @@ fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata
         OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
         OutputFormat::Text => result.content.plain_text.clone(),
         OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
+        OutputFormat::Html => raw_html_or_markdown(result).to_string(),
     }
 }
 
@@ -586,6 +598,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
             OutputFormat::Json => "json",
             OutputFormat::Text => "text",
             OutputFormat::Llm => "llm",
+            OutputFormat::Html => "html",
         };
         let resp = c
             .scrape(
@@ -618,6 +631,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
             OutputFormat::Json => "json",
             OutputFormat::Text => "text",
             OutputFormat::Llm => "llm",
+            OutputFormat::Html => "html",
         };
         match c
             .scrape(
@@ -793,6 +807,9 @@ fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata:
         OutputFormat::Llm => {
             println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
         }
+        OutputFormat::Html => {
+            println!("{}", raw_html_or_markdown(result));
+        }
     }
 }
 
@@ -845,6 +862,17 @@ fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
                 print_cloud_output(resp, &OutputFormat::Markdown);
             }
         }
+        OutputFormat::Html => {
+            if let Some(html) = resp
+                .get("content")
+                .and_then(|c| c.get("raw_html"))
+                .and_then(|h| h.as_str())
+            {
+                println!("{html}");
+            } else {
+                print_cloud_output(resp, &OutputFormat::Markdown);
+            }
+        }
     }
 }
 
@@ -937,6 +965,17 @@ fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata
                 println!();
             }
         }
+        OutputFormat::Html => {
+            for page in &result.pages {
+                let Some(ref extraction) = page.extraction else {
+                    continue;
+                };
+                println!("---");
+                println!("<!-- {} -->\n", page.url);
+                println!("{}", raw_html_or_markdown(extraction));
+                println!();
+            }
+        }
     }
 }
 
@@ -1009,6 +1048,21 @@ fn print_batch_output(results: &[BatchExtractResult], format: &OutputFormat, sho
             }
         }
         }
+        OutputFormat::Html => {
+            for r in results {
+                match &r.result {
+                    Ok(extraction) => {
+                        println!("---");
+                        println!("<!-- {} -->\n", r.url);
+                        println!("{}", raw_html_or_markdown(extraction));
+                        println!();
+                    }
+                    Err(e) => {
+                        eprintln!("error: {} -- {}", r.url, e);
+                    }
+                }
+            }
+        }
     }
 }
 
@@ -1393,24 +1447,15 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
     });
 }
 
-async fn run_watch(cli: &Cli) -> Result<(), String> {
-    let raw_url = cli.urls.first().ok_or("--watch requires a URL argument")?;
-    let url = normalize_url(raw_url);
+async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
+    if urls.is_empty() {
+        return Err("--watch requires at least one URL".into());
+    }
 
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
-    let options = build_extraction_options(cli);
-
-    // Initial snapshot
-    let mut previous = client
-        .fetch_and_extract_with_options(&url, &options)
-        .await
-        .map_err(|e| format!("initial fetch failed: {e}"))?;
-
-    eprintln!(
-        "[watch] Initial snapshot: {url} ({} words)",
-        previous.metadata.word_count
+    let client = Arc::new(
+        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
     );
+    let options = build_extraction_options(cli);
 
     // Ctrl+C handler
     let cancelled = Arc::new(AtomicBool::new(false));
@@ -1420,6 +1465,33 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
         flag.store(true, Ordering::Relaxed);
     });
 
+    // Single-URL mode: preserve original behavior exactly
+    if urls.len() == 1 {
+        return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
+    }
+
+    // Multi-URL mode: batch fetch, diff each, report aggregate
+    run_watch_multi(cli, &client, &options, urls, &cancelled).await
+}
+
+/// Original single-URL watch loop -- backward compatible.
+async fn run_watch_single(
+    cli: &Cli,
+    client: &Arc<FetchClient>,
+    options: &ExtractionOptions,
+    url: &str,
+    cancelled: &Arc<AtomicBool>,
+) -> Result<(), String> {
+    let mut previous = client
+        .fetch_and_extract_with_options(url, options)
+        .await
+        .map_err(|e| format!("initial fetch failed: {e}"))?;
+
+    eprintln!(
+        "[watch] Initial snapshot: {url} ({} words)",
+        previous.metadata.word_count
+    );
+
     loop {
         tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
 
@@ -1428,7 +1500,7 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
             break;
         }
 
-        let current = match client.fetch_and_extract_with_options(&url, &options).await {
+        let current = match client.fetch_and_extract_with_options(url, options).await {
             Ok(result) => result,
             Err(e) => {
                 eprintln!("[watch] Fetch error ({}): {e}", timestamp());
@@ -1454,7 +1526,6 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
             .spawn()
         {
             Ok(mut child) => {
-                // Pipe diff JSON to stdin, then detach
                 if let Some(mut stdin) = child.stdin.take() {
                     use tokio::io::AsyncWriteExt;
                     let _ = stdin.write_all(diff_json.as_bytes()).await;
                 }
             }
 
-        // Fire webhook on change
         if let Some(ref webhook_url) = cli.webhook {
             fire_webhook(
                 webhook_url,
@@ -1487,6 +1557,162 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
     Ok(())
 }
 
+/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
+async fn run_watch_multi(
+    cli: &Cli,
+    client: &Arc<FetchClient>,
+    options: &ExtractionOptions,
+    urls: &[String],
+    cancelled: &Arc<AtomicBool>,
+) -> Result<(), String> {
+    let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect();
+
+    // Initial pass: fetch all URLs in parallel
+    let initial_results = client
+        .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+        .await;
+
+    let mut snapshots = std::collections::HashMap::new();
+    let mut ok_count = 0usize;
+    let mut err_count = 0usize;
+
+    for r in initial_results {
+        match r.result {
+            Ok(extraction) => {
+                snapshots.insert(r.url, extraction);
+                ok_count += 1;
+            }
+            Err(e) => {
+                eprintln!("[watch] Initial fetch error: {} -- {e}", r.url);
+                err_count += 1;
+            }
+        }
+    }
+
+    eprintln!(
+        "[watch] Watching {} URLs (interval: {}s)",
+        urls.len(),
+        cli.watch_interval
+    );
+    eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors");
+
+    let mut check_number = 0u64;
+
+    loop {
+        tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
+
+        if cancelled.load(Ordering::Relaxed) {
+            eprintln!("[watch] Stopped");
+            break;
+        }
+
+        check_number += 1;
+
+        let current_results = client
+            .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+            .await;
+
+        let mut changed: Vec<serde_json::Value> = Vec::new();
+        let mut same_count = 0usize;
+        let mut fetch_errors = 0usize;
+
+        for r in current_results {
+            match r.result {
+                Ok(current) => {
+                    if let Some(previous) = snapshots.get(&r.url) {
+                        let diff = webclaw_core::diff::diff(previous, &current);
+                        if diff.status == ChangeStatus::Same {
+                            same_count += 1;
+                        } else {
+                            changed.push(serde_json::json!({
+                                "url": r.url,
+                                "word_count_delta": diff.word_count_delta,
+                            }));
+                            snapshots.insert(r.url, current);
+                        }
+                    } else {
+                        // URL failed initially, first successful fetch -- store as baseline
+                        snapshots.insert(r.url, current);
+                        same_count += 1;
+                    }
+                }
+                Err(e) => {
+                    eprintln!("[watch] Fetch error: {} -- {e}", r.url);
+                    fetch_errors += 1;
+                }
+            }
+        }
+
+        let ts = timestamp();
+        let err_suffix = if fetch_errors > 0 {
+            format!(", {fetch_errors} errors")
+        } else {
+            String::new()
+        };
+
+        if changed.is_empty() {
+            eprintln!(
+                "[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}"
+            );
+        } else {
+            eprintln!(
+                "[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}",
+                changed.len(),
+            );
+            for entry in &changed {
+                let url = entry["url"].as_str().unwrap_or("?");
+                let delta = entry["word_count_delta"].as_i64().unwrap_or(0);
+                eprintln!(" -> {url} (word delta: {delta:+})");
+            }
+
+            // Fire --on-change once with all changes
+            if let Some(ref cmd) = cli.on_change {
+                let payload = serde_json::json!({
+                    "event": "watch_changes",
+                    "check_number": check_number,
+                    "total_urls": urls.len(),
+                    "changed": changed.len(),
+                    "same": same_count,
+                    "changes": changed,
+                });
+                let payload_json = serde_json::to_string(&payload).unwrap_or_default();
+                eprintln!("[watch] Running: {cmd}");
+                match tokio::process::Command::new("sh")
+                    .arg("-c")
+                    .arg(cmd)
+                    .stdin(std::process::Stdio::piped())
+                    .spawn()
+                {
+                    Ok(mut child) => {
+                        if let Some(mut stdin) = child.stdin.take() {
+                            use tokio::io::AsyncWriteExt;
+                            let _ = stdin.write_all(payload_json.as_bytes()).await;
+                        }
+                    }
+                    Err(e) => eprintln!("[watch] Failed to run command: {e}"),
+                }
+            }
+
+            // Fire webhook once with aggregate payload
+            if let Some(ref webhook_url) = cli.webhook {
+                fire_webhook(
+                    webhook_url,
+                    &serde_json::json!({
+                        "event": "watch_changes",
+                        "check_number": check_number,
+                        "total_urls": urls.len(),
+                        "changed": changed.len(),
+                        "same": same_count,
+                        "changes": changed,
+                    }),
+                );
+            }
+        }
+    }
+
+    Ok(())
+}
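For reference, a minimal consumer for the aggregate payload that `run_watch_multi` pipes to the `--on-change` command (a hedged sketch of a hypothetical helper binary, not part of this PR; the field names mirror the `json!` payload above):

```rust
use std::io::Read;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // run_watch_multi writes one JSON object per changed check to our stdin.
    let mut buf = String::new();
    std::io::stdin().read_to_string(&mut buf)?;
    let payload: serde_json::Value = serde_json::from_str(&buf)?;

    for change in payload["changes"].as_array().into_iter().flatten() {
        println!(
            "{}: {:+} words",
            change["url"].as_str().unwrap_or("?"),
            change["word_count_delta"].as_i64().unwrap_or(0)
        );
    }
    Ok(())
}
```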
format!("{words} words") + } + }; + eprintln!("-> extracted {detail}"); + + if let Some(ref dir) = cli.output_dir { + let filename = custom_names + .get(url.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| url_to_filename(url, &OutputFormat::Json)); + write_to_file(dir, &filename, &output_str)?; + } else { + println!("--- {url}"); + println!("{output_str}"); + println!(); + } + + all_results.push(result_json); + } + Err(e) => { + errors += 1; + let msg = format!("LLM extraction failed: {e}"); + eprintln!("-> error: {msg}"); + all_results.push(serde_json::json!({ "url": url, "error": msg })); + } + } + } + + eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)"); + + if let Some(ref webhook_url) = cli.webhook { + fire_webhook( + webhook_url, + &serde_json::json!({ + "event": "batch_llm_complete", + "total": total, + "ok": ok, + "errors": errors, + }), + ); + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } + + if errors > 0 { + Err(format!("{errors} of {total} URLs failed")) + } else { + Ok(()) + } +} + +/// Intermediate type to hold LLM output before formatting. +enum LlmOutput { + Json(serde_json::Value), + Text(String), +} + /// Returns true if any LLM flag is set. fn has_llm_flags(cli: &Cli) -> bool { cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some() @@ -1656,9 +2034,16 @@ async fn main() { return; } - // --watch: poll a URL for changes + // --watch: poll URL(s) for changes if cli.watch { - if let Err(e) = run_watch(&cli).await { + let watch_urls: Vec = match collect_urls(&cli) { + Ok(entries) => entries.into_iter().map(|(url, _)| url).collect(), + Err(e) => { + eprintln!("error: {e}"); + process::exit(1); + } + }; + if let Err(e) = run_watch(&cli, &watch_urls).await { eprintln!("error: {e}"); process::exit(1); } @@ -1683,15 +2068,6 @@ async fn main() { return; } - // LLM modes: --extract-json, --extract-prompt, --summarize - if has_llm_flags(&cli) { - if let Err(e) = run_llm(&cli).await { - eprintln!("error: {e}"); - process::exit(1); - } - return; - } - // Collect all URLs from args + --urls-file let entries = match collect_urls(&cli) { Ok(u) => u, @@ -1701,6 +2077,21 @@ async fn main() { } }; + // LLM modes: --extract-json, --extract-prompt, --summarize + // When multiple URLs are provided, run batch LLM extraction over all of them. 
+    if has_llm_flags(&cli) {
+        if entries.len() > 1 {
+            if let Err(e) = run_batch_llm(&cli, &entries).await {
+                eprintln!("error: {e}");
+                process::exit(1);
+            }
+        } else if let Err(e) = run_llm(&cli).await {
+            eprintln!("error: {e}");
+            process::exit(1);
+        }
+        return;
+    }
+
     // Multi-URL batch mode
     if entries.len() > 1 {
         if let Err(e) = run_batch(&cli, &entries).await {
@@ -1824,6 +2215,14 @@ mod tests {
         );
     }
 
+    #[test]
+    fn url_to_filename_html_format() {
+        assert_eq!(
+            url_to_filename("https://example.com/docs/api", &OutputFormat::Html),
+            "docs/api.html"
+        );
+    }
+
     #[test]
     fn url_to_filename_special_chars() {
         // Spaces and special chars get replaced with underscores
diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml
index 7ea9625..e4da69b 100644
--- a/crates/webclaw-fetch/Cargo.toml
+++ b/crates/webclaw-fetch/Cargo.toml
@@ -19,6 +19,8 @@ url = "2"
 rand = "0.8"
 quick-xml = { version = "0.37", features = ["serde"] }
 serde_json.workspace = true
+calamine = "0.34"
+zip = "2"
 
 [dev-dependencies]
 tempfile = "3"
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index 4af675e..5b8526e 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -399,6 +399,27 @@
             let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
 
             Ok(pdf_to_extraction_result(&pdf_result, &final_url))
+        } else if let Some(doc_type) =
+            crate::document::is_document_content_type(&headers, &final_url)
+        {
+            debug!(status, doc_type = ?doc_type, "detected document response, extracting");
+
+            let bytes = response
+                .bytes()
+                .await
+                .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
+
+            let elapsed = start.elapsed();
+            debug!(
+                status,
+                bytes = bytes.len(),
+                elapsed_ms = %elapsed.as_millis(),
+                "document fetch complete"
+            );
+
+            let mut result = crate::document::extract_document(&bytes, doc_type)?;
+            result.metadata.url = Some(final_url);
+            Ok(result)
         } else {
             let html = response
                 .text()
diff --git a/crates/webclaw-fetch/src/document.rs b/crates/webclaw-fetch/src/document.rs
new file mode 100644
index 0000000..0291d52
--- /dev/null
+++ b/crates/webclaw-fetch/src/document.rs
@@ -0,0 +1,743 @@
+//! Document extraction for DOCX, XLSX, XLS, and CSV files.
+//! Auto-detects document type from Content-Type headers or URL extension,
+//! then extracts text content as markdown -- same pattern as PDF extraction.
+
+use std::collections::HashMap;
+use std::io::{Cursor, Read};
+
+use tracing::debug;
+
+use crate::error::FetchError;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DocType {
+    Docx,
+    Xlsx,
+    Xls,
+    Csv,
+}
+
+impl DocType {
+    fn label(self) -> &'static str {
+        match self {
+            DocType::Docx => "DOCX",
+            DocType::Xlsx => "XLSX",
+            DocType::Xls => "XLS",
+            DocType::Csv => "CSV",
+        }
+    }
+}
+
+/// Detect document type from response headers or URL extension.
+/// Returns `None` for non-document responses (HTML, PDF, etc.).
+pub fn is_document_content_type(
+    headers: &HashMap<String, String>,
+    url: &str,
+) -> Option<DocType> {
+    // Check Content-Type header first
+    if let Some(ct) = headers.get("content-type") {
+        let mime = ct.split(';').next().unwrap_or("").trim();
+
+        if mime.eq_ignore_ascii_case(
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ) {
+            return Some(DocType::Docx);
+        }
+        if mime.eq_ignore_ascii_case(
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        ) {
+            return Some(DocType::Xlsx);
+        }
+        if mime.eq_ignore_ascii_case("application/vnd.ms-excel") {
+            return Some(DocType::Xls);
+        }
+        if mime.eq_ignore_ascii_case("text/csv") {
+            return Some(DocType::Csv);
+        }
+    }
+
+    // Fall back to URL extension
+    let path = url.split('?').next().unwrap_or(url);
+    let lower = path.to_ascii_lowercase();
+
+    if lower.ends_with(".docx") {
+        return Some(DocType::Docx);
+    }
+    if lower.ends_with(".xlsx") {
+        return Some(DocType::Xlsx);
+    }
+    if lower.ends_with(".xls") {
+        return Some(DocType::Xls);
+    }
+    if lower.ends_with(".csv") {
+        return Some(DocType::Csv);
+    }
+
+    None
+}
+
+/// Extract text content from document bytes, returning an ExtractionResult.
+pub fn extract_document(
+    bytes: &[u8],
+    doc_type: DocType,
+) -> Result<webclaw_core::ExtractionResult, FetchError> {
+    debug!(
+        doc_type = doc_type.label(),
+        bytes = bytes.len(),
+        "extracting document"
+    );
+
+    let markdown = match doc_type {
+        DocType::Docx => extract_docx(bytes)?,
+        DocType::Xlsx => extract_xlsx(bytes)?,
+        DocType::Xls => extract_xls(bytes)?,
+        DocType::Csv => extract_csv(bytes)?,
+    };
+
+    let plain_text = strip_markdown_formatting(&markdown);
+    let word_count = plain_text.split_whitespace().count();
+
+    Ok(webclaw_core::ExtractionResult {
+        metadata: webclaw_core::Metadata {
+            title: None,
+            description: None,
+            author: None,
+            published_date: None,
+            language: None,
+            url: None,
+            site_name: None,
+            image: None,
+            favicon: None,
+            word_count,
+        },
+        content: webclaw_core::Content {
+            markdown,
+            plain_text,
+            links: Vec::new(),
+            images: Vec::new(),
+            code_blocks: Vec::new(),
+            raw_html: None,
+        },
+        domain_data: None,
+        structured_data: vec![],
+    })
+}
+
+/// Extract text from a DOCX file (ZIP of XML).
+/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
+fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
+    let cursor = Cursor::new(bytes);
+    let mut archive =
+        zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
+
+    let xml = {
+        let mut file = archive
+            .by_name("word/document.xml")
+            .map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?;
+        let mut buf = String::new();
+        file.read_to_string(&mut buf)
+            .map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
+        buf
+    };
+
+    parse_docx_xml(&xml)
+}
+
+/// Parse DOCX XML (word/document.xml) into markdown.
+///
+/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
+/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
+fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
+    use quick_xml::Reader;
+    use quick_xml::events::Event;
+
+    let mut reader = Reader::from_str(xml);
+    let mut paragraphs: Vec<String> = Vec::new();
+
+    // State tracking for the current paragraph
+    let mut in_paragraph = false;
+    let mut in_run = false; // inside <w:r> (run)
+    let mut in_text = false; // inside <w:t>
+    let mut current_text = String::new();
+    let mut heading_level: Option<u8> = None; // None = normal paragraph
+    let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
+
+    loop {
+        match reader.read_event() {
+            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
+                let name_bytes = e.name().as_ref().to_vec();
+                let local = local_name(&name_bytes);
+                match local {
+                    b"p" if is_w_namespace(&name_bytes) => {
+                        in_paragraph = true;
+                        current_text.clear();
+                        heading_level = None;
+                    }
+                    b"pPr" if in_paragraph => in_ppr = true,
+                    b"pStyle" if in_ppr => {
+                        heading_level = extract_heading_level(e);
+                    }
+                    b"r" if in_paragraph => in_run = true,
+                    b"t" if in_run => in_text = true,
+                    b"br" if in_paragraph => {
+                        current_text.push('\n');
+                    }
+                    b"tab" if in_paragraph => {
+                        current_text.push('\t');
+                    }
+                    _ => {}
+                }
+            }
+            Ok(Event::End(ref e)) => {
+                let name_bytes = e.name().as_ref().to_vec();
+                let local = local_name(&name_bytes);
+                match local {
+                    b"p" if in_paragraph => {
+                        let text = current_text.trim().to_string();
+                        if !text.is_empty() {
+                            let formatted = match heading_level {
+                                Some(1) => format!("# {text}"),
+                                Some(2) => format!("## {text}"),
+                                Some(3) => format!("### {text}"),
+                                Some(4) => format!("#### {text}"),
+                                Some(5) => format!("##### {text}"),
+                                Some(6) => format!("###### {text}"),
+                                _ => text,
+                            };
+                            paragraphs.push(formatted);
+                        }
+                        in_paragraph = false;
+                    }
+                    b"pPr" => in_ppr = false,
+                    b"r" => {
+                        in_run = false;
+                        in_text = false;
+                    }
+                    b"t" => in_text = false,
+                    _ => {}
+                }
+            }
+            Ok(Event::Text(ref e)) if in_text => {
+                if let Ok(text) = e.unescape() {
+                    current_text.push_str(&text);
+                }
+            }
+            Ok(Event::Eof) => break,
+            Err(e) => {
+                return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
+            }
+            _ => {}
+        }
+    }
+
+    Ok(paragraphs.join("\n\n"))
+}
+
+/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
+/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms.
+fn is_w_namespace(name: &[u8]) -> bool {
+    // quick-xml gives us the full name bytes. Accept both "w:p" and "p".
+    name == b"w:p" || name == b"p"
+}
+
+/// Extract the local name from a possibly namespaced XML tag.
+/// `w:p` -> `p`, `p` -> `p`
+fn local_name(name: &[u8]) -> &[u8] {
+    match name.iter().position(|&b| b == b':') {
+        Some(pos) => &name[pos + 1..],
+        None => name,
+    }
+}
+
+/// Extract heading level from a `<w:pStyle>` element.
+fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
+    for attr in e.attributes().flatten() {
+        let local = local_name(attr.key.as_ref());
+        if local == b"val" {
+            let val = String::from_utf8_lossy(&attr.value);
+            let lower = val.to_ascii_lowercase();
+
+            // Match "heading1", "heading2", etc. and "title" -> h1
+            if lower == "title" {
+                return Some(1);
+            }
+            if let Some(rest) = lower.strip_prefix("heading")
+                && let Ok(n) = rest.parse::<u8>()
+            {
+                return Some(n.min(6));
+            }
+        }
+    }
+    None
+}
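The clamping and the Title alias in `extract_heading_level` are easy to regress; a sketch of an extra unit test (not in this PR) that pins them down via `parse_docx_xml`:

```rust
#[test]
fn test_docx_heading_clamp_and_title() {
    // Hypothetical test: "Title" maps to h1, and heading levels past 6
    // clamp to h6 via n.min(6).
    let xml = r#"<w:document><w:body>
<w:p><w:pPr><w:pStyle w:val="Title"/></w:pPr><w:r><w:t>Doc title</w:t></w:r></w:p>
<w:p><w:pPr><w:pStyle w:val="Heading9"/></w:pPr><w:r><w:t>Deep</w:t></w:r></w:p>
</w:body></w:document>"#;
    let result = parse_docx_xml(xml).unwrap();
    assert!(result.contains("# Doc title"));
    assert!(result.contains("###### Deep"));
}
```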
+
+/// Extract spreadsheet content using calamine (XLSX format).
+fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
+    extract_spreadsheet(bytes, "XLSX")
+}
+
+/// Extract spreadsheet content using calamine (XLS format).
+fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
+    extract_spreadsheet(bytes, "XLS")
+}
+
+/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
+/// Reads all sheets and formats each as a markdown table.
+fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
+    use calamine::Reader;
+
+    let cursor = Cursor::new(bytes);
+    let mut workbook: calamine::Sheets<_> = calamine::open_workbook_auto_from_rs(cursor)
+        .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
+
+    let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
+    let mut sections: Vec<String> = Vec::new();
+
+    for name in &sheet_names {
+        let range = workbook
+            .worksheet_range(name)
+            .map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
+
+        let rows: Vec<Vec<String>> = range
+            .rows()
+            .map(|row| row.iter().map(cell_to_string).collect())
+            .collect();
+
+        if rows.is_empty() {
+            continue;
+        }
+
+        let mut section = format!("## Sheet: {name}\n\n");
+        section.push_str(&rows_to_markdown_table(&rows));
+        sections.push(section);
+    }
+
+    if sections.is_empty() {
+        return Ok("(empty spreadsheet)".to_string());
+    }
+
+    Ok(sections.join("\n\n"))
+}
+
+/// Convert a calamine cell value to a display string.
+fn cell_to_string(cell: &calamine::Data) -> String {
+    use calamine::Data;
+    match cell {
+        Data::Empty => String::new(),
+        Data::String(s) => s.clone(),
+        Data::Int(n) => n.to_string(),
+        Data::Float(f) => format_float(*f),
+        Data::Bool(b) => b.to_string(),
+        Data::Error(e) => format!("#{e:?}"),
+        Data::DateTime(dt) => format!("{dt}"),
+        Data::DateTimeIso(s) => s.clone(),
+        Data::DurationIso(s) => s.clone(),
+    }
+}
+
+/// Format a float, dropping trailing `.0` for clean integer display.
+fn format_float(f: f64) -> String {
+    if f.fract() == 0.0 && f.abs() < i64::MAX as f64 {
+        format!("{}", f as i64)
+    } else {
+        format!("{f}")
+    }
+}
+
+/// Extract CSV text and convert to markdown table.
+fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
+    let text = String::from_utf8_lossy(bytes);
+    let rows = parse_csv_rows(&text);
+
+    if rows.is_empty() {
+        return Ok("(empty CSV)".to_string());
+    }
+
+    Ok(rows_to_markdown_table(&rows))
+}
+
+/// Parse CSV text into rows of fields, handling quoted fields with commas/newlines.
+fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
+    let mut rows: Vec<Vec<String>> = Vec::new();
+    let mut current_row: Vec<String> = Vec::new();
+    let mut current_field = String::new();
+    let mut in_quotes = false;
+    let mut chars = text.chars().peekable();
+
+    while let Some(ch) = chars.next() {
+        if in_quotes {
+            if ch == '"' {
+                // Escaped quote ("") or end of quoted field
+                if chars.peek() == Some(&'"') {
+                    chars.next();
+                    current_field.push('"');
+                } else {
+                    in_quotes = false;
+                }
+            } else {
+                current_field.push(ch);
+            }
+        } else {
+            match ch {
+                '"' => in_quotes = true,
+                ',' => {
+                    current_row.push(current_field.trim().to_string());
+                    current_field = String::new();
+                }
+                '\n' => {
+                    current_row.push(current_field.trim().to_string());
+                    current_field = String::new();
+                    if !current_row.iter().all(|f| f.is_empty()) {
+                        rows.push(current_row);
+                    }
+                    current_row = Vec::new();
+                }
+                '\r' => {
+                    // Skip carriage returns (handled with \n)
+                }
+                _ => current_field.push(ch),
+            }
+        }
+    }
+
+    // Flush last field/row
+    if !current_field.is_empty() || !current_row.is_empty() {
+        current_row.push(current_field.trim().to_string());
+        if !current_row.iter().all(|f| f.is_empty()) {
+            rows.push(current_row);
+        }
+    }
+
+    rows
+}
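One behavior of `parse_csv_rows` worth pinning in a test: a quoted field may contain a raw newline, which must not terminate the row. A sketch of such a test (not in this PR, derived from the state machine above):

```rust
#[test]
fn test_csv_quoted_newline() {
    // The '\n' inside the quoted field stays in the field instead of
    // splitting the row, so two rows come back, not three.
    let rows = parse_csv_rows("Name,Note\nAlice,\"line one\nline two\"\n");
    assert_eq!(rows.len(), 2);
    assert_eq!(rows[1][1], "line one\nline two");
}
```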
+
+/// Convert rows (first row = header) into a markdown table.
+fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
+    if rows.is_empty() {
+        return String::new();
+    }
+
+    // Find the max column count across all rows
+    let col_count = rows.iter().map(|r| r.len()).max().unwrap_or(0);
+    if col_count == 0 {
+        return String::new();
+    }
+
+    let mut lines: Vec<String> = Vec::new();
+
+    // Header row
+    let header = &rows[0];
+    let header_cells: Vec<&str> = (0..col_count)
+        .map(|i| header.get(i).map(|s| s.as_str()).unwrap_or(""))
+        .collect();
+    lines.push(format!("| {} |", header_cells.join(" | ")));
+
+    // Separator row
+    let sep: Vec<&str> = vec!["---"; col_count];
+    lines.push(format!("| {} |", sep.join(" | ")));
+
+    // Data rows
+    for row in &rows[1..] {
+        let cells: Vec<&str> = (0..col_count)
+            .map(|i| row.get(i).map(|s| s.as_str()).unwrap_or(""))
+            .collect();
+        lines.push(format!("| {} |", cells.join(" | ")));
+    }
+
+    lines.join("\n")
+}
+
+/// Strip markdown formatting to get plain text.
+fn strip_markdown_formatting(markdown: &str) -> String {
+    let mut plain = String::with_capacity(markdown.len());
+    for line in markdown.lines() {
+        let trimmed = line.trim_start_matches('#').trim();
+        if trimmed.starts_with("| ---") || trimmed == "|---|" {
+            continue; // Skip separator rows
+        }
+        if let Some(stripped) = trimmed.strip_prefix('|')
+            && let Some(stripped) = stripped.strip_suffix('|')
+        {
+            // Table row: join cells with spaces
+            let cells: Vec<&str> = stripped.split('|').map(|c| c.trim()).collect();
+            plain.push_str(&cells.join(" "));
+            plain.push('\n');
+            continue;
+        }
+        plain.push_str(trimmed);
+        plain.push('\n');
+    }
+    plain.trim().to_string()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // --- Content-type detection ---
+
+    #[test]
+    fn test_detect_docx_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Docx)
+        );
+    }
+
+    #[test]
+    fn test_detect_xlsx_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Xlsx)
+        );
+    }
+
+    #[test]
+    fn test_detect_xls_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.ms-excel".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Xls)
+        );
+    }
+
+    #[test]
+    fn test_detect_csv_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/csv".to_string());
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Csv)
+        );
+    }
+
+    #[test]
+    fn test_detect_csv_content_type_with_charset() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "text/csv; charset=utf-8".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Csv)
+        );
+    }
+
+    #[test]
+    fn test_detect_by_url_extension() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/report.docx"),
+            Some(DocType::Docx)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/data.xlsx"),
+            Some(DocType::Xlsx)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/old.xls"),
+            Some(DocType::Xls)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/data.csv"),
+            Some(DocType::Csv)
+        );
+    }
+
+    #[test]
+    fn test_detect_url_extension_with_query() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
+            Some(DocType::Docx)
+        );
+    }
+
+    #[test]
+    fn test_detect_url_extension_case_insensitive() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
+            Some(DocType::Xlsx)
+        );
+    }
+
+    #[test]
+    fn test_detect_none_for_html() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/html".to_string());
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/page"),
+            None
+        );
+    }
+
+    #[test]
+    fn test_content_type_takes_precedence_over_url() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/csv".to_string());
+        // URL says .xlsx but Content-Type says CSV -- header wins
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/data.xlsx"),
+            Some(DocType::Csv)
+        );
+    }
+
+    // --- CSV parsing ---
+
+    #[test]
+    fn test_csv_simple() {
+        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("| Name | Age | City |"));
+        assert!(result.contains("| --- | --- | --- |"));
+        assert!(result.contains("| Alice | 30 | NYC |"));
+        assert!(result.contains("| Bob | 25 | LA |"));
+    }
+
+    #[test]
+    fn test_csv_quoted_fields() {
+        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("Has a, comma"));
+        assert!(result.contains("Said \"hello\""));
+    }
+
+    #[test]
+    fn test_csv_empty() {
+        let result = extract_csv(b"").unwrap();
+        assert_eq!(result, "(empty CSV)");
+    }
+
+    #[test]
+    fn test_csv_windows_line_endings() {
+        let csv = "A,B\r\n1,2\r\n3,4\r\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("| A | B |"));
+        assert!(result.contains("| 1 | 2 |"));
+    }
+
+    // --- DOCX XML parsing ---
+
+    #[test]
+    fn test_docx_xml_simple_paragraphs() {
+        let xml = r#"<w:document>
+<w:body>
+<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
+<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
+</w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Hello world\n\nSecond paragraph");
+    }
+
+    #[test]
+    fn test_docx_xml_headings() {
+        let xml = r#"<w:document>
+<w:body>
+<w:p>
+<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
+<w:r><w:t>Title</w:t></w:r>
+</w:p>
+<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
+<w:p>
+<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+<w:r><w:t>Subtitle</w:t></w:r>
+</w:p>
+</w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert!(result.contains("# Title"));
+        assert!(result.contains("Body text"));
+        assert!(result.contains("## Subtitle"));
+    }
+
+    #[test]
+    fn test_docx_xml_multiple_runs() {
+        let xml = r#"<w:document>
+<w:body>
+<w:p>
+<w:r><w:t>Hello </w:t></w:r>
+<w:r><w:t>world</w:t></w:r>
+</w:p>
+</w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Hello world");
+    }
+
+    #[test]
+    fn test_docx_xml_empty_paragraphs_skipped() {
+        let xml = r#"<w:document>
+<w:body>
+<w:p></w:p>
+<w:p><w:r><w:t>Content</w:t></w:r></w:p>
+</w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Content");
+    }
+
+    // --- Markdown table ---
+
+    #[test]
+    fn test_rows_to_markdown_table() {
+        let rows = vec![
+            vec!["A".to_string(), "B".to_string()],
+            vec!["1".to_string(), "2".to_string()],
+            vec!["3".to_string(), "4".to_string()],
+        ];
+        let table = rows_to_markdown_table(&rows);
+        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
+    }
+
+    #[test]
+    fn test_rows_to_markdown_table_ragged() {
+        let rows = vec![
+            vec!["A".to_string(), "B".to_string(), "C".to_string()],
+            vec!["1".to_string()], // fewer columns
+        ];
+        let table = rows_to_markdown_table(&rows);
+        assert!(table.contains("| 1 |  |  |")); // empty cells pad to double spaces
+    }
+
+    // --- Extract result ---
+
+    #[test]
+    fn test_extract_csv_result() {
+        let csv = "Name,Score\nAlice,100\n";
+        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
+        assert!(result.content.markdown.contains("| Name | Score |"));
+        assert!(result.metadata.word_count > 0);
+        assert!(result.content.links.is_empty());
+        assert!(result.domain_data.is_none());
+    }
+
+    // --- Strip markdown ---
+
+    #[test]
+    fn test_strip_markdown() {
+        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
+        let plain = strip_markdown_formatting(md);
+        assert!(plain.contains("Title"));
+        assert!(plain.contains("Some text"));
+        assert!(plain.contains("A B"));
+        assert!(!plain.contains("---"));
+    }
+}
diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs
index c5cd40b..373eb8a 100644
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@@ -5,6 +5,7 @@
 pub mod browser;
 pub mod client;
 pub mod crawler;
+pub mod document;
 pub mod error;
 pub mod linkedin;
 pub mod proxy;