feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM

Document extraction:
- DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml)
- XLSX/XLS: markdown tables with multi-sheet support (via calamine)
- CSV: quoted field handling, markdown table output
- All auto-detected by Content-Type header or URL extension

New features:
- -f html output format (sanitized HTML)
- Multi-URL watch: --urls-file + --watch monitors all URLs in parallel
- Batch + LLM: --extract-prompt/--extract-json works with multiple URLs
- Mixed batch: HTML pages + DOCX + XLSX + CSV in one command

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-03-26 15:28:23 +01:00
parent 0e4128782a
commit ea14848772
8 changed files with 1520 additions and 41 deletions

View file

@ -3,6 +3,19 @@
All notable changes to webclaw are documented here. All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/). Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.2.0] — 2026-03-26
### Added
- **DOCX extraction**: auto-detected by Content-Type or URL extension, outputs markdown with headings
- **XLSX/XLS extraction**: spreadsheets converted to markdown tables, multi-sheet support via calamine
- **CSV extraction**: parsed with quoted field handling, output as markdown table
- **HTML output format**: `-f html` returns sanitized HTML from the extracted content
- **Multi-URL watch**: `--watch` now works with `--urls-file` to monitor multiple URLs in parallel
- **Batch + LLM extraction**: `--extract-prompt` and `--extract-json` now work with multiple URLs
- **Scheduled batch watch**: watch multiple URLs with aggregate change reports and per-URL diffs
---
## [0.1.7] — 2026-03-26 ## [0.1.7] — 2026-03-26
### Fixed ### Fixed

316
Cargo.lock generated
View file

@ -17,6 +17,17 @@ dependencies = [
"pom", "pom",
] ]
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if",
"cipher",
"cpufeatures 0.2.17",
]
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "1.1.4" version = "1.1.4"
@ -106,6 +117,15 @@ version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
dependencies = [
"derive_arbitrary",
]
[[package]] [[package]]
name = "async-compression" name = "async-compression"
version = "0.4.41" version = "0.4.41"
@ -129,6 +149,15 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "atoi_simd"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ad17c7c205c2c28b527b9845eeb91cf1b4d008b438f98ce0e628227a822758e"
dependencies = [
"debug_unsafe",
]
[[package]] [[package]]
name = "atomic-waker" name = "atomic-waker"
version = "1.1.2" version = "1.1.2"
@ -224,6 +253,42 @@ version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
[[package]]
name = "bzip2"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
dependencies = [
"bzip2-sys",
]
[[package]]
name = "bzip2-sys"
version = "0.1.13+1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
dependencies = [
"cc",
"pkg-config",
]
[[package]]
name = "calamine"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20ae05a4e39297eecf9a994210d27501318c37a9318201f8e11050add82bb6f0"
dependencies = [
"atoi_simd",
"byteorder",
"codepage",
"encoding_rs",
"fast-float2",
"log",
"quick-xml 0.39.2",
"serde",
"zip 7.2.0",
]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.2.57" version = "1.2.57"
@ -255,7 +320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"cpufeatures", "cpufeatures 0.3.0",
"rand_core 0.10.0", "rand_core 0.10.0",
] ]
@ -273,6 +338,16 @@ dependencies = [
"windows-link", "windows-link",
] ]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
]
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.6.0" version = "4.6.0"
@ -322,6 +397,15 @@ dependencies = [
"cc", "cc",
] ]
[[package]]
name = "codepage"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
dependencies = [
"encoding_rs",
]
[[package]] [[package]]
name = "colorchoice" name = "colorchoice"
version = "1.0.5" version = "1.0.5"
@ -348,6 +432,12 @@ version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
[[package]]
name = "constant_time_eq"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
[[package]] [[package]]
name = "cookie" name = "cookie"
version = "0.18.1" version = "0.18.1"
@ -393,6 +483,15 @@ version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "cpufeatures"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "cpufeatures" name = "cpufeatures"
version = "0.3.0" version = "0.3.0"
@ -402,6 +501,21 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "crc"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d"
dependencies = [
"crc-catalog",
]
[[package]]
name = "crc-catalog"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
[[package]] [[package]]
name = "crc32fast" name = "crc32fast"
version = "1.5.0" version = "1.5.0"
@ -411,6 +525,12 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]] [[package]]
name = "crypto-common" name = "crypto-common"
version = "0.1.7" version = "0.1.7"
@ -478,6 +598,18 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "debug_unsafe"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2"
[[package]]
name = "deflate64"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
[[package]] [[package]]
name = "deranged" name = "deranged"
version = "0.5.8" version = "0.5.8"
@ -487,6 +619,17 @@ dependencies = [
"powerfmt", "powerfmt",
] ]
[[package]]
name = "derive_arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "derive_more" name = "derive_more"
version = "0.99.20" version = "0.99.20"
@ -506,6 +649,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [ dependencies = [
"block-buffer", "block-buffer",
"crypto-common", "crypto-common",
"subtle",
] ]
[[package]] [[package]]
@ -601,6 +745,12 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "fast-float2"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.3.0" version = "2.3.0"
@ -621,6 +771,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
dependencies = [ dependencies = [
"crc32fast", "crc32fast",
"miniz_oxide", "miniz_oxide",
"zlib-rs",
] ]
[[package]] [[package]]
@ -857,6 +1008,15 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hmac"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
dependencies = [
"digest",
]
[[package]] [[package]]
name = "html5ever" name = "html5ever"
version = "0.29.1" version = "0.29.1"
@ -1121,6 +1281,15 @@ dependencies = [
"serde_core", "serde_core",
] ]
[[package]]
name = "inout"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"generic-array",
]
[[package]] [[package]]
name = "ipnet" name = "ipnet"
version = "2.12.0" version = "2.12.0"
@ -1244,6 +1413,27 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "lzma-rs"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
dependencies = [
"byteorder",
"crc",
]
[[package]]
name = "lzma-sys"
version = "0.1.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]] [[package]]
name = "mac" name = "mac"
version = "0.1.1" version = "0.1.1"
@ -1414,6 +1604,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec"
[[package]]
name = "pbkdf2"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
dependencies = [
"digest",
"hmac",
]
[[package]] [[package]]
name = "pdf-extract" name = "pdf-extract"
version = "0.7.12" version = "0.7.12"
@ -1629,6 +1829,16 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "quick-xml"
version = "0.39.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
dependencies = [
"encoding_rs",
"memchr",
]
[[package]] [[package]]
name = "quinn" name = "quinn"
version = "0.11.9" version = "0.11.9"
@ -2220,6 +2430,17 @@ dependencies = [
"stable_deref_trait", "stable_deref_trait",
] ]
[[package]]
name = "sha1"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
dependencies = [
"cfg-if",
"cpufeatures 0.2.17",
"digest",
]
[[package]] [[package]]
name = "sharded-slab" name = "sharded-slab"
version = "0.1.7" version = "0.1.7"
@ -2645,6 +2866,12 @@ dependencies = [
"pom", "pom",
] ]
[[package]]
name = "typed-path"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
[[package]] [[package]]
name = "typenum" name = "typenum"
version = "1.19.0" version = "1.19.0"
@ -2881,7 +3108,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-cli" name = "webclaw-cli"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"clap", "clap",
"dotenvy", "dotenvy",
@ -2901,7 +3128,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-core" name = "webclaw-core"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"ego-tree", "ego-tree",
"once_cell", "once_cell",
@ -2919,10 +3146,11 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-fetch" name = "webclaw-fetch"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"calamine",
"primp", "primp",
"quick-xml", "quick-xml 0.37.5",
"rand 0.8.5", "rand 0.8.5",
"serde", "serde",
"serde_json", "serde_json",
@ -2933,11 +3161,12 @@ dependencies = [
"url", "url",
"webclaw-core", "webclaw-core",
"webclaw-pdf", "webclaw-pdf",
"zip 2.4.2",
] ]
[[package]] [[package]]
name = "webclaw-llm" name = "webclaw-llm"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"reqwest", "reqwest",
@ -2950,7 +3179,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-mcp" name = "webclaw-mcp"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"dotenvy", "dotenvy",
"reqwest", "reqwest",
@ -2970,7 +3199,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-pdf" name = "webclaw-pdf"
version = "0.1.7" version = "0.2.0"
dependencies = [ dependencies = [
"pdf-extract", "pdf-extract",
"thiserror", "thiserror",
@ -3301,6 +3530,15 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "xz2"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
dependencies = [
"lzma-sys",
]
[[package]] [[package]]
name = "yoke" name = "yoke"
version = "0.8.1" version = "0.8.1"
@ -3418,12 +3656,74 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "zip"
version = "2.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
dependencies = [
"aes",
"arbitrary",
"bzip2",
"constant_time_eq",
"crc32fast",
"crossbeam-utils",
"deflate64",
"displaydoc",
"flate2",
"getrandom 0.3.4",
"hmac",
"indexmap",
"lzma-rs",
"memchr",
"pbkdf2",
"sha1",
"thiserror",
"time",
"xz2",
"zeroize",
"zopfli",
"zstd",
]
[[package]]
name = "zip"
version = "7.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
dependencies = [
"crc32fast",
"flate2",
"indexmap",
"memchr",
"typed-path",
"zopfli",
]
[[package]]
name = "zlib-rs"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
[[package]] [[package]]
name = "zmij" name = "zmij"
version = "1.0.21" version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
[[package]]
name = "zopfli"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249"
dependencies = [
"bumpalo",
"crc32fast",
"log",
"simd-adler32",
]
[[package]] [[package]]
name = "zstd" name = "zstd"
version = "0.13.3" version = "0.13.3"

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"] members = ["crates/*"]
[workspace.package] [workspace.package]
version = "0.1.7" version = "0.2.0"
edition = "2024" edition = "2024"
license = "MIT" license = "MIT"
repository = "https://github.com/0xMassi/webclaw" repository = "https://github.com/0xMassi/webclaw"

View file

@ -95,7 +95,7 @@ struct Cli {
#[arg(long)] #[arg(long)]
urls_file: Option<String>, urls_file: Option<String>,
/// Output format (markdown, json, text, llm) /// Output format (markdown, json, text, llm, html)
#[arg(short, long, default_value = "markdown")] #[arg(short, long, default_value = "markdown")]
format: OutputFormat, format: OutputFormat,
@ -277,6 +277,7 @@ enum OutputFormat {
Json, Json,
Text, Text,
Llm, Llm,
Html,
} }
#[derive(Clone, ValueEnum)] #[derive(Clone, ValueEnum)]
@ -394,7 +395,7 @@ fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect()) .map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
.unwrap_or_default(), .unwrap_or_default(),
only_main_content: cli.only_main_content, only_main_content: cli.only_main_content,
include_raw_html: cli.raw_html, include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
} }
} }
@ -417,6 +418,7 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
OutputFormat::Markdown | OutputFormat::Llm => "md", OutputFormat::Markdown | OutputFormat::Llm => "md",
OutputFormat::Json => "json", OutputFormat::Json => "json",
OutputFormat::Text => "txt", OutputFormat::Text => "txt",
OutputFormat::Html => "html",
}; };
let parsed = url::Url::parse(raw_url); let parsed = url::Url::parse(raw_url);
@ -470,6 +472,15 @@ fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String
Ok(()) Ok(())
} }
/// Get raw HTML from an extraction result, falling back to markdown if unavailable.
fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
result
.content
.raw_html
.as_deref()
.unwrap_or(&result.content.markdown)
}
/// Format an `ExtractionResult` into a string for the given output format. /// Format an `ExtractionResult` into a string for the given output format.
fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String { fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
match format { match format {
@ -484,6 +495,7 @@ fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata
OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"), OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
OutputFormat::Text => result.content.plain_text.clone(), OutputFormat::Text => result.content.plain_text.clone(),
OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()), OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
OutputFormat::Html => raw_html_or_markdown(result).to_string(),
} }
} }
@ -586,6 +598,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
OutputFormat::Json => "json", OutputFormat::Json => "json",
OutputFormat::Text => "text", OutputFormat::Text => "text",
OutputFormat::Llm => "llm", OutputFormat::Llm => "llm",
OutputFormat::Html => "html",
}; };
let resp = c let resp = c
.scrape( .scrape(
@ -618,6 +631,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
OutputFormat::Json => "json", OutputFormat::Json => "json",
OutputFormat::Text => "text", OutputFormat::Text => "text",
OutputFormat::Llm => "llm", OutputFormat::Llm => "llm",
OutputFormat::Html => "html",
}; };
match c match c
.scrape( .scrape(
@ -793,6 +807,9 @@ fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata:
OutputFormat::Llm => { OutputFormat::Llm => {
println!("{}", to_llm_text(result, result.metadata.url.as_deref())); println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
} }
OutputFormat::Html => {
println!("{}", raw_html_or_markdown(result));
}
} }
} }
@ -845,6 +862,17 @@ fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
print_cloud_output(resp, &OutputFormat::Markdown); print_cloud_output(resp, &OutputFormat::Markdown);
} }
} }
OutputFormat::Html => {
if let Some(html) = resp
.get("content")
.and_then(|c| c.get("raw_html"))
.and_then(|h| h.as_str())
{
println!("{html}");
} else {
print_cloud_output(resp, &OutputFormat::Markdown);
}
}
} }
} }
@ -937,6 +965,17 @@ fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata
println!(); println!();
} }
} }
OutputFormat::Html => {
for page in &result.pages {
let Some(ref extraction) = page.extraction else {
continue;
};
println!("---");
println!("<!-- Page: {} -->\n", page.url);
println!("{}", raw_html_or_markdown(extraction));
println!();
}
}
} }
} }
@ -1009,6 +1048,21 @@ fn print_batch_output(results: &[BatchExtractResult], format: &OutputFormat, sho
} }
} }
} }
OutputFormat::Html => {
for r in results {
match &r.result {
Ok(extraction) => {
println!("---");
println!("<!-- {} -->\n", r.url);
println!("{}", raw_html_or_markdown(extraction));
println!();
}
Err(e) => {
eprintln!("error: {} -- {}", r.url, e);
}
}
}
}
} }
} }
@ -1393,24 +1447,15 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
}); });
} }
async fn run_watch(cli: &Cli) -> Result<(), String> { async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
let raw_url = cli.urls.first().ok_or("--watch requires a URL argument")?; if urls.is_empty() {
let url = normalize_url(raw_url); return Err("--watch requires at least one URL".into());
}
let client = let client = Arc::new(
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
let options = build_extraction_options(cli);
// Initial snapshot
let mut previous = client
.fetch_and_extract_with_options(&url, &options)
.await
.map_err(|e| format!("initial fetch failed: {e}"))?;
eprintln!(
"[watch] Initial snapshot: {url} ({} words)",
previous.metadata.word_count
); );
let options = build_extraction_options(cli);
// Ctrl+C handler // Ctrl+C handler
let cancelled = Arc::new(AtomicBool::new(false)); let cancelled = Arc::new(AtomicBool::new(false));
@ -1420,6 +1465,33 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
flag.store(true, Ordering::Relaxed); flag.store(true, Ordering::Relaxed);
}); });
// Single-URL mode: preserve original behavior exactly
if urls.len() == 1 {
return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
}
// Multi-URL mode: batch fetch, diff each, report aggregate
run_watch_multi(cli, &client, &options, urls, &cancelled).await
}
/// Original single-URL watch loop -- backward compatible.
async fn run_watch_single(
cli: &Cli,
client: &Arc<FetchClient>,
options: &ExtractionOptions,
url: &str,
cancelled: &Arc<AtomicBool>,
) -> Result<(), String> {
let mut previous = client
.fetch_and_extract_with_options(url, options)
.await
.map_err(|e| format!("initial fetch failed: {e}"))?;
eprintln!(
"[watch] Initial snapshot: {url} ({} words)",
previous.metadata.word_count
);
loop { loop {
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await; tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
@ -1428,7 +1500,7 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
break; break;
} }
let current = match client.fetch_and_extract_with_options(&url, &options).await { let current = match client.fetch_and_extract_with_options(url, options).await {
Ok(result) => result, Ok(result) => result,
Err(e) => { Err(e) => {
eprintln!("[watch] Fetch error ({}): {e}", timestamp()); eprintln!("[watch] Fetch error ({}): {e}", timestamp());
@ -1454,7 +1526,6 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
.spawn() .spawn()
{ {
Ok(mut child) => { Ok(mut child) => {
// Pipe diff JSON to stdin, then detach
if let Some(mut stdin) = child.stdin.take() { if let Some(mut stdin) = child.stdin.take() {
use tokio::io::AsyncWriteExt; use tokio::io::AsyncWriteExt;
let _ = stdin.write_all(diff_json.as_bytes()).await; let _ = stdin.write_all(diff_json.as_bytes()).await;
@ -1464,7 +1535,6 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
} }
} }
// Fire webhook on change
if let Some(ref webhook_url) = cli.webhook { if let Some(ref webhook_url) = cli.webhook {
fire_webhook( fire_webhook(
webhook_url, webhook_url,
@ -1487,6 +1557,162 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
Ok(()) Ok(())
} }
/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
///
/// Takes an initial snapshot of every URL (failures are reported and retried
/// as baselines on later checks), then on each interval re-fetches the whole
/// set in parallel, diffs each result against its stored snapshot, and prints
/// one aggregate line per check. When anything changed, `--on-change` and
/// `--webhook` each fire at most once per check with the combined change list.
/// Runs until Ctrl+C flips `cancelled`.
async fn run_watch_multi(
    cli: &Cli,
    client: &Arc<FetchClient>,
    options: &ExtractionOptions,
    urls: &[String],
    cancelled: &Arc<AtomicBool>,
) -> Result<(), String> {
    let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect();

    // Initial pass: fetch all URLs in parallel
    let initial_results = client
        .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
        .await;

    let mut snapshots = std::collections::HashMap::new();
    let mut ok_count = 0usize;
    let mut err_count = 0usize;
    for r in initial_results {
        match r.result {
            Ok(extraction) => {
                snapshots.insert(r.url, extraction);
                ok_count += 1;
            }
            Err(e) => {
                eprintln!("[watch] Initial fetch error: {} -- {e}", r.url);
                err_count += 1;
            }
        }
    }
    eprintln!(
        "[watch] Watching {} URLs (interval: {}s)",
        urls.len(),
        cli.watch_interval
    );
    eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors");

    let mut check_number = 0u64;
    loop {
        tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;

        if cancelled.load(Ordering::Relaxed) {
            eprintln!("[watch] Stopped");
            break;
        }
        check_number += 1;

        let current_results = client
            .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
            .await;

        let mut changed: Vec<serde_json::Value> = Vec::new();
        let mut same_count = 0usize;
        let mut fetch_errors = 0usize;

        for r in current_results {
            match r.result {
                Ok(current) => {
                    if let Some(previous) = snapshots.get(&r.url) {
                        let diff = webclaw_core::diff::diff(previous, &current);
                        if diff.status == ChangeStatus::Same {
                            same_count += 1;
                        } else {
                            changed.push(serde_json::json!({
                                "url": r.url,
                                "word_count_delta": diff.word_count_delta,
                            }));
                            // Changed page becomes the new baseline for the next check.
                            snapshots.insert(r.url, current);
                        }
                    } else {
                        // URL failed initially, first successful fetch -- store as baseline
                        snapshots.insert(r.url, current);
                        same_count += 1;
                    }
                }
                Err(e) => {
                    eprintln!("[watch] Fetch error: {} -- {e}", r.url);
                    fetch_errors += 1;
                }
            }
        }

        let ts = timestamp();
        let err_suffix = if fetch_errors > 0 {
            format!(", {fetch_errors} errors")
        } else {
            String::new()
        };
        if changed.is_empty() {
            eprintln!(
                "[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}"
            );
        } else {
            eprintln!(
                "[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}",
                changed.len(),
            );
            for entry in &changed {
                let url = entry["url"].as_str().unwrap_or("?");
                let delta = entry["word_count_delta"].as_i64().unwrap_or(0);
                eprintln!(" -> {url} (word delta: {delta:+})");
            }

            // Build the aggregate payload once -- --on-change and the webhook
            // previously each constructed an identical JSON object, which
            // risked the two drifting apart.
            let payload = serde_json::json!({
                "event": "watch_changes",
                "check_number": check_number,
                "total_urls": urls.len(),
                "changed": changed.len(),
                "same": same_count,
                "changes": changed,
            });

            // Fire --on-change once with all changes
            if let Some(ref cmd) = cli.on_change {
                let payload_json = serde_json::to_string(&payload).unwrap_or_default();
                eprintln!("[watch] Running: {cmd}");
                match tokio::process::Command::new("sh")
                    .arg("-c")
                    .arg(cmd)
                    .stdin(std::process::Stdio::piped())
                    .spawn()
                {
                    Ok(mut child) => {
                        // Pipe the aggregate JSON to the command's stdin, then
                        // detach -- the child is intentionally not awaited.
                        if let Some(mut stdin) = child.stdin.take() {
                            use tokio::io::AsyncWriteExt;
                            let _ = stdin.write_all(payload_json.as_bytes()).await;
                        }
                    }
                    Err(e) => eprintln!("[watch] Failed to run command: {e}"),
                }
            }

            // Fire webhook once with aggregate payload
            if let Some(ref webhook_url) = cli.webhook {
                fire_webhook(webhook_url, &payload);
            }
        }
    }

    Ok(())
}
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
// Load previous snapshot // Load previous snapshot
let snapshot_json = std::fs::read_to_string(snapshot_path) let snapshot_json = std::fs::read_to_string(snapshot_path)
@ -1626,6 +1852,158 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
Ok(()) Ok(())
} }
/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
/// URLs are processed sequentially to respect LLM provider rate limits.
async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let provider = build_llm_provider(cli).await?;
let model = cli.llm_model.as_deref();
// Pre-parse schema once if --extract-json is used
let schema = if let Some(ref schema_input) = cli.extract_json {
let schema_str = if let Some(path) = schema_input.strip_prefix('@') {
std::fs::read_to_string(path)
.map_err(|e| format!("failed to read schema file {path}: {e}"))?
} else {
schema_input.clone()
};
Some(
serde_json::from_str::<serde_json::Value>(&schema_str)
.map_err(|e| format!("invalid JSON schema: {e}"))?,
)
} else {
None
};
// Build custom filename lookup from entries
let custom_names: std::collections::HashMap<&str, &str> = entries
.iter()
.filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n)))
.collect();
let total = entries.len();
let mut ok = 0usize;
let mut errors = 0usize;
let mut all_results: Vec<serde_json::Value> = Vec::with_capacity(total);
for (i, (url, _)) in entries.iter().enumerate() {
let idx = i + 1;
eprint!("[{idx}/{total}] {url} ");
// Fetch and extract page content
let extraction = match client.fetch_and_extract_with_options(url, &options).await {
Ok(r) => r,
Err(e) => {
errors += 1;
let msg = format!("fetch failed: {e}");
eprintln!("-> error: {msg}");
all_results.push(serde_json::json!({ "url": url, "error": msg }));
continue;
}
};
let text = &extraction.content.plain_text;
// Run the appropriate LLM operation
let llm_result = if let Some(ref schema) = schema {
webclaw_llm::extract::extract_json(text, schema, provider.as_ref(), model)
.await
.map(LlmOutput::Json)
} else if let Some(ref prompt) = cli.extract_prompt {
webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
.await
.map(LlmOutput::Json)
} else if let Some(sentences) = cli.summarize {
webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
.await
.map(LlmOutput::Text)
} else {
unreachable!("run_batch_llm called without LLM flags")
};
match llm_result {
Ok(output) => {
ok += 1;
let (output_str, result_json) = match &output {
LlmOutput::Json(v) => {
let s = serde_json::to_string_pretty(v).expect("serialization failed");
let j = serde_json::json!({ "url": url, "result": v });
(s, j)
}
LlmOutput::Text(s) => {
let j = serde_json::json!({ "url": url, "result": s });
(s.clone(), j)
}
};
// Count top-level fields/items for progress display
let detail = match &output {
LlmOutput::Json(v) => match v {
serde_json::Value::Object(m) => format!("{} fields", m.len()),
serde_json::Value::Array(a) => format!("{} items", a.len()),
_ => "done".to_string(),
},
LlmOutput::Text(s) => {
let words = s.split_whitespace().count();
format!("{words} words")
}
};
eprintln!("-> extracted {detail}");
if let Some(ref dir) = cli.output_dir {
let filename = custom_names
.get(url.as_str())
.map(|s| s.to_string())
.unwrap_or_else(|| url_to_filename(url, &OutputFormat::Json));
write_to_file(dir, &filename, &output_str)?;
} else {
println!("--- {url}");
println!("{output_str}");
println!();
}
all_results.push(result_json);
}
Err(e) => {
errors += 1;
let msg = format!("LLM extraction failed: {e}");
eprintln!("-> error: {msg}");
all_results.push(serde_json::json!({ "url": url, "error": msg }));
}
}
}
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
if let Some(ref webhook_url) = cli.webhook {
fire_webhook(
webhook_url,
&serde_json::json!({
"event": "batch_llm_complete",
"total": total,
"ok": ok,
"errors": errors,
}),
);
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
if errors > 0 {
Err(format!("{errors} of {total} URLs failed"))
} else {
Ok(())
}
}
/// Intermediate type to hold LLM output before formatting.
enum LlmOutput {
    /// Structured result from `--extract-json` / `--extract-prompt`.
    Json(serde_json::Value),
    /// Free-text result from `--summarize`.
    Text(String),
}
/// Returns true if any LLM flag is set. /// Returns true if any LLM flag is set.
fn has_llm_flags(cli: &Cli) -> bool { fn has_llm_flags(cli: &Cli) -> bool {
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some() cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
@ -1656,9 +2034,16 @@ async fn main() {
return; return;
} }
// --watch: poll a URL for changes // --watch: poll URL(s) for changes
if cli.watch { if cli.watch {
if let Err(e) = run_watch(&cli).await { let watch_urls: Vec<String> = match collect_urls(&cli) {
Ok(entries) => entries.into_iter().map(|(url, _)| url).collect(),
Err(e) => {
eprintln!("error: {e}");
process::exit(1);
}
};
if let Err(e) = run_watch(&cli, &watch_urls).await {
eprintln!("error: {e}"); eprintln!("error: {e}");
process::exit(1); process::exit(1);
} }
@ -1683,15 +2068,6 @@ async fn main() {
return; return;
} }
// LLM modes: --extract-json, --extract-prompt, --summarize
if has_llm_flags(&cli) {
if let Err(e) = run_llm(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// Collect all URLs from args + --urls-file // Collect all URLs from args + --urls-file
let entries = match collect_urls(&cli) { let entries = match collect_urls(&cli) {
Ok(u) => u, Ok(u) => u,
@ -1701,6 +2077,21 @@ async fn main() {
} }
}; };
// LLM modes: --extract-json, --extract-prompt, --summarize
// When multiple URLs are provided, run batch LLM extraction over all of them.
if has_llm_flags(&cli) {
if entries.len() > 1 {
if let Err(e) = run_batch_llm(&cli, &entries).await {
eprintln!("error: {e}");
process::exit(1);
}
} else if let Err(e) = run_llm(&cli).await {
eprintln!("error: {e}");
process::exit(1);
}
return;
}
// Multi-URL batch mode // Multi-URL batch mode
if entries.len() > 1 { if entries.len() > 1 {
if let Err(e) = run_batch(&cli, &entries).await { if let Err(e) = run_batch(&cli, &entries).await {
@ -1824,6 +2215,14 @@ mod tests {
); );
} }
#[test]
fn url_to_filename_html_format() {
    // `-f html` output files get a `.html` extension derived from the URL path.
    assert_eq!(
        url_to_filename("https://example.com/docs/api", &OutputFormat::Html),
        "docs/api.html"
    );
}
#[test] #[test]
fn url_to_filename_special_chars() { fn url_to_filename_special_chars() {
// Spaces and special chars get replaced with underscores // Spaces and special chars get replaced with underscores

View file

@ -19,6 +19,8 @@ url = "2"
rand = "0.8" rand = "0.8"
quick-xml = { version = "0.37", features = ["serde"] } quick-xml = { version = "0.37", features = ["serde"] }
serde_json.workspace = true serde_json.workspace = true
calamine = "0.34"
zip = "2"
[dev-dependencies] [dev-dependencies]
tempfile = "3" tempfile = "3"

View file

@ -399,6 +399,27 @@ impl FetchClient {
let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?; let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
Ok(pdf_to_extraction_result(&pdf_result, &final_url)) Ok(pdf_to_extraction_result(&pdf_result, &final_url))
} else if let Some(doc_type) =
crate::document::is_document_content_type(&headers, &final_url)
{
debug!(status, doc_type = ?doc_type, "detected document response, extracting");
let bytes = response
.bytes()
.await
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
let elapsed = start.elapsed();
debug!(
status,
bytes = bytes.len(),
elapsed_ms = %elapsed.as_millis(),
"document fetch complete"
);
let mut result = crate::document::extract_document(&bytes, doc_type)?;
result.metadata.url = Some(final_url);
Ok(result)
} else { } else {
let html = response let html = response
.text() .text()

View file

@ -0,0 +1,743 @@
/// Document extraction for DOCX, XLSX, XLS, and CSV files.
/// Auto-detects document type from Content-Type headers or URL extension,
/// then extracts text content as markdown — same pattern as PDF extraction.
use std::collections::HashMap;
use std::io::{Cursor, Read};
use tracing::debug;
use crate::error::FetchError;
/// The kinds of non-HTML documents this module can extract.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocType {
    Docx,
    Xlsx,
    Xls,
    Csv,
}

impl DocType {
    /// Short uppercase label used in log and error messages.
    fn label(self) -> &'static str {
        match self {
            Self::Docx => "DOCX",
            Self::Xlsx => "XLSX",
            Self::Xls => "XLS",
            Self::Csv => "CSV",
        }
    }
}
/// Detect document type from response headers or URL extension.
/// Returns `None` for non-document responses (HTML, PDF, etc.).
pub fn is_document_content_type(headers: &HashMap<String, String>, url: &str) -> Option<DocType> {
    // The Content-Type header takes precedence over the URL extension.
    // NOTE(review): assumes header keys are stored lowercase — the fetch layer
    // appears to normalize them; confirm.
    if let Some(ct) = headers.get("content-type") {
        // Drop MIME parameters ("; charset=utf-8") and compare case-insensitively.
        let mime = ct.split(';').next().unwrap_or("").trim().to_ascii_lowercase();
        match mime.as_str() {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
                return Some(DocType::Docx);
            }
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
                return Some(DocType::Xlsx);
            }
            "application/vnd.ms-excel" => return Some(DocType::Xls),
            "text/csv" => return Some(DocType::Csv),
            _ => {}
        }
    }
    // Fall back to the URL path extension (query string stripped, case-insensitive).
    let path = url.split('?').next().unwrap_or(url).to_ascii_lowercase();
    [
        (".docx", DocType::Docx),
        (".xlsx", DocType::Xlsx),
        (".xls", DocType::Xls),
        (".csv", DocType::Csv),
    ]
    .into_iter()
    .find(|(ext, _)| path.ends_with(ext))
    .map(|(_, doc_type)| doc_type)
}
/// Extract text content from document bytes, returning an ExtractionResult.
///
/// Dispatches on `doc_type` to the format-specific extractor, then derives
/// plain text and a word count from the resulting markdown. All other metadata
/// is left empty; the fetch layer fills in `url` after extraction.
pub fn extract_document(
    bytes: &[u8],
    doc_type: DocType,
) -> Result<webclaw_core::ExtractionResult, FetchError> {
    debug!(
        doc_type = doc_type.label(),
        bytes = bytes.len(),
        "extracting document"
    );
    // Each extractor returns markdown (headings for DOCX, tables for tabular data).
    let markdown = match doc_type {
        DocType::Docx => extract_docx(bytes)?,
        DocType::Xlsx => extract_xlsx(bytes)?,
        DocType::Xls => extract_xls(bytes)?,
        DocType::Csv => extract_csv(bytes)?,
    };
    // Plain text is the markdown with heading markers and table pipes removed.
    let plain_text = strip_markdown_formatting(&markdown);
    let word_count = plain_text.split_whitespace().count();
    Ok(webclaw_core::ExtractionResult {
        metadata: webclaw_core::Metadata {
            // Office/CSV documents carry no HTML-style metadata; only the
            // word count is known here. `url` is set by the caller.
            title: None,
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: None,
            site_name: None,
            image: None,
            favicon: None,
            word_count,
        },
        content: webclaw_core::Content {
            markdown,
            plain_text,
            // Documents contribute no links/images/code blocks.
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}
/// Extract text from a DOCX file (ZIP of XML).
/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
    let cursor = Cursor::new(bytes);
    let mut archive =
        zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
    // Block scope: the ZipFile borrows `archive`, so read it fully into a
    // String and drop the borrow before parsing.
    let xml = {
        let mut file = archive
            .by_name("word/document.xml")
            .map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?;
        let mut buf = String::new();
        file.read_to_string(&mut buf)
            .map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
        buf
    };
    parse_docx_xml(&xml)
}
/// Parse DOCX XML (word/document.xml) into markdown.
///
/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`,
/// mapping `Heading1`..`Heading6` (and `Title`) to markdown `#` headings.
/// `<w:br/>` and `<w:tab/>` become `\n` and `\t` inside the paragraph text.
fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
    use quick_xml::Reader;
    use quick_xml::events::Event;
    let mut reader = Reader::from_str(xml);
    let mut paragraphs: Vec<String> = Vec::new();
    // State tracking for the current paragraph
    let mut in_paragraph = false;
    let mut in_run = false; // inside <w:r> (run)
    let mut in_text = false; // inside <w:t>
    let mut current_text = String::new();
    let mut heading_level: Option<u8> = None; // None = normal paragraph
    let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
    loop {
        match reader.read_event() {
            // Opening tags: enter paragraph/run/text state.
            Ok(Event::Start(ref e)) => {
                let name = e.name();
                match local_name(name.as_ref()) {
                    b"p" if is_w_namespace(name.as_ref()) => {
                        in_paragraph = true;
                        current_text.clear();
                        heading_level = None;
                    }
                    b"pPr" if in_paragraph => in_ppr = true,
                    b"pStyle" if in_ppr => {
                        heading_level = extract_heading_level(e);
                    }
                    b"r" if in_paragraph => in_run = true,
                    b"t" if in_run => in_text = true,
                    b"br" if in_paragraph => current_text.push('\n'),
                    b"tab" if in_paragraph => current_text.push('\t'),
                    _ => {}
                }
            }
            // Self-closing tags have no matching End event, so only leaf
            // elements may update state here: a self-closing container like
            // <w:t/> or <w:p/> must NOT flip a state flag on that no End
            // event would ever turn off again.
            Ok(Event::Empty(ref e)) => {
                let name = e.name();
                match local_name(name.as_ref()) {
                    b"pStyle" if in_ppr => {
                        heading_level = extract_heading_level(e);
                    }
                    b"br" if in_paragraph => current_text.push('\n'),
                    b"tab" if in_paragraph => current_text.push('\t'),
                    _ => {}
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.name();
                match local_name(name.as_ref()) {
                    b"p" if in_paragraph => {
                        // Paragraph finished: emit it (with heading prefix) if non-empty.
                        let text = current_text.trim().to_string();
                        if !text.is_empty() {
                            let formatted = match heading_level {
                                Some(level @ 1..=6) => {
                                    format!("{} {text}", "#".repeat(level as usize))
                                }
                                _ => text,
                            };
                            paragraphs.push(formatted);
                        }
                        in_paragraph = false;
                    }
                    b"pPr" => in_ppr = false,
                    b"r" => {
                        in_run = false;
                        in_text = false;
                    }
                    b"t" => in_text = false,
                    _ => {}
                }
            }
            // Character data inside <w:t> is the actual document text.
            Ok(Event::Text(ref e)) if in_text => {
                if let Ok(text) = e.unescape() {
                    current_text.push_str(&text);
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
            }
            _ => {}
        }
    }
    Ok(paragraphs.join("\n\n"))
}
/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms.
fn is_w_namespace(name: &[u8]) -> bool {
    // quick-xml hands us the raw qualified-name bytes; accept both spellings.
    matches!(name, b"w:p" | b"p")
}
/// Extract the local name from a possibly namespaced XML tag.
/// `w:p` -> `p`, `p` -> `p`
fn local_name(name: &[u8]) -> &[u8] {
    // Everything after the first ':' is the local part; no ':' means the
    // name is already unprefixed.
    name.iter()
        .position(|&b| b == b':')
        .map_or(name, |pos| &name[pos + 1..])
}
/// Extract heading level from a `<w:pStyle w:val="Heading1"/>` element.
/// Returns `Some(1)` for the "Title" style, `Some(n)` (capped at 6) for
/// "HeadingN", and `None` for any other style.
fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
    e.attributes().flatten().find_map(|attr| {
        // Only the (possibly namespaced) `val` attribute carries the style name.
        if local_name(attr.key.as_ref()) != b"val" {
            return None;
        }
        let style = String::from_utf8_lossy(&attr.value).to_ascii_lowercase();
        if style == "title" {
            // Word's "Title" style maps to a top-level heading.
            return Some(1);
        }
        // "heading1", "heading2", … -> markdown depth, clamped to h6.
        style
            .strip_prefix("heading")
            .and_then(|rest| rest.parse::<u8>().ok())
            .map(|n| n.min(6))
    })
}
/// Extract spreadsheet content using calamine (XLSX format).
/// Thin wrapper over [`extract_spreadsheet`]; the label only affects error messages.
fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLSX")
}
/// Extract spreadsheet content using calamine (legacy binary XLS format).
/// Thin wrapper over [`extract_spreadsheet`]; the label only affects error messages.
fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
    extract_spreadsheet(bytes, "XLS")
}
/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
/// Reads every sheet and renders each non-empty one as a markdown table
/// under a `## Sheet: <name>` heading; `label` is only used in error messages.
fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
    use calamine::Reader;
    let mut workbook: calamine::Sheets<_> =
        calamine::open_workbook_auto_from_rs(Cursor::new(bytes))
            .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
    // Snapshot the names up front: reading ranges needs `&mut workbook`.
    let names: Vec<String> = workbook.sheet_names().to_vec();
    let mut sections: Vec<String> = Vec::new();
    for name in &names {
        let range = workbook
            .worksheet_range(name)
            .map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
        let rows: Vec<Vec<String>> = range
            .rows()
            .map(|row| row.iter().map(cell_to_string).collect())
            .collect();
        // Sheets with no cells are skipped entirely.
        if !rows.is_empty() {
            sections.push(format!(
                "## Sheet: {name}\n\n{}",
                rows_to_markdown_table(&rows)
            ));
        }
    }
    if sections.is_empty() {
        Ok("(empty spreadsheet)".to_string())
    } else {
        Ok(sections.join("\n\n"))
    }
}
/// Convert a calamine cell value to a display string.
///
/// Empty cells become `""`, integral floats are printed without a trailing
/// `.0`, and error cells are rendered as `#<Debug of the error kind>`.
fn cell_to_string(cell: &calamine::Data) -> String {
    use calamine::Data;
    match cell {
        Data::Empty => String::new(),
        Data::String(s) => s.clone(),
        Data::Int(n) => n.to_string(),
        // Whole-valued floats print as integers (3 rather than 3.0).
        Data::Float(f) => format_float(*f),
        Data::Bool(b) => b.to_string(),
        Data::Error(e) => format!("#{e:?}"),
        // NOTE(review): Display of calamine's DateTime — presumably the raw
        // serial value rather than a formatted date; confirm if dates matter.
        Data::DateTime(dt) => format!("{dt}"),
        Data::DateTimeIso(s) => s.clone(),
        Data::DurationIso(s) => s.clone(),
    }
}
/// Format a float, dropping trailing `.0` for clean integer display.
fn format_float(f: f64) -> String {
    // Only whole values that fit in i64 are printed as integers; everything
    // else (fractions, huge magnitudes, NaN/inf) uses the default rendering.
    let prints_as_integer = f.fract() == 0.0 && f.abs() < i64::MAX as f64;
    if prints_as_integer {
        (f as i64).to_string()
    } else {
        f.to_string()
    }
}
/// Extract CSV text and convert to markdown table.
fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
    // Lossy decode: invalid UTF-8 becomes U+FFFD instead of failing extraction.
    let text = String::from_utf8_lossy(bytes);
    let rows = parse_csv_rows(&text);
    if rows.is_empty() {
        Ok("(empty CSV)".to_string())
    } else {
        Ok(rows_to_markdown_table(&rows))
    }
}
/// Parse CSV text into rows of fields, handling quoted fields with commas/newlines.
/// Fields are trimmed; rows whose fields are all empty are dropped.
fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut row: Vec<String> = Vec::new();
    let mut field = String::new();
    let mut in_quotes = false;
    let mut iter = text.chars().peekable();
    while let Some(ch) = iter.next() {
        match (in_quotes, ch) {
            // Inside quotes: "" is an escaped quote, a lone " closes the field.
            (true, '"') => {
                if iter.peek() == Some(&'"') {
                    iter.next();
                    field.push('"');
                } else {
                    in_quotes = false;
                }
            }
            (true, other) => field.push(other),
            (false, '"') => in_quotes = true,
            (false, ',') => {
                row.push(std::mem::take(&mut field).trim().to_string());
            }
            (false, '\n') => {
                // End of record: keep the row only if it has any content.
                row.push(std::mem::take(&mut field).trim().to_string());
                if row.iter().any(|f| !f.is_empty()) {
                    rows.push(std::mem::take(&mut row));
                } else {
                    row.clear();
                }
            }
            // CRs are dropped; the following '\n' terminates the record.
            (false, '\r') => {}
            (false, other) => field.push(other),
        }
    }
    // Flush a trailing record that has no final newline.
    if !field.is_empty() || !row.is_empty() {
        row.push(field.trim().to_string());
        if row.iter().any(|f| !f.is_empty()) {
            rows.push(row);
        }
    }
    rows
}
/// Convert rows (first row = header) into a markdown table.
///
/// Ragged rows are padded with empty cells to the widest row. Cell text is
/// sanitized so it cannot break the table structure: `|` is escaped as `\|`
/// and newlines/CRs are flattened to spaces.
fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
    if rows.is_empty() {
        return String::new();
    }
    // Pad every row to the widest row so ragged data still forms a valid table.
    let col_count = rows.iter().map(|r| r.len()).max().unwrap_or(0);
    if col_count == 0 {
        return String::new();
    }
    // A raw '|' would start a new column and a raw newline a new row —
    // both would corrupt the table, so neutralize them per cell.
    let sanitize =
        |s: &str| s.replace('|', "\\|").replace('\n', " ").replace('\r', " ");
    let render = |row: &[String]| -> String {
        let cells: Vec<String> = (0..col_count)
            .map(|i| sanitize(row.get(i).map(String::as_str).unwrap_or("")))
            .collect();
        format!("| {} |", cells.join(" | "))
    };
    let mut lines: Vec<String> = Vec::with_capacity(rows.len() + 1);
    // Header row, then the separator, then the data rows.
    lines.push(render(&rows[0]));
    lines.push(format!("| {} |", vec!["---"; col_count].join(" | ")));
    for row in &rows[1..] {
        lines.push(render(row));
    }
    lines.join("\n")
}
/// Strip markdown formatting to get plain text.
/// Heading markers are removed, table separator rows dropped, and table rows
/// collapsed into space-joined cell text.
fn strip_markdown_formatting(markdown: &str) -> String {
    let mut plain = String::with_capacity(markdown.len());
    for raw in markdown.lines() {
        let line = raw.trim_start_matches('#').trim();
        // Table separator rows carry no content.
        if line.starts_with("| ---") || line == "|---|" {
            continue;
        }
        // A line wrapped in pipes is a table row: keep the cells, drop the pipes.
        match line.strip_prefix('|').and_then(|s| s.strip_suffix('|')) {
            Some(inner) => {
                let cells: Vec<&str> = inner.split('|').map(str::trim).collect();
                plain.push_str(&cells.join(" "));
            }
            None => plain.push_str(line),
        }
        plain.push('\n');
    }
    plain.trim().to_string()
}
#[cfg(test)]
mod tests {
    // Unit tests grouped by concern: content-type detection, CSV parsing,
    // DOCX XML parsing, markdown table rendering, and plain-text stripping.
    use super::*;

    // --- Content-type detection ---

    #[test]
    fn test_detect_docx_content_type() {
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Docx)
        );
    }

    #[test]
    fn test_detect_xlsx_content_type() {
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xlsx)
        );
    }

    #[test]
    fn test_detect_xls_content_type() {
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.ms-excel".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xls)
        );
    }

    #[test]
    fn test_detect_csv_content_type() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/csv".to_string());
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_csv_content_type_with_charset() {
        // MIME parameters ("; charset=utf-8") must not defeat detection.
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "text/csv; charset=utf-8".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_by_url_extension() {
        // No headers at all: detection falls back to the URL extension.
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx"),
            Some(DocType::Docx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.xlsx"),
            Some(DocType::Xlsx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/old.xls"),
            Some(DocType::Xls)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.csv"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_url_extension_with_query() {
        // The query string is stripped before checking the extension.
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
            Some(DocType::Docx)
        );
    }

    #[test]
    fn test_detect_url_extension_case_insensitive() {
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
            Some(DocType::Xlsx)
        );
    }

    #[test]
    fn test_detect_none_for_html() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/html".to_string());
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/page"),
            None
        );
    }

    #[test]
    fn test_content_type_takes_precedence_over_url() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/csv".to_string());
        // URL says .xlsx but Content-Type says CSV — header wins
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/data.xlsx"),
            Some(DocType::Csv)
        );
    }

    // --- CSV parsing ---

    #[test]
    fn test_csv_simple() {
        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| Name | Age | City |"));
        assert!(result.contains("| --- | --- | --- |"));
        assert!(result.contains("| Alice | 30 | NYC |"));
        assert!(result.contains("| Bob | 25 | LA |"));
    }

    #[test]
    fn test_csv_quoted_fields() {
        // Quoted fields may contain commas; "" is an escaped quote.
        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("Has a, comma"));
        assert!(result.contains("Said \"hello\""));
    }

    #[test]
    fn test_csv_empty() {
        let result = extract_csv(b"").unwrap();
        assert_eq!(result, "(empty CSV)");
    }

    #[test]
    fn test_csv_windows_line_endings() {
        let csv = "A,B\r\n1,2\r\n3,4\r\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| A | B |"));
        assert!(result.contains("| 1 | 2 |"));
    }

    // --- DOCX XML parsing ---

    #[test]
    fn test_docx_xml_simple_paragraphs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world\n\nSecond paragraph");
    }

    #[test]
    fn test_docx_xml_headings() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
<w:r><w:t>Title</w:t></w:r>
</w:p>
<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>Subtitle</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert!(result.contains("# Title"));
        assert!(result.contains("Body text"));
        assert!(result.contains("## Subtitle"));
    }

    #[test]
    fn test_docx_xml_multiple_runs() {
        // Multiple <w:r> runs within one paragraph concatenate into one line.
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Hello </w:t></w:r>
<w:r><w:t>world</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world");
    }

    #[test]
    fn test_docx_xml_empty_paragraphs_skipped() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p></w:p>
<w:p><w:r><w:t>Content</w:t></w:r></w:p>
<w:p><w:r><w:t> </w:t></w:r></w:p>
</w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Content");
    }

    // --- Markdown table ---

    #[test]
    fn test_rows_to_markdown_table() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string()],
            vec!["1".to_string(), "2".to_string()],
            vec!["3".to_string(), "4".to_string()],
        ];
        let table = rows_to_markdown_table(&rows);
        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
    }

    #[test]
    fn test_rows_to_markdown_table_ragged() {
        // Short rows are padded with empty cells to the widest row.
        let rows = vec![
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string()], // fewer columns
        ];
        let table = rows_to_markdown_table(&rows);
        assert!(table.contains("| 1 | | |"));
    }

    // --- Extract result ---

    #[test]
    fn test_extract_csv_result() {
        let csv = "Name,Score\nAlice,100\n";
        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
        assert!(result.content.markdown.contains("| Name | Score |"));
        assert!(result.metadata.word_count > 0);
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
    }

    // --- Strip markdown ---

    #[test]
    fn test_strip_markdown() {
        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
        let plain = strip_markdown_formatting(md);
        assert!(plain.contains("Title"));
        assert!(plain.contains("Some text"));
        assert!(plain.contains("A B"));
        assert!(!plain.contains("---"));
    }
}

View file

@ -5,6 +5,7 @@
pub mod browser; pub mod browser;
pub mod client; pub mod client;
pub mod crawler; pub mod crawler;
pub mod document;
pub mod error; pub mod error;
pub mod linkedin; pub mod linkedin;
pub mod proxy; pub mod proxy;