feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM

Document extraction:
- DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml)
- XLSX/XLS: markdown tables with multi-sheet support (via calamine)
- CSV: quoted field handling, markdown table output
- All auto-detected by Content-Type header or URL extension

New features:
- -f html output format (sanitized HTML)
- Multi-URL watch: --urls-file + --watch monitors all URLs in parallel
- Batch + LLM: --extract-prompt/--extract-json works with multiple URLs
- Mixed batch: HTML pages + DOCX + XLSX + CSV in one command

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-03-26 15:28:23 +01:00
parent 0e4128782a
commit ea14848772
8 changed files with 1520 additions and 41 deletions

316
Cargo.lock generated
View file

@ -17,6 +17,17 @@ dependencies = [
"pom",
]
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if",
"cipher",
"cpufeatures 0.2.17",
]
[[package]]
name = "aho-corasick"
version = "1.1.4"
@ -106,6 +117,15 @@ version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
dependencies = [
"derive_arbitrary",
]
[[package]]
name = "async-compression"
version = "0.4.41"
@ -129,6 +149,15 @@ dependencies = [
"syn",
]
[[package]]
name = "atoi_simd"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ad17c7c205c2c28b527b9845eeb91cf1b4d008b438f98ce0e628227a822758e"
dependencies = [
"debug_unsafe",
]
[[package]]
name = "atomic-waker"
version = "1.1.2"
@ -224,6 +253,42 @@ version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
[[package]]
name = "bzip2"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
dependencies = [
"bzip2-sys",
]
[[package]]
name = "bzip2-sys"
version = "0.1.13+1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
dependencies = [
"cc",
"pkg-config",
]
[[package]]
name = "calamine"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20ae05a4e39297eecf9a994210d27501318c37a9318201f8e11050add82bb6f0"
dependencies = [
"atoi_simd",
"byteorder",
"codepage",
"encoding_rs",
"fast-float2",
"log",
"quick-xml 0.39.2",
"serde",
"zip 7.2.0",
]
[[package]]
name = "cc"
version = "1.2.57"
@ -255,7 +320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
dependencies = [
"cfg-if",
"cpufeatures",
"cpufeatures 0.3.0",
"rand_core 0.10.0",
]
@ -273,6 +338,16 @@ dependencies = [
"windows-link",
]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
]
[[package]]
name = "clap"
version = "4.6.0"
@ -322,6 +397,15 @@ dependencies = [
"cc",
]
[[package]]
name = "codepage"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
dependencies = [
"encoding_rs",
]
[[package]]
name = "colorchoice"
version = "1.0.5"
@ -348,6 +432,12 @@ version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
[[package]]
name = "constant_time_eq"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
[[package]]
name = "cookie"
version = "0.18.1"
@ -393,6 +483,15 @@ version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "cpufeatures"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
dependencies = [
"libc",
]
[[package]]
name = "cpufeatures"
version = "0.3.0"
@ -402,6 +501,21 @@ dependencies = [
"libc",
]
[[package]]
name = "crc"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d"
dependencies = [
"crc-catalog",
]
[[package]]
name = "crc-catalog"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
[[package]]
name = "crc32fast"
version = "1.5.0"
@ -411,6 +525,12 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crypto-common"
version = "0.1.7"
@ -478,6 +598,18 @@ dependencies = [
"syn",
]
[[package]]
name = "debug_unsafe"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2"
[[package]]
name = "deflate64"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
[[package]]
name = "deranged"
version = "0.5.8"
@ -487,6 +619,17 @@ dependencies = [
"powerfmt",
]
[[package]]
name = "derive_arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "derive_more"
version = "0.99.20"
@ -506,6 +649,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
"subtle",
]
[[package]]
@ -601,6 +745,12 @@ dependencies = [
"num-traits",
]
[[package]]
name = "fast-float2"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
[[package]]
name = "fastrand"
version = "2.3.0"
@ -621,6 +771,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
dependencies = [
"crc32fast",
"miniz_oxide",
"zlib-rs",
]
[[package]]
@ -857,6 +1008,15 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hmac"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
dependencies = [
"digest",
]
[[package]]
name = "html5ever"
version = "0.29.1"
@ -1121,6 +1281,15 @@ dependencies = [
"serde_core",
]
[[package]]
name = "inout"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"generic-array",
]
[[package]]
name = "ipnet"
version = "2.12.0"
@ -1244,6 +1413,27 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "lzma-rs"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
dependencies = [
"byteorder",
"crc",
]
[[package]]
name = "lzma-sys"
version = "0.1.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]]
name = "mac"
version = "0.1.1"
@ -1414,6 +1604,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec"
[[package]]
name = "pbkdf2"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
dependencies = [
"digest",
"hmac",
]
[[package]]
name = "pdf-extract"
version = "0.7.12"
@ -1629,6 +1829,16 @@ dependencies = [
"serde",
]
[[package]]
name = "quick-xml"
version = "0.39.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
dependencies = [
"encoding_rs",
"memchr",
]
[[package]]
name = "quinn"
version = "0.11.9"
@ -2220,6 +2430,17 @@ dependencies = [
"stable_deref_trait",
]
[[package]]
name = "sha1"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
dependencies = [
"cfg-if",
"cpufeatures 0.2.17",
"digest",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
@ -2645,6 +2866,12 @@ dependencies = [
"pom",
]
[[package]]
name = "typed-path"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
[[package]]
name = "typenum"
version = "1.19.0"
@ -2881,7 +3108,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
version = "0.1.7"
version = "0.2.0"
dependencies = [
"clap",
"dotenvy",
@ -2901,7 +3128,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
version = "0.1.7"
version = "0.2.0"
dependencies = [
"ego-tree",
"once_cell",
@ -2919,10 +3146,11 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
version = "0.1.7"
version = "0.2.0"
dependencies = [
"calamine",
"primp",
"quick-xml",
"quick-xml 0.37.5",
"rand 0.8.5",
"serde",
"serde_json",
@ -2933,11 +3161,12 @@ dependencies = [
"url",
"webclaw-core",
"webclaw-pdf",
"zip 2.4.2",
]
[[package]]
name = "webclaw-llm"
version = "0.1.7"
version = "0.2.0"
dependencies = [
"async-trait",
"reqwest",
@ -2950,7 +3179,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
version = "0.1.7"
version = "0.2.0"
dependencies = [
"dotenvy",
"reqwest",
@ -2970,7 +3199,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
version = "0.1.7"
version = "0.2.0"
dependencies = [
"pdf-extract",
"thiserror",
@ -3301,6 +3530,15 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "xz2"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
dependencies = [
"lzma-sys",
]
[[package]]
name = "yoke"
version = "0.8.1"
@ -3418,12 +3656,74 @@ dependencies = [
"syn",
]
[[package]]
name = "zip"
version = "2.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
dependencies = [
"aes",
"arbitrary",
"bzip2",
"constant_time_eq",
"crc32fast",
"crossbeam-utils",
"deflate64",
"displaydoc",
"flate2",
"getrandom 0.3.4",
"hmac",
"indexmap",
"lzma-rs",
"memchr",
"pbkdf2",
"sha1",
"thiserror",
"time",
"xz2",
"zeroize",
"zopfli",
"zstd",
]
[[package]]
name = "zip"
version = "7.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
dependencies = [
"crc32fast",
"flate2",
"indexmap",
"memchr",
"typed-path",
"zopfli",
]
[[package]]
name = "zlib-rs"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
[[package]]
name = "zmij"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
[[package]]
name = "zopfli"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249"
dependencies = [
"bumpalo",
"crc32fast",
"log",
"simd-adler32",
]
[[package]]
name = "zstd"
version = "0.13.3"