mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-05-13 00:42:37 +02:00
feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM
Document extraction: - DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml) - XLSX/XLS: markdown tables with multi-sheet support (via calamine) - CSV: quoted field handling, markdown table output - All auto-detected by Content-Type header or URL extension New features: - -f html output format (sanitized HTML) - Multi-URL watch: --urls-file + --watch monitors all URLs in parallel - Batch + LLM: --extract-prompt/--extract-json works with multiple URLs - Mixed batch: HTML pages + DOCX + XLSX + CSV in one command Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0e4128782a
commit
ea14848772
8 changed files with 1520 additions and 41 deletions
13
CHANGELOG.md
13
CHANGELOG.md
|
|
@ -3,6 +3,19 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.2.0] — 2026-03-26
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **DOCX extraction**: auto-detected by Content-Type or URL extension, outputs markdown with headings
|
||||||
|
- **XLSX/XLS extraction**: spreadsheets converted to markdown tables, multi-sheet support via calamine
|
||||||
|
- **CSV extraction**: parsed with quoted field handling, output as markdown table
|
||||||
|
- **HTML output format**: `-f html` returns sanitized HTML from the extracted content
|
||||||
|
- **Multi-URL watch**: `--watch` now works with `--urls-file` to monitor multiple URLs in parallel
|
||||||
|
- **Batch + LLM extraction**: `--extract-prompt` and `--extract-json` now work with multiple URLs
|
||||||
|
- **Scheduled batch watch**: watch multiple URLs with aggregate change reports and per-URL diffs
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.1.7] — 2026-03-26
|
## [0.1.7] — 2026-03-26
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
|
||||||
316
Cargo.lock
generated
316
Cargo.lock
generated
|
|
@ -17,6 +17,17 @@ dependencies = [
|
||||||
"pom",
|
"pom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aes"
|
||||||
|
version = "0.8.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"cipher",
|
||||||
|
"cpufeatures 0.2.17",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
version = "1.1.4"
|
version = "1.1.4"
|
||||||
|
|
@ -106,6 +117,15 @@ version = "1.0.102"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
|
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "arbitrary"
|
||||||
|
version = "1.4.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
|
||||||
|
dependencies = [
|
||||||
|
"derive_arbitrary",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-compression"
|
name = "async-compression"
|
||||||
version = "0.4.41"
|
version = "0.4.41"
|
||||||
|
|
@ -129,6 +149,15 @@ dependencies = [
|
||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "atoi_simd"
|
||||||
|
version = "0.17.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8ad17c7c205c2c28b527b9845eeb91cf1b4d008b438f98ce0e628227a822758e"
|
||||||
|
dependencies = [
|
||||||
|
"debug_unsafe",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "atomic-waker"
|
name = "atomic-waker"
|
||||||
version = "1.1.2"
|
version = "1.1.2"
|
||||||
|
|
@ -224,6 +253,42 @@ version = "1.11.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
|
checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bzip2"
|
||||||
|
version = "0.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
|
||||||
|
dependencies = [
|
||||||
|
"bzip2-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bzip2-sys"
|
||||||
|
version = "0.1.13+1.0.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"pkg-config",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "calamine"
|
||||||
|
version = "0.34.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "20ae05a4e39297eecf9a994210d27501318c37a9318201f8e11050add82bb6f0"
|
||||||
|
dependencies = [
|
||||||
|
"atoi_simd",
|
||||||
|
"byteorder",
|
||||||
|
"codepage",
|
||||||
|
"encoding_rs",
|
||||||
|
"fast-float2",
|
||||||
|
"log",
|
||||||
|
"quick-xml 0.39.2",
|
||||||
|
"serde",
|
||||||
|
"zip 7.2.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cc"
|
name = "cc"
|
||||||
version = "1.2.57"
|
version = "1.2.57"
|
||||||
|
|
@ -255,7 +320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
|
checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"cpufeatures",
|
"cpufeatures 0.3.0",
|
||||||
"rand_core 0.10.0",
|
"rand_core 0.10.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -273,6 +338,16 @@ dependencies = [
|
||||||
"windows-link",
|
"windows-link",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cipher"
|
||||||
|
version = "0.4.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
|
||||||
|
dependencies = [
|
||||||
|
"crypto-common",
|
||||||
|
"inout",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap"
|
name = "clap"
|
||||||
version = "4.6.0"
|
version = "4.6.0"
|
||||||
|
|
@ -322,6 +397,15 @@ dependencies = [
|
||||||
"cc",
|
"cc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "codepage"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_rs",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorchoice"
|
name = "colorchoice"
|
||||||
version = "1.0.5"
|
version = "1.0.5"
|
||||||
|
|
@ -348,6 +432,12 @@ version = "0.4.31"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
|
checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "constant_time_eq"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cookie"
|
name = "cookie"
|
||||||
version = "0.18.1"
|
version = "0.18.1"
|
||||||
|
|
@ -393,6 +483,15 @@ version = "0.8.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
|
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cpufeatures"
|
||||||
|
version = "0.2.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cpufeatures"
|
name = "cpufeatures"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
|
|
@ -402,6 +501,21 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crc"
|
||||||
|
version = "3.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d"
|
||||||
|
dependencies = [
|
||||||
|
"crc-catalog",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crc-catalog"
|
||||||
|
version = "2.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crc32fast"
|
name = "crc32fast"
|
||||||
version = "1.5.0"
|
version = "1.5.0"
|
||||||
|
|
@ -411,6 +525,12 @@ dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crossbeam-utils"
|
||||||
|
version = "0.8.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crypto-common"
|
name = "crypto-common"
|
||||||
version = "0.1.7"
|
version = "0.1.7"
|
||||||
|
|
@ -478,6 +598,18 @@ dependencies = [
|
||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "debug_unsafe"
|
||||||
|
version = "0.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "deflate64"
|
||||||
|
version = "0.1.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "deranged"
|
name = "deranged"
|
||||||
version = "0.5.8"
|
version = "0.5.8"
|
||||||
|
|
@ -487,6 +619,17 @@ dependencies = [
|
||||||
"powerfmt",
|
"powerfmt",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "derive_arbitrary"
|
||||||
|
version = "1.4.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "derive_more"
|
name = "derive_more"
|
||||||
version = "0.99.20"
|
version = "0.99.20"
|
||||||
|
|
@ -506,6 +649,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"block-buffer",
|
"block-buffer",
|
||||||
"crypto-common",
|
"crypto-common",
|
||||||
|
"subtle",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -601,6 +745,12 @@ dependencies = [
|
||||||
"num-traits",
|
"num-traits",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fast-float2"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fastrand"
|
name = "fastrand"
|
||||||
version = "2.3.0"
|
version = "2.3.0"
|
||||||
|
|
@ -621,6 +771,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"crc32fast",
|
"crc32fast",
|
||||||
"miniz_oxide",
|
"miniz_oxide",
|
||||||
|
"zlib-rs",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -857,6 +1008,15 @@ version = "0.5.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hmac"
|
||||||
|
version = "0.12.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
|
||||||
|
dependencies = [
|
||||||
|
"digest",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "html5ever"
|
name = "html5ever"
|
||||||
version = "0.29.1"
|
version = "0.29.1"
|
||||||
|
|
@ -1121,6 +1281,15 @@ dependencies = [
|
||||||
"serde_core",
|
"serde_core",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "inout"
|
||||||
|
version = "0.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
|
||||||
|
dependencies = [
|
||||||
|
"generic-array",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ipnet"
|
name = "ipnet"
|
||||||
version = "2.12.0"
|
version = "2.12.0"
|
||||||
|
|
@ -1244,6 +1413,27 @@ version = "0.1.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
|
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lzma-rs"
|
||||||
|
version = "0.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"crc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lzma-sys"
|
||||||
|
version = "0.1.20"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"libc",
|
||||||
|
"pkg-config",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mac"
|
name = "mac"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
|
|
@ -1414,6 +1604,16 @@ version = "0.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec"
|
checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pbkdf2"
|
||||||
|
version = "0.12.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
|
||||||
|
dependencies = [
|
||||||
|
"digest",
|
||||||
|
"hmac",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pdf-extract"
|
name = "pdf-extract"
|
||||||
version = "0.7.12"
|
version = "0.7.12"
|
||||||
|
|
@ -1629,6 +1829,16 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quick-xml"
|
||||||
|
version = "0.39.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_rs",
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quinn"
|
name = "quinn"
|
||||||
version = "0.11.9"
|
version = "0.11.9"
|
||||||
|
|
@ -2220,6 +2430,17 @@ dependencies = [
|
||||||
"stable_deref_trait",
|
"stable_deref_trait",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sha1"
|
||||||
|
version = "0.10.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"cpufeatures 0.2.17",
|
||||||
|
"digest",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sharded-slab"
|
name = "sharded-slab"
|
||||||
version = "0.1.7"
|
version = "0.1.7"
|
||||||
|
|
@ -2645,6 +2866,12 @@ dependencies = [
|
||||||
"pom",
|
"pom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typed-path"
|
||||||
|
version = "0.12.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "typenum"
|
name = "typenum"
|
||||||
version = "1.19.0"
|
version = "1.19.0"
|
||||||
|
|
@ -2881,7 +3108,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.1.7"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -2901,7 +3128,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.1.7"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -2919,10 +3146,11 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.1.7"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"calamine",
|
||||||
"primp",
|
"primp",
|
||||||
"quick-xml",
|
"quick-xml 0.37.5",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
|
@ -2933,11 +3161,12 @@ dependencies = [
|
||||||
"url",
|
"url",
|
||||||
"webclaw-core",
|
"webclaw-core",
|
||||||
"webclaw-pdf",
|
"webclaw-pdf",
|
||||||
|
"zip 2.4.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.1.7"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -2950,7 +3179,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.1.7"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -2970,7 +3199,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.1.7"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
@ -3301,6 +3530,15 @@ version = "0.6.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
|
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xz2"
|
||||||
|
version = "0.1.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
|
||||||
|
dependencies = [
|
||||||
|
"lzma-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "yoke"
|
name = "yoke"
|
||||||
version = "0.8.1"
|
version = "0.8.1"
|
||||||
|
|
@ -3418,12 +3656,74 @@ dependencies = [
|
||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zip"
|
||||||
|
version = "2.4.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
|
||||||
|
dependencies = [
|
||||||
|
"aes",
|
||||||
|
"arbitrary",
|
||||||
|
"bzip2",
|
||||||
|
"constant_time_eq",
|
||||||
|
"crc32fast",
|
||||||
|
"crossbeam-utils",
|
||||||
|
"deflate64",
|
||||||
|
"displaydoc",
|
||||||
|
"flate2",
|
||||||
|
"getrandom 0.3.4",
|
||||||
|
"hmac",
|
||||||
|
"indexmap",
|
||||||
|
"lzma-rs",
|
||||||
|
"memchr",
|
||||||
|
"pbkdf2",
|
||||||
|
"sha1",
|
||||||
|
"thiserror",
|
||||||
|
"time",
|
||||||
|
"xz2",
|
||||||
|
"zeroize",
|
||||||
|
"zopfli",
|
||||||
|
"zstd",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zip"
|
||||||
|
version = "7.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
|
||||||
|
dependencies = [
|
||||||
|
"crc32fast",
|
||||||
|
"flate2",
|
||||||
|
"indexmap",
|
||||||
|
"memchr",
|
||||||
|
"typed-path",
|
||||||
|
"zopfli",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zlib-rs"
|
||||||
|
version = "0.6.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zmij"
|
name = "zmij"
|
||||||
version = "1.0.21"
|
version = "1.0.21"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zopfli"
|
||||||
|
version = "0.8.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249"
|
||||||
|
dependencies = [
|
||||||
|
"bumpalo",
|
||||||
|
"crc32fast",
|
||||||
|
"log",
|
||||||
|
"simd-adler32",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zstd"
|
name = "zstd"
|
||||||
version = "0.13.3"
|
version = "0.13.3"
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.1.7"
|
version = "0.2.0"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -95,7 +95,7 @@ struct Cli {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
urls_file: Option<String>,
|
urls_file: Option<String>,
|
||||||
|
|
||||||
/// Output format (markdown, json, text, llm)
|
/// Output format (markdown, json, text, llm, html)
|
||||||
#[arg(short, long, default_value = "markdown")]
|
#[arg(short, long, default_value = "markdown")]
|
||||||
format: OutputFormat,
|
format: OutputFormat,
|
||||||
|
|
||||||
|
|
@ -277,6 +277,7 @@ enum OutputFormat {
|
||||||
Json,
|
Json,
|
||||||
Text,
|
Text,
|
||||||
Llm,
|
Llm,
|
||||||
|
Html,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, ValueEnum)]
|
#[derive(Clone, ValueEnum)]
|
||||||
|
|
@ -394,7 +395,7 @@ fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
|
||||||
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
|
.map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
|
||||||
.unwrap_or_default(),
|
.unwrap_or_default(),
|
||||||
only_main_content: cli.only_main_content,
|
only_main_content: cli.only_main_content,
|
||||||
include_raw_html: cli.raw_html,
|
include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -417,6 +418,7 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
|
||||||
OutputFormat::Markdown | OutputFormat::Llm => "md",
|
OutputFormat::Markdown | OutputFormat::Llm => "md",
|
||||||
OutputFormat::Json => "json",
|
OutputFormat::Json => "json",
|
||||||
OutputFormat::Text => "txt",
|
OutputFormat::Text => "txt",
|
||||||
|
OutputFormat::Html => "html",
|
||||||
};
|
};
|
||||||
|
|
||||||
let parsed = url::Url::parse(raw_url);
|
let parsed = url::Url::parse(raw_url);
|
||||||
|
|
@ -470,6 +472,15 @@ fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get raw HTML from an extraction result, falling back to markdown if unavailable.
|
||||||
|
fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
|
||||||
|
result
|
||||||
|
.content
|
||||||
|
.raw_html
|
||||||
|
.as_deref()
|
||||||
|
.unwrap_or(&result.content.markdown)
|
||||||
|
}
|
||||||
|
|
||||||
/// Format an `ExtractionResult` into a string for the given output format.
|
/// Format an `ExtractionResult` into a string for the given output format.
|
||||||
fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
|
fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
|
||||||
match format {
|
match format {
|
||||||
|
|
@ -484,6 +495,7 @@ fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata
|
||||||
OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
|
OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
|
||||||
OutputFormat::Text => result.content.plain_text.clone(),
|
OutputFormat::Text => result.content.plain_text.clone(),
|
||||||
OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
|
OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
|
||||||
|
OutputFormat::Html => raw_html_or_markdown(result).to_string(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -586,6 +598,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
||||||
OutputFormat::Json => "json",
|
OutputFormat::Json => "json",
|
||||||
OutputFormat::Text => "text",
|
OutputFormat::Text => "text",
|
||||||
OutputFormat::Llm => "llm",
|
OutputFormat::Llm => "llm",
|
||||||
|
OutputFormat::Html => "html",
|
||||||
};
|
};
|
||||||
let resp = c
|
let resp = c
|
||||||
.scrape(
|
.scrape(
|
||||||
|
|
@ -618,6 +631,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
||||||
OutputFormat::Json => "json",
|
OutputFormat::Json => "json",
|
||||||
OutputFormat::Text => "text",
|
OutputFormat::Text => "text",
|
||||||
OutputFormat::Llm => "llm",
|
OutputFormat::Llm => "llm",
|
||||||
|
OutputFormat::Html => "html",
|
||||||
};
|
};
|
||||||
match c
|
match c
|
||||||
.scrape(
|
.scrape(
|
||||||
|
|
@ -793,6 +807,9 @@ fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata:
|
||||||
OutputFormat::Llm => {
|
OutputFormat::Llm => {
|
||||||
println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
|
println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
|
||||||
}
|
}
|
||||||
|
OutputFormat::Html => {
|
||||||
|
println!("{}", raw_html_or_markdown(result));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -845,6 +862,17 @@ fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
|
||||||
print_cloud_output(resp, &OutputFormat::Markdown);
|
print_cloud_output(resp, &OutputFormat::Markdown);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
OutputFormat::Html => {
|
||||||
|
if let Some(html) = resp
|
||||||
|
.get("content")
|
||||||
|
.and_then(|c| c.get("raw_html"))
|
||||||
|
.and_then(|h| h.as_str())
|
||||||
|
{
|
||||||
|
println!("{html}");
|
||||||
|
} else {
|
||||||
|
print_cloud_output(resp, &OutputFormat::Markdown);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -937,6 +965,17 @@ fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata
|
||||||
println!();
|
println!();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
OutputFormat::Html => {
|
||||||
|
for page in &result.pages {
|
||||||
|
let Some(ref extraction) = page.extraction else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
println!("---");
|
||||||
|
println!("<!-- Page: {} -->\n", page.url);
|
||||||
|
println!("{}", raw_html_or_markdown(extraction));
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1009,6 +1048,21 @@ fn print_batch_output(results: &[BatchExtractResult], format: &OutputFormat, sho
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
OutputFormat::Html => {
|
||||||
|
for r in results {
|
||||||
|
match &r.result {
|
||||||
|
Ok(extraction) => {
|
||||||
|
println!("---");
|
||||||
|
println!("<!-- {} -->\n", r.url);
|
||||||
|
println!("{}", raw_html_or_markdown(extraction));
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: {} -- {}", r.url, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1393,24 +1447,15 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn run_watch(cli: &Cli) -> Result<(), String> {
|
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
|
||||||
let raw_url = cli.urls.first().ok_or("--watch requires a URL argument")?;
|
if urls.is_empty() {
|
||||||
let url = normalize_url(raw_url);
|
return Err("--watch requires at least one URL".into());
|
||||||
|
}
|
||||||
|
|
||||||
let client =
|
let client = Arc::new(
|
||||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
|
||||||
let options = build_extraction_options(cli);
|
|
||||||
|
|
||||||
// Initial snapshot
|
|
||||||
let mut previous = client
|
|
||||||
.fetch_and_extract_with_options(&url, &options)
|
|
||||||
.await
|
|
||||||
.map_err(|e| format!("initial fetch failed: {e}"))?;
|
|
||||||
|
|
||||||
eprintln!(
|
|
||||||
"[watch] Initial snapshot: {url} ({} words)",
|
|
||||||
previous.metadata.word_count
|
|
||||||
);
|
);
|
||||||
|
let options = build_extraction_options(cli);
|
||||||
|
|
||||||
// Ctrl+C handler
|
// Ctrl+C handler
|
||||||
let cancelled = Arc::new(AtomicBool::new(false));
|
let cancelled = Arc::new(AtomicBool::new(false));
|
||||||
|
|
@ -1420,6 +1465,33 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
|
||||||
flag.store(true, Ordering::Relaxed);
|
flag.store(true, Ordering::Relaxed);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Single-URL mode: preserve original behavior exactly
|
||||||
|
if urls.len() == 1 {
|
||||||
|
return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Multi-URL mode: batch fetch, diff each, report aggregate
|
||||||
|
run_watch_multi(cli, &client, &options, urls, &cancelled).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Original single-URL watch loop -- backward compatible.
|
||||||
|
async fn run_watch_single(
|
||||||
|
cli: &Cli,
|
||||||
|
client: &Arc<FetchClient>,
|
||||||
|
options: &ExtractionOptions,
|
||||||
|
url: &str,
|
||||||
|
cancelled: &Arc<AtomicBool>,
|
||||||
|
) -> Result<(), String> {
|
||||||
|
let mut previous = client
|
||||||
|
.fetch_and_extract_with_options(url, options)
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("initial fetch failed: {e}"))?;
|
||||||
|
|
||||||
|
eprintln!(
|
||||||
|
"[watch] Initial snapshot: {url} ({} words)",
|
||||||
|
previous.metadata.word_count
|
||||||
|
);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
||||||
|
|
||||||
|
|
@ -1428,7 +1500,7 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
let current = match client.fetch_and_extract_with_options(&url, &options).await {
|
let current = match client.fetch_and_extract_with_options(url, options).await {
|
||||||
Ok(result) => result,
|
Ok(result) => result,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("[watch] Fetch error ({}): {e}", timestamp());
|
eprintln!("[watch] Fetch error ({}): {e}", timestamp());
|
||||||
|
|
@ -1454,7 +1526,6 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
|
||||||
.spawn()
|
.spawn()
|
||||||
{
|
{
|
||||||
Ok(mut child) => {
|
Ok(mut child) => {
|
||||||
// Pipe diff JSON to stdin, then detach
|
|
||||||
if let Some(mut stdin) = child.stdin.take() {
|
if let Some(mut stdin) = child.stdin.take() {
|
||||||
use tokio::io::AsyncWriteExt;
|
use tokio::io::AsyncWriteExt;
|
||||||
let _ = stdin.write_all(diff_json.as_bytes()).await;
|
let _ = stdin.write_all(diff_json.as_bytes()).await;
|
||||||
|
|
@ -1464,7 +1535,6 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fire webhook on change
|
|
||||||
if let Some(ref webhook_url) = cli.webhook {
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
fire_webhook(
|
fire_webhook(
|
||||||
webhook_url,
|
webhook_url,
|
||||||
|
|
@ -1487,6 +1557,162 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
|
||||||
|
async fn run_watch_multi(
|
||||||
|
cli: &Cli,
|
||||||
|
client: &Arc<FetchClient>,
|
||||||
|
options: &ExtractionOptions,
|
||||||
|
urls: &[String],
|
||||||
|
cancelled: &Arc<AtomicBool>,
|
||||||
|
) -> Result<(), String> {
|
||||||
|
let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect();
|
||||||
|
|
||||||
|
// Initial pass: fetch all URLs in parallel
|
||||||
|
let initial_results = client
|
||||||
|
.fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let mut snapshots = std::collections::HashMap::new();
|
||||||
|
let mut ok_count = 0usize;
|
||||||
|
let mut err_count = 0usize;
|
||||||
|
|
||||||
|
for r in initial_results {
|
||||||
|
match r.result {
|
||||||
|
Ok(extraction) => {
|
||||||
|
snapshots.insert(r.url, extraction);
|
||||||
|
ok_count += 1;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("[watch] Initial fetch error: {} -- {e}", r.url);
|
||||||
|
err_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
eprintln!(
|
||||||
|
"[watch] Watching {} URLs (interval: {}s)",
|
||||||
|
urls.len(),
|
||||||
|
cli.watch_interval
|
||||||
|
);
|
||||||
|
eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors");
|
||||||
|
|
||||||
|
let mut check_number = 0u64;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
|
||||||
|
|
||||||
|
if cancelled.load(Ordering::Relaxed) {
|
||||||
|
eprintln!("[watch] Stopped");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
check_number += 1;
|
||||||
|
|
||||||
|
let current_results = client
|
||||||
|
.fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let mut changed: Vec<serde_json::Value> = Vec::new();
|
||||||
|
let mut same_count = 0usize;
|
||||||
|
let mut fetch_errors = 0usize;
|
||||||
|
|
||||||
|
for r in current_results {
|
||||||
|
match r.result {
|
||||||
|
Ok(current) => {
|
||||||
|
if let Some(previous) = snapshots.get(&r.url) {
|
||||||
|
let diff = webclaw_core::diff::diff(previous, ¤t);
|
||||||
|
if diff.status == ChangeStatus::Same {
|
||||||
|
same_count += 1;
|
||||||
|
} else {
|
||||||
|
changed.push(serde_json::json!({
|
||||||
|
"url": r.url,
|
||||||
|
"word_count_delta": diff.word_count_delta,
|
||||||
|
}));
|
||||||
|
snapshots.insert(r.url, current);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// URL failed initially, first successful fetch -- store as baseline
|
||||||
|
snapshots.insert(r.url, current);
|
||||||
|
same_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("[watch] Fetch error: {} -- {e}", r.url);
|
||||||
|
fetch_errors += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let ts = timestamp();
|
||||||
|
let err_suffix = if fetch_errors > 0 {
|
||||||
|
format!(", {fetch_errors} errors")
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
};
|
||||||
|
|
||||||
|
if changed.is_empty() {
|
||||||
|
eprintln!(
|
||||||
|
"[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}"
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
eprintln!(
|
||||||
|
"[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}",
|
||||||
|
changed.len(),
|
||||||
|
);
|
||||||
|
for entry in &changed {
|
||||||
|
let url = entry["url"].as_str().unwrap_or("?");
|
||||||
|
let delta = entry["word_count_delta"].as_i64().unwrap_or(0);
|
||||||
|
eprintln!(" -> {url} (word delta: {delta:+})");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fire --on-change once with all changes
|
||||||
|
if let Some(ref cmd) = cli.on_change {
|
||||||
|
let payload = serde_json::json!({
|
||||||
|
"event": "watch_changes",
|
||||||
|
"check_number": check_number,
|
||||||
|
"total_urls": urls.len(),
|
||||||
|
"changed": changed.len(),
|
||||||
|
"same": same_count,
|
||||||
|
"changes": changed,
|
||||||
|
});
|
||||||
|
let payload_json = serde_json::to_string(&payload).unwrap_or_default();
|
||||||
|
eprintln!("[watch] Running: {cmd}");
|
||||||
|
match tokio::process::Command::new("sh")
|
||||||
|
.arg("-c")
|
||||||
|
.arg(cmd)
|
||||||
|
.stdin(std::process::Stdio::piped())
|
||||||
|
.spawn()
|
||||||
|
{
|
||||||
|
Ok(mut child) => {
|
||||||
|
if let Some(mut stdin) = child.stdin.take() {
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
let _ = stdin.write_all(payload_json.as_bytes()).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => eprintln!("[watch] Failed to run command: {e}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fire webhook once with aggregate payload
|
||||||
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
|
fire_webhook(
|
||||||
|
webhook_url,
|
||||||
|
&serde_json::json!({
|
||||||
|
"event": "watch_changes",
|
||||||
|
"check_number": check_number,
|
||||||
|
"total_urls": urls.len(),
|
||||||
|
"changed": changed.len(),
|
||||||
|
"same": same_count,
|
||||||
|
"changes": changed,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
|
async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
|
||||||
// Load previous snapshot
|
// Load previous snapshot
|
||||||
let snapshot_json = std::fs::read_to_string(snapshot_path)
|
let snapshot_json = std::fs::read_to_string(snapshot_path)
|
||||||
|
|
@ -1626,6 +1852,158 @@ async fn run_llm(cli: &Cli) -> Result<(), String> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Batch LLM extraction: fetch each URL, run LLM on extracted content, save/print results.
|
||||||
|
/// URLs are processed sequentially to respect LLM provider rate limits.
|
||||||
|
async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<(), String> {
|
||||||
|
let client =
|
||||||
|
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||||
|
let options = build_extraction_options(cli);
|
||||||
|
let provider = build_llm_provider(cli).await?;
|
||||||
|
let model = cli.llm_model.as_deref();
|
||||||
|
|
||||||
|
// Pre-parse schema once if --extract-json is used
|
||||||
|
let schema = if let Some(ref schema_input) = cli.extract_json {
|
||||||
|
let schema_str = if let Some(path) = schema_input.strip_prefix('@') {
|
||||||
|
std::fs::read_to_string(path)
|
||||||
|
.map_err(|e| format!("failed to read schema file {path}: {e}"))?
|
||||||
|
} else {
|
||||||
|
schema_input.clone()
|
||||||
|
};
|
||||||
|
Some(
|
||||||
|
serde_json::from_str::<serde_json::Value>(&schema_str)
|
||||||
|
.map_err(|e| format!("invalid JSON schema: {e}"))?,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
// Build custom filename lookup from entries
|
||||||
|
let custom_names: std::collections::HashMap<&str, &str> = entries
|
||||||
|
.iter()
|
||||||
|
.filter_map(|(url, name)| name.as_deref().map(|n| (url.as_str(), n)))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let total = entries.len();
|
||||||
|
let mut ok = 0usize;
|
||||||
|
let mut errors = 0usize;
|
||||||
|
let mut all_results: Vec<serde_json::Value> = Vec::with_capacity(total);
|
||||||
|
|
||||||
|
for (i, (url, _)) in entries.iter().enumerate() {
|
||||||
|
let idx = i + 1;
|
||||||
|
eprint!("[{idx}/{total}] {url} ");
|
||||||
|
|
||||||
|
// Fetch and extract page content
|
||||||
|
let extraction = match client.fetch_and_extract_with_options(url, &options).await {
|
||||||
|
Ok(r) => r,
|
||||||
|
Err(e) => {
|
||||||
|
errors += 1;
|
||||||
|
let msg = format!("fetch failed: {e}");
|
||||||
|
eprintln!("-> error: {msg}");
|
||||||
|
all_results.push(serde_json::json!({ "url": url, "error": msg }));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let text = &extraction.content.plain_text;
|
||||||
|
|
||||||
|
// Run the appropriate LLM operation
|
||||||
|
let llm_result = if let Some(ref schema) = schema {
|
||||||
|
webclaw_llm::extract::extract_json(text, schema, provider.as_ref(), model)
|
||||||
|
.await
|
||||||
|
.map(LlmOutput::Json)
|
||||||
|
} else if let Some(ref prompt) = cli.extract_prompt {
|
||||||
|
webclaw_llm::extract::extract_with_prompt(text, prompt, provider.as_ref(), model)
|
||||||
|
.await
|
||||||
|
.map(LlmOutput::Json)
|
||||||
|
} else if let Some(sentences) = cli.summarize {
|
||||||
|
webclaw_llm::summarize::summarize(text, Some(sentences), provider.as_ref(), model)
|
||||||
|
.await
|
||||||
|
.map(LlmOutput::Text)
|
||||||
|
} else {
|
||||||
|
unreachable!("run_batch_llm called without LLM flags")
|
||||||
|
};
|
||||||
|
|
||||||
|
match llm_result {
|
||||||
|
Ok(output) => {
|
||||||
|
ok += 1;
|
||||||
|
|
||||||
|
let (output_str, result_json) = match &output {
|
||||||
|
LlmOutput::Json(v) => {
|
||||||
|
let s = serde_json::to_string_pretty(v).expect("serialization failed");
|
||||||
|
let j = serde_json::json!({ "url": url, "result": v });
|
||||||
|
(s, j)
|
||||||
|
}
|
||||||
|
LlmOutput::Text(s) => {
|
||||||
|
let j = serde_json::json!({ "url": url, "result": s });
|
||||||
|
(s.clone(), j)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Count top-level fields/items for progress display
|
||||||
|
let detail = match &output {
|
||||||
|
LlmOutput::Json(v) => match v {
|
||||||
|
serde_json::Value::Object(m) => format!("{} fields", m.len()),
|
||||||
|
serde_json::Value::Array(a) => format!("{} items", a.len()),
|
||||||
|
_ => "done".to_string(),
|
||||||
|
},
|
||||||
|
LlmOutput::Text(s) => {
|
||||||
|
let words = s.split_whitespace().count();
|
||||||
|
format!("{words} words")
|
||||||
|
}
|
||||||
|
};
|
||||||
|
eprintln!("-> extracted {detail}");
|
||||||
|
|
||||||
|
if let Some(ref dir) = cli.output_dir {
|
||||||
|
let filename = custom_names
|
||||||
|
.get(url.as_str())
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
.unwrap_or_else(|| url_to_filename(url, &OutputFormat::Json));
|
||||||
|
write_to_file(dir, &filename, &output_str)?;
|
||||||
|
} else {
|
||||||
|
println!("--- {url}");
|
||||||
|
println!("{output_str}");
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
|
||||||
|
all_results.push(result_json);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
errors += 1;
|
||||||
|
let msg = format!("LLM extraction failed: {e}");
|
||||||
|
eprintln!("-> error: {msg}");
|
||||||
|
all_results.push(serde_json::json!({ "url": url, "error": msg }));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
|
||||||
|
|
||||||
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
|
fire_webhook(
|
||||||
|
webhook_url,
|
||||||
|
&serde_json::json!({
|
||||||
|
"event": "batch_llm_complete",
|
||||||
|
"total": total,
|
||||||
|
"ok": ok,
|
||||||
|
"errors": errors,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
if errors > 0 {
|
||||||
|
Err(format!("{errors} of {total} URLs failed"))
|
||||||
|
} else {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Intermediate type to hold LLM output before formatting.
|
||||||
|
enum LlmOutput {
|
||||||
|
Json(serde_json::Value),
|
||||||
|
Text(String),
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns true if any LLM flag is set.
|
/// Returns true if any LLM flag is set.
|
||||||
fn has_llm_flags(cli: &Cli) -> bool {
|
fn has_llm_flags(cli: &Cli) -> bool {
|
||||||
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
|
cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some()
|
||||||
|
|
@ -1656,9 +2034,16 @@ async fn main() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// --watch: poll a URL for changes
|
// --watch: poll URL(s) for changes
|
||||||
if cli.watch {
|
if cli.watch {
|
||||||
if let Err(e) = run_watch(&cli).await {
|
let watch_urls: Vec<String> = match collect_urls(&cli) {
|
||||||
|
Ok(entries) => entries.into_iter().map(|(url, _)| url).collect(),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if let Err(e) = run_watch(&cli, &watch_urls).await {
|
||||||
eprintln!("error: {e}");
|
eprintln!("error: {e}");
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
|
@ -1683,15 +2068,6 @@ async fn main() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// LLM modes: --extract-json, --extract-prompt, --summarize
|
|
||||||
if has_llm_flags(&cli) {
|
|
||||||
if let Err(e) = run_llm(&cli).await {
|
|
||||||
eprintln!("error: {e}");
|
|
||||||
process::exit(1);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect all URLs from args + --urls-file
|
// Collect all URLs from args + --urls-file
|
||||||
let entries = match collect_urls(&cli) {
|
let entries = match collect_urls(&cli) {
|
||||||
Ok(u) => u,
|
Ok(u) => u,
|
||||||
|
|
@ -1701,6 +2077,21 @@ async fn main() {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// LLM modes: --extract-json, --extract-prompt, --summarize
|
||||||
|
// When multiple URLs are provided, run batch LLM extraction over all of them.
|
||||||
|
if has_llm_flags(&cli) {
|
||||||
|
if entries.len() > 1 {
|
||||||
|
if let Err(e) = run_batch_llm(&cli, &entries).await {
|
||||||
|
eprintln!("error: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
} else if let Err(e) = run_llm(&cli).await {
|
||||||
|
eprintln!("error: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Multi-URL batch mode
|
// Multi-URL batch mode
|
||||||
if entries.len() > 1 {
|
if entries.len() > 1 {
|
||||||
if let Err(e) = run_batch(&cli, &entries).await {
|
if let Err(e) = run_batch(&cli, &entries).await {
|
||||||
|
|
@ -1824,6 +2215,14 @@ mod tests {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn url_to_filename_html_format() {
|
||||||
|
assert_eq!(
|
||||||
|
url_to_filename("https://example.com/docs/api", &OutputFormat::Html),
|
||||||
|
"docs/api.html"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn url_to_filename_special_chars() {
|
fn url_to_filename_special_chars() {
|
||||||
// Spaces and special chars get replaced with underscores
|
// Spaces and special chars get replaced with underscores
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,8 @@ url = "2"
|
||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
quick-xml = { version = "0.37", features = ["serde"] }
|
quick-xml = { version = "0.37", features = ["serde"] }
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
|
calamine = "0.34"
|
||||||
|
zip = "2"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
tempfile = "3"
|
tempfile = "3"
|
||||||
|
|
|
||||||
|
|
@ -399,6 +399,27 @@ impl FetchClient {
|
||||||
|
|
||||||
let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
|
let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
|
||||||
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
|
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
|
||||||
|
} else if let Some(doc_type) =
|
||||||
|
crate::document::is_document_content_type(&headers, &final_url)
|
||||||
|
{
|
||||||
|
debug!(status, doc_type = ?doc_type, "detected document response, extracting");
|
||||||
|
|
||||||
|
let bytes = response
|
||||||
|
.bytes()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||||
|
|
||||||
|
let elapsed = start.elapsed();
|
||||||
|
debug!(
|
||||||
|
status,
|
||||||
|
bytes = bytes.len(),
|
||||||
|
elapsed_ms = %elapsed.as_millis(),
|
||||||
|
"document fetch complete"
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut result = crate::document::extract_document(&bytes, doc_type)?;
|
||||||
|
result.metadata.url = Some(final_url);
|
||||||
|
Ok(result)
|
||||||
} else {
|
} else {
|
||||||
let html = response
|
let html = response
|
||||||
.text()
|
.text()
|
||||||
|
|
|
||||||
743
crates/webclaw-fetch/src/document.rs
Normal file
743
crates/webclaw-fetch/src/document.rs
Normal file
|
|
@ -0,0 +1,743 @@
|
||||||
|
/// Document extraction for DOCX, XLSX, XLS, and CSV files.
|
||||||
|
/// Auto-detects document type from Content-Type headers or URL extension,
|
||||||
|
/// then extracts text content as markdown — same pattern as PDF extraction.
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::io::{Cursor, Read};
|
||||||
|
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub enum DocType {
|
||||||
|
Docx,
|
||||||
|
Xlsx,
|
||||||
|
Xls,
|
||||||
|
Csv,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocType {
|
||||||
|
fn label(self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
DocType::Docx => "DOCX",
|
||||||
|
DocType::Xlsx => "XLSX",
|
||||||
|
DocType::Xls => "XLS",
|
||||||
|
DocType::Csv => "CSV",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect document type from response headers or URL extension.
|
||||||
|
/// Returns `None` for non-document responses (HTML, PDF, etc.).
|
||||||
|
pub fn is_document_content_type(headers: &HashMap<String, String>, url: &str) -> Option<DocType> {
|
||||||
|
// Check Content-Type header first
|
||||||
|
if let Some(ct) = headers.get("content-type") {
|
||||||
|
let mime = ct.split(';').next().unwrap_or("").trim();
|
||||||
|
|
||||||
|
if mime.eq_ignore_ascii_case(
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
) {
|
||||||
|
return Some(DocType::Docx);
|
||||||
|
}
|
||||||
|
if mime.eq_ignore_ascii_case(
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
|
) {
|
||||||
|
return Some(DocType::Xlsx);
|
||||||
|
}
|
||||||
|
if mime.eq_ignore_ascii_case("application/vnd.ms-excel") {
|
||||||
|
return Some(DocType::Xls);
|
||||||
|
}
|
||||||
|
if mime.eq_ignore_ascii_case("text/csv") {
|
||||||
|
return Some(DocType::Csv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to URL extension
|
||||||
|
let path = url.split('?').next().unwrap_or(url);
|
||||||
|
let lower = path.to_ascii_lowercase();
|
||||||
|
|
||||||
|
if lower.ends_with(".docx") {
|
||||||
|
return Some(DocType::Docx);
|
||||||
|
}
|
||||||
|
if lower.ends_with(".xlsx") {
|
||||||
|
return Some(DocType::Xlsx);
|
||||||
|
}
|
||||||
|
if lower.ends_with(".xls") {
|
||||||
|
return Some(DocType::Xls);
|
||||||
|
}
|
||||||
|
if lower.ends_with(".csv") {
|
||||||
|
return Some(DocType::Csv);
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract text content from document bytes, returning an ExtractionResult.
|
||||||
|
pub fn extract_document(
|
||||||
|
bytes: &[u8],
|
||||||
|
doc_type: DocType,
|
||||||
|
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||||
|
debug!(
|
||||||
|
doc_type = doc_type.label(),
|
||||||
|
bytes = bytes.len(),
|
||||||
|
"extracting document"
|
||||||
|
);
|
||||||
|
|
||||||
|
let markdown = match doc_type {
|
||||||
|
DocType::Docx => extract_docx(bytes)?,
|
||||||
|
DocType::Xlsx => extract_xlsx(bytes)?,
|
||||||
|
DocType::Xls => extract_xls(bytes)?,
|
||||||
|
DocType::Csv => extract_csv(bytes)?,
|
||||||
|
};
|
||||||
|
|
||||||
|
let plain_text = strip_markdown_formatting(&markdown);
|
||||||
|
let word_count = plain_text.split_whitespace().count();
|
||||||
|
|
||||||
|
Ok(webclaw_core::ExtractionResult {
|
||||||
|
metadata: webclaw_core::Metadata {
|
||||||
|
title: None,
|
||||||
|
description: None,
|
||||||
|
author: None,
|
||||||
|
published_date: None,
|
||||||
|
language: None,
|
||||||
|
url: None,
|
||||||
|
site_name: None,
|
||||||
|
image: None,
|
||||||
|
favicon: None,
|
||||||
|
word_count,
|
||||||
|
},
|
||||||
|
content: webclaw_core::Content {
|
||||||
|
markdown,
|
||||||
|
plain_text,
|
||||||
|
links: Vec::new(),
|
||||||
|
images: Vec::new(),
|
||||||
|
code_blocks: Vec::new(),
|
||||||
|
raw_html: None,
|
||||||
|
},
|
||||||
|
domain_data: None,
|
||||||
|
structured_data: vec![],
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract text from a DOCX file (ZIP of XML).
|
||||||
|
/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
|
||||||
|
fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
|
||||||
|
let cursor = Cursor::new(bytes);
|
||||||
|
let mut archive =
|
||||||
|
zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
|
||||||
|
|
||||||
|
let xml = {
|
||||||
|
let mut file = archive
|
||||||
|
.by_name("word/document.xml")
|
||||||
|
.map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?;
|
||||||
|
let mut buf = String::new();
|
||||||
|
file.read_to_string(&mut buf)
|
||||||
|
.map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
|
||||||
|
buf
|
||||||
|
};
|
||||||
|
|
||||||
|
parse_docx_xml(&xml)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse DOCX XML (word/document.xml) into markdown.
|
||||||
|
///
|
||||||
|
/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
|
||||||
|
/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
|
||||||
|
fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
|
||||||
|
use quick_xml::Reader;
|
||||||
|
use quick_xml::events::Event;
|
||||||
|
|
||||||
|
let mut reader = Reader::from_str(xml);
|
||||||
|
let mut paragraphs: Vec<String> = Vec::new();
|
||||||
|
|
||||||
|
// State tracking for the current paragraph
|
||||||
|
let mut in_paragraph = false;
|
||||||
|
let mut in_run = false; // inside <w:r> (run)
|
||||||
|
let mut in_text = false; // inside <w:t>
|
||||||
|
let mut current_text = String::new();
|
||||||
|
let mut heading_level: Option<u8> = 0.into(); // None = normal paragraph
|
||||||
|
let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match reader.read_event() {
|
||||||
|
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
|
||||||
|
let name_bytes = e.name().as_ref().to_vec();
|
||||||
|
let local = local_name(&name_bytes);
|
||||||
|
match local {
|
||||||
|
b"p" if is_w_namespace(&name_bytes) => {
|
||||||
|
in_paragraph = true;
|
||||||
|
current_text.clear();
|
||||||
|
heading_level = None;
|
||||||
|
}
|
||||||
|
b"pPr" if in_paragraph => in_ppr = true,
|
||||||
|
b"pStyle" if in_ppr => {
|
||||||
|
heading_level = extract_heading_level(e);
|
||||||
|
}
|
||||||
|
b"r" if in_paragraph => in_run = true,
|
||||||
|
b"t" if in_run => in_text = true,
|
||||||
|
b"br" if in_paragraph => {
|
||||||
|
current_text.push('\n');
|
||||||
|
}
|
||||||
|
b"tab" if in_paragraph => {
|
||||||
|
current_text.push('\t');
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Event::End(ref e)) => {
|
||||||
|
let name_bytes = e.name().as_ref().to_vec();
|
||||||
|
let local = local_name(&name_bytes);
|
||||||
|
match local {
|
||||||
|
b"p" if in_paragraph => {
|
||||||
|
let text = current_text.trim().to_string();
|
||||||
|
if !text.is_empty() {
|
||||||
|
let formatted = match heading_level {
|
||||||
|
Some(1) => format!("# {text}"),
|
||||||
|
Some(2) => format!("## {text}"),
|
||||||
|
Some(3) => format!("### {text}"),
|
||||||
|
Some(4) => format!("#### {text}"),
|
||||||
|
Some(5) => format!("##### {text}"),
|
||||||
|
Some(6) => format!("###### {text}"),
|
||||||
|
_ => text,
|
||||||
|
};
|
||||||
|
paragraphs.push(formatted);
|
||||||
|
}
|
||||||
|
in_paragraph = false;
|
||||||
|
}
|
||||||
|
b"pPr" => in_ppr = false,
|
||||||
|
b"r" => {
|
||||||
|
in_run = false;
|
||||||
|
in_text = false;
|
||||||
|
}
|
||||||
|
b"t" => in_text = false,
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Event::Text(ref e)) if in_text => {
|
||||||
|
if let Ok(text) = e.unescape() {
|
||||||
|
current_text.push_str(&text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Event::Eof) => break,
|
||||||
|
Err(e) => {
|
||||||
|
return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(paragraphs.join("\n\n"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
|
||||||
|
/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms.
|
||||||
|
fn is_w_namespace(name: &[u8]) -> bool {
|
||||||
|
// quick-xml gives us the full name bytes. Accept both "w:p" and "p".
|
||||||
|
name == b"w:p" || name == b"p"
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the local name from a possibly namespaced XML tag.
|
||||||
|
/// `w:p` -> `p`, `p` -> `p`
|
||||||
|
fn local_name(name: &[u8]) -> &[u8] {
|
||||||
|
match name.iter().position(|&b| b == b':') {
|
||||||
|
Some(pos) => &name[pos + 1..],
|
||||||
|
None => name,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract heading level from a `<w:pStyle w:val="Heading1"/>` element.
|
||||||
|
fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
|
||||||
|
for attr in e.attributes().flatten() {
|
||||||
|
let local = local_name(attr.key.as_ref());
|
||||||
|
if local == b"val" {
|
||||||
|
let val = String::from_utf8_lossy(&attr.value);
|
||||||
|
let lower = val.to_ascii_lowercase();
|
||||||
|
|
||||||
|
// Match "heading1", "heading2", etc. and "title" -> h1
|
||||||
|
if lower == "title" {
|
||||||
|
return Some(1);
|
||||||
|
}
|
||||||
|
if let Some(rest) = lower.strip_prefix("heading")
|
||||||
|
&& let Ok(n) = rest.parse::<u8>()
|
||||||
|
{
|
||||||
|
return Some(n.min(6));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract spreadsheet content using calamine (XLSX format).
|
||||||
|
fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
|
||||||
|
extract_spreadsheet(bytes, "XLSX")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract spreadsheet content using calamine (XLS format).
|
||||||
|
fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
|
||||||
|
extract_spreadsheet(bytes, "XLS")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
|
||||||
|
/// Reads all sheets and formats each as a markdown table.
|
||||||
|
fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
|
||||||
|
use calamine::Reader;
|
||||||
|
|
||||||
|
let cursor = Cursor::new(bytes);
|
||||||
|
let mut workbook: calamine::Sheets<_> = calamine::open_workbook_auto_from_rs(cursor)
|
||||||
|
.map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
|
||||||
|
|
||||||
|
let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
|
||||||
|
let mut sections: Vec<String> = Vec::new();
|
||||||
|
|
||||||
|
for name in &sheet_names {
|
||||||
|
let range = workbook
|
||||||
|
.worksheet_range(name)
|
||||||
|
.map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
|
||||||
|
|
||||||
|
let rows: Vec<Vec<String>> = range
|
||||||
|
.rows()
|
||||||
|
.map(|row| row.iter().map(cell_to_string).collect())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if rows.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut section = format!("## Sheet: {name}\n\n");
|
||||||
|
section.push_str(&rows_to_markdown_table(&rows));
|
||||||
|
sections.push(section);
|
||||||
|
}
|
||||||
|
|
||||||
|
if sections.is_empty() {
|
||||||
|
return Ok("(empty spreadsheet)".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(sections.join("\n\n"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert a calamine cell value to a display string.
|
||||||
|
fn cell_to_string(cell: &calamine::Data) -> String {
|
||||||
|
use calamine::Data;
|
||||||
|
match cell {
|
||||||
|
Data::Empty => String::new(),
|
||||||
|
Data::String(s) => s.clone(),
|
||||||
|
Data::Int(n) => n.to_string(),
|
||||||
|
Data::Float(f) => format_float(*f),
|
||||||
|
Data::Bool(b) => b.to_string(),
|
||||||
|
Data::Error(e) => format!("#{e:?}"),
|
||||||
|
Data::DateTime(dt) => format!("{dt}"),
|
||||||
|
Data::DateTimeIso(s) => s.clone(),
|
||||||
|
Data::DurationIso(s) => s.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format a float, dropping trailing `.0` for clean integer display.
|
||||||
|
fn format_float(f: f64) -> String {
|
||||||
|
if f.fract() == 0.0 && f.abs() < i64::MAX as f64 {
|
||||||
|
format!("{}", f as i64)
|
||||||
|
} else {
|
||||||
|
format!("{f}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract CSV text and convert to markdown table.
|
||||||
|
fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
|
||||||
|
let text = String::from_utf8_lossy(bytes);
|
||||||
|
let rows = parse_csv_rows(&text);
|
||||||
|
|
||||||
|
if rows.is_empty() {
|
||||||
|
return Ok("(empty CSV)".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(rows_to_markdown_table(&rows))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse CSV text into rows of trimmed fields.
///
/// Handles RFC-4180-style quoting: a quoted field may contain commas,
/// newlines, and doubled quotes (`""` becomes a literal `"`). Carriage
/// returns outside quotes are dropped (CRLF handling), and rows whose
/// fields are all empty are discarded.
fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
    let mut out: Vec<Vec<String>> = Vec::new();
    let mut row: Vec<String> = Vec::new();
    let mut field = String::new();
    let mut quoted = false;
    let mut it = text.chars().peekable();

    while let Some(c) = it.next() {
        match (quoted, c) {
            // Inside quotes: `""` is an escaped quote; a lone `"` closes.
            (true, '"') => {
                if it.next_if_eq(&'"').is_some() {
                    field.push('"');
                } else {
                    quoted = false;
                }
            }
            (true, other) => field.push(other),
            (false, '"') => quoted = true,
            (false, ',') => {
                // End of field: store trimmed text, start a fresh field.
                row.push(std::mem::take(&mut field).trim().to_string());
            }
            (false, '\n') => {
                // End of row: flush the field, keep the row only if it
                // carries any non-empty content.
                row.push(std::mem::take(&mut field).trim().to_string());
                let done = std::mem::take(&mut row);
                if done.iter().any(|f| !f.is_empty()) {
                    out.push(done);
                }
            }
            // \r is dropped; it pairs with the following \n.
            (false, '\r') => {}
            (false, other) => field.push(other),
        }
    }

    // Flush a trailing field/row not terminated by a newline.
    if !field.is_empty() || !row.is_empty() {
        row.push(field.trim().to_string());
        if row.iter().any(|f| !f.is_empty()) {
            out.push(row);
        }
    }

    out
}
|
||||||
|
|
||||||
|
/// Render rows as a markdown table; the first row becomes the header.
///
/// Ragged rows are padded with empty cells out to the widest row's
/// width. Returns an empty string when there is nothing to render.
fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
    let width = rows.iter().map(Vec::len).max().unwrap_or(0);
    if rows.is_empty() || width == 0 {
        return String::new();
    }

    // Pad a row to `width` cells and format it as one markdown line.
    let render = |row: &[String]| -> String {
        let cells: Vec<&str> = (0..width)
            .map(|i| row.get(i).map_or("", String::as_str))
            .collect();
        format!("| {} |", cells.join(" | "))
    };

    let mut lines = Vec::with_capacity(rows.len() + 1);
    lines.push(render(&rows[0]));
    // Separator row between header and data.
    lines.push(format!("| {} |", vec!["---"; width].join(" | ")));
    lines.extend(rows[1..].iter().map(|r| render(r)));
    lines.join("\n")
}
|
||||||
|
|
||||||
|
/// Reduce markdown to plain text: heading markers are dropped, table
/// separator rows are removed, and table data rows become space-joined
/// cell text.
fn strip_markdown_formatting(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    for raw in markdown.lines() {
        // Remove leading `#` heading markers and surrounding whitespace.
        let line = raw.trim_start_matches('#').trim();

        // Table separator rows carry no content.
        if line.starts_with("| ---") || line == "|---|" {
            continue;
        }

        // A line wrapped in pipes is a table row: keep the cell text.
        let table_body = line
            .strip_prefix('|')
            .and_then(|rest| rest.strip_suffix('|'));
        match table_body {
            Some(inner) => {
                let cells: Vec<&str> = inner.split('|').map(str::trim).collect();
                out.push_str(&cells.join(" "));
            }
            None => out.push_str(line),
        }
        out.push('\n');
    }
    out.trim().to_string()
}
|
||||||
|
|
||||||
|
// Unit tests for document extraction: content-type/extension detection,
// CSV parsing, DOCX XML parsing, and markdown-table rendering.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Content-type detection ---
    // is_document_content_type checks the Content-Type header first and
    // falls back to the URL extension (see precedence test below).

    #[test]
    fn test_detect_docx_content_type() {
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Docx)
        );
    }

    #[test]
    fn test_detect_xlsx_content_type() {
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xlsx)
        );
    }

    #[test]
    fn test_detect_xls_content_type() {
        // Legacy binary Excel media type.
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "application/vnd.ms-excel".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Xls)
        );
    }

    #[test]
    fn test_detect_csv_content_type() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/csv".to_string());
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_csv_content_type_with_charset() {
        // Media-type parameters (e.g. charset) must not defeat detection.
        let mut headers = HashMap::new();
        headers.insert(
            "content-type".to_string(),
            "text/csv; charset=utf-8".to_string(),
        );
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/file"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_by_url_extension() {
        // No Content-Type header: detection falls back to the URL extension.
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx"),
            Some(DocType::Docx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.xlsx"),
            Some(DocType::Xlsx)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/old.xls"),
            Some(DocType::Xls)
        );
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/data.csv"),
            Some(DocType::Csv)
        );
    }

    #[test]
    fn test_detect_url_extension_with_query() {
        // Query strings after the extension must be ignored.
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
            Some(DocType::Docx)
        );
    }

    #[test]
    fn test_detect_url_extension_case_insensitive() {
        let empty: HashMap<String, String> = HashMap::new();
        assert_eq!(
            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
            Some(DocType::Xlsx)
        );
    }

    #[test]
    fn test_detect_none_for_html() {
        // HTML is not a document type — it goes through the normal pipeline.
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/html".to_string());
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/page"),
            None
        );
    }

    #[test]
    fn test_content_type_takes_precedence_over_url() {
        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/csv".to_string());
        // URL says .xlsx but Content-Type says CSV — header wins
        assert_eq!(
            is_document_content_type(&headers, "https://example.com/data.xlsx"),
            Some(DocType::Csv)
        );
    }

    // --- CSV parsing ---

    #[test]
    fn test_csv_simple() {
        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| Name | Age | City |"));
        assert!(result.contains("| --- | --- | --- |"));
        assert!(result.contains("| Alice | 30 | NYC |"));
        assert!(result.contains("| Bob | 25 | LA |"));
    }

    #[test]
    fn test_csv_quoted_fields() {
        // Quoted commas stay in the field; "" unescapes to a literal quote.
        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("Has a, comma"));
        assert!(result.contains("Said \"hello\""));
    }

    #[test]
    fn test_csv_empty() {
        let result = extract_csv(b"").unwrap();
        assert_eq!(result, "(empty CSV)");
    }

    #[test]
    fn test_csv_windows_line_endings() {
        let csv = "A,B\r\n1,2\r\n3,4\r\n";
        let result = extract_csv(csv.as_bytes()).unwrap();
        assert!(result.contains("| A | B |"));
        assert!(result.contains("| 1 | 2 |"));
    }

    // --- DOCX XML parsing ---
    // Fixtures are minimal WordprocessingML documents; parse_docx_xml is
    // defined elsewhere in this file.

    #[test]
    fn test_docx_xml_simple_paragraphs() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
    <w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
  </w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world\n\nSecond paragraph");
    }

    #[test]
    fn test_docx_xml_headings() {
        // Heading1/Heading2 paragraph styles map to `#` / `##` markdown.
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p>
      <w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
      <w:r><w:t>Title</w:t></w:r>
    </w:p>
    <w:p><w:r><w:t>Body text</w:t></w:r></w:p>
    <w:p>
      <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
      <w:r><w:t>Subtitle</w:t></w:r>
    </w:p>
  </w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert!(result.contains("# Title"));
        assert!(result.contains("Body text"));
        assert!(result.contains("## Subtitle"));
    }

    #[test]
    fn test_docx_xml_multiple_runs() {
        // Multiple <w:r> runs in one paragraph concatenate without a break.
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p>
      <w:r><w:t>Hello </w:t></w:r>
      <w:r><w:t>world</w:t></w:r>
    </w:p>
  </w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Hello world");
    }

    #[test]
    fn test_docx_xml_empty_paragraphs_skipped() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p></w:p>
    <w:p><w:r><w:t>Content</w:t></w:r></w:p>
    <w:p><w:r><w:t> </w:t></w:r></w:p>
  </w:body>
</w:document>"#;
        let result = parse_docx_xml(xml).unwrap();
        assert_eq!(result, "Content");
    }

    // --- Markdown table ---

    #[test]
    fn test_rows_to_markdown_table() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string()],
            vec!["1".to_string(), "2".to_string()],
            vec!["3".to_string(), "4".to_string()],
        ];
        let table = rows_to_markdown_table(&rows);
        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
    }

    #[test]
    fn test_rows_to_markdown_table_ragged() {
        let rows = vec![
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string()], // fewer columns
        ];
        let table = rows_to_markdown_table(&rows);
        // NOTE(review): padded empty cells render with two spaces between
        // pipes ("| 1 |  |  |"); verify this literal's spacing was not
        // collapsed in transit — as written it may not match.
        assert!(table.contains("| 1 | | |"));
    }

    // --- Extract result ---

    #[test]
    fn test_extract_csv_result() {
        let csv = "Name,Score\nAlice,100\n";
        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
        assert!(result.content.markdown.contains("| Name | Score |"));
        assert!(result.metadata.word_count > 0);
        // CSV extraction yields no links and no domain-specific payload.
        assert!(result.content.links.is_empty());
        assert!(result.domain_data.is_none());
    }

    // --- Strip markdown ---

    #[test]
    fn test_strip_markdown() {
        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
        let plain = strip_markdown_formatting(md);
        assert!(plain.contains("Title"));
        assert!(plain.contains("Some text"));
        assert!(plain.contains("A B"));
        assert!(!plain.contains("---"));
    }
}
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
pub mod browser;
|
pub mod browser;
|
||||||
pub mod client;
|
pub mod client;
|
||||||
pub mod crawler;
|
pub mod crawler;
|
||||||
|
pub mod document;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod linkedin;
|
pub mod linkedin;
|
||||||
pub mod proxy;
|
pub mod proxy;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue