diff --git a/CHANGELOG.md b/CHANGELOG.md
index e263948..c6a0244 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,19 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.2.0] — 2026-03-26
+
+### Added
+- **DOCX extraction**: auto-detected by Content-Type or URL extension, outputs markdown with headings
+- **XLSX/XLS extraction**: spreadsheets converted to markdown tables, multi-sheet support via calamine
+- **CSV extraction**: parsed with quoted-field handling, output as a markdown table
+- **HTML output format**: `-f html` outputs the raw HTML captured during extraction, falling back to markdown when none is available
+- **Multi-URL watch**: `--watch` now works with `--urls-file` to monitor multiple URLs in parallel
+- **Batch + LLM extraction**: `--extract-prompt` and `--extract-json` now work with multiple URLs
+- **Scheduled batch watch**: watch multiple URLs with aggregate change reports and per-URL diffs
+
+---
+
 ## [0.1.7] — 2026-03-26
 
 ### Fixed
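To make the extraction entries concrete, here is a minimal sketch of the new pipeline (illustrative snippet, not part of the diff; it calls the public `webclaw_fetch::document` API added below, and the exact table string is derived from `rows_to_markdown_table`):

```rust
use webclaw_fetch::document::{DocType, extract_document};

// Hedged sketch: exercises the new public API end to end. DocType is normally
// auto-detected from the Content-Type header or URL extension by
// is_document_content_type(); here we pass it explicitly.
fn demo_csv_to_markdown() {
    let result = extract_document(b"Name,Age\nAlice,30\n", DocType::Csv)
        .expect("CSV extraction should not fail");
    assert_eq!(
        result.content.markdown,
        "| Name | Age |\n| --- | --- |\n| Alice | 30 |"
    );
}
```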
"cpufeatures 0.3.0", "rand_core 0.10.0", ] @@ -273,6 +338,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "clap" version = "4.6.0" @@ -322,6 +397,15 @@ dependencies = [ "cc", ] +[[package]] +name = "codepage" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4" +dependencies = [ + "encoding_rs", +] + [[package]] name = "colorchoice" version = "1.0.5" @@ -348,6 +432,12 @@ version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "cookie" version = "0.18.1" @@ -393,6 +483,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "cpufeatures" version = "0.3.0" @@ -402,6 +501,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.5.0" @@ -411,6 +525,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crypto-common" version = "0.1.7" @@ -478,6 +598,18 @@ dependencies = [ "syn", ] +[[package]] +name = "debug_unsafe" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2" + +[[package]] +name = "deflate64" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2" + [[package]] name = "deranged" version = "0.5.8" @@ -487,6 +619,17 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "derive_more" version = "0.99.20" @@ -506,6 +649,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", ] [[package]] @@ -601,6 +745,12 @@ dependencies = [ "num-traits", 
 ]
 
+[[package]]
+name = "fast-float2"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
+
 [[package]]
 name = "fastrand"
 version = "2.3.0"
@@ -621,6 +771,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
  "crc32fast",
  "miniz_oxide",
+ "zlib-rs",
 ]
 
@@ -857,6 +1008,15 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
+[[package]]
+name = "hmac"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
+dependencies = [
+ "digest",
+]
+
 [[package]]
 name = "html5ever"
 version = "0.29.1"
@@ -1121,6 +1281,15 @@ dependencies = [
  "serde_core",
 ]
 
+[[package]]
+name = "inout"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
+dependencies = [
+ "generic-array",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.12.0"
@@ -1244,6 +1413,27 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
 
+[[package]]
+name = "lzma-rs"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
+dependencies = [
+ "byteorder",
+ "crc",
+]
+
+[[package]]
+name = "lzma-sys"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]
+
 [[package]]
 name = "mac"
 version = "0.1.1"
@@ -1414,6 +1604,16 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec"
 
+[[package]]
+name = "pbkdf2"
+version = "0.12.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
+dependencies = [
+ "digest",
+ "hmac",
+]
+
 [[package]]
 name = "pdf-extract"
 version = "0.7.12"
@@ -1629,6 +1829,16 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "quick-xml"
+version = "0.39.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
+dependencies = [
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "quinn"
 version = "0.11.9"
@@ -2220,6 +2430,17 @@ dependencies = [
  "stable_deref_trait",
 ]
 
+[[package]]
+name = "sha1"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.2.17",
+ "digest",
+]
+
 [[package]]
 name = "sharded-slab"
 version = "0.1.7"
@@ -2645,6 +2866,12 @@ dependencies = [
  "pom",
 ]
 
+[[package]]
+name = "typed-path"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
+
 [[package]]
 name = "typenum"
 version = "1.19.0"
@@ -2881,7 +3108,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-cli"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "clap",
  "dotenvy",
@@ -2901,7 +3128,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-core"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "ego-tree",
  "once_cell",
@@ -2919,10 +3146,11 @@
 
 [[package]]
 name = "webclaw-fetch"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
+ "calamine",
  "primp",
- "quick-xml",
+ "quick-xml 0.37.5",
  "rand 0.8.5",
  "serde",
  "serde_json",
@@ -2933,11 +3161,12 @@ dependencies = [
  "url",
  "webclaw-core",
  "webclaw-pdf",
+ "zip 2.4.2",
 ]
 
 [[package]]
 name = "webclaw-llm"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "async-trait",
  "reqwest",
@@ -2950,7 +3179,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-mcp"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "dotenvy",
  "reqwest",
@@ -2970,7 +3199,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-pdf"
-version = "0.1.7"
+version = "0.2.0"
 dependencies = [
  "pdf-extract",
  "thiserror",
@@ -3301,6 +3530,15 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
 
+[[package]]
+name = "xz2"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
+dependencies = [
+ "lzma-sys",
+]
+
 [[package]]
 name = "yoke"
 version = "0.8.1"
@@ -3418,12 +3656,74 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "zip"
+version = "2.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
+dependencies = [
+ "aes",
+ "arbitrary",
+ "bzip2",
+ "constant_time_eq",
+ "crc32fast",
+ "crossbeam-utils",
+ "deflate64",
+ "displaydoc",
+ "flate2",
+ "getrandom 0.3.4",
+ "hmac",
+ "indexmap",
+ "lzma-rs",
+ "memchr",
+ "pbkdf2",
+ "sha1",
+ "thiserror",
+ "time",
+ "xz2",
+ "zeroize",
+ "zopfli",
+ "zstd",
+]
+
+[[package]]
+name = "zip"
+version = "7.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
+dependencies = [
+ "crc32fast",
+ "flate2",
+ "indexmap",
+ "memchr",
+ "typed-path",
+ "zopfli",
+]
+
+[[package]]
+name = "zlib-rs"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
+
 [[package]]
 name = "zmij"
 version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
 
+[[package]]
+name = "zopfli"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249"
+dependencies = [
+ "bumpalo",
+ "crc32fast",
+ "log",
+ "simd-adler32",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.3"
diff --git a/Cargo.toml b/Cargo.toml
index 40eada1..129f937 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]
 
 [workspace.package]
-version = "0.1.7"
+version = "0.2.0"
 edition = "2024"
 license = "MIT"
 repository = "https://github.com/0xMassi/webclaw"
diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index 4aa8a7f..f58c68b 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -95,7 +95,7 @@ struct Cli {
     #[arg(long)]
     urls_file: Option<String>,
 
-    /// Output format (markdown, json, text, llm)
+    /// Output format (markdown, json, text, llm, html)
     #[arg(short, long, default_value = "markdown")]
     format: OutputFormat,
 
@@ -277,6 +277,7 @@ enum OutputFormat {
     Json,
     Text,
     Llm,
+    Html,
 }
 
 #[derive(Clone, ValueEnum)]
@@ -394,7 +395,7 @@ fn build_extraction_options(cli: &Cli) -> ExtractionOptions {
             .map(|s| s.split(',').map(|s| s.trim().to_string()).collect())
             .unwrap_or_default(),
         only_main_content: cli.only_main_content,
-        include_raw_html: cli.raw_html,
+        include_raw_html: cli.raw_html || matches!(cli.format, OutputFormat::Html),
     }
 }
 
@@ -417,6 +418,7 @@ fn url_to_filename(raw_url: &str, format: &OutputFormat) -> String {
         OutputFormat::Markdown | OutputFormat::Llm => "md",
         OutputFormat::Json => "json",
         OutputFormat::Text => "txt",
+        OutputFormat::Html => "html",
     };
 
     let parsed = url::Url::parse(raw_url);
@@ -470,6 +472,15 @@ fn write_to_file(dir: &Path, filename: &str, content: &str) -> Result<(), String> {
     Ok(())
 }
 
+/// Get raw HTML from an extraction result, falling back to markdown if unavailable.
+fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
+    result
+        .content
+        .raw_html
+        .as_deref()
+        .unwrap_or(&result.content.markdown)
+}
+
 /// Format an `ExtractionResult` into a string for the given output format.
 fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
     match format {
@@ -484,6 +495,7 @@ fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata
         OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
         OutputFormat::Text => result.content.plain_text.clone(),
         OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
+        OutputFormat::Html => raw_html_or_markdown(result).to_string(),
     }
 }
 
@@ -586,6 +598,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
             OutputFormat::Json => "json",
             OutputFormat::Text => "text",
             OutputFormat::Llm => "llm",
+            OutputFormat::Html => "html",
         };
         let resp = c
             .scrape(
@@ -618,6 +631,7 @@ async fn fetch_and_extract(cli: &Cli) -> Result {
             OutputFormat::Json => "json",
             OutputFormat::Text => "text",
             OutputFormat::Llm => "llm",
+            OutputFormat::Html => "html",
         };
         match c
             .scrape(
@@ -793,6 +807,9 @@ fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata:
         OutputFormat::Llm => {
             println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
         }
+        OutputFormat::Html => {
+            println!("{}", raw_html_or_markdown(result));
+        }
     }
 }
 
@@ -845,6 +862,17 @@ fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
                 print_cloud_output(resp, &OutputFormat::Markdown);
             }
         }
+        OutputFormat::Html => {
+            if let Some(html) = resp
+                .get("content")
+                .and_then(|c| c.get("raw_html"))
+                .and_then(|h| h.as_str())
+            {
+                println!("{html}");
+            } else {
+                print_cloud_output(resp, &OutputFormat::Markdown);
+            }
+        }
     }
 }
 
@@ -937,6 +965,17 @@ fn print_crawl_output(result: &CrawlResult, format: &OutputFormat, show_metadata
                 println!();
             }
         }
+        OutputFormat::Html => {
+            for page in &result.pages {
+                let Some(ref extraction) = page.extraction else {
+                    continue;
+                };
+                println!("---");
+                println!("<!-- {} -->\n", page.url);
+                println!("{}", raw_html_or_markdown(extraction));
+                println!();
+            }
+        }
     }
 }
 
@@ -1009,6 +1048,21 @@ fn print_batch_output(results: &[BatchExtractResult], format: &OutputFormat, sho
             }
         }
         }
+        OutputFormat::Html => {
+            for r in results {
+                match &r.result {
+                    Ok(extraction) => {
+                        println!("---");
+                        println!("<!-- {} -->\n", r.url);
+                        println!("{}", raw_html_or_markdown(extraction));
+                        println!();
+                    }
+                    Err(e) => {
+                        eprintln!("error: {} -- {}", r.url, e);
+                    }
+                }
+            }
+        }
     }
 }
 
@@ -1393,24 +1447,15 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
     });
 }
 
-async fn run_watch(cli: &Cli) -> Result<(), String> {
-    let raw_url = cli.urls.first().ok_or("--watch requires a URL argument")?;
-    let url = normalize_url(raw_url);
+async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
+    if urls.is_empty() {
+        return Err("--watch requires at least one URL".into());
+    }
 
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
-    let options = build_extraction_options(cli);
-
-    // Initial snapshot
-    let mut previous = client
-        .fetch_and_extract_with_options(&url, &options)
-        .await
-        .map_err(|e| format!("initial fetch failed: {e}"))?;
-
-    eprintln!(
-        "[watch] Initial snapshot: {url} ({} words)",
-        previous.metadata.word_count
+    let client = Arc::new(
+        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?,
     );
+    let options = build_extraction_options(cli);
 
     // Ctrl+C handler
     let cancelled = Arc::new(AtomicBool::new(false));
@@ -1420,6 +1465,33 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
         flag.store(true, Ordering::Relaxed);
     });
 
+    // Single-URL mode: preserve original behavior exactly
+    if urls.len() == 1 {
+        return run_watch_single(cli, &client, &options, &urls[0], &cancelled).await;
+    }
+
+    // Multi-URL mode: batch fetch, diff each, report aggregate
+    run_watch_multi(cli, &client, &options, urls, &cancelled).await
+}
+
+/// Original single-URL watch loop -- backward compatible.
+async fn run_watch_single(
+    cli: &Cli,
+    client: &Arc<FetchClient>,
+    options: &ExtractionOptions,
+    url: &str,
+    cancelled: &Arc<AtomicBool>,
+) -> Result<(), String> {
+    let mut previous = client
+        .fetch_and_extract_with_options(url, options)
+        .await
+        .map_err(|e| format!("initial fetch failed: {e}"))?;
+
+    eprintln!(
+        "[watch] Initial snapshot: {url} ({} words)",
+        previous.metadata.word_count
+    );
+
     loop {
         tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
 
@@ -1428,7 +1500,7 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
             break;
         }
 
-        let current = match client.fetch_and_extract_with_options(&url, &options).await {
+        let current = match client.fetch_and_extract_with_options(url, options).await {
             Ok(result) => result,
             Err(e) => {
                 eprintln!("[watch] Fetch error ({}): {e}", timestamp());
@@ -1454,7 +1526,6 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
             .spawn()
         {
             Ok(mut child) => {
-                // Pipe diff JSON to stdin, then detach
                 if let Some(mut stdin) = child.stdin.take() {
                     use tokio::io::AsyncWriteExt;
                     let _ = stdin.write_all(diff_json.as_bytes()).await;
                 }
             }
 
-        // Fire webhook on change
         if let Some(ref webhook_url) = cli.webhook {
             fire_webhook(
                 webhook_url,
@@ -1487,6 +1557,162 @@ async fn run_watch(cli: &Cli) -> Result<(), String> {
     Ok(())
 }
 
+/// Multi-URL watch loop -- batch fetch all URLs, diff each, report aggregate.
+async fn run_watch_multi(
+    cli: &Cli,
+    client: &Arc<FetchClient>,
+    options: &ExtractionOptions,
+    urls: &[String],
+    cancelled: &Arc<AtomicBool>,
+) -> Result<(), String> {
+    let url_refs: Vec<&str> = urls.iter().map(|u| u.as_str()).collect();
+
+    // Initial pass: fetch all URLs in parallel
+    let initial_results = client
+        .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+        .await;
+
+    let mut snapshots = std::collections::HashMap::new();
+    let mut ok_count = 0usize;
+    let mut err_count = 0usize;
+
+    for r in initial_results {
+        match r.result {
+            Ok(extraction) => {
+                snapshots.insert(r.url, extraction);
+                ok_count += 1;
+            }
+            Err(e) => {
+                eprintln!("[watch] Initial fetch error: {} -- {e}", r.url);
+                err_count += 1;
+            }
+        }
+    }
+
+    eprintln!(
+        "[watch] Watching {} URLs (interval: {}s)",
+        urls.len(),
+        cli.watch_interval
+    );
+    eprintln!("[watch] Initial snapshots: {ok_count} ok, {err_count} errors");
+
+    let mut check_number = 0u64;
+
+    loop {
+        tokio::time::sleep(std::time::Duration::from_secs(cli.watch_interval)).await;
+
+        if cancelled.load(Ordering::Relaxed) {
+            eprintln!("[watch] Stopped");
+            break;
+        }
+
+        check_number += 1;
+
+        let current_results = client
+            .fetch_and_extract_batch_with_options(&url_refs, cli.concurrency, options)
+            .await;
+
+        let mut changed: Vec<serde_json::Value> = Vec::new();
+        let mut same_count = 0usize;
+        let mut fetch_errors = 0usize;
+
+        for r in current_results {
+            match r.result {
+                Ok(current) => {
+                    if let Some(previous) = snapshots.get(&r.url) {
+                        let diff = webclaw_core::diff::diff(previous, &current);
+                        if diff.status == ChangeStatus::Same {
+                            same_count += 1;
+                        } else {
+                            changed.push(serde_json::json!({
+                                "url": r.url,
+                                "word_count_delta": diff.word_count_delta,
+                            }));
+                            snapshots.insert(r.url, current);
+                        }
+                    } else {
+                        // URL failed initially, first successful fetch -- store as baseline
+                        snapshots.insert(r.url, current);
+                        same_count += 1;
+                    }
+                }
+                Err(e) => {
+                    eprintln!("[watch] Fetch error: {} -- {e}", r.url);
+                    fetch_errors += 1;
+                }
+            }
+        }
+
+        let ts = timestamp();
+        let err_suffix = if fetch_errors > 0 {
+            format!(", {fetch_errors} errors")
+        } else {
+            String::new()
+        };
+
+        if changed.is_empty() {
+            eprintln!(
+                "[watch] Check {check_number} ({ts}): 0 changed, {same_count} same{err_suffix}"
+            );
+        } else {
+            eprintln!(
+                "[watch] Check {check_number} ({ts}): {} changed, {same_count} same{err_suffix}",
+                changed.len(),
+            );
+            for entry in &changed {
+                let url = entry["url"].as_str().unwrap_or("?");
+                let delta = entry["word_count_delta"].as_i64().unwrap_or(0);
+                eprintln!(" -> {url} (word delta: {delta:+})");
+            }
+
+            // Fire --on-change once with all changes
+            if let Some(ref cmd) = cli.on_change {
+                let payload = serde_json::json!({
+                    "event": "watch_changes",
+                    "check_number": check_number,
+                    "total_urls": urls.len(),
+                    "changed": changed.len(),
+                    "same": same_count,
+                    "changes": changed,
+                });
+                let payload_json = serde_json::to_string(&payload).unwrap_or_default();
+                eprintln!("[watch] Running: {cmd}");
+                match tokio::process::Command::new("sh")
+                    .arg("-c")
+                    .arg(cmd)
+                    .stdin(std::process::Stdio::piped())
+                    .spawn()
+                {
+                    Ok(mut child) => {
+                        if let Some(mut stdin) = child.stdin.take() {
+                            use tokio::io::AsyncWriteExt;
+                            let _ = stdin.write_all(payload_json.as_bytes()).await;
+                        }
+                    }
+                    Err(e) => eprintln!("[watch] Failed to run command: {e}"),
+                }
+            }
+
+            // Fire webhook once with aggregate payload
+            if let Some(ref webhook_url) = cli.webhook {
+                fire_webhook(
+                    webhook_url,
+                    &serde_json::json!({
+                        "event": "watch_changes",
+                        "check_number": check_number,
+                        "total_urls": urls.len(),
+                        "changed": changed.len(),
+                        "same": same_count,
+                        "changes": changed,
+                    }),
+                );
+            }
+        }
+    }
+
+    Ok(())
+}
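For reference, a minimal consumer for the aggregate payload that `run_watch_multi` pipes to the `--on-change` command (a hedged sketch of a hypothetical helper binary, not part of this PR; the field names mirror the `json!` payload above):

```rust
use std::io::Read;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // run_watch_multi writes one JSON object per changed check to our stdin.
    let mut buf = String::new();
    std::io::stdin().read_to_string(&mut buf)?;
    let payload: serde_json::Value = serde_json::from_str(&buf)?;

    for change in payload["changes"].as_array().into_iter().flatten() {
        println!(
            "{}: {:+} words",
            change["url"].as_str().unwrap_or("?"),
            change["word_count_delta"].as_i64().unwrap_or(0)
        );
    }
    Ok(())
}
```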
format!("{words} words") + } + }; + eprintln!("-> extracted {detail}"); + + if let Some(ref dir) = cli.output_dir { + let filename = custom_names + .get(url.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| url_to_filename(url, &OutputFormat::Json)); + write_to_file(dir, &filename, &output_str)?; + } else { + println!("--- {url}"); + println!("{output_str}"); + println!(); + } + + all_results.push(result_json); + } + Err(e) => { + errors += 1; + let msg = format!("LLM extraction failed: {e}"); + eprintln!("-> error: {msg}"); + all_results.push(serde_json::json!({ "url": url, "error": msg })); + } + } + } + + eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)"); + + if let Some(ref webhook_url) = cli.webhook { + fire_webhook( + webhook_url, + &serde_json::json!({ + "event": "batch_llm_complete", + "total": total, + "ok": ok, + "errors": errors, + }), + ); + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } + + if errors > 0 { + Err(format!("{errors} of {total} URLs failed")) + } else { + Ok(()) + } +} + +/// Intermediate type to hold LLM output before formatting. +enum LlmOutput { + Json(serde_json::Value), + Text(String), +} + /// Returns true if any LLM flag is set. fn has_llm_flags(cli: &Cli) -> bool { cli.extract_json.is_some() || cli.extract_prompt.is_some() || cli.summarize.is_some() @@ -1656,9 +2034,16 @@ async fn main() { return; } - // --watch: poll a URL for changes + // --watch: poll URL(s) for changes if cli.watch { - if let Err(e) = run_watch(&cli).await { + let watch_urls: Vec = match collect_urls(&cli) { + Ok(entries) => entries.into_iter().map(|(url, _)| url).collect(), + Err(e) => { + eprintln!("error: {e}"); + process::exit(1); + } + }; + if let Err(e) = run_watch(&cli, &watch_urls).await { eprintln!("error: {e}"); process::exit(1); } @@ -1683,15 +2068,6 @@ async fn main() { return; } - // LLM modes: --extract-json, --extract-prompt, --summarize - if has_llm_flags(&cli) { - if let Err(e) = run_llm(&cli).await { - eprintln!("error: {e}"); - process::exit(1); - } - return; - } - // Collect all URLs from args + --urls-file let entries = match collect_urls(&cli) { Ok(u) => u, @@ -1701,6 +2077,21 @@ async fn main() { } }; + // LLM modes: --extract-json, --extract-prompt, --summarize + // When multiple URLs are provided, run batch LLM extraction over all of them. 
+    if has_llm_flags(&cli) {
+        if entries.len() > 1 {
+            if let Err(e) = run_batch_llm(&cli, &entries).await {
+                eprintln!("error: {e}");
+                process::exit(1);
+            }
+        } else if let Err(e) = run_llm(&cli).await {
+            eprintln!("error: {e}");
+            process::exit(1);
+        }
+        return;
+    }
+
     // Multi-URL batch mode
     if entries.len() > 1 {
         if let Err(e) = run_batch(&cli, &entries).await {
@@ -1824,6 +2215,14 @@ mod tests {
         );
     }
 
+    #[test]
+    fn url_to_filename_html_format() {
+        assert_eq!(
+            url_to_filename("https://example.com/docs/api", &OutputFormat::Html),
+            "docs/api.html"
+        );
+    }
+
     #[test]
     fn url_to_filename_special_chars() {
         // Spaces and special chars get replaced with underscores
diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml
index 7ea9625..e4da69b 100644
--- a/crates/webclaw-fetch/Cargo.toml
+++ b/crates/webclaw-fetch/Cargo.toml
@@ -19,6 +19,8 @@ url = "2"
 rand = "0.8"
 quick-xml = { version = "0.37", features = ["serde"] }
 serde_json.workspace = true
+calamine = "0.34"
+zip = "2"
 
 [dev-dependencies]
 tempfile = "3"
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index 4af675e..5b8526e 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -399,6 +399,27 @@
             let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
 
             Ok(pdf_to_extraction_result(&pdf_result, &final_url))
+        } else if let Some(doc_type) =
+            crate::document::is_document_content_type(&headers, &final_url)
+        {
+            debug!(status, doc_type = ?doc_type, "detected document response, extracting");
+
+            let bytes = response
+                .bytes()
+                .await
+                .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
+
+            let elapsed = start.elapsed();
+            debug!(
+                status,
+                bytes = bytes.len(),
+                elapsed_ms = %elapsed.as_millis(),
+                "document fetch complete"
+            );
+
+            let mut result = crate::document::extract_document(&bytes, doc_type)?;
+            result.metadata.url = Some(final_url);
+            Ok(result)
         } else {
             let html = response
                 .text()
diff --git a/crates/webclaw-fetch/src/document.rs b/crates/webclaw-fetch/src/document.rs
new file mode 100644
index 0000000..0291d52
--- /dev/null
+++ b/crates/webclaw-fetch/src/document.rs
@@ -0,0 +1,743 @@
+//! Document extraction for DOCX, XLSX, XLS, and CSV files.
+//! Auto-detects document type from Content-Type headers or URL extension,
+//! then extracts text content as markdown -- same pattern as PDF extraction.
+
+use std::collections::HashMap;
+use std::io::{Cursor, Read};
+
+use tracing::debug;
+
+use crate::error::FetchError;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DocType {
+    Docx,
+    Xlsx,
+    Xls,
+    Csv,
+}
+
+impl DocType {
+    fn label(self) -> &'static str {
+        match self {
+            DocType::Docx => "DOCX",
+            DocType::Xlsx => "XLSX",
+            DocType::Xls => "XLS",
+            DocType::Csv => "CSV",
+        }
+    }
+}
+
+/// Detect document type from response headers or URL extension.
+/// Returns `None` for non-document responses (HTML, PDF, etc.).
+pub fn is_document_content_type(
+    headers: &HashMap<String, String>,
+    url: &str,
+) -> Option<DocType> {
+    // Check Content-Type header first
+    if let Some(ct) = headers.get("content-type") {
+        let mime = ct.split(';').next().unwrap_or("").trim();
+
+        if mime.eq_ignore_ascii_case(
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ) {
+            return Some(DocType::Docx);
+        }
+        if mime.eq_ignore_ascii_case(
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        ) {
+            return Some(DocType::Xlsx);
+        }
+        if mime.eq_ignore_ascii_case("application/vnd.ms-excel") {
+            return Some(DocType::Xls);
+        }
+        if mime.eq_ignore_ascii_case("text/csv") {
+            return Some(DocType::Csv);
+        }
+    }
+
+    // Fall back to URL extension
+    let path = url.split('?').next().unwrap_or(url);
+    let lower = path.to_ascii_lowercase();
+
+    if lower.ends_with(".docx") {
+        return Some(DocType::Docx);
+    }
+    if lower.ends_with(".xlsx") {
+        return Some(DocType::Xlsx);
+    }
+    if lower.ends_with(".xls") {
+        return Some(DocType::Xls);
+    }
+    if lower.ends_with(".csv") {
+        return Some(DocType::Csv);
+    }
+
+    None
+}
+
+/// Extract text content from document bytes, returning an ExtractionResult.
+pub fn extract_document(
+    bytes: &[u8],
+    doc_type: DocType,
+) -> Result<webclaw_core::ExtractionResult, FetchError> {
+    debug!(
+        doc_type = doc_type.label(),
+        bytes = bytes.len(),
+        "extracting document"
+    );
+
+    let markdown = match doc_type {
+        DocType::Docx => extract_docx(bytes)?,
+        DocType::Xlsx => extract_xlsx(bytes)?,
+        DocType::Xls => extract_xls(bytes)?,
+        DocType::Csv => extract_csv(bytes)?,
+    };
+
+    let plain_text = strip_markdown_formatting(&markdown);
+    let word_count = plain_text.split_whitespace().count();
+
+    Ok(webclaw_core::ExtractionResult {
+        metadata: webclaw_core::Metadata {
+            title: None,
+            description: None,
+            author: None,
+            published_date: None,
+            language: None,
+            url: None,
+            site_name: None,
+            image: None,
+            favicon: None,
+            word_count,
+        },
+        content: webclaw_core::Content {
+            markdown,
+            plain_text,
+            links: Vec::new(),
+            images: Vec::new(),
+            code_blocks: Vec::new(),
+            raw_html: None,
+        },
+        domain_data: None,
+        structured_data: vec![],
+    })
+}
+
+/// Extract text from a DOCX file (ZIP of XML).
+/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
+fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
+    let cursor = Cursor::new(bytes);
+    let mut archive =
+        zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
+
+    let xml = {
+        let mut file = archive
+            .by_name("word/document.xml")
+            .map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?;
+        let mut buf = String::new();
+        file.read_to_string(&mut buf)
+            .map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
+        buf
+    };
+
+    parse_docx_xml(&xml)
+}
+
+/// Parse DOCX XML (word/document.xml) into markdown.
+///
+/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
+/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
+fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
+    use quick_xml::Reader;
+    use quick_xml::events::Event;
+
+    let mut reader = Reader::from_str(xml);
+    let mut paragraphs: Vec<String> = Vec::new();
+
+    // State tracking for the current paragraph
+    let mut in_paragraph = false;
+    let mut in_run = false; // inside <w:r> (run)
+    let mut in_text = false; // inside <w:t>
+    let mut current_text = String::new();
+    let mut heading_level: Option<u8> = None; // None = normal paragraph
+    let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
+
+    loop {
+        match reader.read_event() {
+            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
+                let name_bytes = e.name().as_ref().to_vec();
+                let local = local_name(&name_bytes);
+                match local {
+                    b"p" if is_w_namespace(&name_bytes) => {
+                        in_paragraph = true;
+                        current_text.clear();
+                        heading_level = None;
+                    }
+                    b"pPr" if in_paragraph => in_ppr = true,
+                    b"pStyle" if in_ppr => {
+                        heading_level = extract_heading_level(e);
+                    }
+                    b"r" if in_paragraph => in_run = true,
+                    b"t" if in_run => in_text = true,
+                    b"br" if in_paragraph => {
+                        current_text.push('\n');
+                    }
+                    b"tab" if in_paragraph => {
+                        current_text.push('\t');
+                    }
+                    _ => {}
+                }
+            }
+            Ok(Event::End(ref e)) => {
+                let name_bytes = e.name().as_ref().to_vec();
+                let local = local_name(&name_bytes);
+                match local {
+                    b"p" if in_paragraph => {
+                        let text = current_text.trim().to_string();
+                        if !text.is_empty() {
+                            let formatted = match heading_level {
+                                Some(1) => format!("# {text}"),
+                                Some(2) => format!("## {text}"),
+                                Some(3) => format!("### {text}"),
+                                Some(4) => format!("#### {text}"),
+                                Some(5) => format!("##### {text}"),
+                                Some(6) => format!("###### {text}"),
+                                _ => text,
+                            };
+                            paragraphs.push(formatted);
+                        }
+                        in_paragraph = false;
+                    }
+                    b"pPr" => in_ppr = false,
+                    b"r" => {
+                        in_run = false;
+                        in_text = false;
+                    }
+                    b"t" => in_text = false,
+                    _ => {}
+                }
+            }
+            Ok(Event::Text(ref e)) if in_text => {
+                if let Ok(text) = e.unescape() {
+                    current_text.push_str(&text);
+                }
+            }
+            Ok(Event::Eof) => break,
+            Err(e) => {
+                return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
+            }
+            _ => {}
+        }
+    }
+
+    Ok(paragraphs.join("\n\n"))
+}
+
+/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
+/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms.
+fn is_w_namespace(name: &[u8]) -> bool {
+    // quick-xml gives us the full name bytes. Accept both "w:p" and "p".
+    name == b"w:p" || name == b"p"
+}
+
+/// Extract the local name from a possibly namespaced XML tag.
+/// `w:p` -> `p`, `p` -> `p`
+fn local_name(name: &[u8]) -> &[u8] {
+    match name.iter().position(|&b| b == b':') {
+        Some(pos) => &name[pos + 1..],
+        None => name,
+    }
+}
+
+/// Extract heading level from a `<w:pStyle>` element.
+fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
+    for attr in e.attributes().flatten() {
+        let local = local_name(attr.key.as_ref());
+        if local == b"val" {
+            let val = String::from_utf8_lossy(&attr.value);
+            let lower = val.to_ascii_lowercase();
+
+            // Match "heading1", "heading2", etc. and "title" -> h1
+            if lower == "title" {
+                return Some(1);
+            }
+            if let Some(rest) = lower.strip_prefix("heading")
+                && let Ok(n) = rest.parse::<u8>()
+            {
+                return Some(n.min(6));
+            }
+        }
+    }
+    None
+}
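The clamping and the Title alias in `extract_heading_level` are easy to regress; a sketch of an extra unit test (not in this PR) that pins them down via `parse_docx_xml`:

```rust
#[test]
fn test_docx_heading_clamp_and_title() {
    // Hypothetical test: "Title" maps to h1, and heading levels past 6
    // clamp to h6 via n.min(6).
    let xml = r#"<w:document><w:body>
<w:p><w:pPr><w:pStyle w:val="Title"/></w:pPr><w:r><w:t>Doc title</w:t></w:r></w:p>
<w:p><w:pPr><w:pStyle w:val="Heading9"/></w:pPr><w:r><w:t>Deep</w:t></w:r></w:p>
</w:body></w:document>"#;
    let result = parse_docx_xml(xml).unwrap();
    assert!(result.contains("# Doc title"));
    assert!(result.contains("###### Deep"));
}
```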
+
+/// Extract spreadsheet content using calamine (XLSX format).
+fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
+    extract_spreadsheet(bytes, "XLSX")
+}
+
+/// Extract spreadsheet content using calamine (XLS format).
+fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
+    extract_spreadsheet(bytes, "XLS")
+}
+
+/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
+/// Reads all sheets and formats each as a markdown table.
+fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
+    use calamine::Reader;
+
+    let cursor = Cursor::new(bytes);
+    let mut workbook: calamine::Sheets<_> = calamine::open_workbook_auto_from_rs(cursor)
+        .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
+
+    let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
+    let mut sections: Vec<String> = Vec::new();
+
+    for name in &sheet_names {
+        let range = workbook
+            .worksheet_range(name)
+            .map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
+
+        let rows: Vec<Vec<String>> = range
+            .rows()
+            .map(|row| row.iter().map(cell_to_string).collect())
+            .collect();
+
+        if rows.is_empty() {
+            continue;
+        }
+
+        let mut section = format!("## Sheet: {name}\n\n");
+        section.push_str(&rows_to_markdown_table(&rows));
+        sections.push(section);
+    }
+
+    if sections.is_empty() {
+        return Ok("(empty spreadsheet)".to_string());
+    }
+
+    Ok(sections.join("\n\n"))
+}
+
+/// Convert a calamine cell value to a display string.
+fn cell_to_string(cell: &calamine::Data) -> String {
+    use calamine::Data;
+    match cell {
+        Data::Empty => String::new(),
+        Data::String(s) => s.clone(),
+        Data::Int(n) => n.to_string(),
+        Data::Float(f) => format_float(*f),
+        Data::Bool(b) => b.to_string(),
+        Data::Error(e) => format!("#{e:?}"),
+        Data::DateTime(dt) => format!("{dt}"),
+        Data::DateTimeIso(s) => s.clone(),
+        Data::DurationIso(s) => s.clone(),
+    }
+}
+
+/// Format a float, dropping trailing `.0` for clean integer display.
+fn format_float(f: f64) -> String {
+    if f.fract() == 0.0 && f.abs() < i64::MAX as f64 {
+        format!("{}", f as i64)
+    } else {
+        format!("{f}")
+    }
+}
+
+/// Extract CSV text and convert to markdown table.
+fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
+    let text = String::from_utf8_lossy(bytes);
+    let rows = parse_csv_rows(&text);
+
+    if rows.is_empty() {
+        return Ok("(empty CSV)".to_string());
+    }
+
+    Ok(rows_to_markdown_table(&rows))
+}
+
+/// Parse CSV text into rows of fields, handling quoted fields with commas/newlines.
+fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
+    let mut rows: Vec<Vec<String>> = Vec::new();
+    let mut current_row: Vec<String> = Vec::new();
+    let mut current_field = String::new();
+    let mut in_quotes = false;
+    let mut chars = text.chars().peekable();
+
+    while let Some(ch) = chars.next() {
+        if in_quotes {
+            if ch == '"' {
+                // Escaped quote ("") or end of quoted field
+                if chars.peek() == Some(&'"') {
+                    chars.next();
+                    current_field.push('"');
+                } else {
+                    in_quotes = false;
+                }
+            } else {
+                current_field.push(ch);
+            }
+        } else {
+            match ch {
+                '"' => in_quotes = true,
+                ',' => {
+                    current_row.push(current_field.trim().to_string());
+                    current_field = String::new();
+                }
+                '\n' => {
+                    current_row.push(current_field.trim().to_string());
+                    current_field = String::new();
+                    if !current_row.iter().all(|f| f.is_empty()) {
+                        rows.push(current_row);
+                    }
+                    current_row = Vec::new();
+                }
+                '\r' => {
+                    // Skip carriage returns (handled with \n)
+                }
+                _ => current_field.push(ch),
+            }
+        }
+    }
+
+    // Flush last field/row
+    if !current_field.is_empty() || !current_row.is_empty() {
+        current_row.push(current_field.trim().to_string());
+        if !current_row.iter().all(|f| f.is_empty()) {
+            rows.push(current_row);
+        }
+    }
+
+    rows
+}
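One behavior of `parse_csv_rows` worth pinning in a test: a quoted field may contain a raw newline, which must not terminate the row. A sketch of such a test (not in this PR, derived from the state machine above):

```rust
#[test]
fn test_csv_quoted_newline() {
    // The '\n' inside the quoted field stays in the field instead of
    // splitting the row, so two rows come back, not three.
    let rows = parse_csv_rows("Name,Note\nAlice,\"line one\nline two\"\n");
    assert_eq!(rows.len(), 2);
    assert_eq!(rows[1][1], "line one\nline two");
}
```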
+
+/// Convert rows (first row = header) into a markdown table.
+fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
+    if rows.is_empty() {
+        return String::new();
+    }
+
+    // Find the max column count across all rows
+    let col_count = rows.iter().map(|r| r.len()).max().unwrap_or(0);
+    if col_count == 0 {
+        return String::new();
+    }
+
+    let mut lines: Vec<String> = Vec::new();
+
+    // Header row
+    let header = &rows[0];
+    let header_cells: Vec<&str> = (0..col_count)
+        .map(|i| header.get(i).map(|s| s.as_str()).unwrap_or(""))
+        .collect();
+    lines.push(format!("| {} |", header_cells.join(" | ")));
+
+    // Separator row
+    let sep: Vec<&str> = vec!["---"; col_count];
+    lines.push(format!("| {} |", sep.join(" | ")));
+
+    // Data rows
+    for row in &rows[1..] {
+        let cells: Vec<&str> = (0..col_count)
+            .map(|i| row.get(i).map(|s| s.as_str()).unwrap_or(""))
+            .collect();
+        lines.push(format!("| {} |", cells.join(" | ")));
+    }
+
+    lines.join("\n")
+}
+
+/// Strip markdown formatting to get plain text.
+fn strip_markdown_formatting(markdown: &str) -> String {
+    let mut plain = String::with_capacity(markdown.len());
+    for line in markdown.lines() {
+        let trimmed = line.trim_start_matches('#').trim();
+        if trimmed.starts_with("| ---") || trimmed == "|---|" {
+            continue; // Skip separator rows
+        }
+        if let Some(stripped) = trimmed.strip_prefix('|')
+            && let Some(stripped) = stripped.strip_suffix('|')
+        {
+            // Table row: join cells with spaces
+            let cells: Vec<&str> = stripped.split('|').map(|c| c.trim()).collect();
+            plain.push_str(&cells.join(" "));
+            plain.push('\n');
+            continue;
+        }
+        plain.push_str(trimmed);
+        plain.push('\n');
+    }
+    plain.trim().to_string()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // --- Content-type detection ---
+
+    #[test]
+    fn test_detect_docx_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Docx)
+        );
+    }
+
+    #[test]
+    fn test_detect_xlsx_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Xlsx)
+        );
+    }
+
+    #[test]
+    fn test_detect_xls_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.ms-excel".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Xls)
+        );
+    }
+
+    #[test]
+    fn test_detect_csv_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/csv".to_string());
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Csv)
+        );
+    }
+
+    #[test]
+    fn test_detect_csv_content_type_with_charset() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "text/csv; charset=utf-8".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Csv)
+        );
+    }
+
+    #[test]
+    fn test_detect_by_url_extension() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/report.docx"),
+            Some(DocType::Docx)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/data.xlsx"),
+            Some(DocType::Xlsx)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/old.xls"),
+            Some(DocType::Xls)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/data.csv"),
+            Some(DocType::Csv)
+        );
+    }
+
+    #[test]
+    fn test_detect_url_extension_with_query() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
+            Some(DocType::Docx)
+        );
+    }
+
+    #[test]
+    fn test_detect_url_extension_case_insensitive() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
+            Some(DocType::Xlsx)
+        );
+    }
+
+    #[test]
+    fn test_detect_none_for_html() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/html".to_string());
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/page"),
+            None
+        );
+    }
+
+    #[test]
+    fn test_content_type_takes_precedence_over_url() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/csv".to_string());
+        // URL says .xlsx but Content-Type says CSV -- header wins
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/data.xlsx"),
+            Some(DocType::Csv)
+        );
+    }
+
+    // --- CSV parsing ---
+
+    #[test]
+    fn test_csv_simple() {
+        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("| Name | Age | City |"));
+        assert!(result.contains("| --- | --- | --- |"));
+        assert!(result.contains("| Alice | 30 | NYC |"));
+        assert!(result.contains("| Bob | 25 | LA |"));
+    }
+
+    #[test]
+    fn test_csv_quoted_fields() {
+        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("Has a, comma"));
+        assert!(result.contains("Said \"hello\""));
+    }
+
+    #[test]
+    fn test_csv_empty() {
+        let result = extract_csv(b"").unwrap();
+        assert_eq!(result, "(empty CSV)");
+    }
+
+    #[test]
+    fn test_csv_windows_line_endings() {
+        let csv = "A,B\r\n1,2\r\n3,4\r\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("| A | B |"));
+        assert!(result.contains("| 1 | 2 |"));
+    }
+
+    // --- DOCX XML parsing ---
+
+    #[test]
+    fn test_docx_xml_simple_paragraphs() {
+        let xml = r#"<w:document>
+<w:body>
+<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
+<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
+</w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Hello world\n\nSecond paragraph");
+    }
+
+    #[test]
+    fn test_docx_xml_headings() {
+        let xml = r#"<w:document>
+<w:body>
+<w:p>
+<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
+<w:r><w:t>Title</w:t></w:r>
+</w:p>
+<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
+<w:p>
+<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+<w:r><w:t>Subtitle</w:t></w:r>
+</w:p>
+</w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert!(result.contains("# Title"));
+        assert!(result.contains("Body text"));
+        assert!(result.contains("## Subtitle"));
+    }
+
+    #[test]
+    fn test_docx_xml_multiple_runs() {
+        let xml = r#"<w:document>
+<w:body>
+<w:p>
+<w:r><w:t>Hello </w:t></w:r>
+<w:r><w:t>world</w:t></w:r>
+</w:p>
+</w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Hello world");
+    }
+
+    #[test]
+    fn test_docx_xml_empty_paragraphs_skipped() {
+        let xml = r#"<w:document>
+<w:body>
+<w:p></w:p>
+<w:p><w:r><w:t>Content</w:t></w:r></w:p>
+</w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Content");
+    }
+
+    // --- Markdown table ---
+
+    #[test]
+    fn test_rows_to_markdown_table() {
+        let rows = vec![
+            vec!["A".to_string(), "B".to_string()],
+            vec!["1".to_string(), "2".to_string()],
+            vec!["3".to_string(), "4".to_string()],
+        ];
+        let table = rows_to_markdown_table(&rows);
+        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
+    }
+
+    #[test]
+    fn test_rows_to_markdown_table_ragged() {
+        let rows = vec![
+            vec!["A".to_string(), "B".to_string(), "C".to_string()],
+            vec!["1".to_string()], // fewer columns
+        ];
+        let table = rows_to_markdown_table(&rows);
+        assert!(table.contains("| 1 |  |  |")); // empty cells pad to double spaces
+    }
+
+    // --- Extract result ---
+
+    #[test]
+    fn test_extract_csv_result() {
+        let csv = "Name,Score\nAlice,100\n";
+        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
+        assert!(result.content.markdown.contains("| Name | Score |"));
+        assert!(result.metadata.word_count > 0);
+        assert!(result.content.links.is_empty());
+        assert!(result.domain_data.is_none());
+    }
+
+    // --- Strip markdown ---
+
+    #[test]
+    fn test_strip_markdown() {
+        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
+        let plain = strip_markdown_formatting(md);
+        assert!(plain.contains("Title"));
+        assert!(plain.contains("Some text"));
+        assert!(plain.contains("A B"));
+        assert!(!plain.contains("---"));
+    }
+}
diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs
index c5cd40b..373eb8a 100644
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@@ -5,6 +5,7 @@
 pub mod browser;
 pub mod client;
 pub mod crawler;
+pub mod document;
 pub mod error;
 pub mod linkedin;
 pub mod proxy;