feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM

Document extraction:
- DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml)
- XLSX/XLS: markdown tables with multi-sheet support (via calamine)
- CSV: quoted field handling, markdown table output
- All auto-detected by Content-Type header or URL extension

New features:
- -f html output format (sanitized HTML)
- Multi-URL watch: --urls-file + --watch monitors all URLs in parallel
- Batch + LLM: --extract-prompt/--extract-json works with multiple URLs
- Mixed batch: HTML pages + DOCX + XLSX + CSV in one command

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-06-29 03:39:37 +02:00 · 2026-03-26 15:28:23 +01:00 · 2026-03-26 15:28:23 +01:00 · ea14848772
commit ea14848772
parent 0e4128782a
8 changed files with 1520 additions and 41 deletions
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -399,6 +399,27 @@ impl FetchClient {

            let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
            Ok(pdf_to_extraction_result(&pdf_result, &final_url))
+        } else if let Some(doc_type) =
+            crate::document::is_document_content_type(&headers, &final_url)
+        {
+            debug!(status, doc_type = ?doc_type, "detected document response, extracting");
+
+            let bytes = response
+                .bytes()
+                .await
+                .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
+
+            let elapsed = start.elapsed();
+            debug!(
+                status,
+                bytes = bytes.len(),
+                elapsed_ms = %elapsed.as_millis(),
+                "document fetch complete"
+            );
+
+            let mut result = crate::document::extract_document(&bytes, doc_type)?;
+            result.metadata.url = Some(final_url);
+            Ok(result)
        } else {
            let html = response
                .text()