feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM

Document extraction:
- DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml)
- XLSX/XLS: markdown tables with multi-sheet support (via calamine)
- CSV: quoted field handling, markdown table output
- All auto-detected by Content-Type header or URL extension

New features:
- -f html output format (sanitized HTML)
- Multi-URL watch: --urls-file + --watch monitors all URLs in parallel
- Batch + LLM: --extract-prompt/--extract-json works with multiple URLs
- Mixed batch: HTML pages + DOCX + XLSX + CSV in one command

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-03-26 15:28:23 +01:00
parent 0e4128782a
commit ea14848772
8 changed files with 1520 additions and 41 deletions

View file

@ -399,6 +399,27 @@ impl FetchClient {
let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
} else if let Some(doc_type) =
crate::document::is_document_content_type(&headers, &final_url)
{
debug!(status, doc_type = ?doc_type, "detected document response, extracting");
let bytes = response
.bytes()
.await
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
let elapsed = start.elapsed();
debug!(
status,
bytes = bytes.len(),
elapsed_ms = %elapsed.as_millis(),
"document fetch complete"
);
let mut result = crate::document::extract_document(&bytes, doc_type)?;
result.metadata.url = Some(final_url);
Ok(result)
} else {
let html = response
.text()

View file

@ -0,0 +1,743 @@
/// Document extraction for DOCX, XLSX, XLS, and CSV files.
/// Auto-detects document type from Content-Type headers or URL extension,
/// then extracts text content as markdown — same pattern as PDF extraction.
use std::collections::HashMap;
use std::io::{Cursor, Read};
use tracing::debug;
use crate::error::FetchError;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocType {
Docx,
Xlsx,
Xls,
Csv,
}
impl DocType {
fn label(self) -> &'static str {
match self {
DocType::Docx => "DOCX",
DocType::Xlsx => "XLSX",
DocType::Xls => "XLS",
DocType::Csv => "CSV",
}
}
}
/// Detect document type from response headers or URL extension.
/// Returns `None` for non-document responses (HTML, PDF, etc.).
pub fn is_document_content_type(headers: &HashMap<String, String>, url: &str) -> Option<DocType> {
// Check Content-Type header first
if let Some(ct) = headers.get("content-type") {
let mime = ct.split(';').next().unwrap_or("").trim();
if mime.eq_ignore_ascii_case(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
) {
return Some(DocType::Docx);
}
if mime.eq_ignore_ascii_case(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
) {
return Some(DocType::Xlsx);
}
if mime.eq_ignore_ascii_case("application/vnd.ms-excel") {
return Some(DocType::Xls);
}
if mime.eq_ignore_ascii_case("text/csv") {
return Some(DocType::Csv);
}
}
// Fall back to URL extension
let path = url.split('?').next().unwrap_or(url);
let lower = path.to_ascii_lowercase();
if lower.ends_with(".docx") {
return Some(DocType::Docx);
}
if lower.ends_with(".xlsx") {
return Some(DocType::Xlsx);
}
if lower.ends_with(".xls") {
return Some(DocType::Xls);
}
if lower.ends_with(".csv") {
return Some(DocType::Csv);
}
None
}
/// Extract text content from document bytes, returning an ExtractionResult.
pub fn extract_document(
bytes: &[u8],
doc_type: DocType,
) -> Result<webclaw_core::ExtractionResult, FetchError> {
debug!(
doc_type = doc_type.label(),
bytes = bytes.len(),
"extracting document"
);
let markdown = match doc_type {
DocType::Docx => extract_docx(bytes)?,
DocType::Xlsx => extract_xlsx(bytes)?,
DocType::Xls => extract_xls(bytes)?,
DocType::Csv => extract_csv(bytes)?,
};
let plain_text = strip_markdown_formatting(&markdown);
let word_count = plain_text.split_whitespace().count();
Ok(webclaw_core::ExtractionResult {
metadata: webclaw_core::Metadata {
title: None,
description: None,
author: None,
published_date: None,
language: None,
url: None,
site_name: None,
image: None,
favicon: None,
word_count,
},
content: webclaw_core::Content {
markdown,
plain_text,
links: Vec::new(),
images: Vec::new(),
code_blocks: Vec::new(),
raw_html: None,
},
domain_data: None,
structured_data: vec![],
})
}
/// Extract text from a DOCX file (ZIP of XML).
/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
let cursor = Cursor::new(bytes);
let mut archive =
zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
let xml = {
let mut file = archive
.by_name("word/document.xml")
.map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?;
let mut buf = String::new();
file.read_to_string(&mut buf)
.map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
buf
};
parse_docx_xml(&xml)
}
/// Parse DOCX XML (word/document.xml) into markdown.
///
/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
use quick_xml::Reader;
use quick_xml::events::Event;
let mut reader = Reader::from_str(xml);
let mut paragraphs: Vec<String> = Vec::new();
// State tracking for the current paragraph
let mut in_paragraph = false;
let mut in_run = false; // inside <w:r> (run)
let mut in_text = false; // inside <w:t>
let mut current_text = String::new();
let mut heading_level: Option<u8> = 0.into(); // None = normal paragraph
let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
loop {
match reader.read_event() {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
let name_bytes = e.name().as_ref().to_vec();
let local = local_name(&name_bytes);
match local {
b"p" if is_w_namespace(&name_bytes) => {
in_paragraph = true;
current_text.clear();
heading_level = None;
}
b"pPr" if in_paragraph => in_ppr = true,
b"pStyle" if in_ppr => {
heading_level = extract_heading_level(e);
}
b"r" if in_paragraph => in_run = true,
b"t" if in_run => in_text = true,
b"br" if in_paragraph => {
current_text.push('\n');
}
b"tab" if in_paragraph => {
current_text.push('\t');
}
_ => {}
}
}
Ok(Event::End(ref e)) => {
let name_bytes = e.name().as_ref().to_vec();
let local = local_name(&name_bytes);
match local {
b"p" if in_paragraph => {
let text = current_text.trim().to_string();
if !text.is_empty() {
let formatted = match heading_level {
Some(1) => format!("# {text}"),
Some(2) => format!("## {text}"),
Some(3) => format!("### {text}"),
Some(4) => format!("#### {text}"),
Some(5) => format!("##### {text}"),
Some(6) => format!("###### {text}"),
_ => text,
};
paragraphs.push(formatted);
}
in_paragraph = false;
}
b"pPr" => in_ppr = false,
b"r" => {
in_run = false;
in_text = false;
}
b"t" => in_text = false,
_ => {}
}
}
Ok(Event::Text(ref e)) if in_text => {
if let Ok(text) = e.unescape() {
current_text.push_str(&text);
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
}
_ => {}
}
}
Ok(paragraphs.join("\n\n"))
}
/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms.
fn is_w_namespace(name: &[u8]) -> bool {
// quick-xml gives us the full name bytes. Accept both "w:p" and "p".
name == b"w:p" || name == b"p"
}
/// Extract the local name from a possibly namespaced XML tag.
/// `w:p` -> `p`, `p` -> `p`
fn local_name(name: &[u8]) -> &[u8] {
match name.iter().position(|&b| b == b':') {
Some(pos) => &name[pos + 1..],
None => name,
}
}
/// Extract heading level from a `<w:pStyle w:val="Heading1"/>` element.
fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
for attr in e.attributes().flatten() {
let local = local_name(attr.key.as_ref());
if local == b"val" {
let val = String::from_utf8_lossy(&attr.value);
let lower = val.to_ascii_lowercase();
// Match "heading1", "heading2", etc. and "title" -> h1
if lower == "title" {
return Some(1);
}
if let Some(rest) = lower.strip_prefix("heading")
&& let Ok(n) = rest.parse::<u8>()
{
return Some(n.min(6));
}
}
}
None
}
/// Extract spreadsheet content using calamine (XLSX format).
fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
extract_spreadsheet(bytes, "XLSX")
}
/// Extract spreadsheet content using calamine (XLS format).
fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
extract_spreadsheet(bytes, "XLS")
}
/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
/// Reads all sheets and formats each as a markdown table.
fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
use calamine::Reader;
let cursor = Cursor::new(bytes);
let mut workbook: calamine::Sheets<_> = calamine::open_workbook_auto_from_rs(cursor)
.map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
let mut sections: Vec<String> = Vec::new();
for name in &sheet_names {
let range = workbook
.worksheet_range(name)
.map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
let rows: Vec<Vec<String>> = range
.rows()
.map(|row| row.iter().map(cell_to_string).collect())
.collect();
if rows.is_empty() {
continue;
}
let mut section = format!("## Sheet: {name}\n\n");
section.push_str(&rows_to_markdown_table(&rows));
sections.push(section);
}
if sections.is_empty() {
return Ok("(empty spreadsheet)".to_string());
}
Ok(sections.join("\n\n"))
}
/// Convert a calamine cell value to a display string.
fn cell_to_string(cell: &calamine::Data) -> String {
use calamine::Data;
match cell {
Data::Empty => String::new(),
Data::String(s) => s.clone(),
Data::Int(n) => n.to_string(),
Data::Float(f) => format_float(*f),
Data::Bool(b) => b.to_string(),
Data::Error(e) => format!("#{e:?}"),
Data::DateTime(dt) => format!("{dt}"),
Data::DateTimeIso(s) => s.clone(),
Data::DurationIso(s) => s.clone(),
}
}
/// Format a float, dropping trailing `.0` for clean integer display.
fn format_float(f: f64) -> String {
if f.fract() == 0.0 && f.abs() < i64::MAX as f64 {
format!("{}", f as i64)
} else {
format!("{f}")
}
}
/// Extract CSV text and convert to markdown table.
fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
let text = String::from_utf8_lossy(bytes);
let rows = parse_csv_rows(&text);
if rows.is_empty() {
return Ok("(empty CSV)".to_string());
}
Ok(rows_to_markdown_table(&rows))
}
/// Parse CSV text into rows of fields, handling quoted fields with commas/newlines.
fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
let mut rows: Vec<Vec<String>> = Vec::new();
let mut current_row: Vec<String> = Vec::new();
let mut current_field = String::new();
let mut in_quotes = false;
let mut chars = text.chars().peekable();
while let Some(ch) = chars.next() {
if in_quotes {
if ch == '"' {
// Escaped quote ("") or end of quoted field
if chars.peek() == Some(&'"') {
chars.next();
current_field.push('"');
} else {
in_quotes = false;
}
} else {
current_field.push(ch);
}
} else {
match ch {
'"' => in_quotes = true,
',' => {
current_row.push(current_field.trim().to_string());
current_field = String::new();
}
'\n' => {
current_row.push(current_field.trim().to_string());
current_field = String::new();
if !current_row.iter().all(|f| f.is_empty()) {
rows.push(current_row);
}
current_row = Vec::new();
}
'\r' => {
// Skip carriage returns (handled with \n)
}
_ => current_field.push(ch),
}
}
}
// Flush last field/row
if !current_field.is_empty() || !current_row.is_empty() {
current_row.push(current_field.trim().to_string());
if !current_row.iter().all(|f| f.is_empty()) {
rows.push(current_row);
}
}
rows
}
/// Convert rows (first row = header) into a markdown table.
fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
if rows.is_empty() {
return String::new();
}
// Find the max column count across all rows
let col_count = rows.iter().map(|r| r.len()).max().unwrap_or(0);
if col_count == 0 {
return String::new();
}
let mut lines: Vec<String> = Vec::new();
// Header row
let header = &rows[0];
let header_cells: Vec<&str> = (0..col_count)
.map(|i| header.get(i).map(|s| s.as_str()).unwrap_or(""))
.collect();
lines.push(format!("| {} |", header_cells.join(" | ")));
// Separator row
let sep: Vec<&str> = vec!["---"; col_count];
lines.push(format!("| {} |", sep.join(" | ")));
// Data rows
for row in &rows[1..] {
let cells: Vec<&str> = (0..col_count)
.map(|i| row.get(i).map(|s| s.as_str()).unwrap_or(""))
.collect();
lines.push(format!("| {} |", cells.join(" | ")));
}
lines.join("\n")
}
/// Strip markdown formatting to get plain text.
fn strip_markdown_formatting(markdown: &str) -> String {
let mut plain = String::with_capacity(markdown.len());
for line in markdown.lines() {
let trimmed = line.trim_start_matches('#').trim();
if trimmed.starts_with("| ---") || trimmed == "|---|" {
continue; // Skip separator rows
}
if let Some(stripped) = trimmed.strip_prefix('|')
&& let Some(stripped) = stripped.strip_suffix('|')
{
// Table row: join cells with spaces
let cells: Vec<&str> = stripped.split('|').map(|c| c.trim()).collect();
plain.push_str(&cells.join(" "));
plain.push('\n');
continue;
}
plain.push_str(trimmed);
plain.push('\n');
}
plain.trim().to_string()
}
#[cfg(test)]
mod tests {
use super::*;
// --- Content-type detection ---
#[test]
fn test_detect_docx_content_type() {
let mut headers = HashMap::new();
headers.insert(
"content-type".to_string(),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
);
assert_eq!(
is_document_content_type(&headers, "https://example.com/file"),
Some(DocType::Docx)
);
}
#[test]
fn test_detect_xlsx_content_type() {
let mut headers = HashMap::new();
headers.insert(
"content-type".to_string(),
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
);
assert_eq!(
is_document_content_type(&headers, "https://example.com/file"),
Some(DocType::Xlsx)
);
}
#[test]
fn test_detect_xls_content_type() {
let mut headers = HashMap::new();
headers.insert(
"content-type".to_string(),
"application/vnd.ms-excel".to_string(),
);
assert_eq!(
is_document_content_type(&headers, "https://example.com/file"),
Some(DocType::Xls)
);
}
#[test]
fn test_detect_csv_content_type() {
let mut headers = HashMap::new();
headers.insert("content-type".to_string(), "text/csv".to_string());
assert_eq!(
is_document_content_type(&headers, "https://example.com/file"),
Some(DocType::Csv)
);
}
#[test]
fn test_detect_csv_content_type_with_charset() {
let mut headers = HashMap::new();
headers.insert(
"content-type".to_string(),
"text/csv; charset=utf-8".to_string(),
);
assert_eq!(
is_document_content_type(&headers, "https://example.com/file"),
Some(DocType::Csv)
);
}
#[test]
fn test_detect_by_url_extension() {
let empty: HashMap<String, String> = HashMap::new();
assert_eq!(
is_document_content_type(&empty, "https://example.com/report.docx"),
Some(DocType::Docx)
);
assert_eq!(
is_document_content_type(&empty, "https://example.com/data.xlsx"),
Some(DocType::Xlsx)
);
assert_eq!(
is_document_content_type(&empty, "https://example.com/old.xls"),
Some(DocType::Xls)
);
assert_eq!(
is_document_content_type(&empty, "https://example.com/data.csv"),
Some(DocType::Csv)
);
}
#[test]
fn test_detect_url_extension_with_query() {
let empty: HashMap<String, String> = HashMap::new();
assert_eq!(
is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
Some(DocType::Docx)
);
}
#[test]
fn test_detect_url_extension_case_insensitive() {
let empty: HashMap<String, String> = HashMap::new();
assert_eq!(
is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
Some(DocType::Xlsx)
);
}
#[test]
fn test_detect_none_for_html() {
let mut headers = HashMap::new();
headers.insert("content-type".to_string(), "text/html".to_string());
assert_eq!(
is_document_content_type(&headers, "https://example.com/page"),
None
);
}
#[test]
fn test_content_type_takes_precedence_over_url() {
let mut headers = HashMap::new();
headers.insert("content-type".to_string(), "text/csv".to_string());
// URL says .xlsx but Content-Type says CSV — header wins
assert_eq!(
is_document_content_type(&headers, "https://example.com/data.xlsx"),
Some(DocType::Csv)
);
}
// --- CSV parsing ---
#[test]
fn test_csv_simple() {
let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
let result = extract_csv(csv.as_bytes()).unwrap();
assert!(result.contains("| Name | Age | City |"));
assert!(result.contains("| --- | --- | --- |"));
assert!(result.contains("| Alice | 30 | NYC |"));
assert!(result.contains("| Bob | 25 | LA |"));
}
#[test]
fn test_csv_quoted_fields() {
let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
let result = extract_csv(csv.as_bytes()).unwrap();
assert!(result.contains("Has a, comma"));
assert!(result.contains("Said \"hello\""));
}
#[test]
fn test_csv_empty() {
let result = extract_csv(b"").unwrap();
assert_eq!(result, "(empty CSV)");
}
#[test]
fn test_csv_windows_line_endings() {
let csv = "A,B\r\n1,2\r\n3,4\r\n";
let result = extract_csv(csv.as_bytes()).unwrap();
assert!(result.contains("| A | B |"));
assert!(result.contains("| 1 | 2 |"));
}
// --- DOCX XML parsing ---
#[test]
fn test_docx_xml_simple_paragraphs() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
</w:body>
</w:document>"#;
let result = parse_docx_xml(xml).unwrap();
assert_eq!(result, "Hello world\n\nSecond paragraph");
}
#[test]
fn test_docx_xml_headings() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
<w:r><w:t>Title</w:t></w:r>
</w:p>
<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
<w:p>
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
<w:r><w:t>Subtitle</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
let result = parse_docx_xml(xml).unwrap();
assert!(result.contains("# Title"));
assert!(result.contains("Body text"));
assert!(result.contains("## Subtitle"));
}
#[test]
fn test_docx_xml_multiple_runs() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>Hello </w:t></w:r>
<w:r><w:t>world</w:t></w:r>
</w:p>
</w:body>
</w:document>"#;
let result = parse_docx_xml(xml).unwrap();
assert_eq!(result, "Hello world");
}
#[test]
fn test_docx_xml_empty_paragraphs_skipped() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p></w:p>
<w:p><w:r><w:t>Content</w:t></w:r></w:p>
<w:p><w:r><w:t> </w:t></w:r></w:p>
</w:body>
</w:document>"#;
let result = parse_docx_xml(xml).unwrap();
assert_eq!(result, "Content");
}
// --- Markdown table ---
#[test]
fn test_rows_to_markdown_table() {
let rows = vec![
vec!["A".to_string(), "B".to_string()],
vec!["1".to_string(), "2".to_string()],
vec!["3".to_string(), "4".to_string()],
];
let table = rows_to_markdown_table(&rows);
assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
}
#[test]
fn test_rows_to_markdown_table_ragged() {
let rows = vec![
vec!["A".to_string(), "B".to_string(), "C".to_string()],
vec!["1".to_string()], // fewer columns
];
let table = rows_to_markdown_table(&rows);
assert!(table.contains("| 1 | | |"));
}
// --- Extract result ---
#[test]
fn test_extract_csv_result() {
let csv = "Name,Score\nAlice,100\n";
let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
assert!(result.content.markdown.contains("| Name | Score |"));
assert!(result.metadata.word_count > 0);
assert!(result.content.links.is_empty());
assert!(result.domain_data.is_none());
}
// --- Strip markdown ---
#[test]
fn test_strip_markdown() {
let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
let plain = strip_markdown_formatting(md);
assert!(plain.contains("Title"));
assert!(plain.contains("Some text"));
assert!(plain.contains("A B"));
assert!(!plain.contains("---"));
}
}

View file

@ -5,6 +5,7 @@
pub mod browser;
pub mod client;
pub mod crawler;
pub mod document;
pub mod error;
pub mod linkedin;
pub mod proxy;