feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM

Document extraction: - DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml) - XLSX/XLS: markdown tables with multi-sheet support (via calamine) - CSV: quoted field handling, markdown table output - All auto-detected by Content-Type header or URL extension New features: - -f html output format (sanitized HTML) - Multi-URL watch: --urls-file + --watch monitors all URLs in parallel - Batch + LLM: --extract-prompt/--extract-json works with multiple URLs - Mixed batch: HTML pages + DOCX + XLSX + CSV in one command Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-15 18:25:24 +02:00 · 2026-03-26 15:28:23 +01:00 · 2026-03-26 15:28:23 +01:00 · ea14848772
commit ea14848772
parent 0e4128782a
8 changed files with 1520 additions and 41 deletions
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -399,6 +399,27 @@ impl FetchClient {

            let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
            Ok(pdf_to_extraction_result(&pdf_result, &final_url))
+        } else if let Some(doc_type) =
+            crate::document::is_document_content_type(&headers, &final_url)
+        {
+            debug!(status, doc_type = ?doc_type, "detected document response, extracting");
+
+            let bytes = response
+                .bytes()
+                .await
+                .map_err(|e| FetchError::BodyDecode(e.to_string()))?;
+
+            let elapsed = start.elapsed();
+            debug!(
+                status,
+                bytes = bytes.len(),
+                elapsed_ms = %elapsed.as_millis(),
+                "document fetch complete"
+            );
+
+            let mut result = crate::document::extract_document(&bytes, doc_type)?;
+            result.metadata.url = Some(final_url);
+            Ok(result)
        } else {
            let html = response
                .text()
--- a/crates/webclaw-fetch/src/document.rs
+++ b/crates/webclaw-fetch/src/document.rs
@ -0,0 +1,743 @@
+//! Document extraction for DOCX, XLSX, XLS, and CSV files.
+//! Auto-detects the document type from the Content-Type header or the URL
+//! extension, then extracts the text content as markdown — the same pattern
+//! as PDF extraction.
+use std::collections::HashMap;
+use std::io::{Cursor, Read};
+
+use tracing::debug;
+
+use crate::error::FetchError;
+
+/// Kinds of non-HTML documents this module can detect and extract.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DocType {
+    /// Word document (`.docx`).
+    Docx,
+    /// Excel workbook in the OOXML format (`.xlsx`).
+    Xlsx,
+    /// Excel workbook served as `application/vnd.ms-excel` (`.xls`).
+    Xls,
+    /// Comma-separated values (`.csv`).
+    Csv,
+}
+
+impl DocType {
+    /// Upper-case short name of the format, used in log output.
+    fn label(self) -> &'static str {
+        match self {
+            Self::Docx => "DOCX",
+            Self::Xlsx => "XLSX",
+            Self::Xls => "XLS",
+            Self::Csv => "CSV",
+        }
+    }
+}
+
+/// Detect document type from response headers or URL extension.
+///
+/// The `content-type` header is consulted first: parameters such as
+/// `; charset=utf-8` are ignored and the MIME type is compared
+/// case-insensitively. When the header is absent or does not name a supported
+/// document type, the extension of the URL path decides, ignoring any
+/// `?query` or `#fragment` suffix.
+///
+/// Returns `None` for non-document responses (HTML, PDF, etc.).
+pub fn is_document_content_type(headers: &HashMap<String, String>, url: &str) -> Option<DocType> {
+    const MIME_TYPES: [(&str, DocType); 4] = [
+        (
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            DocType::Docx,
+        ),
+        (
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            DocType::Xlsx,
+        ),
+        ("application/vnd.ms-excel", DocType::Xls),
+        ("text/csv", DocType::Csv),
+    ];
+    const EXTENSIONS: [(&str, DocType); 4] = [
+        (".docx", DocType::Docx),
+        (".xlsx", DocType::Xlsx),
+        (".xls", DocType::Xls),
+        (".csv", DocType::Csv),
+    ];
+
+    // Check Content-Type header first.
+    // NOTE(review): the lookup key is lower-case; presumably the caller
+    // normalises header names — confirm against the header map construction.
+    if let Some(ct) = headers.get("content-type") {
+        let mime = ct.split(';').next().unwrap_or("").trim();
+        let matched = MIME_TYPES
+            .iter()
+            .find(|&&(known, _)| mime.eq_ignore_ascii_case(known));
+        if let Some(&(_, doc_type)) = matched {
+            return Some(doc_type);
+        }
+    }
+
+    // Fall back to URL extension. Both the query string and the fragment are
+    // cut off so that `report.docx#page=2` is still recognised.
+    let path = url.split(['?', '#']).next().unwrap_or(url);
+    let lower = path.to_ascii_lowercase();
+
+    EXTENSIONS
+        .iter()
+        .find(|&&(ext, _)| lower.ends_with(ext))
+        .map(|&(_, doc_type)| doc_type)
+}
+
+/// Turn raw document bytes into an `ExtractionResult`.
+///
+/// Dispatches to the extractor matching `doc_type`, derives a plain-text
+/// version of the resulting markdown and counts its whitespace-separated
+/// words. All other metadata fields (including `url`) are left unset; links,
+/// images and code blocks are empty.
+///
+/// # Errors
+/// Propagates whatever `FetchError` the selected extractor returns.
+pub fn extract_document(
+    bytes: &[u8],
+    doc_type: DocType,
+) -> Result<webclaw_core::ExtractionResult, FetchError> {
+    debug!(
+        doc_type = doc_type.label(),
+        bytes = bytes.len(),
+        "extracting document"
+    );
+
+    // Pick the extractor first, then run it once.
+    let extractor: fn(&[u8]) -> Result<String, FetchError> = match doc_type {
+        DocType::Docx => extract_docx,
+        DocType::Xlsx => extract_xlsx,
+        DocType::Xls => extract_xls,
+        DocType::Csv => extract_csv,
+    };
+    let markdown = extractor(bytes)?;
+    let plain_text = strip_markdown_formatting(&markdown);
+
+    let metadata = webclaw_core::Metadata {
+        title: None,
+        description: None,
+        author: None,
+        published_date: None,
+        language: None,
+        url: None,
+        site_name: None,
+        image: None,
+        favicon: None,
+        word_count: plain_text.split_whitespace().count(),
+    };
+    let content = webclaw_core::Content {
+        markdown,
+        plain_text,
+        links: Vec::new(),
+        images: Vec::new(),
+        code_blocks: Vec::new(),
+        raw_html: None,
+    };
+
+    Ok(webclaw_core::ExtractionResult {
+        metadata,
+        content,
+        domain_data: None,
+        structured_data: vec![],
+    })
+}
+
+/// Extract markdown from the bytes of a DOCX file.
+///
+/// A DOCX is a ZIP archive; this opens it in memory, reads the
+/// `word/document.xml` entry as a UTF-8 string and hands it to
+/// `parse_docx_xml`.
+///
+/// # Errors
+/// `FetchError::Build` when the archive cannot be opened or the entry is
+/// missing, `FetchError::BodyDecode` when the entry cannot be read as text.
+fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
+    let mut archive = zip::ZipArchive::new(Cursor::new(bytes))
+        .map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
+
+    let mut document_xml = String::new();
+    archive
+        .by_name("word/document.xml")
+        .map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?
+        .read_to_string(&mut document_xml)
+        .map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
+
+    parse_docx_xml(&document_xml)
+}
+
+/// Parse DOCX XML (word/document.xml) into markdown.
+///
+/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
+/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
+/// `<w:br>` becomes a newline and `<w:tab>` a tab character. Paragraphs whose
+/// trimmed text is empty are dropped; the rest are joined with blank lines.
+/// Text that fails to unescape is skipped rather than treated as an error.
+///
+/// # Errors
+/// Returns `FetchError::Build` when the XML reader reports a parse error.
+fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
+    use quick_xml::Reader;
+    use quick_xml::events::Event;
+
+    let mut reader = Reader::from_str(xml);
+    let mut paragraphs: Vec<String> = Vec::new();
+
+    // State tracking for the current paragraph
+    let mut in_paragraph = false;
+    let mut in_run = false; // inside <w:r> (run)
+    let mut in_text = false; // inside <w:t>
+    let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
+    let mut current_text = String::new();
+    let mut heading_level: Option<u8> = None; // None = normal paragraph
+
+    loop {
+        match reader.read_event() {
+            Ok(Event::Start(ref e)) => {
+                let qname = e.name();
+                let name_bytes: &[u8] = qname.as_ref();
+                match local_name(name_bytes) {
+                    b"p" if is_w_namespace(name_bytes) => {
+                        in_paragraph = true;
+                        current_text.clear();
+                        heading_level = None;
+                    }
+                    b"pPr" if in_paragraph => in_ppr = true,
+                    b"pStyle" if in_ppr => heading_level = extract_heading_level(e),
+                    b"r" if in_paragraph => in_run = true,
+                    b"t" if in_run => in_text = true,
+                    b"br" if in_paragraph => current_text.push('\n'),
+                    b"tab" if in_paragraph => current_text.push('\t'),
+                    _ => {}
+                }
+            }
+            // Self-closing elements never produce an `End` event, so they must
+            // not open any container state (`<w:r/>`, `<w:t/>`, `<w:pPr/>`,
+            // `<w:p/>`); only the leaf elements that carry meaning are handled.
+            Ok(Event::Empty(ref e)) => {
+                let qname = e.name();
+                match local_name(qname.as_ref()) {
+                    b"pStyle" if in_ppr => heading_level = extract_heading_level(e),
+                    b"br" if in_paragraph => current_text.push('\n'),
+                    b"tab" if in_paragraph => current_text.push('\t'),
+                    _ => {}
+                }
+            }
+            Ok(Event::End(ref e)) => {
+                let qname = e.name();
+                match local_name(qname.as_ref()) {
+                    b"p" if in_paragraph => {
+                        let text = current_text.trim().to_string();
+                        if !text.is_empty() {
+                            let formatted = match heading_level {
+                                Some(level @ 1..=6) => {
+                                    format!("{} {text}", "#".repeat(usize::from(level)))
+                                }
+                                _ => text,
+                            };
+                            paragraphs.push(formatted);
+                        }
+                        in_paragraph = false;
+                    }
+                    b"pPr" => in_ppr = false,
+                    b"r" => {
+                        // Leaving a run also ends any text element inside it.
+                        in_run = false;
+                        in_text = false;
+                    }
+                    b"t" => in_text = false,
+                    _ => {}
+                }
+            }
+            Ok(Event::Text(ref e)) if in_text => {
+                if let Ok(text) = e.unescape() {
+                    current_text.push_str(&text);
+                }
+            }
+            Ok(Event::Eof) => break,
+            Err(e) => {
+                return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
+            }
+            _ => {}
+        }
+    }
+
+    Ok(paragraphs.join("\n\n"))
+}
+
+/// Tell whether a qualified tag name denotes a paragraph element.
+///
+/// Accepts exactly two spellings: the prefixed `w:p` and the bare `p`
+/// (used when wordprocessingML is the default namespace).
+fn is_w_namespace(name: &[u8]) -> bool {
+    matches!(name, b"w:p" | b"p")
+}
+
+/// Return the part of a tag name after its first `:`.
+///
+/// Names without a colon are returned unchanged: `w:p` -> `p`, `p` -> `p`.
+fn local_name(name: &[u8]) -> &[u8] {
+    name.iter()
+        .position(|&byte| byte == b':')
+        .map_or(name, |colon| &name[colon + 1..])
+}
+
+/// Derive a markdown heading level from a `<w:pStyle w:val="Heading1"/>` element.
+///
+/// Looks at every attribute whose local name is `val` (case-insensitively on
+/// the value): `title` maps to level 1, `headingN` maps to `N` capped at 6.
+/// Returns `None` when no such attribute names a heading style.
+fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
+    e.attributes()
+        .flatten()
+        .filter(|attr| local_name(attr.key.as_ref()) == b"val")
+        .find_map(|attr| {
+            let style = String::from_utf8_lossy(&attr.value).to_ascii_lowercase();
+            if style == "title" {
+                return Some(1);
+            }
+            // "heading1", "heading2", ... — anything else yields None and the
+            // search continues with the next attribute.
+            let digits = style.strip_prefix("heading")?;
+            digits.parse::<u8>().ok().map(|level| level.min(6))
+        })
+}
+
+/// Render an XLSX workbook as markdown tables (delegates to the shared
+/// spreadsheet extractor, tagging errors with the XLSX label).
+fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
+    let label = DocType::Xlsx.label();
+    extract_spreadsheet(bytes, label)
+}
+
+/// Render an XLS workbook as markdown tables (delegates to the shared
+/// spreadsheet extractor, tagging errors with the XLS label).
+fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
+    let label = DocType::Xls.label();
+    extract_spreadsheet(bytes, label)
+}
+
+/// Common workbook-to-markdown conversion used for both XLSX and XLS.
+///
+/// Opens the workbook from memory with calamine, walks every sheet in order
+/// and emits a `## Sheet: <name>` section followed by a markdown table for
+/// each sheet that has at least one row. `label` only appears in error
+/// messages. A workbook without any non-empty sheet yields
+/// `(empty spreadsheet)`.
+///
+/// # Errors
+/// `FetchError::Build` when the workbook or any of its sheets cannot be read.
+fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
+    use calamine::Reader;
+
+    let mut workbook: calamine::Sheets<_> =
+        calamine::open_workbook_auto_from_rs(Cursor::new(bytes))
+            .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
+
+    // Copy the names up front so the workbook can be borrowed mutably below.
+    let names: Vec<String> = workbook.sheet_names().to_vec();
+    let mut sections: Vec<String> = Vec::new();
+
+    for sheet in &names {
+        let range = workbook
+            .worksheet_range(sheet)
+            .map_err(|e| FetchError::Build(format!("{label} sheet '{sheet}': {e}")))?;
+
+        let table_rows: Vec<Vec<String>> = range
+            .rows()
+            .map(|cells| cells.iter().map(cell_to_string).collect())
+            .collect();
+
+        if !table_rows.is_empty() {
+            let table = rows_to_markdown_table(&table_rows);
+            sections.push(format!("## Sheet: {sheet}\n\n{table}"));
+        }
+    }
+
+    if sections.is_empty() {
+        Ok("(empty spreadsheet)".to_string())
+    } else {
+        Ok(sections.join("\n\n"))
+    }
+}
+
+/// Produce the display text for a single calamine cell.
+///
+/// Empty cells become an empty string, string-like values are copied as-is,
+/// floats go through `format_float`, errors are rendered as `#` followed by
+/// their debug representation, and everything else uses its `Display` form.
+fn cell_to_string(cell: &calamine::Data) -> String {
+    use calamine::Data;
+    match cell {
+        Data::Empty => String::new(),
+        Data::String(text) | Data::DateTimeIso(text) | Data::DurationIso(text) => text.clone(),
+        Data::Float(value) => format_float(*value),
+        Data::Int(value) => value.to_string(),
+        Data::Bool(flag) => flag.to_string(),
+        Data::DateTime(stamp) => stamp.to_string(),
+        Data::Error(err) => format!("#{err:?}"),
+    }
+}
+
+/// Render a float for table output, without a trailing `.0` on whole numbers.
+///
+/// Values that have no fractional part and whose magnitude is below
+/// `i64::MAX` are printed as integers; all others (including NaN and
+/// infinities) use the default float formatting.
+fn format_float(f: f64) -> String {
+    let is_whole = f.fract() == 0.0;
+    let fits_in_i64 = f.abs() < i64::MAX as f64;
+
+    if is_whole && fits_in_i64 {
+        (f as i64).to_string()
+    } else {
+        f.to_string()
+    }
+}
+
+/// Extract CSV text and convert to markdown table.
+///
+/// The bytes are decoded as UTF-8 (invalid sequences are replaced, never an
+/// error). A leading byte-order mark, as written by many spreadsheet exports,
+/// is removed so it does not end up inside the first header cell. Input
+/// without any non-empty row yields `(empty CSV)`.
+fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
+    let decoded = String::from_utf8_lossy(bytes);
+    // Drop the UTF-8 BOM (U+FEFF) if present.
+    let text = decoded.strip_prefix('\u{feff}').unwrap_or(&decoded);
+    let rows = parse_csv_rows(text);
+
+    if rows.is_empty() {
+        return Ok("(empty CSV)".to_string());
+    }
+
+    Ok(rows_to_markdown_table(&rows))
+}
+
+/// Split CSV text into rows of trimmed fields.
+///
+/// Double-quoted sections may contain commas and newlines; a doubled quote
+/// (`""`) inside them stands for a literal quote. Outside quotes, `,` ends a
+/// field, `\n` ends a row and `\r` is ignored. Rows whose fields are all
+/// empty are dropped. A final row without a trailing newline is still
+/// returned.
+fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
+    // Move the accumulated field out (trimmed) and reset the buffer.
+    fn take_trimmed(field: &mut String) -> String {
+        let value = field.trim().to_string();
+        field.clear();
+        value
+    }
+
+    let mut table: Vec<Vec<String>> = Vec::new();
+    let mut row: Vec<String> = Vec::new();
+    let mut field = String::new();
+    let mut quoted = false;
+    let mut input = text.chars().peekable();
+
+    while let Some(c) = input.next() {
+        match (quoted, c) {
+            // Inside quotes: `""` is an escaped quote, a lone `"` closes.
+            (true, '"') => {
+                if input.peek() == Some(&'"') {
+                    input.next();
+                    field.push('"');
+                } else {
+                    quoted = false;
+                }
+            }
+            (true, other) => field.push(other),
+            (false, '"') => quoted = true,
+            (false, ',') => row.push(take_trimmed(&mut field)),
+            (false, '\n') => {
+                row.push(take_trimmed(&mut field));
+                let finished = std::mem::take(&mut row);
+                if finished.iter().any(|f| !f.is_empty()) {
+                    table.push(finished);
+                }
+            }
+            // Carriage returns are skipped (line ends are handled by `\n`).
+            (false, '\r') => {}
+            (false, other) => field.push(other),
+        }
+    }
+
+    // Flush whatever is left when the text does not end with a newline.
+    let has_pending = !(field.is_empty() && row.is_empty());
+    if has_pending {
+        row.push(take_trimmed(&mut field));
+        if row.iter().any(|f| !f.is_empty()) {
+            table.push(row);
+        }
+    }
+
+    table
+}
+
+/// Convert rows (first row = header) into a markdown table.
+///
+/// Every row is padded with empty cells to the width of the widest row.
+/// Cell text is made safe for a single-line table row: `|` is escaped as
+/// `\|` and line breaks (`\n`, `\r\n`, lone `\r`) are replaced by one space.
+/// Returns an empty string when there are no rows or no columns.
+fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
+    // Make one cell safe to embed between `|` delimiters on a single line.
+    fn escape_cell(cell: &str) -> String {
+        let mut out = String::with_capacity(cell.len());
+        let mut chars = cell.chars().peekable();
+        while let Some(ch) = chars.next() {
+            match ch {
+                '|' => out.push_str("\\|"),
+                '\n' => out.push(' '),
+                // For `\r\n` the following `\n` emits the single space.
+                '\r' => {
+                    if chars.peek() != Some(&'\n') {
+                        out.push(' ');
+                    }
+                }
+                _ => out.push(ch),
+            }
+        }
+        out
+    }
+
+    let Some((header, body)) = rows.split_first() else {
+        return String::new();
+    };
+
+    // Find the max column count across all rows
+    let col_count = rows.iter().map(Vec::len).max().unwrap_or(0);
+    if col_count == 0 {
+        return String::new();
+    }
+
+    let render_row = |row: &[String]| -> String {
+        let cells: Vec<String> = (0..col_count)
+            .map(|i| row.get(i).map(|cell| escape_cell(cell)).unwrap_or_default())
+            .collect();
+        format!("| {} |", cells.join(" | "))
+    };
+
+    let mut lines: Vec<String> = Vec::with_capacity(rows.len() + 1);
+    lines.push(render_row(header));
+    lines.push(format!("| {} |", vec!["---"; col_count].join(" | ")));
+    lines.extend(body.iter().map(|row| render_row(row)));
+
+    lines.join("\n")
+}
+
+/// Reduce the markdown produced by this module to plain text.
+///
+/// Per line: leading `#` characters and surrounding whitespace are removed,
+/// table separator rows are dropped, and table rows (`| a | b |`) are
+/// flattened into their trimmed cells joined by single spaces. The result is
+/// trimmed as a whole.
+fn strip_markdown_formatting(markdown: &str) -> String {
+    let mut out_lines: Vec<String> = Vec::new();
+
+    for raw in markdown.lines() {
+        let text = raw.trim_start_matches('#').trim();
+
+        // Skip separator rows
+        let is_separator = text.starts_with("| ---") || text == "|---|";
+        if is_separator {
+            continue;
+        }
+
+        let inner_cells = text
+            .strip_prefix('|')
+            .and_then(|rest| rest.strip_suffix('|'));
+        let rendered = match inner_cells {
+            // Table row: join cells with spaces
+            Some(inner) => inner.split('|').map(str::trim).collect::<Vec<_>>().join(" "),
+            None => text.to_string(),
+        };
+        out_lines.push(rendered);
+    }
+
+    out_lines.join("\n").trim().to_string()
+}
+
+// Unit tests for document detection, CSV parsing, DOCX XML parsing and the
+// markdown helpers. Everything here works on in-memory fixtures.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // --- Content-type detection ---
+
+    // Each header test uses an extension-less URL so only the header can match.
+    #[test]
+    fn test_detect_docx_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Docx)
+        );
+    }
+
+    #[test]
+    fn test_detect_xlsx_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Xlsx)
+        );
+    }
+
+    #[test]
+    fn test_detect_xls_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "application/vnd.ms-excel".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Xls)
+        );
+    }
+
+    #[test]
+    fn test_detect_csv_content_type() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/csv".to_string());
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Csv)
+        );
+    }
+
+    // Header parameters after `;` must not prevent a match.
+    #[test]
+    fn test_detect_csv_content_type_with_charset() {
+        let mut headers = HashMap::new();
+        headers.insert(
+            "content-type".to_string(),
+            "text/csv; charset=utf-8".to_string(),
+        );
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/file"),
+            Some(DocType::Csv)
+        );
+    }
+
+    // With no headers at all, the URL extension decides.
+    #[test]
+    fn test_detect_by_url_extension() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/report.docx"),
+            Some(DocType::Docx)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/data.xlsx"),
+            Some(DocType::Xlsx)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/old.xls"),
+            Some(DocType::Xls)
+        );
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/data.csv"),
+            Some(DocType::Csv)
+        );
+    }
+
+    #[test]
+    fn test_detect_url_extension_with_query() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
+            Some(DocType::Docx)
+        );
+    }
+
+    #[test]
+    fn test_detect_url_extension_case_insensitive() {
+        let empty: HashMap<String, String> = HashMap::new();
+        assert_eq!(
+            is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
+            Some(DocType::Xlsx)
+        );
+    }
+
+    #[test]
+    fn test_detect_none_for_html() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/html".to_string());
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/page"),
+            None
+        );
+    }
+
+    #[test]
+    fn test_content_type_takes_precedence_over_url() {
+        let mut headers = HashMap::new();
+        headers.insert("content-type".to_string(), "text/csv".to_string());
+        // URL says .xlsx but Content-Type says CSV — header wins
+        assert_eq!(
+            is_document_content_type(&headers, "https://example.com/data.xlsx"),
+            Some(DocType::Csv)
+        );
+    }
+
+    // --- CSV parsing ---
+
+    #[test]
+    fn test_csv_simple() {
+        let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("| Name | Age | City |"));
+        assert!(result.contains("| --- | --- | --- |"));
+        assert!(result.contains("| Alice | 30 | NYC |"));
+        assert!(result.contains("| Bob | 25 | LA |"));
+    }
+
+    // Covers a comma inside quotes and the doubled-quote escape.
+    #[test]
+    fn test_csv_quoted_fields() {
+        let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("Has a, comma"));
+        assert!(result.contains("Said \"hello\""));
+    }
+
+    #[test]
+    fn test_csv_empty() {
+        let result = extract_csv(b"").unwrap();
+        assert_eq!(result, "(empty CSV)");
+    }
+
+    #[test]
+    fn test_csv_windows_line_endings() {
+        let csv = "A,B\r\n1,2\r\n3,4\r\n";
+        let result = extract_csv(csv.as_bytes()).unwrap();
+        assert!(result.contains("| A | B |"));
+        assert!(result.contains("| 1 | 2 |"));
+    }
+
+    // --- DOCX XML parsing ---
+
+    #[test]
+    fn test_docx_xml_simple_paragraphs() {
+        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
+    <w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
+  </w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Hello world\n\nSecond paragraph");
+    }
+
+    #[test]
+    fn test_docx_xml_headings() {
+        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
+      <w:r><w:t>Title</w:t></w:r>
+    </w:p>
+    <w:p><w:r><w:t>Body text</w:t></w:r></w:p>
+    <w:p>
+      <w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
+      <w:r><w:t>Subtitle</w:t></w:r>
+    </w:p>
+  </w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert!(result.contains("# Title"));
+        assert!(result.contains("Body text"));
+        assert!(result.contains("## Subtitle"));
+    }
+
+    // Text from several runs in one paragraph is concatenated.
+    #[test]
+    fn test_docx_xml_multiple_runs() {
+        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r><w:t>Hello </w:t></w:r>
+      <w:r><w:t>world</w:t></w:r>
+    </w:p>
+  </w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Hello world");
+    }
+
+    // Paragraphs with no text or only whitespace produce no output.
+    #[test]
+    fn test_docx_xml_empty_paragraphs_skipped() {
+        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p></w:p>
+    <w:p><w:r><w:t>Content</w:t></w:r></w:p>
+    <w:p><w:r><w:t>   </w:t></w:r></w:p>
+  </w:body>
+</w:document>"#;
+        let result = parse_docx_xml(xml).unwrap();
+        assert_eq!(result, "Content");
+    }
+
+    // --- Markdown table ---
+
+    #[test]
+    fn test_rows_to_markdown_table() {
+        let rows = vec![
+            vec!["A".to_string(), "B".to_string()],
+            vec!["1".to_string(), "2".to_string()],
+            vec!["3".to_string(), "4".to_string()],
+        ];
+        let table = rows_to_markdown_table(&rows);
+        assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
+    }
+
+    // Short rows are padded with empty cells up to the widest row.
+    #[test]
+    fn test_rows_to_markdown_table_ragged() {
+        let rows = vec![
+            vec!["A".to_string(), "B".to_string(), "C".to_string()],
+            vec!["1".to_string()], // fewer columns
+        ];
+        let table = rows_to_markdown_table(&rows);
+        assert!(table.contains("| 1 |  |  |"));
+    }
+
+    // --- Extract result ---
+
+    #[test]
+    fn test_extract_csv_result() {
+        let csv = "Name,Score\nAlice,100\n";
+        let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
+        assert!(result.content.markdown.contains("| Name | Score |"));
+        assert!(result.metadata.word_count > 0);
+        assert!(result.content.links.is_empty());
+        assert!(result.domain_data.is_none());
+    }
+
+    // --- Strip markdown ---
+
+    #[test]
+    fn test_strip_markdown() {
+        let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
+        let plain = strip_markdown_formatting(md);
+        assert!(plain.contains("Title"));
+        assert!(plain.contains("Some text"));
+        assert!(plain.contains("A B"));
+        assert!(!plain.contains("---"));
+    }
+}
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@ -5,6 +5,7 @@
 pub mod browser;
 pub mod client;
 pub mod crawler;
+pub mod document;
 pub mod error;
 pub mod linkedin;
 pub mod proxy;