/// Document extraction for DOCX, XLSX, XLS, and CSV files. /// Auto-detects document type from Content-Type headers or URL extension, /// then extracts text content as markdown — same pattern as PDF extraction. use std::io::{Cursor, Read}; use tracing::debug; use crate::error::FetchError; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum DocType { Docx, Xlsx, Xls, Csv, } impl DocType { fn label(self) -> &'static str { match self { DocType::Docx => "DOCX", DocType::Xlsx => "XLSX", DocType::Xls => "XLS", DocType::Csv => "CSV", } } } impl std::fmt::Display for DocType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(self.label()) } } /// Detect document type from response headers or URL extension. /// Returns `None` for non-document responses (HTML, PDF, etc.). pub fn is_document_content_type(headers: &webclaw_http::HeaderMap, url: &str) -> Option { // Check Content-Type header first if let Some(ct) = headers.get("content-type").and_then(|v| v.to_str().ok()) { let mime = ct.split(';').next().unwrap_or("").trim(); if mime.eq_ignore_ascii_case( "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ) { return Some(DocType::Docx); } if mime.eq_ignore_ascii_case( "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ) { return Some(DocType::Xlsx); } if mime.eq_ignore_ascii_case("application/vnd.ms-excel") { return Some(DocType::Xls); } if mime.eq_ignore_ascii_case("text/csv") { return Some(DocType::Csv); } } // Fall back to URL extension let path = url.split('?').next().unwrap_or(url); let lower = path.to_ascii_lowercase(); if lower.ends_with(".docx") { return Some(DocType::Docx); } if lower.ends_with(".xlsx") { return Some(DocType::Xlsx); } if lower.ends_with(".xls") { return Some(DocType::Xls); } if lower.ends_with(".csv") { return Some(DocType::Csv); } None } /// Extract text content from document bytes, returning an ExtractionResult. pub fn extract_document( bytes: &[u8], doc_type: DocType, ) -> Result { debug!( doc_type = doc_type.label(), bytes = bytes.len(), "extracting document" ); let markdown = match doc_type { DocType::Docx => extract_docx(bytes)?, DocType::Xlsx => extract_xlsx(bytes)?, DocType::Xls => extract_xls(bytes)?, DocType::Csv => extract_csv(bytes)?, }; let plain_text = strip_markdown_formatting(&markdown); let word_count = plain_text.split_whitespace().count(); Ok(webclaw_core::ExtractionResult { metadata: webclaw_core::Metadata { title: None, description: None, author: None, published_date: None, language: None, url: None, site_name: None, image: None, favicon: None, word_count, }, content: webclaw_core::Content { markdown, plain_text, links: Vec::new(), images: Vec::new(), code_blocks: Vec::new(), raw_html: None, }, domain_data: None, structured_data: vec![], }) } /// Extract text from a DOCX file (ZIP of XML). /// Reads `word/document.xml`, extracts `` text nodes, detects heading styles. fn extract_docx(bytes: &[u8]) -> Result { let cursor = Cursor::new(bytes); let mut archive = zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?; let xml = { let mut file = archive .by_name("word/document.xml") .map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?; let mut buf = String::new(); file.read_to_string(&mut buf) .map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?; buf }; parse_docx_xml(&xml) } /// Parse DOCX XML (word/document.xml) into markdown. /// /// Walks the XML looking for paragraph elements (``). Within each paragraph, /// collects text from `` tags and detects heading styles from ``. fn parse_docx_xml(xml: &str) -> Result { use quick_xml::Reader; use quick_xml::events::Event; let mut reader = Reader::from_str(xml); let mut paragraphs: Vec = Vec::new(); // State tracking for the current paragraph let mut in_paragraph = false; let mut in_run = false; // inside (run) let mut in_text = false; // inside let mut current_text = String::new(); let mut heading_level: Option = None; // None = normal paragraph let mut in_ppr = false; // inside (paragraph properties) loop { match reader.read_event() { Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => { let name_bytes = e.name().as_ref().to_vec(); let local = local_name(&name_bytes); match local { b"p" if is_w_namespace(&name_bytes) => { in_paragraph = true; current_text.clear(); heading_level = None; } b"pPr" if in_paragraph => in_ppr = true, b"pStyle" if in_ppr => { heading_level = extract_heading_level(e); } b"r" if in_paragraph => in_run = true, b"t" if in_run => in_text = true, b"br" if in_paragraph => { current_text.push('\n'); } b"tab" if in_paragraph => { current_text.push('\t'); } _ => {} } } Ok(Event::End(ref e)) => { let name_bytes = e.name().as_ref().to_vec(); let local = local_name(&name_bytes); match local { b"p" if in_paragraph => { let text = current_text.trim().to_string(); if !text.is_empty() { let formatted = match heading_level { Some(1) => format!("# {text}"), Some(2) => format!("## {text}"), Some(3) => format!("### {text}"), Some(4) => format!("#### {text}"), Some(5) => format!("##### {text}"), Some(6) => format!("###### {text}"), _ => text, }; paragraphs.push(formatted); } in_paragraph = false; } b"pPr" => in_ppr = false, b"r" => { in_run = false; in_text = false; } b"t" => in_text = false, _ => {} } } Ok(Event::Text(ref e)) if in_text => { if let Ok(text) = e.unescape() { current_text.push_str(&text); } } Ok(Event::Eof) => break, Err(e) => { return Err(FetchError::Build(format!("DOCX XML parse error: {e}"))); } _ => {} } } Ok(paragraphs.join("\n\n")) } /// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace. /// Handles both `w:p` (prefixed) and just `p` (default namespace) forms. fn is_w_namespace(name: &[u8]) -> bool { // quick-xml gives us the full name bytes. Accept both "w:p" and "p". name == b"w:p" || name == b"p" } /// Extract the local name from a possibly namespaced XML tag. /// `w:p` -> `p`, `p` -> `p` fn local_name(name: &[u8]) -> &[u8] { match name.iter().position(|&b| b == b':') { Some(pos) => &name[pos + 1..], None => name, } } /// Extract heading level from a `` element. fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option { for attr in e.attributes().flatten() { let local = local_name(attr.key.as_ref()); if local == b"val" { let val = String::from_utf8_lossy(&attr.value); let lower = val.to_ascii_lowercase(); // Match "heading1", "heading2", etc. and "title" -> h1 if lower == "title" { return Some(1); } if let Some(rest) = lower.strip_prefix("heading") && let Ok(n) = rest.parse::() { return Some(n.min(6)); } } } None } /// Extract spreadsheet content using calamine (XLSX format). fn extract_xlsx(bytes: &[u8]) -> Result { extract_spreadsheet(bytes, "XLSX") } /// Extract spreadsheet content using calamine (XLS format). fn extract_xls(bytes: &[u8]) -> Result { extract_spreadsheet(bytes, "XLS") } /// Shared spreadsheet extraction for both XLSX and XLS via calamine. /// Reads all sheets and formats each as a markdown table. fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result { use calamine::Reader; let cursor = Cursor::new(bytes); let mut workbook: calamine::Sheets<_> = calamine::open_workbook_auto_from_rs(cursor) .map_err(|e| FetchError::Build(format!("{label} open: {e}")))?; let sheet_names: Vec = workbook.sheet_names().to_vec(); let mut sections: Vec = Vec::new(); for name in &sheet_names { let range = workbook .worksheet_range(name) .map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?; let rows: Vec> = range .rows() .map(|row| row.iter().map(cell_to_string).collect()) .collect(); if rows.is_empty() { continue; } let mut section = format!("## Sheet: {name}\n\n"); section.push_str(&rows_to_markdown_table(&rows)); sections.push(section); } if sections.is_empty() { return Ok("(empty spreadsheet)".to_string()); } Ok(sections.join("\n\n")) } /// Convert a calamine cell value to a display string. fn cell_to_string(cell: &calamine::Data) -> String { use calamine::Data; match cell { Data::Empty => String::new(), Data::String(s) => s.clone(), Data::Int(n) => n.to_string(), Data::Float(f) => format_float(*f), Data::Bool(b) => b.to_string(), Data::Error(e) => format!("#{e:?}"), Data::DateTime(dt) => format!("{dt}"), Data::DateTimeIso(s) => s.clone(), Data::DurationIso(s) => s.clone(), } } /// Format a float, dropping trailing `.0` for clean integer display. fn format_float(f: f64) -> String { if f.fract() == 0.0 && f.abs() < i64::MAX as f64 { format!("{}", f as i64) } else { format!("{f}") } } /// Extract CSV text and convert to markdown table. fn extract_csv(bytes: &[u8]) -> Result { let text = String::from_utf8_lossy(bytes); let rows = parse_csv_rows(&text); if rows.is_empty() { return Ok("(empty CSV)".to_string()); } Ok(rows_to_markdown_table(&rows)) } /// Parse CSV text into rows of fields, handling quoted fields with commas/newlines. fn parse_csv_rows(text: &str) -> Vec> { let mut rows: Vec> = Vec::new(); let mut current_row: Vec = Vec::new(); let mut current_field = String::new(); let mut in_quotes = false; let mut chars = text.chars().peekable(); while let Some(ch) = chars.next() { if in_quotes { if ch == '"' { // Escaped quote ("") or end of quoted field if chars.peek() == Some(&'"') { chars.next(); current_field.push('"'); } else { in_quotes = false; } } else { current_field.push(ch); } } else { match ch { '"' => in_quotes = true, ',' => { current_row.push(current_field.trim().to_string()); current_field = String::new(); } '\n' => { current_row.push(current_field.trim().to_string()); current_field = String::new(); if !current_row.iter().all(|f| f.is_empty()) { rows.push(current_row); } current_row = Vec::new(); } '\r' => { // Skip carriage returns (handled with \n) } _ => current_field.push(ch), } } } // Flush last field/row if !current_field.is_empty() || !current_row.is_empty() { current_row.push(current_field.trim().to_string()); if !current_row.iter().all(|f| f.is_empty()) { rows.push(current_row); } } rows } /// Convert rows (first row = header) into a markdown table. fn rows_to_markdown_table(rows: &[Vec]) -> String { if rows.is_empty() { return String::new(); } // Find the max column count across all rows let col_count = rows.iter().map(|r| r.len()).max().unwrap_or(0); if col_count == 0 { return String::new(); } let mut lines: Vec = Vec::new(); // Header row let header = &rows[0]; let header_cells: Vec<&str> = (0..col_count) .map(|i| header.get(i).map(|s| s.as_str()).unwrap_or("")) .collect(); lines.push(format!("| {} |", header_cells.join(" | "))); // Separator row let sep: Vec<&str> = vec!["---"; col_count]; lines.push(format!("| {} |", sep.join(" | "))); // Data rows for row in &rows[1..] { let cells: Vec<&str> = (0..col_count) .map(|i| row.get(i).map(|s| s.as_str()).unwrap_or("")) .collect(); lines.push(format!("| {} |", cells.join(" | "))); } lines.join("\n") } /// Strip markdown formatting to get plain text. fn strip_markdown_formatting(markdown: &str) -> String { let mut plain = String::with_capacity(markdown.len()); for line in markdown.lines() { let trimmed = line.trim_start_matches('#').trim(); if trimmed.starts_with("| ---") || trimmed == "|---|" { continue; // Skip separator rows } if let Some(stripped) = trimmed.strip_prefix('|') && let Some(stripped) = stripped.strip_suffix('|') { // Table row: join cells with spaces let cells: Vec<&str> = stripped.split('|').map(|c| c.trim()).collect(); plain.push_str(&cells.join(" ")); plain.push('\n'); continue; } plain.push_str(trimmed); plain.push('\n'); } plain.trim().to_string() } #[cfg(test)] mod tests { use super::*; use webclaw_http::HeaderMap; fn headers_with(name: &str, value: &str) -> HeaderMap { let mut h = HeaderMap::new(); h.insert( name.parse::().unwrap(), value.parse().unwrap(), ); h } // --- Content-type detection --- #[test] fn test_detect_docx_content_type() { let headers = headers_with( "content-type", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), Some(DocType::Docx) ); } #[test] fn test_detect_xlsx_content_type() { let headers = headers_with( "content-type", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), Some(DocType::Xlsx) ); } #[test] fn test_detect_xls_content_type() { let headers = headers_with("content-type", "application/vnd.ms-excel"); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), Some(DocType::Xls) ); } #[test] fn test_detect_csv_content_type() { let headers = headers_with("content-type", "text/csv"); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), Some(DocType::Csv) ); } #[test] fn test_detect_csv_content_type_with_charset() { let headers = headers_with("content-type", "text/csv; charset=utf-8"); assert_eq!( is_document_content_type(&headers, "https://example.com/file"), Some(DocType::Csv) ); } #[test] fn test_detect_by_url_extension() { let empty = HeaderMap::new(); assert_eq!( is_document_content_type(&empty, "https://example.com/report.docx"), Some(DocType::Docx) ); assert_eq!( is_document_content_type(&empty, "https://example.com/data.xlsx"), Some(DocType::Xlsx) ); assert_eq!( is_document_content_type(&empty, "https://example.com/old.xls"), Some(DocType::Xls) ); assert_eq!( is_document_content_type(&empty, "https://example.com/data.csv"), Some(DocType::Csv) ); } #[test] fn test_detect_url_extension_with_query() { let empty = HeaderMap::new(); assert_eq!( is_document_content_type(&empty, "https://example.com/report.docx?token=abc"), Some(DocType::Docx) ); } #[test] fn test_detect_url_extension_case_insensitive() { let empty = HeaderMap::new(); assert_eq!( is_document_content_type(&empty, "https://example.com/FILE.XLSX"), Some(DocType::Xlsx) ); } #[test] fn test_detect_none_for_html() { let headers = headers_with("content-type", "text/html"); assert_eq!( is_document_content_type(&headers, "https://example.com/page"), None ); } #[test] fn test_content_type_takes_precedence_over_url() { let headers = headers_with("content-type", "text/csv"); // URL says .xlsx but Content-Type says CSV — header wins assert_eq!( is_document_content_type(&headers, "https://example.com/data.xlsx"), Some(DocType::Csv) ); } // --- CSV parsing --- #[test] fn test_csv_simple() { let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n"; let result = extract_csv(csv.as_bytes()).unwrap(); assert!(result.contains("| Name | Age | City |")); assert!(result.contains("| --- | --- | --- |")); assert!(result.contains("| Alice | 30 | NYC |")); assert!(result.contains("| Bob | 25 | LA |")); } #[test] fn test_csv_quoted_fields() { let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n"; let result = extract_csv(csv.as_bytes()).unwrap(); assert!(result.contains("Has a, comma")); assert!(result.contains("Said \"hello\"")); } #[test] fn test_csv_empty() { let result = extract_csv(b"").unwrap(); assert_eq!(result, "(empty CSV)"); } #[test] fn test_csv_windows_line_endings() { let csv = "A,B\r\n1,2\r\n3,4\r\n"; let result = extract_csv(csv.as_bytes()).unwrap(); assert!(result.contains("| A | B |")); assert!(result.contains("| 1 | 2 |")); } // --- DOCX XML parsing --- #[test] fn test_docx_xml_simple_paragraphs() { let xml = r#" Hello world Second paragraph "#; let result = parse_docx_xml(xml).unwrap(); assert_eq!(result, "Hello world\n\nSecond paragraph"); } #[test] fn test_docx_xml_headings() { let xml = r#" Title Body text Subtitle "#; let result = parse_docx_xml(xml).unwrap(); assert!(result.contains("# Title")); assert!(result.contains("Body text")); assert!(result.contains("## Subtitle")); } #[test] fn test_docx_xml_multiple_runs() { let xml = r#" Hello world "#; let result = parse_docx_xml(xml).unwrap(); assert_eq!(result, "Hello world"); } #[test] fn test_docx_xml_empty_paragraphs_skipped() { let xml = r#" Content "#; let result = parse_docx_xml(xml).unwrap(); assert_eq!(result, "Content"); } // --- Markdown table --- #[test] fn test_rows_to_markdown_table() { let rows = vec![ vec!["A".to_string(), "B".to_string()], vec!["1".to_string(), "2".to_string()], vec!["3".to_string(), "4".to_string()], ]; let table = rows_to_markdown_table(&rows); assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |"); } #[test] fn test_rows_to_markdown_table_ragged() { let rows = vec![ vec!["A".to_string(), "B".to_string(), "C".to_string()], vec!["1".to_string()], // fewer columns ]; let table = rows_to_markdown_table(&rows); assert!(table.contains("| 1 | | |")); } // --- Extract result --- #[test] fn test_extract_csv_result() { let csv = "Name,Score\nAlice,100\n"; let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap(); assert!(result.content.markdown.contains("| Name | Score |")); assert!(result.metadata.word_count > 0); assert!(result.content.links.is_empty()); assert!(result.domain_data.is_none()); } // --- Strip markdown --- #[test] fn test_strip_markdown() { let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |"; let plain = strip_markdown_formatting(md); assert!(plain.contains("Title")); assert!(plain.contains("Some text")); assert!(plain.contains("A B")); assert!(!plain.contains("---")); } }