mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-05-15 18:25:24 +02:00
feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM
Document extraction: - DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml) - XLSX/XLS: markdown tables with multi-sheet support (via calamine) - CSV: quoted field handling, markdown table output - All auto-detected by Content-Type header or URL extension New features: - -f html output format (sanitized HTML) - Multi-URL watch: --urls-file + --watch monitors all URLs in parallel - Batch + LLM: --extract-prompt/--extract-json works with multiple URLs - Mixed batch: HTML pages + DOCX + XLSX + CSV in one command Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0e4128782a
commit
ea14848772
8 changed files with 1520 additions and 41 deletions
|
|
@ -399,6 +399,27 @@ impl FetchClient {
|
|||
|
||||
let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
|
||||
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
|
||||
} else if let Some(doc_type) =
|
||||
crate::document::is_document_content_type(&headers, &final_url)
|
||||
{
|
||||
debug!(status, doc_type = ?doc_type, "detected document response, extracting");
|
||||
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
debug!(
|
||||
status,
|
||||
bytes = bytes.len(),
|
||||
elapsed_ms = %elapsed.as_millis(),
|
||||
"document fetch complete"
|
||||
);
|
||||
|
||||
let mut result = crate::document::extract_document(&bytes, doc_type)?;
|
||||
result.metadata.url = Some(final_url);
|
||||
Ok(result)
|
||||
} else {
|
||||
let html = response
|
||||
.text()
|
||||
|
|
|
|||
743
crates/webclaw-fetch/src/document.rs
Normal file
743
crates/webclaw-fetch/src/document.rs
Normal file
|
|
@ -0,0 +1,743 @@
|
|||
/// Document extraction for DOCX, XLSX, XLS, and CSV files.
|
||||
/// Auto-detects document type from Content-Type headers or URL extension,
|
||||
/// then extracts text content as markdown — same pattern as PDF extraction.
|
||||
use std::collections::HashMap;
|
||||
use std::io::{Cursor, Read};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
use crate::error::FetchError;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum DocType {
|
||||
Docx,
|
||||
Xlsx,
|
||||
Xls,
|
||||
Csv,
|
||||
}
|
||||
|
||||
impl DocType {
|
||||
fn label(self) -> &'static str {
|
||||
match self {
|
||||
DocType::Docx => "DOCX",
|
||||
DocType::Xlsx => "XLSX",
|
||||
DocType::Xls => "XLS",
|
||||
DocType::Csv => "CSV",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect document type from response headers or URL extension.
|
||||
/// Returns `None` for non-document responses (HTML, PDF, etc.).
|
||||
pub fn is_document_content_type(headers: &HashMap<String, String>, url: &str) -> Option<DocType> {
|
||||
// Check Content-Type header first
|
||||
if let Some(ct) = headers.get("content-type") {
|
||||
let mime = ct.split(';').next().unwrap_or("").trim();
|
||||
|
||||
if mime.eq_ignore_ascii_case(
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
) {
|
||||
return Some(DocType::Docx);
|
||||
}
|
||||
if mime.eq_ignore_ascii_case(
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
) {
|
||||
return Some(DocType::Xlsx);
|
||||
}
|
||||
if mime.eq_ignore_ascii_case("application/vnd.ms-excel") {
|
||||
return Some(DocType::Xls);
|
||||
}
|
||||
if mime.eq_ignore_ascii_case("text/csv") {
|
||||
return Some(DocType::Csv);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to URL extension
|
||||
let path = url.split('?').next().unwrap_or(url);
|
||||
let lower = path.to_ascii_lowercase();
|
||||
|
||||
if lower.ends_with(".docx") {
|
||||
return Some(DocType::Docx);
|
||||
}
|
||||
if lower.ends_with(".xlsx") {
|
||||
return Some(DocType::Xlsx);
|
||||
}
|
||||
if lower.ends_with(".xls") {
|
||||
return Some(DocType::Xls);
|
||||
}
|
||||
if lower.ends_with(".csv") {
|
||||
return Some(DocType::Csv);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Extract text content from document bytes, returning an ExtractionResult.
|
||||
pub fn extract_document(
|
||||
bytes: &[u8],
|
||||
doc_type: DocType,
|
||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||
debug!(
|
||||
doc_type = doc_type.label(),
|
||||
bytes = bytes.len(),
|
||||
"extracting document"
|
||||
);
|
||||
|
||||
let markdown = match doc_type {
|
||||
DocType::Docx => extract_docx(bytes)?,
|
||||
DocType::Xlsx => extract_xlsx(bytes)?,
|
||||
DocType::Xls => extract_xls(bytes)?,
|
||||
DocType::Csv => extract_csv(bytes)?,
|
||||
};
|
||||
|
||||
let plain_text = strip_markdown_formatting(&markdown);
|
||||
let word_count = plain_text.split_whitespace().count();
|
||||
|
||||
Ok(webclaw_core::ExtractionResult {
|
||||
metadata: webclaw_core::Metadata {
|
||||
title: None,
|
||||
description: None,
|
||||
author: None,
|
||||
published_date: None,
|
||||
language: None,
|
||||
url: None,
|
||||
site_name: None,
|
||||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
},
|
||||
content: webclaw_core::Content {
|
||||
markdown,
|
||||
plain_text,
|
||||
links: Vec::new(),
|
||||
images: Vec::new(),
|
||||
code_blocks: Vec::new(),
|
||||
raw_html: None,
|
||||
},
|
||||
domain_data: None,
|
||||
structured_data: vec![],
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract text from a DOCX file (ZIP of XML).
|
||||
/// Reads `word/document.xml`, extracts `<w:t>` text nodes, detects heading styles.
|
||||
fn extract_docx(bytes: &[u8]) -> Result<String, FetchError> {
|
||||
let cursor = Cursor::new(bytes);
|
||||
let mut archive =
|
||||
zip::ZipArchive::new(cursor).map_err(|e| FetchError::Build(format!("DOCX zip: {e}")))?;
|
||||
|
||||
let xml = {
|
||||
let mut file = archive
|
||||
.by_name("word/document.xml")
|
||||
.map_err(|e| FetchError::Build(format!("DOCX missing document.xml: {e}")))?;
|
||||
let mut buf = String::new();
|
||||
file.read_to_string(&mut buf)
|
||||
.map_err(|e| FetchError::BodyDecode(format!("DOCX read: {e}")))?;
|
||||
buf
|
||||
};
|
||||
|
||||
parse_docx_xml(&xml)
|
||||
}
|
||||
|
||||
/// Parse DOCX XML (word/document.xml) into markdown.
|
||||
///
|
||||
/// Walks the XML looking for paragraph elements (`<w:p>`). Within each paragraph,
|
||||
/// collects text from `<w:t>` tags and detects heading styles from `<w:pStyle>`.
|
||||
fn parse_docx_xml(xml: &str) -> Result<String, FetchError> {
|
||||
use quick_xml::Reader;
|
||||
use quick_xml::events::Event;
|
||||
|
||||
let mut reader = Reader::from_str(xml);
|
||||
let mut paragraphs: Vec<String> = Vec::new();
|
||||
|
||||
// State tracking for the current paragraph
|
||||
let mut in_paragraph = false;
|
||||
let mut in_run = false; // inside <w:r> (run)
|
||||
let mut in_text = false; // inside <w:t>
|
||||
let mut current_text = String::new();
|
||||
let mut heading_level: Option<u8> = 0.into(); // None = normal paragraph
|
||||
let mut in_ppr = false; // inside <w:pPr> (paragraph properties)
|
||||
|
||||
loop {
|
||||
match reader.read_event() {
|
||||
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
|
||||
let name_bytes = e.name().as_ref().to_vec();
|
||||
let local = local_name(&name_bytes);
|
||||
match local {
|
||||
b"p" if is_w_namespace(&name_bytes) => {
|
||||
in_paragraph = true;
|
||||
current_text.clear();
|
||||
heading_level = None;
|
||||
}
|
||||
b"pPr" if in_paragraph => in_ppr = true,
|
||||
b"pStyle" if in_ppr => {
|
||||
heading_level = extract_heading_level(e);
|
||||
}
|
||||
b"r" if in_paragraph => in_run = true,
|
||||
b"t" if in_run => in_text = true,
|
||||
b"br" if in_paragraph => {
|
||||
current_text.push('\n');
|
||||
}
|
||||
b"tab" if in_paragraph => {
|
||||
current_text.push('\t');
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) => {
|
||||
let name_bytes = e.name().as_ref().to_vec();
|
||||
let local = local_name(&name_bytes);
|
||||
match local {
|
||||
b"p" if in_paragraph => {
|
||||
let text = current_text.trim().to_string();
|
||||
if !text.is_empty() {
|
||||
let formatted = match heading_level {
|
||||
Some(1) => format!("# {text}"),
|
||||
Some(2) => format!("## {text}"),
|
||||
Some(3) => format!("### {text}"),
|
||||
Some(4) => format!("#### {text}"),
|
||||
Some(5) => format!("##### {text}"),
|
||||
Some(6) => format!("###### {text}"),
|
||||
_ => text,
|
||||
};
|
||||
paragraphs.push(formatted);
|
||||
}
|
||||
in_paragraph = false;
|
||||
}
|
||||
b"pPr" => in_ppr = false,
|
||||
b"r" => {
|
||||
in_run = false;
|
||||
in_text = false;
|
||||
}
|
||||
b"t" => in_text = false,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(ref e)) if in_text => {
|
||||
if let Ok(text) = e.unescape() {
|
||||
current_text.push_str(&text);
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(e) => {
|
||||
return Err(FetchError::Build(format!("DOCX XML parse error: {e}")));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(paragraphs.join("\n\n"))
|
||||
}
|
||||
|
||||
/// Check if a qualified name belongs to the `w:` (wordprocessingML) namespace.
|
||||
/// Handles both `w:p` (prefixed) and just `p` (default namespace) forms.
|
||||
fn is_w_namespace(name: &[u8]) -> bool {
|
||||
// quick-xml gives us the full name bytes. Accept both "w:p" and "p".
|
||||
name == b"w:p" || name == b"p"
|
||||
}
|
||||
|
||||
/// Extract the local name from a possibly namespaced XML tag.
|
||||
/// `w:p` -> `p`, `p` -> `p`
|
||||
fn local_name(name: &[u8]) -> &[u8] {
|
||||
match name.iter().position(|&b| b == b':') {
|
||||
Some(pos) => &name[pos + 1..],
|
||||
None => name,
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract heading level from a `<w:pStyle w:val="Heading1"/>` element.
|
||||
fn extract_heading_level(e: &quick_xml::events::BytesStart) -> Option<u8> {
|
||||
for attr in e.attributes().flatten() {
|
||||
let local = local_name(attr.key.as_ref());
|
||||
if local == b"val" {
|
||||
let val = String::from_utf8_lossy(&attr.value);
|
||||
let lower = val.to_ascii_lowercase();
|
||||
|
||||
// Match "heading1", "heading2", etc. and "title" -> h1
|
||||
if lower == "title" {
|
||||
return Some(1);
|
||||
}
|
||||
if let Some(rest) = lower.strip_prefix("heading")
|
||||
&& let Ok(n) = rest.parse::<u8>()
|
||||
{
|
||||
return Some(n.min(6));
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Extract spreadsheet content using calamine (XLSX format).
|
||||
fn extract_xlsx(bytes: &[u8]) -> Result<String, FetchError> {
|
||||
extract_spreadsheet(bytes, "XLSX")
|
||||
}
|
||||
|
||||
/// Extract spreadsheet content using calamine (XLS format).
|
||||
fn extract_xls(bytes: &[u8]) -> Result<String, FetchError> {
|
||||
extract_spreadsheet(bytes, "XLS")
|
||||
}
|
||||
|
||||
/// Shared spreadsheet extraction for both XLSX and XLS via calamine.
|
||||
/// Reads all sheets and formats each as a markdown table.
|
||||
fn extract_spreadsheet(bytes: &[u8], label: &str) -> Result<String, FetchError> {
|
||||
use calamine::Reader;
|
||||
|
||||
let cursor = Cursor::new(bytes);
|
||||
let mut workbook: calamine::Sheets<_> = calamine::open_workbook_auto_from_rs(cursor)
|
||||
.map_err(|e| FetchError::Build(format!("{label} open: {e}")))?;
|
||||
|
||||
let sheet_names: Vec<String> = workbook.sheet_names().to_vec();
|
||||
let mut sections: Vec<String> = Vec::new();
|
||||
|
||||
for name in &sheet_names {
|
||||
let range = workbook
|
||||
.worksheet_range(name)
|
||||
.map_err(|e| FetchError::Build(format!("{label} sheet '{name}': {e}")))?;
|
||||
|
||||
let rows: Vec<Vec<String>> = range
|
||||
.rows()
|
||||
.map(|row| row.iter().map(cell_to_string).collect())
|
||||
.collect();
|
||||
|
||||
if rows.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut section = format!("## Sheet: {name}\n\n");
|
||||
section.push_str(&rows_to_markdown_table(&rows));
|
||||
sections.push(section);
|
||||
}
|
||||
|
||||
if sections.is_empty() {
|
||||
return Ok("(empty spreadsheet)".to_string());
|
||||
}
|
||||
|
||||
Ok(sections.join("\n\n"))
|
||||
}
|
||||
|
||||
/// Convert a calamine cell value to a display string.
|
||||
fn cell_to_string(cell: &calamine::Data) -> String {
|
||||
use calamine::Data;
|
||||
match cell {
|
||||
Data::Empty => String::new(),
|
||||
Data::String(s) => s.clone(),
|
||||
Data::Int(n) => n.to_string(),
|
||||
Data::Float(f) => format_float(*f),
|
||||
Data::Bool(b) => b.to_string(),
|
||||
Data::Error(e) => format!("#{e:?}"),
|
||||
Data::DateTime(dt) => format!("{dt}"),
|
||||
Data::DateTimeIso(s) => s.clone(),
|
||||
Data::DurationIso(s) => s.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Format a float, dropping trailing `.0` for clean integer display.
|
||||
fn format_float(f: f64) -> String {
|
||||
if f.fract() == 0.0 && f.abs() < i64::MAX as f64 {
|
||||
format!("{}", f as i64)
|
||||
} else {
|
||||
format!("{f}")
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract CSV text and convert to markdown table.
|
||||
fn extract_csv(bytes: &[u8]) -> Result<String, FetchError> {
|
||||
let text = String::from_utf8_lossy(bytes);
|
||||
let rows = parse_csv_rows(&text);
|
||||
|
||||
if rows.is_empty() {
|
||||
return Ok("(empty CSV)".to_string());
|
||||
}
|
||||
|
||||
Ok(rows_to_markdown_table(&rows))
|
||||
}
|
||||
|
||||
/// Parse CSV text into rows of fields, handling quoted fields with commas/newlines.
|
||||
fn parse_csv_rows(text: &str) -> Vec<Vec<String>> {
|
||||
let mut rows: Vec<Vec<String>> = Vec::new();
|
||||
let mut current_row: Vec<String> = Vec::new();
|
||||
let mut current_field = String::new();
|
||||
let mut in_quotes = false;
|
||||
let mut chars = text.chars().peekable();
|
||||
|
||||
while let Some(ch) = chars.next() {
|
||||
if in_quotes {
|
||||
if ch == '"' {
|
||||
// Escaped quote ("") or end of quoted field
|
||||
if chars.peek() == Some(&'"') {
|
||||
chars.next();
|
||||
current_field.push('"');
|
||||
} else {
|
||||
in_quotes = false;
|
||||
}
|
||||
} else {
|
||||
current_field.push(ch);
|
||||
}
|
||||
} else {
|
||||
match ch {
|
||||
'"' => in_quotes = true,
|
||||
',' => {
|
||||
current_row.push(current_field.trim().to_string());
|
||||
current_field = String::new();
|
||||
}
|
||||
'\n' => {
|
||||
current_row.push(current_field.trim().to_string());
|
||||
current_field = String::new();
|
||||
if !current_row.iter().all(|f| f.is_empty()) {
|
||||
rows.push(current_row);
|
||||
}
|
||||
current_row = Vec::new();
|
||||
}
|
||||
'\r' => {
|
||||
// Skip carriage returns (handled with \n)
|
||||
}
|
||||
_ => current_field.push(ch),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Flush last field/row
|
||||
if !current_field.is_empty() || !current_row.is_empty() {
|
||||
current_row.push(current_field.trim().to_string());
|
||||
if !current_row.iter().all(|f| f.is_empty()) {
|
||||
rows.push(current_row);
|
||||
}
|
||||
}
|
||||
|
||||
rows
|
||||
}
|
||||
|
||||
/// Convert rows (first row = header) into a markdown table.
|
||||
fn rows_to_markdown_table(rows: &[Vec<String>]) -> String {
|
||||
if rows.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
// Find the max column count across all rows
|
||||
let col_count = rows.iter().map(|r| r.len()).max().unwrap_or(0);
|
||||
if col_count == 0 {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
let mut lines: Vec<String> = Vec::new();
|
||||
|
||||
// Header row
|
||||
let header = &rows[0];
|
||||
let header_cells: Vec<&str> = (0..col_count)
|
||||
.map(|i| header.get(i).map(|s| s.as_str()).unwrap_or(""))
|
||||
.collect();
|
||||
lines.push(format!("| {} |", header_cells.join(" | ")));
|
||||
|
||||
// Separator row
|
||||
let sep: Vec<&str> = vec!["---"; col_count];
|
||||
lines.push(format!("| {} |", sep.join(" | ")));
|
||||
|
||||
// Data rows
|
||||
for row in &rows[1..] {
|
||||
let cells: Vec<&str> = (0..col_count)
|
||||
.map(|i| row.get(i).map(|s| s.as_str()).unwrap_or(""))
|
||||
.collect();
|
||||
lines.push(format!("| {} |", cells.join(" | ")));
|
||||
}
|
||||
|
||||
lines.join("\n")
|
||||
}
|
||||
|
||||
/// Strip markdown formatting to get plain text.
|
||||
fn strip_markdown_formatting(markdown: &str) -> String {
|
||||
let mut plain = String::with_capacity(markdown.len());
|
||||
for line in markdown.lines() {
|
||||
let trimmed = line.trim_start_matches('#').trim();
|
||||
if trimmed.starts_with("| ---") || trimmed == "|---|" {
|
||||
continue; // Skip separator rows
|
||||
}
|
||||
if let Some(stripped) = trimmed.strip_prefix('|')
|
||||
&& let Some(stripped) = stripped.strip_suffix('|')
|
||||
{
|
||||
// Table row: join cells with spaces
|
||||
let cells: Vec<&str> = stripped.split('|').map(|c| c.trim()).collect();
|
||||
plain.push_str(&cells.join(" "));
|
||||
plain.push('\n');
|
||||
continue;
|
||||
}
|
||||
plain.push_str(trimmed);
|
||||
plain.push('\n');
|
||||
}
|
||||
plain.trim().to_string()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// --- Content-type detection ---
|
||||
|
||||
#[test]
|
||||
fn test_detect_docx_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string(),
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
Some(DocType::Docx)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_xlsx_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string(),
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
Some(DocType::Xlsx)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_xls_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"application/vnd.ms-excel".to_string(),
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
Some(DocType::Xls)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_csv_content_type() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert("content-type".to_string(), "text/csv".to_string());
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
Some(DocType::Csv)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_csv_content_type_with_charset() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert(
|
||||
"content-type".to_string(),
|
||||
"text/csv; charset=utf-8".to_string(),
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/file"),
|
||||
Some(DocType::Csv)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_by_url_extension() {
|
||||
let empty: HashMap<String, String> = HashMap::new();
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/report.docx"),
|
||||
Some(DocType::Docx)
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/data.xlsx"),
|
||||
Some(DocType::Xlsx)
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/old.xls"),
|
||||
Some(DocType::Xls)
|
||||
);
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/data.csv"),
|
||||
Some(DocType::Csv)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_url_extension_with_query() {
|
||||
let empty: HashMap<String, String> = HashMap::new();
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/report.docx?token=abc"),
|
||||
Some(DocType::Docx)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_url_extension_case_insensitive() {
|
||||
let empty: HashMap<String, String> = HashMap::new();
|
||||
assert_eq!(
|
||||
is_document_content_type(&empty, "https://example.com/FILE.XLSX"),
|
||||
Some(DocType::Xlsx)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_none_for_html() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert("content-type".to_string(), "text/html".to_string());
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/page"),
|
||||
None
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_content_type_takes_precedence_over_url() {
|
||||
let mut headers = HashMap::new();
|
||||
headers.insert("content-type".to_string(), "text/csv".to_string());
|
||||
// URL says .xlsx but Content-Type says CSV — header wins
|
||||
assert_eq!(
|
||||
is_document_content_type(&headers, "https://example.com/data.xlsx"),
|
||||
Some(DocType::Csv)
|
||||
);
|
||||
}
|
||||
|
||||
// --- CSV parsing ---
|
||||
|
||||
#[test]
|
||||
fn test_csv_simple() {
|
||||
let csv = "Name,Age,City\nAlice,30,NYC\nBob,25,LA\n";
|
||||
let result = extract_csv(csv.as_bytes()).unwrap();
|
||||
assert!(result.contains("| Name | Age | City |"));
|
||||
assert!(result.contains("| --- | --- | --- |"));
|
||||
assert!(result.contains("| Alice | 30 | NYC |"));
|
||||
assert!(result.contains("| Bob | 25 | LA |"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_csv_quoted_fields() {
|
||||
let csv = "Name,Description\nAlice,\"Has a, comma\"\nBob,\"Said \"\"hello\"\"\"\n";
|
||||
let result = extract_csv(csv.as_bytes()).unwrap();
|
||||
assert!(result.contains("Has a, comma"));
|
||||
assert!(result.contains("Said \"hello\""));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_csv_empty() {
|
||||
let result = extract_csv(b"").unwrap();
|
||||
assert_eq!(result, "(empty CSV)");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_csv_windows_line_endings() {
|
||||
let csv = "A,B\r\n1,2\r\n3,4\r\n";
|
||||
let result = extract_csv(csv.as_bytes()).unwrap();
|
||||
assert!(result.contains("| A | B |"));
|
||||
assert!(result.contains("| 1 | 2 |"));
|
||||
}
|
||||
|
||||
// --- DOCX XML parsing ---
|
||||
|
||||
#[test]
|
||||
fn test_docx_xml_simple_paragraphs() {
|
||||
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p><w:r><w:t>Hello world</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
|
||||
</w:body>
|
||||
</w:document>"#;
|
||||
let result = parse_docx_xml(xml).unwrap();
|
||||
assert_eq!(result, "Hello world\n\nSecond paragraph");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_docx_xml_headings() {
|
||||
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p>
|
||||
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
|
||||
<w:r><w:t>Title</w:t></w:r>
|
||||
</w:p>
|
||||
<w:p><w:r><w:t>Body text</w:t></w:r></w:p>
|
||||
<w:p>
|
||||
<w:pPr><w:pStyle w:val="Heading2"/></w:pPr>
|
||||
<w:r><w:t>Subtitle</w:t></w:r>
|
||||
</w:p>
|
||||
</w:body>
|
||||
</w:document>"#;
|
||||
let result = parse_docx_xml(xml).unwrap();
|
||||
assert!(result.contains("# Title"));
|
||||
assert!(result.contains("Body text"));
|
||||
assert!(result.contains("## Subtitle"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_docx_xml_multiple_runs() {
|
||||
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p>
|
||||
<w:r><w:t>Hello </w:t></w:r>
|
||||
<w:r><w:t>world</w:t></w:r>
|
||||
</w:p>
|
||||
</w:body>
|
||||
</w:document>"#;
|
||||
let result = parse_docx_xml(xml).unwrap();
|
||||
assert_eq!(result, "Hello world");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_docx_xml_empty_paragraphs_skipped() {
|
||||
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:body>
|
||||
<w:p></w:p>
|
||||
<w:p><w:r><w:t>Content</w:t></w:r></w:p>
|
||||
<w:p><w:r><w:t> </w:t></w:r></w:p>
|
||||
</w:body>
|
||||
</w:document>"#;
|
||||
let result = parse_docx_xml(xml).unwrap();
|
||||
assert_eq!(result, "Content");
|
||||
}
|
||||
|
||||
// --- Markdown table ---
|
||||
|
||||
#[test]
|
||||
fn test_rows_to_markdown_table() {
|
||||
let rows = vec![
|
||||
vec!["A".to_string(), "B".to_string()],
|
||||
vec!["1".to_string(), "2".to_string()],
|
||||
vec!["3".to_string(), "4".to_string()],
|
||||
];
|
||||
let table = rows_to_markdown_table(&rows);
|
||||
assert_eq!(table, "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_rows_to_markdown_table_ragged() {
|
||||
let rows = vec![
|
||||
vec!["A".to_string(), "B".to_string(), "C".to_string()],
|
||||
vec!["1".to_string()], // fewer columns
|
||||
];
|
||||
let table = rows_to_markdown_table(&rows);
|
||||
assert!(table.contains("| 1 | | |"));
|
||||
}
|
||||
|
||||
// --- Extract result ---
|
||||
|
||||
#[test]
|
||||
fn test_extract_csv_result() {
|
||||
let csv = "Name,Score\nAlice,100\n";
|
||||
let result = extract_document(csv.as_bytes(), DocType::Csv).unwrap();
|
||||
assert!(result.content.markdown.contains("| Name | Score |"));
|
||||
assert!(result.metadata.word_count > 0);
|
||||
assert!(result.content.links.is_empty());
|
||||
assert!(result.domain_data.is_none());
|
||||
}
|
||||
|
||||
// --- Strip markdown ---
|
||||
|
||||
#[test]
|
||||
fn test_strip_markdown() {
|
||||
let md = "# Title\n\nSome text\n\n| A | B |\n| --- | --- |\n| 1 | 2 |";
|
||||
let plain = strip_markdown_formatting(md);
|
||||
assert!(plain.contains("Title"));
|
||||
assert!(plain.contains("Some text"));
|
||||
assert!(plain.contains("A B"));
|
||||
assert!(!plain.contains("---"));
|
||||
}
|
||||
}
|
||||
|
|
@ -5,6 +5,7 @@
|
|||
pub mod browser;
|
||||
pub mod client;
|
||||
pub mod crawler;
|
||||
pub mod document;
|
||||
pub mod error;
|
||||
pub mod linkedin;
|
||||
pub mod proxy;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue