mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-29 03:39:37 +02:00
feat: v0.2.0 — DOCX/XLSX/CSV extraction, HTML format, multi-URL watch, batch LLM
Document extraction:
- DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml)
- XLSX/XLS: markdown tables with multi-sheet support (via calamine)
- CSV: quoted field handling, markdown table output
- All auto-detected by Content-Type header or URL extension

New features:
- -f html output format (sanitized HTML)
- Multi-URL watch: --urls-file + --watch monitors all URLs in parallel
- Batch + LLM: --extract-prompt/--extract-json works with multiple URLs
- Mixed batch: HTML pages + DOCX + XLSX + CSV in one command

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0e4128782a
commit
ea14848772
8 changed files with 1520 additions and 41 deletions
|
|
@@ -399,6 +399,27 @@ impl FetchClient {
|
|||
|
||||
let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
|
||||
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
|
||||
} else if let Some(doc_type) =
|
||||
crate::document::is_document_content_type(&headers, &final_url)
|
||||
{
|
||||
debug!(status, doc_type = ?doc_type, "detected document response, extracting");
|
||||
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
debug!(
|
||||
status,
|
||||
bytes = bytes.len(),
|
||||
elapsed_ms = %elapsed.as_millis(),
|
||||
"document fetch complete"
|
||||
);
|
||||
|
||||
let mut result = crate::document::extract_document(&bytes, doc_type)?;
|
||||
result.metadata.url = Some(final_url);
|
||||
Ok(result)
|
||||
} else {
|
||||
let html = response
|
||||
.text()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue