mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-05-13 17:02:36 +02:00
Document extraction: - DOCX: auto-detected, outputs markdown with headings (via zip + quick-xml) - XLSX/XLS: markdown tables with multi-sheet support (via calamine) - CSV: quoted field handling, markdown table output - All auto-detected by Content-Type header or URL extension New features: - -f html output format (sanitized HTML) - Multi-URL watch: --urls-file + --watch monitors all URLs in parallel - Batch + LLM: --extract-prompt/--extract-json works with multiple URLs - Mixed batch: HTML pages + DOCX + XLSX + CSV in one command Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
26 lines
768 B
TOML
26 lines
768 B
TOML
# Manifest for the webclaw-fetch crate. Version, edition and license are
# inherited from the workspace root manifest.
[package]
name = "webclaw-fetch"
version.workspace = true
edition.workspace = true
license.workspace = true
# NOTE(review): the description names "Impit", but the HTTP client declared
# under [dependencies] is `primp` — confirm which is current and update the
# text if it is stale.
description = "HTTP client with browser TLS fingerprint impersonation via Impit"

# Runtime dependencies. Entries marked `workspace = true` take their version
# and features from the workspace root manifest.
[dependencies]
webclaw-core = { workspace = true }
# NOTE(review): referenced by relative path while webclaw-core uses workspace
# inheritance — confirm whether this difference is intentional.
webclaw-pdf = { path = "../webclaw-pdf" }
serde = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
# HTTP client built with the "impersonate" feature. Default features are
# disabled, so every capability needed must be listed explicitly.
# NOTE(review): no `rev`, `tag` or `branch` is given for this git source, so
# resolution follows the repository's default branch — consider pinning a
# `rev` for reproducible builds.
primp = { git = "https://github.com/deedy5/primp", default-features = false, features = [
    "default-tls",
    "http2",
    "impersonate",
    "cookies",
    "gzip",
    "brotli",
    "deflate",
    "zstd",
    "socks",
] }
url = "2"
rand = "0.8"
# Document extraction: DOCX is read via zip + quick-xml, XLSX/XLS via calamine.
quick-xml = { version = "0.37", features = ["serde"] }
serde_json = { workspace = true }
calamine = "0.34"
zip = "2"

# Dependencies used only when building tests, examples and benchmarks.
[dev-dependencies]
tempfile = "3"