webclaw/crates/webclaw-fetch/Cargo.toml
webclaw b7bd1155c6 feat(map): layered URL discovery with crawl fallback
map falls back to a bounded same-origin crawl when a site has no sitemap
or a thin one, harvesting links from each fetched page (the rich source).
Adds gzip (.xml.gz) sitemap support, deeper sitemap-index recursion + more
fallback paths, uncapped-by-default results with an optional --map-limit /
--map-pages, and routes crawler logs to stderr so --map -f json stays
machine-parseable.
2026-06-06 12:08:26 +02:00

35 lines
911 B
TOML

[package]
name = "webclaw-fetch"
# Version, edition, license and minimum Rust version are all inherited from
# the workspace root manifest so every crate in the workspace stays in lockstep.
version.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
description = "HTTP client with browser TLS fingerprint impersonation via wreq"

# Use the lint configuration declared once in the workspace root manifest
# instead of repeating lint levels in this crate.
[lints]
workspace = true

[dependencies]
# Entries are sorted alphabetically. `workspace = true` entries are inherited
# from `[workspace.dependencies]` in the workspace root manifest.
async-trait = "0.1"
bytes = "1"
calamine = "0.34"
flate2 = "1"
http = "1"
quick-xml = { version = "0.37", features = ["serde"] }
rand = "0.8"
regex = "1"
# Default features are off; only `json` and `rustls-tls` are enabled.
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
url = "2"
webclaw-core = { workspace = true }
# NOTE(review): referenced by path while webclaw-core is inherited from the
# workspace; confirm whether this should also be a workspace dependency.
webclaw-pdf = { path = "../webclaw-pdf" }
# The impersonating HTTP client named in the package description. Both wreq
# crates are currently pinned to release-candidate version requirements.
wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
wreq-util = "3.0.0-rc.10"
zip = "2"

# Needed only when building this crate's tests, examples and benchmarks;
# not propagated to crates that depend on webclaw-fetch.
[dev-dependencies]
tempfile = "3"