mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-07-02 04:08:08 +02:00
`map` falls back to a bounded same-origin crawl when a site has no sitemap or only a thin one, harvesting links from each fetched page (the rich source). This change also adds gzip (`.xml.gz`) sitemap support, deeper sitemap-index recursion and more fallback paths, and results that are uncapped by default with optional `--map-limit` / `--map-pages` limits, and it routes crawler logs to stderr so that `--map -f json` output stays machine-parseable.
35 lines
911 B
TOML
35 lines
911 B
TOML
[package]
name = "webclaw-fetch"
# version, edition, license and rust-version are not set here; each is
# inherited from the workspace root via `<key>.workspace = true`.
version.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
description = "HTTP client with browser TLS fingerprint impersonation via wreq"

[lints]
# Inherit the lint configuration from the workspace root instead of
# declaring lint levels per crate.
workspace = true

[dependencies]
# webclaw-* project crates.
webclaw-core = { workspace = true }
# NOTE(review): declared by relative path while webclaw-core is inherited
# from the workspace; confirm whether the difference is intentional.
webclaw-pdf = { path = "../webclaw-pdf" }
# Third-party crates whose version/features are inherited from the workspace.
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
# Third-party crates versioned in this manifest.
async-trait = "0.1"
bytes = "1"
calamine = "0.34"
flate2 = "1"
http = "1"
quick-xml = { version = "0.37", features = ["serde"] }
rand = "0.8"
regex = "1"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
url = "2"
# wreq and wreq-util are release-candidate (pre-release) versions.
wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
wreq-util = "3.0.0-rc.10"
zip = "2"

[dev-dependencies]
# Needed only when building tests, examples and benchmarks; not part of
# the crate's normal dependency graph.
tempfile = "3"