mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-09 22:35:12 +02:00
Compare commits
8 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d0d7b835f2 | ||
|
|
6519ac2a8b | ||
|
|
14ded4b99e | ||
|
|
72a451cfb6 | ||
|
|
17fce81a95 | ||
|
|
84a0f9774d | ||
|
|
519dfb7864 | ||
|
|
985a90b083 |
10 changed files with 589 additions and 132 deletions
BIN
.github/banner.png
vendored
BIN
.github/banner.png
vendored
Binary file not shown.
|
Before Width: | Height: | Size: 44 KiB After Width: | Height: | Size: 48 KiB |
15
CHANGELOG.md
15
CHANGELOG.md
|
|
@ -3,6 +3,21 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.6.7] — 2026-06-09
|
||||
|
||||
### Changed
|
||||
- Updated the HTTP/TLS engine (wreq 6.0.0-rc.29, wreq-util 3.0.0-rc.12). This pulls in upstream robustness fixes: no more panic on responses with non-UTF8 header values, a fix for short reads when decoding large compressed bodies, and the TCP nodelay setting is restored. Browser TLS fingerprints are unchanged.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.6] — 2026-06-09
|
||||
|
||||
### Added
|
||||
- Slow fetches now print a progress line to stderr every 10 seconds (`# webclaw: still fetching <url> (Ns)`) so a long request no longer looks like the CLI hung. Fast fetches stay silent and stdout is untouched.
|
||||
- New `--url-encoded` flag plus a warning when a URL looks like the shell split it on `&` or `?`. The warning suggests quoting the URL; pass `--url-encoded` to silence it when the URL is intentional.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.5] — 2026-06-04
|
||||
|
||||
### Changed
|
||||
|
|
|
|||
221
Cargo.lock
generated
221
Cargo.lock
generated
|
|
@ -28,18 +28,6 @@ dependencies = [
|
|||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.4"
|
||||
|
|
@ -64,6 +52,12 @@ dependencies = [
|
|||
"alloc-no-stdlib",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "allocator-api2"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
|
|
@ -272,9 +266,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.11.0"
|
||||
version = "2.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
|
||||
checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8"
|
||||
|
||||
[[package]]
|
||||
name = "block-buffer"
|
||||
|
|
@ -285,31 +279,6 @@ dependencies = [
|
|||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boring-sys2"
|
||||
version = "5.0.0-alpha.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "455d79965f5155dcc88a7abce112c3590883889131b799beda10bf9a813ed669"
|
||||
dependencies = [
|
||||
"bindgen",
|
||||
"cmake",
|
||||
"fs_extra",
|
||||
"fslock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boring2"
|
||||
version = "5.0.0-alpha.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "183ccc3854411c035410dcdbffafca62084f3a6c33f013c77e83c025d2a08a28"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"boring-sys2",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"openssl-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "8.0.2"
|
||||
|
|
@ -331,6 +300,31 @@ dependencies = [
|
|||
"alloc-stdlib",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "btls"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c5e60b8c8d282c86360cab651ded04ab0335a7b5390c8d34145cbeab8cacf5f"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"btls-sys",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"openssl-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "btls-sys"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b1b8638a2e1c38a5ae4efa90ae57e643baec35a30d03fc5b399b893adc4954b"
|
||||
dependencies = [
|
||||
"bindgen",
|
||||
"cmake",
|
||||
"fs_extra",
|
||||
"fslock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.20.2"
|
||||
|
|
@ -865,6 +859,12 @@ version = "0.1.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.5.0"
|
||||
|
|
@ -1089,19 +1089,13 @@ version = "0.3.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
|
||||
dependencies = [
|
||||
"foldhash",
|
||||
"foldhash 0.1.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1110,6 +1104,17 @@ version = "0.16.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.17.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
|
||||
dependencies = [
|
||||
"allocator-api2",
|
||||
"equivalent",
|
||||
"foldhash 0.2.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
|
|
@ -1172,9 +1177,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "http2"
|
||||
version = "0.5.15"
|
||||
version = "0.5.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c45c6490693ee8a8d0d95fdbdf76fead9fb87548f7894137259a7c6d22821948"
|
||||
checksum = "569ef7a780e853c4e1768f58a3c8168193b82cdcbab66638a0b1c6583ec5995e"
|
||||
dependencies = [
|
||||
"atomic-waker",
|
||||
"bytes",
|
||||
|
|
@ -1183,7 +1188,6 @@ dependencies = [
|
|||
"futures-sink",
|
||||
"http",
|
||||
"indexmap",
|
||||
"parking_lot",
|
||||
"slab",
|
||||
"smallvec",
|
||||
"tokio",
|
||||
|
|
@ -1495,9 +1499,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
|
|||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.183"
|
||||
version = "0.2.186"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
|
||||
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
||||
|
||||
[[package]]
|
||||
name = "libloading"
|
||||
|
|
@ -1563,6 +1567,15 @@ dependencies = [
|
|||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a860605968fce16869fd239cf4237a82f3ac470723415db603b0e8b6c8d4fb9"
|
||||
dependencies = [
|
||||
"hashbrown 0.17.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru-slab"
|
||||
version = "0.1.2"
|
||||
|
|
@ -2375,17 +2388,6 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schnellru"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "356285bbf17bea63d9e52e96bd18f039672ac92b55b8cb997d6162a2a37d1649"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"cfg-if",
|
||||
"hashbrown 0.13.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
|
|
@ -2779,9 +2781,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
|||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.50.0"
|
||||
version = "1.52.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d"
|
||||
checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"libc",
|
||||
|
|
@ -2795,20 +2797,20 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-boring2"
|
||||
version = "5.0.0-alpha.13"
|
||||
name = "tokio-btls"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f81df1210d791f31d72d840de8fbd80b9c3cb324956523048b1413e2bd55756"
|
||||
checksum = "2e1fd638ec35427faf3b8f412e0fdd6fae76591d79dba40f38fa667d22bc44dd"
|
||||
dependencies = [
|
||||
"boring2",
|
||||
"btls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-macros"
|
||||
version = "2.6.1"
|
||||
version = "2.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
|
||||
checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
|
@ -3219,7 +3221,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.6.5"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3240,7 +3242,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.6.5"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3258,11 +3260,12 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.6.5"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"calamine",
|
||||
"futures-util",
|
||||
"http",
|
||||
"quick-xml 0.37.5",
|
||||
"rand 0.8.5",
|
||||
|
|
@ -3284,7 +3287,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.6.5"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3297,7 +3300,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.6.5"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3317,7 +3320,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.6.5"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
@ -3326,7 +3329,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-server"
|
||||
version = "0.6.5"
|
||||
version = "0.6.7"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
|
|
@ -3347,9 +3350,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webpki-root-certs"
|
||||
version = "1.0.6"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
|
||||
checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
|
||||
dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
|
@ -3696,17 +3699,14 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "wreq"
|
||||
version = "6.0.0-rc.28"
|
||||
version = "6.0.0-rc.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f79937f6c4df65b3f6f78715b9de2977afe9ee3b3436483c7949a24511e25935"
|
||||
checksum = "3f0eba5f5814a94e5f1a99156f187133464e525b66bdbc69a9627d46530af2e1"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"boring2",
|
||||
"brotli",
|
||||
"btls",
|
||||
"btls-sys",
|
||||
"bytes",
|
||||
"cookie",
|
||||
"flate2",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http-body",
|
||||
|
|
@ -3715,29 +3715,64 @@ dependencies = [
|
|||
"httparse",
|
||||
"ipnet",
|
||||
"libc",
|
||||
"lru",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"schnellru",
|
||||
"smallvec",
|
||||
"socket2",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-boring2",
|
||||
"tokio-btls",
|
||||
"tokio-util",
|
||||
"tower",
|
||||
"tower-http",
|
||||
"url",
|
||||
"want",
|
||||
"webpki-root-certs",
|
||||
"zstd",
|
||||
"wreq-proto",
|
||||
"wreq-rt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wreq-proto"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a43942f024bb303f1042c9aa3c87fa1d9149f507c65db6e5220a11ccdb207387"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http-body",
|
||||
"http2",
|
||||
"httparse",
|
||||
"pin-project-lite",
|
||||
"smallvec",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"want",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wreq-rt"
|
||||
version = "0.2.2-rc.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99e9bce67a3fa3dd3f1503f066d86661c9caf399a763d3bd184da7afaf886c8b"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"wreq-proto",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wreq-util"
|
||||
version = "3.0.0-rc.10"
|
||||
version = "3.0.0-rc.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c6bbe24d28beb9ceb58b514bd6a613c759d3b706f768b9d2950d5d35b543c04"
|
||||
checksum = "baa5d2ab72139256916ca352a3d05c53d74e1dd360052eb5ba7691033c417c65"
|
||||
dependencies = [
|
||||
"brotli",
|
||||
"flate2",
|
||||
"typed-builder",
|
||||
"wreq",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.6.5"
|
||||
version = "0.6.7"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -166,6 +166,14 @@ struct Cli {
|
|||
#[arg(long)]
|
||||
urls_file: Option<String>,
|
||||
|
||||
/// Assert that the URL has been handled for shell escaping. Suppresses
|
||||
/// the URL-truncation stderr warning. Use when the URL is intentionally
|
||||
/// passed with an empty/keyless query (e.g. legacy CGI) or when a
|
||||
/// trailing `&` is genuinely part of the URL. The URL is fetched as-is
|
||||
/// (no extra normalization beyond the standard scheme prepend).
|
||||
#[arg(long)]
|
||||
url_encoded: bool,
|
||||
|
||||
/// Output format (markdown, json, text, llm, html)
|
||||
#[arg(short, long, default_value = "markdown")]
|
||||
format: OutputFormat,
|
||||
|
|
@ -591,6 +599,31 @@ fn normalize_url(url: &str) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
/// M14: detect URLs that look truncated by the shell (e.g. an unquoted URL
/// that the shell split on `&` or `?`). Returns `true` when:
///   - the trimmed URL ends with `&` (a trailing param separator suggests
///     the next param was lopped off), OR
///   - the URL has a query component (a `?` that appears before any `#`)
///     containing no `=` (a query with bare keys is rare; usually a real
///     query has at least one `=`).
///
/// A `?` that only appears inside the fragment (after the first `#`, as in
/// hash-routed URLs like `https://host/#/view?tab`) is not a query
/// delimiter and is ignored.
///
/// Informational only — caller decides whether to warn / abort. This is a
/// heuristic; legitimate URLs with bare-key queries will trigger a false
/// positive (suppressible via `--url-encoded`).
fn looks_truncated(url: &str) -> bool {
    let trimmed = url.trim();
    if trimmed.ends_with('&') {
        return true;
    }
    // The fragment starts at the first `#`. Drop it *before* searching for
    // `?`, so a question mark inside the fragment is not mistaken for the
    // start of a query and `?foo#section` is still judged on `foo` alone.
    let without_fragment = trimmed.split_once('#').map_or(trimmed, |(head, _)| head);
    without_fragment
        .split_once('?')
        .is_some_and(|(_, query)| !query.contains('='))
}
|
||||
|
||||
/// Derive a filename from a URL for `--output-dir`.
|
||||
///
|
||||
/// Strips the scheme/host, maps the path to a filesystem path, and appends
|
||||
|
|
@ -826,6 +859,14 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
.urls
|
||||
.first()
|
||||
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
|
||||
// M14: warn when the URL looks like the shell split it on `&` or `?`.
|
||||
// Informational only — fetch still proceeds. Suppressed by --url-encoded,
|
||||
// which asserts the caller has handled escaping intentionally.
|
||||
if !cli.url_encoded && looks_truncated(raw_url) {
|
||||
eprintln!(
|
||||
"# webclaw: warning: URL looks truncated (ends with '&' or '?'); did the shell split it? Quote the URL or use --url-encoded."
|
||||
);
|
||||
}
|
||||
let url = normalize_url(raw_url);
|
||||
let url = url.as_str();
|
||||
|
||||
|
|
@ -859,8 +900,11 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
let options = build_extraction_options(cli);
|
||||
let result = client
|
||||
.fetch_and_extract_with_options(url, &options)
|
||||
// M13: wrap with periodic stderr progress emitter. Fast fetches see
|
||||
// zero emissions (timer never fires in <10s); slow fetches get a
|
||||
// line every 10s of elapsed time so the CLI doesn't appear hung.
|
||||
let fetch_fut = client.fetch_and_extract_with_options(url, &options);
|
||||
let result = webclaw_fetch::with_progress(url, fetch_fut)
|
||||
.await
|
||||
.map_err(|e| format!("fetch error: {e}"))?;
|
||||
|
||||
|
|
@ -2879,6 +2923,61 @@ mod tests {
|
|||
let _ = std::fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
// M14: URL truncation heuristic tests.
|
||||
#[test]
fn looks_truncated_fires_on_trailing_ampersand() {
    // A dangling `&` is the classic shell-split shape: the parameter that
    // followed the separator never reached us.
    let dangling = [
        "https://example.com/?a=1&",
        "https://example.com/path?key=val&",
    ];
    for url in dangling {
        assert!(looks_truncated(url), "{url}");
    }
}
|
||||
|
||||
#[test]
fn looks_truncated_fires_on_query_with_no_equals() {
    let suspicious = [
        // Bare key: the `=value` part was probably eaten by the shell.
        "https://example.com/?foo",
        // Empty query: the whole key/value pair is gone.
        "https://example.com/?",
        // A trailing fragment must not hide the bare-key query.
        "https://example.com/?foo#section",
    ];
    for url in suspicious {
        assert!(looks_truncated(url), "{url}");
    }
}
|
||||
|
||||
#[test]
fn looks_truncated_silent_on_clean_url() {
    let clean = [
        // No query at all.
        "https://example.com/",
        "https://example.com/path/to/page",
        // Queries carrying at least one `=`.
        "https://example.com/?a=1",
        "https://example.com/?a=1&b=2",
        "https://example.com/?a=1&b=2&c=hello%20world",
        // Fragment only, no query.
        "https://example.com/page#section",
    ];
    for url in clean {
        assert!(!looks_truncated(url), "{url}");
    }
}
|
||||
|
||||
#[test]
fn looks_truncated_silent_with_url_encoded_assertion_modeled_via_skip() {
    // `--url-encoded` is honoured at the call site, where the `eprintln!` is
    // guarded by `!cli.url_encoded`. This test mirrors that guard: the
    // closure is the same boolean expression, parameterised on the flag.
    let url = "https://example.com/?a=1&";
    let would_warn = |url_encoded_flag: bool| !url_encoded_flag && looks_truncated(url);

    // Flag set: the warning branch must never be taken.
    assert!(
        !would_warn(true),
        "--url-encoded must suppress the warning even on URL ending with &"
    );

    // Sanity check: flag unset, the very same URL does warn.
    assert!(
        would_warn(false),
        "without --url-encoded, the warning should fire on URL ending with &"
    );
}
|
||||
|
||||
#[test]
|
||||
fn research_slug_truncation_is_char_safe() {
|
||||
// Multibyte query: byte-slicing at 50 would panic mid-codepoint.
|
||||
|
|
|
|||
|
|
@ -14,13 +14,16 @@ tracing = { workspace = true }
|
|||
tokio = { workspace = true }
|
||||
async-trait = "0.1"
|
||||
# Pinned to exact pre-release versions: wreq/wreq-util are release candidates
|
||||
# with no semver stability between rc.N builds (rc.29 broke the TLS + Response
|
||||
# API). An exact pin keeps `cargo build`, `cargo install` (which ignores
|
||||
# Cargo.lock), and the release workflow all on the version that compiles.
|
||||
wreq = { version = "=6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
|
||||
wreq-util = "=3.0.0-rc.10"
|
||||
# with no semver stability between rc.N builds. An exact pin keeps `cargo build`,
|
||||
# `cargo install` (which ignores Cargo.lock), and the release workflow all on the
|
||||
# version that compiles.
|
||||
wreq = { version = "=6.0.0-rc.29", features = ["cookies", "gzip", "brotli", "zstd", "deflate", "stream"] }
|
||||
wreq-util = "=3.0.0-rc.12"
|
||||
http = "1"
|
||||
bytes = "1"
|
||||
# Stream adapter for `wreq::Response::bytes_stream()` (wreq 6.0.0-rc.29 dropped
|
||||
# `Response::chunk()`); used to buffer bodies under the running size ceiling.
|
||||
futures-util = "0.3"
|
||||
url = "2"
|
||||
rand = "0.8"
|
||||
quick-xml = { version = "0.37", features = ["serde"] }
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ use std::hash::{Hash, Hasher};
|
|||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use futures_util::StreamExt;
|
||||
use rand::seq::SliceRandom;
|
||||
use tokio::sync::Semaphore;
|
||||
use tracing::{debug, instrument, warn};
|
||||
|
|
@ -118,7 +119,7 @@ impl Response {
|
|||
/// negotiated), so a tiny compressed payload that inflates to
|
||||
/// gigabytes is aborted as soon as the accumulated size crosses the
|
||||
/// cap — it never gets fully buffered in memory.
|
||||
async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
|
||||
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
|
||||
if let Some(len) = resp.content_length()
|
||||
&& len > MAX_BODY_BYTES
|
||||
{
|
||||
|
|
@ -130,12 +131,13 @@ impl Response {
|
|||
let url = resp.uri().to_string();
|
||||
let headers = resp.headers().clone();
|
||||
|
||||
// wreq 6.0.0-rc.29 dropped `Response::chunk()`. Stream post-decompression
|
||||
// bytes via `bytes_stream()` and keep enforcing the running ceiling so a
|
||||
// compression bomb is aborted before it is fully buffered in memory.
|
||||
let mut buf = bytes::BytesMut::new();
|
||||
while let Some(chunk) = resp
|
||||
.chunk()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?
|
||||
{
|
||||
let mut stream = resp.bytes_stream();
|
||||
while let Some(chunk) = stream.next().await {
|
||||
let chunk = chunk.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
check_body_ceiling(buf.len(), chunk.len())?;
|
||||
buf.extend_from_slice(&chunk);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ pub mod extractors;
|
|||
pub mod fetcher;
|
||||
pub mod linkedin;
|
||||
pub mod locale;
|
||||
pub mod progress;
|
||||
pub mod proxy;
|
||||
pub mod reddit;
|
||||
pub mod sitemap;
|
||||
|
|
@ -24,6 +25,7 @@ pub use error::FetchError;
|
|||
pub use fetcher::Fetcher;
|
||||
pub use http::HeaderMap;
|
||||
pub use locale::{accept_language_for_tld, accept_language_for_url};
|
||||
pub use progress::{PROGRESS_INTERVAL, with_progress};
|
||||
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
||||
pub use sitemap::SitemapEntry;
|
||||
pub use webclaw_pdf::PdfMode;
|
||||
|
|
|
|||
293
crates/webclaw-fetch/src/progress.rs
Normal file
293
crates/webclaw-fetch/src/progress.rs
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
//! Periodic stderr progress line emitter for slow fetches (M13).
|
||||
//!
|
||||
//! Wraps any async fetch future with a `tokio::select!` against a
|
||||
//! `tokio::time::interval`. Every `PROGRESS_INTERVAL` (default 10s) of
|
||||
//! elapsed time, emits one line to STDERR of the form:
|
||||
//!
|
||||
//! ```text
|
||||
//! # webclaw: still fetching <URL> (Ns)
|
||||
//! ```
|
||||
//!
|
||||
//! Fetches completing in under `PROGRESS_INTERVAL` emit zero lines (the
|
||||
//! timer never fires). Stdout is untouched.
|
||||
//!
|
||||
//! The URL is truncated to at most 80 chars (head + `...` + tail) so
|
||||
//! pathological query strings don't blow up the stderr line. Truncation
|
||||
//! is char-boundary safe (operates on `chars`, not bytes).
|
||||
|
||||
use std::future::Future;
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::time::{Instant, MissedTickBehavior, interval};
|
||||
|
||||
/// Default progress emission interval. The first tick fires at +10s
|
||||
/// elapsed; subsequent ticks at +20s, +30s, etc.
|
||||
pub const PROGRESS_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Maximum URL length in the progress line. Longer URLs are truncated
|
||||
/// `head...tail` style.
|
||||
const MAX_URL_LEN: usize = 80;
|
||||
|
||||
/// Wrap a fetch future with the default 10s progress emitter. Writes
|
||||
/// progress lines to STDERR via `eprintln!`. Returns the inner future's
|
||||
/// result unchanged.
|
||||
pub async fn with_progress<F, T>(url: &str, future: F) -> T
|
||||
where
|
||||
F: Future<Output = T>,
|
||||
{
|
||||
with_progress_writer(url, future, PROGRESS_INTERVAL, |s| eprintln!("{s}")).await
|
||||
}
|
||||
|
||||
/// Test-friendly variant of [`with_progress`]: caller supplies the tick
|
||||
/// interval (so tests can use a 50ms period instead of 10s) and a
|
||||
/// writer closure (so tests can capture emitted lines without touching
|
||||
/// real stderr).
|
||||
///
|
||||
/// Production code uses [`with_progress`] which delegates here with
|
||||
/// [`PROGRESS_INTERVAL`] and an `eprintln!` writer.
|
||||
pub async fn with_progress_writer<F, T, W>(
|
||||
url: &str,
|
||||
future: F,
|
||||
period: Duration,
|
||||
mut writer: W,
|
||||
) -> T
|
||||
where
|
||||
F: Future<Output = T>,
|
||||
W: FnMut(String),
|
||||
{
|
||||
let start = Instant::now();
|
||||
let mut ticker = interval(period);
|
||||
// First tick of `tokio::time::interval(period)` fires *immediately*
|
||||
// (at construction time). We don't want a t=0 emit — consume that
|
||||
// first tick before entering the select loop. Subsequent ticks fire
|
||||
// at `start + period`, `start + 2*period`, ...
|
||||
ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
|
||||
ticker.tick().await;
|
||||
|
||||
tokio::pin!(future);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Bias toward the future — if both are ready (rare), prefer
|
||||
// returning the result over emitting a final tick.
|
||||
biased;
|
||||
result = &mut future => {
|
||||
return result;
|
||||
}
|
||||
_ = ticker.tick() => {
|
||||
let elapsed = start.elapsed();
|
||||
writer(format_progress_line(url, elapsed));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the progress line: `# webclaw: still fetching <URL> (Ns)`.
|
||||
/// URL is truncated via [`truncate_url`] to [`MAX_URL_LEN`] chars.
|
||||
/// Elapsed is rounded to whole seconds (10, 20, 30, ...).
|
||||
pub(crate) fn format_progress_line(url: &str, elapsed: Duration) -> String {
|
||||
let truncated = truncate_url(url, MAX_URL_LEN);
|
||||
let secs = elapsed.as_secs();
|
||||
format!("# webclaw: still fetching {truncated} ({secs}s)")
|
||||
}
|
||||
|
||||
/// Truncate `url` to at most `max` chars, using `head...tail` shape
/// when truncation is needed. Char-boundary safe (operates on `chars`).
///
/// * URLs of `max` chars or fewer are returned unchanged.
/// * Otherwise 3 chars are reserved for the `...` marker, up to 17 chars
///   are kept from the end (query side) and the rest of the budget is kept
///   from the start (scheme/host/path side). With `max = 80` that is
///   60 head chars + `...` + 17 tail chars.
/// * When `max` is too small for the full 17-char tail, the tail shrinks so
///   the result never exceeds `max`; when `max` cannot even hold the marker
///   plus one char (`max <= 3`), the URL is hard-cut to its first `max`
///   chars.
pub(crate) fn truncate_url(url: &str, max: usize) -> String {
    /// Length of the `...` marker inserted between head and tail.
    const MARKER_CHARS: usize = 3;
    /// Preferred number of chars preserved from the end of the URL.
    const TAIL_CHARS: usize = 17;

    let total_chars = url.chars().count();
    if total_chars <= max {
        return url.to_string();
    }
    if max <= MARKER_CHARS {
        // No room for marker + content: a plain prefix is the best we can do.
        return url.chars().take(max).collect();
    }

    // Budget left once the marker is accounted for. The tail is capped by
    // that budget so head + marker + tail is exactly `max` chars.
    let avail = max - MARKER_CHARS;
    let tail_chars = TAIL_CHARS.min(avail);
    let head_chars = avail - tail_chars;

    let head: String = url.chars().take(head_chars).collect();
    // `total_chars > max > tail_chars`, so the subtraction cannot underflow.
    let tail: String = url.chars().skip(total_chars - tail_chars).collect();
    format!("{head}...{tail}")
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// Collect emitted lines into a `Vec<String>` via a captured writer.
|
||||
fn capture() -> (Arc<Mutex<Vec<String>>>, impl FnMut(String)) {
|
||||
let sink: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
|
||||
let sink_clone = Arc::clone(&sink);
|
||||
let writer = move |s: String| {
|
||||
sink_clone.lock().unwrap().push(s);
|
||||
};
|
||||
(sink, writer)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_emits_after_interval_elapsed() {
|
||||
let (sink, writer) = capture();
|
||||
// 250ms future, 50ms interval — expect ~4-5 ticks before resolution.
|
||||
let fut = tokio::time::sleep(Duration::from_millis(250));
|
||||
with_progress_writer(
|
||||
"https://example.com/slow",
|
||||
async {
|
||||
fut.await;
|
||||
42_i32
|
||||
},
|
||||
Duration::from_millis(50),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
let lines = sink.lock().unwrap();
|
||||
assert!(
|
||||
!lines.is_empty(),
|
||||
"expected >=1 progress line; got {} ({:?})",
|
||||
lines.len(),
|
||||
*lines
|
||||
);
|
||||
for line in lines.iter() {
|
||||
assert!(
|
||||
line.starts_with("# webclaw: still fetching"),
|
||||
"line shape wrong: {line:?}"
|
||||
);
|
||||
assert!(
|
||||
line.contains("https://example.com/slow"),
|
||||
"url missing from line: {line:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_silent_on_fast_future() {
|
||||
let (sink, writer) = capture();
|
||||
// 10ms future, 1s interval — zero ticks expected.
|
||||
let result = with_progress_writer(
|
||||
"https://example.com/fast",
|
||||
async {
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
"done"
|
||||
},
|
||||
Duration::from_secs(1),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(result, "done");
|
||||
let lines = sink.lock().unwrap();
|
||||
assert_eq!(
|
||||
lines.len(),
|
||||
0,
|
||||
"expected 0 progress lines on fast future; got {:?}",
|
||||
*lines
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_line_includes_url() {
|
||||
let (sink, writer) = capture();
|
||||
let target_url = "https://news.ycombinator.com/item?id=12345";
|
||||
with_progress_writer(
|
||||
target_url,
|
||||
async {
|
||||
tokio::time::sleep(Duration::from_millis(150)).await;
|
||||
},
|
||||
Duration::from_millis(50),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
let lines = sink.lock().unwrap();
|
||||
assert!(!lines.is_empty(), "expected progress lines");
|
||||
assert!(
|
||||
lines.iter().all(|l| l.contains(target_url)),
|
||||
"every line should contain the URL: {:?}",
|
||||
*lines
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_returns_inner_result_ok() {
|
||||
let (_sink, writer) = capture();
|
||||
let r: Result<i32, String> = with_progress_writer(
|
||||
"https://example.com/",
|
||||
async { Ok::<i32, String>(7) },
|
||||
Duration::from_secs(1),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(r, Ok(7));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_progress_propagates_error() {
|
||||
let (_sink, writer) = capture();
|
||||
let r: Result<i32, String> = with_progress_writer(
|
||||
"https://example.com/",
|
||||
async { Err::<i32, String>("boom".to_string()) },
|
||||
Duration::from_secs(1),
|
||||
writer,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(r, Err("boom".to_string()));
|
||||
}
|
||||
|
||||
/// A URL shorter than the limit must be returned as-is.
#[test]
fn test_truncate_url_short_passthrough() {
    let short = "https://example.com/";
    let shown = truncate_url(short, 80);
    assert_eq!(shown, short);
}
|
||||
|
||||
/// An over-long URL must be cut to at most 80 characters, keep its leading
/// portion, and carry a `...` marker where text was dropped.
#[test]
fn test_truncate_url_long_head_dots_tail() {
    let long_url = "https://www.example.com/very/long/path/segments/with/lots/of/text/and/then?q=some_long_query_string_value_here&other=more&another=thing";
    let truncated = truncate_url(long_url, 80);

    // Length is measured in characters, not bytes.
    let within_limit = truncated.chars().count() <= 80;
    assert!(
        within_limit,
        "truncated length {} > 80: {truncated:?}",
        truncated.chars().count()
    );

    let has_marker = truncated.contains("...");
    assert!(
        has_marker,
        "expected '...' marker in truncated url: {truncated:?}"
    );

    let keeps_head = truncated.starts_with("https://www.example.com/");
    assert!(
        keeps_head,
        "truncated should start with the URL head: {truncated:?}"
    );
}
|
||||
|
||||
/// Truncating a URL made mostly of multi-byte characters must not panic and
/// must still honour the length limit.
#[test]
fn test_truncate_url_unicode_safe() {
    // Cyrillic URL longer than 80 chars (and far longer in bytes) — a
    // byte-indexed cut would panic on a mid-codepoint split.
    let url =
        "https://example.com/путь/к/очень/длинной/странице/с/большим/количеством/кириллицы/тут";
    // Guard the fixture itself so the truncation path is really exercised.
    assert!(
        url.chars().count() > 80,
        "fixture must exceed the limit; has {} chars",
        url.chars().count()
    );
    let truncated = truncate_url(url, 80);
    // Reaching this point already proves no mid-codepoint panic occurred.
    // The previous checks (`is_char_boundary(len)` and re-collecting `chars()`)
    // hold for every valid string, so assert the observable contract instead.
    assert!(
        truncated.chars().count() <= 80,
        "truncated length {} > 80: {truncated:?}",
        truncated.chars().count()
    );
    assert!(
        truncated.contains("..."),
        "expected '...' marker in truncated url: {truncated:?}"
    );
}
|
||||
|
||||
/// Pins the exact text of a progress line for a 10-second elapsed time.
#[test]
fn test_format_progress_line_shape() {
    let elapsed = Duration::from_secs(10);
    let rendered = format_progress_line("https://example.com/", elapsed);
    let expected = "# webclaw: still fetching https://example.com/ (10s)";
    assert_eq!(rendered, expected);
}
|
||||
|
||||
/// Elapsed time must be rendered as whole seconds only.
#[test]
fn test_format_progress_line_seconds_only() {
    // The fractional part is dropped, not rounded: 9.5s prints as `9s`,
    // never `9.5s` or `10s`.
    let line = format_progress_line("https://x/", Duration::from_millis(9_500));
    assert!(
        line.ends_with("(9s)"),
        "line should end with `(9s)`: {line:?}"
    );

    // Sub-second elapsed renders as 0s. (In practice the first tick fires at
    // +PROGRESS_INTERVAL so this is mostly a defensive shape assertion.)
    let sub_second = format_progress_line("https://x/", Duration::from_millis(500));
    assert!(
        sub_second.ends_with("(0s)"),
        "sub-second elapsed should end with `(0s)`: {sub_second:?}"
    );
}
|
||||
}
|
||||
|
|
@ -10,15 +10,24 @@ use std::{borrow::Cow, io, time::Duration};
|
|||
use wreq::http2::{
|
||||
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
|
||||
};
|
||||
use wreq::tls::{
|
||||
AlpnProtocol, AlpsProtocol, CertificateCompressionAlgorithm, ExtensionType, TlsOptions,
|
||||
TlsVersion,
|
||||
};
|
||||
use wreq::{Client, Emulation};
|
||||
use wreq::tls::compress::CertificateCompressor;
|
||||
use wreq::tls::{AlpnProtocol, AlpsProtocol, ExtensionType, TlsOptions, TlsVersion};
|
||||
use wreq::{Client, Emulation, Group, IntoEmulation};
|
||||
use wreq_util::emulate::compress::{BrotliCompressor, ZlibCompressor};
|
||||
|
||||
use crate::browser::BrowserVariant;
|
||||
use crate::error::FetchError;
|
||||
|
||||
// Certificate-compression advertisement per profile. wreq 6.0.0-rc.29 replaced
|
||||
// the `CertificateCompressionAlgorithm` enum argument with `&dyn
|
||||
// CertificateCompressor` trait objects; wreq-util ships the concrete zlib/brotli
|
||||
// implementations. The advertised set (and order) is a TLS fingerprint signal,
|
||||
// so these mirror the previous enum lists exactly.
|
||||
static CHROME_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = &[&BrotliCompressor];
|
||||
static FIREFOX_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] =
|
||||
&[&ZlibCompressor, &BrotliCompressor];
|
||||
static SAFARI_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = &[&ZlibCompressor];
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct PublicDnsResolver;
|
||||
|
||||
|
|
@ -119,14 +128,14 @@ fn chrome_extensions() -> Vec<ExtensionType> {
|
|||
ExtensionType::PSK_KEY_EXCHANGE_MODES, // 45
|
||||
ExtensionType::EC_POINT_FORMATS, // 11
|
||||
ExtensionType::CERT_COMPRESSION, // 27
|
||||
ExtensionType::APPLICATION_SETTINGS_NEW, // 17613 (new codepoint, matches alps_use_new_codepoint)
|
||||
ExtensionType::SUPPORTED_VERSIONS, // 43
|
||||
ExtensionType::SIGNATURE_ALGORITHMS, // 13
|
||||
ExtensionType::SERVER_NAME, // 0
|
||||
ExtensionType::APPLICATION_SETTINGS, // 17613 (new codepoint, matches alps_use_new_codepoint)
|
||||
ExtensionType::SUPPORTED_VERSIONS, // 43
|
||||
ExtensionType::SIGNATURE_ALGORITHMS, // 13
|
||||
ExtensionType::SERVER_NAME, // 0
|
||||
ExtensionType::APPLICATION_LAYER_PROTOCOL_NEGOTIATION, // 16
|
||||
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
|
||||
ExtensionType::RENEGOTIATE, // 65281
|
||||
ExtensionType::EXTENDED_MASTER_SECRET, // 23
|
||||
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
|
||||
ExtensionType::RENEGOTIATE, // 65281
|
||||
ExtensionType::EXTENDED_MASTER_SECRET, // 23
|
||||
]
|
||||
}
|
||||
|
||||
|
|
@ -287,7 +296,7 @@ fn chrome_tls() -> TlsOptions {
|
|||
.alps_protocols([AlpsProtocol::HTTP3, AlpsProtocol::HTTP2])
|
||||
.alps_use_new_codepoint(true)
|
||||
.aes_hw_override(true)
|
||||
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
|
||||
.certificate_compressors(CHROME_CERT_COMPRESSORS)
|
||||
.build()
|
||||
}
|
||||
|
||||
|
|
@ -304,10 +313,7 @@ fn firefox_tls() -> TlsOptions {
|
|||
.pre_shared_key(true)
|
||||
.enable_ocsp_stapling(true)
|
||||
.enable_signed_cert_timestamps(true)
|
||||
.certificate_compression_algorithms(&[
|
||||
CertificateCompressionAlgorithm::ZLIB,
|
||||
CertificateCompressionAlgorithm::BROTLI,
|
||||
])
|
||||
.certificate_compressors(FIREFOX_CERT_COMPRESSORS)
|
||||
.build()
|
||||
}
|
||||
|
||||
|
|
@ -324,7 +330,7 @@ fn safari_tls() -> TlsOptions {
|
|||
.pre_shared_key(false)
|
||||
.enable_ocsp_stapling(true)
|
||||
.enable_signed_cert_timestamps(true)
|
||||
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB])
|
||||
.certificate_compressors(SAFARI_CERT_COMPRESSORS)
|
||||
.build()
|
||||
}
|
||||
|
||||
|
|
@ -345,21 +351,23 @@ fn safari_tls() -> TlsOptions {
|
|||
/// `priority: u=0, i`, zstd), replace with the real iOS 26 set.
|
||||
/// 4. `accept-language` preserved from config.extra_headers for locale.
|
||||
fn safari_ios_emulation() -> wreq::Emulation {
|
||||
use wreq::EmulationFactory;
|
||||
let mut em = wreq_util::Emulation::SafariIos26.emulation();
|
||||
// wreq 6.0.0-rc.29 exposes the `Emulation` fields directly (no `*_mut()`
|
||||
// accessors) and wreq-util 3.0.0-rc.12 renamed the enum to `Profile` with
|
||||
// `IntoEmulation::into_emulation` replacing `EmulationFactory::emulation`.
|
||||
let mut em = wreq_util::Profile::SafariIos26.into_emulation();
|
||||
|
||||
if let Some(tls) = em.tls_options_mut().as_mut() {
|
||||
if let Some(tls) = em.tls_options.as_mut() {
|
||||
tls.extension_permutation = Some(Cow::Owned(safari_ios_extensions()));
|
||||
}
|
||||
|
||||
// Only override the priority flag — keep wreq-util's SETTINGS, WINDOW_UPDATE,
|
||||
// and pseudo-order intact. Replacing the whole Http2Options resets SETTINGS
|
||||
// to defaults, which sends only INITIAL_WINDOW_SIZE and fails DataDome.
|
||||
if let Some(h2) = em.http2_options_mut().as_mut() {
|
||||
if let Some(h2) = em.http2_options.as_mut() {
|
||||
h2.headers_stream_dependency = Some(StreamDependency::new(StreamId::zero(), 255, true));
|
||||
}
|
||||
|
||||
let hm = em.headers_mut();
|
||||
let hm = &mut em.headers;
|
||||
hm.clear();
|
||||
for (k, v) in SAFARI_IOS_HEADERS {
|
||||
if let (Ok(n), Ok(val)) = (
|
||||
|
|
@ -508,12 +516,12 @@ pub fn build_client(
|
|||
.tls_options(tls)
|
||||
.http2_options(h2)
|
||||
.headers(build_headers(headers))
|
||||
.build()
|
||||
.build(Group::default())
|
||||
}
|
||||
};
|
||||
|
||||
// Append extra headers after profile defaults.
|
||||
let hm = emulation.headers_mut();
|
||||
let hm = &mut emulation.headers;
|
||||
for (k, v) in extra_headers {
|
||||
if let (Ok(n), Ok(val)) = (
|
||||
http::header::HeaderName::from_bytes(k.as_bytes()),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue