diff --git a/CHANGELOG.md b/CHANGELOG.md index 07019fd..fd98e5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,22 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.3.3] — 2026-04-01 + +### Changed +- **Replaced custom TLS stack with wreq**: migrated from webclaw-tls (patched rustls/h2/hyper/reqwest) to [wreq](https://github.com/0x676e67/wreq) by [@0x676e67](https://github.com/0x676e67). wreq uses BoringSSL for TLS and the [http2](https://github.com/0x676e67/http2) crate for HTTP/2 fingerprinting — both battle-tested with 60+ browser profiles. +- **Removed all `[patch.crates-io]` entries**: consumers no longer need to patch rustls, h2, hyper, hyper-util, or reqwest. Just depend on webclaw normally. +- **Browser profiles rebuilt on wreq's Emulation API**: Chrome 145, Firefox 135, Safari 18, Edge 145 with correct TLS options (cipher suites, curves, GREASE, ECH, PSK session resumption), HTTP/2 SETTINGS ordering, pseudo-header order, and header wire order. +- **Better TLS compatibility**: BoringSSL handles more server configurations than patched rustls (e.g. servers that previously returned IllegalParameter alerts). + +### Removed +- webclaw-tls dependency and all 5 forked crates (webclaw-rustls, webclaw-h2, webclaw-hyper, webclaw-hyper-util, webclaw-reqwest). + +### Acknowledgments +- TLS and HTTP/2 fingerprinting powered by [wreq](https://github.com/0x676e67/wreq) and [http2](https://github.com/0x676e67/http2) by [@0x676e67](https://github.com/0x676e67), who pioneered browser-grade HTTP/2 fingerprinting in Rust. + +--- + ## [0.3.2] — 2026-03-31 ### Added @@ -18,17 +34,15 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). - **Cookie warmup fallback**: when a fetch returns an Akamai challenge page, automatically visits the homepage first to collect `_abck`/`bm_sz` cookies, then retries the original URL. Enables extraction of Akamai-protected subpages (e.g. 
fansale ticket pages) without JS rendering. ### Changed -- Upgraded to webclaw-tls v0.1.2: fixed HTTP header wire order (accept/user-agent were in wrong positions) and added H2 PRIORITY flag in HEADERS frames. +- Fixed HTTP header wire order (accept/user-agent were in the wrong positions) and added H2 PRIORITY flag in HEADERS frames. - `FetchResult.headers` now uses `http::HeaderMap` instead of `HashMap` — avoids per-response allocation, preserves multi-value headers. ## [0.3.0] — 2026-03-29 ### Changed -- **Replaced primp with webclaw-tls**: entire TLS fingerprinting stack is now our own. Zero primp references remain. -- **Own TLS library**: [webclaw-tls](https://github.com/0xMassi/webclaw-tls) — patched rustls, h2, hyper, hyper-util, reqwest for browser-grade fingerprinting. -- **Perfect Chrome 146 fingerprint**: JA4 `t13d1517h2_8daaf6152771_b6f405a00624` + Akamai HTTP/2 hash match — the only library in any language to achieve this. -- **99% bypass rate**: 101/102 sites pass (up from ~85% with primp). +- **Replaced primp with webclaw-tls**: switched to a custom TLS fingerprinting stack. - **Browser profiles**: Chrome 146 (Win/Mac), Firefox 135+, Safari 18, Edge 146 — captured from real browsers. +- **HTTP/2 fingerprinting**: SETTINGS frame ordering and pseudo-header ordering based on concepts pioneered by [@0x676e67](https://github.com/0x676e67). ### Fixed - **HTTPS completely broken (#5)**: primp's forked rustls rejected valid certificates (UnknownIssuer on cross-signed chains like example.com). Fixed by using native OS root CAs alongside Mozilla bundle. 
diff --git a/Cargo.lock b/Cargo.lock index 60360e8..b78914d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -28,6 +28,18 @@ dependencies = [ "cpufeatures", ] +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -170,35 +182,30 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "aws-lc-rs" -version = "1.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" -dependencies = [ - "aws-lc-sys", - "untrusted 0.7.1", - "zeroize", -] - -[[package]] -name = "aws-lc-sys" -version = "0.39.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399" -dependencies = [ - "cc", - "cmake", - "dunce", - "fs_extra", -] - [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", +] + [[package]] name = "bitflags" version = "2.11.0" @@ -214,6 +221,31 @@ dependencies = [ "generic-array", ] +[[package]] +name = "boring-sys2" +version = "5.0.0-alpha.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"455d79965f5155dcc88a7abce112c3590883889131b799beda10bf9a813ed669" +dependencies = [ + "bindgen", + "cmake", + "fs_extra", + "fslock", +] + +[[package]] +name = "boring2" +version = "5.0.0-alpha.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183ccc3854411c035410dcdbffafca62084f3a6c33f013c77e83c025d2a08a28" +dependencies = [ + "bitflags", + "boring-sys2", + "foreign-types", + "libc", + "openssl-macros", +] + [[package]] name = "brotli" version = "8.0.2" @@ -301,6 +333,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -337,6 +378,17 @@ dependencies = [ "inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.6.0" @@ -433,39 +485,10 @@ version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ddef33a339a91ea89fb53151bd0a4689cfce27055c291dfa69945475d22c747" dependencies = [ - "percent-encoding", "time", "version_check", ] -[[package]] -name = "cookie_store" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15b2c103cf610ec6cae3da84a766285b42fd16aad564758459e6ecf128c75206" -dependencies = [ - "cookie", - "document-features", - "idna", - "log", - "publicsuffix", - "serde", - "serde_derive", - "serde_json", - "time", - "url", -] - -[[package]] -name = "core-foundation" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" -dependencies = [ - 
"core-foundation-sys", - "libc", -] - [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -643,15 +666,6 @@ dependencies = [ "syn", ] -[[package]] -name = "document-features" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" -dependencies = [ - "litrs", -] - [[package]] name = "dotenvy" version = "0.15.7" @@ -673,12 +687,6 @@ dependencies = [ "dtoa", ] -[[package]] -name = "dunce" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" - [[package]] name = "dyn-clone" version = "1.0.20" @@ -691,6 +699,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -766,6 +780,33 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foreign-types" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" +dependencies = [ + "foreign-types-macros", + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-macros" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "foreign-types-shared" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -781,6 +822,16 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" +[[package]] +name = "fslock" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04412b8935272e3a9bae6f48c7bfff74c2911f60525404edfdd28e49884c3bfb" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "futf" version = "0.1.5" @@ -948,23 +999,16 @@ dependencies = [ ] [[package]] -name = "h2" -version = "0.4.13" -source = "git+https://github.com/0xMassi/webclaw-tls#159914dbfc877f218b80caea95be836a5121ac2f" -dependencies = [ - "atomic-waker", - "bytes", - "fnv", - "futures-core", - "futures-sink", - "http", - "indexmap", - "slab", - "smallvec", - "tokio", - "tokio-util", - "tracing", -] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" [[package]] name = "hashbrown" @@ -1041,6 +1085,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http2" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c45c6490693ee8a8d0d95fdbdf76fead9fb87548f7894137259a7c6d22821948" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap", + "parking_lot", + "slab", + "smallvec", + "tokio", + "tokio-util", +] + [[package]] name = "httparse" version = "1.10.1" @@ -1049,20 +1113,19 @@ checksum = 
"6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] name = "hyper" -version = "1.8.1" -source = "git+https://github.com/0xMassi/webclaw-tls#159914dbfc877f218b80caea95be836a5121ac2f" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" dependencies = [ "atomic-waker", "bytes", "futures-channel", "futures-core", - "h2", "http", "http-body", "httparse", "itoa", "pin-project-lite", - "pin-utils", "smallvec", "tokio", "want", @@ -1077,9 +1140,7 @@ dependencies = [ "http", "hyper", "hyper-util", - "log", "rustls", - "rustls-native-certs", "rustls-pki-types", "tokio", "tokio-rustls", @@ -1090,12 +1151,12 @@ dependencies = [ [[package]] name = "hyper-util" version = "0.1.20" -source = "git+https://github.com/0xMassi/webclaw-tls#159914dbfc877f218b80caea95be836a5121ac2f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ "base64", "bytes", "futures-channel", - "futures-core", "futures-util", "http", "http-body", @@ -1291,6 +1352,15 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -1337,6 +1407,16 @@ version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -1349,12 +1429,6 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" -[[package]] -name = "litrs" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" - [[package]] name = "lock_api" version = "0.4.14" @@ -1551,10 +1625,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] -name = "openssl-probe" -version = "0.2.1" +name = "openssl-macros" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "parking_lot" @@ -1674,12 +1753,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - [[package]] name = "pkg-config" version = "0.3.32" @@ -1747,22 +1820,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "psl-types" -version = "2.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" - -[[package]] -name = "publicsuffix" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "6f42ea446cab60335f76979ec15e12619a2165b5ae2c12166bef27d283a9fadf" -dependencies = [ - "idna", - "psl-types", -] - [[package]] name = "quick-xml" version = "0.37.5" @@ -1809,7 +1866,6 @@ version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ - "aws-lc-rs", "bytes", "getrandom 0.3.4", "lru-slab", @@ -2021,45 +2077,6 @@ dependencies = [ "webpki-roots", ] -[[package]] -name = "reqwest" -version = "0.13.2" -source = "git+https://github.com/0xMassi/webclaw-tls#159914dbfc877f218b80caea95be836a5121ac2f" -dependencies = [ - "base64", - "bytes", - "cookie", - "cookie_store", - "futures-core", - "h2", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-rustls", - "hyper-util", - "js-sys", - "log", - "percent-encoding", - "pin-project-lite", - "quinn", - "rustls", - "rustls-pki-types", - "serde", - "serde_json", - "sync_wrapper", - "tokio", - "tokio-rustls", - "tower", - "tower-http", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", - "webpki-roots", -] - [[package]] name = "ring" version = "0.17.14" @@ -2070,7 +2087,7 @@ dependencies = [ "cfg-if", "getrandom 0.2.17", "libc", - "untrusted 0.9.0", + "untrusted", "windows-sys 0.52.0", ] @@ -2158,31 +2175,15 @@ dependencies = [ [[package]] name = "rustls" version = "0.23.37" -source = "git+https://github.com/0xMassi/webclaw-tls#159914dbfc877f218b80caea95be836a5121ac2f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ - "aws-lc-rs", - "brotli", - "brotli-decompressor", - "log", "once_cell", "ring", "rustls-pki-types", "rustls-webpki", "subtle", "zeroize", - "zstd", -] - -[[package]] -name = "rustls-native-certs" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" -dependencies = [ - "openssl-probe", - "rustls-pki-types", - "schannel", - "security-framework", ] [[package]] @@ -2201,10 +2202,9 @@ version = "0.103.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" dependencies = [ - "aws-lc-rs", "ring", "rustls-pki-types", - "untrusted 0.9.0", + "untrusted", ] [[package]] @@ -2219,15 +2219,6 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" -[[package]] -name = "schannel" -version = "0.1.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" -dependencies = [ - "windows-sys 0.61.2", -] - [[package]] name = "schemars" version = "1.2.1" @@ -2254,6 +2245,17 @@ dependencies = [ "syn", ] +[[package]] +name = "schnellru" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "356285bbf17bea63d9e52e96bd18f039672ac92b55b8cb997d6162a2a37d1649" +dependencies = [ + "ahash", + "cfg-if", + "hashbrown 0.13.2", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2275,29 +2277,6 @@ dependencies = [ "tendril", ] -[[package]] -name = "security-framework" -version = "3.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" -dependencies = [ - "bitflags", - "core-foundation", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework-sys" -version = "2.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "selectors" 
version = "0.26.0" @@ -2674,6 +2653,16 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "tokio-boring2" +version = "5.0.0-alpha.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f81df1210d791f31d72d840de8fbd80b9c3cb324956523048b1413e2bd55756" +dependencies = [ + "boring2", + "tokio", +] + [[package]] name = "tokio-macros" version = "2.6.1" @@ -2873,12 +2862,6 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - [[package]] name = "untrusted" version = "0.9.0" @@ -3072,13 +3055,13 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.3.2" +version = "0.3.3" dependencies = [ "clap", "dotenvy", "rand 0.8.5", "regex", - "reqwest 0.12.28", + "reqwest", "serde_json", "tokio", "tracing", @@ -3092,7 +3075,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.3.2" +version = "0.3.3" dependencies = [ "ego-tree", "once_cell", @@ -3110,8 +3093,9 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.3.2" +version = "0.3.3" dependencies = [ + "bytes", "calamine", "http", "quick-xml 0.37.5", @@ -3124,34 +3108,17 @@ dependencies = [ "tracing", "url", "webclaw-core", - "webclaw-http", "webclaw-pdf", + "wreq", "zip 2.4.2", ] -[[package]] -name = "webclaw-http" -version = "0.1.2" -source = "git+https://github.com/0xMassi/webclaw-tls#159914dbfc877f218b80caea95be836a5121ac2f" -dependencies = [ - "bytes", - "h2", - "http", - "reqwest 0.13.2", - "rustls", - "rustls-native-certs", - "tokio", - "tracing", - "url", - "webpki-roots", -] - [[package]] name = "webclaw-llm" -version = "0.3.2" +version = "0.3.3" dependencies = [ "async-trait", - "reqwest 0.12.28", + "reqwest", "serde", "serde_json", 
"thiserror", @@ -3161,10 +3128,10 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.3.2" +version = "0.3.3" dependencies = [ "dotenvy", - "reqwest 0.12.28", + "reqwest", "rmcp", "schemars", "serde", @@ -3181,13 +3148,22 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.3.2" +version = "0.3.3" dependencies = [ "pdf-extract", "thiserror", "tracing", ] +[[package]] +name = "webpki-root-certs" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "webpki-roots" version = "1.0.6" @@ -3203,6 +3179,28 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" @@ -3506,6 +3504,42 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wreq" +version = "6.0.0-rc.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f79937f6c4df65b3f6f78715b9de2977afe9ee3b3436483c7949a24511e25935" +dependencies = [ + "ahash", + "boring2", + "brotli", + "bytes", + "cookie", + "flate2", + 
"futures-channel", + "futures-util", + "http", + "http-body", + "http-body-util", + "http2", + "httparse", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "schnellru", + "smallvec", + "socket2", + "tokio", + "tokio-boring2", + "tower", + "tower-http", + "url", + "want", + "webpki-root-certs", + "zstd", +] + [[package]] name = "writeable" version = "0.6.2" diff --git a/Cargo.toml b/Cargo.toml index 4f425a2..39e38cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.3.2" +version = "0.3.3" edition = "2024" license = "MIT" repository = "https://github.com/0xMassi/webclaw" @@ -22,13 +22,3 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] } clap = { version = "4", features = ["derive", "env"] } dotenvy = "0.15" -# TLS + HTTP/2 fingerprinting via webclaw-tls. -# rustls: TLS fingerprinting (JA4 match Chrome 146) -# h2: HTTP/2 SETTINGS ordering + pseudo-header order -# hyper/hyper-util/reqwest: passthrough for consistent dependency chain -[patch.crates-io] -rustls = { git = "https://github.com/0xMassi/webclaw-tls" } -h2 = { git = "https://github.com/0xMassi/webclaw-tls" } -hyper = { git = "https://github.com/0xMassi/webclaw-tls" } -hyper-util = { git = "https://github.com/0xMassi/webclaw-tls" } -reqwest = { git = "https://github.com/0xMassi/webclaw-tls" } diff --git a/README.md b/README.md index 2a5efb5..0c60c28 100644 --- a/README.md +++ b/README.md @@ -315,7 +315,7 @@ Noise removal webclaw ██████████████████ webclaw/ crates/ webclaw-core Pure extraction engine. Zero network deps. WASM-safe. - webclaw-fetch HTTP client + TLS fingerprinting. Crawler. Batch ops. + webclaw-fetch HTTP client + TLS fingerprinting (wreq/BoringSSL). Crawler. Batch ops. webclaw-llm LLM provider chain (Ollama -> OpenAI -> Anthropic) webclaw-pdf PDF text extraction webclaw-mcp MCP server (10 tools for AI agents) @@ -391,6 +391,10 @@ We welcome contributions! 
See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. - [Good first issues](https://github.com/0xMassi/webclaw/issues?q=label%3A%22good+first+issue%22) - [Architecture docs](CONTRIBUTING.md#architecture) +## Acknowledgments + +TLS and HTTP/2 browser fingerprinting is powered by [wreq](https://github.com/0x676e67/wreq) and [http2](https://github.com/0x676e67/http2) by [@0x676e67](https://github.com/0x676e67), who pioneered browser-grade HTTP/2 fingerprinting in Rust. + ## License [MIT](LICENSE) — use it however you want. diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml index bb16cd7..0b22d12 100644 --- a/crates/webclaw-fetch/Cargo.toml +++ b/crates/webclaw-fetch/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "webclaw-fetch" -description = "HTTP client with browser TLS fingerprint impersonation via webclaw-http" +description = "HTTP client with browser TLS fingerprint impersonation via wreq" version.workspace = true edition.workspace = true license.workspace = true @@ -12,7 +12,9 @@ serde = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } tokio = { workspace = true } -webclaw-http = { git = "https://github.com/0xMassi/webclaw-tls" } +wreq = { version = "6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] } +http = "1" +bytes = "1" url = "2" rand = "0.8" quick-xml = { version = "0.37", features = ["serde"] } @@ -22,4 +24,3 @@ zip = "2" [dev-dependencies] tempfile = "3" -http = "1" diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index e051b2e..d23b063 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -1,5 +1,5 @@ /// HTTP client with browser TLS fingerprint impersonation. -/// Uses webclaw-http for browser-grade TLS + HTTP/2 fingerprinting. +/// Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting. /// Supports single and batch operations with proxy rotation. 
/// Automatically detects PDF responses and extracts text via webclaw-pdf. /// @@ -60,7 +60,7 @@ pub struct FetchResult { pub status: u16, /// Final URL after any redirects. pub url: String, - pub headers: webclaw_http::HeaderMap, + pub headers: http::HeaderMap, pub elapsed: Duration, } @@ -78,20 +78,54 @@ pub struct BatchExtractResult { pub result: Result, } +/// Buffered response that owns its body. Provides the same sync API +/// that webclaw-http::Response used to provide. +struct Response { + status: u16, + url: String, + headers: http::HeaderMap, + body: bytes::Bytes, +} + +impl Response { + /// Buffer a wreq response into an owned Response. + async fn from_wreq(resp: wreq::Response) -> Result { + let status = resp.status().as_u16(); + let url = resp.uri().to_string(); + let headers = resp.headers().clone(); + let body = resp.bytes().await.map_err(|e| FetchError::BodyDecode(e.to_string()))?; + Ok(Self { status, url, headers, body }) + } + + fn status(&self) -> u16 { self.status } + fn url(&self) -> &str { &self.url } + fn headers(&self) -> &http::HeaderMap { &self.headers } + fn body(&self) -> &[u8] { &self.body } + fn is_success(&self) -> bool { (200..300).contains(&self.status) } + + fn text(&self) -> std::borrow::Cow<'_, str> { + String::from_utf8_lossy(&self.body) + } + + fn into_text(self) -> String { + String::from_utf8_lossy(&self.body).into_owned() + } +} + /// Internal representation of the client pool strategy. enum ClientPool { /// Pre-built clients with a fixed proxy (or no proxy). /// Fingerprint rotation still works via the pool when `random` is true. Static { - clients: Vec, + clients: Vec, random: bool, }, /// Pre-built pool of clients, each with a different proxy + fingerprint. /// Requests pick a client deterministically by host for HTTP/2 connection reuse. - Rotating { clients: Vec }, + Rotating { clients: Vec }, } -/// HTTP client with browser TLS + HTTP/2 fingerprinting via webclaw-http. 
+/// HTTP client with browser TLS + HTTP/2 fingerprinting via wreq. /// /// Operates in two modes: /// - **Static pool**: pre-built clients, optionally with fingerprint rotation. @@ -105,13 +139,6 @@ pub struct FetchClient { impl FetchClient { /// Build a new client from config. - /// - /// When `config.proxy_pool` is non-empty, pre-builds one client per proxy, - /// each with a randomly assigned fingerprint. Same-host URLs get routed to the - /// same client for HTTP/2 connection reuse. - /// - /// When `proxy_pool` is empty, pre-builds clients at construction time - /// (one per fingerprint for `Random` profiles, one for fixed profiles). pub fn new(config: FetchConfig) -> Result { let variants = collect_variants(&config.browser); let pdf_mode = config.pdf_mode.clone(); @@ -119,7 +146,9 @@ impl FetchClient { let pool = if config.proxy_pool.is_empty() { let clients = variants .into_iter() - .map(|v| build_client(&config, v, config.proxy.as_deref())) + .map(|v| { + crate::tls::build_client(v, config.timeout, &config.headers, config.proxy.as_deref()) + }) .collect::, _>>()?; let random = matches!(config.browser, BrowserProfile::Random); @@ -137,7 +166,7 @@ impl FetchClient { .iter() .map(|proxy| { let v = *variants.choose(&mut rng).unwrap(); - build_client(&config, v, Some(proxy)) + crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy)) }) .collect::, _>>()?; @@ -205,19 +234,17 @@ impl FetchClient { Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into()))) } - /// Single fetch attempt. Uses the TLS-impersonated client from the pool. + /// Single fetch attempt. async fn fetch_once(&self, url: &str) -> Result { let start = Instant::now(); let client = self.pick_client(url); - let response = client.get(url).await?; + let resp = client.get(url).send().await?; + let response = Response::from_wreq(resp).await?; response_to_result(response, start) } /// Fetch a URL then extract structured content. 
- /// - /// Automatically detects PDF responses via Content-Type header and routes - /// to webclaw-pdf for text extraction. HTML responses go through webclaw-core. #[instrument(skip(self), fields(url = %url))] pub async fn fetch_and_extract( &self, @@ -240,7 +267,8 @@ impl FetchClient { debug!("reddit detected, fetching {json_url}"); let client = self.pick_client(url); - let response = client.get(&json_url).await?; + let resp = client.get(&json_url).send().await?; + let response = Response::from_wreq(resp).await?; if response.is_success() { let bytes = response.body(); match crate::reddit::parse_reddit_json(bytes, url) { @@ -252,7 +280,8 @@ impl FetchClient { let start = Instant::now(); let client = self.pick_client(url); - let mut response = client.get(url).await?; + let resp = client.get(url).send().await?; + let mut response = Response::from_wreq(resp).await?; // Cookie warmup: if we get a challenge page, visit the homepage first // to collect Akamai cookies (_abck, bm_sz, etc.), then retry. @@ -260,8 +289,9 @@ impl FetchClient { && let Some(homepage) = extract_homepage(url) { debug!("challenge detected, warming cookies via {homepage}"); - let _ = client.get(&homepage).await; - response = client.get(url).await?; + let _ = client.get(&homepage).send().await; + let resp = client.get(url).send().await?; + response = Response::from_wreq(resp).await?; debug!("retried after cookie warmup: status={}", response.status()); } @@ -306,7 +336,7 @@ impl FetchClient { result.metadata.url = Some(final_url); Ok(result) } else { - let html = response.text().into_owned(); + let html = response.into_text(); let elapsed = start.elapsed(); debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete"); @@ -399,7 +429,7 @@ impl FetchClient { } /// Pick a client from the pool for a given URL. 
- fn pick_client(&self, url: &str) -> &webclaw_http::Client { + fn pick_client(&self, url: &str) -> &wreq::Client { match &self.pool { ClientPool::Static { clients, random } => { if *random { @@ -423,9 +453,9 @@ fn collect_variants(profile: &BrowserProfile) -> Vec { } } -/// Convert a webclaw-http Response into a FetchResult. +/// Convert a buffered Response into a FetchResult. fn response_to_result( - response: webclaw_http::Response, + response: Response, start: Instant, ) -> Result { let status = response.status(); @@ -455,7 +485,7 @@ fn extract_host(url: &str) -> String { /// Pick a client deterministically based on a host string. /// Same host always gets the same client, enabling HTTP/2 connection reuse. -fn pick_for_host<'a>(clients: &'a [webclaw_http::Client], host: &str) -> &'a webclaw_http::Client { +fn pick_for_host<'a>(clients: &'a [wreq::Client], host: &str) -> &'a wreq::Client { let mut hasher = std::collections::hash_map::DefaultHasher::new(); host.hash(&mut hasher); let idx = (hasher.finish() as usize) % clients.len(); @@ -463,43 +493,12 @@ fn pick_for_host<'a>(clients: &'a [webclaw_http::Client], host: &str) -> &'a web } /// Pick a random client from the pool for per-request rotation. -fn pick_random(clients: &[webclaw_http::Client]) -> &webclaw_http::Client { +fn pick_random(clients: &[wreq::Client]) -> &wreq::Client { use rand::Rng; let idx = rand::thread_rng().gen_range(0..clients.len()); &clients[idx] } -/// Build a webclaw-http Client from config + browser variant + optional proxy. 
-fn build_client( - config: &FetchConfig, - variant: BrowserVariant, - proxy: Option<&str>, -) -> Result { - let mut builder = match variant { - BrowserVariant::Chrome => webclaw_http::Client::builder().chrome(), - BrowserVariant::ChromeMacos => webclaw_http::Client::builder().chrome_macos(), - BrowserVariant::Firefox => webclaw_http::Client::builder().firefox(), - BrowserVariant::Safari => webclaw_http::Client::builder().safari(), - BrowserVariant::Edge => webclaw_http::Client::builder().edge(), - }; - - builder = builder.timeout(config.timeout); - - for (k, v) in &config.headers { - builder = builder.default_header(k, v); - } - - if let Some(proxy_url) = proxy { - builder = builder - .proxy(proxy_url) - .map_err(|e| FetchError::Build(format!("proxy: {e}")))?; - } - - builder - .build() - .map_err(|e| FetchError::Build(e.to_string())) -} - /// Status codes worth retrying: server errors + rate limiting. fn is_retryable_status(status: u16) -> bool { status == 429 @@ -518,7 +517,7 @@ fn is_retryable_error(err: &FetchError) -> bool { matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_)) } -fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool { +fn is_pdf_content_type(headers: &http::HeaderMap) -> bool { headers .get("content-type") .and_then(|ct| ct.to_str().ok()) @@ -530,9 +529,7 @@ fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool { } /// Detect if a response looks like a bot protection challenge page. -/// Checks for small HTML pages with known challenge markers. 
-fn is_challenge_response(response: &webclaw_http::Response) -> bool { - // Only check small HTML responses — real pages are typically >10KB +fn is_challenge_response(response: &Response) -> bool { let len = response.body().len(); if len > 15_000 || len == 0 { return false; @@ -541,12 +538,10 @@ fn is_challenge_response(response: &webclaw_http::Response) -> bool { let text = response.text(); let lower = text.to_lowercase(); - // Akamai Bot Manager challenge if lower.contains("challenge page") { return true; } - // Akamai sensor script on tiny page if lower.contains("bazadebezolkohpepadr") && len < 5_000 { return true; } @@ -628,7 +623,7 @@ mod tests { html: "".to_string(), status: 200, url: "https://example.com".to_string(), - headers: webclaw_http::HeaderMap::new(), + headers: http::HeaderMap::new(), elapsed: Duration::from_millis(42), }), }; @@ -680,7 +675,7 @@ mod tests { #[test] fn test_is_pdf_content_type() { - let mut headers = webclaw_http::HeaderMap::new(); + let mut headers = http::HeaderMap::new(); headers.insert("content-type", "application/pdf".parse().unwrap()); assert!(is_pdf_content_type(&headers)); @@ -696,7 +691,7 @@ mod tests { headers.insert("content-type", "text/html".parse().unwrap()); assert!(!is_pdf_content_type(&headers)); - let empty = webclaw_http::HeaderMap::new(); + let empty = http::HeaderMap::new(); assert!(!is_pdf_content_type(&empty)); } diff --git a/crates/webclaw-fetch/src/document.rs b/crates/webclaw-fetch/src/document.rs index 05c3b34..3d7d89d 100644 --- a/crates/webclaw-fetch/src/document.rs +++ b/crates/webclaw-fetch/src/document.rs @@ -34,7 +34,7 @@ impl std::fmt::Display for DocType { /// Detect document type from response headers or URL extension. /// Returns `None` for non-document responses (HTML, PDF, etc.). 
-pub fn is_document_content_type(headers: &webclaw_http::HeaderMap, url: &str) -> Option { +pub fn is_document_content_type(headers: &http::HeaderMap, url: &str) -> Option { // Check Content-Type header first if let Some(ct) = headers.get("content-type").and_then(|v| v.to_str().ok()) { let mime = ct.split(';').next().unwrap_or("").trim(); @@ -474,7 +474,7 @@ fn strip_markdown_formatting(markdown: &str) -> String { #[cfg(test)] mod tests { use super::*; - use webclaw_http::HeaderMap; + use http::HeaderMap; fn headers_with(name: &str, value: &str) -> HeaderMap { let mut h = HeaderMap::new(); diff --git a/crates/webclaw-fetch/src/error.rs b/crates/webclaw-fetch/src/error.rs index fe102d3..37c011d 100644 --- a/crates/webclaw-fetch/src/error.rs +++ b/crates/webclaw-fetch/src/error.rs @@ -5,7 +5,7 @@ use thiserror::Error; #[derive(Debug, Error)] pub enum FetchError { #[error("request failed: {0}")] - Request(#[from] webclaw_http::Error), + Request(#[from] wreq::Error), #[error("invalid url: {0}")] InvalidUrl(String), diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index b03a208..3a9b02c 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -1,5 +1,5 @@ //! webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation. -//! Uses webclaw-http for browser-grade TLS + HTTP/2 fingerprinting. +//! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting. //! Automatically detects PDF responses and delegates to webclaw-pdf. 
pub mod browser; pub mod client; @@ -10,6 +10,7 @@ pub mod linkedin; pub mod proxy; pub mod reddit; pub mod sitemap; +pub mod tls; pub use browser::BrowserProfile; pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult}; @@ -17,5 +18,5 @@ pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult}; pub use error::FetchError; pub use proxy::{parse_proxy_file, parse_proxy_line}; pub use sitemap::SitemapEntry; -pub use webclaw_http::HeaderMap; +pub use http::HeaderMap; pub use webclaw_pdf::PdfMode; diff --git a/crates/webclaw-fetch/src/tls.rs b/crates/webclaw-fetch/src/tls.rs new file mode 100644 index 0000000..608ae96 --- /dev/null +++ b/crates/webclaw-fetch/src/tls.rs @@ -0,0 +1,372 @@ +//! Browser TLS + HTTP/2 fingerprint profiles built on wreq (BoringSSL). +//! +//! Replaces the old webclaw-http/webclaw-tls patched rustls stack. +//! Each profile configures TLS options (cipher suites, curves, extensions, +//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order, +//! stream dependency, priorities) to match real browser fingerprints. + +use std::time::Duration; + +use wreq::http2::{ + Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId, +}; +use wreq::tls::{AlpsProtocol, CertificateCompressionAlgorithm, TlsOptions, TlsVersion}; +use wreq::{Client, Emulation}; + +use crate::browser::BrowserVariant; +use crate::error::FetchError; + +/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order). 
+const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA"; + +/// Chrome signature algorithms. +const CHROME_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:rsa_pkcs1_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha512"; + +/// Chrome curves (post-quantum ML-KEM + X25519 + P-256 + P-384). +const CHROME_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384"; + +/// Firefox cipher list. +const FIREFOX_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_CHACHA20_POLY1305_SHA256:TLS_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA"; + +/// Firefox signature algorithms. +const FIREFOX_SIGALGS: &str = "ecdsa_secp256r1_sha256:ecdsa_secp384r1_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha256:rsa_pss_rsae_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha256:rsa_pkcs1_sha384:rsa_pkcs1_sha512:ecdsa_sha1:rsa_pkcs1_sha1"; + +/// Firefox curves. +const FIREFOX_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384:P-521"; + +/// Safari cipher list. 
+const SAFARI_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_CBC_SHA"; + +/// Safari signature algorithms. +const SAFARI_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha512:rsa_pkcs1_sha384:rsa_pkcs1_sha512"; + +/// Safari curves. +const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521"; + +// --- Chrome HTTP headers in correct wire order --- + +const CHROME_HEADERS: &[(&str, &str)] = &[ + ( + "sec-ch-ua", + r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#, + ), + ("sec-ch-ua-mobile", "?0"), + ("sec-ch-ua-platform", "\"Windows\""), + ("upgrade-insecure-requests", "1"), + ( + "user-agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36", + ), + ( + "accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + ), + ("sec-fetch-site", "none"), + ("sec-fetch-mode", "navigate"), + ("sec-fetch-user", "?1"), + ("sec-fetch-dest", "document"), + ("accept-encoding", "gzip, deflate, br, zstd"), + ("accept-language", "en-US,en;q=0.9"), + ("priority", "u=0, i"), +]; + +const CHROME_MACOS_HEADERS: &[(&str, &str)] = &[ + ( + "sec-ch-ua", + r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#, + ), + ("sec-ch-ua-mobile", "?0"), + 
("sec-ch-ua-platform", "\"macOS\""), + ("upgrade-insecure-requests", "1"), + ( + "user-agent", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36", + ), + ( + "accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + ), + ("sec-fetch-site", "none"), + ("sec-fetch-mode", "navigate"), + ("sec-fetch-user", "?1"), + ("sec-fetch-dest", "document"), + ("accept-encoding", "gzip, deflate, br, zstd"), + ("accept-language", "en-US,en;q=0.9"), + ("priority", "u=0, i"), +]; + +const FIREFOX_HEADERS: &[(&str, &str)] = &[ + ( + "user-agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0", + ), + ( + "accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + ), + ("accept-language", "en-US,en;q=0.5"), + ("accept-encoding", "gzip, deflate, br, zstd"), + ("upgrade-insecure-requests", "1"), + ("sec-fetch-dest", "document"), + ("sec-fetch-mode", "navigate"), + ("sec-fetch-site", "none"), + ("sec-fetch-user", "?1"), + ("priority", "u=0, i"), +]; + +const SAFARI_HEADERS: &[(&str, &str)] = &[ + ( + "user-agent", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Safari/605.1.15", + ), + ( + "accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + ), + ("sec-fetch-site", "none"), + ("accept-language", "en-US,en;q=0.9"), + ("sec-fetch-mode", "navigate"), + ("accept-encoding", "gzip, deflate, br"), + ("sec-fetch-dest", "document"), +]; + +const EDGE_HEADERS: &[(&str, &str)] = &[ + ( + "sec-ch-ua", + r#""Microsoft Edge";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#, + ), + ("sec-ch-ua-mobile", "?0"), + ("sec-ch-ua-platform", "\"Windows\""), + ("upgrade-insecure-requests", "1"), + ( + "user-agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/145.0.0.0 Safari/537.36 Edg/145.0.0.0", + ), + ( + "accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + ), + ("sec-fetch-site", "none"), + ("sec-fetch-mode", "navigate"), + ("sec-fetch-user", "?1"), + ("sec-fetch-dest", "document"), + ("accept-encoding", "gzip, deflate, br, zstd"), + ("accept-language", "en-US,en;q=0.9"), + ("priority", "u=0, i"), +]; + +fn chrome_tls() -> TlsOptions { + TlsOptions::builder() + .cipher_list(CHROME_CIPHERS) + .sigalgs_list(CHROME_SIGALGS) + .curves_list(CHROME_CURVES) + .min_tls_version(TlsVersion::TLS_1_2) + .max_tls_version(TlsVersion::TLS_1_3) + .grease_enabled(true) + .permute_extensions(true) + .enable_ech_grease(true) + .pre_shared_key(true) + .enable_ocsp_stapling(true) + .enable_signed_cert_timestamps(true) + .alps_protocols([AlpsProtocol::HTTP2]) + .alps_use_new_codepoint(true) + .aes_hw_override(true) + .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI]) + .build() +} + +fn firefox_tls() -> TlsOptions { + TlsOptions::builder() + .cipher_list(FIREFOX_CIPHERS) + .sigalgs_list(FIREFOX_SIGALGS) + .curves_list(FIREFOX_CURVES) + .min_tls_version(TlsVersion::TLS_1_2) + .max_tls_version(TlsVersion::TLS_1_3) + .grease_enabled(true) + .permute_extensions(false) + .enable_ech_grease(true) + .pre_shared_key(true) + .enable_ocsp_stapling(true) + .enable_signed_cert_timestamps(true) + .certificate_compression_algorithms(&[ + CertificateCompressionAlgorithm::ZLIB, + CertificateCompressionAlgorithm::BROTLI, + ]) + .build() +} + +fn safari_tls() -> TlsOptions { + TlsOptions::builder() + .cipher_list(SAFARI_CIPHERS) + .sigalgs_list(SAFARI_SIGALGS) + .curves_list(SAFARI_CURVES) + .min_tls_version(TlsVersion::TLS_1_2) + .max_tls_version(TlsVersion::TLS_1_3) + .grease_enabled(true) + .permute_extensions(false) + .enable_ech_grease(false) + .pre_shared_key(false) + .enable_ocsp_stapling(true) + 
.enable_signed_cert_timestamps(true) + .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB]) + .build() +} + +fn chrome_h2() -> Http2Options { + Http2Options::builder() + .initial_window_size(6_291_456) + .initial_connection_window_size(15_728_640) + .max_header_list_size(262_144) + .header_table_size(65_536) + .max_concurrent_streams(1000u32) + .enable_push(false) + .settings_order( + SettingsOrder::builder() + .extend([ + SettingId::HeaderTableSize, + SettingId::EnablePush, + SettingId::MaxConcurrentStreams, + SettingId::InitialWindowSize, + SettingId::MaxFrameSize, + SettingId::MaxHeaderListSize, + SettingId::EnableConnectProtocol, + SettingId::NoRfc7540Priorities, + ]) + .build(), + ) + .headers_pseudo_order( + PseudoOrder::builder() + .extend([ + PseudoId::Method, + PseudoId::Authority, + PseudoId::Scheme, + PseudoId::Path, + ]) + .build(), + ) + .headers_stream_dependency(StreamDependency::new(StreamId::zero(), 219, true)) + .build() +} + +fn firefox_h2() -> Http2Options { + Http2Options::builder() + .initial_window_size(131_072) + .initial_connection_window_size(12_517_377) + .max_header_list_size(65_536) + .header_table_size(65_536) + .settings_order( + SettingsOrder::builder() + .extend([ + SettingId::HeaderTableSize, + SettingId::InitialWindowSize, + SettingId::MaxFrameSize, + ]) + .build(), + ) + .headers_pseudo_order( + PseudoOrder::builder() + .extend([ + PseudoId::Method, + PseudoId::Path, + PseudoId::Authority, + PseudoId::Scheme, + ]) + .build(), + ) + .build() +} + +fn safari_h2() -> Http2Options { + Http2Options::builder() + .initial_window_size(2_097_152) + .initial_connection_window_size(10_420_225) + .max_header_list_size(0) + .header_table_size(4_096) + .enable_push(false) + .max_concurrent_streams(100u32) + .settings_order( + SettingsOrder::builder() + .extend([ + SettingId::EnablePush, + SettingId::MaxConcurrentStreams, + SettingId::InitialWindowSize, + SettingId::MaxFrameSize, + ]) + .build(), + ) + 
.headers_pseudo_order( + PseudoOrder::builder() + .extend([ + PseudoId::Method, + PseudoId::Scheme, + PseudoId::Authority, + PseudoId::Path, + ]) + .build(), + ) + .headers_stream_dependency(StreamDependency::new(StreamId::zero(), 255, false)) + .build() +} + +fn build_headers(pairs: &[(&str, &str)]) -> http::HeaderMap { + let mut map = http::HeaderMap::with_capacity(pairs.len()); + for (name, value) in pairs { + if let (Ok(n), Ok(v)) = ( + http::header::HeaderName::from_bytes(name.as_bytes()), + http::header::HeaderValue::from_str(value), + ) { + map.insert(n, v); + } + } + map +} + +/// Build a wreq Client for a specific browser variant. +pub fn build_client( + variant: BrowserVariant, + timeout: Duration, + extra_headers: &std::collections::HashMap, + proxy: Option<&str>, +) -> Result { + let (tls, h2, headers) = match variant { + BrowserVariant::Chrome => (chrome_tls(), chrome_h2(), CHROME_HEADERS), + BrowserVariant::ChromeMacos => (chrome_tls(), chrome_h2(), CHROME_MACOS_HEADERS), + BrowserVariant::Firefox => (firefox_tls(), firefox_h2(), FIREFOX_HEADERS), + BrowserVariant::Safari => (safari_tls(), safari_h2(), SAFARI_HEADERS), + BrowserVariant::Edge => (chrome_tls(), chrome_h2(), EDGE_HEADERS), + }; + + let mut header_map = build_headers(headers); + + // Append extra headers after profile defaults + for (k, v) in extra_headers { + if let (Ok(n), Ok(val)) = ( + http::header::HeaderName::from_bytes(k.as_bytes()), + http::header::HeaderValue::from_str(v), + ) { + header_map.insert(n, val); + } + } + + let emulation = Emulation::builder() + .tls_options(tls) + .http2_options(h2) + .headers(header_map) + .build(); + + let mut builder = Client::builder() + .emulation(emulation) + .redirect(wreq::redirect::Policy::limited(10)) + .cookie_store(true) + .timeout(timeout); + + if let Some(proxy_url) = proxy { + let proxy = + wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?; + builder = builder.proxy(proxy); + } + + builder + .build() + 
.map_err(|e| FetchError::Build(e.to_string())) +}