mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-08 22:25:12 +02:00
feat: replace primp with webclaw-tls, bump to v0.3.0
Replace primp dependency with our own TLS fingerprinting stack (webclaw-tls). Perfect Chrome 146 JA4 + Akamai hash match. - Remove primp entirely (zero references remaining) - webclaw-fetch now uses webclaw-http from github.com/0xMassi/webclaw-tls - Native + Mozilla root CAs (fixes HTTPS on cross-signed cert chains) - Skip unknown certificate extensions (SCT tolerance) - 99% bypass rate on 102 sites (was ~85% with primp) - Fixes #5 (HTTPS broken — example.com and similar sites now work) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
77e93441c0
commit
f13cb83c73
8 changed files with 204 additions and 599 deletions
236
Cargo.lock
generated
236
Cargo.lock
generated
|
|
@ -25,7 +25,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
|||
dependencies = [
|
||||
"cfg-if",
|
||||
"cipher",
|
||||
"cpufeatures 0.2.17",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -183,9 +183,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "aws-lc-sys"
|
||||
version = "0.39.0"
|
||||
version = "0.39.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a"
|
||||
checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cmake",
|
||||
|
|
@ -313,17 +313,6 @@ version = "0.2.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
|
||||
|
||||
[[package]]
|
||||
name = "chacha20"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures 0.3.0",
|
||||
"rand_core 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.44"
|
||||
|
|
@ -467,16 +456,6 @@ dependencies = [
|
|||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.10.1"
|
||||
|
|
@ -502,15 +481,6 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crc"
|
||||
version = "3.4.0"
|
||||
|
|
@ -973,7 +943,6 @@ dependencies = [
|
|||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi 6.0.0",
|
||||
"rand_core 0.10.0",
|
||||
"wasip2",
|
||||
"wasip3",
|
||||
]
|
||||
|
|
@ -981,7 +950,7 @@ dependencies = [
|
|||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.4.13"
|
||||
source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137"
|
||||
source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5"
|
||||
dependencies = [
|
||||
"atomic-waker",
|
||||
"bytes",
|
||||
|
|
@ -1081,7 +1050,7 @@ checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
|
|||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.8.1"
|
||||
source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137"
|
||||
source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5"
|
||||
dependencies = [
|
||||
"atomic-waker",
|
||||
"bytes",
|
||||
|
|
@ -1121,7 +1090,7 @@ dependencies = [
|
|||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.20"
|
||||
source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137"
|
||||
source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
|
|
@ -1136,11 +1105,9 @@ dependencies = [
|
|||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"socket2",
|
||||
"system-configuration",
|
||||
"tokio",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
"windows-registry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1342,10 +1309,12 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.91"
|
||||
version = "0.3.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
|
||||
checksum = "cc4c90f45aa2e6eacbe8645f77fdea542ac97a494bcd117a67df9ff4d611f995"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"futures-util",
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
|
@ -1502,22 +1471,6 @@ version = "2.8.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
|
||||
|
||||
[[package]]
|
||||
name = "mime"
|
||||
version = "0.3.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
|
||||
|
||||
[[package]]
|
||||
name = "mime_guess"
|
||||
version = "2.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e"
|
||||
dependencies = [
|
||||
"mime",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minimal-lexical"
|
||||
version = "0.2.1"
|
||||
|
|
@ -1785,20 +1738,6 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "primp"
|
||||
version = "1.2.0"
|
||||
source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137"
|
||||
dependencies = [
|
||||
"h2",
|
||||
"http",
|
||||
"rand 0.10.0",
|
||||
"reqwest 0.13.2",
|
||||
"rustls",
|
||||
"url",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.106"
|
||||
|
|
@ -1942,17 +1881,6 @@ dependencies = [
|
|||
"rand_core 0.9.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8"
|
||||
dependencies = [
|
||||
"chacha20",
|
||||
"getrandom 0.4.2",
|
||||
"rand_core 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
|
|
@ -1991,12 +1919,6 @@ dependencies = [
|
|||
"getrandom 0.3.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
|
||||
|
||||
[[package]]
|
||||
name = "rangemap"
|
||||
version = "1.7.1"
|
||||
|
|
@ -2102,14 +2024,13 @@ dependencies = [
|
|||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.13.2"
|
||||
source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137"
|
||||
source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
"cookie",
|
||||
"cookie_store",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2",
|
||||
"http",
|
||||
"http-body",
|
||||
|
|
@ -2119,7 +2040,6 @@ dependencies = [
|
|||
"hyper-util",
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime_guess",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"quinn",
|
||||
|
|
@ -2127,18 +2047,15 @@ dependencies = [
|
|||
"rustls-pki-types",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"tower",
|
||||
"tower-http",
|
||||
"tower-service",
|
||||
"url",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"wasm-streams",
|
||||
"web-sys",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
|
@ -2221,9 +2138,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
version = "2.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
|
||||
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
|
|
@ -2241,7 +2158,7 @@ dependencies = [
|
|||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.23.37"
|
||||
source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137"
|
||||
source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5"
|
||||
dependencies = [
|
||||
"aws-lc-rs",
|
||||
"brotli",
|
||||
|
|
@ -2253,7 +2170,6 @@ dependencies = [
|
|||
"rustls-webpki",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
"zlib-rs",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
|
|
@ -2366,7 +2282,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"core-foundation 0.10.1",
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"security-framework-sys",
|
||||
|
|
@ -2489,7 +2405,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures 0.2.17",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
]
|
||||
|
||||
|
|
@ -2632,27 +2548,6 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"core-foundation 0.9.4",
|
||||
"system-configuration-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration-sys"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.27.0"
|
||||
|
|
@ -2951,12 +2846,6 @@ version = "1.19.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
|
|
@ -3074,9 +2963,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.114"
|
||||
version = "0.2.115"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
|
||||
checksum = "6523d69017b7633e396a89c5efab138161ed5aafcbc8d3e5c5a42ae38f50495a"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
|
|
@ -3087,23 +2976,19 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.64"
|
||||
version = "0.4.65"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
|
||||
checksum = "2d1faf851e778dfa54db7cd438b70758eba9755cb47403f3496edd7c8fc212f0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"futures-util",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.114"
|
||||
version = "0.2.115"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
|
||||
checksum = "4e3a6c758eb2f701ed3d052ff5737f5bfe6614326ea7f3bbac7156192dc32e67"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
|
|
@ -3111,9 +2996,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.114"
|
||||
version = "0.2.115"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
|
||||
checksum = "921de2737904886b52bcbb237301552d05969a6f9c40d261eb0533c8b055fedf"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"proc-macro2",
|
||||
|
|
@ -3124,9 +3009,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.114"
|
||||
version = "0.2.115"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
|
||||
checksum = "a93e946af942b58934c604527337bad9ae33ba1d5c6900bbb41c2c07c2364a93"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
|
@ -3153,19 +3038,6 @@ dependencies = [
|
|||
"wasmparser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-streams"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasmparser"
|
||||
version = "0.244.0"
|
||||
|
|
@ -3180,9 +3052,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.91"
|
||||
version = "0.3.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
|
||||
checksum = "84cde8507f4d7cfcb1185b8cb5890c494ffea65edbe1ba82cfd63661c805ed94"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
|
|
@ -3200,11 +3072,10 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.2.3"
|
||||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"reqwest 0.12.28",
|
||||
"serde_json",
|
||||
|
|
@ -3220,10 +3091,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.2.3"
|
||||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"rquickjs",
|
||||
"scraper",
|
||||
|
|
@ -3238,10 +3108,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.2.3"
|
||||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"calamine",
|
||||
"primp",
|
||||
"quick-xml 0.37.5",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
|
|
@ -3252,13 +3121,31 @@ dependencies = [
|
|||
"tracing",
|
||||
"url",
|
||||
"webclaw-core",
|
||||
"webclaw-http",
|
||||
"webclaw-pdf",
|
||||
"zip 2.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webclaw-http"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"h2",
|
||||
"http",
|
||||
"reqwest 0.13.2",
|
||||
"rustls",
|
||||
"rustls-native-certs",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.2.3"
|
||||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest 0.12.28",
|
||||
|
|
@ -3271,7 +3158,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.2.3"
|
||||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"dotenvy",
|
||||
"reqwest 0.12.28",
|
||||
|
|
@ -3291,7 +3178,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.2.3"
|
||||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
@ -3354,17 +3241,6 @@ version = "0.2.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||
|
||||
[[package]]
|
||||
name = "windows-registry"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
"windows-result",
|
||||
"windows-strings",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-result"
|
||||
version = "0.4.1"
|
||||
|
|
@ -3667,18 +3543,18 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.8.47"
|
||||
version = "0.8.48"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87"
|
||||
checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
|
||||
dependencies = [
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.8.47"
|
||||
version = "0.8.48"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89"
|
||||
checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
|
|
|||
18
Cargo.toml
18
Cargo.toml
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.2.3"
|
||||
version = "0.3.0"
|
||||
edition = "2024"
|
||||
license = "MIT"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
@ -22,11 +22,13 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|||
clap = { version = "4", features = ["derive", "env"] }
|
||||
dotenvy = "0.15"
|
||||
|
||||
# primp requires patched forks with TLS impersonation support.
|
||||
# Must mirror all patches from primp's own Cargo.toml.
|
||||
# TLS + HTTP/2 fingerprinting via webclaw-tls.
|
||||
# rustls: TLS fingerprinting (JA4 match Chrome 146)
|
||||
# h2: HTTP/2 SETTINGS ordering + pseudo-header order
|
||||
# hyper/hyper-util/reqwest: passthrough for consistent dependency chain
|
||||
[patch.crates-io]
|
||||
reqwest = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-reqwest" }
|
||||
rustls = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-rustls/rustls" }
|
||||
h2 = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-h2" }
|
||||
hyper = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-hyper" }
|
||||
hyper-util = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-hyper-util" }
|
||||
rustls = { git = "https://github.com/0xMassi/webclaw-tls" }
|
||||
h2 = { git = "https://github.com/0xMassi/webclaw-tls" }
|
||||
hyper = { git = "https://github.com/0xMassi/webclaw-tls" }
|
||||
hyper-util = { git = "https://github.com/0xMassi/webclaw-tls" }
|
||||
reqwest = { git = "https://github.com/0xMassi/webclaw-tls" }
|
||||
|
|
|
|||
|
|
@ -3,6 +3,11 @@
|
|||
/// When WEBCLAW_API_KEY is set (or --api-key is passed), the CLI can fall back
|
||||
/// to api.webclaw.io for bot-protected or JS-rendered sites. With --cloud flag,
|
||||
/// all requests go through the cloud API directly.
|
||||
///
|
||||
/// NOTE: The canonical, full-featured cloud module lives in webclaw-mcp/src/cloud.rs
|
||||
/// (smart_fetch, bot detection, JS rendering checks). This is the minimal subset
|
||||
/// needed by the CLI. It is kept separate because adding webclaw-mcp as a
|
||||
/// dependency would pull in rmcp.
|
||||
use serde_json::{Value, json};
|
||||
|
||||
const API_BASE: &str = "https://api.webclaw.io/v1";
|
||||
|
|
@ -51,46 +56,6 @@ impl CloudClient {
|
|||
self.post("scrape", body).await
|
||||
}
|
||||
|
||||
/// Summarize via cloud API.
|
||||
pub async fn summarize(
|
||||
&self,
|
||||
url: &str,
|
||||
max_sentences: Option<usize>,
|
||||
) -> Result<Value, String> {
|
||||
let mut body = json!({ "url": url });
|
||||
if let Some(n) = max_sentences {
|
||||
body["max_sentences"] = json!(n);
|
||||
}
|
||||
self.post("summarize", body).await
|
||||
}
|
||||
|
||||
/// Brand extraction via cloud API.
|
||||
pub async fn brand(&self, url: &str) -> Result<Value, String> {
|
||||
self.post("brand", json!({ "url": url })).await
|
||||
}
|
||||
|
||||
/// Diff via cloud API.
|
||||
pub async fn diff(&self, url: &str) -> Result<Value, String> {
|
||||
self.post("diff", json!({ "url": url })).await
|
||||
}
|
||||
|
||||
/// Extract via cloud API.
|
||||
pub async fn extract(
|
||||
&self,
|
||||
url: &str,
|
||||
schema: Option<&str>,
|
||||
prompt: Option<&str>,
|
||||
) -> Result<Value, String> {
|
||||
let mut body = json!({ "url": url });
|
||||
if let Some(s) = schema {
|
||||
body["schema"] = serde_json::from_str(s).unwrap_or(json!(s));
|
||||
}
|
||||
if let Some(p) = prompt {
|
||||
body["prompt"] = json!(p);
|
||||
}
|
||||
self.post("extract", body).await
|
||||
}
|
||||
|
||||
async fn post(&self, endpoint: &str, body: Value) -> Result<Value, String> {
|
||||
let resp = self
|
||||
.http
|
||||
|
|
@ -113,58 +78,3 @@ impl CloudClient {
|
|||
.map_err(|e| format!("cloud API response parse failed: {e}"))
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if HTML is a bot protection challenge page.
|
||||
pub fn is_bot_protected(html: &str) -> bool {
|
||||
let html_lower = html.to_lowercase();
|
||||
|
||||
// Cloudflare
|
||||
if html_lower.contains("_cf_chl_opt") || html_lower.contains("challenge-platform") {
|
||||
return true;
|
||||
}
|
||||
if (html_lower.contains("just a moment") || html_lower.contains("checking your browser"))
|
||||
&& html_lower.contains("cf-spinner")
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if (html_lower.contains("cf-turnstile")
|
||||
|| html_lower.contains("challenges.cloudflare.com/turnstile"))
|
||||
&& html.len() < 100_000
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// DataDome
|
||||
if html_lower.contains("geo.captcha-delivery.com") {
|
||||
return true;
|
||||
}
|
||||
|
||||
// AWS WAF
|
||||
if html_lower.contains("awswaf-captcha") {
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Check if a page likely needs JS rendering.
|
||||
pub fn needs_js_rendering(word_count: usize, html: &str) -> bool {
|
||||
let has_scripts = html.contains("<script");
|
||||
|
||||
if word_count < 50 && html.len() > 5_000 && has_scripts {
|
||||
return true;
|
||||
}
|
||||
|
||||
if word_count < 800 && html.len() > 50_000 && has_scripts {
|
||||
let html_lower = html.to_lowercase();
|
||||
if html_lower.contains("react-app")
|
||||
|| html_lower.contains("id=\"__next\"")
|
||||
|| html_lower.contains("id=\"root\"")
|
||||
|| html_lower.contains("id=\"app\"")
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "webclaw-fetch"
|
||||
description = "HTTP client with browser TLS fingerprint impersonation via Impit"
|
||||
description = "HTTP client with browser TLS fingerprint impersonation via webclaw-http"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
|
@ -12,9 +12,7 @@ serde = { workspace = true }
|
|||
thiserror = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
primp = { git = "https://github.com/deedy5/primp", default-features = false, features = [
|
||||
"default-tls", "http2", "impersonate", "cookies", "gzip", "brotli", "deflate", "zstd", "socks",
|
||||
] }
|
||||
webclaw-http = { git = "https://github.com/0xMassi/webclaw-tls" }
|
||||
url = "2"
|
||||
rand = "0.8"
|
||||
quick-xml = { version = "0.37", features = ["serde"] }
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
/// Browser fingerprint selection and rotation.
|
||||
/// Maps our simple `BrowserProfile` enum to primp's impersonation profiles.
|
||||
use primp::{Impersonate, ImpersonateOS};
|
||||
/// Maps our BrowserProfile enum to webclaw-http client builder methods.
|
||||
|
||||
/// Which browser identity to present at the TLS/HTTP layer.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
|
|
@ -12,85 +11,41 @@ pub enum BrowserProfile {
|
|||
Random,
|
||||
}
|
||||
|
||||
/// A complete impersonation profile: browser + OS.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ImpersonateProfile {
|
||||
pub browser: Impersonate,
|
||||
pub os: ImpersonateOS,
|
||||
/// A browser variant for building webclaw-http clients.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum BrowserVariant {
|
||||
Chrome,
|
||||
ChromeMacos,
|
||||
Firefox,
|
||||
Safari,
|
||||
Edge,
|
||||
}
|
||||
|
||||
/// All Chrome profiles we ship, newest first.
|
||||
pub fn chrome_profiles() -> Vec<ImpersonateProfile> {
|
||||
/// All Chrome variants we ship.
|
||||
pub fn chrome_variants() -> Vec<BrowserVariant> {
|
||||
vec![BrowserVariant::Chrome, BrowserVariant::ChromeMacos]
|
||||
}
|
||||
|
||||
/// All Firefox variants we ship.
|
||||
pub fn firefox_variants() -> Vec<BrowserVariant> {
|
||||
vec![BrowserVariant::Firefox]
|
||||
}
|
||||
|
||||
/// All variants for maximum diversity in Random mode.
|
||||
pub fn all_variants() -> Vec<BrowserVariant> {
|
||||
vec![
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::ChromeV145,
|
||||
os: ImpersonateOS::Windows,
|
||||
},
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::ChromeV145,
|
||||
os: ImpersonateOS::MacOS,
|
||||
},
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::ChromeV144,
|
||||
os: ImpersonateOS::Windows,
|
||||
},
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::ChromeV144,
|
||||
os: ImpersonateOS::Linux,
|
||||
},
|
||||
BrowserVariant::Chrome,
|
||||
BrowserVariant::ChromeMacos,
|
||||
BrowserVariant::Firefox,
|
||||
BrowserVariant::Safari,
|
||||
BrowserVariant::Edge,
|
||||
]
|
||||
}
|
||||
|
||||
/// All Firefox profiles we ship, newest first.
|
||||
pub fn firefox_profiles() -> Vec<ImpersonateProfile> {
|
||||
vec![
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::FirefoxV146,
|
||||
os: ImpersonateOS::Windows,
|
||||
},
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::FirefoxV146,
|
||||
os: ImpersonateOS::Linux,
|
||||
},
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::FirefoxV140,
|
||||
os: ImpersonateOS::Windows,
|
||||
},
|
||||
]
|
||||
pub fn latest_chrome() -> BrowserVariant {
|
||||
BrowserVariant::Chrome
|
||||
}
|
||||
|
||||
/// Safari + Edge + Opera profiles for maximum diversity in Random mode.
|
||||
pub fn extra_profiles() -> Vec<ImpersonateProfile> {
|
||||
vec![
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::SafariV18_5,
|
||||
os: ImpersonateOS::MacOS,
|
||||
},
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::SafariV26,
|
||||
os: ImpersonateOS::MacOS,
|
||||
},
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::EdgeV145,
|
||||
os: ImpersonateOS::Windows,
|
||||
},
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::OperaV127,
|
||||
os: ImpersonateOS::Windows,
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
pub fn latest_chrome() -> ImpersonateProfile {
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::SafariV26,
|
||||
os: ImpersonateOS::MacOS,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn latest_firefox() -> ImpersonateProfile {
|
||||
ImpersonateProfile {
|
||||
browser: Impersonate::FirefoxV146,
|
||||
os: ImpersonateOS::Windows,
|
||||
}
|
||||
pub fn latest_firefox() -> BrowserVariant {
|
||||
BrowserVariant::Firefox
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
/// HTTP client with browser TLS fingerprint impersonation.
|
||||
/// Wraps primp to provide a simple fetch interface with optional
|
||||
/// content extraction via webclaw-core. Supports single and batch operations.
|
||||
/// Uses webclaw-http for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
/// Supports single and batch operations with proxy rotation.
|
||||
/// Automatically detects PDF responses and extracts text via webclaw-pdf.
|
||||
///
|
||||
/// Two proxy modes:
|
||||
|
|
@ -17,7 +17,7 @@ use tokio::sync::Semaphore;
|
|||
use tracing::{debug, instrument, warn};
|
||||
use webclaw_pdf::PdfMode;
|
||||
|
||||
use crate::browser::{self, BrowserProfile, ImpersonateProfile};
|
||||
use crate::browser::{self, BrowserProfile, BrowserVariant};
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Configuration for building a [`FetchClient`].
|
||||
|
|
@ -83,20 +83,22 @@ enum ClientPool {
|
|||
/// Pre-built clients with a fixed proxy (or no proxy).
|
||||
/// Fingerprint rotation still works via the pool when `random` is true.
|
||||
Static {
|
||||
clients: Vec<primp::Client>,
|
||||
clients: Vec<webclaw_http::Client>,
|
||||
random: bool,
|
||||
},
|
||||
/// Pre-built pool of clients, each with a different proxy + fingerprint.
|
||||
/// Requests pick a client deterministically by host for HTTP/2 connection reuse.
|
||||
Rotating { clients: Vec<primp::Client> },
|
||||
Rotating {
|
||||
clients: Vec<webclaw_http::Client>,
|
||||
},
|
||||
}
|
||||
|
||||
/// HTTP client that impersonates browser TLS fingerprints via primp.
|
||||
/// HTTP client with browser TLS + HTTP/2 fingerprinting via webclaw-http.
|
||||
///
|
||||
/// Operates in two modes:
|
||||
/// - **Static pool**: pre-built primp clients, optionally with fingerprint rotation.
|
||||
/// - **Static pool**: pre-built clients, optionally with fingerprint rotation.
|
||||
/// Used when no `proxy_pool` is configured. Fast (no per-request construction).
|
||||
/// - **Rotating pool**: pre-built primp clients, one per proxy in the pool.
|
||||
/// - **Rotating pool**: pre-built clients, one per proxy in the pool.
|
||||
/// Same-host URLs are routed to the same client for HTTP/2 multiplexing.
|
||||
pub struct FetchClient {
|
||||
pool: ClientPool,
|
||||
|
|
@ -106,20 +108,20 @@ pub struct FetchClient {
|
|||
impl FetchClient {
|
||||
/// Build a new client from config.
|
||||
///
|
||||
/// When `config.proxy_pool` is non-empty, pre-builds one primp client per proxy,
|
||||
/// When `config.proxy_pool` is non-empty, pre-builds one client per proxy,
|
||||
/// each with a randomly assigned fingerprint. Same-host URLs get routed to the
|
||||
/// same client for HTTP/2 connection reuse.
|
||||
///
|
||||
/// When `proxy_pool` is empty, pre-builds primp clients at construction time
|
||||
/// When `proxy_pool` is empty, pre-builds clients at construction time
|
||||
/// (one per fingerprint for `Random` profiles, one for fixed profiles).
|
||||
pub fn new(config: FetchConfig) -> Result<Self, FetchError> {
|
||||
let profiles = collect_profiles(&config.browser);
|
||||
let variants = collect_variants(&config.browser);
|
||||
let pdf_mode = config.pdf_mode.clone();
|
||||
|
||||
let pool = if config.proxy_pool.is_empty() {
|
||||
let clients = profiles
|
||||
let clients = variants
|
||||
.into_iter()
|
||||
.map(|p| build_primp_client(&config, &p, config.proxy.as_deref()))
|
||||
.map(|v| build_client(&config, v, config.proxy.as_deref()))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
let random = matches!(config.browser, BrowserProfile::Random);
|
||||
|
|
@ -136,14 +138,13 @@ impl FetchClient {
|
|||
.proxy_pool
|
||||
.iter()
|
||||
.map(|proxy| {
|
||||
let p = profiles.choose(&mut rng).unwrap().clone();
|
||||
build_primp_client(&config, &p, Some(proxy))
|
||||
let v = *variants.choose(&mut rng).unwrap();
|
||||
build_client(&config, v, Some(proxy))
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
debug!(
|
||||
clients = clients.len(),
|
||||
profiles = profiles.len(),
|
||||
"fetch client ready (pre-built rotating pool)"
|
||||
);
|
||||
|
||||
|
|
@ -206,91 +207,13 @@ impl FetchClient {
|
|||
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
|
||||
}
|
||||
|
||||
/// Single fetch attempt with automatic plain-client fallback.
|
||||
///
|
||||
/// If the TLS-impersonated client fails with a connection error or gets a 403,
|
||||
/// retries with a plain client (no impersonation). Some sites (e.g. ycombinator.com)
|
||||
/// reject forged TLS fingerprints but accept default rustls connections.
|
||||
/// Single fetch attempt. Uses the TLS-impersonated client from the pool.
|
||||
async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
|
||||
let client = match &self.pool {
|
||||
ClientPool::Static { clients, random } => {
|
||||
if *random {
|
||||
let host = extract_host(url);
|
||||
pick_for_host(clients, &host)
|
||||
} else {
|
||||
&clients[0]
|
||||
}
|
||||
}
|
||||
ClientPool::Rotating { clients } => pick_random(clients),
|
||||
};
|
||||
|
||||
// Try impersonated client first
|
||||
let needs_plain_fallback = match client.get(url).send().await {
|
||||
Ok(response) => {
|
||||
let status = response.status().as_u16();
|
||||
if status == 403 {
|
||||
debug!(url, "impersonated client got 403, trying plain fallback");
|
||||
true
|
||||
} else {
|
||||
return Self::response_to_result(response, start).await;
|
||||
}
|
||||
}
|
||||
Err(_e) => {
|
||||
debug!(
|
||||
url,
|
||||
"impersonated client connection failed, trying plain fallback"
|
||||
);
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
// Plain client fallback (no TLS impersonation)
|
||||
if needs_plain_fallback {
|
||||
let plain = primp::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||
.cookie_store(true)
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(format!("plain client: {e}")))?;
|
||||
|
||||
let response = plain.get(url).send().await?;
|
||||
return Self::response_to_result(response, start).await;
|
||||
}
|
||||
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
/// Convert a primp Response into a FetchResult.
|
||||
async fn response_to_result(
|
||||
response: primp::Response,
|
||||
start: Instant,
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let status = response.status().as_u16();
|
||||
let final_url = response.url().to_string();
|
||||
|
||||
let headers: HashMap<String, String> = response
|
||||
.headers()
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
|
||||
.collect();
|
||||
|
||||
let html = response
|
||||
.text()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
|
||||
|
||||
Ok(FetchResult {
|
||||
html,
|
||||
status,
|
||||
url: final_url,
|
||||
headers,
|
||||
elapsed,
|
||||
})
|
||||
let response = client.get(url).await?;
|
||||
response_to_result(response, start)
|
||||
}
|
||||
|
||||
/// Fetch a URL then extract structured content.
|
||||
|
|
@ -307,10 +230,6 @@ impl FetchClient {
|
|||
}
|
||||
|
||||
/// Fetch a URL then extract structured content with custom extraction options.
|
||||
///
|
||||
/// Same as [`fetch_and_extract`] but accepts `ExtractionOptions` for CSS selector
|
||||
/// filtering, main-content-only mode, etc. Options only apply to HTML responses;
|
||||
/// PDF extraction ignores them (no DOM to filter).
|
||||
#[instrument(skip(self, options), fields(url = %url))]
|
||||
pub async fn fetch_and_extract_with_options(
|
||||
&self,
|
||||
|
|
@ -318,24 +237,15 @@ impl FetchClient {
|
|||
options: &webclaw_core::ExtractionOptions,
|
||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
||||
// Uses a plain reqwest client — Reddit's JSON endpoint blocks TLS-fingerprinted clients
|
||||
// but accepts standard requests with a browser User-Agent.
|
||||
if crate::reddit::is_reddit_url(url) {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
debug!("reddit detected, fetching {json_url}");
|
||||
|
||||
let plain = primp::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||
.timeout(std::time::Duration::from_secs(15))
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(format!("reddit client: {e}")))?;
|
||||
let response = plain.get(&json_url).send().await?;
|
||||
if response.status().is_success() {
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
match crate::reddit::parse_reddit_json(&bytes, url) {
|
||||
let client = self.pick_client(url);
|
||||
let response = client.get(&json_url).await?;
|
||||
if response.is_success() {
|
||||
let bytes = response.body();
|
||||
match crate::reddit::parse_reddit_json(bytes, url) {
|
||||
Ok(result) => return Ok(result),
|
||||
Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
|
||||
}
|
||||
|
|
@ -344,50 +254,19 @@ impl FetchClient {
|
|||
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
let response = client.get(url).await?;
|
||||
|
||||
// Try impersonated client, fall back to plain on connection error or 403
|
||||
let response = match client.get(url).send().await {
|
||||
Ok(resp) if resp.status().as_u16() == 403 => {
|
||||
debug!(url, "impersonated client got 403, trying plain fallback");
|
||||
let plain = primp::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||
.cookie_store(true)
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?;
|
||||
plain.get(url).send().await?
|
||||
}
|
||||
Ok(resp) => resp,
|
||||
Err(_e) => {
|
||||
debug!(url, "impersonated client failed, trying plain fallback");
|
||||
let plain = primp::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36")
|
||||
.cookie_store(true)
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?;
|
||||
plain.get(url).send().await?
|
||||
}
|
||||
};
|
||||
|
||||
let status = response.status().as_u16();
|
||||
let status = response.status();
|
||||
let final_url = response.url().to_string();
|
||||
|
||||
let headers: HashMap<String, String> = response
|
||||
.headers()
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
|
||||
.collect();
|
||||
let headers: HashMap<String, String> = response.headers().clone();
|
||||
|
||||
let is_pdf = is_pdf_content_type(&headers);
|
||||
|
||||
if is_pdf {
|
||||
debug!(status, "detected PDF response, using pdf extraction");
|
||||
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
let bytes = response.body();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
debug!(
|
||||
|
|
@ -397,17 +276,14 @@ impl FetchClient {
|
|||
"PDF fetch complete"
|
||||
);
|
||||
|
||||
let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?;
|
||||
let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?;
|
||||
Ok(pdf_to_extraction_result(&pdf_result, &final_url))
|
||||
} else if let Some(doc_type) =
|
||||
crate::document::is_document_content_type(&headers, &final_url)
|
||||
{
|
||||
debug!(status, doc_type = ?doc_type, "detected document response, extracting");
|
||||
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
let bytes = response.body();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
debug!(
|
||||
|
|
@ -417,14 +293,11 @@ impl FetchClient {
|
|||
"document fetch complete"
|
||||
);
|
||||
|
||||
let mut result = crate::document::extract_document(&bytes, doc_type)?;
|
||||
let mut result = crate::document::extract_document(bytes, doc_type)?;
|
||||
result.metadata.url = Some(final_url);
|
||||
Ok(result)
|
||||
} else {
|
||||
let html = response
|
||||
.text()
|
||||
.await
|
||||
.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
let html = response.text().into_owned();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
|
||||
|
|
@ -440,21 +313,11 @@ impl FetchClient {
|
|||
|
||||
let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?;
|
||||
|
||||
// YouTube transcript: caption URLs are IP-signed and expire immediately,
|
||||
// so the timedtext endpoint returns empty responses. The innertube
|
||||
// get_transcript API requires cookies/consent. Transcript extraction
|
||||
// will be enabled via the cloud API (JS rendering + cookie jar).
|
||||
// The extraction functions exist in webclaw_core::youtube but are not
|
||||
// wired up here until we have a reliable fetch path.
|
||||
|
||||
Ok(extraction)
|
||||
}
|
||||
}
|
||||
|
||||
/// Fetch multiple URLs concurrently with bounded parallelism.
|
||||
///
|
||||
/// Spawns one task per URL, bounded by a semaphore. Results are returned
|
||||
/// in the same order as the input URLs, regardless of completion order.
|
||||
pub async fn fetch_batch(
|
||||
self: &Arc<Self>,
|
||||
urls: &[&str],
|
||||
|
|
@ -479,9 +342,6 @@ impl FetchClient {
|
|||
}
|
||||
|
||||
/// Fetch and extract multiple URLs concurrently with bounded parallelism.
|
||||
///
|
||||
/// Same semantics as [`fetch_batch`] but runs extraction on each response.
|
||||
/// Results preserve input URL order.
|
||||
pub async fn fetch_and_extract_batch(
|
||||
self: &Arc<Self>,
|
||||
urls: &[&str],
|
||||
|
|
@ -496,9 +356,6 @@ impl FetchClient {
|
|||
}
|
||||
|
||||
/// Fetch and extract multiple URLs concurrently with custom extraction options.
|
||||
///
|
||||
/// Same as [`fetch_and_extract_batch`] but applies the given options
|
||||
/// (include/exclude selectors, only-main-content, etc.) to each extraction.
|
||||
pub async fn fetch_and_extract_batch_with_options(
|
||||
self: &Arc<Self>,
|
||||
urls: &[&str],
|
||||
|
|
@ -533,7 +390,7 @@ impl FetchClient {
|
|||
}
|
||||
|
||||
/// Pick a client from the pool for a given URL.
|
||||
fn pick_client(&self, url: &str) -> &primp::Client {
|
||||
fn pick_client(&self, url: &str) -> &webclaw_http::Client {
|
||||
match &self.pool {
|
||||
ClientPool::Static { clients, random } => {
|
||||
if *random {
|
||||
|
|
@ -548,21 +405,37 @@ impl FetchClient {
|
|||
}
|
||||
}
|
||||
|
||||
/// Collect the impersonation profiles to use based on the browser profile.
|
||||
fn collect_profiles(profile: &BrowserProfile) -> Vec<ImpersonateProfile> {
|
||||
/// Collect the browser variants to use based on the browser profile.
|
||||
fn collect_variants(profile: &BrowserProfile) -> Vec<BrowserVariant> {
|
||||
match profile {
|
||||
BrowserProfile::Random => {
|
||||
let mut profiles = Vec::new();
|
||||
profiles.extend(browser::chrome_profiles());
|
||||
profiles.extend(browser::firefox_profiles());
|
||||
profiles.extend(browser::extra_profiles());
|
||||
profiles
|
||||
}
|
||||
BrowserProfile::Random => browser::all_variants(),
|
||||
BrowserProfile::Chrome => vec![browser::latest_chrome()],
|
||||
BrowserProfile::Firefox => vec![browser::latest_firefox()],
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a webclaw-http Response into a FetchResult.
|
||||
fn response_to_result(
|
||||
response: webclaw_http::Response,
|
||||
start: Instant,
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let status = response.status();
|
||||
let final_url = response.url().to_string();
|
||||
let headers = response.headers().clone();
|
||||
let html = response.into_text();
|
||||
let elapsed = start.elapsed();
|
||||
|
||||
debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
|
||||
|
||||
Ok(FetchResult {
|
||||
html,
|
||||
status,
|
||||
url: final_url,
|
||||
headers,
|
||||
elapsed,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract the host from a URL, returning empty string on parse failure.
|
||||
fn extract_host(url: &str) -> String {
|
||||
url::Url::parse(url)
|
||||
|
|
@ -573,7 +446,10 @@ fn extract_host(url: &str) -> String {
|
|||
|
||||
/// Pick a client deterministically based on a host string.
|
||||
/// Same host always gets the same client, enabling HTTP/2 connection reuse.
|
||||
fn pick_for_host<'a>(clients: &'a [primp::Client], host: &str) -> &'a primp::Client {
|
||||
fn pick_for_host<'a>(
|
||||
clients: &'a [webclaw_http::Client],
|
||||
host: &str,
|
||||
) -> &'a webclaw_http::Client {
|
||||
let mut hasher = std::collections::hash_map::DefaultHasher::new();
|
||||
host.hash(&mut hasher);
|
||||
let idx = (hasher.finish() as usize) % clients.len();
|
||||
|
|
@ -581,12 +457,41 @@ fn pick_for_host<'a>(clients: &'a [primp::Client], host: &str) -> &'a primp::Cli
|
|||
}
|
||||
|
||||
/// Pick a random client from the pool for per-request rotation.
|
||||
fn pick_random(clients: &[primp::Client]) -> &primp::Client {
|
||||
fn pick_random(clients: &[webclaw_http::Client]) -> &webclaw_http::Client {
|
||||
use rand::Rng;
|
||||
let idx = rand::thread_rng().gen_range(0..clients.len());
|
||||
&clients[idx]
|
||||
}
|
||||
|
||||
/// Build a webclaw-http Client from config + browser variant + optional proxy.
|
||||
fn build_client(
|
||||
config: &FetchConfig,
|
||||
variant: BrowserVariant,
|
||||
proxy: Option<&str>,
|
||||
) -> Result<webclaw_http::Client, FetchError> {
|
||||
let mut builder = match variant {
|
||||
BrowserVariant::Chrome => webclaw_http::Client::builder().chrome(),
|
||||
BrowserVariant::ChromeMacos => webclaw_http::Client::builder().chrome_macos(),
|
||||
BrowserVariant::Firefox => webclaw_http::Client::builder().firefox(),
|
||||
BrowserVariant::Safari => webclaw_http::Client::builder().safari(),
|
||||
BrowserVariant::Edge => webclaw_http::Client::builder().edge(),
|
||||
};
|
||||
|
||||
builder = builder.timeout(config.timeout);
|
||||
|
||||
for (k, v) in &config.headers {
|
||||
builder = builder.default_header(k, v);
|
||||
}
|
||||
|
||||
if let Some(proxy_url) = proxy {
|
||||
builder = builder
|
||||
.proxy(proxy_url)
|
||||
.map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
|
||||
}
|
||||
|
||||
builder.build().map_err(|e| FetchError::Build(e.to_string()))
|
||||
}
|
||||
|
||||
/// Status codes worth retrying: server errors + rate limiting.
|
||||
fn is_retryable_status(status: u16) -> bool {
|
||||
status == 429
|
||||
|
|
@ -670,46 +575,6 @@ async fn collect_ordered<T>(
|
|||
slots.into_iter().flatten().collect()
|
||||
}
|
||||
|
||||
/// Build a single primp Client from config + impersonation profile + optional proxy.
|
||||
fn build_primp_client(
|
||||
config: &FetchConfig,
|
||||
profile: &ImpersonateProfile,
|
||||
proxy: Option<&str>,
|
||||
) -> Result<primp::Client, FetchError> {
|
||||
let redirect_policy = if config.follow_redirects {
|
||||
primp::redirect::Policy::limited(config.max_redirects as usize)
|
||||
} else {
|
||||
primp::redirect::Policy::none()
|
||||
};
|
||||
|
||||
let mut headers = primp::header::HeaderMap::new();
|
||||
for (k, v) in &config.headers {
|
||||
if let (Ok(name), Ok(val)) = (
|
||||
primp::header::HeaderName::from_bytes(k.as_bytes()),
|
||||
primp::header::HeaderValue::from_str(v),
|
||||
) {
|
||||
headers.insert(name, val);
|
||||
}
|
||||
}
|
||||
|
||||
let mut builder = primp::Client::builder()
|
||||
.impersonate(profile.browser)
|
||||
.impersonate_os(profile.os)
|
||||
.cookie_store(true)
|
||||
.timeout(config.timeout)
|
||||
.redirect(redirect_policy)
|
||||
.default_headers(headers);
|
||||
|
||||
if let Some(proxy_url) = proxy {
|
||||
builder = builder
|
||||
.proxy(primp::Proxy::all(proxy_url).map_err(|e| FetchError::Build(e.to_string()))?);
|
||||
}
|
||||
|
||||
builder
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(e.to_string()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
/// Fetch-layer errors. Wraps primp/network failures into a single type
|
||||
/// Fetch-layer errors. Wraps HTTP/network failures into a single type
|
||||
/// that callers can match on without leaking transport details.
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum FetchError {
|
||||
#[error("request failed: {0}")]
|
||||
Request(#[from] primp::Error),
|
||||
Request(#[from] webclaw_http::Error),
|
||||
|
||||
#[error("invalid url: {0}")]
|
||||
InvalidUrl(String),
|
||||
|
|
|
|||
|
|
@ -1,15 +1,14 @@
|
|||
/// webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation.
|
||||
/// Uses Impit under the hood to make requests that look like real
|
||||
/// browsers at the TLS, HTTP/2, and header levels.
|
||||
/// Automatically detects PDF responses and delegates to webclaw-pdf.
|
||||
//! webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation.
|
||||
//! Uses webclaw-http for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
//! Automatically detects PDF responses and delegates to webclaw-pdf.
|
||||
pub mod browser;
|
||||
pub mod client;
|
||||
pub mod crawler;
|
||||
pub mod document;
|
||||
pub mod error;
|
||||
pub mod linkedin;
|
||||
pub(crate) mod linkedin;
|
||||
pub mod proxy;
|
||||
pub mod reddit;
|
||||
pub(crate) mod reddit;
|
||||
pub mod sitemap;
|
||||
|
||||
pub use browser::BrowserProfile;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue