diff --git a/Cargo.lock b/Cargo.lock index d302f29..24a32ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,7 +25,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" dependencies = [ "cfg-if", "cipher", - "cpufeatures 0.2.17", + "cpufeatures", ] [[package]] @@ -183,9 +183,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.39.0" +version = "0.39.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" +checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399" dependencies = [ "cc", "cmake", @@ -313,17 +313,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" -[[package]] -name = "chacha20" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" -dependencies = [ - "cfg-if", - "cpufeatures 0.3.0", - "rand_core 0.10.0", -] - [[package]] name = "chrono" version = "0.4.44" @@ -467,16 +456,6 @@ dependencies = [ "url", ] -[[package]] -name = "core-foundation" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "core-foundation" version = "0.10.1" @@ -502,15 +481,6 @@ dependencies = [ "libc", ] -[[package]] -name = "cpufeatures" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" -dependencies = [ - "libc", -] - [[package]] name = "crc" version = "3.4.0" @@ -973,7 +943,6 @@ dependencies = [ "cfg-if", "libc", "r-efi 6.0.0", - "rand_core 0.10.0", "wasip2", "wasip3", ] @@ -981,7 +950,7 @@ dependencies = [ 
[[package]] name = "h2" version = "0.4.13" -source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137" +source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5" dependencies = [ "atomic-waker", "bytes", @@ -1081,7 +1050,7 @@ checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] name = "hyper" version = "1.8.1" -source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137" +source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5" dependencies = [ "atomic-waker", "bytes", @@ -1121,7 +1090,7 @@ dependencies = [ [[package]] name = "hyper-util" version = "0.1.20" -source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137" +source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5" dependencies = [ "base64", "bytes", @@ -1136,11 +1105,9 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2", - "system-configuration", "tokio", "tower-service", "tracing", - "windows-registry", ] [[package]] @@ -1342,10 +1309,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.91" +version = "0.3.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +checksum = "cc4c90f45aa2e6eacbe8645f77fdea542ac97a494bcd117a67df9ff4d611f995" dependencies = [ + "cfg-if", + "futures-util", "once_cell", "wasm-bindgen", ] @@ -1502,22 +1471,6 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - -[[package]] -name = "mime_guess" -version = "2.0.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -1785,20 +1738,6 @@ dependencies = [ "syn", ] -[[package]] -name = "primp" -version = "1.2.0" -source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137" -dependencies = [ - "h2", - "http", - "rand 0.10.0", - "reqwest 0.13.2", - "rustls", - "url", - "webpki-roots", -] - [[package]] name = "proc-macro2" version = "1.0.106" @@ -1942,17 +1881,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "rand" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" -dependencies = [ - "chacha20", - "getrandom 0.4.2", - "rand_core 0.10.0", -] - [[package]] name = "rand_chacha" version = "0.3.1" @@ -1991,12 +1919,6 @@ dependencies = [ "getrandom 0.3.4", ] -[[package]] -name = "rand_core" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" - [[package]] name = "rangemap" version = "1.7.1" @@ -2102,14 +2024,13 @@ dependencies = [ [[package]] name = "reqwest" version = "0.13.2" -source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137" +source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5" dependencies = [ "base64", "bytes", "cookie", "cookie_store", "futures-core", - "futures-util", "h2", "http", "http-body", @@ -2119,7 +2040,6 @@ dependencies = [ "hyper-util", "js-sys", "log", - "mime_guess", "percent-encoding", "pin-project-lite", "quinn", @@ -2127,18 +2047,15 @@ dependencies = [ "rustls-pki-types", "serde", "serde_json", - "serde_urlencoded", "sync_wrapper", "tokio", "tokio-rustls", - "tokio-util", "tower", 
"tower-http", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams", "web-sys", "webpki-roots", ] @@ -2221,9 +2138,9 @@ dependencies = [ [[package]] name = "rustc-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" [[package]] name = "rustix" @@ -2241,7 +2158,7 @@ dependencies = [ [[package]] name = "rustls" version = "0.23.37" -source = "git+https://github.com/deedy5/primp#b1d34a7b9fc7c24be515b1b74a469060b12fe137" +source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5" dependencies = [ "aws-lc-rs", "brotli", @@ -2253,7 +2170,6 @@ dependencies = [ "rustls-webpki", "subtle", "zeroize", - "zlib-rs", "zstd", ] @@ -2366,7 +2282,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags", - "core-foundation 0.10.1", + "core-foundation", "core-foundation-sys", "libc", "security-framework-sys", @@ -2489,7 +2405,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures 0.2.17", + "cpufeatures", "digest", ] @@ -2632,27 +2548,6 @@ dependencies = [ "syn", ] -[[package]] -name = "system-configuration" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" -dependencies = [ - "bitflags", - "core-foundation 0.9.4", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "tempfile" version = "3.27.0" @@ -2951,12 +2846,6 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" -[[package]] -name = "unicase" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" - [[package]] name = "unicode-ident" version = "1.0.24" @@ -3074,9 +2963,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.114" +version = "0.2.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +checksum = "6523d69017b7633e396a89c5efab138161ed5aafcbc8d3e5c5a42ae38f50495a" dependencies = [ "cfg-if", "once_cell", @@ -3087,23 +2976,19 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.64" +version = "0.4.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +checksum = "2d1faf851e778dfa54db7cd438b70758eba9755cb47403f3496edd7c8fc212f0" dependencies = [ - "cfg-if", - "futures-util", "js-sys", - "once_cell", "wasm-bindgen", - "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.114" +version = "0.2.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +checksum = "4e3a6c758eb2f701ed3d052ff5737f5bfe6614326ea7f3bbac7156192dc32e67" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3111,9 +2996,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.114" +version = "0.2.115" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +checksum = "921de2737904886b52bcbb237301552d05969a6f9c40d261eb0533c8b055fedf" dependencies = [ "bumpalo", "proc-macro2", @@ -3124,9 +3009,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.114" +version = "0.2.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +checksum = "a93e946af942b58934c604527337bad9ae33ba1d5c6900bbb41c2c07c2364a93" dependencies = [ "unicode-ident", ] @@ -3153,19 +3038,6 @@ dependencies = [ "wasmparser", ] -[[package]] -name = "wasm-streams" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb" -dependencies = [ - "futures-util", - "js-sys", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", -] - [[package]] name = "wasmparser" version = "0.244.0" @@ -3180,9 +3052,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.91" +version = "0.3.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" +checksum = "84cde8507f4d7cfcb1185b8cb5890c494ffea65edbe1ba82cfd63661c805ed94" dependencies = [ "js-sys", "wasm-bindgen", @@ -3200,11 +3072,10 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.2.3" +version = "0.3.0" dependencies = [ "clap", "dotenvy", - "rand 0.8.5", "regex", "reqwest 0.12.28", "serde_json", @@ -3220,10 +3091,9 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.2.3" +version = "0.3.0" dependencies = [ "ego-tree", - "once_cell", "regex", "rquickjs", "scraper", @@ -3238,10 +3108,9 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.2.3" +version = "0.3.0" dependencies = [ "calamine", - "primp", "quick-xml 0.37.5", "rand 0.8.5", "serde", @@ -3252,13 +3121,31 @@ 
dependencies = [ "tracing", "url", "webclaw-core", + "webclaw-http", "webclaw-pdf", "zip 2.4.2", ] +[[package]] +name = "webclaw-http" +version = "0.1.0" +source = "git+https://github.com/0xMassi/webclaw-tls#fcbd389f90994fc1be1efdde1065713d0ef562d5" +dependencies = [ + "bytes", + "h2", + "http", + "reqwest 0.13.2", + "rustls", + "rustls-native-certs", + "tokio", + "tracing", + "url", + "webpki-roots", +] + [[package]] name = "webclaw-llm" -version = "0.2.3" +version = "0.3.0" dependencies = [ "async-trait", "reqwest 0.12.28", @@ -3271,7 +3158,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.2.3" +version = "0.3.0" dependencies = [ "dotenvy", "reqwest 0.12.28", @@ -3291,7 +3178,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.2.3" +version = "0.3.0" dependencies = [ "pdf-extract", "thiserror", @@ -3354,17 +3241,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-registry" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" -dependencies = [ - "windows-link", - "windows-result", - "windows-strings", -] - [[package]] name = "windows-result" version = "0.4.1" @@ -3667,18 +3543,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +checksum = 
"70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 6ec3f59..fb3afe6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.2.3" +version = "0.3.0" edition = "2024" license = "MIT" repository = "https://github.com/0xMassi/webclaw" @@ -22,11 +22,13 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] } clap = { version = "4", features = ["derive", "env"] } dotenvy = "0.15" -# primp requires patched forks with TLS impersonation support. -# Must mirror all patches from primp's own Cargo.toml. +# TLS + HTTP/2 fingerprinting via webclaw-tls. +# rustls: TLS fingerprinting (JA4 match Chrome 146) +# h2: HTTP/2 SETTINGS ordering + pseudo-header order +# hyper/hyper-util/reqwest: passthrough for consistent dependency chain [patch.crates-io] -reqwest = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-reqwest" } -rustls = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-rustls/rustls" } -h2 = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-h2" } -hyper = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-hyper" } -hyper-util = { git = "https://github.com/deedy5/primp", subdirectory = "crates/primp-hyper-util" } +rustls = { git = "https://github.com/0xMassi/webclaw-tls" } +h2 = { git = "https://github.com/0xMassi/webclaw-tls" } +hyper = { git = "https://github.com/0xMassi/webclaw-tls" } +hyper-util = { git = "https://github.com/0xMassi/webclaw-tls" } +reqwest = { git = "https://github.com/0xMassi/webclaw-tls" } diff --git a/crates/webclaw-cli/src/cloud.rs b/crates/webclaw-cli/src/cloud.rs index 8b068ba..464eb4c 100644 --- a/crates/webclaw-cli/src/cloud.rs +++ b/crates/webclaw-cli/src/cloud.rs @@ -3,6 +3,11 @@ /// When WEBCLAW_API_KEY is set (or --api-key is passed), the CLI can fall back /// to 
api.webclaw.io for bot-protected or JS-rendered sites. With --cloud flag, /// all requests go through the cloud API directly. +/// +/// NOTE: The canonical, full-featured cloud module lives in webclaw-mcp/src/cloud.rs +/// (smart_fetch, bot detection, JS rendering checks). This is the minimal subset +/// needed by the CLI. Kept separate because depending on webclaw-mcp +/// would pull the rmcp crate into the CLI build. use serde_json::{Value, json}; const API_BASE: &str = "https://api.webclaw.io/v1"; @@ -51,46 +56,6 @@ impl CloudClient { self.post("scrape", body).await } - /// Summarize via cloud API. - pub async fn summarize( - &self, - url: &str, - max_sentences: Option, - ) -> Result { - let mut body = json!({ "url": url }); - if let Some(n) = max_sentences { - body["max_sentences"] = json!(n); - } - self.post("summarize", body).await - } - - /// Brand extraction via cloud API. - pub async fn brand(&self, url: &str) -> Result { - self.post("brand", json!({ "url": url })).await - } - - /// Diff via cloud API. - pub async fn diff(&self, url: &str) -> Result { - self.post("diff", json!({ "url": url })).await - } - - /// Extract via cloud API. - pub async fn extract( - &self, - url: &str, - schema: Option<&str>, - prompt: Option<&str>, - ) -> Result { - let mut body = json!({ "url": url }); - if let Some(s) = schema { - body["schema"] = serde_json::from_str(s).unwrap_or(json!(s)); - } - if let Some(p) = prompt { - body["prompt"] = json!(p); - } - self.post("extract", body).await - } - async fn post(&self, endpoint: &str, body: Value) -> Result { let resp = self .http @@ -113,58 +78,3 @@ impl CloudClient { .map_err(|e| format!("cloud API response parse failed: {e}")) } } - -/// Check if HTML is a bot protection challenge page. 
-pub fn is_bot_protected(html: &str) -> bool { - let html_lower = html.to_lowercase(); - - // Cloudflare - if html_lower.contains("_cf_chl_opt") || html_lower.contains("challenge-platform") { - return true; - } - if (html_lower.contains("just a moment") || html_lower.contains("checking your browser")) - && html_lower.contains("cf-spinner") - { - return true; - } - if (html_lower.contains("cf-turnstile") - || html_lower.contains("challenges.cloudflare.com/turnstile")) - && html.len() < 100_000 - { - return true; - } - - // DataDome - if html_lower.contains("geo.captcha-delivery.com") { - return true; - } - - // AWS WAF - if html_lower.contains("awswaf-captcha") { - return true; - } - - false -} - -/// Check if a page likely needs JS rendering. -pub fn needs_js_rendering(word_count: usize, html: &str) -> bool { - let has_scripts = html.contains(" 5_000 && has_scripts { - return true; - } - - if word_count < 800 && html.len() > 50_000 && has_scripts { - let html_lower = html.to_lowercase(); - if html_lower.contains("react-app") - || html_lower.contains("id=\"__next\"") - || html_lower.contains("id=\"root\"") - || html_lower.contains("id=\"app\"") - { - return true; - } - } - - false -} diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml index e4da69b..dd1b046 100644 --- a/crates/webclaw-fetch/Cargo.toml +++ b/crates/webclaw-fetch/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "webclaw-fetch" -description = "HTTP client with browser TLS fingerprint impersonation via Impit" +description = "HTTP client with browser TLS fingerprint impersonation via webclaw-http" version.workspace = true edition.workspace = true license.workspace = true @@ -12,9 +12,7 @@ serde = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } tokio = { workspace = true } -primp = { git = "https://github.com/deedy5/primp", default-features = false, features = [ - "default-tls", "http2", "impersonate", "cookies", "gzip", "brotli", "deflate", 
"zstd", "socks", -] } +webclaw-http = { git = "https://github.com/0xMassi/webclaw-tls" } url = "2" rand = "0.8" quick-xml = { version = "0.37", features = ["serde"] } diff --git a/crates/webclaw-fetch/src/browser.rs b/crates/webclaw-fetch/src/browser.rs index c35fab5..4865670 100644 --- a/crates/webclaw-fetch/src/browser.rs +++ b/crates/webclaw-fetch/src/browser.rs @@ -1,6 +1,5 @@ /// Browser fingerprint selection and rotation. -/// Maps our simple `BrowserProfile` enum to primp's impersonation profiles. -use primp::{Impersonate, ImpersonateOS}; +/// Maps our BrowserProfile enum to webclaw-http client builder methods. /// Which browser identity to present at the TLS/HTTP layer. #[derive(Debug, Clone, Default)] @@ -12,85 +11,41 @@ pub enum BrowserProfile { Random, } -/// A complete impersonation profile: browser + OS. -#[derive(Debug, Clone)] -pub struct ImpersonateProfile { - pub browser: Impersonate, - pub os: ImpersonateOS, +/// A browser variant for building webclaw-http clients. +#[derive(Debug, Clone, Copy)] +pub enum BrowserVariant { + Chrome, + ChromeMacos, + Firefox, + Safari, + Edge, } -/// All Chrome profiles we ship, newest first. -pub fn chrome_profiles() -> Vec { +/// All Chrome variants we ship. +pub fn chrome_variants() -> Vec { + vec![BrowserVariant::Chrome, BrowserVariant::ChromeMacos] +} + +/// All Firefox variants we ship. +pub fn firefox_variants() -> Vec { + vec![BrowserVariant::Firefox] +} + +/// All variants for maximum diversity in Random mode. 
+pub fn all_variants() -> Vec { vec![ - ImpersonateProfile { - browser: Impersonate::ChromeV145, - os: ImpersonateOS::Windows, - }, - ImpersonateProfile { - browser: Impersonate::ChromeV145, - os: ImpersonateOS::MacOS, - }, - ImpersonateProfile { - browser: Impersonate::ChromeV144, - os: ImpersonateOS::Windows, - }, - ImpersonateProfile { - browser: Impersonate::ChromeV144, - os: ImpersonateOS::Linux, - }, + BrowserVariant::Chrome, + BrowserVariant::ChromeMacos, + BrowserVariant::Firefox, + BrowserVariant::Safari, + BrowserVariant::Edge, ] } -/// All Firefox profiles we ship, newest first. -pub fn firefox_profiles() -> Vec { - vec![ - ImpersonateProfile { - browser: Impersonate::FirefoxV146, - os: ImpersonateOS::Windows, - }, - ImpersonateProfile { - browser: Impersonate::FirefoxV146, - os: ImpersonateOS::Linux, - }, - ImpersonateProfile { - browser: Impersonate::FirefoxV140, - os: ImpersonateOS::Windows, - }, - ] +pub fn latest_chrome() -> BrowserVariant { + BrowserVariant::Chrome } -/// Safari + Edge + Opera profiles for maximum diversity in Random mode. 
-pub fn extra_profiles() -> Vec { - vec![ - ImpersonateProfile { - browser: Impersonate::SafariV18_5, - os: ImpersonateOS::MacOS, - }, - ImpersonateProfile { - browser: Impersonate::SafariV26, - os: ImpersonateOS::MacOS, - }, - ImpersonateProfile { - browser: Impersonate::EdgeV145, - os: ImpersonateOS::Windows, - }, - ImpersonateProfile { - browser: Impersonate::OperaV127, - os: ImpersonateOS::Windows, - }, - ] -} - -pub fn latest_chrome() -> ImpersonateProfile { - ImpersonateProfile { - browser: Impersonate::SafariV26, - os: ImpersonateOS::MacOS, - } -} - -pub fn latest_firefox() -> ImpersonateProfile { - ImpersonateProfile { - browser: Impersonate::FirefoxV146, - os: ImpersonateOS::Windows, - } +pub fn latest_firefox() -> BrowserVariant { + BrowserVariant::Firefox } diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 5b8526e..ec28d5c 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -1,6 +1,6 @@ /// HTTP client with browser TLS fingerprint impersonation. -/// Wraps primp to provide a simple fetch interface with optional -/// content extraction via webclaw-core. Supports single and batch operations. +/// Uses webclaw-http for browser-grade TLS + HTTP/2 fingerprinting. +/// Supports single and batch operations with proxy rotation. /// Automatically detects PDF responses and extracts text via webclaw-pdf. /// /// Two proxy modes: @@ -17,7 +17,7 @@ use tokio::sync::Semaphore; use tracing::{debug, instrument, warn}; use webclaw_pdf::PdfMode; -use crate::browser::{self, BrowserProfile, ImpersonateProfile}; +use crate::browser::{self, BrowserProfile, BrowserVariant}; use crate::error::FetchError; /// Configuration for building a [`FetchClient`]. @@ -83,20 +83,22 @@ enum ClientPool { /// Pre-built clients with a fixed proxy (or no proxy). /// Fingerprint rotation still works via the pool when `random` is true. 
Static { - clients: Vec, + clients: Vec, random: bool, }, /// Pre-built pool of clients, each with a different proxy + fingerprint. /// Requests pick a client deterministically by host for HTTP/2 connection reuse. - Rotating { clients: Vec }, + Rotating { + clients: Vec, + }, } -/// HTTP client that impersonates browser TLS fingerprints via primp. +/// HTTP client with browser TLS + HTTP/2 fingerprinting via webclaw-http. /// /// Operates in two modes: -/// - **Static pool**: pre-built primp clients, optionally with fingerprint rotation. +/// - **Static pool**: pre-built clients, optionally with fingerprint rotation. /// Used when no `proxy_pool` is configured. Fast (no per-request construction). -/// - **Rotating pool**: pre-built primp clients, one per proxy in the pool. +/// - **Rotating pool**: pre-built clients, one per proxy in the pool. /// Same-host URLs are routed to the same client for HTTP/2 multiplexing. pub struct FetchClient { pool: ClientPool, @@ -106,20 +108,20 @@ pub struct FetchClient { impl FetchClient { /// Build a new client from config. /// - /// When `config.proxy_pool` is non-empty, pre-builds one primp client per proxy, + /// When `config.proxy_pool` is non-empty, pre-builds one client per proxy, /// each with a randomly assigned fingerprint. Same-host URLs get routed to the /// same client for HTTP/2 connection reuse. /// - /// When `proxy_pool` is empty, pre-builds primp clients at construction time + /// When `proxy_pool` is empty, pre-builds clients at construction time /// (one per fingerprint for `Random` profiles, one for fixed profiles). 
pub fn new(config: FetchConfig) -> Result { - let profiles = collect_profiles(&config.browser); + let variants = collect_variants(&config.browser); let pdf_mode = config.pdf_mode.clone(); let pool = if config.proxy_pool.is_empty() { - let clients = profiles + let clients = variants .into_iter() - .map(|p| build_primp_client(&config, &p, config.proxy.as_deref())) + .map(|v| build_client(&config, v, config.proxy.as_deref())) .collect::, _>>()?; let random = matches!(config.browser, BrowserProfile::Random); @@ -136,14 +138,13 @@ impl FetchClient { .proxy_pool .iter() .map(|proxy| { - let p = profiles.choose(&mut rng).unwrap().clone(); - build_primp_client(&config, &p, Some(proxy)) + let v = *variants.choose(&mut rng).unwrap(); + build_client(&config, v, Some(proxy)) }) .collect::, _>>()?; debug!( clients = clients.len(), - profiles = profiles.len(), "fetch client ready (pre-built rotating pool)" ); @@ -206,91 +207,13 @@ impl FetchClient { Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into()))) } - /// Single fetch attempt with automatic plain-client fallback. - /// - /// If the TLS-impersonated client fails with a connection error or gets a 403, - /// retries with a plain client (no impersonation). Some sites (e.g. ycombinator.com) - /// reject forged TLS fingerprints but accept default rustls connections. + /// Single fetch attempt. Uses the TLS-impersonated client from the pool. 
async fn fetch_once(&self, url: &str) -> Result { let start = Instant::now(); + let client = self.pick_client(url); - let client = match &self.pool { - ClientPool::Static { clients, random } => { - if *random { - let host = extract_host(url); - pick_for_host(clients, &host) - } else { - &clients[0] - } - } - ClientPool::Rotating { clients } => pick_random(clients), - }; - - // Try impersonated client first - let needs_plain_fallback = match client.get(url).send().await { - Ok(response) => { - let status = response.status().as_u16(); - if status == 403 { - debug!(url, "impersonated client got 403, trying plain fallback"); - true - } else { - return Self::response_to_result(response, start).await; - } - } - Err(_e) => { - debug!( - url, - "impersonated client connection failed, trying plain fallback" - ); - true - } - }; - - // Plain client fallback (no TLS impersonation) - if needs_plain_fallback { - let plain = primp::Client::builder() - .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36") - .cookie_store(true) - .timeout(Duration::from_secs(30)) - .build() - .map_err(|e| FetchError::Build(format!("plain client: {e}")))?; - - let response = plain.get(url).send().await?; - return Self::response_to_result(response, start).await; - } - - unreachable!() - } - - /// Convert a primp Response into a FetchResult. 
- async fn response_to_result( - response: primp::Response, - start: Instant, - ) -> Result { - let status = response.status().as_u16(); - let final_url = response.url().to_string(); - - let headers: HashMap = response - .headers() - .iter() - .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string())) - .collect(); - - let html = response - .text() - .await - .map_err(|e| FetchError::BodyDecode(e.to_string()))?; - - let elapsed = start.elapsed(); - debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete"); - - Ok(FetchResult { - html, - status, - url: final_url, - headers, - elapsed, - }) + let response = client.get(url).await?; + response_to_result(response, start) } /// Fetch a URL then extract structured content. @@ -307,10 +230,6 @@ impl FetchClient { } /// Fetch a URL then extract structured content with custom extraction options. - /// - /// Same as [`fetch_and_extract`] but accepts `ExtractionOptions` for CSS selector - /// filtering, main-content-only mode, etc. Options only apply to HTML responses; - /// PDF extraction ignores them (no DOM to filter). #[instrument(skip(self, options), fields(url = %url))] pub async fn fetch_and_extract_with_options( &self, @@ -318,24 +237,15 @@ impl FetchClient { options: &webclaw_core::ExtractionOptions, ) -> Result { // Reddit fallback: use their JSON API to get post + full comment tree. - // Uses a plain reqwest client — Reddit's JSON endpoint blocks TLS-fingerprinted clients - // but accepts standard requests with a browser User-Agent. 
if crate::reddit::is_reddit_url(url) { let json_url = crate::reddit::json_url(url); debug!("reddit detected, fetching {json_url}"); - let plain = primp::Client::builder() - .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36") - .timeout(std::time::Duration::from_secs(15)) - .build() - .map_err(|e| FetchError::Build(format!("reddit client: {e}")))?; - let response = plain.get(&json_url).send().await?; - if response.status().is_success() { - let bytes = response - .bytes() - .await - .map_err(|e| FetchError::BodyDecode(e.to_string()))?; - match crate::reddit::parse_reddit_json(&bytes, url) { + let client = self.pick_client(url); + let response = client.get(&json_url).await?; + if response.is_success() { + let bytes = response.body(); + match crate::reddit::parse_reddit_json(bytes, url) { Ok(result) => return Ok(result), Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"), } @@ -344,50 +254,19 @@ impl FetchClient { let start = Instant::now(); let client = self.pick_client(url); + let response = client.get(url).await?; - // Try impersonated client, fall back to plain on connection error or 403 - let response = match client.get(url).send().await { - Ok(resp) if resp.status().as_u16() == 403 => { - debug!(url, "impersonated client got 403, trying plain fallback"); - let plain = primp::Client::builder() - .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36") - .cookie_store(true) - .timeout(Duration::from_secs(30)) - .build() - .map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?; - plain.get(url).send().await? 
- } - Ok(resp) => resp, - Err(_e) => { - debug!(url, "impersonated client failed, trying plain fallback"); - let plain = primp::Client::builder() - .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36") - .cookie_store(true) - .timeout(Duration::from_secs(30)) - .build() - .map_err(|e| FetchError::Build(format!("plain fallback: {e}")))?; - plain.get(url).send().await? - } - }; - - let status = response.status().as_u16(); + let status = response.status(); let final_url = response.url().to_string(); - let headers: HashMap = response - .headers() - .iter() - .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string())) - .collect(); + let headers: HashMap = response.headers().clone(); let is_pdf = is_pdf_content_type(&headers); if is_pdf { debug!(status, "detected PDF response, using pdf extraction"); - let bytes = response - .bytes() - .await - .map_err(|e| FetchError::BodyDecode(e.to_string()))?; + let bytes = response.body(); let elapsed = start.elapsed(); debug!( @@ -397,17 +276,14 @@ impl FetchClient { "PDF fetch complete" ); - let pdf_result = webclaw_pdf::extract_pdf(&bytes, self.pdf_mode.clone())?; + let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?; Ok(pdf_to_extraction_result(&pdf_result, &final_url)) } else if let Some(doc_type) = crate::document::is_document_content_type(&headers, &final_url) { debug!(status, doc_type = ?doc_type, "detected document response, extracting"); - let bytes = response - .bytes() - .await - .map_err(|e| FetchError::BodyDecode(e.to_string()))?; + let bytes = response.body(); let elapsed = start.elapsed(); debug!( @@ -417,14 +293,11 @@ impl FetchClient { "document fetch complete" ); - let mut result = crate::document::extract_document(&bytes, doc_type)?; + let mut result = crate::document::extract_document(bytes, doc_type)?; result.metadata.url = Some(final_url); Ok(result) } else { - let html = response - .text() - .await - 
.map_err(|e| FetchError::BodyDecode(e.to_string()))?; + let html = response.text().into_owned(); let elapsed = start.elapsed(); debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete"); @@ -440,21 +313,11 @@ impl FetchClient { let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?; - // YouTube transcript: caption URLs are IP-signed and expire immediately, - // so the timedtext endpoint returns empty responses. The innertube - // get_transcript API requires cookies/consent. Transcript extraction - // will be enabled via the cloud API (JS rendering + cookie jar). - // The extraction functions exist in webclaw_core::youtube but are not - // wired up here until we have a reliable fetch path. - Ok(extraction) } } /// Fetch multiple URLs concurrently with bounded parallelism. - /// - /// Spawns one task per URL, bounded by a semaphore. Results are returned - /// in the same order as the input URLs, regardless of completion order. pub async fn fetch_batch( self: &Arc, urls: &[&str], @@ -479,9 +342,6 @@ impl FetchClient { } /// Fetch and extract multiple URLs concurrently with bounded parallelism. - /// - /// Same semantics as [`fetch_batch`] but runs extraction on each response. - /// Results preserve input URL order. pub async fn fetch_and_extract_batch( self: &Arc, urls: &[&str], @@ -496,9 +356,6 @@ impl FetchClient { } /// Fetch and extract multiple URLs concurrently with custom extraction options. - /// - /// Same as [`fetch_and_extract_batch`] but applies the given options - /// (include/exclude selectors, only-main-content, etc.) to each extraction. pub async fn fetch_and_extract_batch_with_options( self: &Arc, urls: &[&str], @@ -533,7 +390,7 @@ impl FetchClient { } /// Pick a client from the pool for a given URL. 
- fn pick_client(&self, url: &str) -> &primp::Client { + fn pick_client(&self, url: &str) -> &webclaw_http::Client { match &self.pool { ClientPool::Static { clients, random } => { if *random { @@ -548,21 +405,37 @@ impl FetchClient { } } -/// Collect the impersonation profiles to use based on the browser profile. -fn collect_profiles(profile: &BrowserProfile) -> Vec { +/// Collect the browser variants to use based on the browser profile. +fn collect_variants(profile: &BrowserProfile) -> Vec { match profile { - BrowserProfile::Random => { - let mut profiles = Vec::new(); - profiles.extend(browser::chrome_profiles()); - profiles.extend(browser::firefox_profiles()); - profiles.extend(browser::extra_profiles()); - profiles - } + BrowserProfile::Random => browser::all_variants(), BrowserProfile::Chrome => vec![browser::latest_chrome()], BrowserProfile::Firefox => vec![browser::latest_firefox()], } } +/// Convert a webclaw-http Response into a FetchResult. +fn response_to_result( + response: webclaw_http::Response, + start: Instant, +) -> Result { + let status = response.status(); + let final_url = response.url().to_string(); + let headers = response.headers().clone(); + let html = response.into_text(); + let elapsed = start.elapsed(); + + debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete"); + + Ok(FetchResult { + html, + status, + url: final_url, + headers, + elapsed, + }) +} + /// Extract the host from a URL, returning empty string on parse failure. fn extract_host(url: &str) -> String { url::Url::parse(url) @@ -573,7 +446,10 @@ fn extract_host(url: &str) -> String { /// Pick a client deterministically based on a host string. /// Same host always gets the same client, enabling HTTP/2 connection reuse. 
-fn pick_for_host<'a>(clients: &'a [primp::Client], host: &str) -> &'a primp::Client { +fn pick_for_host<'a>( + clients: &'a [webclaw_http::Client], + host: &str, +) -> &'a webclaw_http::Client { let mut hasher = std::collections::hash_map::DefaultHasher::new(); host.hash(&mut hasher); let idx = (hasher.finish() as usize) % clients.len(); @@ -581,12 +457,41 @@ fn pick_for_host<'a>(clients: &'a [primp::Client], host: &str) -> &'a primp::Cli } /// Pick a random client from the pool for per-request rotation. -fn pick_random(clients: &[primp::Client]) -> &primp::Client { +fn pick_random(clients: &[webclaw_http::Client]) -> &webclaw_http::Client { use rand::Rng; let idx = rand::thread_rng().gen_range(0..clients.len()); &clients[idx] } +/// Build a webclaw-http Client from config + browser variant + optional proxy. +fn build_client( + config: &FetchConfig, + variant: BrowserVariant, + proxy: Option<&str>, +) -> Result { + let mut builder = match variant { + BrowserVariant::Chrome => webclaw_http::Client::builder().chrome(), + BrowserVariant::ChromeMacos => webclaw_http::Client::builder().chrome_macos(), + BrowserVariant::Firefox => webclaw_http::Client::builder().firefox(), + BrowserVariant::Safari => webclaw_http::Client::builder().safari(), + BrowserVariant::Edge => webclaw_http::Client::builder().edge(), + }; + + builder = builder.timeout(config.timeout); + + for (k, v) in &config.headers { + builder = builder.default_header(k, v); + } + + if let Some(proxy_url) = proxy { + builder = builder + .proxy(proxy_url) + .map_err(|e| FetchError::Build(format!("proxy: {e}")))?; + } + + builder.build().map_err(|e| FetchError::Build(e.to_string())) +} + /// Status codes worth retrying: server errors + rate limiting. fn is_retryable_status(status: u16) -> bool { status == 429 @@ -670,46 +575,6 @@ async fn collect_ordered( slots.into_iter().flatten().collect() } -/// Build a single primp Client from config + impersonation profile + optional proxy. 
-fn build_primp_client( - config: &FetchConfig, - profile: &ImpersonateProfile, - proxy: Option<&str>, -) -> Result { - let redirect_policy = if config.follow_redirects { - primp::redirect::Policy::limited(config.max_redirects as usize) - } else { - primp::redirect::Policy::none() - }; - - let mut headers = primp::header::HeaderMap::new(); - for (k, v) in &config.headers { - if let (Ok(name), Ok(val)) = ( - primp::header::HeaderName::from_bytes(k.as_bytes()), - primp::header::HeaderValue::from_str(v), - ) { - headers.insert(name, val); - } - } - - let mut builder = primp::Client::builder() - .impersonate(profile.browser) - .impersonate_os(profile.os) - .cookie_store(true) - .timeout(config.timeout) - .redirect(redirect_policy) - .default_headers(headers); - - if let Some(proxy_url) = proxy { - builder = builder - .proxy(primp::Proxy::all(proxy_url).map_err(|e| FetchError::Build(e.to_string()))?); - } - - builder - .build() - .map_err(|e| FetchError::Build(e.to_string())) -} - #[cfg(test)] mod tests { use super::*; diff --git a/crates/webclaw-fetch/src/error.rs b/crates/webclaw-fetch/src/error.rs index e0de33c..fe102d3 100644 --- a/crates/webclaw-fetch/src/error.rs +++ b/crates/webclaw-fetch/src/error.rs @@ -1,11 +1,11 @@ -/// Fetch-layer errors. Wraps primp/network failures into a single type +/// Fetch-layer errors. Wraps HTTP/network failures into a single type /// that callers can match on without leaking transport details. use thiserror::Error; #[derive(Debug, Error)] pub enum FetchError { #[error("request failed: {0}")] - Request(#[from] primp::Error), + Request(#[from] webclaw_http::Error), #[error("invalid url: {0}")] InvalidUrl(String), diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index 373eb8a..c0b6b29 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -1,15 +1,14 @@ -/// webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation. 
-/// Uses Impit under the hood to make requests that look like real -/// browsers at the TLS, HTTP/2, and header levels. -/// Automatically detects PDF responses and delegates to webclaw-pdf. +//! webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation. +//! Uses webclaw-http for browser-grade TLS + HTTP/2 fingerprinting. +//! Automatically detects PDF responses and delegates to webclaw-pdf. pub mod browser; pub mod client; pub mod crawler; pub mod document; pub mod error; -pub mod linkedin; +pub(crate) mod linkedin; pub mod proxy; -pub mod reddit; +pub(crate) mod reddit; pub mod sitemap; pub use browser::BrowserProfile;