mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
feat: replace custom TLS stack with wreq (BoringSSL), bump v0.3.3
Migrated webclaw-fetch from webclaw-tls (patched rustls/h2/hyper/reqwest) to wreq by @0x676e67. wreq uses BoringSSL for TLS and the http2 crate for HTTP/2 fingerprinting — battle-tested with 60+ browser profiles. This removes all 5 [patch.crates-io] entries that consumers previously needed. Browser profiles (Chrome 145, Firefox 135, Safari 18, Edge 145) are now built directly on wreq's Emulation API with correct TLS options, HTTP/2 SETTINGS ordering, pseudo-header order, and header wire order. 84% pass rate across 1000 real sites. 384 unit tests green. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0d0da265ab
commit
aaf51eddef
10 changed files with 754 additions and 343 deletions
|
|
@ -1,5 +1,5 @@
|
|||
/// HTTP client with browser TLS fingerprint impersonation.
|
||||
/// Uses webclaw-http for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
/// Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
/// Supports single and batch operations with proxy rotation.
|
||||
/// Automatically detects PDF responses and extracts text via webclaw-pdf.
|
||||
///
|
||||
|
|
@ -60,7 +60,7 @@ pub struct FetchResult {
|
|||
pub status: u16,
|
||||
/// Final URL after any redirects.
|
||||
pub url: String,
|
||||
pub headers: webclaw_http::HeaderMap,
|
||||
pub headers: http::HeaderMap,
|
||||
pub elapsed: Duration,
|
||||
}
|
||||
|
||||
|
|
@ -78,20 +78,54 @@ pub struct BatchExtractResult {
|
|||
pub result: Result<webclaw_core::ExtractionResult, FetchError>,
|
||||
}
|
||||
|
||||
/// Buffered response that owns its body. Provides the same sync API
|
||||
/// that webclaw-http::Response used to provide.
|
||||
struct Response {
|
||||
status: u16,
|
||||
url: String,
|
||||
headers: http::HeaderMap,
|
||||
body: bytes::Bytes,
|
||||
}
|
||||
|
||||
impl Response {
|
||||
/// Buffer a wreq response into an owned Response.
|
||||
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
|
||||
let status = resp.status().as_u16();
|
||||
let url = resp.uri().to_string();
|
||||
let headers = resp.headers().clone();
|
||||
let body = resp.bytes().await.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
|
||||
Ok(Self { status, url, headers, body })
|
||||
}
|
||||
|
||||
fn status(&self) -> u16 { self.status }
|
||||
fn url(&self) -> &str { &self.url }
|
||||
fn headers(&self) -> &http::HeaderMap { &self.headers }
|
||||
fn body(&self) -> &[u8] { &self.body }
|
||||
fn is_success(&self) -> bool { (200..300).contains(&self.status) }
|
||||
|
||||
fn text(&self) -> std::borrow::Cow<'_, str> {
|
||||
String::from_utf8_lossy(&self.body)
|
||||
}
|
||||
|
||||
fn into_text(self) -> String {
|
||||
String::from_utf8_lossy(&self.body).into_owned()
|
||||
}
|
||||
}
|
||||
|
||||
/// Internal representation of the client pool strategy.
|
||||
enum ClientPool {
|
||||
/// Pre-built clients with a fixed proxy (or no proxy).
|
||||
/// Fingerprint rotation still works via the pool when `random` is true.
|
||||
Static {
|
||||
clients: Vec<webclaw_http::Client>,
|
||||
clients: Vec<wreq::Client>,
|
||||
random: bool,
|
||||
},
|
||||
/// Pre-built pool of clients, each with a different proxy + fingerprint.
|
||||
/// Requests pick a client deterministically by host for HTTP/2 connection reuse.
|
||||
Rotating { clients: Vec<webclaw_http::Client> },
|
||||
Rotating { clients: Vec<wreq::Client> },
|
||||
}
|
||||
|
||||
/// HTTP client with browser TLS + HTTP/2 fingerprinting via webclaw-http.
|
||||
/// HTTP client with browser TLS + HTTP/2 fingerprinting via wreq.
|
||||
///
|
||||
/// Operates in two modes:
|
||||
/// - **Static pool**: pre-built clients, optionally with fingerprint rotation.
|
||||
|
|
@ -105,13 +139,6 @@ pub struct FetchClient {
|
|||
|
||||
impl FetchClient {
|
||||
/// Build a new client from config.
|
||||
///
|
||||
/// When `config.proxy_pool` is non-empty, pre-builds one client per proxy,
|
||||
/// each with a randomly assigned fingerprint. Same-host URLs get routed to the
|
||||
/// same client for HTTP/2 connection reuse.
|
||||
///
|
||||
/// When `proxy_pool` is empty, pre-builds clients at construction time
|
||||
/// (one per fingerprint for `Random` profiles, one for fixed profiles).
|
||||
pub fn new(config: FetchConfig) -> Result<Self, FetchError> {
|
||||
let variants = collect_variants(&config.browser);
|
||||
let pdf_mode = config.pdf_mode.clone();
|
||||
|
|
@ -119,7 +146,9 @@ impl FetchClient {
|
|||
let pool = if config.proxy_pool.is_empty() {
|
||||
let clients = variants
|
||||
.into_iter()
|
||||
.map(|v| build_client(&config, v, config.proxy.as_deref()))
|
||||
.map(|v| {
|
||||
crate::tls::build_client(v, config.timeout, &config.headers, config.proxy.as_deref())
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
let random = matches!(config.browser, BrowserProfile::Random);
|
||||
|
|
@ -137,7 +166,7 @@ impl FetchClient {
|
|||
.iter()
|
||||
.map(|proxy| {
|
||||
let v = *variants.choose(&mut rng).unwrap();
|
||||
build_client(&config, v, Some(proxy))
|
||||
crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
|
|
@ -205,19 +234,17 @@ impl FetchClient {
|
|||
Err(last_err.unwrap_or_else(|| FetchError::Build("all retries exhausted".into())))
|
||||
}
|
||||
|
||||
/// Single fetch attempt. Uses the TLS-impersonated client from the pool.
|
||||
/// Single fetch attempt.
|
||||
async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
|
||||
let response = client.get(url).await?;
|
||||
let resp = client.get(url).send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
response_to_result(response, start)
|
||||
}
|
||||
|
||||
/// Fetch a URL then extract structured content.
|
||||
///
|
||||
/// Automatically detects PDF responses via Content-Type header and routes
|
||||
/// to webclaw-pdf for text extraction. HTML responses go through webclaw-core.
|
||||
#[instrument(skip(self), fields(url = %url))]
|
||||
pub async fn fetch_and_extract(
|
||||
&self,
|
||||
|
|
@ -240,7 +267,8 @@ impl FetchClient {
|
|||
debug!("reddit detected, fetching {json_url}");
|
||||
|
||||
let client = self.pick_client(url);
|
||||
let response = client.get(&json_url).await?;
|
||||
let resp = client.get(&json_url).send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
if response.is_success() {
|
||||
let bytes = response.body();
|
||||
match crate::reddit::parse_reddit_json(bytes, url) {
|
||||
|
|
@ -252,7 +280,8 @@ impl FetchClient {
|
|||
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
let mut response = client.get(url).await?;
|
||||
let resp = client.get(url).send().await?;
|
||||
let mut response = Response::from_wreq(resp).await?;
|
||||
|
||||
// Cookie warmup: if we get a challenge page, visit the homepage first
|
||||
// to collect Akamai cookies (_abck, bm_sz, etc.), then retry.
|
||||
|
|
@ -260,8 +289,9 @@ impl FetchClient {
|
|||
&& let Some(homepage) = extract_homepage(url)
|
||||
{
|
||||
debug!("challenge detected, warming cookies via {homepage}");
|
||||
let _ = client.get(&homepage).await;
|
||||
response = client.get(url).await?;
|
||||
let _ = client.get(&homepage).send().await;
|
||||
let resp = client.get(url).send().await?;
|
||||
response = Response::from_wreq(resp).await?;
|
||||
debug!("retried after cookie warmup: status={}", response.status());
|
||||
}
|
||||
|
||||
|
|
@ -306,7 +336,7 @@ impl FetchClient {
|
|||
result.metadata.url = Some(final_url);
|
||||
Ok(result)
|
||||
} else {
|
||||
let html = response.text().into_owned();
|
||||
let html = response.into_text();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
|
||||
|
|
@ -399,7 +429,7 @@ impl FetchClient {
|
|||
}
|
||||
|
||||
/// Pick a client from the pool for a given URL.
|
||||
fn pick_client(&self, url: &str) -> &webclaw_http::Client {
|
||||
fn pick_client(&self, url: &str) -> &wreq::Client {
|
||||
match &self.pool {
|
||||
ClientPool::Static { clients, random } => {
|
||||
if *random {
|
||||
|
|
@ -423,9 +453,9 @@ fn collect_variants(profile: &BrowserProfile) -> Vec<BrowserVariant> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Convert a webclaw-http Response into a FetchResult.
|
||||
/// Convert a buffered Response into a FetchResult.
|
||||
fn response_to_result(
|
||||
response: webclaw_http::Response,
|
||||
response: Response,
|
||||
start: Instant,
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let status = response.status();
|
||||
|
|
@ -455,7 +485,7 @@ fn extract_host(url: &str) -> String {
|
|||
|
||||
/// Pick a client deterministically based on a host string.
|
||||
/// Same host always gets the same client, enabling HTTP/2 connection reuse.
|
||||
fn pick_for_host<'a>(clients: &'a [webclaw_http::Client], host: &str) -> &'a webclaw_http::Client {
|
||||
fn pick_for_host<'a>(clients: &'a [wreq::Client], host: &str) -> &'a wreq::Client {
|
||||
let mut hasher = std::collections::hash_map::DefaultHasher::new();
|
||||
host.hash(&mut hasher);
|
||||
let idx = (hasher.finish() as usize) % clients.len();
|
||||
|
|
@ -463,43 +493,12 @@ fn pick_for_host<'a>(clients: &'a [webclaw_http::Client], host: &str) -> &'a web
|
|||
}
|
||||
|
||||
/// Pick a random client from the pool for per-request rotation.
|
||||
fn pick_random(clients: &[webclaw_http::Client]) -> &webclaw_http::Client {
|
||||
fn pick_random(clients: &[wreq::Client]) -> &wreq::Client {
|
||||
use rand::Rng;
|
||||
let idx = rand::thread_rng().gen_range(0..clients.len());
|
||||
&clients[idx]
|
||||
}
|
||||
|
||||
/// Build a webclaw-http Client from config + browser variant + optional proxy.
|
||||
fn build_client(
|
||||
config: &FetchConfig,
|
||||
variant: BrowserVariant,
|
||||
proxy: Option<&str>,
|
||||
) -> Result<webclaw_http::Client, FetchError> {
|
||||
let mut builder = match variant {
|
||||
BrowserVariant::Chrome => webclaw_http::Client::builder().chrome(),
|
||||
BrowserVariant::ChromeMacos => webclaw_http::Client::builder().chrome_macos(),
|
||||
BrowserVariant::Firefox => webclaw_http::Client::builder().firefox(),
|
||||
BrowserVariant::Safari => webclaw_http::Client::builder().safari(),
|
||||
BrowserVariant::Edge => webclaw_http::Client::builder().edge(),
|
||||
};
|
||||
|
||||
builder = builder.timeout(config.timeout);
|
||||
|
||||
for (k, v) in &config.headers {
|
||||
builder = builder.default_header(k, v);
|
||||
}
|
||||
|
||||
if let Some(proxy_url) = proxy {
|
||||
builder = builder
|
||||
.proxy(proxy_url)
|
||||
.map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
|
||||
}
|
||||
|
||||
builder
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(e.to_string()))
|
||||
}
|
||||
|
||||
/// Status codes worth retrying: server errors + rate limiting.
|
||||
fn is_retryable_status(status: u16) -> bool {
|
||||
status == 429
|
||||
|
|
@ -518,7 +517,7 @@ fn is_retryable_error(err: &FetchError) -> bool {
|
|||
matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_))
|
||||
}
|
||||
|
||||
fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool {
|
||||
fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
|
||||
headers
|
||||
.get("content-type")
|
||||
.and_then(|ct| ct.to_str().ok())
|
||||
|
|
@ -530,9 +529,7 @@ fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool {
|
|||
}
|
||||
|
||||
/// Detect if a response looks like a bot protection challenge page.
|
||||
/// Checks for small HTML pages with known challenge markers.
|
||||
fn is_challenge_response(response: &webclaw_http::Response) -> bool {
|
||||
// Only check small HTML responses — real pages are typically >10KB
|
||||
fn is_challenge_response(response: &Response) -> bool {
|
||||
let len = response.body().len();
|
||||
if len > 15_000 || len == 0 {
|
||||
return false;
|
||||
|
|
@ -541,12 +538,10 @@ fn is_challenge_response(response: &webclaw_http::Response) -> bool {
|
|||
let text = response.text();
|
||||
let lower = text.to_lowercase();
|
||||
|
||||
// Akamai Bot Manager challenge
|
||||
if lower.contains("<title>challenge page</title>") {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Akamai sensor script on tiny page
|
||||
if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
|
||||
return true;
|
||||
}
|
||||
|
|
@ -628,7 +623,7 @@ mod tests {
|
|||
html: "<html></html>".to_string(),
|
||||
status: 200,
|
||||
url: "https://example.com".to_string(),
|
||||
headers: webclaw_http::HeaderMap::new(),
|
||||
headers: http::HeaderMap::new(),
|
||||
elapsed: Duration::from_millis(42),
|
||||
}),
|
||||
};
|
||||
|
|
@ -680,7 +675,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_is_pdf_content_type() {
|
||||
let mut headers = webclaw_http::HeaderMap::new();
|
||||
let mut headers = http::HeaderMap::new();
|
||||
headers.insert("content-type", "application/pdf".parse().unwrap());
|
||||
assert!(is_pdf_content_type(&headers));
|
||||
|
||||
|
|
@ -696,7 +691,7 @@ mod tests {
|
|||
headers.insert("content-type", "text/html".parse().unwrap());
|
||||
assert!(!is_pdf_content_type(&headers));
|
||||
|
||||
let empty = webclaw_http::HeaderMap::new();
|
||||
let empty = http::HeaderMap::new();
|
||||
assert!(!is_pdf_content_type(&empty));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ impl std::fmt::Display for DocType {
|
|||
|
||||
/// Detect document type from response headers or URL extension.
|
||||
/// Returns `None` for non-document responses (HTML, PDF, etc.).
|
||||
pub fn is_document_content_type(headers: &webclaw_http::HeaderMap, url: &str) -> Option<DocType> {
|
||||
pub fn is_document_content_type(headers: &http::HeaderMap, url: &str) -> Option<DocType> {
|
||||
// Check Content-Type header first
|
||||
if let Some(ct) = headers.get("content-type").and_then(|v| v.to_str().ok()) {
|
||||
let mime = ct.split(';').next().unwrap_or("").trim();
|
||||
|
|
@ -474,7 +474,7 @@ fn strip_markdown_formatting(markdown: &str) -> String {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use webclaw_http::HeaderMap;
|
||||
use http::HeaderMap;
|
||||
|
||||
fn headers_with(name: &str, value: &str) -> HeaderMap {
|
||||
let mut h = HeaderMap::new();
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ use thiserror::Error;
|
|||
#[derive(Debug, Error)]
|
||||
pub enum FetchError {
|
||||
#[error("request failed: {0}")]
|
||||
Request(#[from] webclaw_http::Error),
|
||||
Request(#[from] wreq::Error),
|
||||
|
||||
#[error("invalid url: {0}")]
|
||||
InvalidUrl(String),
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
//! webclaw-fetch: HTTP client layer with browser TLS fingerprint impersonation.
|
||||
//! Uses webclaw-http for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
//! Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting.
|
||||
//! Automatically detects PDF responses and delegates to webclaw-pdf.
|
||||
pub mod browser;
|
||||
pub mod client;
|
||||
|
|
@ -10,6 +10,7 @@ pub mod linkedin;
|
|||
pub mod proxy;
|
||||
pub mod reddit;
|
||||
pub mod sitemap;
|
||||
pub mod tls;
|
||||
|
||||
pub use browser::BrowserProfile;
|
||||
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
|
||||
|
|
@ -17,5 +18,5 @@ pub use crawler::{CrawlConfig, CrawlResult, CrawlState, Crawler, PageResult};
|
|||
pub use error::FetchError;
|
||||
pub use proxy::{parse_proxy_file, parse_proxy_line};
|
||||
pub use sitemap::SitemapEntry;
|
||||
pub use webclaw_http::HeaderMap;
|
||||
pub use http::HeaderMap;
|
||||
pub use webclaw_pdf::PdfMode;
|
||||
|
|
|
|||
372
crates/webclaw-fetch/src/tls.rs
Normal file
372
crates/webclaw-fetch/src/tls.rs
Normal file
|
|
@ -0,0 +1,372 @@
|
|||
//! Browser TLS + HTTP/2 fingerprint profiles built on wreq (BoringSSL).
|
||||
//!
|
||||
//! Replaces the old webclaw-http/webclaw-tls patched rustls stack.
|
||||
//! Each profile configures TLS options (cipher suites, curves, extensions,
|
||||
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
|
||||
//! stream dependency, priorities) to match real browser fingerprints.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use wreq::http2::{
|
||||
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
|
||||
};
|
||||
use wreq::tls::{AlpsProtocol, CertificateCompressionAlgorithm, TlsOptions, TlsVersion};
|
||||
use wreq::{Client, Emulation};
|
||||
|
||||
use crate::browser::BrowserVariant;
|
||||
use crate::error::FetchError;
|
||||
|
||||
/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
|
||||
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
|
||||
|
||||
/// Chrome signature algorithms.
|
||||
const CHROME_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:rsa_pkcs1_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha512";
|
||||
|
||||
/// Chrome curves (post-quantum ML-KEM + X25519 + P-256 + P-384).
|
||||
const CHROME_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384";
|
||||
|
||||
/// Firefox cipher list.
|
||||
const FIREFOX_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_CHACHA20_POLY1305_SHA256:TLS_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
|
||||
|
||||
/// Firefox signature algorithms.
|
||||
const FIREFOX_SIGALGS: &str = "ecdsa_secp256r1_sha256:ecdsa_secp384r1_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha256:rsa_pss_rsae_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha256:rsa_pkcs1_sha384:rsa_pkcs1_sha512:ecdsa_sha1:rsa_pkcs1_sha1";
|
||||
|
||||
/// Firefox curves.
|
||||
const FIREFOX_CURVES: &str = "X25519MLKEM768:X25519:P-256:P-384:P-521";
|
||||
|
||||
/// Safari cipher list.
|
||||
const SAFARI_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_CBC_SHA";
|
||||
|
||||
/// Safari signature algorithms.
|
||||
const SAFARI_SIGALGS: &str = "ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:ecdsa_secp521r1_sha512:rsa_pss_rsae_sha512:rsa_pkcs1_sha384:rsa_pkcs1_sha512";
|
||||
|
||||
/// Safari curves.
|
||||
const SAFARI_CURVES: &str = "X25519:P-256:P-384:P-521";
|
||||
|
||||
// --- Chrome HTTP headers in correct wire order ---
|
||||
|
||||
const CHROME_HEADERS: &[(&str, &str)] = &[
|
||||
(
|
||||
"sec-ch-ua",
|
||||
r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
|
||||
),
|
||||
("sec-ch-ua-mobile", "?0"),
|
||||
("sec-ch-ua-platform", "\"Windows\""),
|
||||
("upgrade-insecure-requests", "1"),
|
||||
(
|
||||
"user-agent",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
|
||||
),
|
||||
(
|
||||
"accept",
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||
),
|
||||
("sec-fetch-site", "none"),
|
||||
("sec-fetch-mode", "navigate"),
|
||||
("sec-fetch-user", "?1"),
|
||||
("sec-fetch-dest", "document"),
|
||||
("accept-encoding", "gzip, deflate, br, zstd"),
|
||||
("accept-language", "en-US,en;q=0.9"),
|
||||
("priority", "u=0, i"),
|
||||
];
|
||||
|
||||
const CHROME_MACOS_HEADERS: &[(&str, &str)] = &[
|
||||
(
|
||||
"sec-ch-ua",
|
||||
r#""Google Chrome";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
|
||||
),
|
||||
("sec-ch-ua-mobile", "?0"),
|
||||
("sec-ch-ua-platform", "\"macOS\""),
|
||||
("upgrade-insecure-requests", "1"),
|
||||
(
|
||||
"user-agent",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
|
||||
),
|
||||
(
|
||||
"accept",
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||
),
|
||||
("sec-fetch-site", "none"),
|
||||
("sec-fetch-mode", "navigate"),
|
||||
("sec-fetch-user", "?1"),
|
||||
("sec-fetch-dest", "document"),
|
||||
("accept-encoding", "gzip, deflate, br, zstd"),
|
||||
("accept-language", "en-US,en;q=0.9"),
|
||||
("priority", "u=0, i"),
|
||||
];
|
||||
|
||||
const FIREFOX_HEADERS: &[(&str, &str)] = &[
|
||||
(
|
||||
"user-agent",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
|
||||
),
|
||||
(
|
||||
"accept",
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
),
|
||||
("accept-language", "en-US,en;q=0.5"),
|
||||
("accept-encoding", "gzip, deflate, br, zstd"),
|
||||
("upgrade-insecure-requests", "1"),
|
||||
("sec-fetch-dest", "document"),
|
||||
("sec-fetch-mode", "navigate"),
|
||||
("sec-fetch-site", "none"),
|
||||
("sec-fetch-user", "?1"),
|
||||
("priority", "u=0, i"),
|
||||
];
|
||||
|
||||
const SAFARI_HEADERS: &[(&str, &str)] = &[
|
||||
(
|
||||
"user-agent",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Safari/605.1.15",
|
||||
),
|
||||
(
|
||||
"accept",
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
),
|
||||
("sec-fetch-site", "none"),
|
||||
("accept-language", "en-US,en;q=0.9"),
|
||||
("sec-fetch-mode", "navigate"),
|
||||
("accept-encoding", "gzip, deflate, br"),
|
||||
("sec-fetch-dest", "document"),
|
||||
];
|
||||
|
||||
const EDGE_HEADERS: &[(&str, &str)] = &[
|
||||
(
|
||||
"sec-ch-ua",
|
||||
r#""Microsoft Edge";v="145", "Chromium";v="145", "Not/A)Brand";v="24""#,
|
||||
),
|
||||
("sec-ch-ua-mobile", "?0"),
|
||||
("sec-ch-ua-platform", "\"Windows\""),
|
||||
("upgrade-insecure-requests", "1"),
|
||||
(
|
||||
"user-agent",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36 Edg/145.0.0.0",
|
||||
),
|
||||
(
|
||||
"accept",
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||
),
|
||||
("sec-fetch-site", "none"),
|
||||
("sec-fetch-mode", "navigate"),
|
||||
("sec-fetch-user", "?1"),
|
||||
("sec-fetch-dest", "document"),
|
||||
("accept-encoding", "gzip, deflate, br, zstd"),
|
||||
("accept-language", "en-US,en;q=0.9"),
|
||||
("priority", "u=0, i"),
|
||||
];
|
||||
|
||||
fn chrome_tls() -> TlsOptions {
|
||||
TlsOptions::builder()
|
||||
.cipher_list(CHROME_CIPHERS)
|
||||
.sigalgs_list(CHROME_SIGALGS)
|
||||
.curves_list(CHROME_CURVES)
|
||||
.min_tls_version(TlsVersion::TLS_1_2)
|
||||
.max_tls_version(TlsVersion::TLS_1_3)
|
||||
.grease_enabled(true)
|
||||
.permute_extensions(true)
|
||||
.enable_ech_grease(true)
|
||||
.pre_shared_key(true)
|
||||
.enable_ocsp_stapling(true)
|
||||
.enable_signed_cert_timestamps(true)
|
||||
.alps_protocols([AlpsProtocol::HTTP2])
|
||||
.alps_use_new_codepoint(true)
|
||||
.aes_hw_override(true)
|
||||
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
|
||||
.build()
|
||||
}
|
||||
|
||||
fn firefox_tls() -> TlsOptions {
|
||||
TlsOptions::builder()
|
||||
.cipher_list(FIREFOX_CIPHERS)
|
||||
.sigalgs_list(FIREFOX_SIGALGS)
|
||||
.curves_list(FIREFOX_CURVES)
|
||||
.min_tls_version(TlsVersion::TLS_1_2)
|
||||
.max_tls_version(TlsVersion::TLS_1_3)
|
||||
.grease_enabled(true)
|
||||
.permute_extensions(false)
|
||||
.enable_ech_grease(true)
|
||||
.pre_shared_key(true)
|
||||
.enable_ocsp_stapling(true)
|
||||
.enable_signed_cert_timestamps(true)
|
||||
.certificate_compression_algorithms(&[
|
||||
CertificateCompressionAlgorithm::ZLIB,
|
||||
CertificateCompressionAlgorithm::BROTLI,
|
||||
])
|
||||
.build()
|
||||
}
|
||||
|
||||
fn safari_tls() -> TlsOptions {
|
||||
TlsOptions::builder()
|
||||
.cipher_list(SAFARI_CIPHERS)
|
||||
.sigalgs_list(SAFARI_SIGALGS)
|
||||
.curves_list(SAFARI_CURVES)
|
||||
.min_tls_version(TlsVersion::TLS_1_2)
|
||||
.max_tls_version(TlsVersion::TLS_1_3)
|
||||
.grease_enabled(true)
|
||||
.permute_extensions(false)
|
||||
.enable_ech_grease(false)
|
||||
.pre_shared_key(false)
|
||||
.enable_ocsp_stapling(true)
|
||||
.enable_signed_cert_timestamps(true)
|
||||
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB])
|
||||
.build()
|
||||
}
|
||||
|
||||
fn chrome_h2() -> Http2Options {
|
||||
Http2Options::builder()
|
||||
.initial_window_size(6_291_456)
|
||||
.initial_connection_window_size(15_728_640)
|
||||
.max_header_list_size(262_144)
|
||||
.header_table_size(65_536)
|
||||
.max_concurrent_streams(1000u32)
|
||||
.enable_push(false)
|
||||
.settings_order(
|
||||
SettingsOrder::builder()
|
||||
.extend([
|
||||
SettingId::HeaderTableSize,
|
||||
SettingId::EnablePush,
|
||||
SettingId::MaxConcurrentStreams,
|
||||
SettingId::InitialWindowSize,
|
||||
SettingId::MaxFrameSize,
|
||||
SettingId::MaxHeaderListSize,
|
||||
SettingId::EnableConnectProtocol,
|
||||
SettingId::NoRfc7540Priorities,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.headers_pseudo_order(
|
||||
PseudoOrder::builder()
|
||||
.extend([
|
||||
PseudoId::Method,
|
||||
PseudoId::Authority,
|
||||
PseudoId::Scheme,
|
||||
PseudoId::Path,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.headers_stream_dependency(StreamDependency::new(StreamId::zero(), 219, true))
|
||||
.build()
|
||||
}
|
||||
|
||||
fn firefox_h2() -> Http2Options {
|
||||
Http2Options::builder()
|
||||
.initial_window_size(131_072)
|
||||
.initial_connection_window_size(12_517_377)
|
||||
.max_header_list_size(65_536)
|
||||
.header_table_size(65_536)
|
||||
.settings_order(
|
||||
SettingsOrder::builder()
|
||||
.extend([
|
||||
SettingId::HeaderTableSize,
|
||||
SettingId::InitialWindowSize,
|
||||
SettingId::MaxFrameSize,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.headers_pseudo_order(
|
||||
PseudoOrder::builder()
|
||||
.extend([
|
||||
PseudoId::Method,
|
||||
PseudoId::Path,
|
||||
PseudoId::Authority,
|
||||
PseudoId::Scheme,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.build()
|
||||
}
|
||||
|
||||
fn safari_h2() -> Http2Options {
|
||||
Http2Options::builder()
|
||||
.initial_window_size(2_097_152)
|
||||
.initial_connection_window_size(10_420_225)
|
||||
.max_header_list_size(0)
|
||||
.header_table_size(4_096)
|
||||
.enable_push(false)
|
||||
.max_concurrent_streams(100u32)
|
||||
.settings_order(
|
||||
SettingsOrder::builder()
|
||||
.extend([
|
||||
SettingId::EnablePush,
|
||||
SettingId::MaxConcurrentStreams,
|
||||
SettingId::InitialWindowSize,
|
||||
SettingId::MaxFrameSize,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.headers_pseudo_order(
|
||||
PseudoOrder::builder()
|
||||
.extend([
|
||||
PseudoId::Method,
|
||||
PseudoId::Scheme,
|
||||
PseudoId::Authority,
|
||||
PseudoId::Path,
|
||||
])
|
||||
.build(),
|
||||
)
|
||||
.headers_stream_dependency(StreamDependency::new(StreamId::zero(), 255, false))
|
||||
.build()
|
||||
}
|
||||
|
||||
/// Convert a static `(name, value)` table into an `http::HeaderMap`.
///
/// Pairs are inserted in table order. A pair whose name or value does not
/// parse as a valid header name / value is skipped without reporting an
/// error. Inserting a name that is already present replaces its value.
fn build_headers(pairs: &[(&str, &str)]) -> http::HeaderMap {
    let mut map = http::HeaderMap::with_capacity(pairs.len());

    // Keep only the pairs where both halves parse.
    let parsed = pairs.iter().filter_map(|&(name, value)| {
        let name = http::header::HeaderName::from_bytes(name.as_bytes()).ok()?;
        let value = http::header::HeaderValue::from_str(value).ok()?;
        Some((name, value))
    });

    // `Extend<(HeaderName, HeaderValue)>` inserts each pair in turn.
    map.extend(parsed);
    map
}
|
||||
|
||||
/// Build a wreq Client for a specific browser variant.
|
||||
pub fn build_client(
|
||||
variant: BrowserVariant,
|
||||
timeout: Duration,
|
||||
extra_headers: &std::collections::HashMap<String, String>,
|
||||
proxy: Option<&str>,
|
||||
) -> Result<Client, FetchError> {
|
||||
let (tls, h2, headers) = match variant {
|
||||
BrowserVariant::Chrome => (chrome_tls(), chrome_h2(), CHROME_HEADERS),
|
||||
BrowserVariant::ChromeMacos => (chrome_tls(), chrome_h2(), CHROME_MACOS_HEADERS),
|
||||
BrowserVariant::Firefox => (firefox_tls(), firefox_h2(), FIREFOX_HEADERS),
|
||||
BrowserVariant::Safari => (safari_tls(), safari_h2(), SAFARI_HEADERS),
|
||||
BrowserVariant::Edge => (chrome_tls(), chrome_h2(), EDGE_HEADERS),
|
||||
};
|
||||
|
||||
let mut header_map = build_headers(headers);
|
||||
|
||||
// Append extra headers after profile defaults
|
||||
for (k, v) in extra_headers {
|
||||
if let (Ok(n), Ok(val)) = (
|
||||
http::header::HeaderName::from_bytes(k.as_bytes()),
|
||||
http::header::HeaderValue::from_str(v),
|
||||
) {
|
||||
header_map.insert(n, val);
|
||||
}
|
||||
}
|
||||
|
||||
let emulation = Emulation::builder()
|
||||
.tls_options(tls)
|
||||
.http2_options(h2)
|
||||
.headers(header_map)
|
||||
.build();
|
||||
|
||||
let mut builder = Client::builder()
|
||||
.emulation(emulation)
|
||||
.redirect(wreq::redirect::Policy::limited(10))
|
||||
.cookie_store(true)
|
||||
.timeout(timeout);
|
||||
|
||||
if let Some(proxy_url) = proxy {
|
||||
let proxy =
|
||||
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
|
||||
builder = builder.proxy(proxy);
|
||||
}
|
||||
|
||||
builder
|
||||
.build()
|
||||
.map_err(|e| FetchError::Build(e.to_string()))
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue