mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-21 02:28:27 +02:00
- New BrowserProfile::SafariIos mapped to BrowserVariant::SafariIos26. Built on wreq_util::Emulation::SafariIos26 with 4 overrides (TLS extension order, HTTP/2 HEADERS priority, real Safari iOS 26 headers, gzip/deflate/br). Matches bogdanfinn safari_ios_26_0 JA3 8d909525bd5bbb79f133d11cc05159fe exactly. Empirically 9/10 on immobiliare.it with country-it residential. - BrowserProfile::Chrome aligned to bogdanfinn chrome_133: dropped MAX_CONCURRENT_STREAMS from H2 SETTINGS, priority weight 256, explicit extension_permutation, advertise h3 in ALPN and ALPS. JA3 43067709b025da334de1279a120f8e14, akamai_fp 52d84b11737d980aef856699f885ca86. Fixes indeed.com and other Cloudflare-fronted sites. - New locale module: accept_language_for_url / accept_language_for_tld. TLD to Accept-Language mapping, unknown TLDs default to en-US. DataDome geo-vs-locale cross-checks are now trivially satisfiable. - wreq-util bumped 2.2.6 to 3.0.0-rc.10 for Emulation::SafariIos26.
77 lines
2.5 KiB
Rust
77 lines
2.5 KiB
Rust
//! Derive an `Accept-Language` header from a URL.
|
|
//!
|
|
//! DataDome-class bot detection on country-specific sites (e.g. immobiliare.it,
|
|
//! leboncoin.fr) does a geo-vs-locale sanity check: residential IP in the
|
|
//! target country + a browser UA but the wrong `Accept-Language` is a bot
|
|
//! signal. Matching the site's expected locale gets us through.
|
|
//!
|
|
//! Default for unmapped TLDs is `en-US,en;q=0.9` — the global fallback.
|
|
|
|
/// Best-effort `Accept-Language` header value for the given URL's TLD.
|
|
/// Returns `None` if the URL cannot be parsed.
|
|
pub fn accept_language_for_url(url: &str) -> Option<&'static str> {
|
|
let host = url::Url::parse(url).ok()?.host_str()?.to_ascii_lowercase();
|
|
let tld = host.rsplit('.').next()?;
|
|
Some(accept_language_for_tld(tld))
|
|
}
|
|
|
|
/// Map a bare TLD like `it`, `fr`, `de` to a plausible `Accept-Language`.
///
/// The lookup ignores ASCII case and any leading or trailing dots, so `IT`,
/// `.it` and `it.` all resolve the same way as `it`.
///
/// Unknown TLDs (including the empty string) fall back to US English.
pub fn accept_language_for_tld(tld: &str) -> &'static str {
    // Normalise before matching: callers may pass the label as it appeared in
    // a host name (`.IT`), and the table below is keyed on lower-case labels.
    let normalized = tld.trim_matches('.').to_ascii_lowercase();

    match normalized.as_str() {
        "it" => "it-IT,it;q=0.9",
        "fr" => "fr-FR,fr;q=0.9",
        // NOTE(review): several arms below reuse a neighbouring country's
        // region subtag (`at` -> de-DE, `hk` -> zh-TW, Latin America -> es-ES,
        // `ie` -> en-GB). The language matches but the region does not;
        // confirm that this is acceptable for the sites being targeted.
        "de" | "at" => "de-DE,de;q=0.9",
        "es" => "es-ES,es;q=0.9",
        "pt" => "pt-PT,pt;q=0.9",
        "nl" => "nl-NL,nl;q=0.9",
        "pl" => "pl-PL,pl;q=0.9",
        "se" => "sv-SE,sv;q=0.9",
        "no" => "nb-NO,nb;q=0.9",
        "dk" => "da-DK,da;q=0.9",
        "fi" => "fi-FI,fi;q=0.9",
        "cz" => "cs-CZ,cs;q=0.9",
        "ro" => "ro-RO,ro;q=0.9",
        "gr" => "el-GR,el;q=0.9",
        "tr" => "tr-TR,tr;q=0.9",
        "ru" => "ru-RU,ru;q=0.9",
        "jp" => "ja-JP,ja;q=0.9",
        "kr" => "ko-KR,ko;q=0.9",
        "cn" => "zh-CN,zh;q=0.9",
        "tw" | "hk" => "zh-TW,zh;q=0.9",
        "br" => "pt-BR,pt;q=0.9",
        "mx" | "ar" | "co" | "cl" | "pe" => "es-ES,es;q=0.9",
        "uk" | "ie" => "en-GB,en;q=0.9",
        // Global fallback for every TLD without a dedicated entry.
        _ => "en-US,en;q=0.9",
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Well-formed URLs resolve to the header mapped to their host's final
    /// label, including a multi-label suffix (`co.uk`) and an unmapped TLD.
    #[test]
    fn tld_dispatch() {
        let cases = [
            ("https://www.immobiliare.it/annunci/1", "it-IT,it;q=0.9"),
            ("https://www.leboncoin.fr/", "fr-FR,fr;q=0.9"),
            ("https://www.amazon.co.uk/", "en-GB,en;q=0.9"),
            ("https://example.com/", "en-US,en;q=0.9"),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(accept_language_for_url(input), Some(*expected));
        }
    }

    /// A string that does not parse as an absolute URL yields no header.
    #[test]
    fn bad_url_returns_none() {
        let header = accept_language_for_url("not-a-url");
        assert_eq!(header, None);
    }
}
|