mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: cookie warmup fallback for Akamai-protected pages
When a fetch returns a challenge page (small HTML with Akamai markers), automatically visit the homepage first to collect _abck/bm_sz cookies, then retry the original URL. This bypasses Akamai's cookie-based gate on subpages without needing JS execution. Detected via: <title>Challenge Page</title> or bazadebezolkohpepadr sensor marker on responses under 15KB. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
75e0a9cdef
commit
7041a1d992
1 changed file with 44 additions and 1 deletion
|
|
@ -252,7 +252,18 @@ impl FetchClient {
|
|||
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
let response = client.get(url).await?;
|
||||
let mut response = client.get(url).await?;
|
||||
|
||||
// Cookie warmup: if we get a challenge page, visit the homepage first
|
||||
// to collect Akamai cookies (_abck, bm_sz, etc.), then retry.
|
||||
if is_challenge_response(&response) {
|
||||
if let Some(homepage) = extract_homepage(url) {
|
||||
debug!("challenge detected, warming cookies via {homepage}");
|
||||
let _ = client.get(&homepage).await;
|
||||
response = client.get(url).await?;
|
||||
debug!("retried after cookie warmup: status={}", response.status());
|
||||
}
|
||||
}
|
||||
|
||||
let status = response.status();
|
||||
let final_url = response.url().to_string();
|
||||
|
|
@ -518,6 +529,38 @@ fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool {
|
|||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Detect if a response looks like a bot protection challenge page.
|
||||
/// Checks for small HTML pages with known challenge markers.
|
||||
fn is_challenge_response(response: &webclaw_http::Response) -> bool {
|
||||
// Only check small HTML responses — real pages are typically >10KB
|
||||
let len = response.body().len();
|
||||
if len > 15_000 || len == 0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
let text = response.text();
|
||||
let lower = text.to_lowercase();
|
||||
|
||||
// Akamai Bot Manager challenge
|
||||
if lower.contains("<title>challenge page</title>") {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Akamai sensor script on tiny page
|
||||
if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Extract the homepage URL (scheme + host) from a full URL.
|
||||
fn extract_homepage(url: &str) -> Option<String> {
|
||||
url::Url::parse(url).ok().map(|u| {
|
||||
format!("{}://{}/", u.scheme(), u.host_str().unwrap_or(""))
|
||||
})
|
||||
}
|
||||
|
||||
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
|
||||
fn pdf_to_extraction_result(
|
||||
pdf: &webclaw_pdf::PdfResult,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue