feat: cookie warmup fallback for Akamai-protected pages

When a fetch returns a challenge page (small HTML with Akamai markers),
automatically visit the homepage first to collect _abck/bm_sz cookies,
then retry the original URL. This bypasses Akamai's cookie-based gate
on subpages without needing JS execution.

Detected on non-empty responses of at most 15,000 bytes via <title>Challenge Page</title>,
or via the bazadebezolkohpepadr sensor marker when the response is under 5,000 bytes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-03-30 14:09:31 +02:00
parent 75e0a9cdef
commit 7041a1d992

View file

@ -252,7 +252,18 @@ impl FetchClient {
let start = Instant::now(); let start = Instant::now();
let client = self.pick_client(url); let client = self.pick_client(url);
let response = client.get(url).await?; let mut response = client.get(url).await?;
// Cookie warmup: if we get a challenge page, visit the homepage first
// to collect Akamai cookies (_abck, bm_sz, etc.), then retry.
if is_challenge_response(&response) {
if let Some(homepage) = extract_homepage(url) {
debug!("challenge detected, warming cookies via {homepage}");
let _ = client.get(&homepage).await;
response = client.get(url).await?;
debug!("retried after cookie warmup: status={}", response.status());
}
}
let status = response.status(); let status = response.status();
let final_url = response.url().to_string(); let final_url = response.url().to_string();
@ -518,6 +529,38 @@ fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool {
.unwrap_or(false) .unwrap_or(false)
} }
/// Heuristically decide whether `response` is a bot-protection challenge page
/// rather than real content.
///
/// Responses whose body length is zero or greater than 15,000 are never
/// treated as challenges. Inside that window the lowercased text must contain
/// either:
/// - the title tag `<title>challenge page</title>`, or
/// - the `bazadebezolkohpepadr` sensor marker, and only when the body length
///   is below 5,000.
fn is_challenge_response(response: &webclaw_http::Response) -> bool {
    let body_len = response.body().len();

    // Size gate first, so empty and large bodies are rejected before any
    // text scanning happens.
    if body_len == 0 || body_len > 15_000 {
        return false;
    }

    // Lowercase once so both marker checks are case-insensitive.
    let haystack = response.text().to_lowercase();

    // Akamai Bot Manager challenge title, or the Akamai sensor script on a
    // tiny page (the sensor marker alone is only trusted under 5,000).
    haystack.contains("<title>challenge page</title>")
        || (body_len < 5_000 && haystack.contains("bazadebezolkohpepadr"))
}
/// Extract the homepage URL (scheme + host, plus any explicit port) from a
/// full URL.
///
/// Returns `None` when `url` does not parse or has no host (for example
/// `data:` or `mailto:` URLs), so callers never receive a malformed
/// `scheme:///` homepage. A port is included only when `Url::port` reports
/// one, which keeps the homepage on the same origin as the input URL.
fn extract_homepage(url: &str) -> Option<String> {
    let parsed = url::Url::parse(url).ok()?;

    // A homepage only makes sense for URLs that actually name a host.
    let host = parsed.host_str().filter(|h| !h.is_empty())?;
    let scheme = parsed.scheme();

    Some(match parsed.port() {
        Some(port) => format!("{scheme}://{host}:{port}/"),
        None => format!("{scheme}://{host}/"),
    })
}
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult. /// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
fn pdf_to_extraction_result( fn pdf_to_extraction_result(
pdf: &webclaw_pdf::PdfResult, pdf: &webclaw_pdf::PdfResult,