mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: cookie warmup fallback for Akamai-protected pages
When a fetch returns a challenge page (small HTML with Akamai markers), automatically visit the homepage first to collect _abck/bm_sz cookies, then retry the original URL. This bypasses Akamai's cookie-based gate on subpages without needing JS execution. Detection only considers non-empty responses of at most 15,000 bytes and matches either <title>Challenge Page</title> (case-insensitive) or the bazadebezolkohpepadr sensor marker, the latter only on responses under 5,000 bytes. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
75e0a9cdef
commit
7041a1d992
1 changed file with 44 additions and 1 deletion
|
|
@ -252,7 +252,18 @@ impl FetchClient {
|
||||||
|
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
let client = self.pick_client(url);
|
let client = self.pick_client(url);
|
||||||
let response = client.get(url).await?;
|
let mut response = client.get(url).await?;
|
||||||
|
|
||||||
|
// Cookie warmup: if we get a challenge page, visit the homepage first
|
||||||
|
// to collect Akamai cookies (_abck, bm_sz, etc.), then retry.
|
||||||
|
if is_challenge_response(&response) {
|
||||||
|
if let Some(homepage) = extract_homepage(url) {
|
||||||
|
debug!("challenge detected, warming cookies via {homepage}");
|
||||||
|
let _ = client.get(&homepage).await;
|
||||||
|
response = client.get(url).await?;
|
||||||
|
debug!("retried after cookie warmup: status={}", response.status());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let status = response.status();
|
let status = response.status();
|
||||||
let final_url = response.url().to_string();
|
let final_url = response.url().to_string();
|
||||||
|
|
@ -518,6 +529,38 @@ fn is_pdf_content_type(headers: &webclaw_http::HeaderMap) -> bool {
|
||||||
.unwrap_or(false)
|
.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Detect if a response looks like a bot protection challenge page.
|
||||||
|
/// Checks for small HTML pages with known challenge markers.
|
||||||
|
fn is_challenge_response(response: &webclaw_http::Response) -> bool {
|
||||||
|
// Only check small HTML responses — real pages are typically >10KB
|
||||||
|
let len = response.body().len();
|
||||||
|
if len > 15_000 || len == 0 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
let text = response.text();
|
||||||
|
let lower = text.to_lowercase();
|
||||||
|
|
||||||
|
// Akamai Bot Manager challenge
|
||||||
|
if lower.contains("<title>challenge page</title>") {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Akamai sensor script on tiny page
|
||||||
|
if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the homepage URL (scheme + host) from a full URL.
|
||||||
|
fn extract_homepage(url: &str) -> Option<String> {
|
||||||
|
url::Url::parse(url).ok().map(|u| {
|
||||||
|
format!("{}://{}/", u.scheme(), u.host_str().unwrap_or(""))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
|
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
|
||||||
fn pdf_to_extraction_result(
|
fn pdf_to_extraction_result(
|
||||||
pdf: &webclaw_pdf::PdfResult,
|
pdf: &webclaw_pdf::PdfResult,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue