From b413d702b272960dcc3970394194f5328c784eeb Mon Sep 17 00:00:00 2001 From: Valerio Date: Thu, 23 Apr 2026 14:59:29 +0200 Subject: [PATCH 1/4] feat(fetch): add fetch_smart with Reddit + Akamai rescue paths, bump 0.5.6 --- CHANGELOG.md | 10 +++++ Cargo.lock | 14 +++---- Cargo.toml | 2 +- crates/webclaw-fetch/src/client.rs | 59 ++++++++++++++++++++++++++---- 4 files changed, 69 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94b9ddb..54cb31f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.5.6] — 2026-04-23 + +### Added +- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again. + +### Fixed +- Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up. 
+ +--- + ## [0.5.5] — 2026-04-23 ### Added diff --git a/Cargo.lock b/Cargo.lock index 30135cd..b382000 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.5" +version = "0.5.6" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.5" +version = "0.5.6" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.5" +version = "0.5.6" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.5" +version = "0.5.6" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.5" +version = "0.5.6" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.5" +version = "0.5.6" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.5" +version = "0.5.6" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index abd5816..d9cfd92 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.5" +version = "0.5.6" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index e147337..d61694f 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -261,10 +261,52 @@ impl FetchClient { self.cloud.as_deref() } + /// Fetch a URL with per-site rescue paths: Reddit URLs redirect to the + /// `.json` API, and Akamai-style challenge responses trigger a homepage + /// cookie warmup and a retry. 
Returns the same `FetchResult` shape as + /// [`Self::fetch`] so every caller (CLI, MCP, OSS server, production + /// server) benefits without shape churn. + /// + /// This is the method most callers want. Use plain [`Self::fetch`] only + /// when you need literal no-rescue behavior (e.g. inside the rescue + /// logic itself to avoid recursion). + pub async fn fetch_smart(&self, url: &str) -> Result { + // Reddit: the HTML page shows a verification interstitial for most + // client IPs, but appending `.json` returns the post + comment tree + // publicly. `parse_reddit_json` in downstream code knows how to read + // the result; here we just do the URL swap at the fetch layer. + if crate::reddit::is_reddit_url(url) { + let json_url = crate::reddit::json_url(url); + if let Ok(resp) = self.fetch(&json_url).await { + if resp.status == 200 && !resp.html.is_empty() { + return Ok(resp); + } + } + // If the .json fetch failed, fall through to the HTML path. + } + + let resp = self.fetch(url).await?; + + // Akamai / bazadebezolkohpepadr challenge: visit the homepage to + // collect warmup cookies (_abck, bm_sz, etc.), then retry. + if is_challenge_html(&resp.html) + && let Some(homepage) = extract_homepage(url) + { + debug!("challenge detected, warming cookies via {homepage}"); + let _ = self.fetch(&homepage).await; + if let Ok(retry) = self.fetch(url).await { + return Ok(retry); + } + } + + Ok(resp) + } + /// Fetch a URL and return the raw HTML + response metadata. /// /// Automatically retries on transient failures (network errors, 5xx, 429) - /// with exponential backoff: 0s, 1s (2 attempts total). + /// with exponential backoff: 0s, 1s (2 attempts total). No per-site + /// rescue logic; use [`Self::fetch_smart`] for that. 
#[instrument(skip(self), fields(url = %url))] pub async fn fetch(&self, url: &str) -> Result { let delays = [Duration::ZERO, Duration::from_secs(1)]; @@ -713,22 +755,23 @@ fn is_pdf_content_type(headers: &http::HeaderMap) -> bool { /// Detect if a response looks like a bot protection challenge page. fn is_challenge_response(response: &Response) -> bool { - let len = response.body().len(); + is_challenge_html(response.text().as_ref()) +} + +/// Same as `is_challenge_response`, operating on a body string directly +/// so callers holding a `FetchResult` can reuse the heuristic. +fn is_challenge_html(html: &str) -> bool { + let len = html.len(); if len > 15_000 || len == 0 { return false; } - - let text = response.text(); - let lower = text.to_lowercase(); - + let lower = html.to_lowercase(); if lower.contains("challenge page") { return true; } - if lower.contains("bazadebezolkohpepadr") && len < 5_000 { return true; } - false } From 866fa88aa05d208cb5389795cfc655876742cfbc Mon Sep 17 00:00:00 2001 From: Valerio Date: Thu, 23 Apr 2026 15:06:35 +0200 Subject: [PATCH 2/4] fix(fetch): reject HTML verification pages served at .json reddit URL --- crates/webclaw-fetch/src/client.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index d61694f..78731e5 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -277,12 +277,18 @@ impl FetchClient { // the result; here we just do the URL swap at the fetch layer. if crate::reddit::is_reddit_url(url) { let json_url = crate::reddit::json_url(url); - if let Ok(resp) = self.fetch(&json_url).await { - if resp.status == 200 && !resp.html.is_empty() { + if let Ok(resp) = self.fetch(&json_url).await + && resp.status == 200 + { + // Reddit will serve an HTML verification page at the .json + // URL too when the IP is flagged. Only return if the body + // actually starts with a JSON payload. 
+ let first = resp.html.trim_start().as_bytes().first().copied(); + if matches!(first, Some(b'{') | Some(b'[')) { return Ok(resp); } } - // If the .json fetch failed, fall through to the HTML path. + // If the .json fetch failed or returned HTML, fall through. } let resp = self.fetch(url).await?; From 966981bc4299323721c2d43ff5aa157bf939b82c Mon Sep 17 00:00:00 2001 From: Valerio Date: Thu, 23 Apr 2026 15:17:04 +0200 Subject: [PATCH 3/4] fix(fetch): send bot-identifying UA on reddit .json API to bypass browser UA block --- crates/webclaw-fetch/src/client.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 78731e5..94d698f 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -275,14 +275,21 @@ impl FetchClient { // client IPs, but appending `.json` returns the post + comment tree // publicly. `parse_reddit_json` in downstream code knows how to read // the result; here we just do the URL swap at the fetch layer. - if crate::reddit::is_reddit_url(url) { + if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") { let json_url = crate::reddit::json_url(url); - if let Ok(resp) = self.fetch(&json_url).await + // Reddit's public .json API serves JSON to identifiable bot + // User-Agents and blocks browser UAs with a verification wall. + // Override our Chrome-profile UA for this specific call. + let ua = concat!( + "Webclaw/", + env!("CARGO_PKG_VERSION"), + " (+https://webclaw.io)" + ); + if let Ok(resp) = self + .fetch_with_headers(&json_url, &[("user-agent", ua)]) + .await && resp.status == 200 { - // Reddit will serve an HTML verification page at the .json - // URL too when the IP is flagged. Only return if the body - // actually starts with a JSON payload. 
let first = resp.html.trim_start().as_bytes().first().copied(); if matches!(first, Some(b'{') | Some(b'[')) { return Ok(resp); From a5c3433372f33517f2aa765c2544ab6abdfe1cc7 Mon Sep 17 00:00:00 2001 From: Valerio Date: Thu, 23 Apr 2026 15:26:31 +0200 Subject: [PATCH 4/4] fix(core+server): guard markdown pipe slice + detect trustpilot/reddit verify walls --- CHANGELOG.md | 3 ++- crates/webclaw-core/src/markdown.rs | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54cb31f..3000593 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,11 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ## [0.5.6] — 2026-04-23 ### Added -- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again. +- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API with an identifiable bot `User-Agent`, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again. ### Fixed - Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up. +- Panic in the markdown converter (`markdown.rs:925`) on single-pipe `|` lines. A `[1..len-1]` slice on a 1-char input triggered `begin <= end`. Guarded. 
--- diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index 1a61586..d0a2c23 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -920,8 +920,10 @@ fn strip_markdown(md: &str) -> String { continue; } - // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs - if trimmed.starts_with('|') && trimmed.ends_with('|') { + // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs. + // Require at least 2 chars so the slice `[1..len-1]` stays in bounds on single-pipe rows + // (which aren't real tables anyway); a lone `|` previously panicked at `begin <= end`. + if trimmed.len() >= 2 && trimmed.starts_with('|') && trimmed.ends_with('|') { let inner = &trimmed[1..trimmed.len() - 1]; let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect(); lines.push(cells.join("\t"));