From b413d702b272960dcc3970394194f5328c784eeb Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Thu, 23 Apr 2026 14:59:29 +0200
Subject: [PATCH 1/4] feat(fetch): add fetch_smart with Reddit + Akamai rescue
 paths, bump 0.5.6

---
 CHANGELOG.md                       | 10 +++++
 Cargo.lock                         | 14 +++----
 Cargo.toml                         |  2 +-
 crates/webclaw-fetch/src/client.rs | 59 ++++++++++++++++++++++++++----
 4 files changed, 69 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94b9ddb..54cb31f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,16 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.5.6] — 2026-04-23
+
+### Added
+- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
+
+### Fixed
+- Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up.
+
+---
+
 ## [0.5.5] — 2026-04-23
 
 ### Added
diff --git a/Cargo.lock b/Cargo.lock
index 30135cd..b382000 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3219,7 +3219,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-cli"
-version = "0.5.5"
+version = "0.5.6"
 dependencies = [
  "clap",
  "dotenvy",
@@ -3240,7 +3240,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-core"
-version = "0.5.5"
+version = "0.5.6"
 dependencies = [
  "ego-tree",
  "once_cell",
@@ -3258,7 +3258,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-fetch"
-version = "0.5.5"
+version = "0.5.6"
 dependencies = [
  "async-trait",
  "bytes",
@@ -3284,7 +3284,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-llm"
-version = "0.5.5"
+version = "0.5.6"
 dependencies = [
  "async-trait",
  "reqwest",
@@ -3297,7 +3297,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-mcp"
-version = "0.5.5"
+version = "0.5.6"
 dependencies = [
  "dirs",
  "dotenvy",
@@ -3317,7 +3317,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-pdf"
-version = "0.5.5"
+version = "0.5.6"
 dependencies = [
  "pdf-extract",
  "thiserror",
@@ -3326,7 +3326,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-server"
-version = "0.5.5"
+version = "0.5.6"
 dependencies = [
  "anyhow",
  "axum",
diff --git a/Cargo.toml b/Cargo.toml
index abd5816..d9cfd92 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]
 
 [workspace.package]
-version = "0.5.5"
+version = "0.5.6"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index e147337..d61694f 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -261,10 +261,52 @@ impl FetchClient {
         self.cloud.as_deref()
     }
 
+    /// Fetch a URL with per-site rescue paths: Reddit URLs redirect to the
+    /// `.json` API, and Akamai-style challenge responses trigger a homepage
+    /// cookie warmup and a retry. Returns the same `FetchResult` shape as
+    /// [`Self::fetch`] so every caller (CLI, MCP, OSS server, production
+    /// server) benefits without shape churn.
+    ///
+    /// This is the method most callers want. Use plain [`Self::fetch`] only
+    /// when you need literal no-rescue behavior (e.g. inside the rescue
+    /// logic itself to avoid recursion).
+    pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
+        // Reddit: the HTML page shows a verification interstitial for most
+        // client IPs, but appending `.json` returns the post + comment tree
+        // publicly. `parse_reddit_json` in downstream code knows how to read
+        // the result; here we just do the URL swap at the fetch layer.
+        if crate::reddit::is_reddit_url(url) {
+            let json_url = crate::reddit::json_url(url);
+            if let Ok(resp) = self.fetch(&json_url).await {
+                if resp.status == 200 && !resp.html.is_empty() {
+                    return Ok(resp);
+                }
+            }
+            // If the .json fetch failed, fall through to the HTML path.
+        }
+
+        let resp = self.fetch(url).await?;
+
+        // Akamai / bazadebezolkohpepadr challenge: visit the homepage to
+        // collect warmup cookies (_abck, bm_sz, etc.), then retry.
+        if is_challenge_html(&resp.html)
+            && let Some(homepage) = extract_homepage(url)
+        {
+            debug!("challenge detected, warming cookies via {homepage}");
+            let _ = self.fetch(&homepage).await;
+            if let Ok(retry) = self.fetch(url).await {
+                return Ok(retry);
+            }
+        }
+
+        Ok(resp)
+    }
+
     /// Fetch a URL and return the raw HTML + response metadata.
     ///
     /// Automatically retries on transient failures (network errors, 5xx, 429)
-    /// with exponential backoff: 0s, 1s (2 attempts total).
+    /// with exponential backoff: 0s, 1s (2 attempts total). No per-site
+    /// rescue logic; use [`Self::fetch_smart`] for that.
     #[instrument(skip(self), fields(url = %url))]
     pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
         let delays = [Duration::ZERO, Duration::from_secs(1)];
@@ -713,22 +755,23 @@ fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
 
 /// Detect if a response looks like a bot protection challenge page.
 fn is_challenge_response(response: &Response) -> bool {
-    let len = response.body().len();
+    is_challenge_html(response.text().as_ref())
+}
+
+/// Same as `is_challenge_response`, operating on a body string directly
+/// so callers holding a `FetchResult` can reuse the heuristic.
+fn is_challenge_html(html: &str) -> bool {
+    let len = html.len();
     if len > 15_000 || len == 0 {
         return false;
     }
-
-    let text = response.text();
-    let lower = text.to_lowercase();
-
+    let lower = html.to_lowercase();
     if lower.contains("<title>challenge page</title>") {
         return true;
     }
-
     if lower.contains("bazadebezolkohpepadr") && len < 5_000 {
         return true;
     }
-
     false
 }
 

From 866fa88aa05d208cb5389795cfc655876742cfbc Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Thu, 23 Apr 2026 15:06:35 +0200
Subject: [PATCH 2/4] fix(fetch): reject HTML verification pages served at
 .json reddit URL

---
 crates/webclaw-fetch/src/client.rs | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index d61694f..78731e5 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -277,12 +277,18 @@ impl FetchClient {
         // the result; here we just do the URL swap at the fetch layer.
         if crate::reddit::is_reddit_url(url) {
             let json_url = crate::reddit::json_url(url);
-            if let Ok(resp) = self.fetch(&json_url).await {
-                if resp.status == 200 && !resp.html.is_empty() {
+            if let Ok(resp) = self.fetch(&json_url).await
+                && resp.status == 200
+            {
+                // Reddit will serve an HTML verification page at the .json
+                // URL too when the IP is flagged. Only return if the body
+                // actually starts with a JSON payload.
+                let first = resp.html.trim_start().as_bytes().first().copied();
+                if matches!(first, Some(b'{') | Some(b'[')) {
                     return Ok(resp);
                 }
             }
-            // If the .json fetch failed, fall through to the HTML path.
+            // If the .json fetch failed or returned HTML, fall through.
         }
 
         let resp = self.fetch(url).await?;

From 966981bc4299323721c2d43ff5aa157bf939b82c Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Thu, 23 Apr 2026 15:17:04 +0200
Subject: [PATCH 3/4] fix(fetch): send bot-identifying UA on reddit .json API
 to bypass browser UA block

---
 crates/webclaw-fetch/src/client.rs | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index 78731e5..94d698f 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -275,14 +275,21 @@ impl FetchClient {
         // client IPs, but appending `.json` returns the post + comment tree
         // publicly. `parse_reddit_json` in downstream code knows how to read
         // the result; here we just do the URL swap at the fetch layer.
-        if crate::reddit::is_reddit_url(url) {
+        if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
             let json_url = crate::reddit::json_url(url);
-            if let Ok(resp) = self.fetch(&json_url).await
+            // Reddit's public .json API serves JSON to identifiable bot
+            // User-Agents and blocks browser UAs with a verification wall.
+            // Override our Chrome-profile UA for this specific call.
+            let ua = concat!(
+                "Webclaw/",
+                env!("CARGO_PKG_VERSION"),
+                " (+https://webclaw.io)"
+            );
+            if let Ok(resp) = self
+                .fetch_with_headers(&json_url, &[("user-agent", ua)])
+                .await
                 && resp.status == 200
             {
-                // Reddit will serve an HTML verification page at the .json
-                // URL too when the IP is flagged. Only return if the body
-                // actually starts with a JSON payload.
                 let first = resp.html.trim_start().as_bytes().first().copied();
                 if matches!(first, Some(b'{') | Some(b'[')) {
                     return Ok(resp);

From a5c3433372f33517f2aa765c2544ab6abdfe1cc7 Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Thu, 23 Apr 2026 15:26:31 +0200
Subject: [PATCH 4/4] fix(core+server): guard markdown pipe slice + detect
 trustpilot/reddit verify walls

---
 CHANGELOG.md                        | 3 ++-
 crates/webclaw-core/src/markdown.rs | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54cb31f..3000593 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,10 +6,11 @@ Format follows [Keep a Changelog](https://keepachangelog.com/).
 ## [0.5.6] — 2026-04-23
 
 ### Added
-- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
+- `FetchClient::fetch_smart(url)` applies per-site rescue logic and returns the same `FetchResult` shape as `fetch()`. Reddit URLs route to the `.json` API with an identifiable bot `User-Agent`, and Akamai-style challenge pages trigger a homepage cookie warmup plus a retry. Makes `/v1/scrape` on Reddit populate markdown again.
 
 ### Fixed
 - Regression introduced in 0.5.4 where the production server's `/v1/scrape` bypassed the Reddit `.json` shortcut and Akamai cookie warmup that `fetch_and_extract` had been providing. Both helpers now live in `fetch_smart` and every caller path picks them up.
+- Panic in the markdown converter (`markdown.rs:925`) on single-pipe `|` lines. Slicing `[1..len-1]` on a 1-char input produced the reversed range `[1..0]`, which fails the `begin <= end` check. Now guarded by a length check.
 
 ---
 
diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs
index 1a61586..d0a2c23 100644
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@@ -920,8 +920,10 @@ fn strip_markdown(md: &str) -> String {
             continue;
         }
 
-        // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs
-        if trimmed.starts_with('|') && trimmed.ends_with('|') {
+        // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs.
+        // Require at least 2 chars so the slice `[1..len-1]` stays in bounds: for a lone `|`
+        // (which isn't a real table row anyway) it is `[1..0]`, which panicked at `begin <= end`.
+        if trimmed.len() >= 2 && trimmed.starts_with('|') && trimmed.ends_with('|') {
             let inner = &trimmed[1..trimmed.len() - 1];
             let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect();
             lines.push(cells.join("\t"));