feat(reddit): parse old.reddit.com HTML instead of the dead .json API

Reddit blocked unauthenticated `.json` access, so the previous extractor
returned block pages or timed out on every thread. Switch to parsing
old.reddit.com's server-rendered HTML, which needs no API key or JS.

Fetch layer:
- Rewrite every Reddit host to old.reddit.com before fetching; drop all
  `.json` URL handling and the JSON response parser.

Extraction (webclaw-core::reddit):
- New HTML parser producing a typed post + nested comment tree.
- Comments nest structurally (.comment > .child > .sitetable > .comment);
  old.reddit omits a usable depth attribute, so the tree is walked
  recursively. Bodies live in .entry > form > .usertext-body > .md.
- Post metadata: title, author, subreddit, score, comment count
  (data-comments-count), self-vs-link (self class / self.* domain), flair,
  self-text body.
- Comment scores read the .score.unvoted title (the displayed value, not
  the ±1 vote-state siblings); hidden scores are None, not 0.
- Deleted comments are kept in place so their replies aren't orphaned;
  "load more comments" stubs are skipped.

Markdown output:
- Reply nesting via blockquote depth (avoids 4-space indentation turning
  text and code fences into broken indented-code blocks).
- Links keep their target as [text](url); root-relative reddit links
  resolve against old.reddit.com. Nested lists indent correctly.
- A recognised but unparseable /comments/ page returns no content rather
  than falling through to generic extraction of Reddit chrome.

Tests: regression suite runs against real old.reddit.com fixtures
(testdata/reddit/), the ground truth that surfaced the parsing and markdown
bugs synthetic HTML had hidden. Fixtures are excluded from the published
crate.
2026-06-07 22:15:12 +02:00 · 2026-06-04 16:16:08 +02:00 · 2026-06-04 16:16:08 +02:00 · 217bfe088b
commit 217bfe088b
parent 3b7d11328e
11 changed files with 2522 additions and 391 deletions
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -160,9 +160,6 @@ impl Response {
    fn body(&self) -> &[u8] {
        &self.body
    }
-    fn is_success(&self) -> bool {
-        (200..300).contains(&self.status)
-    }

    fn text(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(&self.body)
@ -299,32 +296,15 @@ impl FetchClient {
    /// when you need literal no-rescue behavior (e.g. inside the rescue
    /// logic itself to avoid recursion).
    pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
-        // Reddit: the HTML page shows a verification interstitial for most
-        // client IPs, but appending `.json` returns the post + comment tree
-        // publicly. `parse_reddit_json` in downstream code knows how to read
-        // the result; here we just do the URL swap at the fetch layer.
-        if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
-            let json_url = crate::reddit::json_url(url);
-            // Reddit's public .json API serves JSON to identifiable bot
-            // User-Agents and blocks browser UAs with a verification wall.
-            // Override our Chrome-profile UA for this specific call.
-            let ua = concat!(
-                "Webclaw/",
-                env!("CARGO_PKG_VERSION"),
-                " (+https://webclaw.io)"
-            );
-            if let Ok(resp) = self
-                .fetch_with_headers(&json_url, &[("user-agent", ua)])
-                .await
-                && resp.status == 200
-            {
-                let first = resp.html.trim_start().as_bytes().first().copied();
-                if matches!(first, Some(b'{') | Some(b'[')) {
-                    return Ok(resp);
-                }
-            }
-            // If the .json fetch failed or returned HTML, fall through.
-        }
+        // Reddit: fetch old.reddit.com for stable server-rendered HTML.
+        // The JSON API is blocked; old.reddit.com works without JS or auth.
+        let owned;
+        let url = if crate::reddit::is_reddit_url(url) {
+            owned = crate::reddit::to_old_reddit_url(url);
+            owned.as_str()
+        } else {
+            url
+        };

        let resp = self.fetch(url).await?;

@ -496,23 +476,16 @@ impl FetchClient {
        let parsed_url = crate::url_security::validate_public_http_url(url).await?;
        let url = parsed_url.as_str();

-        // Reddit fallback: use their JSON API to get post + full comment tree.
-        if crate::reddit::is_reddit_url(url) {
-            let json_url = crate::reddit::json_url(url);
-            let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
-            debug!("reddit detected, fetching {json_url}");
-
-            let client = self.pick_client(url);
-            let resp = client.get(json_url.as_str()).send().await?;
-            let response = Response::from_wreq(resp).await?;
-            if response.is_success() {
-                let bytes = response.body();
-                match crate::reddit::parse_reddit_json(bytes, url) {
-                    Ok(result) => return Ok(result),
-                    Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
-                }
-            }
-        }
+        // Reddit: rewrite to old.reddit.com for stable server-rendered HTML.
+        // webclaw-core's Reddit fast path then parses the thread structure.
+        let reddit_owned;
+        let url = if crate::reddit::is_reddit_url(url) {
+            reddit_owned = crate::reddit::to_old_reddit_url(url);
+            debug!("reddit: rewriting to {reddit_owned}");
+            reddit_owned.as_str()
+        } else {
+            url
+        };

        let start = Instant::now();
        let client = self.pick_client(url);
--- a/crates/webclaw-fetch/src/extractors/reddit.rs
+++ b/crates/webclaw-fetch/src/extractors/reddit.rs
@ -1,12 +1,10 @@
-//! Reddit structured extractor — returns the full post + comment tree
-//! as typed JSON via Reddit's `.json` API.
+//! Reddit structured extractor — parses old.reddit.com HTML.
 //!
-//! The same trick the markdown extractor in `crate::reddit` uses:
-//! appending `.json` to any post URL returns the data the new SPA
-//! frontend would load client-side. Zero antibot, zero JS rendering.
+//! Fetches old.reddit.com (stable server-rendered HTML, no JS required)
+//! and delegates parsing to `webclaw_core::reddit`. Returns a typed JSON
+//! value with `{ url, post, comments }` structure.

-use serde::Deserialize;
-use serde_json::{Value, json};
+use serde_json::Value;

 use super::ExtractorInfo;
 use crate::error::FetchError;
@ -24,182 +22,27 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };

 pub fn matches(url: &str) -> bool {
-    let host = host_of(url);
-    let is_reddit_host = matches!(
-        host,
-        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
-    );
-    is_reddit_host && url.contains("/comments/")
+    webclaw_core::reddit::is_reddit_url(url) && url.contains("/comments/")
 }

 pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
-    let json_url = build_json_url(url);
-    let resp = client.fetch(&json_url).await?;
+    let fetch_url = crate::reddit::to_old_reddit_url(url);
+    let resp = client.fetch(&fetch_url).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
-            "reddit api returned status {}",
+            "reddit: unexpected status {}",
            resp.status
        )));
    }

-    let listings: Vec<Listing> = serde_json::from_str(&resp.html)
-        .map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
+    let thread = webclaw_core::reddit::try_extract_thread(&resp.html, url).ok_or_else(|| {
+        FetchError::BodyDecode(
+            "reddit: page structure not recognised — is this a thread URL?".into(),
+        )
+    })?;

-    if listings.is_empty() {
-        return Err(FetchError::BodyDecode("reddit response empty".into()));
-    }
-
-    // First listing = the post (single t3 child).
-    let post = listings
-        .first()
-        .and_then(|l| l.data.children.first())
-        .filter(|t| t.kind == "t3")
-        .map(|t| post_json(&t.data))
-        .unwrap_or(Value::Null);
-
-    // Second listing = the comment tree.
-    let comments: Vec<Value> = listings
-        .get(1)
-        .map(|l| l.data.children.iter().filter_map(comment_json).collect())
-        .unwrap_or_default();
-
-    Ok(json!({
-        "url": url,
-        "post": post,
-        "comments": comments,
-    }))
-}
-
-// ---------------------------------------------------------------------------
-// JSON shapers
-// ---------------------------------------------------------------------------
-
-fn post_json(d: &ThingData) -> Value {
-    json!({
-        "id":               d.id,
-        "title":            d.title,
-        "author":           d.author,
-        "subreddit":        d.subreddit_name_prefixed,
-        "permalink":        d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
-        "url":              d.url_overridden_by_dest,
-        "is_self":          d.is_self,
-        "selftext":         d.selftext,
-        "score":            d.score,
-        "upvote_ratio":     d.upvote_ratio,
-        "num_comments":     d.num_comments,
-        "created_utc":      d.created_utc,
-        "link_flair_text":  d.link_flair_text,
-        "over_18":          d.over_18,
-        "spoiler":          d.spoiler,
-        "stickied":         d.stickied,
-        "locked":           d.locked,
-    })
-}
-
-/// Render a single comment + its reply tree. Returns `None` for non-t1
-/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
-fn comment_json(thing: &Thing) -> Option<Value> {
-    if thing.kind != "t1" {
-        return None;
-    }
-    let d = &thing.data;
-    let replies: Vec<Value> = match &d.replies {
-        Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
-        _ => Vec::new(),
-    };
-    Some(json!({
-        "id":             d.id,
-        "author":         d.author,
-        "body":           d.body,
-        "score":          d.score,
-        "created_utc":    d.created_utc,
-        "is_submitter":   d.is_submitter,
-        "stickied":       d.stickied,
-        "depth":          d.depth,
-        "permalink":      d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
-        "replies":        replies,
-    }))
-}
-
-// ---------------------------------------------------------------------------
-// URL helpers
-// ---------------------------------------------------------------------------
-
-fn host_of(url: &str) -> &str {
-    url.split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("")
-}
-
-/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
-/// or `old.reddit.com` as the caller gave us). Routing through
-/// `old.reddit.com` unconditionally looks appealing but that host has
-/// stricter UA-based blocking than `www.reddit.com`, while the main
-/// host accepts our Chrome-fingerprinted client fine.
-fn build_json_url(url: &str) -> String {
-    let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
-    format!("{clean}.json?raw_json=1")
-}
-
-// ---------------------------------------------------------------------------
-// Reddit JSON types — only fields we render. Everything else is dropped.
-// ---------------------------------------------------------------------------
-
-#[derive(Deserialize)]
-struct Listing {
-    data: ListingData,
-}
-
-#[derive(Deserialize)]
-struct ListingData {
-    children: Vec<Thing>,
-}
-
-#[derive(Deserialize)]
-struct Thing {
-    kind: String,
-    data: ThingData,
-}
-
-#[derive(Deserialize, Default)]
-struct ThingData {
-    // post (t3)
-    id: Option<String>,
-    title: Option<String>,
-    selftext: Option<String>,
-    subreddit_name_prefixed: Option<String>,
-    url_overridden_by_dest: Option<String>,
-    is_self: Option<bool>,
-    upvote_ratio: Option<f64>,
-    num_comments: Option<i64>,
-    over_18: Option<bool>,
-    spoiler: Option<bool>,
-    stickied: Option<bool>,
-    locked: Option<bool>,
-    link_flair_text: Option<String>,
-
-    // comment (t1)
-    author: Option<String>,
-    body: Option<String>,
-    score: Option<i64>,
-    created_utc: Option<f64>,
-    is_submitter: Option<bool>,
-    depth: Option<i64>,
-    permalink: Option<String>,
-
-    // recursive
-    replies: Option<Replies>,
-}
-
-#[derive(Deserialize)]
-#[serde(untagged)]
-enum Replies {
-    Listing(Listing),
-    #[allow(dead_code)]
-    Empty(String),
+    serde_json::to_value(&thread)
+        .map_err(|e| FetchError::BodyDecode(format!("reddit: serialisation error: {e}")))
 }

 #[cfg(test)]
@ -207,28 +50,17 @@ mod tests {
    use super::*;

    #[test]
-    fn matches_reddit_post_urls() {
+    fn matches_thread_urls() {
        assert!(matches(
            "https://www.reddit.com/r/rust/comments/abc123/some_title/"
        ));
-        assert!(matches(
-            "https://reddit.com/r/rust/comments/abc123/some_title"
-        ));
        assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
+        assert!(matches("https://reddit.com/r/rust/comments/abc/x"));
    }

    #[test]
-    fn rejects_non_post_reddit_urls() {
+    fn rejects_listing_and_non_reddit() {
        assert!(!matches("https://www.reddit.com/r/rust"));
-        assert!(!matches("https://www.reddit.com/user/foo"));
-        assert!(!matches("https://example.com/r/rust/comments/x"));
-    }
-
-    #[test]
-    fn json_url_appends_suffix_and_drops_query() {
-        assert_eq!(
-            build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
-            "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
-        );
+        assert!(!matches("https://example.com/r/rust/comments/abc/x"));
    }
 }
--- a/crates/webclaw-fetch/src/reddit.rs
+++ b/crates/webclaw-fetch/src/reddit.rs
@ -1,172 +1,56 @@
-/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
-///
-/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
-/// loaded client-side. Appending `.json` to any Reddit URL returns the full
-/// comment tree as structured JSON, which we convert to clean markdown.
-use serde::Deserialize;
-use tracing::debug;
-use webclaw_core::{Content, ExtractionResult, Metadata};
+//! Reddit URL helpers for the fetch layer.
+//!
+//! The JSON API (`*.json`) is blocked. We rewrite all Reddit hosts to
+//! `old.reddit.com`, which serves stable server-rendered HTML that
+//! `webclaw-core::reddit` parses directly.

-/// Check if a URL points to a Reddit post/comment page.
 pub fn is_reddit_url(url: &str) -> bool {
-    let host = url
-        .split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("");
-    matches!(
-        host,
-        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
-    )
+    webclaw_core::reddit::is_reddit_url(url)
 }

-/// Build the `.json` URL from a Reddit page URL.
-pub fn json_url(url: &str) -> String {
-    let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
-    format!("{clean}.json")
+/// Rewrite any Reddit host to old.reddit.com, preserving the scheme, path, query, and fragment.
+pub fn to_old_reddit_url(url: &str) -> String {
+    let Some(scheme_end) = url.find("://") else {
+        return url.to_string();
+    };
+    let after = &url[scheme_end + 3..];
+    let host_end = after.find(['/', '?', '#']).unwrap_or(after.len());
+    let scheme = &url[..scheme_end + 3];
+    let rest = &after[host_end..];
+    format!("{scheme}old.reddit.com{rest}")
 }

-/// Convert Reddit JSON API response into an ExtractionResult.
-pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
-    let listings: Vec<Listing> =
-        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
+#[cfg(test)]
+mod tests {
+    use super::*;

-    let mut markdown = String::new();
-    let mut title = None;
-    let mut author = None;
-    let mut subreddit = None;
-
-    // First listing = the post itself
-    if let Some(post_listing) = listings.first() {
-        for child in &post_listing.data.children {
-            if child.kind == "t3" {
-                let d = &child.data;
-                title = d.title.clone();
-                author = d.author.clone();
-                subreddit = d.subreddit_name_prefixed.clone();
-
-                if let Some(ref t) = title {
-                    markdown.push_str(&format!("# {t}\n\n"));
-                }
-                if let (Some(a), Some(sr)) = (&author, &subreddit) {
-                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
-                }
-                if let Some(ref body) = d.selftext
-                    && !body.is_empty()
-                {
-                    markdown.push_str(body);
-                    markdown.push_str("\n\n");
-                }
-                if let Some(ref url_field) = d.url_overridden_by_dest
-                    && !url_field.is_empty()
-                {
-                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
-                }
-                markdown.push_str("---\n\n");
-            }
-        }
+    #[test]
+    fn rewrites_www_to_old() {
+        assert_eq!(
+            to_old_reddit_url("https://www.reddit.com/r/rust/comments/abc/x/"),
+            "https://old.reddit.com/r/rust/comments/abc/x/"
+        );
    }

-    // Second listing = comment tree
-    if let Some(comment_listing) = listings.get(1) {
-        markdown.push_str("## Comments\n\n");
-        for child in &comment_listing.data.children {
-            render_comment(child, 0, &mut markdown);
-        }
+    #[test]
+    fn rewrites_bare_to_old() {
+        assert_eq!(
+            to_old_reddit_url("https://reddit.com/r/rust/"),
+            "https://old.reddit.com/r/rust/"
+        );
    }

-    let word_count = markdown.split_whitespace().count();
-    debug!(word_count, "reddit json extracted");
-
-    Ok(ExtractionResult {
-        metadata: Metadata {
-            title,
-            description: None,
-            author,
-            published_date: None,
-            language: Some("en".into()),
-            url: Some(url.to_string()),
-            site_name: subreddit,
-            image: None,
-            favicon: None,
-            word_count,
-        },
-        content: Content {
-            markdown,
-            plain_text: String::new(),
-            links: vec![],
-            images: vec![],
-            code_blocks: vec![],
-            raw_html: None,
-        },
-        domain_data: None,
-        structured_data: vec![],
-    })
-}
-
-fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
-    if thing.kind != "t1" {
-        return;
+    #[test]
+    fn preserves_old_reddit_unchanged() {
+        let url = "https://old.reddit.com/r/rust/comments/abc/x/?context=3";
+        assert_eq!(to_old_reddit_url(url), url);
    }
-    let d = &thing.data;
-    let indent = "  ".repeat(depth);
-    let author = d.author.as_deref().unwrap_or("[deleted]");
-    let body = d.body.as_deref().unwrap_or("[removed]");
-    let score = d.score.unwrap_or(0);

-    out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
-    for line in body.lines() {
-        out.push_str(&format!("{indent}  {line}\n"));
-    }
-    out.push('\n');
-
-    // Recurse into replies
-    if let Some(Replies::Listing(listing)) = &d.replies {
-        for child in &listing.data.children {
-            render_comment(child, depth + 1, out);
-        }
+    #[test]
+    fn preserves_query_and_hash() {
+        assert_eq!(
+            to_old_reddit_url("https://www.reddit.com/r/rust/?sort=top#anchor"),
+            "https://old.reddit.com/r/rust/?sort=top#anchor"
+        );
    }
 }
-
-// --- Reddit JSON types (minimal) ---
-
-#[derive(Deserialize)]
-struct Listing {
-    data: ListingData,
-}
-
-#[derive(Deserialize)]
-struct ListingData {
-    children: Vec<Thing>,
-}
-
-#[derive(Deserialize)]
-struct Thing {
-    kind: String,
-    data: ThingData,
-}
-
-#[derive(Deserialize)]
-struct ThingData {
-    // Post fields (t3)
-    title: Option<String>,
-    selftext: Option<String>,
-    subreddit_name_prefixed: Option<String>,
-    url_overridden_by_dest: Option<String>,
-    // Comment fields (t1)
-    author: Option<String>,
-    body: Option<String>,
-    score: Option<i64>,
-    replies: Option<Replies>,
-}
-
-/// Reddit replies can be either a nested Listing or an empty string.
-#[derive(Deserialize)]
-#[serde(untagged)]
-enum Replies {
-    Listing(Listing),
-    #[allow(dead_code)]
-    Empty(String),
-}