feat(reddit): parse old.reddit.com HTML instead of the dead .json API

Reddit blocked unauthenticated `.json` access, so the previous extractor returned block pages or timed out on every thread. Switch to parsing old.reddit.com's server-rendered HTML, which needs no API key or JS. Fetch layer: - Rewrite every Reddit host to old.reddit.com before fetching; drop all `.json` URL handling and the JSON response parser. Extraction (webclaw-core::reddit): - New HTML parser producing a typed post + nested comment tree. - Comments nest structurally (.comment > .child > .sitetable > .comment); old.reddit omits a usable depth attribute, so the tree is walked recursively. Bodies live in .entry > form > .usertext-body > .md. - Post metadata: title, author, subreddit, score, comment count (data-comments-count), self-vs-link (self class / self.* domain), flair, self-text body. - Comment scores read the .score.unvoted title (the displayed value, not the ±1 vote-state siblings); hidden scores are None, not 0. - Deleted comments are kept in place so their replies aren't orphaned; "load more comments" stubs are skipped. Markdown output: - Reply nesting via blockquote depth (avoids 4-space indentation turning text and code fences into broken indented-code blocks). - Links keep their target as [text](url); root-relative reddit links resolve against old.reddit.com. Nested lists indent correctly. - A recognised but unparseable /comments/ page returns no content rather than falling through to generic extraction of Reddit chrome. Tests: regression suite runs against real old.reddit.com fixtures (testdata/reddit/), the ground truth that surfaced the parsing and markdown bugs synthetic HTML had hidden. Fixtures are excluded from the published crate.
2026-06-08 22:25:12 +02:00 · 2026-06-04 16:16:08 +02:00 · 2026-06-04 16:16:08 +02:00 · 217bfe088b
commit 217bfe088b
parent 3b7d11328e
11 changed files with 2522 additions and 391 deletions
--- a/crates/webclaw-fetch/src/reddit.rs
+++ b/crates/webclaw-fetch/src/reddit.rs
@ -1,172 +1,56 @@
-/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
-///
-/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
-/// loaded client-side. Appending `.json` to any Reddit URL returns the full
-/// comment tree as structured JSON, which we convert to clean markdown.
-use serde::Deserialize;
-use tracing::debug;
-use webclaw_core::{Content, ExtractionResult, Metadata};
+//! Reddit URL helpers for the fetch layer.
+//!
+//! The JSON API (`*.json`) is blocked. We rewrite all Reddit hosts to
+//! `old.reddit.com`, which serves stable server-rendered HTML that
+//! `webclaw-core::reddit` parses directly.

-/// Check if a URL points to a Reddit post/comment page.
 pub fn is_reddit_url(url: &str) -> bool {
-    let host = url
-        .split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("");
-    matches!(
-        host,
-        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
-    )
+    webclaw_core::reddit::is_reddit_url(url)
 }

-/// Build the `.json` URL from a Reddit page URL.
-pub fn json_url(url: &str) -> String {
-    let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
-    format!("{clean}.json")
+/// Rewrite any Reddit host to old.reddit.com, preserving path and query.
+pub fn to_old_reddit_url(url: &str) -> String {
+    let Some(scheme_end) = url.find("://") else {
+        return url.to_string();
+    };
+    let after = &url[scheme_end + 3..];
+    let host_end = after.find(['/', '?', '#']).unwrap_or(after.len());
+    let scheme = &url[..scheme_end + 3];
+    let rest = &after[host_end..];
+    format!("{scheme}old.reddit.com{rest}")
 }

-/// Convert Reddit JSON API response into an ExtractionResult.
-pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
-    let listings: Vec<Listing> =
-        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
+#[cfg(test)]
+mod tests {
+    use super::*;

-    let mut markdown = String::new();
-    let mut title = None;
-    let mut author = None;
-    let mut subreddit = None;
-
-    // First listing = the post itself
-    if let Some(post_listing) = listings.first() {
-        for child in &post_listing.data.children {
-            if child.kind == "t3" {
-                let d = &child.data;
-                title = d.title.clone();
-                author = d.author.clone();
-                subreddit = d.subreddit_name_prefixed.clone();
-
-                if let Some(ref t) = title {
-                    markdown.push_str(&format!("# {t}\n\n"));
-                }
-                if let (Some(a), Some(sr)) = (&author, &subreddit) {
-                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
-                }
-                if let Some(ref body) = d.selftext
-                    && !body.is_empty()
-                {
-                    markdown.push_str(body);
-                    markdown.push_str("\n\n");
-                }
-                if let Some(ref url_field) = d.url_overridden_by_dest
-                    && !url_field.is_empty()
-                {
-                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
-                }
-                markdown.push_str("---\n\n");
-            }
-        }
+    #[test]
+    fn rewrites_www_to_old() {
+        assert_eq!(
+            to_old_reddit_url("https://www.reddit.com/r/rust/comments/abc/x/"),
+            "https://old.reddit.com/r/rust/comments/abc/x/"
+        );
    }

-    // Second listing = comment tree
-    if let Some(comment_listing) = listings.get(1) {
-        markdown.push_str("## Comments\n\n");
-        for child in &comment_listing.data.children {
-            render_comment(child, 0, &mut markdown);
-        }
+    #[test]
+    fn rewrites_bare_to_old() {
+        assert_eq!(
+            to_old_reddit_url("https://reddit.com/r/rust/"),
+            "https://old.reddit.com/r/rust/"
+        );
    }

-    let word_count = markdown.split_whitespace().count();
-    debug!(word_count, "reddit json extracted");
-
-    Ok(ExtractionResult {
-        metadata: Metadata {
-            title,
-            description: None,
-            author,
-            published_date: None,
-            language: Some("en".into()),
-            url: Some(url.to_string()),
-            site_name: subreddit,
-            image: None,
-            favicon: None,
-            word_count,
-        },
-        content: Content {
-            markdown,
-            plain_text: String::new(),
-            links: vec![],
-            images: vec![],
-            code_blocks: vec![],
-            raw_html: None,
-        },
-        domain_data: None,
-        structured_data: vec![],
-    })
-}
-
-fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
-    if thing.kind != "t1" {
-        return;
+    #[test]
+    fn preserves_old_reddit_unchanged() {
+        let url = "https://old.reddit.com/r/rust/comments/abc/x/?context=3";
+        assert_eq!(to_old_reddit_url(url), url);
    }
-    let d = &thing.data;
-    let indent = "  ".repeat(depth);
-    let author = d.author.as_deref().unwrap_or("[deleted]");
-    let body = d.body.as_deref().unwrap_or("[removed]");
-    let score = d.score.unwrap_or(0);

-    out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
-    for line in body.lines() {
-        out.push_str(&format!("{indent}  {line}\n"));
-    }
-    out.push('\n');
-
-    // Recurse into replies
-    if let Some(Replies::Listing(listing)) = &d.replies {
-        for child in &listing.data.children {
-            render_comment(child, depth + 1, out);
-        }
+    #[test]
+    fn preserves_query_and_hash() {
+        assert_eq!(
+            to_old_reddit_url("https://www.reddit.com/r/rust/?sort=top#anchor"),
+            "https://old.reddit.com/r/rust/?sort=top#anchor"
+        );
    }
 }
-
-// --- Reddit JSON types (minimal) ---
-
-#[derive(Deserialize)]
-struct Listing {
-    data: ListingData,
-}
-
-#[derive(Deserialize)]
-struct ListingData {
-    children: Vec<Thing>,
-}
-
-#[derive(Deserialize)]
-struct Thing {
-    kind: String,
-    data: ThingData,
-}
-
-#[derive(Deserialize)]
-struct ThingData {
-    // Post fields (t3)
-    title: Option<String>,
-    selftext: Option<String>,
-    subreddit_name_prefixed: Option<String>,
-    url_overridden_by_dest: Option<String>,
-    // Comment fields (t1)
-    author: Option<String>,
-    body: Option<String>,
-    score: Option<i64>,
-    replies: Option<Replies>,
-}
-
-/// Reddit replies can be either a nested Listing or an empty string.
-#[derive(Deserialize)]
-#[serde(untagged)]
-enum Replies {
-    Listing(Listing),
-    #[allow(dead_code)]
-    Empty(String),
-}