feat(reddit): parse old.reddit.com HTML instead of the dead .json API

Reddit blocked unauthenticated `.json` access, so the previous extractor returned block pages or timed out on every thread. Switch to parsing old.reddit.com's server-rendered HTML, which needs no API key or JS. Fetch layer: - Rewrite every Reddit host to old.reddit.com before fetching; drop all `.json` URL handling and the JSON response parser. Extraction (webclaw-core::reddit): - New HTML parser producing a typed post + nested comment tree. - Comments nest structurally (.comment > .child > .sitetable > .comment); old.reddit omits a usable depth attribute, so the tree is walked recursively. Bodies live in .entry > form > .usertext-body > .md. - Post metadata: title, author, subreddit, score, comment count (data-comments-count), self-vs-link (self class / self.* domain), flair, self-text body. - Comment scores read the .score.unvoted title (the displayed value, not the ±1 vote-state siblings); hidden scores are None, not 0. - Deleted comments are kept in place so their replies aren't orphaned; "load more comments" stubs are skipped. Markdown output: - Reply nesting via blockquote depth (avoids 4-space indentation turning text and code fences into broken indented-code blocks). - Links keep their target as [text](url); root-relative reddit links resolve against old.reddit.com. Nested lists indent correctly. - A recognised but unparseable /comments/ page returns no content rather than falling through to generic extraction of Reddit chrome. Tests: regression suite runs against real old.reddit.com fixtures (testdata/reddit/), the ground truth that surfaced the parsing and markdown bugs synthetic HTML had hidden. Fixtures are excluded from the published crate.
2026-07-23 07:21:02 +02:00 · 2026-06-04 16:16:08 +02:00 · 2026-06-04 16:16:08 +02:00 · 217bfe088b
commit 217bfe088b
parent 3b7d11328e
11 changed files with 2522 additions and 391 deletions
--- a/crates/webclaw-core/Cargo.toml
+++ b/crates/webclaw-core/Cargo.toml
@ -4,6 +4,10 @@ description = "Pure HTML content extraction engine for LLMs"
 version.workspace = true
 edition.workspace = true
 license.workspace = true
+# Reddit regression fixtures are real old.reddit.com pages read at test time;
+# they're large and only needed to run the test suite from the repo, so keep
+# them out of the published crate.
+exclude = ["testdata/reddit/*.html"]

 [features]
 default = ["quickjs"]
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -17,6 +17,7 @@ pub mod markdown;
 pub mod metadata;
 #[allow(dead_code)]
 pub(crate) mod noise;
+pub mod reddit;
 pub mod structured_data;
 pub mod types;
 pub mod youtube;
@ -94,6 +95,24 @@ fn extract_with_options_inner(
        return Err(ExtractError::NoContent);
    }

+    // Reddit fast path: parse old.reddit.com HTML directly.
+    // The fetch layer rewrites all Reddit hosts to old.reddit.com before
+    // calling extract, so we always get stable server-rendered HTML here.
+    if let Some(u) = url
+        && reddit::is_reddit_url(u)
+    {
+        if let Some(result) = reddit::try_extract(html, u) {
+            return Ok(result);
+        }
+        // A recognised comment thread that we couldn't parse (Reddit markup
+        // change, or a block/challenge page) — don't fall through to generic
+        // extraction, which would emit Reddit nav/sidebar chrome. Listings
+        // and profiles (no `/comments/`) intentionally fall through below.
+        if u.contains("/comments/") {
+            return Err(ExtractError::NoContent);
+        }
+    }
+
    // YouTube fast path: if the URL is a YouTube video page, try extracting
    // structured metadata from ytInitialPlayerResponse before DOM scoring.
    // This gives LLMs a clean, structured view of video metadata.
--- a/crates/webclaw-core/src/reddit.rs
+++ b/crates/webclaw-core/src/reddit.rs
@ -0,0 +1,968 @@
+//! Reddit thread extractor — parses old.reddit.com HTML directly.
+//!
+//! old.reddit.com serves fully server-rendered HTML with stable class names
+//! and data attributes. No JS, no API key, no `.json` trick needed.
+
+use scraper::{ElementRef, Html, Selector};
+use serde::Serialize;
+
+use crate::{Content, DomainData, DomainType, ExtractionResult, Metadata};
+
+// ─── Public types ──────────────────────────────────────────────────────────────
+
+#[derive(Serialize)]
+pub struct RedditPost {
+    pub id: Option<String>,
+    pub title: String,
+    pub author: String,
+    pub subreddit: Option<String>,
+    pub score: i64,
+    pub body: Option<String>,
+    pub num_comments: usize,
+    pub permalink: String,
+    pub url: Option<String>,
+    pub is_self: bool,
+    pub flair: Option<String>,
+    pub created_utc: Option<String>,
+}
+
+#[derive(Serialize)]
+pub struct RedditComment {
+    pub id: Option<String>,
+    pub author: String,
+    pub body: String,
+    /// `None` when Reddit hides the score (fresh comments). Distinct from
+    /// `Some(0)`, which is a real net-zero score.
+    pub score: Option<i64>,
+    pub depth: usize,
+    pub is_op: bool,
+    pub created_utc: Option<String>,
+    pub replies: Vec<RedditComment>,
+}
+
+#[derive(Serialize)]
+pub struct RedditThread {
+    #[serde(rename = "url")]
+    pub source_url: String,
+    pub post: Option<RedditPost>,
+    pub comments: Vec<RedditComment>,
+}
+
+// ─── Public API ────────────────────────────────────────────────────────────────
+
+pub fn is_reddit_url(url: &str) -> bool {
+    matches!(
+        host_of(url),
+        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
+    )
+}
+
+/// Try to parse a Reddit thread from old.reddit.com HTML.
+/// Returns `None` if the page doesn't have recognisable Reddit structure.
+pub fn try_extract_thread(html: &str, url: &str) -> Option<RedditThread> {
+    if !url.contains("/comments/") {
+        return None;
+    }
+    let doc = Html::parse_document(html);
+    let post = parse_post(&doc);
+    let op = post.as_ref().map(|p| p.author.as_str()).unwrap_or("");
+    let comments = parse_comments(&doc, op);
+
+    if post.is_none() && comments.is_empty() {
+        return None;
+    }
+
+    Some(RedditThread {
+        source_url: url.to_string(),
+        post,
+        comments,
+    })
+}
+
+/// Entry point for `webclaw-core`'s extraction fast path.
+pub fn try_extract(html: &str, url: &str) -> Option<ExtractionResult> {
+    let thread = try_extract_thread(html, url)?;
+    Some(to_extraction_result(&thread))
+}
+
+// ─── ExtractionResult builder ──────────────────────────────────────────────────
+
+fn to_extraction_result(thread: &RedditThread) -> ExtractionResult {
+    let md = to_markdown(thread);
+    let plain = plain_text(&md);
+    let wc = md.split_whitespace().count();
+
+    let (title, author, site_name) = thread
+        .post
+        .as_ref()
+        .map(|p| {
+            (
+                Some(p.title.clone()),
+                Some(p.author.clone()),
+                p.subreddit.clone(),
+            )
+        })
+        .unwrap_or_default();
+
+    ExtractionResult {
+        metadata: Metadata {
+            title,
+            description: None,
+            author,
+            published_date: None,
+            language: Some("en".to_string()),
+            url: Some(thread.source_url.clone()),
+            site_name,
+            image: None,
+            favicon: None,
+            word_count: wc,
+        },
+        content: Content {
+            markdown: md,
+            plain_text: plain,
+            links: vec![],
+            images: vec![],
+            code_blocks: vec![],
+            raw_html: None,
+        },
+        domain_data: Some(DomainData {
+            domain_type: DomainType::Social,
+        }),
+        structured_data: vec![],
+    }
+}
+
+// ─── Markdown rendering ────────────────────────────────────────────────────────
+
+pub fn to_markdown(thread: &RedditThread) -> String {
+    let mut out = String::new();
+
+    if let Some(p) = &thread.post {
+        out.push_str(&format!("# {}\n\n", p.title));
+
+        let pts = pt_label(Some(p.score));
+        let cmt = match p.num_comments {
+            0 => String::new(),
+            1 => " · 1 comment".to_string(),
+            n => format!(" · {n} comments"),
+        };
+        let sub = p.subreddit.as_deref().unwrap_or("?");
+        out.push_str(&format!("**u/{}** · r/{sub} · {pts}{cmt}\n\n", p.author));
+
+        if let Some(ref body) = p.body
+            && !body.is_empty()
+        {
+            out.push_str(body);
+            out.push_str("\n\n");
+        }
+        if let Some(ref link) = p.url
+            && !p.is_self
+        {
+            out.push_str(&format!("[Link]({link})\n\n"));
+        }
+        out.push_str("---\n\n");
+    }
+
+    if !thread.comments.is_empty() {
+        out.push_str("## Comments\n\n");
+        for c in &thread.comments {
+            render_comment(c, &mut out);
+        }
+    }
+
+    collapse_blank_lines(out.trim_end())
+}
+
+/// Render one comment + its replies. Nesting is expressed with blockquote
+/// depth (`> ` per level) rather than leading spaces: space-indentation of
+/// 4+ would turn ordinary text and ``` fences into CommonMark indented code
+/// blocks, corrupting any comment at depth ≥ 2.
+fn render_comment(c: &RedditComment, out: &mut String) {
+    let q = "> ".repeat(c.depth);
+    let blank = ">".repeat(c.depth);
+    let author = if c.is_op {
+        format!("**u/{} [OP]**", c.author)
+    } else {
+        format!("**u/{}**", c.author)
+    };
+    out.push_str(&format!("{q}{author} · {}\n", pt_label(c.score)));
+    for line in c.body.lines() {
+        if line.is_empty() {
+            out.push_str(&blank);
+            out.push('\n');
+        } else {
+            out.push_str(&q);
+            out.push_str(line);
+            out.push('\n');
+        }
+    }
+    out.push('\n');
+    for reply in &c.replies {
+        render_comment(reply, out);
+    }
+}
+
+fn pt_label(n: Option<i64>) -> String {
+    match n {
+        None => "score hidden".to_string(),
+        Some(1) => "1 pt".to_string(),
+        Some(-1) => "-1 pt".to_string(),
+        Some(n) => format!("{n} pts"),
+    }
+}
+
+/// Collapse runs of 3+ newlines down to a blank-line separator so the
+/// blockquote prefixes and `<pre>` spacing don't leave large gaps.
+fn collapse_blank_lines(s: &str) -> String {
+    let mut out = String::with_capacity(s.len());
+    let mut newlines = 0;
+    for ch in s.chars() {
+        if ch == '\n' {
+            newlines += 1;
+            if newlines <= 2 {
+                out.push(ch);
+            }
+        } else {
+            newlines = 0;
+            out.push(ch);
+        }
+    }
+    out
+}
+
+fn plain_text(md: &str) -> String {
+    md.lines()
+        .map(|l| {
+            // Strip a single leading blockquote / heading marker, then drop
+            // emphasis markers. Greedy char-class stripping (the old approach)
+            // ate legitimate content like ">"-prefixed quotes.
+            let l = l.trim_start();
+            let l = l
+                .strip_prefix("> ")
+                .or_else(|| l.strip_prefix('>'))
+                .unwrap_or(l);
+            let l = l.trim_start_matches('#').trim_start();
+            l.replace("**", "")
+                .replace("~~", "")
+                .replace(['*', '`'], "")
+        })
+        .collect::<Vec<_>>()
+        .join("\n")
+}
+
+// ─── HTML parsing ──────────────────────────────────────────────────────────────
+
+fn parse_post(doc: &Html) -> Option<RedditPost> {
+    let sel = Selector::parse("#siteTable .thing.link").ok()?;
+    let thing = doc.select(&sel).next()?;
+    let v = thing.value();
+
+    let id = v
+        .attr("data-fullname")
+        .map(|s| s.trim_start_matches("t3_").to_string());
+    let author = v.attr("data-author").unwrap_or("[deleted]").to_string();
+    let subreddit = v.attr("data-subreddit").map(str::to_string);
+    let score: i64 = v
+        .attr("data-score")
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(0);
+    let num_comments: usize = v
+        .attr("data-comments-count")
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(0);
+    let permalink_path = v.attr("data-permalink").unwrap_or("");
+    let permalink = format!("https://old.reddit.com{permalink_path}");
+    // Self-posts carry the `self` class and a `self.<sub>` domain; their
+    // data-url points back at the permalink rather than an external site.
+    let is_self = v.has_class("self", scraper::CaseSensitivity::AsciiCaseInsensitive)
+        || v.attr("data-domain")
+            .is_some_and(|d| d.starts_with("self."));
+    let link_url = v.attr("data-url").map(str::to_string);
+    let url = if is_self { None } else { link_url };
+
+    // Title
+    let sel_title = Selector::parse(".title a.title").ok()?;
+    let title = thing
+        .select(&sel_title)
+        .next()
+        .map(|el| el.text().collect::<String>().trim().to_string())
+        .filter(|s| !s.is_empty())?;
+
+    // Flair
+    let flair = Selector::parse(".linkflairlabel")
+        .ok()
+        .and_then(|s| thing.select(&s).next())
+        .map(|el| el.text().collect::<String>().trim().to_string())
+        .filter(|s| !s.is_empty());
+
+    // Self-text body: thing > .entry > .expando > .usertext-body [> .md]
+    let body = direct_child(thing, "entry")
+        .and_then(|entry| find_class(entry, "expando"))
+        .and_then(|expando| find_class(expando, "usertext-body"))
+        .and_then(|ut| find_class(ut, "md"))
+        .map(md_to_markdown)
+        .filter(|s| !s.is_empty());
+
+    // Datetime
+    let created_utc = Selector::parse("time[datetime]")
+        .ok()
+        .and_then(|s| thing.select(&s).next())
+        .and_then(|t| t.value().attr("datetime"))
+        .map(str::to_string);
+
+    Some(RedditPost {
+        id,
+        title,
+        author,
+        subreddit,
+        score,
+        body,
+        num_comments,
+        permalink,
+        url,
+        is_self,
+        flair,
+        created_utc,
+    })
+}
+
+// ─── Comment parsing ───────────────────────────────────────────────────────────
+//
+// old.reddit.com nests comments structurally, not via a depth attribute:
+//
+//   .commentarea
+//     .sitetable.nestedlisting
+//       .comment.thing                          ← root comment
+//         .entry → form → .usertext-body → .md  ← its own body
+//         .child
+//           .sitetable.listing
+//             .comment.thing                    ← reply (recurse)
+//
+// `data-depth`/`data-replies` are absent or always "0" in the logged-out
+// HTML, so we walk the tree by recursing into each comment's `.child`.
+
+fn parse_comments(doc: &Html, op: &str) -> Vec<RedditComment> {
+    // Root listing is `.sitetable.nestedlisting` inside `.commentarea`
+    // (note: `commentarea` is a class on old.reddit, not an id). Fall back
+    // to the first `.nestedlisting` anywhere for comment-permalink pages.
+    let listing = Selector::parse(".commentarea .sitetable.nestedlisting")
+        .ok()
+        .and_then(|s| doc.select(&s).next())
+        .or_else(|| {
+            Selector::parse(".sitetable.nestedlisting")
+                .ok()
+                .and_then(|s| doc.select(&s).next())
+        });
+
+    match listing {
+        Some(l) => walk_comment_level(l, op, 0),
+        None => vec![],
+    }
+}
+
+/// Parse the direct-child `.comment.thing` elements of a comment listing.
+fn walk_comment_level(listing: ElementRef, op: &str, depth: usize) -> Vec<RedditComment> {
+    listing
+        .children()
+        .filter_map(ElementRef::wrap)
+        .filter(|c| {
+            let val = c.value();
+            val.has_class("comment", scraper::CaseSensitivity::AsciiCaseInsensitive)
+                && val.has_class("thing", scraper::CaseSensitivity::AsciiCaseInsensitive)
+        })
+        .filter_map(|c| parse_one_comment(c, op, depth))
+        .collect()
+}
+
+fn parse_one_comment(c: ElementRef, op: &str, depth: usize) -> Option<RedditComment> {
+    let v = c.value();
+
+    // "load more comments" placeholders are `.thing` with type=morechildren.
+    // They carry a t1_ fullname but no real content — skip them.
+    if v.attr("data-type") == Some("morechildren")
+        || v.has_class(
+            "morechildren",
+            scraper::CaseSensitivity::AsciiCaseInsensitive,
+        )
+    {
+        return None;
+    }
+
+    let is_deleted = v.has_class("deleted", scraper::CaseSensitivity::AsciiCaseInsensitive);
+    let id = v
+        .attr("data-fullname")
+        .map(|s| s.trim_start_matches("t1_").to_string());
+    let author = v
+        .attr("data-author")
+        .filter(|a| !a.is_empty())
+        .unwrap_or("[deleted]")
+        .to_string();
+
+    // Own body lives in `.entry > form > .usertext-body > .md`. `.child`
+    // (nested replies) is a sibling of `.entry`, so descending within
+    // `.entry` never crosses into a reply's body.
+    let entry = direct_child(c, "entry");
+    let body = entry
+        .and_then(|e| find_class(e, "usertext-body"))
+        .and_then(|ut| find_class(ut, "md"))
+        .map(md_to_markdown)
+        .filter(|s| !s.is_empty())
+        .unwrap_or_else(|| {
+            if is_deleted {
+                "[removed]".into()
+            } else {
+                String::new()
+            }
+        });
+
+    // Displayed score is `.score.unvoted`, whose `title` holds the exact
+    // integer (the sibling likes/dislikes spans are ±1). Hidden-score
+    // comments have no `.score.unvoted` span, so `comment_score` returns
+    // None — kept distinct from a genuine 0.
+    let score = entry.and_then(comment_score);
+
+    let created_utc = entry
+        .and_then(|e| Selector::parse("time[datetime]").ok().map(|s| (e, s)))
+        .and_then(|(e, s)| e.select(&s).next())
+        .and_then(|t| t.value().attr("datetime"))
+        .map(str::to_string);
+
+    let is_op = !is_deleted && author != "[deleted]" && author == op;
+
+    // Replies: `.comment > .child > .sitetable > .comment`.
+    let replies = direct_child(c, "child")
+        .and_then(|child| direct_child(child, "sitetable"))
+        .map(|st| walk_comment_level(st, op, depth + 1))
+        .unwrap_or_default();
+
+    Some(RedditComment {
+        id,
+        author,
+        body,
+        score,
+        depth,
+        is_op,
+        created_utc,
+        replies,
+    })
+}
+
+/// Read a comment's score from the `.score.unvoted` span inside `.entry`.
+/// Prefers the `title` attribute (exact integer); falls back to the text.
+/// Returns `None` when Reddit hides the score (no `.score.unvoted` span).
+fn comment_score(entry: ElementRef) -> Option<i64> {
+    let sel = Selector::parse("span.score.unvoted").ok()?;
+    let span = entry.select(&sel).next()?;
+    span.value()
+        .attr("title")
+        .and_then(|t| t.trim().parse().ok())
+        .or_else(|| parse_score(&span.text().collect::<String>()))
+}
+
+// ─── DOM helpers ───────────────────────────────────────────────────────────────
+
+/// First direct child element whose class list includes `class`.
+fn direct_child<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
+    el.children().filter_map(ElementRef::wrap).find(|c| {
+        c.value()
+            .has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
+    })
+}
+
+/// First descendant (any depth) whose class list includes `class`.
+fn find_class<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
+    el.children().filter_map(ElementRef::wrap).find_map(|c| {
+        if c.value()
+            .has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
+        {
+            Some(c)
+        } else {
+            find_class(c, class)
+        }
+    })
+}
+
+fn parse_score(text: &str) -> Option<i64> {
+    text.split_whitespace()
+        .next()
+        .map(|w| w.replace('−', "-"))
+        .and_then(|w| w.parse().ok())
+}
+
+// ─── .md div → markdown ────────────────────────────────────────────────────────
+
+fn md_to_markdown(el: ElementRef) -> String {
+    let mut out = String::new();
+    render_children(el, &mut out);
+    out.trim().to_string()
+}
+
+fn render_children(el: ElementRef, out: &mut String) {
+    use scraper::node::Node;
+    for child in el.children() {
+        match child.value() {
+            Node::Text(t) => out.push_str(t.as_ref()),
+            Node::Element(_) => {
+                if let Some(c) = ElementRef::wrap(child) {
+                    render_node(c, out);
+                }
+            }
+            _ => {}
+        }
+    }
+}
+
+fn render_node(el: ElementRef, out: &mut String) {
+    match el.value().name() {
+        "p" | "div" => {
+            let mut inner = String::new();
+            render_children(el, &mut inner);
+            let t = inner.trim();
+            if !t.is_empty() {
+                out.push_str(t);
+                out.push_str("\n\n");
+            }
+        }
+        "br" => out.push('\n'),
+        "strong" | "b" => {
+            let t: String = el.text().collect();
+            let t = t.trim();
+            if !t.is_empty() {
+                out.push_str(&format!("**{t}**"));
+            }
+        }
+        "em" | "i" => {
+            let t: String = el.text().collect();
+            let t = t.trim();
+            if !t.is_empty() {
+                out.push_str(&format!("*{t}*"));
+            }
+        }
+        "del" | "s" | "strike" => {
+            let t: String = el.text().collect();
+            let t = t.trim();
+            if !t.is_empty() {
+                out.push_str(&format!("~~{t}~~"));
+            }
+        }
+        "code" => {
+            let t: String = el.text().collect();
+            out.push('`');
+            out.push_str(t.trim());
+            out.push('`');
+        }
+        "pre" => {
+            let t: String = el.text().collect();
+            out.push_str("```\n");
+            out.push_str(t.trim_end_matches('\n'));
+            out.push_str("\n```\n\n");
+        }
+        "a" => {
+            let text: String = el.text().collect();
+            let text = text.trim();
+            if !text.is_empty() {
+                // Preserve the destination as a markdown link. Resolve
+                // root-relative reddit hrefs (/r/, /user/, /wiki/, ...) and
+                // drop non-navigational ones (javascript:, #fragment, mailto:).
+                let href = el.value().attr("href").unwrap_or("");
+                if href.starts_with("http://") || href.starts_with("https://") {
+                    out.push_str(&format!("[{text}]({href})"));
+                } else if href.starts_with('/') {
+                    out.push_str(&format!("[{text}](https://old.reddit.com{href})"));
+                } else {
+                    out.push_str(text);
+                }
+            }
+        }
+        "blockquote" => {
+            let mut inner = String::new();
+            render_children(el, &mut inner);
+            let trimmed = inner.trim();
+            for line in trimmed.lines() {
+                out.push('>');
+                if !line.is_empty() {
+                    out.push(' ');
+                    out.push_str(line);
+                }
+                out.push('\n');
+            }
+            out.push('\n');
+        }
+        "ul" => render_list(el, false, 0, out),
+        "ol" => render_list(el, true, 0, out),
+        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
+            let level = el
+                .value()
+                .name()
+                .chars()
+                .nth(1)
+                .and_then(|c| c.to_digit(10))
+                .unwrap_or(2) as usize;
+            let t: String = el.text().collect();
+            let t = t.trim();
+            if !t.is_empty() {
+                out.push_str(&"#".repeat(level));
+                out.push(' ');
+                out.push_str(t);
+                out.push_str("\n\n");
+            }
+        }
+        "hr" => out.push_str("---\n\n"),
+        "sup" => {
+            let t: String = el.text().collect();
+            out.push_str(t.trim());
+        }
+        // Unknown / generic containers: recurse
+        _ => render_children(el, out),
+    }
+}
+
+/// Render a `<ul>`/`<ol>`, indenting nested lists by two spaces per level so
+/// child items keep their own line instead of being glued to the parent.
+fn render_list(list: ElementRef, ordered: bool, indent: usize, out: &mut String) {
+    use scraper::node::Node;
+    let pad = "  ".repeat(indent);
+    let mut n = 0;
+    for li in list
+        .children()
+        .filter_map(ElementRef::wrap)
+        .filter(|c| c.value().name() == "li")
+    {
+        n += 1;
+        // Inline content of this <li>, excluding nested lists (rendered after).
+        let mut inline = String::new();
+        for child in li.children() {
+            match child.value() {
+                Node::Text(t) => inline.push_str(t.as_ref()),
+                Node::Element(e) if e.name() == "ul" || e.name() == "ol" => {}
+                Node::Element(_) => {
+                    if let Some(c) = ElementRef::wrap(child) {
+                        render_node(c, &mut inline);
+                    }
+                }
+                _ => {}
+            }
+        }
+        let marker = if ordered {
+            format!("{n}. ")
+        } else {
+            "- ".to_string()
+        };
+        out.push_str(&format!("{pad}{marker}{}\n", inline.trim()));
+
+        for child in li.children().filter_map(ElementRef::wrap) {
+            match child.value().name() {
+                "ul" => render_list(child, false, indent + 1, out),
+                "ol" => render_list(child, true, indent + 1, out),
+                _ => {}
+            }
+        }
+    }
+    if indent == 0 {
+        out.push('\n');
+    }
+}
+
+// ─── URL helpers ───────────────────────────────────────────────────────────────
+
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split(['/', '?', '#'])
+        .next()
+        .unwrap_or("")
+}
+
+// ─── Tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn is_reddit_url_recognises_variants() {
+        assert!(is_reddit_url(
+            "https://www.reddit.com/r/rust/comments/abc/x/"
+        ));
+        assert!(is_reddit_url(
+            "https://old.reddit.com/r/rust/comments/abc/x/"
+        ));
+        assert!(is_reddit_url("https://reddit.com/r/rust/comments/abc/x/"));
+        assert!(!is_reddit_url("https://example.com"));
+    }
+
+    #[test]
+    fn try_extract_thread_returns_none_for_listing_url() {
+        let html = "<html><body></body></html>";
+        assert!(try_extract_thread(html, "https://old.reddit.com/r/rust/").is_none());
+    }
+
+    #[test]
+    fn md_to_markdown_basic() {
+        let html =
+            Html::parse_fragment(r#"<div class="md"><p>Hello <strong>world</strong>!</p></div>"#);
+        let sel = Selector::parse(".md").unwrap();
+        let el = html.select(&sel).next().unwrap();
+        let md = md_to_markdown(el);
+        assert!(md.contains("**world**"));
+        assert!(md.contains("Hello"));
+    }
+
+    #[test]
+    fn md_to_markdown_blockquote_and_code() {
+        let html = Html::parse_fragment(
+            r#"<div class="md"><blockquote><p>Quoted</p></blockquote><pre><code>fn main() {}</code></pre></div>"#,
+        );
+        let sel = Selector::parse(".md").unwrap();
+        let el = html.select(&sel).next().unwrap();
+        let md = md_to_markdown(el);
+        assert!(md.contains("> Quoted"));
+        assert!(md.contains("```"));
+        assert!(md.contains("fn main()"));
+    }
+
+    #[test]
+    fn md_to_markdown_link_preserves_href() {
+        let abs = Html::parse_fragment(
+            r#"<div class="md"><p>see <a href="https://example.com/x">this</a></p></div>"#,
+        );
+        let sel = Selector::parse(".md").unwrap();
+        let el = abs.select(&sel).next().unwrap();
+        assert!(md_to_markdown(el).contains("[this](https://example.com/x)"));
+
+        // Root-relative reddit links resolve against old.reddit.com.
+        let rel = Html::parse_fragment(
+            r#"<div class="md"><p><a href="/r/rust/wiki/faq">faq</a></p></div>"#,
+        );
+        let el = rel.select(&sel).next().unwrap();
+        assert!(md_to_markdown(el).contains("[faq](https://old.reddit.com/r/rust/wiki/faq)"));
+
+        // javascript: / fragment hrefs degrade to bare text.
+        let js = Html::parse_fragment(
+            r#"<div class="md"><p><a href="javascript:void(0)">x</a></p></div>"#,
+        );
+        let el = js.select(&sel).next().unwrap();
+        let out = md_to_markdown(el);
+        assert!(out.contains('x') && !out.contains("javascript"));
+    }
+
+    // ── Regression tests against REAL old.reddit.com HTML ──────────────────
+    //
+    // These fixtures are genuine pages fetched from old.reddit.com (see
+    // testdata/reddit/). They are the ground truth — synthetic HTML is too
+    // easy to write to match wrong assumptions, which is exactly how the
+    // first version of this parser shipped silently broken.
+
+    fn fixture(name: &str) -> String {
+        std::fs::read_to_string(format!("testdata/reddit/{name}")).unwrap()
+    }
+
+    fn total_comments(cs: &[RedditComment]) -> usize {
+        cs.len() + cs.iter().map(|c| total_comments(&c.replies)).sum::<usize>()
+    }
+
+    fn collect<'a>(cs: &'a [RedditComment], out: &mut Vec<&'a RedditComment>) {
+        for c in cs {
+            out.push(c);
+            collect(&c.replies, out);
+        }
+    }
+
+    #[test]
+    fn real_link_post_metadata() {
+        // pandas: external-link post (blog.geekuni.com), 34 comments.
+        let html = fixture("pandas_34comments.html");
+        let t = try_extract_thread(
+            &html,
+            "https://old.reddit.com/r/programming/comments/abc123/t/",
+        )
+        .expect("should parse");
+        let p = t.post.expect("post");
+        assert_eq!(p.author, "Horror-Willingness74");
+        assert_eq!(p.subreddit.as_deref(), Some("programming"));
+        assert_eq!(p.score, 43);
+        assert_eq!(p.num_comments, 34, "data-comments-count");
+        assert!(!p.is_self, "external blog link, not a self post");
+        assert_eq!(
+            p.url.as_deref(),
+            Some("https://blog.geekuni.com/2026/06/why-learn-pandas.html")
+        );
+        assert!(p.title.contains("Pandas"));
+    }
+
+    #[test]
+    fn real_self_post_metadata() {
+        // A self-post (text) on r/rust: `self.rust` domain, self-text body,
+        // no external url.
+        let html = fixture("rust_selfpost_36comments.html");
+        let t = try_extract_thread(&html, "https://old.reddit.com/r/rust/comments/abc123/t/")
+            .expect("should parse");
+        let p = t.post.expect("post");
+        assert!(p.is_self, "self.rust domain → self post");
+        assert_eq!(p.url, None, "self posts carry no external url");
+        assert_eq!(p.subreddit.as_deref(), Some("rust"));
+        assert!(
+            p.body
+                .as_deref()
+                .unwrap_or("")
+                .contains("IT project manager"),
+            "self-text body should be extracted: {:?}",
+            p.body
+        );
+    }
+
+    #[test]
+    fn real_comment_bodies_and_scores() {
+        // The original bug: every comment body came back empty because
+        // .usertext-body sits inside a <form>, not directly under .entry.
+        let html = fixture("ebpf_6comments.html");
+        let t = try_extract_thread(
+            &html,
+            "https://old.reddit.com/r/programming/comments/abc123/t/",
+        )
+        .expect("should parse");
+        // 6 comments total: 5 top-level + 1 nested reply (admalledd under ejrh).
+        assert_eq!(t.comments.len(), 5, "5 top-level comments");
+        assert_eq!(total_comments(&t.comments), 6, "6 comments incl. nested");
+        let teerre = t
+            .comments
+            .iter()
+            .find(|c| c.author == "teerre")
+            .expect("teerre");
+        assert!(
+            teerre.body.contains("Very cool blog"),
+            "body must be populated, got {:?}",
+            teerre.body
+        );
+        // Score comes from .score.unvoted title (the real value), not the
+        // ±1 likes/dislikes siblings.
+        assert_eq!(
+            teerre.score,
+            Some(10),
+            "unvoted score, not dislikes(9)/likes(11)"
+        );
+        assert!(
+            t.comments.iter().all(|c| !c.body.is_empty()),
+            "no comment body should be empty"
+        );
+    }
+
+    #[test]
+    fn real_nested_comment_tree() {
+        // pandas has structurally-nested replies (.child > .sitetable >
+        // .comment). data-depth/data-replies are absent in logged-out HTML.
+        let html = fixture("pandas_34comments.html");
+        let t = try_extract_thread(
+            &html,
+            "https://old.reddit.com/r/programming/comments/abc123/t/",
+        )
+        .expect("should parse");
+        // 34 rendered comments with content + 1 [deleted] node that old.reddit
+        // still shows because it has live replies = 35 nodes in the tree.
+        assert_eq!(
+            total_comments(&t.comments),
+            35,
+            "all comments incl. nested + deleted"
+        );
+        let nested = t.comments.iter().any(|c| !c.replies.is_empty());
+        assert!(nested, "at least one comment must have replies");
+        let max_depth = {
+            fn d(cs: &[RedditComment]) -> usize {
+                cs.iter().map(|c| 1 + d(&c.replies)).max().unwrap_or(0)
+            }
+            d(&t.comments)
+        };
+        assert!(max_depth >= 2, "tree should be more than one level deep");
+        let a_reply = t.comments.iter().find_map(|c| c.replies.first());
+        assert_eq!(a_reply.map(|r| r.depth), Some(1));
+    }
+
+    #[test]
+    fn real_morechildren_stubs_skipped() {
+        // AskReddit deep thread: 259 .thing[data-fullname=t1_] markers, but
+        // some are "load more comments" stubs (data-type=morechildren) with
+        // no author/body. They must not appear as ghost comments.
+        let html = fixture("askreddit_deep_morechildren.html");
+        let t = try_extract_thread(
+            &html,
+            "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
+        )
+        .expect("should parse");
+        fn check(cs: &[RedditComment]) {
+            for c in cs {
+                let ghost = c.body.is_empty() && c.author == "[deleted]" && c.id.is_some();
+                assert!(!ghost, "morechildren stub leaked as comment: {:?}", c.id);
+                check(&c.replies);
+            }
+        }
+        check(&t.comments);
+    }
+
+    #[test]
+    fn real_hidden_score_is_none_not_zero() {
+        // AskReddit has fresh comments with `.score-hidden` (no .score.unvoted
+        // span). These must be None, distinct from a genuine 0-score comment.
+        let html = fixture("askreddit_deep_morechildren.html");
+        let t = try_extract_thread(
+            &html,
+            "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
+        )
+        .expect("should parse");
+        let mut all = Vec::new();
+        collect(&t.comments, &mut all);
+        assert!(
+            all.iter().any(|c| c.score.is_none()),
+            "some fresh comments have hidden scores → None"
+        );
+    }
+
+    #[test]
+    fn real_deleted_comment_preserves_subtree() {
+        // pandas has a [deleted] comment that still has visible replies. The
+        // structural walk must keep it so its children aren't orphaned.
+        let html = fixture("pandas_34comments.html");
+        let t = try_extract_thread(
+            &html,
+            "https://old.reddit.com/r/programming/comments/abc123/t/",
+        )
+        .expect("should parse");
+        let mut all = Vec::new();
+        collect(&t.comments, &mut all);
+        let deleted: Vec<_> = all.iter().filter(|c| c.author == "[deleted]").collect();
+        assert!(!deleted.is_empty(), "should keep deleted comments");
+        assert!(
+            deleted.iter().any(|c| !c.replies.is_empty()),
+            "a deleted comment with replies must retain its subtree"
+        );
+        assert!(deleted.iter().all(|c| !c.is_op));
+    }
+
+    #[test]
+    fn real_markdown_is_commonmark_clean() {
+        // Guards the markdown bugs the verification workflow found: no
+        // whitespace-only "blank" lines, and ``` fences never indented 4+
+        // spaces (which would turn them into literal indented code blocks).
+        let html = fixture("elixir_60comments.html");
+        let result = try_extract(
+            &html,
+            "https://old.reddit.com/r/programming/comments/abc123/t/",
+        )
+        .expect("should extract");
+        let md = &result.content.markdown;
+        assert!(md.starts_with("# "));
+        assert!(md.contains("## Comments"));
+        for line in md.lines() {
+            assert!(
+                !(line.starts_with(' ') && line.trim().is_empty()),
+                "whitespace-only line: {line:?}"
+            );
+            let trimmed = line.trim_start_matches(['>', ' ']);
+            if trimmed.starts_with("```") {
+                let indent = line.len() - line.trim_start_matches(' ').len();
+                assert!(indent < 4, "code fence indented {indent} spaces: {line:?}");
+            }
+        }
+        assert!(result.metadata.word_count > 20);
+    }
+}
--- a/crates/webclaw-core/testdata/reddit/askreddit_deep_morechildren.html
+++ b/crates/webclaw-core/testdata/reddit/askreddit_deep_morechildren.html
--- a/crates/webclaw-core/testdata/reddit/ebpf_6comments.html
+++ b/crates/webclaw-core/testdata/reddit/ebpf_6comments.html
--- a/crates/webclaw-core/testdata/reddit/elixir_60comments.html
+++ b/crates/webclaw-core/testdata/reddit/elixir_60comments.html
--- a/crates/webclaw-core/testdata/reddit/pandas_34comments.html
+++ b/crates/webclaw-core/testdata/reddit/pandas_34comments.html
--- a/crates/webclaw-core/testdata/reddit/rust_selfpost_36comments.html
+++ b/crates/webclaw-core/testdata/reddit/rust_selfpost_36comments.html
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -160,9 +160,6 @@ impl Response {
    fn body(&self) -> &[u8] {
        &self.body
    }
-    fn is_success(&self) -> bool {
-        (200..300).contains(&self.status)
-    }

    fn text(&self) -> std::borrow::Cow<'_, str> {
        String::from_utf8_lossy(&self.body)
@ -299,32 +296,15 @@ impl FetchClient {
    /// when you need literal no-rescue behavior (e.g. inside the rescue
    /// logic itself to avoid recursion).
    pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
-        // Reddit: the HTML page shows a verification interstitial for most
-        // client IPs, but appending `.json` returns the post + comment tree
-        // publicly. `parse_reddit_json` in downstream code knows how to read
-        // the result; here we just do the URL swap at the fetch layer.
-        if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
-            let json_url = crate::reddit::json_url(url);
-            // Reddit's public .json API serves JSON to identifiable bot
-            // User-Agents and blocks browser UAs with a verification wall.
-            // Override our Chrome-profile UA for this specific call.
-            let ua = concat!(
-                "Webclaw/",
-                env!("CARGO_PKG_VERSION"),
-                " (+https://webclaw.io)"
-            );
-            if let Ok(resp) = self
-                .fetch_with_headers(&json_url, &[("user-agent", ua)])
-                .await
-                && resp.status == 200
-            {
-                let first = resp.html.trim_start().as_bytes().first().copied();
-                if matches!(first, Some(b'{') | Some(b'[')) {
-                    return Ok(resp);
-                }
-            }
-            // If the .json fetch failed or returned HTML, fall through.
-        }
+        // Reddit: fetch old.reddit.com for stable server-rendered HTML.
+        // The JSON API is blocked; old.reddit.com works without JS or auth.
+        let owned;
+        let url = if crate::reddit::is_reddit_url(url) {
+            owned = crate::reddit::to_old_reddit_url(url);
+            owned.as_str()
+        } else {
+            url
+        };

        let resp = self.fetch(url).await?;

@ -496,23 +476,16 @@ impl FetchClient {
        let parsed_url = crate::url_security::validate_public_http_url(url).await?;
        let url = parsed_url.as_str();

-        // Reddit fallback: use their JSON API to get post + full comment tree.
-        if crate::reddit::is_reddit_url(url) {
-            let json_url = crate::reddit::json_url(url);
-            let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
-            debug!("reddit detected, fetching {json_url}");
-
-            let client = self.pick_client(url);
-            let resp = client.get(json_url.as_str()).send().await?;
-            let response = Response::from_wreq(resp).await?;
-            if response.is_success() {
-                let bytes = response.body();
-                match crate::reddit::parse_reddit_json(bytes, url) {
-                    Ok(result) => return Ok(result),
-                    Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
-                }
-            }
-        }
+        // Reddit: rewrite to old.reddit.com for stable server-rendered HTML.
+        // webclaw-core's Reddit fast path then parses the thread structure.
+        let reddit_owned;
+        let url = if crate::reddit::is_reddit_url(url) {
+            reddit_owned = crate::reddit::to_old_reddit_url(url);
+            debug!("reddit: rewriting to {reddit_owned}");
+            reddit_owned.as_str()
+        } else {
+            url
+        };

        let start = Instant::now();
        let client = self.pick_client(url);
--- a/crates/webclaw-fetch/src/extractors/reddit.rs
+++ b/crates/webclaw-fetch/src/extractors/reddit.rs
@ -1,12 +1,10 @@
-//! Reddit structured extractor — returns the full post + comment tree
-//! as typed JSON via Reddit's `.json` API.
+//! Reddit structured extractor — parses old.reddit.com HTML.
 //!
-//! The same trick the markdown extractor in `crate::reddit` uses:
-//! appending `.json` to any post URL returns the data the new SPA
-//! frontend would load client-side. Zero antibot, zero JS rendering.
+//! Fetches old.reddit.com (stable server-rendered HTML, no JS required)
+//! and delegates parsing to `webclaw_core::reddit`. Returns a typed JSON
+//! value with `{ url, post, comments }` structure.

-use serde::Deserialize;
-use serde_json::{Value, json};
+use serde_json::Value;

 use super::ExtractorInfo;
 use crate::error::FetchError;
@ -24,182 +22,27 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };

 pub fn matches(url: &str) -> bool {
-    let host = host_of(url);
-    let is_reddit_host = matches!(
-        host,
-        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
-    );
-    is_reddit_host && url.contains("/comments/")
+    webclaw_core::reddit::is_reddit_url(url) && url.contains("/comments/")
 }

 pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
-    let json_url = build_json_url(url);
-    let resp = client.fetch(&json_url).await?;
+    let fetch_url = crate::reddit::to_old_reddit_url(url);
+    let resp = client.fetch(&fetch_url).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
-            "reddit api returned status {}",
+            "reddit: unexpected status {}",
            resp.status
        )));
    }

-    let listings: Vec<Listing> = serde_json::from_str(&resp.html)
-        .map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
+    let thread = webclaw_core::reddit::try_extract_thread(&resp.html, url).ok_or_else(|| {
+        FetchError::BodyDecode(
+            "reddit: page structure not recognised — is this a thread URL?".into(),
+        )
+    })?;

-    if listings.is_empty() {
-        return Err(FetchError::BodyDecode("reddit response empty".into()));
-    }
-
-    // First listing = the post (single t3 child).
-    let post = listings
-        .first()
-        .and_then(|l| l.data.children.first())
-        .filter(|t| t.kind == "t3")
-        .map(|t| post_json(&t.data))
-        .unwrap_or(Value::Null);
-
-    // Second listing = the comment tree.
-    let comments: Vec<Value> = listings
-        .get(1)
-        .map(|l| l.data.children.iter().filter_map(comment_json).collect())
-        .unwrap_or_default();
-
-    Ok(json!({
-        "url": url,
-        "post": post,
-        "comments": comments,
-    }))
-}
-
-// ---------------------------------------------------------------------------
-// JSON shapers
-// ---------------------------------------------------------------------------
-
-fn post_json(d: &ThingData) -> Value {
-    json!({
-        "id":               d.id,
-        "title":            d.title,
-        "author":           d.author,
-        "subreddit":        d.subreddit_name_prefixed,
-        "permalink":        d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
-        "url":              d.url_overridden_by_dest,
-        "is_self":          d.is_self,
-        "selftext":         d.selftext,
-        "score":            d.score,
-        "upvote_ratio":     d.upvote_ratio,
-        "num_comments":     d.num_comments,
-        "created_utc":      d.created_utc,
-        "link_flair_text":  d.link_flair_text,
-        "over_18":          d.over_18,
-        "spoiler":          d.spoiler,
-        "stickied":         d.stickied,
-        "locked":           d.locked,
-    })
-}
-
-/// Render a single comment + its reply tree. Returns `None` for non-t1
-/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
-fn comment_json(thing: &Thing) -> Option<Value> {
-    if thing.kind != "t1" {
-        return None;
-    }
-    let d = &thing.data;
-    let replies: Vec<Value> = match &d.replies {
-        Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
-        _ => Vec::new(),
-    };
-    Some(json!({
-        "id":             d.id,
-        "author":         d.author,
-        "body":           d.body,
-        "score":          d.score,
-        "created_utc":    d.created_utc,
-        "is_submitter":   d.is_submitter,
-        "stickied":       d.stickied,
-        "depth":          d.depth,
-        "permalink":      d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
-        "replies":        replies,
-    }))
-}
-
-// ---------------------------------------------------------------------------
-// URL helpers
-// ---------------------------------------------------------------------------
-
-fn host_of(url: &str) -> &str {
-    url.split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("")
-}
-
-/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
-/// or `old.reddit.com` as the caller gave us). Routing through
-/// `old.reddit.com` unconditionally looks appealing but that host has
-/// stricter UA-based blocking than `www.reddit.com`, while the main
-/// host accepts our Chrome-fingerprinted client fine.
-fn build_json_url(url: &str) -> String {
-    let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
-    format!("{clean}.json?raw_json=1")
-}
-
-// ---------------------------------------------------------------------------
-// Reddit JSON types — only fields we render. Everything else is dropped.
-// ---------------------------------------------------------------------------
-
-#[derive(Deserialize)]
-struct Listing {
-    data: ListingData,
-}
-
-#[derive(Deserialize)]
-struct ListingData {
-    children: Vec<Thing>,
-}
-
-#[derive(Deserialize)]
-struct Thing {
-    kind: String,
-    data: ThingData,
-}
-
-#[derive(Deserialize, Default)]
-struct ThingData {
-    // post (t3)
-    id: Option<String>,
-    title: Option<String>,
-    selftext: Option<String>,
-    subreddit_name_prefixed: Option<String>,
-    url_overridden_by_dest: Option<String>,
-    is_self: Option<bool>,
-    upvote_ratio: Option<f64>,
-    num_comments: Option<i64>,
-    over_18: Option<bool>,
-    spoiler: Option<bool>,
-    stickied: Option<bool>,
-    locked: Option<bool>,
-    link_flair_text: Option<String>,
-
-    // comment (t1)
-    author: Option<String>,
-    body: Option<String>,
-    score: Option<i64>,
-    created_utc: Option<f64>,
-    is_submitter: Option<bool>,
-    depth: Option<i64>,
-    permalink: Option<String>,
-
-    // recursive
-    replies: Option<Replies>,
-}
-
-#[derive(Deserialize)]
-#[serde(untagged)]
-enum Replies {
-    Listing(Listing),
-    #[allow(dead_code)]
-    Empty(String),
+    serde_json::to_value(&thread)
+        .map_err(|e| FetchError::BodyDecode(format!("reddit: serialisation error: {e}")))
 }

 #[cfg(test)]
@ -207,28 +50,17 @@ mod tests {
    use super::*;

    #[test]
-    fn matches_reddit_post_urls() {
+    fn matches_thread_urls() {
        assert!(matches(
            "https://www.reddit.com/r/rust/comments/abc123/some_title/"
        ));
-        assert!(matches(
-            "https://reddit.com/r/rust/comments/abc123/some_title"
-        ));
        assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
+        assert!(matches("https://reddit.com/r/rust/comments/abc/x"));
    }

    #[test]
-    fn rejects_non_post_reddit_urls() {
+    fn rejects_listing_and_non_reddit() {
        assert!(!matches("https://www.reddit.com/r/rust"));
-        assert!(!matches("https://www.reddit.com/user/foo"));
-        assert!(!matches("https://example.com/r/rust/comments/x"));
-    }
-
-    #[test]
-    fn json_url_appends_suffix_and_drops_query() {
-        assert_eq!(
-            build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
-            "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
-        );
+        assert!(!matches("https://example.com/r/rust/comments/abc/x"));
    }
 }
--- a/crates/webclaw-fetch/src/reddit.rs
+++ b/crates/webclaw-fetch/src/reddit.rs
@ -1,172 +1,56 @@
-/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
-///
-/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
-/// loaded client-side. Appending `.json` to any Reddit URL returns the full
-/// comment tree as structured JSON, which we convert to clean markdown.
-use serde::Deserialize;
-use tracing::debug;
-use webclaw_core::{Content, ExtractionResult, Metadata};
+//! Reddit URL helpers for the fetch layer.
+//!
+//! The JSON API (`*.json`) is blocked. We rewrite all Reddit hosts to
+//! `old.reddit.com`, which serves stable server-rendered HTML that
+//! `webclaw-core::reddit` parses directly.

-/// Check if a URL points to a Reddit post/comment page.
 pub fn is_reddit_url(url: &str) -> bool {
-    let host = url
-        .split("://")
-        .nth(1)
-        .unwrap_or(url)
-        .split('/')
-        .next()
-        .unwrap_or("");
-    matches!(
-        host,
-        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
-    )
+    webclaw_core::reddit::is_reddit_url(url)
 }

-/// Build the `.json` URL from a Reddit page URL.
-pub fn json_url(url: &str) -> String {
-    let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
-    format!("{clean}.json")
+/// Rewrite any Reddit host to old.reddit.com, preserving path and query.
+pub fn to_old_reddit_url(url: &str) -> String {
+    let Some(scheme_end) = url.find("://") else {
+        return url.to_string();
+    };
+    let after = &url[scheme_end + 3..];
+    let host_end = after.find(['/', '?', '#']).unwrap_or(after.len());
+    let scheme = &url[..scheme_end + 3];
+    let rest = &after[host_end..];
+    format!("{scheme}old.reddit.com{rest}")
 }

-/// Convert Reddit JSON API response into an ExtractionResult.
-pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
-    let listings: Vec<Listing> =
-        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
+#[cfg(test)]
+mod tests {
+    use super::*;

-    let mut markdown = String::new();
-    let mut title = None;
-    let mut author = None;
-    let mut subreddit = None;
-
-    // First listing = the post itself
-    if let Some(post_listing) = listings.first() {
-        for child in &post_listing.data.children {
-            if child.kind == "t3" {
-                let d = &child.data;
-                title = d.title.clone();
-                author = d.author.clone();
-                subreddit = d.subreddit_name_prefixed.clone();
-
-                if let Some(ref t) = title {
-                    markdown.push_str(&format!("# {t}\n\n"));
-                }
-                if let (Some(a), Some(sr)) = (&author, &subreddit) {
-                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
-                }
-                if let Some(ref body) = d.selftext
-                    && !body.is_empty()
-                {
-                    markdown.push_str(body);
-                    markdown.push_str("\n\n");
-                }
-                if let Some(ref url_field) = d.url_overridden_by_dest
-                    && !url_field.is_empty()
-                {
-                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
-                }
-                markdown.push_str("---\n\n");
-            }
-        }
+    #[test]
+    fn rewrites_www_to_old() {
+        assert_eq!(
+            to_old_reddit_url("https://www.reddit.com/r/rust/comments/abc/x/"),
+            "https://old.reddit.com/r/rust/comments/abc/x/"
+        );
    }

-    // Second listing = comment tree
-    if let Some(comment_listing) = listings.get(1) {
-        markdown.push_str("## Comments\n\n");
-        for child in &comment_listing.data.children {
-            render_comment(child, 0, &mut markdown);
-        }
+    #[test]
+    fn rewrites_bare_to_old() {
+        assert_eq!(
+            to_old_reddit_url("https://reddit.com/r/rust/"),
+            "https://old.reddit.com/r/rust/"
+        );
    }

-    let word_count = markdown.split_whitespace().count();
-    debug!(word_count, "reddit json extracted");
-
-    Ok(ExtractionResult {
-        metadata: Metadata {
-            title,
-            description: None,
-            author,
-            published_date: None,
-            language: Some("en".into()),
-            url: Some(url.to_string()),
-            site_name: subreddit,
-            image: None,
-            favicon: None,
-            word_count,
-        },
-        content: Content {
-            markdown,
-            plain_text: String::new(),
-            links: vec![],
-            images: vec![],
-            code_blocks: vec![],
-            raw_html: None,
-        },
-        domain_data: None,
-        structured_data: vec![],
-    })
-}
-
-fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
-    if thing.kind != "t1" {
-        return;
+    #[test]
+    fn preserves_old_reddit_unchanged() {
+        let url = "https://old.reddit.com/r/rust/comments/abc/x/?context=3";
+        assert_eq!(to_old_reddit_url(url), url);
    }
-    let d = &thing.data;
-    let indent = "  ".repeat(depth);
-    let author = d.author.as_deref().unwrap_or("[deleted]");
-    let body = d.body.as_deref().unwrap_or("[removed]");
-    let score = d.score.unwrap_or(0);

-    out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
-    for line in body.lines() {
-        out.push_str(&format!("{indent}  {line}\n"));
-    }
-    out.push('\n');
-
-    // Recurse into replies
-    if let Some(Replies::Listing(listing)) = &d.replies {
-        for child in &listing.data.children {
-            render_comment(child, depth + 1, out);
-        }
+    #[test]
+    fn preserves_query_and_hash() {
+        assert_eq!(
+            to_old_reddit_url("https://www.reddit.com/r/rust/?sort=top#anchor"),
+            "https://old.reddit.com/r/rust/?sort=top#anchor"
+        );
    }
 }
-
-// --- Reddit JSON types (minimal) ---
-
-#[derive(Deserialize)]
-struct Listing {
-    data: ListingData,
-}
-
-#[derive(Deserialize)]
-struct ListingData {
-    children: Vec<Thing>,
-}
-
-#[derive(Deserialize)]
-struct Thing {
-    kind: String,
-    data: ThingData,
-}
-
-#[derive(Deserialize)]
-struct ThingData {
-    // Post fields (t3)
-    title: Option<String>,
-    selftext: Option<String>,
-    subreddit_name_prefixed: Option<String>,
-    url_overridden_by_dest: Option<String>,
-    // Comment fields (t1)
-    author: Option<String>,
-    body: Option<String>,
-    score: Option<i64>,
-    replies: Option<Replies>,
-}
-
-/// Reddit replies can be either a nested Listing or an empty string.
-#[derive(Deserialize)]
-#[serde(untagged)]
-enum Replies {
-    Listing(Listing),
-    #[allow(dead_code)]
-    Empty(String),
-}