feat(reddit): parse old.reddit.com HTML instead of the dead .json API

Reddit blocked unauthenticated `.json` access, so the previous extractor
returned block pages or timed out on every thread. Switch to parsing
old.reddit.com's server-rendered HTML, which needs no API key or JS.

Fetch layer:
- Rewrite every Reddit host to old.reddit.com before fetching; drop all
  `.json` URL handling and the JSON response parser.

Extraction (webclaw-core::reddit):
- New HTML parser producing a typed post + nested comment tree.
- Comments nest structurally (.comment > .child > .sitetable > .comment);
  old.reddit omits a usable depth attribute, so the tree is walked
  recursively. Bodies live in .entry > form > .usertext-body > .md.
- Post metadata: title, author, subreddit, score, comment count
  (data-comments-count), self-vs-link (self class / self.* domain),
  flair, self-text body.
- Comment scores read the .score.unvoted title (the displayed value, not
  the ±1 vote-state siblings); hidden scores are None, not 0.
- Deleted comments are kept in place so their replies aren't orphaned;
  "load more comments" stubs are skipped.

Markdown output:
- Reply nesting via blockquote depth (avoids 4-space indentation turning
  text and code fences into broken indented-code blocks).
- Links keep their target as [text](url); root-relative reddit links
  resolve against old.reddit.com. Nested lists indent correctly.
- A recognised but unparseable /comments/ page returns no content rather
  than falling through to generic extraction of Reddit chrome.

Tests: regression suite runs against real old.reddit.com fixtures
(testdata/reddit/), the ground truth that surfaced the parsing and
markdown bugs synthetic HTML had hidden. Fixtures are excluded from the
published crate.
This commit is contained in:
Valerio 2026-06-04 16:16:08 +02:00
parent 3b7d11328e
commit 217bfe088b
11 changed files with 2522 additions and 391 deletions

View file

@ -4,6 +4,10 @@ description = "Pure HTML content extraction engine for LLMs"
version.workspace = true
edition.workspace = true
license.workspace = true
# Reddit regression fixtures are real old.reddit.com pages read at test time;
# they're large and only needed to run the test suite from the repo, so keep
# them out of the published crate.
exclude = ["testdata/reddit/*.html"]
[features]
default = ["quickjs"]

View file

@ -17,6 +17,7 @@ pub mod markdown;
pub mod metadata;
#[allow(dead_code)]
pub(crate) mod noise;
pub mod reddit;
pub mod structured_data;
pub mod types;
pub mod youtube;
@ -94,6 +95,24 @@ fn extract_with_options_inner(
return Err(ExtractError::NoContent);
}
// Reddit fast path: parse old.reddit.com HTML directly.
// The fetch layer rewrites all Reddit hosts to old.reddit.com before
// calling extract, so we always get stable server-rendered HTML here.
if let Some(u) = url
&& reddit::is_reddit_url(u)
{
if let Some(result) = reddit::try_extract(html, u) {
return Ok(result);
}
// A recognised comment thread that we couldn't parse (Reddit markup
// change, or a block/challenge page) — don't fall through to generic
// extraction, which would emit Reddit nav/sidebar chrome. Listings
// and profiles (no `/comments/`) intentionally fall through below.
if u.contains("/comments/") {
return Err(ExtractError::NoContent);
}
}
// YouTube fast path: if the URL is a YouTube video page, try extracting
// structured metadata from ytInitialPlayerResponse before DOM scoring.
// This gives LLMs a clean, structured view of video metadata.

View file

@ -0,0 +1,968 @@
//! Reddit thread extractor — parses old.reddit.com HTML directly.
//!
//! old.reddit.com serves fully server-rendered HTML with stable class names
//! and data attributes. No JS, no API key, no `.json` trick needed.
use scraper::{ElementRef, Html, Selector};
use serde::Serialize;
use crate::{Content, DomainData, DomainType, ExtractionResult, Metadata};
// ─── Public types ──────────────────────────────────────────────────────────────
/// The submission at the top of a thread, read from the post's `.thing.link`
/// element on an old.reddit.com page.
#[derive(Debug, Clone, Serialize)]
pub struct RedditPost {
    /// `data-fullname` with the `t3_` prefix removed.
    pub id: Option<String>,
    /// Trimmed text of the title link; a post without a title is not parsed.
    pub title: String,
    /// `data-author`, or `"[deleted]"` when Reddit gives no author.
    pub author: String,
    /// `data-subreddit`, without the `r/` prefix.
    pub subreddit: Option<String>,
    /// `data-score`; `0` when the attribute is missing or not an integer.
    pub score: i64,
    /// Self-text rendered as markdown; `None` when absent or empty.
    pub body: Option<String>,
    /// `data-comments-count`; `0` when missing or not an integer.
    pub num_comments: usize,
    /// `https://old.reddit.com` followed by `data-permalink`.
    pub permalink: String,
    /// Link target (`data-url`) for link posts; always `None` for self posts.
    pub url: Option<String>,
    /// `true` when the element has the `self` class or a `self.*` domain.
    pub is_self: bool,
    /// Trimmed text of the `.linkflairlabel` element, if any.
    pub flair: Option<String>,
    /// Raw `datetime` attribute of the post's first `<time>` element.
    /// NOTE(review): presumably a UTC timestamp, as the name says — confirm.
    pub created_utc: Option<String>,
}
/// One comment in the thread, together with the replies nested beneath it.
#[derive(Debug, Clone, Serialize)]
pub struct RedditComment {
    /// `data-fullname` with the `t1_` prefix removed.
    pub id: Option<String>,
    /// `data-author`, or `"[deleted]"` when the attribute is missing or empty.
    pub author: String,
    /// Comment text rendered as markdown. A deleted comment with no body
    /// text becomes `"[removed]"`; otherwise a missing body is an empty string.
    pub body: String,
    /// `None` when Reddit hides the score (fresh comments). Distinct from
    /// `Some(0)`, which is a real net-zero score.
    pub score: Option<i64>,
    /// Nesting level: `0` for top-level comments, one more per reply level.
    pub depth: usize,
    /// `true` when the author matches the post's author; never set for
    /// deleted comments.
    pub is_op: bool,
    /// Raw `datetime` attribute of the comment's first `<time>` element.
    /// NOTE(review): presumably a UTC timestamp, as the name says — confirm.
    pub created_utc: Option<String>,
    /// Direct replies, in page order.
    pub replies: Vec<RedditComment>,
}
/// A parsed comment page: the submission (when one was found) plus the
/// top-level comments, each carrying its own reply tree.
#[derive(Serialize)]
pub struct RedditThread {
/// URL the thread was extracted for; serialised under the key `url`.
#[serde(rename = "url")]
pub source_url: String,
/// The submission, or `None` when no post element could be parsed.
pub post: Option<RedditPost>,
/// Top-level comments in page order; replies live in each comment's `replies`.
pub comments: Vec<RedditComment>,
}
// ─── Public API ────────────────────────────────────────────────────────────────
/// Reports whether the host of `url` is one of the reddit.com front-ends
/// handled by this extractor (bare domain, `www`, `old`, `np`, `new`).
pub fn is_reddit_url(url: &str) -> bool {
    const REDDIT_HOSTS: [&str; 5] = [
        "reddit.com",
        "www.reddit.com",
        "old.reddit.com",
        "np.reddit.com",
        "new.reddit.com",
    ];
    REDDIT_HOSTS.contains(&host_of(url))
}
/// Parse an old.reddit.com comment page into a [`RedditThread`].
///
/// Yields `None` when `url` has no `/comments/` segment, or when the HTML
/// contains neither a parseable post nor any comments.
pub fn try_extract_thread(html: &str, url: &str) -> Option<RedditThread> {
    if !url.contains("/comments/") {
        return None;
    }

    let document = Html::parse_document(html);
    let post = parse_post(&document);

    // The submitter's name lets comment parsing flag OP replies; without a
    // post there is nothing to match against, so pass an empty name.
    let op_name = match &post {
        Some(p) => p.author.as_str(),
        None => "",
    };
    let comments = parse_comments(&document, op_name);

    if post.is_some() || !comments.is_empty() {
        Some(RedditThread {
            source_url: url.to_string(),
            post,
            comments,
        })
    } else {
        None
    }
}
/// Fast-path entry point used by `webclaw-core`'s extractor: parse the
/// thread and, if that succeeds, convert it to an [`ExtractionResult`].
pub fn try_extract(html: &str, url: &str) -> Option<ExtractionResult> {
    try_extract_thread(html, url).map(|thread| to_extraction_result(&thread))
}
// ─── ExtractionResult builder ──────────────────────────────────────────────────
/// Build the generic [`ExtractionResult`] for a parsed thread.
///
/// The markdown rendering is the primary content; plain text is derived
/// from it, and the word count is taken over the markdown. Title, author
/// and site name come from the post and are all `None` when there is none.
fn to_extraction_result(thread: &RedditThread) -> ExtractionResult {
    let markdown = to_markdown(thread);
    let plain = plain_text(&markdown);
    let word_count = markdown.split_whitespace().count();

    let (title, author, site_name) = match &thread.post {
        Some(post) => (
            Some(post.title.clone()),
            Some(post.author.clone()),
            post.subreddit.clone(),
        ),
        None => (None, None, None),
    };

    let metadata = Metadata {
        title,
        description: None,
        author,
        published_date: None,
        language: Some("en".to_string()),
        url: Some(thread.source_url.clone()),
        site_name,
        image: None,
        favicon: None,
        word_count,
    };
    let content = Content {
        markdown,
        plain_text: plain,
        links: Vec::new(),
        images: Vec::new(),
        code_blocks: Vec::new(),
        raw_html: None,
    };

    ExtractionResult {
        metadata,
        content,
        domain_data: Some(DomainData {
            domain_type: DomainType::Social,
        }),
        structured_data: Vec::new(),
    }
}
// ─── Markdown rendering ────────────────────────────────────────────────────────
pub fn to_markdown(thread: &RedditThread) -> String {
let mut out = String::new();
if let Some(p) = &thread.post {
out.push_str(&format!("# {}\n\n", p.title));
let pts = pt_label(Some(p.score));
let cmt = match p.num_comments {
0 => String::new(),
1 => " · 1 comment".to_string(),
n => format!(" · {n} comments"),
};
let sub = p.subreddit.as_deref().unwrap_or("?");
out.push_str(&format!("**u/{}** · r/{sub} · {pts}{cmt}\n\n", p.author));
if let Some(ref body) = p.body
&& !body.is_empty()
{
out.push_str(body);
out.push_str("\n\n");
}
if let Some(ref link) = p.url
&& !p.is_self
{
out.push_str(&format!("[Link]({link})\n\n"));
}
out.push_str("---\n\n");
}
if !thread.comments.is_empty() {
out.push_str("## Comments\n\n");
for c in &thread.comments {
render_comment(c, &mut out);
}
}
collapse_blank_lines(out.trim_end())
}
/// Append one comment and, recursively, all of its replies to `out`.
///
/// Depth is shown with blockquote markers (`> ` per level) instead of
/// leading spaces: four or more spaces of indentation would make CommonMark
/// treat ordinary text and ``` fences as indented code blocks, corrupting
/// any comment at depth ≥ 2.
fn render_comment(c: &RedditComment, out: &mut String) {
    let quote_prefix = "> ".repeat(c.depth);
    // Empty body lines get the markers without the trailing space.
    let empty_prefix = ">".repeat(c.depth);

    let op_tag = if c.is_op { " [OP]" } else { "" };
    out.push_str(&quote_prefix);
    out.push_str(&format!("**u/{}{op_tag}**", c.author));
    out.push_str(&format!(" · {}\n", pt_label(c.score)));

    for line in c.body.lines() {
        let prefix = if line.is_empty() {
            &empty_prefix
        } else {
            &quote_prefix
        };
        out.push_str(prefix);
        out.push_str(line);
        out.push('\n');
    }
    out.push('\n');

    for reply in &c.replies {
        render_comment(reply, out);
    }
}
/// Human-readable score label: `"score hidden"` for `None`, singular `pt`
/// for exactly 1 or -1, plural `pts` for every other value.
fn pt_label(n: Option<i64>) -> String {
    let Some(points) = n else {
        return "score hidden".to_string();
    };
    let unit = if points == 1 || points == -1 {
        "pt"
    } else {
        "pts"
    };
    format!("{points} {unit}")
}
/// Limit every run of consecutive newlines to at most two (one blank line),
/// so blockquote prefixes and `<pre>` spacing don't leave large gaps.
fn collapse_blank_lines(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    let mut run = 0usize;
    for ch in s.chars() {
        run = if ch == '\n' { run + 1 } else { 0 };
        // Third and later newlines of a run are dropped.
        if run > 2 {
            continue;
        }
        collapsed.push(ch);
    }
    collapsed
}
/// Derive a plain-text rendering from the markdown, line by line.
///
/// Per line: leading whitespace is trimmed, at most ONE leading blockquote
/// marker is removed (stripping greedily would eat legitimate `>`-prefixed
/// quotes), leading `#` heading markers are dropped, and emphasis / inline
/// code markers (`**`, `~~`, `*`, backtick) are removed.
fn plain_text(md: &str) -> String {
    let mut cleaned: Vec<String> = Vec::new();
    for raw in md.lines() {
        let line = raw.trim_start();
        let line = match line.strip_prefix("> ") {
            Some(rest) => rest,
            None => line.strip_prefix('>').unwrap_or(line),
        };
        let line = line.trim_start_matches('#').trim_start();

        // Paired markers go first, in this order, then any stray single
        // markers; the order is observable on inputs such as "~**~".
        let without_pairs = line.replace("**", "").replace("~~", "");
        let text: String = without_pairs
            .chars()
            .filter(|ch| !matches!(ch, '*' | '`'))
            .collect();
        cleaned.push(text);
    }
    cleaned.join("\n")
}
// ─── HTML parsing ──────────────────────────────────────────────────────────────
/// Parse the submission (`#siteTable .thing.link`) at the top of a thread page.
///
/// Most metadata is read from the element's `data-*` attributes; title,
/// flair, self-text and timestamp come from descendant elements. Returns
/// `None` when the page has no post element or the post has no non-empty
/// title.
fn parse_post(doc: &Html) -> Option<RedditPost> {
    let sel = Selector::parse("#siteTable .thing.link").ok()?;
    let thing = doc.select(&sel).next()?;
    let v = thing.value();

    // Fullname is `t3_<id>`; remove exactly one prefix.
    let id = v
        .attr("data-fullname")
        .map(|s| s.strip_prefix("t3_").unwrap_or(s).to_string());
    // An empty author attribute is treated like a missing one, the same way
    // comment authors are handled.
    let author = v
        .attr("data-author")
        .filter(|a| !a.is_empty())
        .unwrap_or("[deleted]")
        .to_string();
    let subreddit = v.attr("data-subreddit").map(str::to_string);
    let score: i64 = v
        .attr("data-score")
        .and_then(|s| s.parse().ok())
        .unwrap_or(0);
    let num_comments: usize = v
        .attr("data-comments-count")
        .and_then(|s| s.parse().ok())
        .unwrap_or(0);
    let permalink_path = v.attr("data-permalink").unwrap_or("");
    let permalink = format!("https://old.reddit.com{permalink_path}");

    // Self-posts carry the `self` class and a `self.<sub>` domain; their
    // data-url points back at the permalink rather than an external site.
    let is_self = v.has_class("self", scraper::CaseSensitivity::AsciiCaseInsensitive)
        || v.attr("data-domain")
            .is_some_and(|d| d.starts_with("self."));

    // Link target for non-self posts. Scheme-less values are made absolute
    // so the rendered `[Link](...)` is usable outside the page: `//host/...`
    // gets a scheme, `/path` is resolved against old.reddit.com.
    let url = if is_self {
        None
    } else {
        v.attr("data-url").map(|u| {
            if u.starts_with("//") {
                format!("https:{u}")
            } else if u.starts_with('/') {
                format!("https://old.reddit.com{u}")
            } else {
                u.to_string()
            }
        })
    };

    // Title — required; a post without one is treated as unparseable.
    let sel_title = Selector::parse(".title a.title").ok()?;
    let title = thing
        .select(&sel_title)
        .next()
        .map(|el| el.text().collect::<String>().trim().to_string())
        .filter(|s| !s.is_empty())?;

    // Flair
    let flair = Selector::parse(".linkflairlabel")
        .ok()
        .and_then(|s| thing.select(&s).next())
        .map(|el| el.text().collect::<String>().trim().to_string())
        .filter(|s| !s.is_empty());

    // Self-text body: thing > .entry > .expando > .usertext-body [> .md]
    let body = direct_child(thing, "entry")
        .and_then(|entry| find_class(entry, "expando"))
        .and_then(|expando| find_class(expando, "usertext-body"))
        .and_then(|ut| find_class(ut, "md"))
        .map(md_to_markdown)
        .filter(|s| !s.is_empty());

    // Datetime: first <time datetime="..."> inside the post element.
    let created_utc = Selector::parse("time[datetime]")
        .ok()
        .and_then(|s| thing.select(&s).next())
        .and_then(|t| t.value().attr("datetime"))
        .map(str::to_string);

    Some(RedditPost {
        id,
        title,
        author,
        subreddit,
        score,
        body,
        num_comments,
        permalink,
        url,
        is_self,
        flair,
        created_utc,
    })
}
// ─── Comment parsing ───────────────────────────────────────────────────────────
//
// old.reddit.com nests comments structurally, not via a depth attribute:
//
// .commentarea
// .sitetable.nestedlisting
// .comment.thing ← root comment
// .entry → form → .usertext-body → .md ← its own body
// .child
// .sitetable.listing
// .comment.thing ← reply (recurse)
//
// `data-depth`/`data-replies` are absent or always "0" in the logged-out
// HTML, so we walk the tree by recursing into each comment's `.child`.
/// Locate the root comment listing and parse the whole comment tree.
///
/// The listing is `.sitetable.nestedlisting` inside `.commentarea` (a class
/// on old.reddit, not an id); if that is not found, the first
/// `.nestedlisting` anywhere is used, which covers comment-permalink pages.
/// Returns an empty vector when no listing exists.
fn parse_comments(doc: &Html, op: &str) -> Vec<RedditComment> {
    // Tried in order; the first selector that matches an element wins.
    const ROOT_LISTINGS: [&str; 2] = [
        ".commentarea .sitetable.nestedlisting",
        ".sitetable.nestedlisting",
    ];

    for css in ROOT_LISTINGS {
        let Ok(selector) = Selector::parse(css) else {
            continue;
        };
        if let Some(listing) = doc.select(&selector).next() {
            return walk_comment_level(listing, op, 0);
        }
    }
    Vec::new()
}
/// Parse every direct child of `listing` that carries both the `comment`
/// and `thing` classes, at the given `depth`. Children that
/// `parse_one_comment` rejects are left out.
fn walk_comment_level(listing: ElementRef, op: &str, depth: usize) -> Vec<RedditComment> {
    use scraper::CaseSensitivity::AsciiCaseInsensitive;

    let mut comments = Vec::new();
    for node in listing.children() {
        let Some(el) = ElementRef::wrap(node) else {
            continue;
        };
        let element = el.value();
        let is_comment_thing = element.has_class("comment", AsciiCaseInsensitive)
            && element.has_class("thing", AsciiCaseInsensitive);
        if !is_comment_thing {
            continue;
        }
        if let Some(parsed) = parse_one_comment(el, op, depth) {
            comments.push(parsed);
        }
    }
    comments
}
/// Parse a single `.comment.thing` element and, recursively, its replies.
///
/// Returns `None` for "load more comments" placeholders. Deleted comments
/// are kept so that their replies stay attached to the tree.
fn parse_one_comment(c: ElementRef, op: &str, depth: usize) -> Option<RedditComment> {
    use scraper::CaseSensitivity::AsciiCaseInsensitive;

    let attrs = c.value();

    // "load more comments" placeholders are `.thing` with type=morechildren.
    // They carry a t1_ fullname but no real content, so they are dropped.
    let is_stub = attrs.attr("data-type") == Some("morechildren")
        || attrs.has_class("morechildren", AsciiCaseInsensitive);
    if is_stub {
        return None;
    }

    let is_deleted = attrs.has_class("deleted", AsciiCaseInsensitive);
    let id = attrs
        .attr("data-fullname")
        .map(|full| full.trim_start_matches("t1_").to_string());
    let author = match attrs.attr("data-author") {
        Some(name) if !name.is_empty() => name.to_string(),
        _ => "[deleted]".to_string(),
    };

    // The comment's own body is under `.entry > form > .usertext-body > .md`.
    // Replies live in `.child`, a sibling of `.entry`, so searching inside
    // `.entry` can never pick up a reply's body.
    let entry = direct_child(c, "entry");
    let rendered = entry
        .and_then(|e| find_class(e, "usertext-body"))
        .and_then(|ut| find_class(ut, "md"))
        .map(md_to_markdown)
        .filter(|s| !s.is_empty());
    let body = match rendered {
        Some(text) => text,
        None if is_deleted => "[removed]".to_string(),
        None => String::new(),
    };

    // `comment_score` yields None when there is no `.score.unvoted` span
    // (hidden score), which is kept distinct from a genuine 0.
    let score = entry.and_then(comment_score);

    let created_utc = match (entry, Selector::parse("time[datetime]")) {
        (Some(e), Ok(time_sel)) => e
            .select(&time_sel)
            .next()
            .and_then(|t| t.value().attr("datetime"))
            .map(str::to_string),
        _ => None,
    };

    let is_op = !is_deleted && author != "[deleted]" && author == op;

    // Replies: `.comment > .child > .sitetable > .comment`.
    let reply_listing =
        direct_child(c, "child").and_then(|child| direct_child(child, "sitetable"));
    let replies = match reply_listing {
        Some(listing) => walk_comment_level(listing, op, depth + 1),
        None => Vec::new(),
    };

    Some(RedditComment {
        id,
        author,
        body,
        score,
        depth,
        is_op,
        created_utc,
        replies,
    })
}
/// Score of a comment, taken from the `span.score.unvoted` inside `entry`.
///
/// The span's `title` attribute is tried first as an integer; if it is
/// missing or does not parse, the span's text is parsed instead. `None`
/// means there is no such span (Reddit is hiding the score) or neither
/// source yielded a number.
fn comment_score(entry: ElementRef) -> Option<i64> {
    let selector = Selector::parse("span.score.unvoted").ok()?;
    let span = entry.select(&selector).next()?;

    let from_title = span
        .value()
        .attr("title")
        .and_then(|t| t.trim().parse::<i64>().ok());
    if let Some(exact) = from_title {
        return Some(exact);
    }

    let shown: String = span.text().collect();
    parse_score(&shown)
}
// ─── DOM helpers ───────────────────────────────────────────────────────────────
/// Return the first immediate child element of `el` that has `class` in
/// its class list (ASCII case-insensitive), or `None` if there is none.
fn direct_child<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
    for node in el.children() {
        if let Some(child) = ElementRef::wrap(node) {
            if child
                .value()
                .has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
            {
                return Some(child);
            }
        }
    }
    None
}
/// Depth-first search below `el` for the first element, in document order,
/// that has `class` in its class list (ASCII case-insensitive). `el` itself
/// is never returned.
fn find_class<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
    for child in el.children().filter_map(ElementRef::wrap) {
        if child
            .value()
            .has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
        {
            return Some(child);
        }
        // Not a match: look inside this child before moving to its sibling.
        if let Some(found) = find_class(child, class) {
            return Some(found);
        }
    }
    None
}
/// Parse the leading integer from score text such as `"10 points"`.
///
/// Only the first whitespace-separated word is considered. A typographic
/// minus sign (U+2212) is normalised to ASCII `-` before parsing so that
/// negative scores rendered with it still parse. Returns `None` when the
/// text is empty or the first word is not an integer.
fn parse_score(text: &str) -> Option<i64> {
    let first_word = text.split_whitespace().next()?;
    first_word.replace('\u{2212}', "-").parse().ok()
}
// ─── .md div → markdown ────────────────────────────────────────────────────────
/// Convert the contents of a Reddit `.md` element to markdown, with
/// surrounding whitespace trimmed.
fn md_to_markdown(el: ElementRef) -> String {
    let mut rendered = String::new();
    render_children(el, &mut rendered);
    String::from(rendered.trim())
}
/// Append the markdown for each child of `el` to `out`, in document order.
/// Text nodes are copied verbatim, elements go through `render_node`, and
/// every other node kind is ignored.
fn render_children(el: ElementRef, out: &mut String) {
    use scraper::node::Node;

    for node in el.children() {
        if let Node::Text(text) = node.value() {
            out.push_str(text.as_ref());
        } else if let Some(child_el) = ElementRef::wrap(node) {
            render_node(child_el, out);
        }
    }
}
/// Append the trimmed text of `el` wrapped in `marker` on both sides (for
/// example `**bold**`). Elements with no visible text emit nothing.
fn push_wrapped(el: ElementRef, marker: &str, out: &mut String) {
    let text: String = el.text().collect();
    let text = text.trim();
    if !text.is_empty() {
        out.push_str(marker);
        out.push_str(text);
        out.push_str(marker);
    }
}

/// Render one element from a Reddit `.md` body as markdown, appending to
/// `out`.
///
/// Block elements (`p`, `div`, `pre`, `blockquote`, headings, `hr`, lists)
/// end with a blank line; inline elements are emitted in place. Elements
/// with no dedicated arm are treated as transparent containers and only
/// their children are rendered.
fn render_node(el: ElementRef, out: &mut String) {
    match el.value().name() {
        "p" | "div" => {
            let mut inner = String::new();
            render_children(el, &mut inner);
            let t = inner.trim();
            if !t.is_empty() {
                out.push_str(t);
                out.push_str("\n\n");
            }
        }
        "br" => out.push('\n'),
        "strong" | "b" => push_wrapped(el, "**", out),
        "em" | "i" => push_wrapped(el, "*", out),
        "del" | "s" | "strike" => push_wrapped(el, "~~", out),
        "code" => {
            let t: String = el.text().collect();
            out.push('`');
            out.push_str(t.trim());
            out.push('`');
        }
        "pre" => {
            // Uses the element's full text, so a nested <code> is not
            // additionally wrapped in backticks.
            let t: String = el.text().collect();
            out.push_str("```\n");
            out.push_str(t.trim_end_matches('\n'));
            out.push_str("\n```\n\n");
        }
        "a" => {
            let text: String = el.text().collect();
            let text = text.trim();
            if !text.is_empty() {
                // Preserve the destination as a markdown link. Absolute
                // http(s) hrefs are kept, protocol-relative ones get a
                // scheme, root-relative reddit hrefs (/r/, /user/, /wiki/,
                // ...) resolve against old.reddit.com, and everything else
                // (javascript:, #fragment, mailto:) degrades to bare text.
                let href = el.value().attr("href").unwrap_or("");
                if href.starts_with("http://") || href.starts_with("https://") {
                    out.push_str(&format!("[{text}]({href})"));
                } else if href.starts_with("//") {
                    // Must be checked before the single-slash case, or the
                    // target host would be appended to old.reddit.com.
                    out.push_str(&format!("[{text}](https:{href})"));
                } else if href.starts_with('/') {
                    out.push_str(&format!("[{text}](https://old.reddit.com{href})"));
                } else {
                    out.push_str(text);
                }
            }
        }
        "blockquote" => {
            let mut inner = String::new();
            render_children(el, &mut inner);
            let trimmed = inner.trim();
            for line in trimmed.lines() {
                // Blank lines get a bare `>` so the quote is not split.
                out.push('>');
                if !line.is_empty() {
                    out.push(' ');
                    out.push_str(line);
                }
                out.push('\n');
            }
            out.push('\n');
        }
        "ul" => render_list(el, false, 0, out),
        "ol" => render_list(el, true, 0, out),
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
            // Heading level is the digit in the tag name.
            let level = el
                .value()
                .name()
                .chars()
                .nth(1)
                .and_then(|c| c.to_digit(10))
                .unwrap_or(2) as usize;
            let t: String = el.text().collect();
            let t = t.trim();
            if !t.is_empty() {
                out.push_str(&"#".repeat(level));
                out.push(' ');
                out.push_str(t);
                out.push_str("\n\n");
            }
        }
        "hr" => out.push_str("---\n\n"),
        "sup" => {
            // Superscript has no markdown form here; keep the text only.
            let t: String = el.text().collect();
            out.push_str(t.trim());
        }
        // Unknown / generic containers: recurse
        _ => render_children(el, out),
    }
}
/// Render a `<ul>`/`<ol>` as a markdown list, appending to `out`.
///
/// `indent` is the number of leading spaces placed before this list's
/// markers; top-level callers pass `0`. A nested list is indented by the
/// width of its parent item's marker (two columns under `- `, three or more
/// under `1. `), which is the indentation CommonMark requires for the
/// nested list to belong to the parent item. Only the top-level list is
/// followed by a blank line.
fn render_list(list: ElementRef, ordered: bool, indent: usize, out: &mut String) {
    use scraper::node::Node;

    let pad = " ".repeat(indent);
    let mut n = 0;
    for li in list
        .children()
        .filter_map(ElementRef::wrap)
        .filter(|c| c.value().name() == "li")
    {
        n += 1;

        // Inline content of this <li>, excluding nested lists (rendered after).
        let mut inline = String::new();
        for child in li.children() {
            match child.value() {
                Node::Text(t) => inline.push_str(t.as_ref()),
                Node::Element(e) if e.name() == "ul" || e.name() == "ol" => {}
                Node::Element(_) => {
                    if let Some(c) = ElementRef::wrap(child) {
                        render_node(c, &mut inline);
                    }
                }
                _ => {}
            }
        }

        let marker = if ordered {
            format!("{n}. ")
        } else {
            "- ".to_string()
        };
        out.push_str(&format!("{pad}{marker}{}\n", inline.trim()));

        // Children line up with the start of this item's content. The
        // marker is ASCII, so its byte length equals its column width.
        let child_indent = indent + marker.len();
        for child in li.children().filter_map(ElementRef::wrap) {
            match child.value().name() {
                "ul" => render_list(child, false, child_indent, out),
                "ol" => render_list(child, true, child_indent, out),
                _ => {}
            }
        }
    }
    if indent == 0 {
        out.push('\n');
    }
}
// ─── URL helpers ───────────────────────────────────────────────────────────────
/// Extract the host from a URL-like string without allocating.
///
/// The scheme (text before the first `://`) is skipped, but only when that
/// text cannot itself be part of a path or query — otherwise a scheme-less
/// input such as `example.com/?next=https://other/` would report the
/// embedded URL's host. The authority ends at the first `/`, `?` or `#`;
/// any `userinfo@` prefix and a trailing `:port` are removed. Bracketed
/// IPv6 literals are returned as found. Returns `""` for empty input.
fn host_of(url: &str) -> &str {
    let after_scheme = match url.split_once("://") {
        Some((scheme, rest)) if !scheme.contains(['/', '?', '#']) => rest,
        _ => url,
    };
    let authority = after_scheme.split(['/', '?', '#']).next().unwrap_or("");

    // Everything up to the last '@' is userinfo, not host.
    let host_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);

    // Drop a `:port` suffix. A remaining ':' in the host part means this is
    // an IPv6 literal, which is left untouched.
    match host_port.rsplit_once(':') {
        Some((host, port))
            if !host.contains(':') && port.bytes().all(|b| b.is_ascii_digit()) =>
        {
            host
        }
        _ => host_port,
    }
}
// ─── Tests ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn is_reddit_url_recognises_variants() {
        assert!(is_reddit_url(
            "https://www.reddit.com/r/rust/comments/abc/x/"
        ));
        assert!(is_reddit_url(
            "https://old.reddit.com/r/rust/comments/abc/x/"
        ));
        assert!(is_reddit_url("https://reddit.com/r/rust/comments/abc/x/"));
        assert!(!is_reddit_url("https://example.com"));
    }

    #[test]
    fn try_extract_thread_returns_none_for_listing_url() {
        let html = "<html><body></body></html>";
        assert!(try_extract_thread(html, "https://old.reddit.com/r/rust/").is_none());
    }

    #[test]
    fn md_to_markdown_basic() {
        let html =
            Html::parse_fragment(r#"<div class="md"><p>Hello <strong>world</strong>!</p></div>"#);
        let sel = Selector::parse(".md").unwrap();
        let el = html.select(&sel).next().unwrap();
        let md = md_to_markdown(el);
        assert!(md.contains("**world**"));
        assert!(md.contains("Hello"));
    }

    #[test]
    fn md_to_markdown_blockquote_and_code() {
        let html = Html::parse_fragment(
            r#"<div class="md"><blockquote><p>Quoted</p></blockquote><pre><code>fn main() {}</code></pre></div>"#,
        );
        let sel = Selector::parse(".md").unwrap();
        let el = html.select(&sel).next().unwrap();
        let md = md_to_markdown(el);
        assert!(md.contains("> Quoted"));
        assert!(md.contains("```"));
        assert!(md.contains("fn main()"));
    }

    #[test]
    fn md_to_markdown_link_preserves_href() {
        let abs = Html::parse_fragment(
            r#"<div class="md"><p>see <a href="https://example.com/x">this</a></p></div>"#,
        );
        let sel = Selector::parse(".md").unwrap();
        let el = abs.select(&sel).next().unwrap();
        assert!(md_to_markdown(el).contains("[this](https://example.com/x)"));

        // Root-relative reddit links resolve against old.reddit.com.
        let rel = Html::parse_fragment(
            r#"<div class="md"><p><a href="/r/rust/wiki/faq">faq</a></p></div>"#,
        );
        let el = rel.select(&sel).next().unwrap();
        assert!(md_to_markdown(el).contains("[faq](https://old.reddit.com/r/rust/wiki/faq)"));

        // javascript: / fragment hrefs degrade to bare text.
        let js = Html::parse_fragment(
            r#"<div class="md"><p><a href="javascript:void(0)">x</a></p></div>"#,
        );
        let el = js.select(&sel).next().unwrap();
        let out = md_to_markdown(el);
        assert!(out.contains('x') && !out.contains("javascript"));
    }

    // ── Regression tests against REAL old.reddit.com HTML ──────────────────
    //
    // These fixtures are genuine pages fetched from old.reddit.com (see
    // testdata/reddit/). They are the ground truth — synthetic HTML is too
    // easy to write to match wrong assumptions, which is exactly how the
    // first version of this parser shipped silently broken.

    /// Load a saved old.reddit.com page from `testdata/reddit/`.
    ///
    /// The path is anchored at the crate root when Cargo provides it, so the
    /// tests do not depend on the process working directory. A failed read
    /// panics with the full path, which makes a missing fixture obvious
    /// (the fixtures are not part of the published crate).
    fn fixture(name: &str) -> String {
        let root = option_env!("CARGO_MANIFEST_DIR").unwrap_or(".");
        let path = std::path::Path::new(root)
            .join("testdata")
            .join("reddit")
            .join(name);
        std::fs::read_to_string(&path)
            .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", path.display()))
    }

    /// Number of comments in the forest, counting every nested reply.
    fn total_comments(cs: &[RedditComment]) -> usize {
        cs.len() + cs.iter().map(|c| total_comments(&c.replies)).sum::<usize>()
    }

    /// Flatten the forest into `out`, parents before their replies.
    fn collect<'a>(cs: &'a [RedditComment], out: &mut Vec<&'a RedditComment>) {
        for c in cs {
            out.push(c);
            collect(&c.replies, out);
        }
    }

    #[test]
    fn real_link_post_metadata() {
        // pandas: external-link post (blog.geekuni.com), 34 comments.
        let html = fixture("pandas_34comments.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should parse");
        let p = t.post.expect("post");
        assert_eq!(p.author, "Horror-Willingness74");
        assert_eq!(p.subreddit.as_deref(), Some("programming"));
        assert_eq!(p.score, 43);
        assert_eq!(p.num_comments, 34, "data-comments-count");
        assert!(!p.is_self, "external blog link, not a self post");
        assert_eq!(
            p.url.as_deref(),
            Some("https://blog.geekuni.com/2026/06/why-learn-pandas.html")
        );
        assert!(p.title.contains("Pandas"));
    }

    #[test]
    fn real_self_post_metadata() {
        // A self-post (text) on r/rust: `self.rust` domain, self-text body,
        // no external url.
        let html = fixture("rust_selfpost_36comments.html");
        let t = try_extract_thread(&html, "https://old.reddit.com/r/rust/comments/abc123/t/")
            .expect("should parse");
        let p = t.post.expect("post");
        assert!(p.is_self, "self.rust domain → self post");
        assert_eq!(p.url, None, "self posts carry no external url");
        assert_eq!(p.subreddit.as_deref(), Some("rust"));
        assert!(
            p.body
                .as_deref()
                .unwrap_or("")
                .contains("IT project manager"),
            "self-text body should be extracted: {:?}",
            p.body
        );
    }

    #[test]
    fn real_comment_bodies_and_scores() {
        // The original bug: every comment body came back empty because
        // .usertext-body sits inside a <form>, not directly under .entry.
        let html = fixture("ebpf_6comments.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should parse");
        // 6 comments total: 5 top-level + 1 nested reply (admalledd under ejrh).
        assert_eq!(t.comments.len(), 5, "5 top-level comments");
        assert_eq!(total_comments(&t.comments), 6, "6 comments incl. nested");
        let teerre = t
            .comments
            .iter()
            .find(|c| c.author == "teerre")
            .expect("teerre");
        assert!(
            teerre.body.contains("Very cool blog"),
            "body must be populated, got {:?}",
            teerre.body
        );
        // Score comes from .score.unvoted title (the real value), not the
        // ±1 likes/dislikes siblings.
        assert_eq!(
            teerre.score,
            Some(10),
            "unvoted score, not dislikes(9)/likes(11)"
        );
        assert!(
            t.comments.iter().all(|c| !c.body.is_empty()),
            "no comment body should be empty"
        );
    }

    #[test]
    fn real_nested_comment_tree() {
        // pandas has structurally-nested replies (.child > .sitetable >
        // .comment). data-depth/data-replies are absent in logged-out HTML.
        let html = fixture("pandas_34comments.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should parse");
        // 34 rendered comments with content + 1 [deleted] node that old.reddit
        // still shows because it has live replies = 35 nodes in the tree.
        assert_eq!(
            total_comments(&t.comments),
            35,
            "all comments incl. nested + deleted"
        );
        let nested = t.comments.iter().any(|c| !c.replies.is_empty());
        assert!(nested, "at least one comment must have replies");
        let max_depth = {
            fn d(cs: &[RedditComment]) -> usize {
                cs.iter().map(|c| 1 + d(&c.replies)).max().unwrap_or(0)
            }
            d(&t.comments)
        };
        assert!(max_depth >= 2, "tree should be more than one level deep");
        let a_reply = t.comments.iter().find_map(|c| c.replies.first());
        assert_eq!(a_reply.map(|r| r.depth), Some(1));
    }

    #[test]
    fn real_morechildren_stubs_skipped() {
        // AskReddit deep thread: 259 .thing[data-fullname=t1_] markers, but
        // some are "load more comments" stubs (data-type=morechildren) with
        // no author/body. They must not appear as ghost comments.
        let html = fixture("askreddit_deep_morechildren.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
        )
        .expect("should parse");
        fn check(cs: &[RedditComment]) {
            for c in cs {
                let ghost = c.body.is_empty() && c.author == "[deleted]" && c.id.is_some();
                assert!(!ghost, "morechildren stub leaked as comment: {:?}", c.id);
                check(&c.replies);
            }
        }
        check(&t.comments);
    }

    #[test]
    fn real_hidden_score_is_none_not_zero() {
        // AskReddit has fresh comments with `.score-hidden` (no .score.unvoted
        // span). These must be None, distinct from a genuine 0-score comment.
        let html = fixture("askreddit_deep_morechildren.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
        )
        .expect("should parse");
        let mut all = Vec::new();
        collect(&t.comments, &mut all);
        assert!(
            all.iter().any(|c| c.score.is_none()),
            "some fresh comments have hidden scores → None"
        );
    }

    #[test]
    fn real_deleted_comment_preserves_subtree() {
        // pandas has a [deleted] comment that still has visible replies. The
        // structural walk must keep it so its children aren't orphaned.
        let html = fixture("pandas_34comments.html");
        let t = try_extract_thread(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should parse");
        let mut all = Vec::new();
        collect(&t.comments, &mut all);
        let deleted: Vec<_> = all.iter().filter(|c| c.author == "[deleted]").collect();
        assert!(!deleted.is_empty(), "should keep deleted comments");
        assert!(
            deleted.iter().any(|c| !c.replies.is_empty()),
            "a deleted comment with replies must retain its subtree"
        );
        assert!(deleted.iter().all(|c| !c.is_op));
    }

    #[test]
    fn real_markdown_is_commonmark_clean() {
        // Guards the markdown bugs the verification workflow found: no
        // whitespace-only "blank" lines, and ``` fences never indented 4+
        // spaces (which would turn them into literal indented code blocks).
        let html = fixture("elixir_60comments.html");
        let result = try_extract(
            &html,
            "https://old.reddit.com/r/programming/comments/abc123/t/",
        )
        .expect("should extract");
        let md = &result.content.markdown;
        assert!(md.starts_with("# "));
        assert!(md.contains("## Comments"));
        for line in md.lines() {
            assert!(
                !(line.starts_with(' ') && line.trim().is_empty()),
                "whitespace-only line: {line:?}"
            );
            let trimmed = line.trim_start_matches(['>', ' ']);
            if trimmed.starts_with("```") {
                let indent = line.len() - line.trim_start_matches(' ').len();
                assert!(indent < 4, "code fence indented {indent} spaces: {line:?}");
            }
        }
        assert!(result.metadata.word_count > 20);
    }
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -160,9 +160,6 @@ impl Response {
fn body(&self) -> &[u8] {
&self.body
}
fn is_success(&self) -> bool {
(200..300).contains(&self.status)
}
fn text(&self) -> std::borrow::Cow<'_, str> {
String::from_utf8_lossy(&self.body)
@ -299,32 +296,15 @@ impl FetchClient {
/// when you need literal no-rescue behavior (e.g. inside the rescue
/// logic itself to avoid recursion).
pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
// Reddit: the HTML page shows a verification interstitial for most
// client IPs, but appending `.json` returns the post + comment tree
// publicly. `parse_reddit_json` in downstream code knows how to read
// the result; here we just do the URL swap at the fetch layer.
if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
let json_url = crate::reddit::json_url(url);
// Reddit's public .json API serves JSON to identifiable bot
// User-Agents and blocks browser UAs with a verification wall.
// Override our Chrome-profile UA for this specific call.
let ua = concat!(
"Webclaw/",
env!("CARGO_PKG_VERSION"),
" (+https://webclaw.io)"
);
if let Ok(resp) = self
.fetch_with_headers(&json_url, &[("user-agent", ua)])
.await
&& resp.status == 200
{
let first = resp.html.trim_start().as_bytes().first().copied();
if matches!(first, Some(b'{') | Some(b'[')) {
return Ok(resp);
}
}
// If the .json fetch failed or returned HTML, fall through.
}
// Reddit: fetch old.reddit.com for stable server-rendered HTML.
// The JSON API is blocked; old.reddit.com works without JS or auth.
let owned;
let url = if crate::reddit::is_reddit_url(url) {
owned = crate::reddit::to_old_reddit_url(url);
owned.as_str()
} else {
url
};
let resp = self.fetch(url).await?;
@ -496,23 +476,16 @@ impl FetchClient {
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
let url = parsed_url.as_str();
// Reddit fallback: use their JSON API to get post + full comment tree.
if crate::reddit::is_reddit_url(url) {
let json_url = crate::reddit::json_url(url);
let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
debug!("reddit detected, fetching {json_url}");
let client = self.pick_client(url);
let resp = client.get(json_url.as_str()).send().await?;
let response = Response::from_wreq(resp).await?;
if response.is_success() {
let bytes = response.body();
match crate::reddit::parse_reddit_json(bytes, url) {
Ok(result) => return Ok(result),
Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
}
}
}
// Reddit: rewrite to old.reddit.com for stable server-rendered HTML.
// webclaw-core's Reddit fast path then parses the thread structure.
let reddit_owned;
let url = if crate::reddit::is_reddit_url(url) {
reddit_owned = crate::reddit::to_old_reddit_url(url);
debug!("reddit: rewriting to {reddit_owned}");
reddit_owned.as_str()
} else {
url
};
let start = Instant::now();
let client = self.pick_client(url);

View file

@ -1,12 +1,10 @@
//! Reddit structured extractor — returns the full post + comment tree
//! as typed JSON via Reddit's `.json` API.
//! Reddit structured extractor — parses old.reddit.com HTML.
//!
//! The same trick the markdown extractor in `crate::reddit` uses:
//! appending `.json` to any post URL returns the data the new SPA
//! frontend would load client-side. Zero antibot, zero JS rendering.
//! Fetches old.reddit.com (stable server-rendered HTML, no JS required)
//! and delegates parsing to `webclaw_core::reddit`. Returns a typed JSON
//! value with `{ url, post, comments }` structure.
use serde::Deserialize;
use serde_json::{Value, json};
use serde_json::Value;
use super::ExtractorInfo;
use crate::error::FetchError;
@ -24,182 +22,27 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
let is_reddit_host = matches!(
host,
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
);
is_reddit_host && url.contains("/comments/")
webclaw_core::reddit::is_reddit_url(url) && url.contains("/comments/")
}
pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
let json_url = build_json_url(url);
let resp = client.fetch(&json_url).await?;
let fetch_url = crate::reddit::to_old_reddit_url(url);
let resp = client.fetch(&fetch_url).await?;
if resp.status != 200 {
return Err(FetchError::Build(format!(
"reddit api returned status {}",
"reddit: unexpected status {}",
resp.status
)));
}
let listings: Vec<Listing> = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
let thread = webclaw_core::reddit::try_extract_thread(&resp.html, url).ok_or_else(|| {
FetchError::BodyDecode(
"reddit: page structure not recognised — is this a thread URL?".into(),
)
})?;
if listings.is_empty() {
return Err(FetchError::BodyDecode("reddit response empty".into()));
}
// First listing = the post (single t3 child).
let post = listings
.first()
.and_then(|l| l.data.children.first())
.filter(|t| t.kind == "t3")
.map(|t| post_json(&t.data))
.unwrap_or(Value::Null);
// Second listing = the comment tree.
let comments: Vec<Value> = listings
.get(1)
.map(|l| l.data.children.iter().filter_map(comment_json).collect())
.unwrap_or_default();
Ok(json!({
"url": url,
"post": post,
"comments": comments,
}))
}
// ---------------------------------------------------------------------------
// JSON shapers
// ---------------------------------------------------------------------------
/// Shape a post (`t3`) payload into the JSON object exposed under `"post"`.
///
/// Fields are copied through as deserialised, so any field that was absent
/// (`None`) serialises as JSON `null`. Two keys are not straight copies:
/// `"subreddit"` carries `subreddit_name_prefixed`, and `"url"` carries
/// `url_overridden_by_dest`.
fn post_json(d: &ThingData) -> Value {
json!({
"id": d.id,
"title": d.title,
"author": d.author,
"subreddit": d.subreddit_name_prefixed,
// The stored permalink is a site-relative path; make it absolute.
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
"url": d.url_overridden_by_dest,
"is_self": d.is_self,
"selftext": d.selftext,
"score": d.score,
"upvote_ratio": d.upvote_ratio,
"num_comments": d.num_comments,
"created_utc": d.created_utc,
"link_flair_text": d.link_flair_text,
"over_18": d.over_18,
"spoiler": d.spoiler,
"stickied": d.stickied,
"locked": d.locked,
})
}
/// Convert one comment node, together with its whole reply subtree, to JSON.
///
/// Only `t1` things are comments. Any other kind (for example the trailing
/// `more` placeholder Reddit injects at depth limits) produces `None`, which
/// lets callers drop it with `filter_map`. Replies are converted recursively
/// with the same rule; a missing or non-listing `replies` value yields an
/// empty array.
fn comment_json(thing: &Thing) -> Option<Value> {
    let is_comment = thing.kind == "t1";
    if !is_comment {
        return None;
    }

    let data = &thing.data;

    // Nested replies are themselves a listing of things; recurse into each.
    let mut replies: Vec<Value> = Vec::new();
    if let Some(Replies::Listing(listing)) = &data.replies {
        replies.extend(listing.data.children.iter().filter_map(comment_json));
    }

    // The stored permalink is a site-relative path; make it absolute.
    let permalink = data
        .permalink
        .as_ref()
        .map(|path| format!("https://www.reddit.com{path}"));

    Some(json!({
        "id": data.id,
        "author": data.author,
        "body": data.body,
        "score": data.score,
        "created_utc": data.created_utc,
        "is_submitter": data.is_submitter,
        "stickied": data.stickied,
        "depth": data.depth,
        "permalink": permalink,
        "replies": replies,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Extract the host portion of `url` by plain string splitting.
///
/// Takes the second `://`-delimited piece of the input (or the whole input
/// when there is no `://`) and cuts it at the first `/`. No URL parsing is
/// done, so a port, userinfo, or a `?`/`#` that directly follows the host
/// stays in the returned slice.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let after_scheme = pieces.next().unwrap_or(url);

    match after_scheme.find('/') {
        Some(slash) => &after_scheme[..slash],
        None => after_scheme,
    }
}
/// Derive the `.json` endpoint for a Reddit page URL.
///
/// Everything from the first `?` onward is discarded, trailing slashes are
/// trimmed, and `.json?raw_json=1` is appended. The host is left exactly as
/// the caller supplied it (`www.reddit.com`, `old.reddit.com`, ...): forcing
/// `old.reddit.com` looks tempting, but that host applies stricter UA-based
/// blocking than `www.reddit.com`, which accepts our Chrome-fingerprinted
/// client fine.
fn build_json_url(url: &str) -> String {
    let without_query = match url.find('?') {
        Some(question_mark) => &url[..question_mark],
        None => url,
    };

    let mut json_url = String::from(without_query.trim_end_matches('/'));
    json_url.push_str(".json?raw_json=1");
    json_url
}
// ---------------------------------------------------------------------------
// Reddit JSON types — only fields we render. Everything else is dropped.
// ---------------------------------------------------------------------------
/// Envelope of one listing in the Reddit response; only its `data` payload
/// is deserialised.
#[derive(Deserialize)]
struct Listing {
data: ListingData,
}
/// Payload of a [`Listing`]: the things (post, comments, placeholders) it
/// contains.
#[derive(Deserialize)]
struct ListingData {
children: Vec<Thing>,
}
/// A single tagged item in a listing.
///
/// `kind` is the discriminator the shapers check: `"t3"` for a post and
/// `"t1"` for a comment; other kinds are skipped.
#[derive(Deserialize)]
struct Thing {
kind: String,
data: ThingData,
}
/// Union of the post and comment fields this extractor renders.
///
/// One struct backs both kinds, so every field is optional. The section
/// comments below are a rough grouping only: `id` and `stickied` are also
/// read for comments, and `author`, `score`, `created_utc` and `permalink`
/// are also read for posts.
#[derive(Deserialize, Default)]
struct ThingData {
// post (t3)
id: Option<String>,
title: Option<String>,
selftext: Option<String>,
subreddit_name_prefixed: Option<String>,
url_overridden_by_dest: Option<String>,
is_self: Option<bool>,
upvote_ratio: Option<f64>,
num_comments: Option<i64>,
over_18: Option<bool>,
spoiler: Option<bool>,
stickied: Option<bool>,
locked: Option<bool>,
link_flair_text: Option<String>,
// comment (t1)
author: Option<String>,
body: Option<String>,
score: Option<i64>,
created_utc: Option<f64>,
is_submitter: Option<bool>,
depth: Option<i64>,
permalink: Option<String>,
// recursive
replies: Option<Replies>,
}
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
Listing(Listing),
#[allow(dead_code)]
Empty(String),
serde_json::to_value(&thread)
.map_err(|e| FetchError::BodyDecode(format!("reddit: serialisation error: {e}")))
}
#[cfg(test)]
@ -207,28 +50,17 @@ mod tests {
use super::*;
#[test]
fn matches_reddit_post_urls() {
fn matches_thread_urls() {
assert!(matches(
"https://www.reddit.com/r/rust/comments/abc123/some_title/"
));
assert!(matches(
"https://reddit.com/r/rust/comments/abc123/some_title"
));
assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
assert!(matches("https://reddit.com/r/rust/comments/abc/x"));
}
#[test]
fn rejects_non_post_reddit_urls() {
fn rejects_listing_and_non_reddit() {
assert!(!matches("https://www.reddit.com/r/rust"));
assert!(!matches("https://www.reddit.com/user/foo"));
assert!(!matches("https://example.com/r/rust/comments/x"));
}
#[test]
fn json_url_appends_suffix_and_drops_query() {
assert_eq!(
build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
"https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
);
assert!(!matches("https://example.com/r/rust/comments/abc/x"));
}
}

View file

@ -1,172 +1,56 @@
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
use webclaw_core::{Content, ExtractionResult, Metadata};
//! Reddit URL helpers for the fetch layer.
//!
//! The JSON API (`*.json`) is blocked. We rewrite all Reddit hosts to
//! `old.reddit.com`, which serves stable server-rendered HTML that
//! `webclaw-core::reddit` parses directly.
/// Check if a URL points to a Reddit post/comment page.
pub fn is_reddit_url(url: &str) -> bool {
let host = url
.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("");
matches!(
host,
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
)
webclaw_core::reddit::is_reddit_url(url)
}
/// Build the `.json` URL from a Reddit page URL.
pub fn json_url(url: &str) -> String {
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
format!("{clean}.json")
/// Rewrite the host of `url` to `old.reddit.com`, preserving the scheme,
/// path, query and fragment.
///
/// The host is replaced unconditionally: this function does not itself check
/// that `url` points at Reddit, so callers should gate on [`is_reddit_url`]
/// first. Everything between `://` and the first `/`, `?` or `#` is treated
/// as the host, so any userinfo or port is dropped with it.
///
/// Input that does not start with a `scheme://` prefix is returned
/// unchanged. A `://` that only appears after a `/`, `?` or `#` (for example
/// inside a query-string value of a scheme-less URL) is not a scheme
/// separator, and such input is likewise returned unchanged instead of
/// having its query rewritten.
pub fn to_old_reddit_url(url: &str) -> String {
    let Some((scheme, after)) = url.split_once("://") else {
        return url.to_string();
    };

    // A URL scheme can never contain '/', '?' or '#'. If one of them precedes
    // the first "://", that separator belongs to the path/query/fragment.
    if scheme.contains(['/', '?', '#']) {
        return url.to_string();
    }

    let host_end = after.find(['/', '?', '#']).unwrap_or(after.len());
    let rest = &after[host_end..];
    format!("{scheme}://old.reddit.com{rest}")
}
/// Convert Reddit JSON API response into an ExtractionResult.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
let listings: Vec<Listing> =
serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
#[cfg(test)]
mod tests {
use super::*;
let mut markdown = String::new();
let mut title = None;
let mut author = None;
let mut subreddit = None;
// First listing = the post itself
if let Some(post_listing) = listings.first() {
for child in &post_listing.data.children {
if child.kind == "t3" {
let d = &child.data;
title = d.title.clone();
author = d.author.clone();
subreddit = d.subreddit_name_prefixed.clone();
if let Some(ref t) = title {
markdown.push_str(&format!("# {t}\n\n"));
}
if let (Some(a), Some(sr)) = (&author, &subreddit) {
markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
}
if let Some(ref body) = d.selftext
&& !body.is_empty()
{
markdown.push_str(body);
markdown.push_str("\n\n");
}
if let Some(ref url_field) = d.url_overridden_by_dest
&& !url_field.is_empty()
{
markdown.push_str(&format!("[Link]({url_field})\n\n"));
}
markdown.push_str("---\n\n");
}
}
#[test]
fn rewrites_www_to_old() {
assert_eq!(
to_old_reddit_url("https://www.reddit.com/r/rust/comments/abc/x/"),
"https://old.reddit.com/r/rust/comments/abc/x/"
);
}
// Second listing = comment tree
if let Some(comment_listing) = listings.get(1) {
markdown.push_str("## Comments\n\n");
for child in &comment_listing.data.children {
render_comment(child, 0, &mut markdown);
}
#[test]
fn rewrites_bare_to_old() {
assert_eq!(
to_old_reddit_url("https://reddit.com/r/rust/"),
"https://old.reddit.com/r/rust/"
);
}
let word_count = markdown.split_whitespace().count();
debug!(word_count, "reddit json extracted");
Ok(ExtractionResult {
metadata: Metadata {
title,
description: None,
author,
published_date: None,
language: Some("en".into()),
url: Some(url.to_string()),
site_name: subreddit,
image: None,
favicon: None,
word_count,
},
content: Content {
markdown,
plain_text: String::new(),
links: vec![],
images: vec![],
code_blocks: vec![],
raw_html: None,
},
domain_data: None,
structured_data: vec![],
})
}
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
if thing.kind != "t1" {
return;
#[test]
fn preserves_old_reddit_unchanged() {
let url = "https://old.reddit.com/r/rust/comments/abc/x/?context=3";
assert_eq!(to_old_reddit_url(url), url);
}
let d = &thing.data;
let indent = " ".repeat(depth);
let author = d.author.as_deref().unwrap_or("[deleted]");
let body = d.body.as_deref().unwrap_or("[removed]");
let score = d.score.unwrap_or(0);
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
for line in body.lines() {
out.push_str(&format!("{indent} {line}\n"));
}
out.push('\n');
// Recurse into replies
if let Some(Replies::Listing(listing)) = &d.replies {
for child in &listing.data.children {
render_comment(child, depth + 1, out);
}
#[test]
fn preserves_query_and_hash() {
assert_eq!(
to_old_reddit_url("https://www.reddit.com/r/rust/?sort=top#anchor"),
"https://old.reddit.com/r/rust/?sort=top#anchor"
);
}
}
// --- Reddit JSON types (minimal) ---
/// Envelope of one listing in the Reddit JSON response; only its `data`
/// payload is deserialised.
#[derive(Deserialize)]
struct Listing {
data: ListingData,
}
/// Payload of a [`Listing`]: the things it contains.
#[derive(Deserialize)]
struct ListingData {
children: Vec<Thing>,
}
/// A single tagged item in a listing.
///
/// `kind` is the discriminator the renderer checks: `"t3"` for the post and
/// `"t1"` for a comment; other kinds are not rendered.
#[derive(Deserialize)]
struct Thing {
kind: String,
data: ThingData,
}
/// Minimal union of the post and comment fields needed for markdown output.
///
/// One struct backs both kinds, so every field is optional. The section
/// comments are a rough grouping: `author` is also read for the post.
#[derive(Deserialize)]
struct ThingData {
// Post fields (t3)
title: Option<String>,
selftext: Option<String>,
subreddit_name_prefixed: Option<String>,
url_overridden_by_dest: Option<String>,
// Comment fields (t1)
author: Option<String>,
body: Option<String>,
score: Option<i64>,
replies: Option<Replies>,
}
/// The `replies` field of a comment: either a nested [`Listing`] or an empty
/// string.
///
/// `#[serde(untagged)]` makes serde try the variants in declaration order, so
/// a nested listing is matched first and a bare string falls through to
/// `Empty`.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
Listing(Listing),
// Exists only so the string form deserialises; its payload is never read.
#[allow(dead_code)]
Empty(String),
}