mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
feat(reddit): parse old.reddit.com HTML instead of the dead .json API
Reddit blocked unauthenticated `.json` access, so the previous extractor returned block pages or timed out on every thread. Switch to parsing old.reddit.com's server-rendered HTML, which needs no API key or JS. Fetch layer: - Rewrite every Reddit host to old.reddit.com before fetching; drop all `.json` URL handling and the JSON response parser. Extraction (webclaw-core::reddit): - New HTML parser producing a typed post + nested comment tree. - Comments nest structurally (.comment > .child > .sitetable > .comment); old.reddit omits a usable depth attribute, so the tree is walked recursively. Bodies live in .entry > form > .usertext-body > .md. - Post metadata: title, author, subreddit, score, comment count (data-comments-count), self-vs-link (self class / self.* domain), flair, self-text body. - Comment scores read the .score.unvoted title (the displayed value, not the ±1 vote-state siblings); hidden scores are None, not 0. - Deleted comments are kept in place so their replies aren't orphaned; "load more comments" stubs are skipped. Markdown output: - Reply nesting via blockquote depth (avoids 4-space indentation turning text and code fences into broken indented-code blocks). - Links keep their target as [text](url); root-relative reddit links resolve against old.reddit.com. Nested lists indent correctly. - A recognised but unparseable /comments/ page returns no content rather than falling through to generic extraction of Reddit chrome. Tests: regression suite runs against real old.reddit.com fixtures (testdata/reddit/), the ground truth that surfaced the parsing and markdown bugs synthetic HTML had hidden. Fixtures are excluded from the published crate.
This commit is contained in:
parent
3b7d11328e
commit
217bfe088b
11 changed files with 2522 additions and 391 deletions
|
|
@ -4,6 +4,10 @@ description = "Pure HTML content extraction engine for LLMs"
|
|||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
# Reddit regression fixtures are real old.reddit.com pages read at test time;
|
||||
# they're large and only needed to run the test suite from the repo, so keep
|
||||
# them out of the published crate.
|
||||
exclude = ["testdata/reddit/*.html"]
|
||||
|
||||
[features]
|
||||
default = ["quickjs"]
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ pub mod markdown;
|
|||
pub mod metadata;
|
||||
#[allow(dead_code)]
|
||||
pub(crate) mod noise;
|
||||
pub mod reddit;
|
||||
pub mod structured_data;
|
||||
pub mod types;
|
||||
pub mod youtube;
|
||||
|
|
@ -94,6 +95,24 @@ fn extract_with_options_inner(
|
|||
return Err(ExtractError::NoContent);
|
||||
}
|
||||
|
||||
// Reddit fast path: parse old.reddit.com HTML directly.
|
||||
// The fetch layer rewrites all Reddit hosts to old.reddit.com before
|
||||
// calling extract, so we always get stable server-rendered HTML here.
|
||||
if let Some(u) = url
|
||||
&& reddit::is_reddit_url(u)
|
||||
{
|
||||
if let Some(result) = reddit::try_extract(html, u) {
|
||||
return Ok(result);
|
||||
}
|
||||
// A recognised comment thread that we couldn't parse (Reddit markup
|
||||
// change, or a block/challenge page) — don't fall through to generic
|
||||
// extraction, which would emit Reddit nav/sidebar chrome. Listings
|
||||
// and profiles (no `/comments/`) intentionally fall through below.
|
||||
if u.contains("/comments/") {
|
||||
return Err(ExtractError::NoContent);
|
||||
}
|
||||
}
|
||||
|
||||
// YouTube fast path: if the URL is a YouTube video page, try extracting
|
||||
// structured metadata from ytInitialPlayerResponse before DOM scoring.
|
||||
// This gives LLMs a clean, structured view of video metadata.
|
||||
|
|
|
|||
968
crates/webclaw-core/src/reddit.rs
Normal file
968
crates/webclaw-core/src/reddit.rs
Normal file
|
|
@ -0,0 +1,968 @@
|
|||
//! Reddit thread extractor — parses old.reddit.com HTML directly.
|
||||
//!
|
||||
//! old.reddit.com serves fully server-rendered HTML with stable class names
|
||||
//! and data attributes. No JS, no API key, no `.json` trick needed.
|
||||
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::{Content, DomainData, DomainType, ExtractionResult, Metadata};
|
||||
|
||||
// ─── Public types ──────────────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct RedditPost {
|
||||
pub id: Option<String>,
|
||||
pub title: String,
|
||||
pub author: String,
|
||||
pub subreddit: Option<String>,
|
||||
pub score: i64,
|
||||
pub body: Option<String>,
|
||||
pub num_comments: usize,
|
||||
pub permalink: String,
|
||||
pub url: Option<String>,
|
||||
pub is_self: bool,
|
||||
pub flair: Option<String>,
|
||||
pub created_utc: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct RedditComment {
|
||||
pub id: Option<String>,
|
||||
pub author: String,
|
||||
pub body: String,
|
||||
/// `None` when Reddit hides the score (fresh comments). Distinct from
|
||||
/// `Some(0)`, which is a real net-zero score.
|
||||
pub score: Option<i64>,
|
||||
pub depth: usize,
|
||||
pub is_op: bool,
|
||||
pub created_utc: Option<String>,
|
||||
pub replies: Vec<RedditComment>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct RedditThread {
|
||||
#[serde(rename = "url")]
|
||||
pub source_url: String,
|
||||
pub post: Option<RedditPost>,
|
||||
pub comments: Vec<RedditComment>,
|
||||
}
|
||||
|
||||
// ─── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
pub fn is_reddit_url(url: &str) -> bool {
|
||||
matches!(
|
||||
host_of(url),
|
||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
||||
)
|
||||
}
|
||||
|
||||
/// Try to parse a Reddit thread from old.reddit.com HTML.
|
||||
/// Returns `None` if the page doesn't have recognisable Reddit structure.
|
||||
pub fn try_extract_thread(html: &str, url: &str) -> Option<RedditThread> {
|
||||
if !url.contains("/comments/") {
|
||||
return None;
|
||||
}
|
||||
let doc = Html::parse_document(html);
|
||||
let post = parse_post(&doc);
|
||||
let op = post.as_ref().map(|p| p.author.as_str()).unwrap_or("");
|
||||
let comments = parse_comments(&doc, op);
|
||||
|
||||
if post.is_none() && comments.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(RedditThread {
|
||||
source_url: url.to_string(),
|
||||
post,
|
||||
comments,
|
||||
})
|
||||
}
|
||||
|
||||
/// Entry point for `webclaw-core`'s extraction fast path.
|
||||
pub fn try_extract(html: &str, url: &str) -> Option<ExtractionResult> {
|
||||
let thread = try_extract_thread(html, url)?;
|
||||
Some(to_extraction_result(&thread))
|
||||
}
|
||||
|
||||
// ─── ExtractionResult builder ──────────────────────────────────────────────────
|
||||
|
||||
fn to_extraction_result(thread: &RedditThread) -> ExtractionResult {
|
||||
let md = to_markdown(thread);
|
||||
let plain = plain_text(&md);
|
||||
let wc = md.split_whitespace().count();
|
||||
|
||||
let (title, author, site_name) = thread
|
||||
.post
|
||||
.as_ref()
|
||||
.map(|p| {
|
||||
(
|
||||
Some(p.title.clone()),
|
||||
Some(p.author.clone()),
|
||||
p.subreddit.clone(),
|
||||
)
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
ExtractionResult {
|
||||
metadata: Metadata {
|
||||
title,
|
||||
description: None,
|
||||
author,
|
||||
published_date: None,
|
||||
language: Some("en".to_string()),
|
||||
url: Some(thread.source_url.clone()),
|
||||
site_name,
|
||||
image: None,
|
||||
favicon: None,
|
||||
word_count: wc,
|
||||
},
|
||||
content: Content {
|
||||
markdown: md,
|
||||
plain_text: plain,
|
||||
links: vec![],
|
||||
images: vec![],
|
||||
code_blocks: vec![],
|
||||
raw_html: None,
|
||||
},
|
||||
domain_data: Some(DomainData {
|
||||
domain_type: DomainType::Social,
|
||||
}),
|
||||
structured_data: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Markdown rendering ────────────────────────────────────────────────────────
|
||||
|
||||
pub fn to_markdown(thread: &RedditThread) -> String {
|
||||
let mut out = String::new();
|
||||
|
||||
if let Some(p) = &thread.post {
|
||||
out.push_str(&format!("# {}\n\n", p.title));
|
||||
|
||||
let pts = pt_label(Some(p.score));
|
||||
let cmt = match p.num_comments {
|
||||
0 => String::new(),
|
||||
1 => " · 1 comment".to_string(),
|
||||
n => format!(" · {n} comments"),
|
||||
};
|
||||
let sub = p.subreddit.as_deref().unwrap_or("?");
|
||||
out.push_str(&format!("**u/{}** · r/{sub} · {pts}{cmt}\n\n", p.author));
|
||||
|
||||
if let Some(ref body) = p.body
|
||||
&& !body.is_empty()
|
||||
{
|
||||
out.push_str(body);
|
||||
out.push_str("\n\n");
|
||||
}
|
||||
if let Some(ref link) = p.url
|
||||
&& !p.is_self
|
||||
{
|
||||
out.push_str(&format!("[Link]({link})\n\n"));
|
||||
}
|
||||
out.push_str("---\n\n");
|
||||
}
|
||||
|
||||
if !thread.comments.is_empty() {
|
||||
out.push_str("## Comments\n\n");
|
||||
for c in &thread.comments {
|
||||
render_comment(c, &mut out);
|
||||
}
|
||||
}
|
||||
|
||||
collapse_blank_lines(out.trim_end())
|
||||
}
|
||||
|
||||
/// Render one comment + its replies. Nesting is expressed with blockquote
|
||||
/// depth (`> ` per level) rather than leading spaces: space-indentation of
|
||||
/// 4+ would turn ordinary text and ``` fences into CommonMark indented code
|
||||
/// blocks, corrupting any comment at depth ≥ 2.
|
||||
fn render_comment(c: &RedditComment, out: &mut String) {
|
||||
let q = "> ".repeat(c.depth);
|
||||
let blank = ">".repeat(c.depth);
|
||||
let author = if c.is_op {
|
||||
format!("**u/{} [OP]**", c.author)
|
||||
} else {
|
||||
format!("**u/{}**", c.author)
|
||||
};
|
||||
out.push_str(&format!("{q}{author} · {}\n", pt_label(c.score)));
|
||||
for line in c.body.lines() {
|
||||
if line.is_empty() {
|
||||
out.push_str(&blank);
|
||||
out.push('\n');
|
||||
} else {
|
||||
out.push_str(&q);
|
||||
out.push_str(line);
|
||||
out.push('\n');
|
||||
}
|
||||
}
|
||||
out.push('\n');
|
||||
for reply in &c.replies {
|
||||
render_comment(reply, out);
|
||||
}
|
||||
}
|
||||
|
||||
/// Human-readable score label: `"score hidden"` for `None`, singular `pt`
/// for ±1, plural `pts` for every other value.
fn pt_label(n: Option<i64>) -> String {
    let Some(points) = n else {
        return "score hidden".to_string();
    };
    let unit = if points == 1 || points == -1 {
        "pt"
    } else {
        "pts"
    };
    format!("{points} {unit}")
}
|
||||
|
||||
/// Limit every run of consecutive newlines to at most two, so that blockquote
/// prefixes and `<pre>` spacing never leave more than one blank line.
fn collapse_blank_lines(s: &str) -> String {
    let mut run = 0usize;
    let mut collapsed = String::with_capacity(s.len());
    collapsed.extend(s.chars().filter(|&ch| {
        // Count the current newline run; any other character resets it.
        run = if ch == '\n' { run + 1 } else { 0 };
        run <= 2
    }));
    collapsed
}
|
||||
|
||||
/// Derive a plain-text rendering from the generated markdown, line by line.
///
/// Each line loses its leading whitespace, at most ONE leading blockquote
/// marker, any leading `#` heading markers, and all emphasis / strikethrough /
/// code markers. Only a single `>` is removed on purpose: stripping greedily
/// would eat legitimate `>`-prefixed quoted content.
fn plain_text(md: &str) -> String {
    let mut cleaned: Vec<String> = Vec::new();
    for raw in md.lines() {
        let line = raw.trim_start();
        let line = if let Some(rest) = line.strip_prefix("> ") {
            rest
        } else if let Some(rest) = line.strip_prefix('>') {
            rest
        } else {
            line
        };
        let line = line.trim_start_matches('#').trim_start();
        // The order of these removals is significant; keep it.
        let without_bold = line.replace("**", "");
        let without_strike = without_bold.replace("~~", "");
        cleaned.push(without_strike.replace(['*', '`'], ""));
    }
    cleaned.join("\n")
}
|
||||
|
||||
// ─── HTML parsing ──────────────────────────────────────────────────────────────
|
||||
|
||||
fn parse_post(doc: &Html) -> Option<RedditPost> {
|
||||
let sel = Selector::parse("#siteTable .thing.link").ok()?;
|
||||
let thing = doc.select(&sel).next()?;
|
||||
let v = thing.value();
|
||||
|
||||
let id = v
|
||||
.attr("data-fullname")
|
||||
.map(|s| s.trim_start_matches("t3_").to_string());
|
||||
let author = v.attr("data-author").unwrap_or("[deleted]").to_string();
|
||||
let subreddit = v.attr("data-subreddit").map(str::to_string);
|
||||
let score: i64 = v
|
||||
.attr("data-score")
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(0);
|
||||
let num_comments: usize = v
|
||||
.attr("data-comments-count")
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(0);
|
||||
let permalink_path = v.attr("data-permalink").unwrap_or("");
|
||||
let permalink = format!("https://old.reddit.com{permalink_path}");
|
||||
// Self-posts carry the `self` class and a `self.<sub>` domain; their
|
||||
// data-url points back at the permalink rather than an external site.
|
||||
let is_self = v.has_class("self", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
|| v.attr("data-domain")
|
||||
.is_some_and(|d| d.starts_with("self."));
|
||||
let link_url = v.attr("data-url").map(str::to_string);
|
||||
let url = if is_self { None } else { link_url };
|
||||
|
||||
// Title
|
||||
let sel_title = Selector::parse(".title a.title").ok()?;
|
||||
let title = thing
|
||||
.select(&sel_title)
|
||||
.next()
|
||||
.map(|el| el.text().collect::<String>().trim().to_string())
|
||||
.filter(|s| !s.is_empty())?;
|
||||
|
||||
// Flair
|
||||
let flair = Selector::parse(".linkflairlabel")
|
||||
.ok()
|
||||
.and_then(|s| thing.select(&s).next())
|
||||
.map(|el| el.text().collect::<String>().trim().to_string())
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
// Self-text body: thing > .entry > .expando > .usertext-body [> .md]
|
||||
let body = direct_child(thing, "entry")
|
||||
.and_then(|entry| find_class(entry, "expando"))
|
||||
.and_then(|expando| find_class(expando, "usertext-body"))
|
||||
.and_then(|ut| find_class(ut, "md"))
|
||||
.map(md_to_markdown)
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
// Datetime
|
||||
let created_utc = Selector::parse("time[datetime]")
|
||||
.ok()
|
||||
.and_then(|s| thing.select(&s).next())
|
||||
.and_then(|t| t.value().attr("datetime"))
|
||||
.map(str::to_string);
|
||||
|
||||
Some(RedditPost {
|
||||
id,
|
||||
title,
|
||||
author,
|
||||
subreddit,
|
||||
score,
|
||||
body,
|
||||
num_comments,
|
||||
permalink,
|
||||
url,
|
||||
is_self,
|
||||
flair,
|
||||
created_utc,
|
||||
})
|
||||
}
|
||||
|
||||
// ─── Comment parsing ───────────────────────────────────────────────────────────
|
||||
//
|
||||
// old.reddit.com nests comments structurally, not via a depth attribute:
|
||||
//
|
||||
// .commentarea
|
||||
// .sitetable.nestedlisting
|
||||
// .comment.thing ← root comment
|
||||
// .entry → form → .usertext-body → .md ← its own body
|
||||
// .child
|
||||
// .sitetable.listing
|
||||
// .comment.thing ← reply (recurse)
|
||||
//
|
||||
// `data-depth`/`data-replies` are absent or always "0" in the logged-out
|
||||
// HTML, so we walk the tree by recursing into each comment's `.child`.
|
||||
|
||||
fn parse_comments(doc: &Html, op: &str) -> Vec<RedditComment> {
|
||||
// Root listing is `.sitetable.nestedlisting` inside `.commentarea`
|
||||
// (note: `commentarea` is a class on old.reddit, not an id). Fall back
|
||||
// to the first `.nestedlisting` anywhere for comment-permalink pages.
|
||||
let listing = Selector::parse(".commentarea .sitetable.nestedlisting")
|
||||
.ok()
|
||||
.and_then(|s| doc.select(&s).next())
|
||||
.or_else(|| {
|
||||
Selector::parse(".sitetable.nestedlisting")
|
||||
.ok()
|
||||
.and_then(|s| doc.select(&s).next())
|
||||
});
|
||||
|
||||
match listing {
|
||||
Some(l) => walk_comment_level(l, op, 0),
|
||||
None => vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the direct-child `.comment.thing` elements of a comment listing.
|
||||
fn walk_comment_level(listing: ElementRef, op: &str, depth: usize) -> Vec<RedditComment> {
|
||||
listing
|
||||
.children()
|
||||
.filter_map(ElementRef::wrap)
|
||||
.filter(|c| {
|
||||
let val = c.value();
|
||||
val.has_class("comment", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
&& val.has_class("thing", scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
})
|
||||
.filter_map(|c| parse_one_comment(c, op, depth))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn parse_one_comment(c: ElementRef, op: &str, depth: usize) -> Option<RedditComment> {
|
||||
let v = c.value();
|
||||
|
||||
// "load more comments" placeholders are `.thing` with type=morechildren.
|
||||
// They carry a t1_ fullname but no real content — skip them.
|
||||
if v.attr("data-type") == Some("morechildren")
|
||||
|| v.has_class(
|
||||
"morechildren",
|
||||
scraper::CaseSensitivity::AsciiCaseInsensitive,
|
||||
)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let is_deleted = v.has_class("deleted", scraper::CaseSensitivity::AsciiCaseInsensitive);
|
||||
let id = v
|
||||
.attr("data-fullname")
|
||||
.map(|s| s.trim_start_matches("t1_").to_string());
|
||||
let author = v
|
||||
.attr("data-author")
|
||||
.filter(|a| !a.is_empty())
|
||||
.unwrap_or("[deleted]")
|
||||
.to_string();
|
||||
|
||||
// Own body lives in `.entry > form > .usertext-body > .md`. `.child`
|
||||
// (nested replies) is a sibling of `.entry`, so descending within
|
||||
// `.entry` never crosses into a reply's body.
|
||||
let entry = direct_child(c, "entry");
|
||||
let body = entry
|
||||
.and_then(|e| find_class(e, "usertext-body"))
|
||||
.and_then(|ut| find_class(ut, "md"))
|
||||
.map(md_to_markdown)
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| {
|
||||
if is_deleted {
|
||||
"[removed]".into()
|
||||
} else {
|
||||
String::new()
|
||||
}
|
||||
});
|
||||
|
||||
// Displayed score is `.score.unvoted`, whose `title` holds the exact
|
||||
// integer (the sibling likes/dislikes spans are ±1). Hidden-score
|
||||
// comments have no `.score.unvoted` span, so `comment_score` returns
|
||||
// None — kept distinct from a genuine 0.
|
||||
let score = entry.and_then(comment_score);
|
||||
|
||||
let created_utc = entry
|
||||
.and_then(|e| Selector::parse("time[datetime]").ok().map(|s| (e, s)))
|
||||
.and_then(|(e, s)| e.select(&s).next())
|
||||
.and_then(|t| t.value().attr("datetime"))
|
||||
.map(str::to_string);
|
||||
|
||||
let is_op = !is_deleted && author != "[deleted]" && author == op;
|
||||
|
||||
// Replies: `.comment > .child > .sitetable > .comment`.
|
||||
let replies = direct_child(c, "child")
|
||||
.and_then(|child| direct_child(child, "sitetable"))
|
||||
.map(|st| walk_comment_level(st, op, depth + 1))
|
||||
.unwrap_or_default();
|
||||
|
||||
Some(RedditComment {
|
||||
id,
|
||||
author,
|
||||
body,
|
||||
score,
|
||||
depth,
|
||||
is_op,
|
||||
created_utc,
|
||||
replies,
|
||||
})
|
||||
}
|
||||
|
||||
/// Read a comment's score from the `.score.unvoted` span inside `.entry`.
|
||||
/// Prefers the `title` attribute (exact integer); falls back to the text.
|
||||
/// Returns `None` when Reddit hides the score (no `.score.unvoted` span).
|
||||
fn comment_score(entry: ElementRef) -> Option<i64> {
|
||||
let sel = Selector::parse("span.score.unvoted").ok()?;
|
||||
let span = entry.select(&sel).next()?;
|
||||
span.value()
|
||||
.attr("title")
|
||||
.and_then(|t| t.trim().parse().ok())
|
||||
.or_else(|| parse_score(&span.text().collect::<String>()))
|
||||
}
|
||||
|
||||
// ─── DOM helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/// First direct child element whose class list includes `class`.
|
||||
fn direct_child<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
|
||||
el.children().filter_map(ElementRef::wrap).find(|c| {
|
||||
c.value()
|
||||
.has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
})
|
||||
}
|
||||
|
||||
/// First descendant (any depth) whose class list includes `class`.
|
||||
fn find_class<'a>(el: ElementRef<'a>, class: &str) -> Option<ElementRef<'a>> {
|
||||
el.children().filter_map(ElementRef::wrap).find_map(|c| {
|
||||
if c.value()
|
||||
.has_class(class, scraper::CaseSensitivity::AsciiCaseInsensitive)
|
||||
{
|
||||
Some(c)
|
||||
} else {
|
||||
find_class(c, class)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Parse the first whitespace-separated token of `text` as an integer score,
/// accepting the typographic minus sign as well as an ASCII `-`.
fn parse_score(text: &str) -> Option<i64> {
    let first = text.split_whitespace().next()?;
    first.replace('−', "-").parse().ok()
}
|
||||
|
||||
// ─── .md div → markdown ────────────────────────────────────────────────────────
|
||||
|
||||
fn md_to_markdown(el: ElementRef) -> String {
|
||||
let mut out = String::new();
|
||||
render_children(el, &mut out);
|
||||
out.trim().to_string()
|
||||
}
|
||||
|
||||
fn render_children(el: ElementRef, out: &mut String) {
|
||||
use scraper::node::Node;
|
||||
for child in el.children() {
|
||||
match child.value() {
|
||||
Node::Text(t) => out.push_str(t.as_ref()),
|
||||
Node::Element(_) => {
|
||||
if let Some(c) = ElementRef::wrap(child) {
|
||||
render_node(c, out);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn render_node(el: ElementRef, out: &mut String) {
|
||||
match el.value().name() {
|
||||
"p" | "div" => {
|
||||
let mut inner = String::new();
|
||||
render_children(el, &mut inner);
|
||||
let t = inner.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(t);
|
||||
out.push_str("\n\n");
|
||||
}
|
||||
}
|
||||
"br" => out.push('\n'),
|
||||
"strong" | "b" => {
|
||||
let t: String = el.text().collect();
|
||||
let t = t.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(&format!("**{t}**"));
|
||||
}
|
||||
}
|
||||
"em" | "i" => {
|
||||
let t: String = el.text().collect();
|
||||
let t = t.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(&format!("*{t}*"));
|
||||
}
|
||||
}
|
||||
"del" | "s" | "strike" => {
|
||||
let t: String = el.text().collect();
|
||||
let t = t.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(&format!("~~{t}~~"));
|
||||
}
|
||||
}
|
||||
"code" => {
|
||||
let t: String = el.text().collect();
|
||||
out.push('`');
|
||||
out.push_str(t.trim());
|
||||
out.push('`');
|
||||
}
|
||||
"pre" => {
|
||||
let t: String = el.text().collect();
|
||||
out.push_str("```\n");
|
||||
out.push_str(t.trim_end_matches('\n'));
|
||||
out.push_str("\n```\n\n");
|
||||
}
|
||||
"a" => {
|
||||
let text: String = el.text().collect();
|
||||
let text = text.trim();
|
||||
if !text.is_empty() {
|
||||
// Preserve the destination as a markdown link. Resolve
|
||||
// root-relative reddit hrefs (/r/, /user/, /wiki/, ...) and
|
||||
// drop non-navigational ones (javascript:, #fragment, mailto:).
|
||||
let href = el.value().attr("href").unwrap_or("");
|
||||
if href.starts_with("http://") || href.starts_with("https://") {
|
||||
out.push_str(&format!("[{text}]({href})"));
|
||||
} else if href.starts_with('/') {
|
||||
out.push_str(&format!("[{text}](https://old.reddit.com{href})"));
|
||||
} else {
|
||||
out.push_str(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
"blockquote" => {
|
||||
let mut inner = String::new();
|
||||
render_children(el, &mut inner);
|
||||
let trimmed = inner.trim();
|
||||
for line in trimmed.lines() {
|
||||
out.push('>');
|
||||
if !line.is_empty() {
|
||||
out.push(' ');
|
||||
out.push_str(line);
|
||||
}
|
||||
out.push('\n');
|
||||
}
|
||||
out.push('\n');
|
||||
}
|
||||
"ul" => render_list(el, false, 0, out),
|
||||
"ol" => render_list(el, true, 0, out),
|
||||
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
|
||||
let level = el
|
||||
.value()
|
||||
.name()
|
||||
.chars()
|
||||
.nth(1)
|
||||
.and_then(|c| c.to_digit(10))
|
||||
.unwrap_or(2) as usize;
|
||||
let t: String = el.text().collect();
|
||||
let t = t.trim();
|
||||
if !t.is_empty() {
|
||||
out.push_str(&"#".repeat(level));
|
||||
out.push(' ');
|
||||
out.push_str(t);
|
||||
out.push_str("\n\n");
|
||||
}
|
||||
}
|
||||
"hr" => out.push_str("---\n\n"),
|
||||
"sup" => {
|
||||
let t: String = el.text().collect();
|
||||
out.push_str(t.trim());
|
||||
}
|
||||
// Unknown / generic containers: recurse
|
||||
_ => render_children(el, out),
|
||||
}
|
||||
}
|
||||
|
||||
/// Render a `<ul>`/`<ol>`, indenting nested lists by two spaces per level so
|
||||
/// child items keep their own line instead of being glued to the parent.
|
||||
fn render_list(list: ElementRef, ordered: bool, indent: usize, out: &mut String) {
|
||||
use scraper::node::Node;
|
||||
let pad = " ".repeat(indent);
|
||||
let mut n = 0;
|
||||
for li in list
|
||||
.children()
|
||||
.filter_map(ElementRef::wrap)
|
||||
.filter(|c| c.value().name() == "li")
|
||||
{
|
||||
n += 1;
|
||||
// Inline content of this <li>, excluding nested lists (rendered after).
|
||||
let mut inline = String::new();
|
||||
for child in li.children() {
|
||||
match child.value() {
|
||||
Node::Text(t) => inline.push_str(t.as_ref()),
|
||||
Node::Element(e) if e.name() == "ul" || e.name() == "ol" => {}
|
||||
Node::Element(_) => {
|
||||
if let Some(c) = ElementRef::wrap(child) {
|
||||
render_node(c, &mut inline);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let marker = if ordered {
|
||||
format!("{n}. ")
|
||||
} else {
|
||||
"- ".to_string()
|
||||
};
|
||||
out.push_str(&format!("{pad}{marker}{}\n", inline.trim()));
|
||||
|
||||
for child in li.children().filter_map(ElementRef::wrap) {
|
||||
match child.value().name() {
|
||||
"ul" => render_list(child, false, indent + 1, out),
|
||||
"ol" => render_list(child, true, indent + 1, out),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
if indent == 0 {
|
||||
out.push('\n');
|
||||
}
|
||||
}
|
||||
|
||||
// ─── URL helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/// Extract the bare host from `url` without allocating.
///
/// Removes the scheme (if any), everything from the first `/`, `?` or `#`,
/// any `userinfo@` prefix and any `:port` suffix. A bracketed IPv6 literal is
/// returned with its brackets. Letter case is left unchanged, and the empty
/// string is returned when there is no host.
fn host_of(url: &str) -> &str {
    // Treat `://` as the scheme separator only when nothing path-like comes
    // before it; otherwise a schemeless URL with `://` in its query string
    // would be read as having that query as its scheme.
    let after_scheme = match url.split_once("://") {
        Some((scheme, rest)) if !scheme.contains(['/', '?', '#']) => rest,
        _ => url,
    };

    let authority = after_scheme.split(['/', '?', '#']).next().unwrap_or("");

    // The host follows the LAST `@`, so `reddit.com@evil.example` resolves to
    // `evil.example` and never to the userinfo part.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host)| host);

    if host_port.starts_with('[') {
        // IPv6 literal: colons inside the brackets are not port separators.
        return match host_port.find(']') {
            Some(end) => &host_port[..=end],
            None => host_port,
        };
    }

    host_port
        .split_once(':')
        .map_or(host_port, |(host, _port)| host)
}
|
||||
|
||||
// ─── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn is_reddit_url_recognises_variants() {
|
||||
assert!(is_reddit_url(
|
||||
"https://www.reddit.com/r/rust/comments/abc/x/"
|
||||
));
|
||||
assert!(is_reddit_url(
|
||||
"https://old.reddit.com/r/rust/comments/abc/x/"
|
||||
));
|
||||
assert!(is_reddit_url("https://reddit.com/r/rust/comments/abc/x/"));
|
||||
assert!(!is_reddit_url("https://example.com"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn try_extract_thread_returns_none_for_listing_url() {
|
||||
let html = "<html><body></body></html>";
|
||||
assert!(try_extract_thread(html, "https://old.reddit.com/r/rust/").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn md_to_markdown_basic() {
|
||||
let html =
|
||||
Html::parse_fragment(r#"<div class="md"><p>Hello <strong>world</strong>!</p></div>"#);
|
||||
let sel = Selector::parse(".md").unwrap();
|
||||
let el = html.select(&sel).next().unwrap();
|
||||
let md = md_to_markdown(el);
|
||||
assert!(md.contains("**world**"));
|
||||
assert!(md.contains("Hello"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn md_to_markdown_blockquote_and_code() {
|
||||
let html = Html::parse_fragment(
|
||||
r#"<div class="md"><blockquote><p>Quoted</p></blockquote><pre><code>fn main() {}</code></pre></div>"#,
|
||||
);
|
||||
let sel = Selector::parse(".md").unwrap();
|
||||
let el = html.select(&sel).next().unwrap();
|
||||
let md = md_to_markdown(el);
|
||||
assert!(md.contains("> Quoted"));
|
||||
assert!(md.contains("```"));
|
||||
assert!(md.contains("fn main()"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn md_to_markdown_link_preserves_href() {
|
||||
let abs = Html::parse_fragment(
|
||||
r#"<div class="md"><p>see <a href="https://example.com/x">this</a></p></div>"#,
|
||||
);
|
||||
let sel = Selector::parse(".md").unwrap();
|
||||
let el = abs.select(&sel).next().unwrap();
|
||||
assert!(md_to_markdown(el).contains("[this](https://example.com/x)"));
|
||||
|
||||
// Root-relative reddit links resolve against old.reddit.com.
|
||||
let rel = Html::parse_fragment(
|
||||
r#"<div class="md"><p><a href="/r/rust/wiki/faq">faq</a></p></div>"#,
|
||||
);
|
||||
let el = rel.select(&sel).next().unwrap();
|
||||
assert!(md_to_markdown(el).contains("[faq](https://old.reddit.com/r/rust/wiki/faq)"));
|
||||
|
||||
// javascript: / fragment hrefs degrade to bare text.
|
||||
let js = Html::parse_fragment(
|
||||
r#"<div class="md"><p><a href="javascript:void(0)">x</a></p></div>"#,
|
||||
);
|
||||
let el = js.select(&sel).next().unwrap();
|
||||
let out = md_to_markdown(el);
|
||||
assert!(out.contains('x') && !out.contains("javascript"));
|
||||
}
|
||||
|
||||
// ── Regression tests against REAL old.reddit.com HTML ──────────────────
|
||||
//
|
||||
// These fixtures are genuine pages fetched from old.reddit.com (see
|
||||
// testdata/reddit/). They are the ground truth — synthetic HTML is too
|
||||
// easy to write to match wrong assumptions, which is exactly how the
|
||||
// first version of this parser shipped silently broken.
|
||||
|
||||
fn fixture(name: &str) -> String {
|
||||
std::fs::read_to_string(format!("testdata/reddit/{name}")).unwrap()
|
||||
}
|
||||
|
||||
fn total_comments(cs: &[RedditComment]) -> usize {
|
||||
cs.len() + cs.iter().map(|c| total_comments(&c.replies)).sum::<usize>()
|
||||
}
|
||||
|
||||
fn collect<'a>(cs: &'a [RedditComment], out: &mut Vec<&'a RedditComment>) {
|
||||
for c in cs {
|
||||
out.push(c);
|
||||
collect(&c.replies, out);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn real_link_post_metadata() {
|
||||
// pandas: external-link post (blog.geekuni.com), 34 comments.
|
||||
let html = fixture("pandas_34comments.html");
|
||||
let t = try_extract_thread(
|
||||
&html,
|
||||
"https://old.reddit.com/r/programming/comments/abc123/t/",
|
||||
)
|
||||
.expect("should parse");
|
||||
let p = t.post.expect("post");
|
||||
assert_eq!(p.author, "Horror-Willingness74");
|
||||
assert_eq!(p.subreddit.as_deref(), Some("programming"));
|
||||
assert_eq!(p.score, 43);
|
||||
assert_eq!(p.num_comments, 34, "data-comments-count");
|
||||
assert!(!p.is_self, "external blog link, not a self post");
|
||||
assert_eq!(
|
||||
p.url.as_deref(),
|
||||
Some("https://blog.geekuni.com/2026/06/why-learn-pandas.html")
|
||||
);
|
||||
assert!(p.title.contains("Pandas"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn real_self_post_metadata() {
|
||||
// A self-post (text) on r/rust: `self.rust` domain, self-text body,
|
||||
// no external url.
|
||||
let html = fixture("rust_selfpost_36comments.html");
|
||||
let t = try_extract_thread(&html, "https://old.reddit.com/r/rust/comments/abc123/t/")
|
||||
.expect("should parse");
|
||||
let p = t.post.expect("post");
|
||||
assert!(p.is_self, "self.rust domain → self post");
|
||||
assert_eq!(p.url, None, "self posts carry no external url");
|
||||
assert_eq!(p.subreddit.as_deref(), Some("rust"));
|
||||
assert!(
|
||||
p.body
|
||||
.as_deref()
|
||||
.unwrap_or("")
|
||||
.contains("IT project manager"),
|
||||
"self-text body should be extracted: {:?}",
|
||||
p.body
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn real_comment_bodies_and_scores() {
    // Regression guard for the original failure mode: every comment body
    // came back empty because .usertext-body is nested inside a <form>
    // rather than sitting directly under .entry.
    let page = fixture("ebpf_6comments.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should parse");

    // The fixture holds 6 comments: 5 at top level plus 1 nested reply
    // (admalledd under ejrh).
    assert_eq!(thread.comments.len(), 5, "5 top-level comments");
    assert_eq!(
        total_comments(&thread.comments),
        6,
        "6 comments incl. nested"
    );

    let by_teerre = thread
        .comments
        .iter()
        .find(|c| c.author == "teerre")
        .expect("teerre");
    let body_populated = by_teerre.body.contains("Very cool blog");
    assert!(
        body_populated,
        "body must be populated, got {:?}",
        by_teerre.body
    );

    // The expected score is the .score.unvoted title (the real value), not
    // the ±1 likes/dislikes siblings.
    assert_eq!(
        by_teerre.score,
        Some(10),
        "unvoted score, not dislikes(9)/likes(11)"
    );

    let any_empty_body = thread.comments.iter().any(|c| c.body.is_empty());
    assert!(!any_empty_body, "no comment body should be empty");
}
|
||||
|
||||
#[test]
fn real_nested_comment_tree() {
    /// Height of a reply forest: 0 for an empty slice, otherwise one more
    /// than the tallest subtree hanging off any node.
    fn height(nodes: &[RedditComment]) -> usize {
        nodes
            .iter()
            .fold(0, |tallest, node| tallest.max(1 + height(&node.replies)))
    }

    // The pandas fixture has structurally-nested replies (.child >
    // .sitetable > .comment); data-depth/data-replies are absent in
    // logged-out HTML.
    let page = fixture("pandas_34comments.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should parse");

    // 34 rendered comments with content, plus 1 [deleted] node that
    // old.reddit still shows because it has live replies: 35 tree nodes.
    assert_eq!(
        total_comments(&thread.comments),
        35,
        "all comments incl. nested + deleted"
    );

    let has_replies = thread.comments.iter().any(|c| !c.replies.is_empty());
    assert!(has_replies, "at least one comment must have replies");

    let max_depth = height(&thread.comments);
    assert!(max_depth >= 2, "tree should be more than one level deep");

    // First reply of the first top-level comment that has any replies.
    let first_reply = thread
        .comments
        .iter()
        .find(|c| !c.replies.is_empty())
        .and_then(|c| c.replies.first());
    assert_eq!(first_reply.map(|r| r.depth), Some(1));
}
|
||||
|
||||
#[test]
fn real_morechildren_stubs_skipped() {
    // AskReddit deep thread: 259 .thing[data-fullname=t1_] markers, some of
    // which are "load more comments" stubs (data-type=morechildren) with no
    // author/body. None of them may surface as a ghost comment.
    let page = fixture("askreddit_deep_morechildren.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
    )
    .expect("should parse");

    // Pre-order walk over the whole tree using an explicit stack. Items are
    // pushed in reverse so they pop in their original order.
    let mut pending: Vec<&RedditComment> = thread.comments.iter().rev().collect();
    while let Some(comment) = pending.pop() {
        let looks_like_stub =
            comment.id.is_some() && comment.author == "[deleted]" && comment.body.is_empty();
        assert!(
            !looks_like_stub,
            "morechildren stub leaked as comment: {:?}",
            comment.id
        );
        pending.extend(comment.replies.iter().rev());
    }
}
|
||||
|
||||
#[test]
fn real_hidden_score_is_none_not_zero() {
    // The AskReddit fixture contains fresh comments rendered with
    // `.score-hidden` (no .score.unvoted span). Those must surface as None
    // so they stay distinguishable from a genuine 0-score comment.
    let page = fixture("askreddit_deep_morechildren.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/AskReddit/comments/abc123/t/",
    )
    .expect("should parse");

    // Gather every comment via the shared `collect` helper.
    let mut flattened = Vec::new();
    collect(&thread.comments, &mut flattened);

    let has_hidden_score = flattened.iter().any(|c| c.score.is_none());
    assert!(
        has_hidden_score,
        "some fresh comments have hidden scores → None"
    );
}
|
||||
|
||||
#[test]
fn real_deleted_comment_preserves_subtree() {
    // The pandas fixture has a [deleted] comment that still has visible
    // replies. The structural walk has to keep that node, otherwise its
    // children would be orphaned.
    let page = fixture("pandas_34comments.html");
    let thread = try_extract_thread(
        &page,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should parse");

    let mut flattened = Vec::new();
    collect(&thread.comments, &mut flattened);

    let removed: Vec<_> = flattened
        .iter()
        .filter(|c| c.author == "[deleted]")
        .collect();

    assert!(!removed.is_empty(), "should keep deleted comments");

    let keeps_subtree = removed.iter().any(|c| !c.replies.is_empty());
    assert!(
        keeps_subtree,
        "a deleted comment with replies must retain its subtree"
    );

    // No deleted comment may be flagged as written by the original poster.
    let flagged_as_op = removed.iter().any(|c| c.is_op);
    assert!(!flagged_as_op);
}
|
||||
|
||||
#[test]
fn real_markdown_is_commonmark_clean() {
    // Guards the markdown bugs the verification workflow found: no
    // whitespace-only "blank" lines, and ``` fences never indented 4+
    // spaces (which would turn them into literal indented code blocks).
    let html = fixture("elixir_60comments.html");
    let result = try_extract(
        &html,
        "https://old.reddit.com/r/programming/comments/abc123/t/",
    )
    .expect("should extract");
    let md = &result.content.markdown;

    assert!(md.starts_with("# "));
    assert!(md.contains("## Comments"));

    for line in md.lines() {
        assert!(
            !(line.starts_with(' ') && line.trim().is_empty()),
            "whitespace-only line: {line:?}"
        );

        // Strip blockquote markers and spaces to find a fence at any
        // reply-nesting depth.
        let trimmed = line.trim_start_matches(['>', ' ']);
        if !trimmed.starts_with("```") {
            continue;
        }

        // Indentation before any blockquote marker.
        let indent = line.len() - line.trim_start_matches(' ').len();
        assert!(indent < 4, "code fence indented {indent} spaces: {line:?}");

        // Indentation inside the innermost blockquote. Measuring only from
        // the start of the line reports 0 for every quoted line, so a fence
        // such as `>     ```` would slip through even though CommonMark
        // treats it as indented code. After a `>` marker one optional space
        // belongs to the marker; the rest is the content's indentation.
        let prefix = &line[..line.len() - trimmed.len()];
        let inner_indent = match prefix.rfind('>') {
            Some(marker) => prefix[marker + 1..].len().saturating_sub(1),
            None => prefix.len(),
        };
        assert!(
            inner_indent < 4,
            "code fence indented {inner_indent} spaces inside blockquote: {line:?}"
        );
    }

    assert!(result.metadata.word_count > 20);
}
|
||||
}
|
||||
596
crates/webclaw-core/testdata/reddit/askreddit_deep_morechildren.html
vendored
Normal file
596
crates/webclaw-core/testdata/reddit/askreddit_deep_morechildren.html
vendored
Normal file
File diff suppressed because one or more lines are too long
82
crates/webclaw-core/testdata/reddit/ebpf_6comments.html
vendored
Normal file
82
crates/webclaw-core/testdata/reddit/ebpf_6comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
312
crates/webclaw-core/testdata/reddit/elixir_60comments.html
vendored
Normal file
312
crates/webclaw-core/testdata/reddit/elixir_60comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
227
crates/webclaw-core/testdata/reddit/pandas_34comments.html
vendored
Normal file
227
crates/webclaw-core/testdata/reddit/pandas_34comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
234
crates/webclaw-core/testdata/reddit/rust_selfpost_36comments.html
vendored
Normal file
234
crates/webclaw-core/testdata/reddit/rust_selfpost_36comments.html
vendored
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -160,9 +160,6 @@ impl Response {
|
|||
fn body(&self) -> &[u8] {
|
||||
&self.body
|
||||
}
|
||||
fn is_success(&self) -> bool {
|
||||
(200..300).contains(&self.status)
|
||||
}
|
||||
|
||||
fn text(&self) -> std::borrow::Cow<'_, str> {
|
||||
String::from_utf8_lossy(&self.body)
|
||||
|
|
@ -299,32 +296,15 @@ impl FetchClient {
|
|||
/// when you need literal no-rescue behavior (e.g. inside the rescue
|
||||
/// logic itself to avoid recursion).
|
||||
pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
|
||||
// Reddit: the HTML page shows a verification interstitial for most
|
||||
// client IPs, but appending `.json` returns the post + comment tree
|
||||
// publicly. `parse_reddit_json` in downstream code knows how to read
|
||||
// the result; here we just do the URL swap at the fetch layer.
|
||||
if crate::reddit::is_reddit_url(url) && !url.ends_with(".json") {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
// Reddit's public .json API serves JSON to identifiable bot
|
||||
// User-Agents and blocks browser UAs with a verification wall.
|
||||
// Override our Chrome-profile UA for this specific call.
|
||||
let ua = concat!(
|
||||
"Webclaw/",
|
||||
env!("CARGO_PKG_VERSION"),
|
||||
" (+https://webclaw.io)"
|
||||
);
|
||||
if let Ok(resp) = self
|
||||
.fetch_with_headers(&json_url, &[("user-agent", ua)])
|
||||
.await
|
||||
&& resp.status == 200
|
||||
{
|
||||
let first = resp.html.trim_start().as_bytes().first().copied();
|
||||
if matches!(first, Some(b'{') | Some(b'[')) {
|
||||
return Ok(resp);
|
||||
}
|
||||
}
|
||||
// If the .json fetch failed or returned HTML, fall through.
|
||||
}
|
||||
// Reddit: fetch old.reddit.com for stable server-rendered HTML.
|
||||
// The JSON API is blocked; old.reddit.com works without JS or auth.
|
||||
let owned;
|
||||
let url = if crate::reddit::is_reddit_url(url) {
|
||||
owned = crate::reddit::to_old_reddit_url(url);
|
||||
owned.as_str()
|
||||
} else {
|
||||
url
|
||||
};
|
||||
|
||||
let resp = self.fetch(url).await?;
|
||||
|
||||
|
|
@ -496,23 +476,16 @@ impl FetchClient {
|
|||
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||
let url = parsed_url.as_str();
|
||||
|
||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
||||
if crate::reddit::is_reddit_url(url) {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
|
||||
debug!("reddit detected, fetching {json_url}");
|
||||
|
||||
let client = self.pick_client(url);
|
||||
let resp = client.get(json_url.as_str()).send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
if response.is_success() {
|
||||
let bytes = response.body();
|
||||
match crate::reddit::parse_reddit_json(bytes, url) {
|
||||
Ok(result) => return Ok(result),
|
||||
Err(e) => warn!("reddit json fallback failed: {e}, falling back to HTML"),
|
||||
}
|
||||
}
|
||||
}
|
||||
// Reddit: rewrite to old.reddit.com for stable server-rendered HTML.
|
||||
// webclaw-core's Reddit fast path then parses the thread structure.
|
||||
let reddit_owned;
|
||||
let url = if crate::reddit::is_reddit_url(url) {
|
||||
reddit_owned = crate::reddit::to_old_reddit_url(url);
|
||||
debug!("reddit: rewriting to {reddit_owned}");
|
||||
reddit_owned.as_str()
|
||||
} else {
|
||||
url
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
|
|
|
|||
|
|
@ -1,12 +1,10 @@
|
|||
//! Reddit structured extractor — returns the full post + comment tree
|
||||
//! as typed JSON via Reddit's `.json` API.
|
||||
//! Reddit structured extractor — parses old.reddit.com HTML.
|
||||
//!
|
||||
//! The same trick the markdown extractor in `crate::reddit` uses:
|
||||
//! appending `.json` to any post URL returns the data the new SPA
|
||||
//! frontend would load client-side. Zero antibot, zero JS rendering.
|
||||
//! Fetches old.reddit.com (stable server-rendered HTML, no JS required)
|
||||
//! and delegates parsing to `webclaw_core::reddit`. Returns a typed JSON
|
||||
//! value with `{ url, post, comments }` structure.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Value, json};
|
||||
use serde_json::Value;
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::error::FetchError;
|
||||
|
|
@ -24,182 +22,27 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
|
|||
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
let is_reddit_host = matches!(
|
||||
host,
|
||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
||||
);
|
||||
is_reddit_host && url.contains("/comments/")
|
||||
webclaw_core::reddit::is_reddit_url(url) && url.contains("/comments/")
|
||||
}
|
||||
|
||||
pub async fn extract(client: &dyn Fetcher, url: &str) -> Result<Value, FetchError> {
|
||||
let json_url = build_json_url(url);
|
||||
let resp = client.fetch(&json_url).await?;
|
||||
let fetch_url = crate::reddit::to_old_reddit_url(url);
|
||||
let resp = client.fetch(&fetch_url).await?;
|
||||
if resp.status != 200 {
|
||||
return Err(FetchError::Build(format!(
|
||||
"reddit api returned status {}",
|
||||
"reddit: unexpected status {}",
|
||||
resp.status
|
||||
)));
|
||||
}
|
||||
|
||||
let listings: Vec<Listing> = serde_json::from_str(&resp.html)
|
||||
.map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
|
||||
let thread = webclaw_core::reddit::try_extract_thread(&resp.html, url).ok_or_else(|| {
|
||||
FetchError::BodyDecode(
|
||||
"reddit: page structure not recognised — is this a thread URL?".into(),
|
||||
)
|
||||
})?;
|
||||
|
||||
if listings.is_empty() {
|
||||
return Err(FetchError::BodyDecode("reddit response empty".into()));
|
||||
}
|
||||
|
||||
// First listing = the post (single t3 child).
|
||||
let post = listings
|
||||
.first()
|
||||
.and_then(|l| l.data.children.first())
|
||||
.filter(|t| t.kind == "t3")
|
||||
.map(|t| post_json(&t.data))
|
||||
.unwrap_or(Value::Null);
|
||||
|
||||
// Second listing = the comment tree.
|
||||
let comments: Vec<Value> = listings
|
||||
.get(1)
|
||||
.map(|l| l.data.children.iter().filter_map(comment_json).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
Ok(json!({
|
||||
"url": url,
|
||||
"post": post,
|
||||
"comments": comments,
|
||||
}))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// JSON shapers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn post_json(d: &ThingData) -> Value {
|
||||
json!({
|
||||
"id": d.id,
|
||||
"title": d.title,
|
||||
"author": d.author,
|
||||
"subreddit": d.subreddit_name_prefixed,
|
||||
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
|
||||
"url": d.url_overridden_by_dest,
|
||||
"is_self": d.is_self,
|
||||
"selftext": d.selftext,
|
||||
"score": d.score,
|
||||
"upvote_ratio": d.upvote_ratio,
|
||||
"num_comments": d.num_comments,
|
||||
"created_utc": d.created_utc,
|
||||
"link_flair_text": d.link_flair_text,
|
||||
"over_18": d.over_18,
|
||||
"spoiler": d.spoiler,
|
||||
"stickied": d.stickied,
|
||||
"locked": d.locked,
|
||||
})
|
||||
}
|
||||
|
||||
/// Render a single comment + its reply tree. Returns `None` for non-t1
|
||||
/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
|
||||
fn comment_json(thing: &Thing) -> Option<Value> {
|
||||
if thing.kind != "t1" {
|
||||
return None;
|
||||
}
|
||||
let d = &thing.data;
|
||||
let replies: Vec<Value> = match &d.replies {
|
||||
Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
|
||||
_ => Vec::new(),
|
||||
};
|
||||
Some(json!({
|
||||
"id": d.id,
|
||||
"author": d.author,
|
||||
"body": d.body,
|
||||
"score": d.score,
|
||||
"created_utc": d.created_utc,
|
||||
"is_submitter": d.is_submitter,
|
||||
"stickied": d.stickied,
|
||||
"depth": d.depth,
|
||||
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
|
||||
"replies": replies,
|
||||
}))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// URL helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn host_of(url: &str) -> &str {
|
||||
url.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
}
|
||||
|
||||
/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
|
||||
/// or `old.reddit.com` as the caller gave us). Routing through
|
||||
/// `old.reddit.com` unconditionally looks appealing but that host has
|
||||
/// stricter UA-based blocking than `www.reddit.com`, while the main
|
||||
/// host accepts our Chrome-fingerprinted client fine.
|
||||
fn build_json_url(url: &str) -> String {
|
||||
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
||||
format!("{clean}.json?raw_json=1")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reddit JSON types — only fields we render. Everything else is dropped.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Listing {
|
||||
data: ListingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ListingData {
|
||||
children: Vec<Thing>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Thing {
|
||||
kind: String,
|
||||
data: ThingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Default)]
|
||||
struct ThingData {
|
||||
// post (t3)
|
||||
id: Option<String>,
|
||||
title: Option<String>,
|
||||
selftext: Option<String>,
|
||||
subreddit_name_prefixed: Option<String>,
|
||||
url_overridden_by_dest: Option<String>,
|
||||
is_self: Option<bool>,
|
||||
upvote_ratio: Option<f64>,
|
||||
num_comments: Option<i64>,
|
||||
over_18: Option<bool>,
|
||||
spoiler: Option<bool>,
|
||||
stickied: Option<bool>,
|
||||
locked: Option<bool>,
|
||||
link_flair_text: Option<String>,
|
||||
|
||||
// comment (t1)
|
||||
author: Option<String>,
|
||||
body: Option<String>,
|
||||
score: Option<i64>,
|
||||
created_utc: Option<f64>,
|
||||
is_submitter: Option<bool>,
|
||||
depth: Option<i64>,
|
||||
permalink: Option<String>,
|
||||
|
||||
// recursive
|
||||
replies: Option<Replies>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum Replies {
|
||||
Listing(Listing),
|
||||
#[allow(dead_code)]
|
||||
Empty(String),
|
||||
serde_json::to_value(&thread)
|
||||
.map_err(|e| FetchError::BodyDecode(format!("reddit: serialisation error: {e}")))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -207,28 +50,17 @@ mod tests {
|
|||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn matches_reddit_post_urls() {
|
||||
fn matches_thread_urls() {
|
||||
assert!(matches(
|
||||
"https://www.reddit.com/r/rust/comments/abc123/some_title/"
|
||||
));
|
||||
assert!(matches(
|
||||
"https://reddit.com/r/rust/comments/abc123/some_title"
|
||||
));
|
||||
assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
|
||||
assert!(matches("https://reddit.com/r/rust/comments/abc/x"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rejects_non_post_reddit_urls() {
|
||||
fn rejects_listing_and_non_reddit() {
|
||||
assert!(!matches("https://www.reddit.com/r/rust"));
|
||||
assert!(!matches("https://www.reddit.com/user/foo"));
|
||||
assert!(!matches("https://example.com/r/rust/comments/x"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn json_url_appends_suffix_and_drops_query() {
|
||||
assert_eq!(
|
||||
build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
|
||||
"https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
|
||||
);
|
||||
assert!(!matches("https://example.com/r/rust/comments/abc/x"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,172 +1,56 @@
|
|||
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
|
||||
///
|
||||
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
|
||||
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
|
||||
/// comment tree as structured JSON, which we convert to clean markdown.
|
||||
use serde::Deserialize;
|
||||
use tracing::debug;
|
||||
use webclaw_core::{Content, ExtractionResult, Metadata};
|
||||
//! Reddit URL helpers for the fetch layer.
|
||||
//!
|
||||
//! The JSON API (`*.json`) is blocked. We rewrite all Reddit hosts to
|
||||
//! `old.reddit.com`, which serves stable server-rendered HTML that
|
||||
//! `webclaw-core::reddit` parses directly.
|
||||
|
||||
/// Check if a URL points to a Reddit post/comment page.
|
||||
pub fn is_reddit_url(url: &str) -> bool {
|
||||
let host = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("");
|
||||
matches!(
|
||||
host,
|
||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
||||
)
|
||||
webclaw_core::reddit::is_reddit_url(url)
|
||||
}
|
||||
|
||||
/// Build the `.json` URL from a Reddit page URL.
|
||||
pub fn json_url(url: &str) -> String {
|
||||
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
||||
format!("{clean}.json")
|
||||
/// Rewrite the host of `url` to `old.reddit.com`, preserving the scheme and
/// everything after the host (path, query and fragment).
///
/// The host is replaced unconditionally: this function does not check that
/// the input is a Reddit URL. Whatever sits between `://` and the first `/`,
/// `?` or `#` is replaced, including any port or userinfo.
///
/// The input is returned unchanged when it has no `://` separator, or when
/// the text before the first `://` is not a syntactically valid URL scheme.
/// The second case covers scheme-less inputs whose query embeds another URL
/// (e.g. `reddit.com/submit?url=https://x.com`), which would otherwise be
/// split at the wrong place and corrupted.
pub fn to_old_reddit_url(url: &str) -> String {
    /// RFC 3986 scheme: a letter followed by letters, digits, `+`, `-` or `.`.
    fn is_scheme(candidate: &str) -> bool {
        let mut bytes = candidate.bytes();
        matches!(bytes.next(), Some(first) if first.is_ascii_alphabetic())
            && bytes.all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'))
    }

    let Some((scheme, after)) = url.split_once("://") else {
        return url.to_string();
    };
    if !is_scheme(scheme) {
        return url.to_string();
    }

    // The host ends at the first path, query or fragment delimiter.
    let host_end = after.find(['/', '?', '#']).unwrap_or(after.len());
    let rest = &after[host_end..];
    format!("{scheme}://old.reddit.com{rest}")
}
|
||||
|
||||
/// Convert Reddit JSON API response into an ExtractionResult.
|
||||
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
|
||||
let listings: Vec<Listing> =
|
||||
serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
let mut markdown = String::new();
|
||||
let mut title = None;
|
||||
let mut author = None;
|
||||
let mut subreddit = None;
|
||||
|
||||
// First listing = the post itself
|
||||
if let Some(post_listing) = listings.first() {
|
||||
for child in &post_listing.data.children {
|
||||
if child.kind == "t3" {
|
||||
let d = &child.data;
|
||||
title = d.title.clone();
|
||||
author = d.author.clone();
|
||||
subreddit = d.subreddit_name_prefixed.clone();
|
||||
|
||||
if let Some(ref t) = title {
|
||||
markdown.push_str(&format!("# {t}\n\n"));
|
||||
}
|
||||
if let (Some(a), Some(sr)) = (&author, &subreddit) {
|
||||
markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
|
||||
}
|
||||
if let Some(ref body) = d.selftext
|
||||
&& !body.is_empty()
|
||||
{
|
||||
markdown.push_str(body);
|
||||
markdown.push_str("\n\n");
|
||||
}
|
||||
if let Some(ref url_field) = d.url_overridden_by_dest
|
||||
&& !url_field.is_empty()
|
||||
{
|
||||
markdown.push_str(&format!("[Link]({url_field})\n\n"));
|
||||
}
|
||||
markdown.push_str("---\n\n");
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn rewrites_www_to_old() {
|
||||
assert_eq!(
|
||||
to_old_reddit_url("https://www.reddit.com/r/rust/comments/abc/x/"),
|
||||
"https://old.reddit.com/r/rust/comments/abc/x/"
|
||||
);
|
||||
}
|
||||
|
||||
// Second listing = comment tree
|
||||
if let Some(comment_listing) = listings.get(1) {
|
||||
markdown.push_str("## Comments\n\n");
|
||||
for child in &comment_listing.data.children {
|
||||
render_comment(child, 0, &mut markdown);
|
||||
}
|
||||
#[test]
|
||||
fn rewrites_bare_to_old() {
|
||||
assert_eq!(
|
||||
to_old_reddit_url("https://reddit.com/r/rust/"),
|
||||
"https://old.reddit.com/r/rust/"
|
||||
);
|
||||
}
|
||||
|
||||
let word_count = markdown.split_whitespace().count();
|
||||
debug!(word_count, "reddit json extracted");
|
||||
|
||||
Ok(ExtractionResult {
|
||||
metadata: Metadata {
|
||||
title,
|
||||
description: None,
|
||||
author,
|
||||
published_date: None,
|
||||
language: Some("en".into()),
|
||||
url: Some(url.to_string()),
|
||||
site_name: subreddit,
|
||||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
},
|
||||
content: Content {
|
||||
markdown,
|
||||
plain_text: String::new(),
|
||||
links: vec![],
|
||||
images: vec![],
|
||||
code_blocks: vec![],
|
||||
raw_html: None,
|
||||
},
|
||||
domain_data: None,
|
||||
structured_data: vec![],
|
||||
})
|
||||
}
|
||||
|
||||
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
|
||||
if thing.kind != "t1" {
|
||||
return;
|
||||
#[test]
|
||||
fn preserves_old_reddit_unchanged() {
|
||||
let url = "https://old.reddit.com/r/rust/comments/abc/x/?context=3";
|
||||
assert_eq!(to_old_reddit_url(url), url);
|
||||
}
|
||||
let d = &thing.data;
|
||||
let indent = " ".repeat(depth);
|
||||
let author = d.author.as_deref().unwrap_or("[deleted]");
|
||||
let body = d.body.as_deref().unwrap_or("[removed]");
|
||||
let score = d.score.unwrap_or(0);
|
||||
|
||||
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
|
||||
for line in body.lines() {
|
||||
out.push_str(&format!("{indent} {line}\n"));
|
||||
}
|
||||
out.push('\n');
|
||||
|
||||
// Recurse into replies
|
||||
if let Some(Replies::Listing(listing)) = &d.replies {
|
||||
for child in &listing.data.children {
|
||||
render_comment(child, depth + 1, out);
|
||||
}
|
||||
#[test]
|
||||
fn preserves_query_and_hash() {
|
||||
assert_eq!(
|
||||
to_old_reddit_url("https://www.reddit.com/r/rust/?sort=top#anchor"),
|
||||
"https://old.reddit.com/r/rust/?sort=top#anchor"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// --- Reddit JSON types (minimal) ---
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Listing {
|
||||
data: ListingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ListingData {
|
||||
children: Vec<Thing>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Thing {
|
||||
kind: String,
|
||||
data: ThingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ThingData {
|
||||
// Post fields (t3)
|
||||
title: Option<String>,
|
||||
selftext: Option<String>,
|
||||
subreddit_name_prefixed: Option<String>,
|
||||
url_overridden_by_dest: Option<String>,
|
||||
// Comment fields (t1)
|
||||
author: Option<String>,
|
||||
body: Option<String>,
|
||||
score: Option<i64>,
|
||||
replies: Option<Replies>,
|
||||
}
|
||||
|
||||
/// Reddit replies can be either a nested Listing or an empty string.
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum Replies {
|
||||
Listing(Listing),
|
||||
#[allow(dead_code)]
|
||||
Empty(String),
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue