// Mirror of https://github.com/0xMassi/webclaw.git (synced 2026-04-25 00:06:21 +02:00).
// 173 lines, 5 KiB, Rust.
|
|
//! Reddit JSON API fallback for extracting posts + comments without JS rendering.
//!
//! Reddit's new `shreddit` frontend only SSRs the post body — comments are
//! loaded client-side. Appending `.json` to any Reddit URL returns the full
//! comment tree as structured JSON, which we convert to clean markdown.
|
||
|
|
use serde::Deserialize;
|
||
|
|
use tracing::debug;
|
||
|
|
use webclaw_core::{Content, ExtractionResult, Metadata};
|
||
|
|
|
||
|
|
/// Check whether a URL's host is one of the known Reddit web hosts.
///
/// Only the host is inspected; the path is not validated, so any page on a
/// Reddit host (not just post/comment pages) yields `true`.
///
/// The scheme is optional (`old.reddit.com/r/rust` is accepted). The host is
/// compared case-insensitively, and a numeric `:port` suffix is ignored.
pub fn is_reddit_url(url: &str) -> bool {
    const REDDIT_HOSTS: [&str; 5] = [
        "reddit.com",
        "www.reddit.com",
        "old.reddit.com",
        "np.reddit.com",
        "new.reddit.com",
    ];

    // Everything after the first "://", or the whole input when there is no scheme.
    let after_scheme = url.split_once("://").map_or(url, |(_, rest)| rest);

    // The authority ends at the first path, query, or fragment delimiter, so
    // `reddit.com?x=1` is handled the same way as `reddit.com/?x=1`.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()
        .unwrap_or("");

    // Drop a trailing `:port`, but only when it really is numeric (or empty).
    let host = match authority.rsplit_once(':') {
        Some((name, port)) if port.bytes().all(|b| b.is_ascii_digit()) => name,
        _ => authority,
    };

    // Host names are case-insensitive.
    REDDIT_HOSTS
        .iter()
        .any(|known| host.eq_ignore_ascii_case(known))
}
|
||
|
|
|
||
|
|
/// Build the `.json` URL from a Reddit page URL.
///
/// The query string and fragment are dropped, trailing slashes are trimmed,
/// and `.json` is appended to the path. When the URL has no path at all
/// (e.g. `https://www.reddit.com/`), the result is `<origin>/.json` so the
/// suffix never becomes part of the host name.
pub fn json_url(url: &str) -> String {
    // Cut at the first `?` or `#`: neither the query nor the fragment belongs
    // in front of the `.json` suffix.
    let end = url
        .find(|c: char| matches!(c, '?' | '#'))
        .unwrap_or(url.len());
    let clean = url[..end].trim_end_matches('/');

    // A remaining '/' after the scheme means there is a real path to extend.
    let after_scheme = clean.split_once("://").map_or(clean, |(_, rest)| rest);
    if after_scheme.contains('/') {
        format!("{clean}.json")
    } else {
        // Root URL: appending directly would yield a host like `www.reddit.com.json`.
        format!("{clean}/.json")
    }
}
|
||
|
|
|
||
|
|
/// Convert Reddit JSON API response into an ExtractionResult.
|
||
|
|
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
|
||
|
|
let listings: Vec<Listing> =
|
||
|
|
serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
|
||
|
|
|
||
|
|
let mut markdown = String::new();
|
||
|
|
let mut title = None;
|
||
|
|
let mut author = None;
|
||
|
|
let mut subreddit = None;
|
||
|
|
|
||
|
|
// First listing = the post itself
|
||
|
|
if let Some(post_listing) = listings.first() {
|
||
|
|
for child in &post_listing.data.children {
|
||
|
|
if child.kind == "t3" {
|
||
|
|
let d = &child.data;
|
||
|
|
title = d.title.clone();
|
||
|
|
author = d.author.clone();
|
||
|
|
subreddit = d.subreddit_name_prefixed.clone();
|
||
|
|
|
||
|
|
if let Some(ref t) = title {
|
||
|
|
markdown.push_str(&format!("# {t}\n\n"));
|
||
|
|
}
|
||
|
|
if let (Some(a), Some(sr)) = (&author, &subreddit) {
|
||
|
|
markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
|
||
|
|
}
|
||
|
|
if let Some(ref body) = d.selftext
|
||
|
|
&& !body.is_empty()
|
||
|
|
{
|
||
|
|
markdown.push_str(body);
|
||
|
|
markdown.push_str("\n\n");
|
||
|
|
}
|
||
|
|
if let Some(ref url_field) = d.url_overridden_by_dest
|
||
|
|
&& !url_field.is_empty()
|
||
|
|
{
|
||
|
|
markdown.push_str(&format!("[Link]({url_field})\n\n"));
|
||
|
|
}
|
||
|
|
markdown.push_str("---\n\n");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Second listing = comment tree
|
||
|
|
if let Some(comment_listing) = listings.get(1) {
|
||
|
|
markdown.push_str("## Comments\n\n");
|
||
|
|
for child in &comment_listing.data.children {
|
||
|
|
render_comment(child, 0, &mut markdown);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
let word_count = markdown.split_whitespace().count();
|
||
|
|
debug!(word_count, "reddit json extracted");
|
||
|
|
|
||
|
|
Ok(ExtractionResult {
|
||
|
|
metadata: Metadata {
|
||
|
|
title,
|
||
|
|
description: None,
|
||
|
|
author,
|
||
|
|
published_date: None,
|
||
|
|
language: Some("en".into()),
|
||
|
|
url: Some(url.to_string()),
|
||
|
|
site_name: subreddit,
|
||
|
|
image: None,
|
||
|
|
favicon: None,
|
||
|
|
word_count,
|
||
|
|
},
|
||
|
|
content: Content {
|
||
|
|
markdown,
|
||
|
|
plain_text: String::new(),
|
||
|
|
links: vec![],
|
||
|
|
images: vec![],
|
||
|
|
code_blocks: vec![],
|
||
|
|
raw_html: None,
|
||
|
|
},
|
||
|
|
domain_data: None,
|
||
|
|
structured_data: vec![],
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
|
||
|
|
if thing.kind != "t1" {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
let d = &thing.data;
|
||
|
|
let indent = " ".repeat(depth);
|
||
|
|
let author = d.author.as_deref().unwrap_or("[deleted]");
|
||
|
|
let body = d.body.as_deref().unwrap_or("[removed]");
|
||
|
|
let score = d.score.unwrap_or(0);
|
||
|
|
|
||
|
|
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
|
||
|
|
for line in body.lines() {
|
||
|
|
out.push_str(&format!("{indent} {line}\n"));
|
||
|
|
}
|
||
|
|
out.push('\n');
|
||
|
|
|
||
|
|
// Recurse into replies
|
||
|
|
if let Some(Replies::Listing(listing)) = &d.replies {
|
||
|
|
for child in &listing.data.children {
|
||
|
|
render_comment(child, depth + 1, out);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// --- Reddit JSON types (minimal) ---
|
||
|
|
|
||
|
|
#[derive(Deserialize)]
|
||
|
|
struct Listing {
|
||
|
|
data: ListingData,
|
||
|
|
}
|
||
|
|
|
||
|
|
#[derive(Deserialize)]
|
||
|
|
struct ListingData {
|
||
|
|
children: Vec<Thing>,
|
||
|
|
}
|
||
|
|
|
||
|
|
#[derive(Deserialize)]
|
||
|
|
struct Thing {
|
||
|
|
kind: String,
|
||
|
|
data: ThingData,
|
||
|
|
}
|
||
|
|
|
||
|
|
#[derive(Deserialize)]
|
||
|
|
struct ThingData {
|
||
|
|
// Post fields (t3)
|
||
|
|
title: Option<String>,
|
||
|
|
selftext: Option<String>,
|
||
|
|
subreddit_name_prefixed: Option<String>,
|
||
|
|
url_overridden_by_dest: Option<String>,
|
||
|
|
// Comment fields (t1)
|
||
|
|
author: Option<String>,
|
||
|
|
body: Option<String>,
|
||
|
|
score: Option<i64>,
|
||
|
|
replies: Option<Replies>,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Reddit replies can be either a nested Listing or an empty string.
|
||
|
|
#[derive(Deserialize)]
|
||
|
|
#[serde(untagged)]
|
||
|
|
enum Replies {
|
||
|
|
Listing(Listing),
|
||
|
|
#[allow(dead_code)]
|
||
|
|
Empty(String),
|
||
|
|
}
|