webclaw/crates/webclaw-fetch/src/reddit.rs

/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
use webclaw_core::{Content, ExtractionResult, Metadata};

/// Check if a URL points to a Reddit post/comment page.
pub fn is_reddit_url(url: &str) -> bool {
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    matches!(
        host,
        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
    )
}

/// Build the `.json` URL from a Reddit page URL.
pub fn json_url(url: &str) -> String {
    let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
    format!("{clean}.json")
}

/// Convert Reddit JSON API response into an ExtractionResult.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
    let listings: Vec<Listing> =
        serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;

    let mut markdown = String::new();
    let mut title = None;
    let mut author = None;
    let mut subreddit = None;

    // First listing = the post itself
    if let Some(post_listing) = listings.first() {
        for child in &post_listing.data.children {
            if child.kind == "t3" {
                let d = &child.data;
                title = d.title.clone();
                author = d.author.clone();
                subreddit = d.subreddit_name_prefixed.clone();

                if let Some(ref t) = title {
                    markdown.push_str(&format!("# {t}\n\n"));
                }
                if let (Some(a), Some(sr)) = (&author, &subreddit) {
                    markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
                }
                if let Some(ref body) = d.selftext
                    && !body.is_empty()
                {
                    markdown.push_str(body);
                    markdown.push_str("\n\n");
                }
                if let Some(ref url_field) = d.url_overridden_by_dest
                    && !url_field.is_empty()
                {
                    markdown.push_str(&format!("[Link]({url_field})\n\n"));
                }
                markdown.push_str("---\n\n");
            }
        }
    }

    // Second listing = comment tree
    if let Some(comment_listing) = listings.get(1) {
        markdown.push_str("## Comments\n\n");
        for child in &comment_listing.data.children {
            render_comment(child, 0, &mut markdown);
        }
    }

    let word_count = markdown.split_whitespace().count();
    debug!(word_count, "reddit json extracted");

    Ok(ExtractionResult {
        metadata: Metadata {
            title,
            description: None,
            author,
            published_date: None,
            language: Some("en".into()),
            url: Some(url.to_string()),
            site_name: subreddit,
            image: None,
            favicon: None,
            word_count,
        },
        content: Content {
            markdown,
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    })
}

fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
    if thing.kind != "t1" {
        return;
    }
    let d = &thing.data;
    let indent = "  ".repeat(depth);
    let author = d.author.as_deref().unwrap_or("[deleted]");
    let body = d.body.as_deref().unwrap_or("[removed]");
    let score = d.score.unwrap_or(0);

    out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
    for line in body.lines() {
        out.push_str(&format!("{indent}  {line}\n"));
    }
    out.push('\n');

    // Recurse into replies
    if let Some(Replies::Listing(listing)) = &d.replies {
        for child in &listing.data.children {
            render_comment(child, depth + 1, out);
        }
    }
}

// --- Reddit JSON types (minimal) ---

#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}

#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}

#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}

#[derive(Deserialize)]
struct ThingData {
    // Post fields (t3)
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    // Comment fields (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    replies: Option<Replies>,
}

/// Reddit replies can be either a nested Listing or an empty string.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    #[allow(dead_code)]
    Empty(String),
}
Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`/// Reddit JSON API fallback for extracting posts + comments without JS rendering.`
			`///`
			/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
			/// loaded client-side. Appending `.json` to any Reddit URL returns the full
			`/// comment tree as structured JSON, which we convert to clean markdown.`
			`use serde::Deserialize;`
			`use tracing::debug;`
			`use webclaw_core::{Content, ExtractionResult, Metadata};`

			`/// Check if a URL points to a Reddit post/comment page.`
			`pub fn is_reddit_url(url: &str) -> bool {`
			`let host = url`
			`.split("://")`
			`.nth(1)`
			`.unwrap_or(url)`
			`.split('/')`
			`.next()`
			`.unwrap_or("");`
			`matches!(`
			`host,`
			`"reddit.com" \| "www.reddit.com" \| "old.reddit.com" \| "np.reddit.com" \| "new.reddit.com"`
			`)`
			`}`

			/// Build the `.json` URL from a Reddit page URL.
			`pub fn json_url(url: &str) -> String {`
			`let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');`
			`format!("{clean}.json")`
			`}`

			`/// Convert Reddit JSON API response into an ExtractionResult.`
			`pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {`
			`let listings: Vec<Listing> =`
			`serde_json::from_slice(json_bytes).map_err(\|e\| format!("reddit json parse: {e}"))?;`

			`let mut markdown = String::new();`
			`let mut title = None;`
			`let mut author = None;`
			`let mut subreddit = None;`

			`// First listing = the post itself`
			`if let Some(post_listing) = listings.first() {`
			`for child in &post_listing.data.children {`
			`if child.kind == "t3" {`
			`let d = &child.data;`
			`title = d.title.clone();`
			`author = d.author.clone();`
			`subreddit = d.subreddit_name_prefixed.clone();`

			`if let Some(ref t) = title {`
			`markdown.push_str(&format!("# {t}\n\n"));`
			`}`
			`if let (Some(a), Some(sr)) = (&author, &subreddit) {`
			`markdown.push_str(&format!("u/{a} in {sr}\n\n"));`
			`}`
			`if let Some(ref body) = d.selftext`
			`&& !body.is_empty()`
			`{`
			`markdown.push_str(body);`
			`markdown.push_str("\n\n");`
			`}`
			`if let Some(ref url_field) = d.url_overridden_by_dest`
			`&& !url_field.is_empty()`
			`{`
			`markdown.push_str(&format!("[Link]({url_field})\n\n"));`
			`}`
			`markdown.push_str("---\n\n");`
			`}`
			`}`
			`}`

			`// Second listing = comment tree`
			`if let Some(comment_listing) = listings.get(1) {`
			`markdown.push_str("## Comments\n\n");`
			`for child in &comment_listing.data.children {`
			`render_comment(child, 0, &mut markdown);`
			`}`
			`}`

			`let word_count = markdown.split_whitespace().count();`
			`debug!(word_count, "reddit json extracted");`

			`Ok(ExtractionResult {`
			`metadata: Metadata {`
			`title,`
			`description: None,`
			`author,`
			`published_date: None,`
			`language: Some("en".into()),`
			`url: Some(url.to_string()),`
			`site_name: subreddit,`
			`image: None,`
			`favicon: None,`
			`word_count,`
			`},`
			`content: Content {`
			`markdown,`
			`plain_text: String::new(),`
			`links: vec![],`
			`images: vec![],`
			`code_blocks: vec![],`
			`raw_html: None,`
			`},`
			`domain_data: None,`
			`structured_data: vec![],`
			`})`
			`}`

			`fn render_comment(thing: &Thing, depth: usize, out: &mut String) {`
			`if thing.kind != "t1" {`
			`return;`
			`}`
			`let d = &thing.data;`
			`let indent = " ".repeat(depth);`
			`let author = d.author.as_deref().unwrap_or("[deleted]");`
			`let body = d.body.as_deref().unwrap_or("[removed]");`
			`let score = d.score.unwrap_or(0);`

			`out.push_str(&format!("{indent}- u/{author} ({score} pts)\n"));`
			`for line in body.lines() {`
			`out.push_str(&format!("{indent} {line}\n"));`
			`}`
			`out.push('\n');`

			`// Recurse into replies`
			`if let Some(Replies::Listing(listing)) = &d.replies {`
			`for child in &listing.data.children {`
			`render_comment(child, depth + 1, out);`
			`}`
			`}`
			`}`

			`// --- Reddit JSON types (minimal) ---`

			`#[derive(Deserialize)]`
			`struct Listing {`
			`data: ListingData,`
			`}`

			`#[derive(Deserialize)]`
			`struct ListingData {`
			`children: Vec<Thing>,`
			`}`

			`#[derive(Deserialize)]`
			`struct Thing {`
			`kind: String,`
			`data: ThingData,`
			`}`

			`#[derive(Deserialize)]`
			`struct ThingData {`
			`// Post fields (t3)`
			`title: Option<String>,`
			`selftext: Option<String>,`
			`subreddit_name_prefixed: Option<String>,`
			`url_overridden_by_dest: Option<String>,`
			`// Comment fields (t1)`
			`author: Option<String>,`
			`body: Option<String>,`
			`score: Option<i64>,`
			`replies: Option<Replies>,`
			`}`

			`/// Reddit replies can be either a nested Listing or an empty string.`
			`#[derive(Deserialize)]`
			`#[serde(untagged)]`
			`enum Replies {`
			`Listing(Listing),`
			`#[allow(dead_code)]`
			`Empty(String),`
			`}`