webclaw/crates/webclaw-fetch/src/extractors/reddit.rs
Valerio 8ba7538c37 feat(extractors): add vertical extractors module + first 6 verticals
New extractors module returns site-specific typed JSON instead of
generic markdown. Each extractor:
- declares a URL pattern via matches()
- fetches from the site's official JSON API where one exists
- returns a typed serde_json::Value with documented field names
- exposes an INFO struct that powers the /v1/extractors catalog

First 6 verticals shipped, all hitting public JSON APIs (no HTML
scraping, zero antibot risk):

- reddit       → www.reddit.com/*/.json
- hackernews   → hn.algolia.com/api/v1/items/{id} (full thread in one call)
- github_repo  → api.github.com/repos/{owner}/{repo}
- pypi         → pypi.org/pypi/{name}/json
- npm          → registry.npmjs.org/{name} + downloads/point/last-week
- huggingface_model → huggingface.co/api/models/{owner}/{name}

Server-side routes added:
- POST /v1/scrape/{vertical}  explicit per-vertical extraction
- GET  /v1/extractors         catalog (name, label, description, url_patterns)

The dispatcher validates that the URL matches the requested vertical
before running, so users get "URL doesn't match the X extractor"
instead of opaque parse failures inside the extractor.

17 unit tests cover URL matching + path parsing for each vertical.
Live tests against canonical URLs (rust-lang/rust, requests pypi,
react npm, whisper-large-v3 hf, item 8863 hn, an r/micro_saas post)
all return correct typed JSON in 100-300ms. Sample sizes: github
863B, npm 700B, pypi 1.7KB, hf 3.2KB, hn 38KB (full comment tree).

Marketing positioning: Firecrawl charges 5 credits per /extract call
and you write the schema. Webclaw returns the same JSON in 1 credit
per /scrape/{vertical} call with hand-written deterministic
extractors per site.
2026-04-22 14:11:43 +02:00

234 lines
7 KiB
Rust

//! Reddit structured extractor — returns the full post + comment tree
//! as typed JSON via Reddit's `.json` API.
//!
//! The same trick the markdown extractor in `crate::reddit` uses:
//! appending `.json` to any post URL returns the data the new SPA
//! frontend would load client-side. Zero antibot, zero JS rendering.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry describing the Reddit extractor.
///
/// `url_patterns` lists one pattern per host that [`matches`] accepts, so
/// the advertised patterns and the actual matcher stay in agreement.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "reddit",
    label: "Reddit thread",
    description: "Returns post + nested comment tree with scores, authors, and timestamps.",
    url_patterns: &[
        "https://www.reddit.com/r/*/comments/*",
        "https://reddit.com/r/*/comments/*",
        "https://old.reddit.com/r/*/comments/*",
        "https://np.reddit.com/r/*/comments/*",
        "https://new.reddit.com/r/*/comments/*",
    ],
};
/// Report whether `url` points at a Reddit post (comment thread) that this
/// extractor can handle.
///
/// A URL qualifies when both of the following hold:
/// - its host is one of the known Reddit hostnames, compared
///   case-insensitively because host names are not case-sensitive;
/// - its path (query string and fragment excluded) contains `/comments/`
///   immediately followed by a non-empty post id.
pub fn matches(url: &str) -> bool {
    const REDDIT_HOSTS: [&str; 5] = [
        "reddit.com",
        "www.reddit.com",
        "old.reddit.com",
        "np.reddit.com",
        "new.reddit.com",
    ];

    let host = host_of(url);
    if !REDDIT_HOSTS
        .iter()
        .any(|known| host.eq_ignore_ascii_case(known))
    {
        return false;
    }

    // Only the path may vouch for a post. A `/comments/` that appears in the
    // query string or fragment (e.g. `/search?q=/comments/x`) says nothing
    // about what the URL actually addresses.
    let path = url.split(['?', '#']).next().unwrap_or(url);

    // `/r/<sub>/comments/` with nothing after it is a listing, not a post,
    // so require at least one character of post id.
    matches!(
        path.split_once("/comments/"),
        Some((_, rest)) if !rest.is_empty() && !rest.starts_with('/')
    )
}
/// Fetch a Reddit thread through its `.json` endpoint and reshape it into
/// the extractor's typed JSON.
///
/// The result is an object with three keys: `url` (the URL as passed in),
/// `post` (the shaped post, or `null` when the first listing does not start
/// with a `t3` thing) and `comments` (the shaped top-level comments, each
/// carrying its own reply tree; empty when there is no second listing).
///
/// # Errors
///
/// - Any error from `client.fetch` is propagated unchanged.
/// - `FetchError::Build` when the response status is not 200.
/// - `FetchError::BodyDecode` when the body is not a JSON array of listings,
///   or when that array is empty.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let response = client.fetch(&build_json_url(url)).await?;
    if response.status != 200 {
        let message = format!("reddit api returned status {}", response.status);
        return Err(FetchError::Build(message));
    }

    let listings: Vec<Listing> = match serde_json::from_str(&response.html) {
        Ok(parsed) => parsed,
        Err(e) => return Err(FetchError::BodyDecode(format!("reddit json parse: {e}"))),
    };

    // Listing #0 carries the post itself as a single `t3` child.
    let Some(post_listing) = listings.first() else {
        return Err(FetchError::BodyDecode("reddit response empty".into()));
    };
    let post = match post_listing.data.children.first() {
        Some(thing) if thing.kind == "t3" => post_json(&thing.data),
        _ => Value::Null,
    };

    // Listing #1, when present, carries the comment tree.
    let comments: Vec<Value> = match listings.get(1) {
        Some(tree) => tree.data.children.iter().filter_map(comment_json).collect(),
        None => Vec::new(),
    };

    Ok(json!({
        "url": url,
        "post": post,
        "comments": comments,
    }))
}
// ---------------------------------------------------------------------------
// JSON shapers
// ---------------------------------------------------------------------------
fn post_json(d: &ThingData) -> Value {
json!({
"id": d.id,
"title": d.title,
"author": d.author,
"subreddit": d.subreddit_name_prefixed,
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
"url": d.url_overridden_by_dest,
"is_self": d.is_self,
"selftext": d.selftext,
"score": d.score,
"upvote_ratio": d.upvote_ratio,
"num_comments": d.num_comments,
"created_utc": d.created_utc,
"link_flair_text": d.link_flair_text,
"over_18": d.over_18,
"spoiler": d.spoiler,
"stickied": d.stickied,
"locked": d.locked,
})
}
/// Shape one comment, together with all of its nested replies, into JSON.
///
/// Yields `None` for anything that is not a `t1` thing, which is how the
/// `more` placeholders Reddit inserts at depth limits get filtered out.
/// Replies are shaped recursively with this same function; a comment whose
/// `replies` field is absent or not a listing gets an empty `replies` array.
fn comment_json(thing: &Thing) -> Option<Value> {
    let comment = match thing.kind.as_str() {
        "t1" => &thing.data,
        _ => return None,
    };

    let children: Vec<Value> = if let Some(Replies::Listing(listing)) = &comment.replies {
        listing
            .data
            .children
            .iter()
            .filter_map(comment_json)
            .collect()
    } else {
        Vec::new()
    };

    let absolute_permalink = comment
        .permalink
        .as_ref()
        .map(|path| format!("https://www.reddit.com{path}"));

    Some(json!({
        "id": comment.id,
        "author": comment.author,
        "body": comment.body,
        "score": comment.score,
        "created_utc": comment.created_utc,
        "is_submitter": comment.is_submitter,
        "stickied": comment.stickied,
        "depth": comment.depth,
        "permalink": absolute_permalink,
        "replies": children,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Slice the host portion out of `url` without allocating.
///
/// Everything up to and including the first `://` is skipped (a URL with no
/// scheme is used whole), and the result runs up to the next `://` or `/`,
/// whichever comes first. Ports, userinfo, query strings and fragments are
/// not interpreted: if they precede the first `/` they are returned as part
/// of the host.
fn host_of(url: &str) -> &str {
    let after_scheme = match url.split_once("://") {
        Some((_scheme, rest)) => rest,
        None => url,
    };
    // A second `://` also terminates the candidate host.
    let authority = match after_scheme.split_once("://") {
        Some((head, _)) => head,
        None => after_scheme,
    };
    match authority.find('/') {
        Some(slash) => &authority[..slash],
        None => authority,
    }
}
/// Build the Reddit JSON URL for a post URL.
///
/// The query string and fragment are dropped, trailing slashes are trimmed,
/// and `.json?raw_json=1` is appended. A URL that already ends in `.json`
/// (or `/.json`) is normalised first so the suffix is never doubled.
///
/// We keep the original host (`www.reddit.com` or `old.reddit.com` as the
/// caller gave us). Routing through `old.reddit.com` unconditionally looks
/// appealing but that host has stricter UA-based blocking than
/// `www.reddit.com`, while the main host accepts our Chrome-fingerprinted
/// client fine.
fn build_json_url(url: &str) -> String {
    // Cut at whichever of `?` / `#` comes first. Leaving a fragment in place
    // would put the `.json` suffix inside the fragment, which is never sent
    // to the server, so the HTML page would be requested instead.
    let base = url.split(['?', '#']).next().unwrap_or(url);
    let base = base.trim_end_matches('/');
    // Tolerate callers that already pasted the `.json` form of the URL.
    let base = base
        .strip_suffix(".json")
        .unwrap_or(base)
        .trim_end_matches('/');
    format!("{base}.json?raw_json=1")
}
// ---------------------------------------------------------------------------
// Reddit JSON types — only fields we render. Everything else is dropped.
// ---------------------------------------------------------------------------
/// One listing envelope from the Reddit response. `extract` parses the
/// response body as an array of these: the first holds the post, the second
/// (when present) holds the comment tree. Reply trees nest further listings.
#[derive(Deserialize)]
struct Listing {
// The only envelope field this extractor reads.
data: ListingData,
}
/// Payload of a [`Listing`]: the things it contains.
#[derive(Deserialize)]
struct ListingData {
// Posts or comments (plus any placeholder things), in response order.
children: Vec<Thing>,
}
/// A kind-tagged item inside a listing.
#[derive(Deserialize)]
struct Thing {
// Type tag. This module treats `t3` as a post and `t1` as a comment;
// every other kind is skipped by `comment_json`.
kind: String,
// The item's fields; which ones are populated depends on `kind`.
data: ThingData,
}
/// Union of the post and comment fields this extractor renders.
///
/// Every field is optional so that one struct can deserialize any thing in
/// a listing, whatever its kind; keys not listed here are ignored.
#[derive(Deserialize, Default)]
struct ThingData {
// Read by `post_json` (`id` and `stickied` are also read by `comment_json`).
id: Option<String>,
title: Option<String>,
selftext: Option<String>,
subreddit_name_prefixed: Option<String>,
url_overridden_by_dest: Option<String>,
is_self: Option<bool>,
upvote_ratio: Option<f64>,
num_comments: Option<i64>,
over_18: Option<bool>,
spoiler: Option<bool>,
stickied: Option<bool>,
locked: Option<bool>,
link_flair_text: Option<String>,
// Read by `comment_json` (`author`, `score`, `created_utc` and `permalink`
// are also read by `post_json`).
author: Option<String>,
body: Option<String>,
score: Option<i64>,
created_utc: Option<f64>,
is_submitter: Option<bool>,
depth: Option<i64>,
// Site-relative path; the shapers prefix `https://www.reddit.com`.
permalink: Option<String>,
// Nested reply tree of a comment; walked recursively by `comment_json`.
replies: Option<Replies>,
}
/// The `replies` field of a comment, which can arrive in two JSON shapes.
/// With `untagged`, serde tries the variants in declaration order against
/// the raw value and keeps the first one that deserializes.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
/// A nested listing holding the child comments.
Listing(Listing),
// A bare JSON string in place of a listing (presumably the empty string
// Reddit sends for a comment without replies; TODO confirm). The payload is
// never read, hence the lint allowance: the variant exists only so that
// deserialization succeeds, and `comment_json` treats it as "no replies".
#[allow(dead_code)]
Empty(String),
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Post permalinks on the bare, `www.` and `old.` hosts are accepted,
    /// with or without a trailing slash.
    #[test]
    fn matches_reddit_post_urls() {
        let accepted = [
            "https://www.reddit.com/r/rust/comments/abc123/some_title/",
            "https://reddit.com/r/rust/comments/abc123/some_title",
            "https://old.reddit.com/r/rust/comments/abc123/x/",
        ];
        for url in accepted {
            assert!(matches(url), "expected a match for {url}");
        }
    }

    /// Subreddit pages, user pages and foreign hosts are all rejected.
    #[test]
    fn rejects_non_post_reddit_urls() {
        let rejected = [
            "https://www.reddit.com/r/rust",
            "https://www.reddit.com/user/foo",
            "https://example.com/r/rust/comments/x",
        ];
        for url in rejected {
            assert!(!matches(url), "expected no match for {url}");
        }
    }

    /// The query string and trailing slash are removed before `.json` and
    /// the `raw_json` flag are appended.
    #[test]
    fn json_url_appends_suffix_and_drops_query() {
        let built = build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo");
        assert_eq!(
            built,
            "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
        );
    }
}