mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
New extractors module returns site-specific typed JSON instead of
generic markdown. Each extractor:
- declares a URL pattern via matches()
- fetches from the site's official JSON API where one exists
- returns a typed serde_json::Value with documented field names
- exposes an INFO struct that powers the /v1/extractors catalog
First 6 verticals shipped, all hitting public JSON APIs (no HTML
scraping, zero antibot risk):
- reddit → www.reddit.com/*/.json
- hackernews → hn.algolia.com/api/v1/items/{id} (full thread in one call)
- github_repo → api.github.com/repos/{owner}/{repo}
- pypi → pypi.org/pypi/{name}/json
- npm → registry.npmjs.org/{name} + downloads/point/last-week
- huggingface_model → huggingface.co/api/models/{owner}/{name}
Server-side routes added:
- POST /v1/scrape/{vertical} explicit per-vertical extraction
- GET /v1/extractors catalog (name, label, description, url_patterns)
The dispatcher validates that URL matches the requested vertical
before running, so users get "URL doesn't match the X extractor"
instead of opaque parse failures inside the extractor.
17 unit tests cover URL matching + path parsing for each vertical.
Live tests against canonical URLs (rust-lang/rust, requests pypi,
react npm, whisper-large-v3 hf, item 8863 hn, an r/micro_saas post)
all return correct typed JSON in 100-300ms. Sample sizes: github
863B, npm 700B, pypi 1.7KB, hf 3.2KB, hn 38KB (full comment tree).
Marketing positioning: Firecrawl charges 5 credits per /extract call
and you write the schema. Webclaw returns the same JSON in 1 credit
per /scrape/{vertical} call with hand-written deterministic
extractors per site.
234 lines · 7 KiB · Rust
//! Reddit structured extractor — returns the full post + comment tree
//! as typed JSON via Reddit's `.json` API.
//!
//! The same trick the markdown extractor in `crate::reddit` uses:
//! appending `.json` to any post URL returns the data the new SPA
//! frontend would load client-side. Zero antibot, zero JS rendering.
|
use serde::Deserialize;
|
|
use serde_json::{Value, json};
|
|
|
|
use super::ExtractorInfo;
|
|
use crate::client::FetchClient;
|
|
use crate::error::FetchError;
|
|
|
|
/// Catalog metadata for this extractor, surfaced through the
/// `GET /v1/extractors` endpoint (name, label, description, URL patterns).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "reddit",
    label: "Reddit thread",
    description: "Returns post + nested comment tree with scores, authors, and timestamps.",
    // NOTE(review): matches() also accepts np.reddit.com and new.reddit.com,
    // which are not advertised here — confirm whether the catalog should list them.
    url_patterns: &[
        "https://www.reddit.com/r/*/comments/*",
        "https://reddit.com/r/*/comments/*",
        "https://old.reddit.com/r/*/comments/*",
    ],
};
|
|
|
|
pub fn matches(url: &str) -> bool {
|
|
let host = host_of(url);
|
|
let is_reddit_host = matches!(
|
|
host,
|
|
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
|
);
|
|
is_reddit_host && url.contains("/comments/")
|
|
}
|
|
|
|
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
|
|
let json_url = build_json_url(url);
|
|
let resp = client.fetch(&json_url).await?;
|
|
if resp.status != 200 {
|
|
return Err(FetchError::Build(format!(
|
|
"reddit api returned status {}",
|
|
resp.status
|
|
)));
|
|
}
|
|
|
|
let listings: Vec<Listing> = serde_json::from_str(&resp.html)
|
|
.map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
|
|
|
|
if listings.is_empty() {
|
|
return Err(FetchError::BodyDecode("reddit response empty".into()));
|
|
}
|
|
|
|
// First listing = the post (single t3 child).
|
|
let post = listings
|
|
.first()
|
|
.and_then(|l| l.data.children.first())
|
|
.filter(|t| t.kind == "t3")
|
|
.map(|t| post_json(&t.data))
|
|
.unwrap_or(Value::Null);
|
|
|
|
// Second listing = the comment tree.
|
|
let comments: Vec<Value> = listings
|
|
.get(1)
|
|
.map(|l| l.data.children.iter().filter_map(comment_json).collect())
|
|
.unwrap_or_default();
|
|
|
|
Ok(json!({
|
|
"url": url,
|
|
"post": post,
|
|
"comments": comments,
|
|
}))
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// JSON shapers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
fn post_json(d: &ThingData) -> Value {
|
|
json!({
|
|
"id": d.id,
|
|
"title": d.title,
|
|
"author": d.author,
|
|
"subreddit": d.subreddit_name_prefixed,
|
|
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
|
|
"url": d.url_overridden_by_dest,
|
|
"is_self": d.is_self,
|
|
"selftext": d.selftext,
|
|
"score": d.score,
|
|
"upvote_ratio": d.upvote_ratio,
|
|
"num_comments": d.num_comments,
|
|
"created_utc": d.created_utc,
|
|
"link_flair_text": d.link_flair_text,
|
|
"over_18": d.over_18,
|
|
"spoiler": d.spoiler,
|
|
"stickied": d.stickied,
|
|
"locked": d.locked,
|
|
})
|
|
}
|
|
|
|
/// Render a single comment + its reply tree. Returns `None` for non-t1
|
|
/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
|
|
fn comment_json(thing: &Thing) -> Option<Value> {
|
|
if thing.kind != "t1" {
|
|
return None;
|
|
}
|
|
let d = &thing.data;
|
|
let replies: Vec<Value> = match &d.replies {
|
|
Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
|
|
_ => Vec::new(),
|
|
};
|
|
Some(json!({
|
|
"id": d.id,
|
|
"author": d.author,
|
|
"body": d.body,
|
|
"score": d.score,
|
|
"created_utc": d.created_utc,
|
|
"is_submitter": d.is_submitter,
|
|
"stickied": d.stickied,
|
|
"depth": d.depth,
|
|
"permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
|
|
"replies": replies,
|
|
}))
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// URL helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Extract the host portion of a URL: everything between the first `://`
/// and the next `/`. A URL without a scheme is treated as starting at the
/// host; a host with no trailing path is returned whole.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    match after_scheme.find('/') {
        Some(slash) => &after_scheme[..slash],
        None => after_scheme,
    }
}
|
|
|
|
/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
/// or `old.reddit.com` as the caller gave us). Routing through
/// `old.reddit.com` unconditionally looks appealing but that host has
/// stricter UA-based blocking than `www.reddit.com`, while the main
/// host accepts our Chrome-fingerprinted client fine.
///
/// The query string AND any `#fragment` are dropped before appending the
/// `.json` suffix — a fragment left in place would otherwise land between
/// the path and the suffix and produce a broken request URL.
fn build_json_url(url: &str) -> String {
    // Strip fragment first (it may follow the query), then the query,
    // then any trailing slashes, before appending the API suffix.
    let no_fragment = url.split('#').next().unwrap_or(url);
    let clean = no_fragment
        .split('?')
        .next()
        .unwrap_or(no_fragment)
        .trim_end_matches('/');
    format!("{clean}.json?raw_json=1")
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Reddit JSON types — only fields we render. Everything else is dropped.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// One element of the top-level array Reddit returns: a wrapper around a
/// page of children.
#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}
|
|
|
|
/// Payload of a [`Listing`]: the things it contains.
#[derive(Deserialize)]
struct ListingData {
    // Posts, comments, and `more` placeholders, all as kind-tagged Things.
    children: Vec<Thing>,
}
|
|
|
|
/// A kind-tagged Reddit object. The kinds this module cares about are
/// `t3` (post) and `t1` (comment); other kinds (e.g. `more`) are skipped.
#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}
|
|
|
|
/// Union of the `t3` (post) and `t1` (comment) fields we render. Every
/// field is `Option` so a single struct can deserialize both kinds;
/// fields not listed here are dropped during deserialization.
#[derive(Deserialize, Default)]
struct ThingData {
    // post (t3)
    id: Option<String>,
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    // presumably the outbound link target on link posts — confirm against API
    url_overridden_by_dest: Option<String>,
    is_self: Option<bool>,
    upvote_ratio: Option<f64>,
    num_comments: Option<i64>,
    over_18: Option<bool>,
    spoiler: Option<bool>,
    stickied: Option<bool>,
    locked: Option<bool>,
    link_flair_text: Option<String>,

    // comment (t1) — note `author`, `score`, `created_utc`, `stickied`,
    // and `permalink` are read by both post_json and comment_json.
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    created_utc: Option<f64>,
    is_submitter: Option<bool>,
    depth: Option<i64>,
    permalink: Option<String>,

    // recursive: a comment's reply tree (or Reddit's "" sentinel — see Replies)
    replies: Option<Replies>,
}
|
|
|
|
/// The `replies` field is either a nested listing of child comments or an
/// empty string when a comment has no children — hence the untagged enum.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    // Matches the "" Reddit sends in place of an empty listing; the payload
    // itself is never read.
    #[allow(dead_code)]
    Empty(String),
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// URL matching accepts post URLs on every recognized Reddit host.
    #[test]
    fn matches_reddit_post_urls() {
        assert!(matches(
            "https://www.reddit.com/r/rust/comments/abc123/some_title/"
        ));
        assert!(matches(
            "https://reddit.com/r/rust/comments/abc123/some_title"
        ));
        assert!(matches("https://old.reddit.com/r/rust/comments/abc123/x/"));
        // np/new hosts are accepted by matches() even though the INFO
        // catalog only advertises apex/www/old — pin that contract.
        assert!(matches("https://np.reddit.com/r/rust/comments/abc123/x/"));
        assert!(matches("https://new.reddit.com/r/rust/comments/abc123/x/"));
    }

    /// Non-post Reddit URLs and non-Reddit hosts are rejected.
    #[test]
    fn rejects_non_post_reddit_urls() {
        assert!(!matches("https://www.reddit.com/r/rust"));
        assert!(!matches("https://www.reddit.com/user/foo"));
        assert!(!matches("https://example.com/r/rust/comments/x"));
    }

    /// `.json?raw_json=1` is appended after the query string and any
    /// trailing slashes are stripped.
    #[test]
    fn json_url_appends_suffix_and_drops_query() {
        assert_eq!(
            build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
            "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
        );
        // Trailing slashes alone (no query) are also normalized away.
        assert_eq!(
            build_json_url("https://www.reddit.com/r/rust/comments/abc/x/"),
            "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
        );
    }
}
|