mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
Initial release: webclaw v0.1.0 — web content extraction for LLMs
CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
This commit is contained in:
commit
c99ec684fa
79 changed files with 24074 additions and 0 deletions
172
crates/webclaw-fetch/src/reddit.rs
Normal file
172
crates/webclaw-fetch/src/reddit.rs
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
|
||||
///
|
||||
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
|
||||
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
|
||||
/// comment tree as structured JSON, which we convert to clean markdown.
|
||||
use serde::Deserialize;
|
||||
use tracing::debug;
|
||||
use webclaw_core::{Content, ExtractionResult, Metadata};
|
||||
|
||||
/// Check if a URL points to a Reddit post/comment page.
|
||||
pub fn is_reddit_url(url: &str) -> bool {
|
||||
let host = url
|
||||
.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("");
|
||||
matches!(
|
||||
host,
|
||||
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
|
||||
)
|
||||
}
|
||||
|
||||
/// Build the `.json` URL from a Reddit page URL.
|
||||
pub fn json_url(url: &str) -> String {
|
||||
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
||||
format!("{clean}.json")
|
||||
}
|
||||
|
||||
/// Convert Reddit JSON API response into an ExtractionResult.
|
||||
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
|
||||
let listings: Vec<Listing> =
|
||||
serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
|
||||
|
||||
let mut markdown = String::new();
|
||||
let mut title = None;
|
||||
let mut author = None;
|
||||
let mut subreddit = None;
|
||||
|
||||
// First listing = the post itself
|
||||
if let Some(post_listing) = listings.first() {
|
||||
for child in &post_listing.data.children {
|
||||
if child.kind == "t3" {
|
||||
let d = &child.data;
|
||||
title = d.title.clone();
|
||||
author = d.author.clone();
|
||||
subreddit = d.subreddit_name_prefixed.clone();
|
||||
|
||||
if let Some(ref t) = title {
|
||||
markdown.push_str(&format!("# {t}\n\n"));
|
||||
}
|
||||
if let (Some(a), Some(sr)) = (&author, &subreddit) {
|
||||
markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
|
||||
}
|
||||
if let Some(ref body) = d.selftext
|
||||
&& !body.is_empty()
|
||||
{
|
||||
markdown.push_str(body);
|
||||
markdown.push_str("\n\n");
|
||||
}
|
||||
if let Some(ref url_field) = d.url_overridden_by_dest
|
||||
&& !url_field.is_empty()
|
||||
{
|
||||
markdown.push_str(&format!("[Link]({url_field})\n\n"));
|
||||
}
|
||||
markdown.push_str("---\n\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Second listing = comment tree
|
||||
if let Some(comment_listing) = listings.get(1) {
|
||||
markdown.push_str("## Comments\n\n");
|
||||
for child in &comment_listing.data.children {
|
||||
render_comment(child, 0, &mut markdown);
|
||||
}
|
||||
}
|
||||
|
||||
let word_count = markdown.split_whitespace().count();
|
||||
debug!(word_count, "reddit json extracted");
|
||||
|
||||
Ok(ExtractionResult {
|
||||
metadata: Metadata {
|
||||
title,
|
||||
description: None,
|
||||
author,
|
||||
published_date: None,
|
||||
language: Some("en".into()),
|
||||
url: Some(url.to_string()),
|
||||
site_name: subreddit,
|
||||
image: None,
|
||||
favicon: None,
|
||||
word_count,
|
||||
},
|
||||
content: Content {
|
||||
markdown,
|
||||
plain_text: String::new(),
|
||||
links: vec![],
|
||||
images: vec![],
|
||||
code_blocks: vec![],
|
||||
raw_html: None,
|
||||
},
|
||||
domain_data: None,
|
||||
structured_data: vec![],
|
||||
})
|
||||
}
|
||||
|
||||
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
|
||||
if thing.kind != "t1" {
|
||||
return;
|
||||
}
|
||||
let d = &thing.data;
|
||||
let indent = " ".repeat(depth);
|
||||
let author = d.author.as_deref().unwrap_or("[deleted]");
|
||||
let body = d.body.as_deref().unwrap_or("[removed]");
|
||||
let score = d.score.unwrap_or(0);
|
||||
|
||||
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
|
||||
for line in body.lines() {
|
||||
out.push_str(&format!("{indent} {line}\n"));
|
||||
}
|
||||
out.push('\n');
|
||||
|
||||
// Recurse into replies
|
||||
if let Some(Replies::Listing(listing)) = &d.replies {
|
||||
for child in &listing.data.children {
|
||||
render_comment(child, depth + 1, out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- Reddit JSON types (minimal) ---
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Listing {
|
||||
data: ListingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ListingData {
|
||||
children: Vec<Thing>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Thing {
|
||||
kind: String,
|
||||
data: ThingData,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ThingData {
|
||||
// Post fields (t3)
|
||||
title: Option<String>,
|
||||
selftext: Option<String>,
|
||||
subreddit_name_prefixed: Option<String>,
|
||||
url_overridden_by_dest: Option<String>,
|
||||
// Comment fields (t1)
|
||||
author: Option<String>,
|
||||
body: Option<String>,
|
||||
score: Option<i64>,
|
||||
replies: Option<Replies>,
|
||||
}
|
||||
|
||||
/// Reddit replies can be either a nested Listing or an empty string.
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum Replies {
|
||||
Listing(Listing),
|
||||
#[allow(dead_code)]
|
||||
Empty(String),
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue