Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io
This commit is contained in:
Valerio 2026-03-23 18:31:11 +01:00
commit c99ec684fa
79 changed files with 24074 additions and 0 deletions

View file

@ -0,0 +1,172 @@
/// Reddit JSON API fallback for extracting posts + comments without JS rendering.
///
/// Reddit's new `shreddit` frontend only SSRs the post body — comments are
/// loaded client-side. Appending `.json` to any Reddit URL returns the full
/// comment tree as structured JSON, which we convert to clean markdown.
use serde::Deserialize;
use tracing::debug;
use webclaw_core::{Content, ExtractionResult, Metadata};
/// Check if a URL points to a Reddit post/comment page.
pub fn is_reddit_url(url: &str) -> bool {
let host = url
.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("");
matches!(
host,
"reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com" | "new.reddit.com"
)
}
/// Build the `.json` URL from a Reddit page URL.
pub fn json_url(url: &str) -> String {
let clean = url.split('?').next().unwrap_or(url).trim_end_matches('/');
format!("{clean}.json")
}
/// Convert Reddit JSON API response into an ExtractionResult.
pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
let listings: Vec<Listing> =
serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?;
let mut markdown = String::new();
let mut title = None;
let mut author = None;
let mut subreddit = None;
// First listing = the post itself
if let Some(post_listing) = listings.first() {
for child in &post_listing.data.children {
if child.kind == "t3" {
let d = &child.data;
title = d.title.clone();
author = d.author.clone();
subreddit = d.subreddit_name_prefixed.clone();
if let Some(ref t) = title {
markdown.push_str(&format!("# {t}\n\n"));
}
if let (Some(a), Some(sr)) = (&author, &subreddit) {
markdown.push_str(&format!("**u/{a}** in {sr}\n\n"));
}
if let Some(ref body) = d.selftext
&& !body.is_empty()
{
markdown.push_str(body);
markdown.push_str("\n\n");
}
if let Some(ref url_field) = d.url_overridden_by_dest
&& !url_field.is_empty()
{
markdown.push_str(&format!("[Link]({url_field})\n\n"));
}
markdown.push_str("---\n\n");
}
}
}
// Second listing = comment tree
if let Some(comment_listing) = listings.get(1) {
markdown.push_str("## Comments\n\n");
for child in &comment_listing.data.children {
render_comment(child, 0, &mut markdown);
}
}
let word_count = markdown.split_whitespace().count();
debug!(word_count, "reddit json extracted");
Ok(ExtractionResult {
metadata: Metadata {
title,
description: None,
author,
published_date: None,
language: Some("en".into()),
url: Some(url.to_string()),
site_name: subreddit,
image: None,
favicon: None,
word_count,
},
content: Content {
markdown,
plain_text: String::new(),
links: vec![],
images: vec![],
code_blocks: vec![],
raw_html: None,
},
domain_data: None,
structured_data: vec![],
})
}
fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
if thing.kind != "t1" {
return;
}
let d = &thing.data;
let indent = " ".repeat(depth);
let author = d.author.as_deref().unwrap_or("[deleted]");
let body = d.body.as_deref().unwrap_or("[removed]");
let score = d.score.unwrap_or(0);
out.push_str(&format!("{indent}- **u/{author}** ({score} pts)\n"));
for line in body.lines() {
out.push_str(&format!("{indent} {line}\n"));
}
out.push('\n');
// Recurse into replies
if let Some(Replies::Listing(listing)) = &d.replies {
for child in &listing.data.children {
render_comment(child, depth + 1, out);
}
}
}
// --- Reddit JSON types (minimal) ---
#[derive(Deserialize)]
struct Listing {
data: ListingData,
}
#[derive(Deserialize)]
struct ListingData {
children: Vec<Thing>,
}
#[derive(Deserialize)]
struct Thing {
kind: String,
data: ThingData,
}
#[derive(Deserialize)]
struct ThingData {
// Post fields (t3)
title: Option<String>,
selftext: Option<String>,
subreddit_name_prefixed: Option<String>,
url_overridden_by_dest: Option<String>,
// Comment fields (t1)
author: Option<String>,
body: Option<String>,
score: Option<i64>,
replies: Option<Replies>,
}
/// Reddit replies can be either a nested Listing or an empty string.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
Listing(Listing),
#[allow(dead_code)]
Empty(String),
}