Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
2026-04-25 00:06:21 +02:00 · 2026-03-23 18:31:11 +01:00 · 2026-03-23 18:31:11 +01:00 · c99ec684fa
commit c99ec684fa
79 changed files with 24074 additions and 0 deletions
--- a/crates/webclaw-fetch/src/reddit.rs
+++ b/crates/webclaw-fetch/src/reddit.rs
@ -0,0 +1,172 @@
+//! Reddit JSON API fallback for extracting posts + comments without JS rendering.
+//!
+//! Reddit's new `shreddit` frontend server-side renders only the post body;
+//! comments are loaded client-side. Appending `.json` to any Reddit URL returns
+//! the full comment tree as structured JSON, which we convert to clean markdown.
+use serde::Deserialize;
+use tracing::debug;
+use webclaw_core::{Content, ExtractionResult, Metadata};
+
+/// Check whether a URL's host is one of Reddit's web front-ends.
+///
+/// Only the host is inspected; the path is not validated, so any page on a
+/// recognised Reddit host matches. The scheme is optional. Host comparison is
+/// ASCII case-insensitive, and userinfo (`user@`), an explicit port (`:443`),
+/// and a query or fragment that directly follows the host are ignored.
+pub fn is_reddit_url(url: &str) -> bool {
+    const REDDIT_HOSTS: [&str; 5] = [
+        "reddit.com",
+        "www.reddit.com",
+        "old.reddit.com",
+        "np.reddit.com",
+        "new.reddit.com",
+    ];
+
+    // Drop the scheme if there is one; bare `host/path` inputs are accepted too.
+    let rest = url.split("://").nth(1).unwrap_or(url);
+    // The authority ends at the first path, query, or fragment delimiter.
+    let authority = rest
+        .split(|c: char| matches!(c, '/' | '?' | '#'))
+        .next()
+        .unwrap_or("");
+    // Userinfo sits before the last `@`, so the real host is whatever follows
+    // it. This keeps `reddit.com@evil.example` from being mistaken for Reddit.
+    let host_port = authority.rsplit('@').next().unwrap_or(authority);
+    // Strip an optional `:port` suffix.
+    let host = host_port.split(':').next().unwrap_or(host_port);
+
+    REDDIT_HOSTS
+        .iter()
+        .any(|known| host.eq_ignore_ascii_case(known))
+}
+
+/// Build the `.json` API URL for a Reddit page URL.
+///
+/// The query string and fragment are dropped, trailing slashes are trimmed,
+/// and `.json` is appended. A URL whose path already ends in `.json` is
+/// returned without a second suffix.
+pub fn json_url(url: &str) -> String {
+    // Cut at the first `?` or `#`: a fragment is never sent to the server, so
+    // appending `.json` after it would silently request the HTML page instead.
+    let clean = url
+        .split(|c: char| matches!(c, '?' | '#'))
+        .next()
+        .unwrap_or(url)
+        .trim_end_matches('/');
+
+    if clean.ends_with(".json") {
+        clean.to_string()
+    } else {
+        format!("{clean}.json")
+    }
+}
+
+/// Turn the body of a Reddit `.json` response into an [`ExtractionResult`].
+///
+/// The payload is parsed as an array of listings. `t3` entries of the first
+/// listing are rendered as the post (title, author/subreddit line, self text,
+/// outbound link); the entries of the second listing are rendered as a nested
+/// comment list under a `## Comments` heading.
+///
+/// # Errors
+///
+/// Returns `"reddit json parse: …"` when `json_bytes` does not deserialize
+/// into the expected listing array.
+pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResult, String> {
+    let listings = match serde_json::from_slice::<Vec<Listing>>(json_bytes) {
+        Ok(parsed) => parsed,
+        Err(e) => return Err(format!("reddit json parse: {e}")),
+    };
+
+    let mut markdown = String::new();
+    let mut title: Option<String> = None;
+    let mut author: Option<String> = None;
+    let mut subreddit: Option<String> = None;
+
+    // Listing #0 carries the post itself; a missing listing means no entries.
+    let post_entries = listings
+        .first()
+        .map(|listing| listing.data.children.as_slice())
+        .unwrap_or(&[]);
+
+    for post in post_entries.iter().filter(|entry| entry.kind == "t3") {
+        let data = &post.data;
+        title = data.title.clone();
+        author = data.author.clone();
+        subreddit = data.subreddit_name_prefixed.clone();
+
+        if let Some(heading) = title.as_deref() {
+            markdown.push_str(&format!("# {heading}\n\n"));
+        }
+        // The byline is only written when both pieces are known.
+        if let Some((name, sub)) = author.as_deref().zip(subreddit.as_deref()) {
+            markdown.push_str(&format!("**u/{name}** in {sub}\n\n"));
+        }
+
+        let self_text = data.selftext.as_deref().filter(|text| !text.is_empty());
+        if let Some(text) = self_text {
+            markdown.push_str(text);
+            markdown.push_str("\n\n");
+        }
+
+        let outbound = data
+            .url_overridden_by_dest
+            .as_deref()
+            .filter(|dest| !dest.is_empty());
+        if let Some(dest) = outbound {
+            markdown.push_str(&format!("[Link]({dest})\n\n"));
+        }
+
+        markdown.push_str("---\n\n");
+    }
+
+    // Listing #1 carries the comment tree.
+    if let Some(comments) = listings.get(1) {
+        markdown.push_str("## Comments\n\n");
+        comments
+            .data
+            .children
+            .iter()
+            .for_each(|entry| render_comment(entry, 0, &mut markdown));
+    }
+
+    let word_count = markdown.split_whitespace().count();
+    debug!(word_count, "reddit json extracted");
+
+    let metadata = Metadata {
+        title,
+        description: None,
+        author,
+        published_date: None,
+        language: Some("en".into()),
+        url: Some(url.to_string()),
+        site_name: subreddit,
+        image: None,
+        favicon: None,
+        word_count,
+    };
+    let content = Content {
+        markdown,
+        plain_text: String::new(),
+        links: Vec::new(),
+        images: Vec::new(),
+        code_blocks: Vec::new(),
+        raw_html: None,
+    };
+
+    Ok(ExtractionResult {
+        metadata,
+        content,
+        domain_data: None,
+        structured_data: Vec::new(),
+    })
+}
+
+/// Append one comment and, recursively, its replies to `out` as a nested
+/// markdown bullet list.
+///
+/// Entries whose kind is not `t1` are ignored. Each nesting level is indented
+/// by two spaces. A missing author is shown as `[deleted]`, a missing body as
+/// `[removed]`, and a missing score as `0`.
+fn render_comment(thing: &Thing, depth: usize, out: &mut String) {
+    if thing.kind != "t1" {
+        return;
+    }
+
+    let comment = &thing.data;
+    let pad = "  ".repeat(depth);
+    let who = match &comment.author {
+        Some(name) => name.as_str(),
+        None => "[deleted]",
+    };
+    let text = match &comment.body {
+        Some(text) => text.as_str(),
+        None => "[removed]",
+    };
+    let points = comment.score.unwrap_or(0);
+
+    let header = format!("{pad}- **u/{who}** ({points} pts)\n");
+    out.push_str(&header);
+    // Body lines sit one level deeper than the bullet so they stay attached to it.
+    out.extend(text.lines().map(|line| format!("{pad}  {line}\n")));
+    out.push('\n');
+
+    // Only a nested listing holds replies; the string form and `None` hold none.
+    let Some(Replies::Listing(nested)) = &comment.replies else {
+        return;
+    };
+    for reply in &nested.data.children {
+        render_comment(reply, depth + 1, out);
+    }
+}
+
+// --- Reddit JSON types (minimal) ---
+
+/// Outer envelope of a listing; only its `data` payload is deserialized.
+#[derive(Deserialize)]
+struct Listing {
+    /// Payload holding the listing's entries.
+    data: ListingData,
+}
+
+/// Payload of a [`Listing`].
+#[derive(Deserialize)]
+struct ListingData {
+    /// Entries of the listing, in the order they appear in the response.
+    children: Vec<Thing>,
+}
+
+/// A single entry of a listing's `children` array.
+#[derive(Deserialize)]
+struct Thing {
+    /// Type tag. This file renders `"t3"` entries as the post and `"t1"`
+    /// entries as comments; entries of any other kind are skipped.
+    kind: String,
+    /// Fields of the entry, shared between posts and comments.
+    data: ThingData,
+}
+
+/// Union of the post and comment fields this file reads.
+///
+/// Every field is optional, so an entry that lacks some of them still
+/// deserializes, with the absent fields left as `None`.
+#[derive(Deserialize)]
+struct ThingData {
+    // Post fields (t3)
+    title: Option<String>,
+    selftext: Option<String>,
+    subreddit_name_prefixed: Option<String>,
+    url_overridden_by_dest: Option<String>,
+    // Comment fields (t1)
+    // `author` is read for posts as well as comments.
+    author: Option<String>,
+    body: Option<String>,
+    score: Option<i64>,
+    replies: Option<Replies>,
+}
+
+/// Reddit replies can be either a nested Listing or an empty string.
+///
+/// The enum is untagged, so serde tries the variants in declaration order:
+/// a nested listing first, then a plain string.
+#[derive(Deserialize)]
+#[serde(untagged)]
+enum Replies {
+    /// Nested listing holding the reply comments.
+    Listing(Listing),
+    /// String form of the field. The payload is never read; the variant only
+    /// exists so that this form deserializes, hence the `dead_code` allowance.
+    #[allow(dead_code)]
+    Empty(String),
+}