feat(extractors): add vertical extractors module + first 6 verticals

New extractors module returns site-specific typed JSON instead of
generic markdown. Each extractor:
- declares a URL pattern via matches()
- fetches from the site's official JSON API where one exists
- returns a typed serde_json::Value with documented field names
- exposes an INFO struct that powers the /v1/extractors catalog

First 6 verticals shipped, all hitting public JSON APIs (no HTML
scraping, zero antibot risk):

- reddit       → www.reddit.com/*/.json
- hackernews   → hn.algolia.com/api/v1/items/{id} (full thread in one call)
- github_repo  → api.github.com/repos/{owner}/{repo}
- pypi         → pypi.org/pypi/{name}/json
- npm          → registry.npmjs.org/{name} + downloads/point/last-week
- huggingface_model → huggingface.co/api/models/{owner}/{name}

Server-side routes added:
- POST /v1/scrape/{vertical}  explicit per-vertical extraction
- GET  /v1/extractors         catalog (name, label, description, url_patterns)

The dispatcher validates that URL matches the requested vertical
before running, so users get "URL doesn't match the X extractor"
instead of opaque parse failures inside the extractor.

17 unit tests cover URL matching + path parsing for each vertical.
Live tests against canonical URLs (rust-lang/rust, requests pypi,
react npm, whisper-large-v3 hf, item 8863 hn, an r/micro_saas post)
all return correct typed JSON in 100-300ms. Sample sizes: github
863B, npm 700B, pypi 1.7KB, hf 3.2KB, hn 38KB (full comment tree).

Marketing positioning: Firecrawl charges 5 credits per /extract call
and makes you write the schema yourself. Webclaw returns the same JSON
for 1 credit per /scrape/{vertical} call, using hand-written
deterministic extractors per site.
This commit is contained in:
Valerio 2026-04-22 14:11:43 +02:00
parent ccdb6d364b
commit 8ba7538c37
11 changed files with 1535 additions and 0 deletions

View file

@ -0,0 +1,212 @@
//! GitHub repository structured extractor.
//!
//! Uses GitHub's public REST API at `api.github.com/repos/{owner}/{repo}`.
//! Unauthenticated requests get 60/hour per IP, which is fine for users
//! self-hosting and for low-volume cloud usage. Production cloud should
//! set a `GITHUB_TOKEN` to lift to 5,000/hour, but the extractor doesn't
//! depend on it being set — it works open out of the box.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry for `GET /v1/extractors`; `name` is also the
/// `/v1/scrape/{vertical}` route segment for this extractor.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "github_repo",
    label: "GitHub repository",
    description: "Returns repo metadata: stars, forks, topics, license, default branch, recent activity.",
    url_patterns: &["https://github.com/{owner}/{repo}"],
};
/// True when `url` is a repository root on github.com.
///
/// Accepts exactly `https://github.com/{owner}/{repo}` (optionally with a
/// trailing slash, query string, or fragment). Sub-pages (issues, pulls,
/// blob, ...) are rejected so future github_issue / github_pr extractors
/// can claim them, as are GitHub's reserved top-level namespaces.
pub fn matches(url: &str) -> bool {
    let after_scheme = url.split("://").nth(1);
    let host = after_scheme.unwrap_or(url).split('/').next().unwrap_or("");
    if !matches!(host, "github.com" | "www.github.com") {
        return false;
    }
    // Everything after the first '/' following the host is the path.
    let path = match after_scheme.and_then(|rest| rest.split_once('/')) {
        Some((_, p)) => p,
        None => "",
    };
    // Drop query/fragment and a trailing slash before segmenting.
    let clean = path
        .split(['?', '#'])
        .next()
        .unwrap_or("")
        .trim_end_matches('/');
    let parts: Vec<&str> = clean.split('/').filter(|p| !p.is_empty()).collect();
    parts.len() == 2 && !RESERVED_OWNERS.contains(&parts[0])
}
/// GitHub uses some top-level paths for non-repo pages.
const RESERVED_OWNERS: &[&str] = &[
    "settings",
    "marketplace",
    "explore",
    "topics",
    "trending",
    "collections",
    "events",
    "sponsors",
    "issues",
    "pulls",
    "notifications",
    "new",
    "organizations",
    "login",
    "join",
    "search",
    "about",
];
/// Fetch repository metadata from the GitHub REST API and shape it into
/// our stable JSON contract.
///
/// # Errors
/// `FetchError::Build` for unparsable URLs, 404 (unknown repo), 403
/// (rate limited), or any other non-200 status; `FetchError::BodyDecode`
/// when the API response is not valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, repo) = parse_owner_repo(url).ok_or_else(|| {
        FetchError::Build(format!("github_repo: cannot parse owner/repo from '{url}'"))
    })?;
    let endpoint = format!("https://api.github.com/repos/{owner}/{repo}");
    let resp = client.fetch(&endpoint).await?;
    // The interesting statuses are mutually exclusive; map each to a
    // targeted error before attempting to parse.
    match resp.status {
        200 => {}
        404 => {
            return Err(FetchError::Build(format!(
                "github_repo: repo '{owner}/{repo}' not found"
            )));
        }
        403 => {
            return Err(FetchError::Build(
                "github_repo: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
            ));
        }
        other => {
            return Err(FetchError::Build(format!(
                "github api returned status {other}"
            )));
        }
    }
    let data: Repo = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("github api parse: {e}")))?;
    Ok(json!({
        "url": url,
        "owner": data.owner.as_ref().map(|o| &o.login),
        "name": data.name,
        "full_name": data.full_name,
        "description": data.description,
        "homepage": data.homepage,
        "language": data.language,
        "topics": data.topics,
        "license": data.license.as_ref().and_then(|l| l.spdx_id.clone()),
        "license_name": data.license.as_ref().map(|l| l.name.clone()),
        "default_branch": data.default_branch,
        "stars": data.stargazers_count,
        "forks": data.forks_count,
        "watchers": data.subscribers_count,
        "open_issues": data.open_issues_count,
        "size_kb": data.size,
        "archived": data.archived,
        "fork": data.fork,
        "is_template": data.is_template,
        "has_issues": data.has_issues,
        "has_wiki": data.has_wiki,
        "has_pages": data.has_pages,
        "has_discussions": data.has_discussions,
        "created_at": data.created_at,
        "updated_at": data.updated_at,
        "pushed_at": data.pushed_at,
        "html_url": data.html_url,
    }))
}
/// Pull `(owner, repo)` out of a github.com URL: the first two non-empty
/// path segments, after stripping query/fragment and a trailing slash.
/// Returns `None` when fewer than two segments are present.
fn parse_owner_repo(url: &str) -> Option<(String, String)> {
    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|p| !p.is_empty());
    match (parts.next(), parts.next()) {
        (Some(owner), Some(repo)) => Some((owner.to_string(), repo.to_string())),
        _ => None,
    }
}
// ---------------------------------------------------------------------------
// GitHub API types — only the fields we surface
// ---------------------------------------------------------------------------
/// Subset of GitHub's "get a repository" API response that we read.
/// Every field is `Option` so upstream schema drift degrades to `null`s
/// in our output instead of a hard parse failure.
#[derive(Deserialize)]
struct Repo {
    name: Option<String>,
    full_name: Option<String>,
    description: Option<String>,
    homepage: Option<String>,
    language: Option<String>,
    // Absent `topics` deserializes as an empty list.
    #[serde(default)]
    topics: Vec<String>,
    license: Option<License>,
    default_branch: Option<String>,
    stargazers_count: Option<i64>,
    forks_count: Option<i64>,
    // Surfaced as "watchers" by extract().
    subscribers_count: Option<i64>,
    open_issues_count: Option<i64>,
    // Surfaced as "size_kb" by extract().
    size: Option<i64>,
    archived: Option<bool>,
    fork: Option<bool>,
    is_template: Option<bool>,
    has_issues: Option<bool>,
    has_wiki: Option<bool>,
    has_pages: Option<bool>,
    has_discussions: Option<bool>,
    created_at: Option<String>,
    updated_at: Option<String>,
    pushed_at: Option<String>,
    html_url: Option<String>,
    owner: Option<Owner>,
}
/// Repo owner object; only the login is surfaced.
#[derive(Deserialize)]
struct Owner {
    login: String,
}
/// License object; `spdx_id` may be absent in the API response.
#[derive(Deserialize)]
struct License {
    name: String,
    spdx_id: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `matches` must claim only repo roots: reserved namespaces,
    /// sub-pages, single-segment paths, and foreign hosts are rejected.
    #[test]
    fn matches_repo_root_only() {
        assert!(matches("https://github.com/rust-lang/rust"));
        assert!(matches("https://github.com/rust-lang/rust/"));
        assert!(!matches("https://github.com/rust-lang/rust/issues"));
        assert!(!matches("https://github.com/rust-lang/rust/pulls/123"));
        assert!(!matches("https://github.com/rust-lang"));
        assert!(!matches("https://github.com/marketplace"));
        assert!(!matches("https://github.com/topics/rust"));
        assert!(!matches("https://example.com/foo/bar"));
        // Reserved namespaces stay rejected even with two path segments,
        // and github subdomains are not github.com.
        assert!(!matches("https://github.com/settings/repositories"));
        assert!(!matches("https://gist.github.com/user/abc123"));
    }

    #[test]
    fn parse_owner_repo_handles_trailing_slash_and_query() {
        assert_eq!(
            parse_owner_repo("https://github.com/rust-lang/rust"),
            Some(("rust-lang".into(), "rust".into()))
        );
        assert_eq!(
            parse_owner_repo("https://github.com/rust-lang/rust/?tab=foo"),
            Some(("rust-lang".into(), "rust".into()))
        );
        // Fragments are stripped just like query strings.
        assert_eq!(
            parse_owner_repo("https://github.com/rust-lang/rust#readme"),
            Some(("rust-lang".into(), "rust".into()))
        );
    }
}

View file

@ -0,0 +1,186 @@
//! Hacker News structured extractor.
//!
//! Uses Algolia's HN API (`hn.algolia.com/api/v1/items/{id}`) which
//! returns the full post + recursive comment tree in a single request.
//! The official Firebase API at `hacker-news.firebaseio.com` requires
//! N+1 fetches per comment, so we'd hit either timeout or rate-limit
//! on any non-trivial thread.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry for `GET /v1/extractors`; `name` is also the
/// `/v1/scrape/{vertical}` route segment for this extractor.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "hackernews",
    label: "Hacker News story",
    description: "Returns post + nested comment tree for a Hacker News item.",
    url_patterns: &[
        "https://news.ycombinator.com/item?id=N",
        "https://hn.algolia.com/items/N",
    ],
};
/// True when `url` points at a single HN item: the canonical
/// `news.ycombinator.com/item?id=N` form (also percent-encoded `%3F`)
/// or the Algolia mirror's `hn.algolia.com/items/N` form.
pub fn matches(url: &str) -> bool {
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    match host {
        "news.ycombinator.com" => url.contains("item?id=") || url.contains("item%3Fid="),
        "hn.algolia.com" => url.contains("/items/"),
        _ => false,
    }
}
/// Fetch a full HN thread (post + recursive comment tree) via Algolia
/// and shape it into `{url, post, comments}`.
///
/// # Errors
/// `FetchError::Build` for unparsable URLs or non-200 statuses;
/// `FetchError::BodyDecode` when the response is not valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    // Resolve the item id first so a malformed URL fails fast with a clear error.
    let id = parse_item_id(url).ok_or_else(|| {
        FetchError::Build(format!("hackernews: cannot parse item id from '{url}'"))
    })?;
    let endpoint = format!("https://hn.algolia.com/api/v1/items/{id}");
    let resp = client.fetch(&endpoint).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "hn algolia returned status {}",
            resp.status
        )));
    }
    let item: AlgoliaItem = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("hn algolia parse: {e}")))?;
    // Top-level children are the root comments; nested replies are
    // handled recursively inside `comment_json`.
    let thread: Vec<Value> = item.children.iter().filter_map(comment_json).collect();
    Ok(json!({
        "url": url,
        "post": post_json(&item),
        "comments": thread,
    }))
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/// Pull the numeric id out of a HN URL. Handles `item?id=N` and the
/// Algolia mirror's `/items/N` form. Returns `None` when no numeric id
/// can be found.
fn parse_item_id(url: &str) -> Option<u64> {
    // `item?id=N` form. Terminate the id on '&' *and* '#': HN links are
    // frequently shared with a comment fragment (`item?id=123#up_123`),
    // which previously left the fragment glued to the digits, failed the
    // parse, and made the extractor reject a perfectly valid URL.
    if let Some(after) = url.split("id=").nth(1) {
        let n = after.split(['&', '#']).next().unwrap_or(after);
        if let Ok(id) = n.parse::<u64>() {
            return Some(id);
        }
    }
    // Algolia mirror `/items/N` form.
    if let Some(after) = url.split("/items/").nth(1) {
        let n = after.split(['/', '?', '#']).next().unwrap_or(after);
        if let Ok(id) = n.parse::<u64>() {
            return Some(id);
        }
    }
    None
}
/// Shape the top-level Algolia item into the `post` object of our
/// response. `comment_count` is computed by walking the comment tree,
/// and `permalink` reconstructs the canonical news.ycombinator.com URL.
fn post_json(item: &AlgoliaItem) -> Value {
    json!({
        "id": item.id,
        "type": item.r#type,
        "title": item.title,
        "url": item.url,
        "author": item.author,
        "points": item.points,
        "text": item.text, // populated for ask/show/tell
        "created_at": item.created_at,
        "created_at_unix": item.created_at_i,
        "comment_count": count_descendants(item),
        "permalink": item.id.map(|i| format!("https://news.ycombinator.com/item?id={i}")),
    })
}
/// Recursively shape one comment (and its reply subtree) into JSON.
/// Returns `None` for children whose `type` is not "comment", which
/// the callers' `filter_map` then drops from the tree.
fn comment_json(item: &AlgoliaItem) -> Option<Value> {
    if !matches!(item.r#type.as_deref(), Some("comment")) {
        return None;
    }
    // Dead/deleted comments still appear in the tree; surface them honestly.
    let replies: Vec<Value> = item.children.iter().filter_map(comment_json).collect();
    Some(json!({
        "id": item.id,
        "author": item.author,
        "text": item.text,
        "created_at": item.created_at,
        "created_at_unix": item.created_at_i,
        "parent_id": item.parent_id,
        "story_id": item.story_id,
        "replies": replies,
    }))
}
/// Count all comments below `item`, recursively. Non-comment children
/// contribute nothing (and their subtrees are not descended into —
/// same as the original filter+map+sum formulation).
fn count_descendants(item: &AlgoliaItem) -> usize {
    item.children.iter().fold(0, |total, child| {
        if matches!(child.r#type.as_deref(), Some("comment")) {
            total + 1 + count_descendants(child)
        } else {
            total
        }
    })
}
// ---------------------------------------------------------------------------
// Algolia API types
// ---------------------------------------------------------------------------
/// One node of the Algolia item tree — the story and every comment share
/// this shape. All fields are optional; `children` defaults to empty so
/// leaf comments parse cleanly.
#[derive(Deserialize)]
struct AlgoliaItem {
    id: Option<u64>,
    // e.g. "comment" — that value gates comment_json / count_descendants.
    r#type: Option<String>,
    title: Option<String>,
    url: Option<String>,
    author: Option<String>,
    points: Option<i64>,
    text: Option<String>,
    created_at: Option<String>,
    created_at_i: Option<i64>,
    parent_id: Option<u64>,
    story_id: Option<u64>,
    #[serde(default)]
    children: Vec<AlgoliaItem>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_hn_item_urls() {
        assert!(matches("https://news.ycombinator.com/item?id=1"));
        assert!(matches("https://news.ycombinator.com/item?id=12345"));
        assert!(matches("https://hn.algolia.com/items/1"));
        // Query strings after the id segment don't affect matching.
        assert!(matches("https://hn.algolia.com/items/123?query=x"));
    }

    #[test]
    fn rejects_non_item_urls() {
        assert!(!matches("https://news.ycombinator.com/"));
        assert!(!matches("https://news.ycombinator.com/news"));
        assert!(!matches("https://example.com/item?id=1"));
    }

    #[test]
    fn parse_item_id_handles_both_forms() {
        assert_eq!(
            parse_item_id("https://news.ycombinator.com/item?id=1"),
            Some(1)
        );
        assert_eq!(
            parse_item_id("https://news.ycombinator.com/item?id=12345&p=2"),
            Some(12345)
        );
        assert_eq!(parse_item_id("https://hn.algolia.com/items/999"), Some(999));
        // Trailing query on the Algolia form is stripped before parsing.
        assert_eq!(
            parse_item_id("https://hn.algolia.com/items/999?query=x"),
            Some(999)
        );
        assert_eq!(parse_item_id("https://example.com/foo"), None);
    }
}

View file

@ -0,0 +1,223 @@
//! HuggingFace model card structured extractor.
//!
//! Uses the public model API at `huggingface.co/api/models/{owner}/{name}`.
//! Returns metadata + the parsed model card front matter, but does not
//! pull the full README body — those are sometimes 100KB+ and the user
//! can hit /v1/scrape if they want it as markdown.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry for `GET /v1/extractors`; `name` is also the
/// `/v1/scrape/{vertical}` route segment for this extractor.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "huggingface_model",
    label: "HuggingFace model",
    description: "Returns model metadata: downloads, likes, license, pipeline tag, library name, file list.",
    url_patterns: &["https://huggingface.co/{owner}/{name}"],
};
/// True when `url` looks like a model page: exactly `/{owner}/{name}`
/// on huggingface.co, excluding HF-internal sections (`datasets`,
/// `spaces`, `docs`, ...) listed in `RESERVED_NAMESPACES`.
pub fn matches(url: &str) -> bool {
    // Host extraction inlined (same logic as `host_of`): strip the
    // scheme, then take everything up to the first '/'.
    let after_scheme = url.split("://").nth(1);
    let host = after_scheme.unwrap_or(url).split('/').next().unwrap_or("");
    if !(host == "huggingface.co" || host == "www.huggingface.co") {
        return false;
    }
    let path = match after_scheme.and_then(|rest| rest.split_once('/')) {
        Some((_, p)) => p,
        None => "",
    };
    let clean = path
        .split(['?', '#'])
        .next()
        .unwrap_or("")
        .trim_end_matches('/');
    let parts: Vec<&str> = clean.split('/').filter(|p| !p.is_empty()).collect();
    // /{owner}/{name} with a non-reserved first segment.
    parts.len() == 2 && !RESERVED_NAMESPACES.contains(&parts[0])
}
/// Top-level paths huggingface.co uses for non-model pages.
const RESERVED_NAMESPACES: &[&str] = &[
    "datasets",
    "spaces",
    "blog",
    "docs",
    "api",
    "models",
    "papers",
    "pricing",
    "tasks",
    "join",
    "login",
    "settings",
    "organizations",
    "new",
    "search",
];
/// Fetch model metadata from the HF Hub API and shape it into our
/// stable JSON contract.
///
/// # Errors
/// `FetchError::Build` for unparsable URLs, 404 (unknown model),
/// 401 (gated repo), or any other non-200 status;
/// `FetchError::BodyDecode` when the API response isn't valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, name) = parse_owner_name(url).ok_or_else(|| {
        FetchError::Build(format!("hf model: cannot parse owner/name from '{url}'"))
    })?;
    let api_url = format!("https://huggingface.co/api/models/{owner}/{name}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "hf model: '{owner}/{name}' not found"
        )));
    }
    if resp.status == 401 {
        return Err(FetchError::Build(format!(
            "hf model: '{owner}/{name}' requires authentication (gated repo)"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "hf api returned status {}",
            resp.status
        )));
    }
    let m: ModelInfo = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("hf api parse: {e}")))?;
    // Surface a flat file list — full siblings can be hundreds of entries
    // for big repos. We keep it as-is because callers want to know about
    // every shard; if it bloats responses too much we'll add pagination.
    let files: Vec<Value> = m
        .siblings
        .iter()
        .map(|s| json!({"rfilename": s.rfilename, "size": s.size}))
        .collect();
    Ok(json!({
        "url": url,
        "id": m.id,
        "model_id": m.model_id,
        "private": m.private,
        "gated": m.gated,
        // Hub semantics: `downloads` is the rolling ~30-day counter,
        // `downloadsAllTime` the lifetime total. The lifetime number was
        // previously exposed under the key "downloads_30d", which
        // mislabeled it — fixed to "downloads_all_time".
        "downloads": m.downloads,
        "downloads_all_time": m.downloads_all_time,
        "likes": m.likes,
        "library_name": m.library_name,
        "pipeline_tag": m.pipeline_tag,
        "tags": m.tags,
        "license": m.card_data.as_ref().and_then(|c| c.license.clone()),
        "language": m.card_data.as_ref().and_then(|c| c.language.clone()),
        "datasets": m.card_data.as_ref().and_then(|c| c.datasets.clone()),
        "base_model": m.card_data.as_ref().and_then(|c| c.base_model.clone()),
        "model_type": m.card_data.as_ref().and_then(|c| c.model_type.clone()),
        "created_at": m.created_at,
        "last_modified": m.last_modified,
        "sha": m.sha,
        "file_count": m.siblings.len(),
        "files": files,
    }))
}
/// Return the host portion of `url`: strip the scheme if present, then
/// take everything up to the first '/'. Scheme-less inputs are treated
/// as starting at the host.
fn host_of(url: &str) -> &str {
    let authority = url.split("://").nth(1).unwrap_or(url);
    authority.split('/').next().unwrap_or("")
}
/// Pull `(owner, name)` — the first two non-empty path segments — out of
/// a huggingface.co URL, ignoring query, fragment, and trailing slash.
fn parse_owner_name(url: &str) -> Option<(String, String)> {
    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
    let clean = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|p| !p.is_empty());
    match (parts.next(), parts.next()) {
        (Some(owner), Some(name)) => Some((owner.to_string(), name.to_string())),
        _ => None,
    }
}
// ---------------------------------------------------------------------------
// HF API types
// ---------------------------------------------------------------------------
/// Subset of the HF Hub model API response we read. All fields optional
/// so upstream schema drift degrades to `null`s instead of parse failure.
#[derive(Deserialize)]
struct ModelInfo {
    id: Option<String>,
    #[serde(rename = "modelId")]
    model_id: Option<String>,
    private: Option<bool>,
    gated: Option<serde_json::Value>, // bool or string ("auto" / "manual" / false)
    downloads: Option<i64>,
    #[serde(rename = "downloadsAllTime")]
    downloads_all_time: Option<i64>,
    likes: Option<i64>,
    // rename matches the field name; redundant but harmless.
    #[serde(rename = "library_name")]
    library_name: Option<String>,
    #[serde(rename = "pipeline_tag")]
    pipeline_tag: Option<String>,
    #[serde(default)]
    tags: Vec<String>,
    #[serde(rename = "createdAt")]
    created_at: Option<String>,
    #[serde(rename = "lastModified")]
    last_modified: Option<String>,
    sha: Option<String>,
    #[serde(rename = "cardData")]
    card_data: Option<CardData>,
    // Repo file entries; defaults to empty when absent.
    #[serde(default)]
    siblings: Vec<Sibling>,
}
/// Parsed model-card front matter. Several fields are free-form in
/// practice, hence `serde_json::Value`.
#[derive(Deserialize)]
struct CardData {
    license: Option<serde_json::Value>, // string or array
    language: Option<serde_json::Value>,
    datasets: Option<serde_json::Value>,
    #[serde(rename = "base_model")]
    base_model: Option<serde_json::Value>,
    #[serde(rename = "model_type")]
    model_type: Option<String>,
}
/// One file in the repo; `size` is not always populated by the API.
#[derive(Deserialize)]
struct Sibling {
    rfilename: String,
    size: Option<i64>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_model_pages() {
        assert!(matches("https://huggingface.co/meta-llama/Meta-Llama-3-8B"));
        assert!(matches("https://huggingface.co/openai/whisper-large-v3"));
        // Known limitation: a root-level model's sub-page parses as
        // owner/name and is wrongly claimed.
        assert!(matches("https://huggingface.co/bert-base-uncased/main")); // owner=bert-base-uncased name=main: false positive but acceptable for v1
    }

    #[test]
    fn rejects_hf_section_pages() {
        assert!(!matches("https://huggingface.co/datasets/squad"));
        assert!(!matches("https://huggingface.co/spaces/foo/bar"));
        assert!(!matches("https://huggingface.co/blog/intro"));
        assert!(!matches("https://huggingface.co/"));
        assert!(!matches("https://huggingface.co/meta-llama"));
    }

    #[test]
    fn parse_owner_name_pulls_both() {
        assert_eq!(
            parse_owner_name("https://huggingface.co/meta-llama/Meta-Llama-3-8B"),
            Some(("meta-llama".into(), "Meta-Llama-3-8B".into()))
        );
        // Query strings are stripped before segmenting.
        assert_eq!(
            parse_owner_name("https://huggingface.co/openai/whisper-large-v3?library=transformers"),
            Some(("openai".into(), "whisper-large-v3".into()))
        );
    }
}

View file

@ -0,0 +1,199 @@
//! Vertical extractors: site-specific parsers that return typed JSON
//! instead of generic markdown.
//!
//! Each extractor handles a single site or platform and exposes:
//! - `matches(url)` to claim ownership of a URL pattern
//! - `extract(client, url)` to fetch + parse into a typed JSON `Value`
//! - `INFO` static for the catalog (`/v1/extractors`)
//!
//! The dispatch in this module is a simple `match`-style chain rather than
//! a trait registry. With ~30 extractors that's still fast and avoids the
//! ceremony of dynamic dispatch. If we hit 50+ we'll revisit.
//!
//! Extractors prefer official JSON APIs over HTML scraping where one
//! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have
//! one). HTML extraction is the fallback for sites that don't.
pub mod github_repo;
pub mod hackernews;
pub mod huggingface_model;
pub mod npm;
pub mod pypi;
pub mod reddit;
use serde::Serialize;
use serde_json::Value;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Public catalog entry for `/v1/extractors`. Stable shape — clients
/// rely on `name` to pick the right `/v1/scrape/{name}` route.
#[derive(Debug, Clone, Serialize)]
pub struct ExtractorInfo {
    /// URL-safe identifier (`reddit`, `hackernews`, `github_repo`, ...).
    /// Doubles as the `/v1/scrape/{name}` route segment.
    pub name: &'static str,
    /// Human-friendly display name.
    pub label: &'static str,
    /// One-line description of what the extractor returns.
    pub description: &'static str,
    /// Glob-ish URL pattern(s) the extractor claims. For documentation;
    /// the actual matching is done by the extractor's `matches` fn.
    pub url_patterns: &'static [&'static str],
}
/// Full catalog. Order is stable; new entries append. This is also the
/// priority order `dispatch_by_url` tries extractors in.
pub fn list() -> Vec<ExtractorInfo> {
    [
        reddit::INFO,
        hackernews::INFO,
        github_repo::INFO,
        pypi::INFO,
        npm::INFO,
        huggingface_model::INFO,
    ]
    .to_vec()
}
/// Auto-detect mode: try every extractor's `matches`, return the first
/// one that claims the URL. Used by `/v1/scrape` when the caller doesn't
/// pick a vertical explicitly. Returns `None` when no extractor claims
/// the URL.
pub async fn dispatch_by_url(
    client: &FetchClient,
    url: &str,
) -> Option<Result<(&'static str, Value), FetchError>> {
    // One arm per extractor; first claim wins. The macro keeps the
    // match/extract/name plumbing identical across all verticals.
    macro_rules! claim {
        ($m:ident) => {
            if $m::matches(url) {
                return Some($m::extract(client, url).await.map(|v| ($m::INFO.name, v)));
            }
        };
    }
    claim!(reddit);
    claim!(hackernews);
    claim!(github_repo);
    claim!(pypi);
    claim!(npm);
    claim!(huggingface_model);
    None
}
/// Explicit mode: caller picked the vertical (`POST /v1/scrape/reddit`).
/// We still validate that the URL plausibly belongs to that vertical so
/// users get a clear "wrong route" error instead of a confusing parse
/// failure deep in the extractor.
pub async fn dispatch_by_name(
    client: &FetchClient,
    name: &str,
    url: &str,
) -> Result<Value, ExtractorDispatchError> {
    if name == reddit::INFO.name {
        run_or_mismatch(reddit::matches(url), name, url, || reddit::extract(client, url)).await
    } else if name == hackernews::INFO.name {
        run_or_mismatch(hackernews::matches(url), name, url, || {
            hackernews::extract(client, url)
        })
        .await
    } else if name == github_repo::INFO.name {
        run_or_mismatch(github_repo::matches(url), name, url, || {
            github_repo::extract(client, url)
        })
        .await
    } else if name == pypi::INFO.name {
        run_or_mismatch(pypi::matches(url), name, url, || pypi::extract(client, url)).await
    } else if name == npm::INFO.name {
        run_or_mismatch(npm::matches(url), name, url, || npm::extract(client, url)).await
    } else if name == huggingface_model::INFO.name {
        run_or_mismatch(huggingface_model::matches(url), name, url, || {
            huggingface_model::extract(client, url)
        })
        .await
    } else {
        Err(ExtractorDispatchError::UnknownVertical(name.to_string()))
    }
}
/// Errors that the dispatcher itself raises (vs. errors from inside an
/// extractor, which come back wrapped in `Fetch`).
#[derive(Debug, thiserror::Error)]
pub enum ExtractorDispatchError {
    /// The `{vertical}` route segment doesn't name any known extractor.
    #[error("unknown vertical: '{0}'")]
    UnknownVertical(String),
    /// The caller picked a vertical explicitly, but the URL fails that
    /// extractor's `matches` check.
    #[error("URL '{url}' does not match the '{vertical}' extractor")]
    UrlMismatch { vertical: String, url: String },
    /// An error raised inside the extractor (network, status, parse).
    #[error(transparent)]
    Fetch(#[from] FetchError),
}
/// Helper: when the caller explicitly picked a vertical but their URL
/// doesn't match it, return `UrlMismatch` instead of running the
/// extractor (which would just fail with a less-clear error).
async fn run_or_mismatch<F, Fut>(
    matches: bool,
    vertical: &str,
    url: &str,
    f: F,
) -> Result<Value, ExtractorDispatchError>
where
    F: FnOnce() -> Fut,
    Fut: std::future::Future<Output = Result<Value, FetchError>>,
{
    if matches {
        f().await.map_err(ExtractorDispatchError::Fetch)
    } else {
        Err(ExtractorDispatchError::UrlMismatch {
            vertical: vertical.to_string(),
            url: url.to_string(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Guards against copy-paste mistakes when registering a new
    /// extractor: the catalog must be non-empty and every `name` unique,
    /// since `name` doubles as the `/v1/scrape/{name}` route key.
    #[test]
    fn list_is_non_empty_and_unique() {
        let entries = list();
        assert!(!entries.is_empty());
        let mut names: Vec<_> = entries.iter().map(|e| e.name).collect();
        names.sort();
        let before = names.len();
        names.dedup();
        assert_eq!(before, names.len(), "extractor names must be unique");
    }
}

View file

@ -0,0 +1,235 @@
//! npm package structured extractor.
//!
//! Uses two npm-run APIs:
//! - `registry.npmjs.org/{name}` for full package metadata
//! - `api.npmjs.org/downloads/point/last-week/{name}` for usage signal
//!
//! The registry API returns the *full* document including every version
//! ever published, which can be tens of MB for popular packages
//! (`@types/node` etc). We strip down to the latest version's manifest
//! and a count of releases — full history would explode the response.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry for `GET /v1/extractors`; `name` is also the
/// `/v1/scrape/{vertical}` route segment for this extractor.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "npm",
    label: "npm package",
    description: "Returns package metadata: latest version manifest, dependencies, weekly downloads, license.",
    url_patterns: &["https://www.npmjs.com/package/{name}"],
};
/// True when `url` is an npmjs.com package page (any path containing
/// `/package/` on www.npmjs.com or npmjs.com).
pub fn matches(url: &str) -> bool {
    // Host extraction inlined (same logic as `host_of`).
    let host = url
        .split("://")
        .nth(1)
        .unwrap_or(url)
        .split('/')
        .next()
        .unwrap_or("");
    (host == "www.npmjs.com" || host == "npmjs.com") && url.contains("/package/")
}
/// Fetch npm package metadata for `url`: one registry call for the full
/// package document plus a best-effort weekly-downloads lookup, shaped
/// into a flat JSON object (latest manifest only — see module docs).
///
/// # Errors
/// `FetchError::Build` for unparsable URLs, 404, or other non-200
/// registry statuses; `FetchError::BodyDecode` when the registry
/// response isn't valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let name = parse_name(url)
        .ok_or_else(|| FetchError::Build(format!("npm: cannot parse name from '{url}'")))?;
    let registry_url = format!("https://registry.npmjs.org/{}", urlencode_segment(&name));
    let resp = client.fetch(&registry_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "npm: package '{name}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "npm registry returned status {}",
            resp.status
        )));
    }
    let pkg: PackageDoc = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("npm registry parse: {e}")))?;
    // Resolve "latest" to a concrete version.
    // NOTE(review): the fallback takes the lexicographically-last key of
    // the versions BTreeMap, which mis-orders semver ("10.0.0" < "9.0.0").
    // It only runs when dist-tags are missing, which is rare; a
    // semver-aware sort would fix it properly.
    let latest_version = pkg
        .dist_tags
        .as_ref()
        .and_then(|t| t.get("latest"))
        .cloned()
        .or_else(|| pkg.versions.as_ref().and_then(|v| v.keys().last().cloned()));
    let latest_manifest = latest_version
        .as_deref()
        .and_then(|v| pkg.versions.as_ref().and_then(|m| m.get(v)));
    let release_count = pkg.versions.as_ref().map(|v| v.len()).unwrap_or(0);
    // Publish timestamp of the resolved latest version, when present.
    let latest_release_date = latest_version
        .as_deref()
        .and_then(|v| pkg.time.as_ref().and_then(|t| t.get(v).cloned()));
    // Best-effort weekly downloads. If the api.npmjs.org call fails we
    // surface `null` rather than failing the whole extractor — npm
    // sometimes 503s the downloads endpoint while the registry is up.
    let weekly_downloads = fetch_weekly_downloads(client, &name).await.ok();
    Ok(json!({
        "url": url,
        // Lazy fallback to the URL-derived name; the previous `unwrap_or`
        // cloned eagerly on every call even when the registry name existed.
        "name": pkg.name.clone().unwrap_or_else(|| name.clone()),
        "description": pkg.description,
        "latest_version": latest_version,
        "license": latest_manifest.and_then(|m| m.license.clone()),
        "homepage": pkg.homepage,
        "repository": pkg.repository.as_ref().and_then(|r| r.url.clone()),
        "dependencies": latest_manifest.and_then(|m| m.dependencies.clone()),
        "dev_dependencies": latest_manifest.and_then(|m| m.dev_dependencies.clone()),
        "peer_dependencies": latest_manifest.and_then(|m| m.peer_dependencies.clone()),
        "keywords": pkg.keywords,
        "maintainers": pkg.maintainers,
        "deprecated": latest_manifest.and_then(|m| m.deprecated.clone()),
        "release_count": release_count,
        "latest_release_date": latest_release_date,
        "weekly_downloads": weekly_downloads,
    }))
}
/// Best-effort weekly download count from api.npmjs.org. Callers treat
/// any error as "unknown" (`None` in the final JSON).
async fn fetch_weekly_downloads(client: &FetchClient, name: &str) -> Result<i64, FetchError> {
    let endpoint = format!(
        "https://api.npmjs.org/downloads/point/last-week/{}",
        urlencode_segment(name)
    );
    let resp = client.fetch(&endpoint).await?;
    match resp.status {
        200 => serde_json::from_str::<Downloads>(&resp.html)
            .map(|d| d.downloads)
            .map_err(|e| FetchError::BodyDecode(format!("npm downloads parse: {e}"))),
        other => Err(FetchError::Build(format!(
            "npm downloads api status {other}"
        ))),
    }
}
/// Return the host portion of `url`: strip the scheme if present, then
/// take everything up to the first '/'. Scheme-less inputs are treated
/// as starting at the host.
fn host_of(url: &str) -> &str {
    let authority = url.split("://").nth(1).unwrap_or(url);
    authority.split('/').next().unwrap_or("")
}
/// Extract the package name from an npmjs.com URL. Handles scoped
/// packages (`/package/@scope/name`) and trailing path segments
/// (`/v/x.y.z`); query strings and fragments are stripped first.
fn parse_name(url: &str) -> Option<String> {
    let after = url.split("/package/").nth(1)?;
    let clean = after.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|p| !p.is_empty());
    let head = parts.next()?;
    if !head.starts_with('@') {
        return Some(head.to_string());
    }
    // Scoped package: the name spans two path segments (`@scope/name`).
    parts.next().map(|tail| format!("{head}/{tail}"))
}
/// `@scope/name` must encode the `/` for the registry path. Plain names
/// (no '/') pass through untouched; '@' needs no escaping.
fn urlencode_segment(name: &str) -> String {
    name.split('/').collect::<Vec<_>>().join("%2F")
}
// ---------------------------------------------------------------------------
// Registry types
// ---------------------------------------------------------------------------
/// Subset of the full registry package document. The registry returns
/// every version ever published; `extract` only reads what it surfaces.
#[derive(Deserialize)]
struct PackageDoc {
    name: Option<String>,
    description: Option<String>,
    homepage: Option<serde_json::Value>, // sometimes string, sometimes object
    repository: Option<Repository>,
    keywords: Option<Vec<String>>,
    maintainers: Option<Vec<Maintainer>>,
    #[serde(rename = "dist-tags")]
    dist_tags: Option<std::collections::BTreeMap<String, String>>,
    versions: Option<std::collections::BTreeMap<String, VersionManifest>>,
    // Timestamps looked up by version string in extract().
    time: Option<std::collections::BTreeMap<String, String>>,
}
/// Per-version manifest — only the fields extract() reads.
#[derive(Deserialize, Default, Clone)]
struct VersionManifest {
    license: Option<serde_json::Value>, // string or object
    dependencies: Option<std::collections::BTreeMap<String, String>>,
    #[serde(rename = "devDependencies")]
    dev_dependencies: Option<std::collections::BTreeMap<String, String>>,
    #[serde(rename = "peerDependencies")]
    peer_dependencies: Option<std::collections::BTreeMap<String, String>>,
    // `deprecated` is sometimes a bool and sometimes a string in the
    // registry. serde_json::Value covers both without failing the parse.
    deprecated: Option<serde_json::Value>,
}
#[derive(Deserialize)]
struct Repository {
    url: Option<String>,
}
#[derive(Deserialize, Clone)]
struct Maintainer {
    name: Option<String>,
    email: Option<String>,
}
// Manual Serialize so Maintainer can be embedded in the json! output.
// NOTE(review): a derived Serialize would emit the same two-key map;
// kept as-is.
impl serde::Serialize for Maintainer {
    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
        use serde::ser::SerializeMap;
        let mut m = s.serialize_map(Some(2))?;
        m.serialize_entry("name", &self.name)?;
        m.serialize_entry("email", &self.email)?;
        m.end()
    }
}
/// Shape of the api.npmjs.org downloads/point response.
#[derive(Deserialize)]
struct Downloads {
    downloads: i64,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_npm_package_urls() {
        assert!(matches("https://www.npmjs.com/package/react"));
        assert!(matches("https://www.npmjs.com/package/@types/node"));
        // Bare npmjs.com host is accepted too.
        assert!(matches("https://npmjs.com/package/lodash"));
        assert!(!matches("https://www.npmjs.com/"));
        assert!(!matches("https://example.com/package/foo"));
    }

    #[test]
    fn parse_name_handles_scoped_and_unscoped() {
        assert_eq!(
            parse_name("https://www.npmjs.com/package/react"),
            Some("react".into())
        );
        // Scoped: the name spans two path segments.
        assert_eq!(
            parse_name("https://www.npmjs.com/package/@types/node"),
            Some("@types/node".into())
        );
        // Trailing /v/x.y.z segments are ignored.
        assert_eq!(
            parse_name("https://www.npmjs.com/package/lodash/v/4.17.21"),
            Some("lodash".into())
        );
    }

    #[test]
    fn urlencode_only_touches_scope_separator() {
        assert_eq!(urlencode_segment("react"), "react");
        assert_eq!(urlencode_segment("@types/node"), "@types%2Fnode");
    }
}

View file

@ -0,0 +1,184 @@
//! PyPI package structured extractor.
//!
//! PyPI exposes a stable JSON API at `pypi.org/pypi/{name}/json` and
//! a versioned form at `pypi.org/pypi/{name}/{version}/json`. Both
//! return the full release info plus history. No auth, no rate limits
//! that we hit at normal usage.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry surfaced by `GET /v1/extractors`.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "pypi",
    label: "PyPI package",
    description: "Returns package metadata: latest version, dependencies, license, release history.",
    url_patterns: &[
        "https://pypi.org/project/{name}/",
        "https://pypi.org/project/{name}/{version}/",
    ],
};
/// True when `url` points at a pypi.org project page
/// (`/project/{name}/…`).
pub fn matches(url: &str) -> bool {
    let is_pypi_host = matches!(host_of(url), "pypi.org" | "www.pypi.org");
    is_pypi_host && url.contains("/project/")
}
/// Fetch PyPI's JSON API for the package named in `url` (optionally
/// pinned to a version) and shape it into Webclaw's typed JSON.
///
/// # Errors
/// - `FetchError::Build` when the URL has no parseable package name,
///   the package is missing (404), or the API returns any other
///   non-200 status.
/// - `FetchError::BodyDecode` when the API body fails to parse.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (name, version) = parse_project(url).ok_or_else(|| {
        FetchError::Build(format!("pypi: cannot parse package name from '{url}'"))
    })?;
    // Versioned and unversioned endpoints return the same shape.
    let api_url = match &version {
        Some(v) => format!("https://pypi.org/pypi/{name}/{v}/json"),
        None => format!("https://pypi.org/pypi/{name}/json"),
    };
    let resp = client.fetch(&api_url).await?;
    // 404 gets a friendlier message than the generic status error.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "pypi: package '{name}' not found"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "pypi api returned status {}",
            resp.status
        )));
    }
    let pkg: PypiResponse = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("pypi parse: {e}")))?;
    let info = pkg.info;
    let release_count = pkg.releases.as_ref().map(|r| r.len()).unwrap_or(0);
    // Latest release date = max upload time across files in the latest version.
    // (Upload times are ISO-8601 strings, so a lexicographic max works.)
    let latest_release_date = pkg
        .releases
        .as_ref()
        .and_then(|map| info.version.as_deref().and_then(|v| map.get(v)))
        .and_then(|files| files.iter().filter_map(|f| f.upload_time.clone()).max());
    // Drop the long description from the JSON shape — it's frequently a 50KB
    // README and bloats responses. Callers who need it can hit /v1/scrape.
    Ok(json!({
        "url": url,
        "name": info.name,
        "version": info.version,
        "summary": info.summary,
        "homepage": info.home_page,
        "license": info.license,
        "license_classifier": pick_license_classifier(&info.classifiers),
        "author": info.author,
        "author_email": info.author_email,
        "maintainer": info.maintainer,
        "requires_python": info.requires_python,
        "requires_dist": info.requires_dist,
        "keywords": info.keywords,
        "classifiers": info.classifiers,
        "yanked": info.yanked,
        "yanked_reason": info.yanked_reason,
        "project_urls": info.project_urls,
        "release_count": release_count,
        "latest_release_date": latest_release_date,
    }))
}
/// PyPI encodes the license in trove classifiers such as
/// `License :: OSI Approved :: Apache Software License`. Pick the most
/// specific (longest) `License ::` entry as a fallback for when the
/// `license` field itself is empty/junk.
fn pick_license_classifier(classifiers: &Option<Vec<String>>) -> Option<String> {
    let mut best: Option<&String> = None;
    for classifier in classifiers.as_ref()? {
        if !classifier.starts_with("License ::") {
            continue;
        }
        // `>=` keeps the last of equal-length entries, matching
        // `Iterator::max_by_key` semantics.
        if best.is_none_or(|current| classifier.len() >= current.len()) {
            best = Some(classifier);
        }
    }
    best.cloned()
}
/// Extract the host portion of a URL without pulling in a URL parser.
/// Scheme-less input is treated as starting at the host.
fn host_of(url: &str) -> &str {
    // Drop the scheme if one is present; otherwise keep the raw input.
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    // The host is everything up to the first path separator.
    after_scheme.split('/').next().unwrap_or("")
}
/// Pull `(name, optional version)` out of a `/project/{name}/{version}/`
/// URL. Query string and fragment are ignored; returns `None` when no
/// package name follows the `/project/` marker.
fn parse_project(url: &str) -> Option<(String, Option<String>)> {
    let tail = url.split("/project/").nth(1)?;
    // Cut at `?`/`#`, then drop a trailing slash before segmenting.
    let path = tail
        .split(['?', '#'])
        .next()
        .unwrap_or(tail)
        .trim_end_matches('/');
    let mut parts = path.split('/').filter(|seg| !seg.is_empty());
    let name = parts.next()?.to_owned();
    let version = parts.next().map(str::to_string);
    Some((name, version))
}
// ---------------------------------------------------------------------------
// PyPI API types
// ---------------------------------------------------------------------------
/// Top-level shape of `pypi.org/pypi/{name}/json`.
#[derive(Deserialize)]
struct PypiResponse {
    info: Info,
    // Version string → uploaded files; optional because not every
    // response carries it.
    releases: Option<std::collections::BTreeMap<String, Vec<File>>>,
}
/// The `info` object: metadata for the resolved version. Every field is
/// optional — older packages frequently omit them.
#[derive(Deserialize)]
struct Info {
    name: Option<String>,
    version: Option<String>,
    summary: Option<String>,
    home_page: Option<String>,
    license: Option<String>,
    author: Option<String>,
    author_email: Option<String>,
    maintainer: Option<String>,
    requires_python: Option<String>,
    requires_dist: Option<Vec<String>>,
    // A single string on PyPI's side, not a list.
    keywords: Option<String>,
    classifiers: Option<Vec<String>>,
    yanked: Option<bool>,
    yanked_reason: Option<String>,
    project_urls: Option<std::collections::BTreeMap<String, String>>,
}
/// One uploaded distribution file; only the upload timestamp is used
/// (to compute the latest release date).
#[derive(Deserialize)]
struct File {
    upload_time: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_project_urls() {
        // Project pages, with and without a pinned version.
        assert!(matches("https://pypi.org/project/requests/"));
        assert!(matches("https://pypi.org/project/numpy/1.26.0/"));
        // Site root and foreign hosts must be rejected.
        assert!(!matches("https://pypi.org/"));
        assert!(!matches("https://example.com/project/foo"));
    }

    #[test]
    fn parse_project_pulls_name_and_version() {
        let cases = [
            ("https://pypi.org/project/requests/", ("requests", None)),
            ("https://pypi.org/project/numpy/1.26.0/", ("numpy", Some("1.26.0"))),
            // Query strings must not leak into the parsed name.
            ("https://pypi.org/project/scikit-learn/?foo=bar", ("scikit-learn", None)),
        ];
        for (url, (name, version)) in cases {
            assert_eq!(
                parse_project(url),
                Some((name.to_string(), version.map(str::to_string))),
                "url: {url}"
            );
        }
    }
}

View file

@ -0,0 +1,234 @@
//! Reddit structured extractor — returns the full post + comment tree
//! as typed JSON via Reddit's `.json` API.
//!
//! The same trick the markdown extractor in `crate::reddit` uses:
//! appending `.json` to any post URL returns the data the new SPA
//! frontend would load client-side. Zero antibot, zero JS rendering.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Catalog entry surfaced by `GET /v1/extractors`.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "reddit",
    label: "Reddit thread",
    description: "Returns post + nested comment tree with scores, authors, and timestamps.",
    url_patterns: &[
        "https://www.reddit.com/r/*/comments/*",
        "https://reddit.com/r/*/comments/*",
        "https://old.reddit.com/r/*/comments/*",
    ],
};
/// True for Reddit post (comment-thread) URLs on any first-party host.
pub fn matches(url: &str) -> bool {
    match host_of(url) {
        // All of Reddit's first-party hosts serve the same threads.
        "reddit.com" | "www.reddit.com" | "old.reddit.com" | "np.reddit.com"
        | "new.reddit.com" => url.contains("/comments/"),
        _ => false,
    }
}
/// Fetch the thread's `.json` form and shape it into
/// `{ url, post, comments }`. `post` is `null` when the first listing
/// doesn't carry the expected single `t3` child.
///
/// # Errors
/// - `FetchError::Build` on any non-200 status from Reddit.
/// - `FetchError::BodyDecode` when the body fails to parse or carries
///   no listings at all.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let json_url = build_json_url(url);
    let resp = client.fetch(&json_url).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "reddit api returned status {}",
            resp.status
        )));
    }
    let listings: Vec<Listing> = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("reddit json parse: {e}")))?;
    if listings.is_empty() {
        return Err(FetchError::BodyDecode("reddit response empty".into()));
    }
    // First listing = the post (single t3 child).
    let post = listings
        .first()
        .and_then(|l| l.data.children.first())
        .filter(|t| t.kind == "t3")
        .map(|t| post_json(&t.data))
        .unwrap_or(Value::Null);
    // Second listing = the comment tree.
    let comments: Vec<Value> = listings
        .get(1)
        .map(|l| l.data.children.iter().filter_map(comment_json).collect())
        .unwrap_or_default();
    Ok(json!({
        "url": url,
        "post": post,
        "comments": comments,
    }))
}
// ---------------------------------------------------------------------------
// JSON shapers
// ---------------------------------------------------------------------------
/// Shape a post (`t3`) thing into the documented response JSON.
/// The relative `permalink` is expanded to an absolute
/// `www.reddit.com` URL.
fn post_json(d: &ThingData) -> Value {
    json!({
        "id": d.id,
        "title": d.title,
        "author": d.author,
        "subreddit": d.subreddit_name_prefixed,
        "permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
        // For link posts this is the external target URL.
        "url": d.url_overridden_by_dest,
        "is_self": d.is_self,
        "selftext": d.selftext,
        "score": d.score,
        "upvote_ratio": d.upvote_ratio,
        "num_comments": d.num_comments,
        "created_utc": d.created_utc,
        "link_flair_text": d.link_flair_text,
        "over_18": d.over_18,
        "spoiler": d.spoiler,
        "stickied": d.stickied,
        "locked": d.locked,
    })
}
/// Render a single comment + its reply tree. Returns `None` for non-t1
/// kinds (the trailing `more` placeholder Reddit injects at depth limits).
fn comment_json(thing: &Thing) -> Option<Value> {
    if thing.kind != "t1" {
        return None;
    }
    let d = &thing.data;
    // Recurse into nested listings; anything else means "no replies".
    let replies: Vec<Value> = match &d.replies {
        Some(Replies::Listing(l)) => l.data.children.iter().filter_map(comment_json).collect(),
        _ => Vec::new(),
    };
    Some(json!({
        "id": d.id,
        "author": d.author,
        "body": d.body,
        "score": d.score,
        "created_utc": d.created_utc,
        "is_submitter": d.is_submitter,
        "stickied": d.stickied,
        "depth": d.depth,
        "permalink": d.permalink.as_ref().map(|p| format!("https://www.reddit.com{p}")),
        "replies": replies,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Extract the host portion of a URL without pulling in a URL parser.
/// Scheme-less input is treated as starting at the host.
fn host_of(url: &str) -> &str {
    // Everything after the scheme, or the raw input when there is none.
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    // Host ends at the first path separator.
    after_scheme.split('/').next().unwrap_or("")
}
/// Build the Reddit JSON URL. We keep the original host (`www.reddit.com`
/// or `old.reddit.com` as the caller gave us). Routing through
/// `old.reddit.com` unconditionally looks appealing but that host has
/// stricter UA-based blocking than `www.reddit.com`, while the main
/// host accepts our Chrome-fingerprinted client fine.
///
/// Both the query string and the fragment are stripped before
/// appending `.json` — a trailing `#fragment` would otherwise end up
/// embedded in the path and break the API call. (Same `['?', '#']`
/// handling as the pypi extractor's URL parsing.)
fn build_json_url(url: &str) -> String {
    let clean = url
        .split(['?', '#'])
        .next()
        .unwrap_or(url)
        .trim_end_matches('/');
    format!("{clean}.json?raw_json=1")
}
// ---------------------------------------------------------------------------
// Reddit JSON types — only fields we render. Everything else is dropped.
// ---------------------------------------------------------------------------
/// A Reddit `Listing` wrapper (`{"kind": "Listing", "data": {...}}`);
/// only the `data` payload is read.
#[derive(Deserialize)]
struct Listing {
    data: ListingData,
}
/// Payload of a listing: the array of things it contains.
#[derive(Deserialize)]
struct ListingData {
    children: Vec<Thing>,
}
/// A Reddit "thing": `kind` is a type tag (`t3` = post, `t1` = comment,
/// `more` = truncation placeholder), `data` its payload.
#[derive(Deserialize)]
struct Thing {
    kind: String,
    data: ThingData,
}
/// Union of post (`t3`) and comment (`t1`) fields. Every field is
/// optional so one struct can deserialize either kind; the shapers
/// pick the fields that apply.
#[derive(Deserialize, Default)]
struct ThingData {
    // post (t3)
    id: Option<String>,
    title: Option<String>,
    selftext: Option<String>,
    subreddit_name_prefixed: Option<String>,
    url_overridden_by_dest: Option<String>,
    is_self: Option<bool>,
    upvote_ratio: Option<f64>,
    num_comments: Option<i64>,
    over_18: Option<bool>,
    spoiler: Option<bool>,
    stickied: Option<bool>,
    locked: Option<bool>,
    link_flair_text: Option<String>,
    // comment (t1)
    author: Option<String>,
    body: Option<String>,
    score: Option<i64>,
    created_utc: Option<f64>,
    is_submitter: Option<bool>,
    depth: Option<i64>,
    permalink: Option<String>,
    // recursive
    replies: Option<Replies>,
}
/// A comment's `replies` field is either a nested listing or a
/// non-listing placeholder (apparently an empty string when there are
/// no replies — the payload is never read), hence the untagged enum.
#[derive(Deserialize)]
#[serde(untagged)]
enum Replies {
    Listing(Listing),
    #[allow(dead_code)]
    Empty(String),
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_reddit_post_urls() {
        // Thread URLs on any first-party host, with or without a slug.
        for url in [
            "https://www.reddit.com/r/rust/comments/abc123/some_title/",
            "https://reddit.com/r/rust/comments/abc123/some_title",
            "https://old.reddit.com/r/rust/comments/abc123/x/",
        ] {
            assert!(matches(url), "should match: {url}");
        }
    }

    #[test]
    fn rejects_non_post_reddit_urls() {
        // Subreddit fronts, user pages, and foreign hosts are all out.
        for url in [
            "https://www.reddit.com/r/rust",
            "https://www.reddit.com/user/foo",
            "https://example.com/r/rust/comments/x",
        ] {
            assert!(!matches(url), "should not match: {url}");
        }
    }

    #[test]
    fn json_url_appends_suffix_and_drops_query() {
        assert_eq!(
            build_json_url("https://www.reddit.com/r/rust/comments/abc/x/?utm=foo"),
            "https://www.reddit.com/r/rust/comments/abc/x.json?raw_json=1"
        );
    }
}

View file

@ -6,6 +6,7 @@ pub mod client;
pub mod crawler;
pub mod document;
pub mod error;
pub mod extractors;
pub mod linkedin;
pub mod proxy;
pub mod reddit;

View file

@ -79,10 +79,15 @@ async fn main() -> anyhow::Result<()> {
let v1 = Router::new()
.route("/scrape", post(routes::scrape::scrape))
.route(
"/scrape/{vertical}",
post(routes::structured::scrape_vertical),
)
.route("/crawl", post(routes::crawl::crawl))
.route("/map", post(routes::map::map))
.route("/batch", post(routes::batch::batch))
.route("/extract", post(routes::extract::extract))
.route("/extractors", get(routes::structured::list_extractors))
.route("/summarize", post(routes::summarize::summarize_route))
.route("/diff", post(routes::diff::diff_route))
.route("/brand", post(routes::brand::brand))

View file

@ -15,4 +15,5 @@ pub mod extract;
pub mod health;
pub mod map;
pub mod scrape;
pub mod structured;
pub mod summarize;

View file

@ -0,0 +1,55 @@
//! `POST /v1/scrape/{vertical}` and `GET /v1/extractors`.
//!
//! Vertical extractors return typed JSON instead of generic markdown.
//! See `webclaw_fetch::extractors` for the catalog and per-site logic.
use axum::{
Json,
extract::{Path, State},
};
use serde::Deserialize;
use serde_json::{Value, json};
use webclaw_fetch::extractors::{self, ExtractorDispatchError};
use crate::{error::ApiError, state::AppState};
/// Request body for `POST /v1/scrape/{vertical}`.
#[derive(Debug, Deserialize)]
pub struct ScrapeRequest {
    /// Target URL; must match the requested vertical's URL patterns.
    pub url: String,
}
/// Map dispatcher errors to ApiError so users get clean HTTP statuses
/// instead of opaque 500s.
impl From<ExtractorDispatchError> for ApiError {
    fn from(e: ExtractorDispatchError) -> Self {
        match e {
            // No extractor registered under that name → not found.
            ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
            // Vertical exists but the URL doesn't fit it → bad request,
            // carrying the dispatcher's explanatory message.
            ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
            // Upstream fetch/parse failures surface as fetch errors.
            ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
        }
    }
}
/// `GET /v1/extractors` — catalog of all available verticals.
///
/// Returns `{ "extractors": [...] }` built from each vertical's `INFO`
/// metadata. Infallible and stateless.
pub async fn list_extractors() -> Json<Value> {
    Json(json!({
        "extractors": extractors::list(),
    }))
}
/// `POST /v1/scrape/{vertical}` — explicit vertical, e.g. /v1/scrape/reddit.
///
/// # Errors
/// Bad request when `url` is empty, plus whatever the dispatcher maps
/// via `From<ExtractorDispatchError>` (unknown vertical, URL mismatch,
/// fetch failure).
pub async fn scrape_vertical(
    State(state): State<AppState>,
    Path(vertical): Path<String>,
    Json(req): Json<ScrapeRequest>,
) -> Result<Json<Value>, ApiError> {
    // Normalize once: validate and dispatch on the same trimmed value,
    // so " https://…" doesn't pass the empty-check yet fail URL
    // matching inside the dispatcher over stray whitespace.
    let url = req.url.trim();
    if url.is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
    let data = extractors::dispatch_by_name(state.fetch(), &vertical, url).await?;
    Ok(Json(json!({
        "vertical": vertical,
        // Echo the trimmed URL we actually dispatched on.
        "url": url,
        "data": data,
    })))
}