feat(extractors): wave 2 — 8 more verticals (14 total)

Adds 8 more vertical extractors using public JSON APIs. All hit
deterministic endpoints with no antibot risk. Live tests pass
against canonical URLs for each.

AI / ML ecosystem (3):
- crates_io           → crates.io/api/v1/crates/{name}
- huggingface_dataset → huggingface.co/api/datasets/{path} (handles both
                        legacy /datasets/{name} and canonical {owner}/{name})
- arxiv               → export.arxiv.org/api/query (Atom XML parsed by quick-xml)

Code / version control (2):
- github_pr      → api.github.com/repos/{owner}/{repo}/pulls/{number}
- github_release → api.github.com/repos/{owner}/{repo}/releases/tags/{tag}

Infrastructure (1):
- docker_hub → hub.docker.com/v2/repositories/{namespace}/{name}
              (official-image shorthand /_/nginx normalized to library/nginx)

Community / publishing (2):
- dev_to        → dev.to/api/articles/{username}/{slug}
- stackoverflow → api.stackexchange.com/2.3/questions/{id} + answers,
                  filter=withbody for rendered HTML, sort=votes for
                  consistent top-answers ordering

Live test results (real URLs):
- serde:                 942M downloads, 838B response
- 'Attention Is All You Need': abstract + authors, 1.8KB
- nginx official:        12.9B pulls, 21k stars, 17KB
- openai/gsm8k:          822k downloads, 1.7KB
- rust-lang/rust#138000: merged by RalfJung, +3/-2, 1KB
- webclaw v0.4.0:        2.4KB
- a real dev.to article: 2.2KB body, 3.1KB total
- python yield Q&A:      score 13133, 51 answers, 104KB

Catalog now exposes 14 extractors via GET /v1/extractors. Total
unit tests across the module: 34 passing. Clippy clean. Fmt clean.

Marketing positioning sharpens: 14 dedicated extractors, all
deterministic, all 1-credit-per-call. Firecrawl's /extract is
5 credits per call and you write the schema yourself.
This commit is contained in:
Valerio 2026-04-22 14:20:21 +02:00
parent 86182ef28a
commit b041f3cddd
9 changed files with 1710 additions and 0 deletions

View file

@ -0,0 +1,314 @@
//! ArXiv paper structured extractor.
//!
//! Uses the public ArXiv API at `export.arxiv.org/api/query?id_list={id}`
//! which returns Atom XML. We parse just enough to surface title, authors,
//! abstract, categories, and the canonical PDF link. No HTML scraping
//! required and no auth.
use quick_xml::Reader;
use quick_xml::events::Event;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "arxiv",
label: "ArXiv paper",
description: "Returns paper metadata: title, authors, abstract, categories, primary category, PDF URL.",
url_patterns: &[
"https://arxiv.org/abs/{id}",
"https://arxiv.org/abs/{id}v{n}",
"https://arxiv.org/pdf/{id}",
],
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
if host != "arxiv.org" && host != "www.arxiv.org" {
return false;
}
url.contains("/abs/") || url.contains("/pdf/")
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let id = parse_id(url)
.ok_or_else(|| FetchError::Build(format!("arxiv: cannot parse id from '{url}'")))?;
let api_url = format!("https://export.arxiv.org/api/query?id_list={id}");
let resp = client.fetch(&api_url).await?;
if resp.status != 200 {
return Err(FetchError::Build(format!(
"arxiv api returned status {}",
resp.status
)));
}
let entry = parse_atom_entry(&resp.html)
.ok_or_else(|| FetchError::BodyDecode("arxiv: no <entry> in response".into()))?;
if entry.title.is_none() && entry.summary.is_none() {
return Err(FetchError::BodyDecode(format!(
"arxiv: paper '{id}' returned empty entry (likely withdrawn or invalid id)"
)));
}
Ok(json!({
"url": url,
"id": id,
"arxiv_id": entry.id,
"title": entry.title,
"authors": entry.authors,
"abstract": entry.summary.map(|s| collapse_whitespace(&s)),
"published": entry.published,
"updated": entry.updated,
"primary_category": entry.primary_category,
"categories": entry.categories,
"doi": entry.doi,
"comment": entry.comment,
"pdf_url": entry.pdf_url,
"abs_url": entry.abs_url,
}))
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
fn host_of(url: &str) -> &str {
url.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("")
}
/// Parse an arxiv id from a URL. Strips the version suffix (`v2`, `v3`)
/// and the `.pdf` extension when present.
fn parse_id(url: &str) -> Option<String> {
let after = url
.split("/abs/")
.nth(1)
.or_else(|| url.split("/pdf/").nth(1))?;
let stripped = after
.split(['?', '#'])
.next()?
.trim_end_matches('/')
.trim_end_matches(".pdf");
// Strip optional version suffix, e.g. "2401.12345v2" → "2401.12345"
let no_version = match stripped.rfind('v') {
Some(i) if stripped[i + 1..].chars().all(|c| c.is_ascii_digit()) => &stripped[..i],
_ => stripped,
};
if no_version.is_empty() {
None
} else {
Some(no_version.to_string())
}
}
fn collapse_whitespace(s: &str) -> String {
s.split_whitespace().collect::<Vec<_>>().join(" ")
}
#[derive(Default)]
struct AtomEntry {
id: Option<String>,
title: Option<String>,
summary: Option<String>,
published: Option<String>,
updated: Option<String>,
primary_category: Option<String>,
categories: Vec<String>,
authors: Vec<String>,
doi: Option<String>,
comment: Option<String>,
pdf_url: Option<String>,
abs_url: Option<String>,
}
/// Parse the first `<entry>` block of an ArXiv Atom feed.
fn parse_atom_entry(xml: &str) -> Option<AtomEntry> {
let mut reader = Reader::from_str(xml);
let mut buf = Vec::new();
// States
let mut in_entry = false;
let mut current: Option<&'static str> = None;
let mut in_author = false;
let mut in_author_name = false;
let mut entry = AtomEntry::default();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => {
let local = e.local_name();
match local.as_ref() {
b"entry" => in_entry = true,
b"id" if in_entry && !in_author => current = Some("id"),
b"title" if in_entry => current = Some("title"),
b"summary" if in_entry => current = Some("summary"),
b"published" if in_entry => current = Some("published"),
b"updated" if in_entry => current = Some("updated"),
b"author" if in_entry => in_author = true,
b"name" if in_author => {
in_author_name = true;
current = Some("author_name");
}
b"category" if in_entry => {
// primary_category is namespaced (arxiv:primary_category)
// category is plain. quick-xml gives us local-name only,
// so we treat both as categories and take the first as
// primary.
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"term"
&& let Ok(v) = attr.unescape_value()
{
let term = v.to_string();
if entry.primary_category.is_none() {
entry.primary_category = Some(term.clone());
}
entry.categories.push(term);
}
}
}
b"link" if in_entry => {
let mut href = None;
let mut rel = None;
let mut typ = None;
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"href" => href = attr.unescape_value().ok().map(|s| s.to_string()),
b"rel" => rel = attr.unescape_value().ok().map(|s| s.to_string()),
b"type" => typ = attr.unescape_value().ok().map(|s| s.to_string()),
_ => {}
}
}
if let Some(h) = href {
if typ.as_deref() == Some("application/pdf") {
entry.pdf_url = Some(h.clone());
}
if rel.as_deref() == Some("alternate") {
entry.abs_url = Some(h);
}
}
}
_ => current = None,
}
}
Ok(Event::Empty(ref e)) => {
// Self-closing tags (<link href="..." />). Same handling as Start.
let local = e.local_name();
if (local.as_ref() == b"link" || local.as_ref() == b"category") && in_entry {
let mut href = None;
let mut rel = None;
let mut typ = None;
let mut term = None;
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"href" => href = attr.unescape_value().ok().map(|s| s.to_string()),
b"rel" => rel = attr.unescape_value().ok().map(|s| s.to_string()),
b"type" => typ = attr.unescape_value().ok().map(|s| s.to_string()),
b"term" => term = attr.unescape_value().ok().map(|s| s.to_string()),
_ => {}
}
}
if let Some(t) = term {
if entry.primary_category.is_none() {
entry.primary_category = Some(t.clone());
}
entry.categories.push(t);
}
if let Some(h) = href {
if typ.as_deref() == Some("application/pdf") {
entry.pdf_url = Some(h.clone());
}
if rel.as_deref() == Some("alternate") {
entry.abs_url = Some(h);
}
}
}
}
Ok(Event::Text(ref e)) => {
if let (Some(field), Ok(text)) = (current, e.unescape()) {
let text = text.to_string();
match field {
"id" => entry.id = Some(text.trim().to_string()),
"title" => entry.title = append_text(entry.title.take(), &text),
"summary" => entry.summary = append_text(entry.summary.take(), &text),
"published" => entry.published = Some(text.trim().to_string()),
"updated" => entry.updated = Some(text.trim().to_string()),
"author_name" => entry.authors.push(text.trim().to_string()),
_ => {}
}
}
}
Ok(Event::End(ref e)) => {
let local = e.local_name();
match local.as_ref() {
b"entry" => break,
b"author" => in_author = false,
b"name" => in_author_name = false,
_ => {}
}
if !in_author_name {
current = None;
}
}
Ok(Event::Eof) => break,
Err(_) => return None,
_ => {}
}
buf.clear();
}
if in_entry { Some(entry) } else { None }
}
/// Concatenate text fragments (long fields can be split across multiple
/// text events if they contain entities or CDATA).
fn append_text(prev: Option<String>, next: &str) -> Option<String> {
match prev {
Some(mut s) => {
s.push_str(next);
Some(s)
}
None => Some(next.to_string()),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_arxiv_urls() {
assert!(matches("https://arxiv.org/abs/2401.12345"));
assert!(matches("https://arxiv.org/abs/2401.12345v2"));
assert!(matches("https://arxiv.org/pdf/2401.12345.pdf"));
assert!(!matches("https://arxiv.org/"));
assert!(!matches("https://example.com/abs/foo"));
}
#[test]
fn parse_id_strips_version_and_extension() {
assert_eq!(
parse_id("https://arxiv.org/abs/2401.12345"),
Some("2401.12345".into())
);
assert_eq!(
parse_id("https://arxiv.org/abs/2401.12345v3"),
Some("2401.12345".into())
);
assert_eq!(
parse_id("https://arxiv.org/pdf/2401.12345v2.pdf"),
Some("2401.12345".into())
);
}
#[test]
fn collapse_whitespace_handles_newlines_and_tabs() {
assert_eq!(collapse_whitespace("a b\n\tc "), "a b c");
}
}

View file

@ -0,0 +1,168 @@
//! crates.io structured extractor.
//!
//! Uses the public JSON API at `crates.io/api/v1/crates/{name}`. No
//! auth, no rate limit at normal usage. The response includes both
//! the crate metadata and the full version list, which we summarize
//! down to a count + latest release info to keep the payload small.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "crates_io",
label: "crates.io package",
description: "Returns crate metadata: latest version, dependencies, downloads, license, repository.",
url_patterns: &[
"https://crates.io/crates/{name}",
"https://crates.io/crates/{name}/{version}",
],
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
if host != "crates.io" && host != "www.crates.io" {
return false;
}
url.contains("/crates/")
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let name = parse_name(url)
.ok_or_else(|| FetchError::Build(format!("crates.io: cannot parse name from '{url}'")))?;
let api_url = format!("https://crates.io/api/v1/crates/{name}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"crates.io: crate '{name}' not found"
)));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"crates.io api returned status {}",
resp.status
)));
}
let body: CratesResponse = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("crates.io parse: {e}")))?;
let c = body.crate_;
let latest_version = body
.versions
.iter()
.find(|v| !v.yanked.unwrap_or(false))
.or_else(|| body.versions.first());
Ok(json!({
"url": url,
"name": c.id,
"description": c.description,
"homepage": c.homepage,
"documentation": c.documentation,
"repository": c.repository,
"max_stable_version": c.max_stable_version,
"max_version": c.max_version,
"newest_version": c.newest_version,
"downloads": c.downloads,
"recent_downloads": c.recent_downloads,
"categories": c.categories,
"keywords": c.keywords,
"release_count": body.versions.len(),
"latest_release_date": latest_version.and_then(|v| v.created_at.clone()),
"latest_license": latest_version.and_then(|v| v.license.clone()),
"latest_rust_version": latest_version.and_then(|v| v.rust_version.clone()),
"latest_yanked": latest_version.and_then(|v| v.yanked),
"created_at": c.created_at,
"updated_at": c.updated_at,
}))
}
fn host_of(url: &str) -> &str {
url.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("")
}
fn parse_name(url: &str) -> Option<String> {
let after = url.split("/crates/").nth(1)?;
let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
let first = stripped.split('/').find(|s| !s.is_empty())?;
Some(first.to_string())
}
// ---------------------------------------------------------------------------
// crates.io API types
// ---------------------------------------------------------------------------
#[derive(Deserialize)]
struct CratesResponse {
#[serde(rename = "crate")]
crate_: CrateInfo,
#[serde(default)]
versions: Vec<VersionInfo>,
}
#[derive(Deserialize)]
struct CrateInfo {
id: Option<String>,
description: Option<String>,
homepage: Option<String>,
documentation: Option<String>,
repository: Option<String>,
max_stable_version: Option<String>,
max_version: Option<String>,
newest_version: Option<String>,
downloads: Option<i64>,
recent_downloads: Option<i64>,
#[serde(default)]
categories: Vec<String>,
#[serde(default)]
keywords: Vec<String>,
created_at: Option<String>,
updated_at: Option<String>,
}
#[derive(Deserialize)]
struct VersionInfo {
license: Option<String>,
rust_version: Option<String>,
yanked: Option<bool>,
created_at: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_crate_pages() {
assert!(matches("https://crates.io/crates/serde"));
assert!(matches("https://crates.io/crates/tokio/1.45.0"));
assert!(!matches("https://crates.io/"));
assert!(!matches("https://example.com/crates/foo"));
}
#[test]
fn parse_name_handles_versioned_urls() {
assert_eq!(
parse_name("https://crates.io/crates/serde"),
Some("serde".into())
);
assert_eq!(
parse_name("https://crates.io/crates/tokio/1.45.0"),
Some("tokio".into())
);
assert_eq!(
parse_name("https://crates.io/crates/scraper/?foo=bar"),
Some("scraper".into())
);
}
}

View file

@ -0,0 +1,188 @@
//! dev.to article structured extractor.
//!
//! `dev.to/api/articles/{username}/{slug}` returns the full article body,
//! tags, reaction count, comment count, and reading time. Anonymous
//! access works fine for published posts.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "dev_to",
label: "dev.to article",
description: "Returns article metadata + body: title, body markdown, tags, reactions, comments, reading time.",
url_patterns: &["https://dev.to/{username}/{slug}"],
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
if host != "dev.to" && host != "www.dev.to" {
return false;
}
let path = url
.split("://")
.nth(1)
.and_then(|s| s.split_once('/'))
.map(|(_, p)| p)
.unwrap_or("");
let stripped = path
.split(['?', '#'])
.next()
.unwrap_or("")
.trim_end_matches('/');
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
// Need exactly /{username}/{slug}, with username starting with non-reserved.
segs.len() == 2 && !RESERVED_FIRST_SEGS.contains(&segs[0])
}
const RESERVED_FIRST_SEGS: &[&str] = &[
"api",
"tags",
"search",
"settings",
"enter",
"signup",
"about",
"code-of-conduct",
"privacy",
"terms",
"contact",
"sponsorships",
"sponsors",
"shop",
"videos",
"listings",
"podcasts",
"p",
"t",
];
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let (username, slug) = parse_username_slug(url).ok_or_else(|| {
FetchError::Build(format!("dev_to: cannot parse username/slug from '{url}'"))
})?;
let api_url = format!("https://dev.to/api/articles/{username}/{slug}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"dev_to: article '{username}/{slug}' not found"
)));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"dev.to api returned status {}",
resp.status
)));
}
let a: Article = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("dev.to parse: {e}")))?;
Ok(json!({
"url": url,
"id": a.id,
"title": a.title,
"description": a.description,
"body_markdown": a.body_markdown,
"url_canonical": a.canonical_url,
"published_at": a.published_at,
"edited_at": a.edited_at,
"reading_time_min": a.reading_time_minutes,
"tags": a.tag_list,
"positive_reactions": a.positive_reactions_count,
"public_reactions": a.public_reactions_count,
"comments_count": a.comments_count,
"page_views_count": a.page_views_count,
"cover_image": a.cover_image,
"author": json!({
"username": a.user.as_ref().and_then(|u| u.username.clone()),
"name": a.user.as_ref().and_then(|u| u.name.clone()),
"twitter": a.user.as_ref().and_then(|u| u.twitter_username.clone()),
"github": a.user.as_ref().and_then(|u| u.github_username.clone()),
"website": a.user.as_ref().and_then(|u| u.website_url.clone()),
}),
}))
}
fn host_of(url: &str) -> &str {
url.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("")
}
fn parse_username_slug(url: &str) -> Option<(String, String)> {
let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
let mut segs = stripped.split('/').filter(|s| !s.is_empty());
let username = segs.next()?;
let slug = segs.next()?;
Some((username.to_string(), slug.to_string()))
}
// ---------------------------------------------------------------------------
// dev.to API types
// ---------------------------------------------------------------------------
#[derive(Deserialize)]
struct Article {
id: Option<i64>,
title: Option<String>,
description: Option<String>,
body_markdown: Option<String>,
canonical_url: Option<String>,
published_at: Option<String>,
edited_at: Option<String>,
reading_time_minutes: Option<i64>,
tag_list: Option<serde_json::Value>, // string OR array depending on endpoint
positive_reactions_count: Option<i64>,
public_reactions_count: Option<i64>,
comments_count: Option<i64>,
page_views_count: Option<i64>,
cover_image: Option<String>,
user: Option<UserRef>,
}
#[derive(Deserialize)]
struct UserRef {
username: Option<String>,
name: Option<String>,
twitter_username: Option<String>,
github_username: Option<String>,
website_url: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_article_urls() {
assert!(matches("https://dev.to/ben/welcome-thread"));
assert!(matches("https://dev.to/0xmassi/some-post-1abc"));
assert!(!matches("https://dev.to/"));
assert!(!matches("https://dev.to/api/articles/foo/bar"));
assert!(!matches("https://dev.to/tags/rust"));
assert!(!matches("https://dev.to/ben")); // user profile, not article
assert!(!matches("https://example.com/ben/post"));
}
#[test]
fn parse_pulls_username_and_slug() {
assert_eq!(
parse_username_slug("https://dev.to/ben/welcome-thread"),
Some(("ben".into(), "welcome-thread".into()))
);
assert_eq!(
parse_username_slug("https://dev.to/0xmassi/some-post-1abc/?foo=bar"),
Some(("0xmassi".into(), "some-post-1abc".into()))
);
}
}

View file

@ -0,0 +1,150 @@
//! Docker Hub repository structured extractor.
//!
//! Uses the v2 JSON API at `hub.docker.com/v2/repositories/{namespace}/{name}`.
//! Anonymous access is allowed for public images. The official-image
//! shorthand (e.g. `nginx`, `redis`) is normalized to `library/{name}`.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "docker_hub",
label: "Docker Hub repository",
description: "Returns image metadata: pull count, star count, last_updated, official flag, description.",
url_patterns: &[
"https://hub.docker.com/_/{name}",
"https://hub.docker.com/r/{namespace}/{name}",
],
};
pub fn matches(url: &str) -> bool {
let host = host_of(url);
if host != "hub.docker.com" {
return false;
}
url.contains("/_/") || url.contains("/r/")
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let (namespace, name) = parse_repo(url)
.ok_or_else(|| FetchError::Build(format!("docker_hub: cannot parse repo from '{url}'")))?;
let api_url = format!("https://hub.docker.com/v2/repositories/{namespace}/{name}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"docker_hub: repo '{namespace}/{name}' not found"
)));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"docker_hub api returned status {}",
resp.status
)));
}
let r: RepoResponse = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("docker_hub parse: {e}")))?;
Ok(json!({
"url": url,
"namespace": r.namespace,
"name": r.name,
"full_name": format!("{namespace}/{name}"),
"pull_count": r.pull_count,
"star_count": r.star_count,
"description": r.description,
"full_description": r.full_description,
"last_updated": r.last_updated,
"date_registered": r.date_registered,
"is_official": namespace == "library",
"is_private": r.is_private,
"status_description":r.status_description,
"categories": r.categories,
}))
}
fn host_of(url: &str) -> &str {
url.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("")
}
/// Parse `(namespace, name)` from a Docker Hub URL. The official-image
/// shorthand `/_/nginx` maps to `(library, nginx)`. Personal repos
/// `/r/foo/bar` map to `(foo, bar)`.
fn parse_repo(url: &str) -> Option<(String, String)> {
if let Some(after) = url.split("/_/").nth(1) {
let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
let name = stripped.split('/').next().filter(|s| !s.is_empty())?;
return Some(("library".into(), name.to_string()));
}
let after = url.split("/r/").nth(1)?;
let stripped = after.split(['?', '#']).next()?.trim_end_matches('/');
let mut segs = stripped.split('/').filter(|s| !s.is_empty());
let ns = segs.next()?;
let nm = segs.next()?;
Some((ns.to_string(), nm.to_string()))
}
#[derive(Deserialize)]
struct RepoResponse {
namespace: Option<String>,
name: Option<String>,
pull_count: Option<i64>,
star_count: Option<i64>,
description: Option<String>,
full_description: Option<String>,
last_updated: Option<String>,
date_registered: Option<String>,
is_private: Option<bool>,
status_description: Option<String>,
#[serde(default)]
categories: Vec<DockerCategory>,
}
#[derive(Deserialize, serde::Serialize)]
struct DockerCategory {
name: Option<String>,
slug: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_docker_urls() {
assert!(matches("https://hub.docker.com/_/nginx"));
assert!(matches("https://hub.docker.com/r/grafana/grafana"));
assert!(!matches("https://hub.docker.com/"));
assert!(!matches("https://example.com/_/nginx"));
}
#[test]
fn parse_repo_handles_official_and_personal() {
assert_eq!(
parse_repo("https://hub.docker.com/_/nginx"),
Some(("library".into(), "nginx".into()))
);
assert_eq!(
parse_repo("https://hub.docker.com/_/nginx/tags"),
Some(("library".into(), "nginx".into()))
);
assert_eq!(
parse_repo("https://hub.docker.com/r/grafana/grafana"),
Some(("grafana".into(), "grafana".into()))
);
assert_eq!(
parse_repo("https://hub.docker.com/r/grafana/grafana/?foo=bar"),
Some(("grafana".into(), "grafana".into()))
);
}
}

View file

@ -0,0 +1,189 @@
//! GitHub pull request structured extractor.
//!
//! Uses `api.github.com/repos/{owner}/{repo}/pulls/{number}`. Returns
//! the PR metadata + a counted summary of comments and review activity.
//! Full diff and per-comment bodies require additional calls — left for
//! a follow-up enhancement so the v1 stays one network round-trip.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "github_pr",
label: "GitHub pull request",
description: "Returns PR metadata: title, body, state, author, labels, additions/deletions, file count.",
url_patterns: &["https://github.com/{owner}/{repo}/pull/{number}"],
};
pub fn matches(url: &str) -> bool {
let host = url
.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("");
if host != "github.com" && host != "www.github.com" {
return false;
}
parse_pr(url).is_some()
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let (owner, repo, number) = parse_pr(url).ok_or_else(|| {
FetchError::Build(format!("github_pr: cannot parse pull-request URL '{url}'"))
})?;
let api_url = format!("https://api.github.com/repos/{owner}/{repo}/pulls/{number}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"github_pr: pull request '{owner}/{repo}#{number}' not found"
)));
}
if resp.status == 403 {
return Err(FetchError::Build(
"github_pr: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"github api returned status {}",
resp.status
)));
}
let p: PullRequest = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("github pr parse: {e}")))?;
Ok(json!({
"url": url,
"owner": owner,
"repo": repo,
"number": p.number,
"title": p.title,
"body": p.body,
"state": p.state,
"draft": p.draft,
"merged": p.merged,
"merged_at": p.merged_at,
"merge_commit_sha": p.merge_commit_sha,
"author": p.user.as_ref().and_then(|u| u.login.clone()),
"labels": p.labels.iter().filter_map(|l| l.name.clone()).collect::<Vec<_>>(),
"milestone": p.milestone.as_ref().and_then(|m| m.title.clone()),
"head_ref": p.head.as_ref().and_then(|r| r.ref_name.clone()),
"base_ref": p.base.as_ref().and_then(|r| r.ref_name.clone()),
"head_sha": p.head.as_ref().and_then(|r| r.sha.clone()),
"additions": p.additions,
"deletions": p.deletions,
"changed_files": p.changed_files,
"commits": p.commits,
"comments": p.comments,
"review_comments":p.review_comments,
"created_at": p.created_at,
"updated_at": p.updated_at,
"closed_at": p.closed_at,
"html_url": p.html_url,
}))
}
fn parse_pr(url: &str) -> Option<(String, String, u64)> {
let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
// /{owner}/{repo}/pull/{number} (or /pulls/{number} variant)
if segs.len() < 4 {
return None;
}
if segs[2] != "pull" && segs[2] != "pulls" {
return None;
}
let number: u64 = segs[3].parse().ok()?;
Some((segs[0].to_string(), segs[1].to_string(), number))
}
// ---------------------------------------------------------------------------
// GitHub PR API types
// ---------------------------------------------------------------------------
#[derive(Deserialize)]
struct PullRequest {
number: Option<i64>,
title: Option<String>,
body: Option<String>,
state: Option<String>,
draft: Option<bool>,
merged: Option<bool>,
merged_at: Option<String>,
merge_commit_sha: Option<String>,
user: Option<UserRef>,
#[serde(default)]
labels: Vec<LabelRef>,
milestone: Option<Milestone>,
head: Option<GitRef>,
base: Option<GitRef>,
additions: Option<i64>,
deletions: Option<i64>,
changed_files: Option<i64>,
commits: Option<i64>,
comments: Option<i64>,
review_comments: Option<i64>,
created_at: Option<String>,
updated_at: Option<String>,
closed_at: Option<String>,
html_url: Option<String>,
}
#[derive(Deserialize)]
struct UserRef {
login: Option<String>,
}
#[derive(Deserialize)]
struct LabelRef {
name: Option<String>,
}
#[derive(Deserialize)]
struct Milestone {
title: Option<String>,
}
#[derive(Deserialize)]
struct GitRef {
#[serde(rename = "ref")]
ref_name: Option<String>,
sha: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matches_pr_urls() {
assert!(matches("https://github.com/rust-lang/rust/pull/12345"));
assert!(matches(
"https://github.com/rust-lang/rust/pull/12345/files"
));
assert!(!matches("https://github.com/rust-lang/rust"));
assert!(!matches("https://github.com/rust-lang/rust/issues/100"));
assert!(!matches("https://github.com/rust-lang"));
}
#[test]
fn parse_pr_extracts_owner_repo_number() {
assert_eq!(
parse_pr("https://github.com/rust-lang/rust/pull/12345"),
Some(("rust-lang".into(), "rust".into(), 12345))
);
assert_eq!(
parse_pr("https://github.com/rust-lang/rust/pull/12345/files"),
Some(("rust-lang".into(), "rust".into(), 12345))
);
}
}

View file

@ -0,0 +1,179 @@
//! GitHub release structured extractor.
//!
//! `api.github.com/repos/{owner}/{repo}/releases/tags/{tag}`. Returns
//! the release notes body, asset list with download counts, and
//! prerelease flag.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "github_release",
label: "GitHub release",
description: "Returns release metadata: tag, name, body (release notes), assets with download counts.",
url_patterns: &["https://github.com/{owner}/{repo}/releases/tag/{tag}"],
};
pub fn matches(url: &str) -> bool {
let host = url
.split("://")
.nth(1)
.unwrap_or(url)
.split('/')
.next()
.unwrap_or("");
if host != "github.com" && host != "www.github.com" {
return false;
}
parse_release(url).is_some()
}
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
let (owner, repo, tag) = parse_release(url).ok_or_else(|| {
FetchError::Build(format!("github_release: cannot parse release URL '{url}'"))
})?;
let api_url = format!("https://api.github.com/repos/{owner}/{repo}/releases/tags/{tag}");
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
"github_release: release '{owner}/{repo}@{tag}' not found"
)));
}
if resp.status == 403 {
return Err(FetchError::Build(
"github_release: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour."
.into(),
));
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"github api returned status {}",
resp.status
)));
}
let r: Release = serde_json::from_str(&resp.html)
.map_err(|e| FetchError::BodyDecode(format!("github release parse: {e}")))?;
let assets: Vec<Value> = r
.assets
.iter()
.map(|a| {
json!({
"name": a.name,
"size": a.size,
"download_count": a.download_count,
"browser_download_url": a.browser_download_url,
"content_type": a.content_type,
"created_at": a.created_at,
"updated_at": a.updated_at,
})
})
.collect();
Ok(json!({
"url": url,
"owner": owner,
"repo": repo,
"tag_name": r.tag_name,
"name": r.name,
"body": r.body,
"draft": r.draft,
"prerelease": r.prerelease,
"author": r.author.as_ref().and_then(|u| u.login.clone()),
"created_at": r.created_at,
"published_at": r.published_at,
"asset_count": assets.len(),
"total_downloads": r.assets.iter().map(|a| a.download_count.unwrap_or(0)).sum::<i64>(),
"assets": assets,
"html_url": r.html_url,
}))
}
/// Parse `(owner, repo, tag)` out of a `/{owner}/{repo}/releases/tag/{tag}`
/// URL. Query strings, fragments and trailing slashes are ignored.
///
/// Git tags may themselves contain `/` (e.g. `release/1.0`) and GitHub
/// serves such releases at the raw multi-segment path, so every segment
/// after `tag/` is joined back together instead of truncating to the
/// first one.
fn parse_release(url: &str) -> Option<(String, String, String)> {
    let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
    let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
    let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect();
    // /{owner}/{repo}/releases/tag/{tag…}
    if segs.len() < 5 || segs[2] != "releases" || segs[3] != "tag" {
        return None;
    }
    Some((
        segs[0].to_string(),
        segs[1].to_string(),
        segs[4..].join("/"),
    ))
}
// ---------------------------------------------------------------------------
// GitHub Release API types
// ---------------------------------------------------------------------------
/// Subset of the GitHub "get release by tag" response that we surface.
/// Every field is `Option` (or defaulted) so a missing/null key never
/// fails deserialization.
#[derive(Deserialize)]
struct Release {
    tag_name: Option<String>,
    name: Option<String>,
    // Release notes body; may be null.
    body: Option<String>,
    draft: Option<bool>,
    prerelease: Option<bool>,
    // Only `author.login` is consumed downstream.
    author: Option<UserRef>,
    created_at: Option<String>,
    published_at: Option<String>,
    html_url: Option<String>,
    // Missing `assets` key deserializes as an empty list.
    #[serde(default)]
    assets: Vec<Asset>,
}
/// Minimal user reference — only the login name is kept.
#[derive(Deserialize)]
struct UserRef {
    login: Option<String>,
}
/// One downloadable artifact attached to a release.
#[derive(Deserialize)]
struct Asset {
    name: Option<String>,
    // NOTE(review): presumably size in bytes as reported by the API — confirm.
    size: Option<i64>,
    download_count: Option<i64>,
    browser_download_url: Option<String>,
    content_type: Option<String>,
    created_at: Option<String>,
    updated_at: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;
    // URL gate: only `/releases/tag/{tag}` pages qualify; repo roots,
    // the releases index and PR pages must all be rejected.
    #[test]
    fn matches_release_urls() {
        assert!(matches(
            "https://github.com/rust-lang/rust/releases/tag/1.85.0"
        ));
        assert!(matches(
            "https://github.com/0xMassi/webclaw/releases/tag/v0.4.0"
        ));
        assert!(!matches("https://github.com/rust-lang/rust"));
        assert!(!matches("https://github.com/rust-lang/rust/releases"));
        assert!(!matches("https://github.com/rust-lang/rust/pull/100"));
    }
    // Owner/repo/tag extraction, including trailing-slash + query noise.
    #[test]
    fn parse_release_extracts_owner_repo_tag() {
        assert_eq!(
            parse_release("https://github.com/0xMassi/webclaw/releases/tag/v0.4.0"),
            Some(("0xMassi".into(), "webclaw".into(), "v0.4.0".into()))
        );
        assert_eq!(
            parse_release("https://github.com/rust-lang/rust/releases/tag/1.85.0/?foo=bar"),
            Some(("rust-lang".into(), "rust".into(), "1.85.0".into()))
        );
    }
}

View file

@ -0,0 +1,189 @@
//! HuggingFace dataset structured extractor.
//!
//! Same shape as the model extractor but hits the dataset endpoint.
//! `huggingface.co/api/datasets/{owner}/{name}`.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Static catalog metadata for this extractor (surfaced by the
/// extractors listing).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "huggingface_dataset",
    label: "HuggingFace dataset",
    description: "Returns dataset metadata: downloads, likes, license, language, task categories, file list.",
    url_patterns: &["https://huggingface.co/datasets/{owner}/{name}"],
};
/// A URL matches when its host is huggingface.co and the path is
/// `/datasets/{name}` (legacy top-level) or `/datasets/{owner}/{name}`
/// (canonical). Query strings, fragments and trailing slashes are ignored.
pub fn matches(url: &str) -> bool {
    if !matches!(host_of(url), "huggingface.co" | "www.huggingface.co") {
        return false;
    }
    // Path after the host, stripped of query/fragment and trailing slashes.
    let path = url
        .split("://")
        .nth(1)
        .and_then(|rest| rest.split_once('/'))
        .map_or("", |(_, p)| p);
    let clean = path
        .split(['?', '#'])
        .next()
        .unwrap_or("")
        .trim_end_matches('/');
    let mut segs = clean.split('/').filter(|s| !s.is_empty());
    if segs.next() != Some("datasets") {
        return false;
    }
    // Exactly one or two segments may follow `datasets`.
    matches!(segs.count(), 1 | 2)
}
/// Fetch dataset metadata from the HuggingFace Hub API and flatten it
/// into a JSON object.
///
/// # Errors
/// `FetchError::Build` for unparseable URLs, 404 (unknown dataset), 401
/// (gated dataset) and any other non-200 status; `FetchError::BodyDecode`
/// when the API body is not valid JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let dataset_path = parse_dataset_path(url).ok_or_else(|| {
        FetchError::Build(format!(
            "hf_dataset: cannot parse dataset path from '{url}'"
        ))
    })?;
    let api_url = format!("https://huggingface.co/api/datasets/{dataset_path}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "hf_dataset: '{dataset_path}' not found"
        )));
    }
    if resp.status == 401 {
        return Err(FetchError::Build(format!(
            "hf_dataset: '{dataset_path}' requires authentication (gated)"
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "hf_dataset api returned status {}",
            resp.status
        )));
    }
    let d: DatasetInfo = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("hf_dataset parse: {e}")))?;
    let files: Vec<Value> = d
        .siblings
        .iter()
        .map(|s| json!({"rfilename": s.rfilename, "size": s.size}))
        .collect();
    Ok(json!({
        "url": url,
        "id": d.id,
        "private": d.private,
        "gated": d.gated,
        // The Hub's `downloads` field is the trailing-30-day count and
        // `downloadsAllTime` is the lifetime total. The previous mapping
        // put the all-time figure under `downloads_30d`, which was
        // backwards; `downloads_30d` now carries the 30-day window and
        // the lifetime total is exposed under its own key.
        // NOTE(review): `downloadsAllTime` may only be populated when the
        // API is queried with `expand[]=downloadsAllTime` — confirm it is
        // present on plain /api/datasets/{id} responses.
        "downloads": d.downloads,
        "downloads_30d": d.downloads,
        "downloads_all_time": d.downloads_all_time,
        "likes": d.likes,
        "tags": d.tags,
        "license": d.card_data.as_ref().and_then(|c| c.license.clone()),
        "language": d.card_data.as_ref().and_then(|c| c.language.clone()),
        "task_categories": d.card_data.as_ref().and_then(|c| c.task_categories.clone()),
        "size_categories": d.card_data.as_ref().and_then(|c| c.size_categories.clone()),
        "annotations_creators": d.card_data.as_ref().and_then(|c| c.annotations_creators.clone()),
        "configs": d.card_data.as_ref().and_then(|c| c.configs.clone()),
        "created_at": d.created_at,
        "last_modified": d.last_modified,
        "sha": d.sha,
        "file_count": d.siblings.len(),
        "files": files,
    }))
}
/// Host portion of `url`: everything between the scheme separator (if
/// any) and the first `/`. Falls back to treating the whole string as
/// scheme-less when no `://` is present.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    match after_scheme.find('/') {
        Some(idx) => &after_scheme[..idx],
        None => after_scheme,
    }
}
/// Returns the part to append to the API URL — either `name` (legacy
/// top-level dataset like `squad`) or `owner/name` (canonical form).
///
/// Rejects deeper paths (e.g. `/datasets/{owner}/{name}/tree/main`):
/// those are dataset *viewer* pages, not the dataset root, and silently
/// ignoring the extra segments would let a viewer URL slip through to
/// the API. This keeps the parser in lockstep with `matches`, which
/// only accepts two or three path segments.
fn parse_dataset_path(url: &str) -> Option<String> {
    let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?;
    let stripped = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segs = stripped.split('/').filter(|s| !s.is_empty());
    if segs.next() != Some("datasets") {
        return None;
    }
    let first = segs.next()?.to_string();
    let dataset_path = match segs.next() {
        Some(second) => format!("{first}/{second}"),
        None => first,
    };
    // Anything left over means this was a sub-page, not the dataset root.
    if segs.next().is_some() {
        return None;
    }
    Some(dataset_path)
}
/// Subset of the Hub `/api/datasets/{id}` response that we surface.
/// Fields are `Option` (or defaulted) so partial responses still
/// deserialize.
#[derive(Deserialize)]
struct DatasetInfo {
    id: Option<String>,
    private: Option<bool>,
    // Kept as raw `Value` — the API is not consistent about the type
    // here (NOTE(review): looks like bool-or-string; confirm).
    gated: Option<serde_json::Value>,
    downloads: Option<i64>,
    #[serde(rename = "downloadsAllTime")]
    downloads_all_time: Option<i64>,
    likes: Option<i64>,
    #[serde(default)]
    tags: Vec<String>,
    #[serde(rename = "createdAt")]
    created_at: Option<String>,
    #[serde(rename = "lastModified")]
    last_modified: Option<String>,
    sha: Option<String>,
    #[serde(rename = "cardData")]
    card_data: Option<DatasetCard>,
    // Repository file listing; a missing key becomes an empty vec.
    #[serde(default)]
    siblings: Vec<Sibling>,
}
/// Dataset card (README front-matter) fields. Kept as raw `Value`s
/// because cards use either a scalar or a list for each of these.
#[derive(Deserialize)]
struct DatasetCard {
    license: Option<serde_json::Value>,
    language: Option<serde_json::Value>,
    task_categories: Option<serde_json::Value>,
    size_categories: Option<serde_json::Value>,
    annotations_creators: Option<serde_json::Value>,
    configs: Option<serde_json::Value>,
}
/// One file in the dataset repo; `rfilename` is the repo-relative path.
#[derive(Deserialize)]
struct Sibling {
    rfilename: String,
    size: Option<i64>,
}
#[cfg(test)]
mod tests {
    use super::*;
    // URL gate: both dataset URL shapes match; model pages and the bare
    // /datasets/ index must be rejected.
    #[test]
    fn matches_dataset_pages() {
        assert!(matches("https://huggingface.co/datasets/squad")); // legacy top-level
        assert!(matches("https://huggingface.co/datasets/openai/gsm8k")); // canonical owner/name
        assert!(!matches("https://huggingface.co/openai/whisper-large-v3"));
        assert!(!matches("https://huggingface.co/datasets/"));
    }
    // API path extraction for both shapes, including query-string noise.
    #[test]
    fn parse_dataset_path_works() {
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/squad"),
            Some("squad".into())
        );
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/openai/gsm8k"),
            Some("openai/gsm8k".into())
        );
        assert_eq!(
            parse_dataset_path("https://huggingface.co/datasets/openai/gsm8k/?lib=transformers"),
            Some("openai/gsm8k".into())
        );
    }
}

View file

@ -14,12 +14,20 @@
//! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have
//! one). HTML extraction is the fallback for sites that don't.
pub mod arxiv;
pub mod crates_io;
pub mod dev_to;
pub mod docker_hub;
pub mod github_pr;
pub mod github_release;
pub mod github_repo;
pub mod hackernews;
pub mod huggingface_dataset;
pub mod huggingface_model;
pub mod npm;
pub mod pypi;
pub mod reddit;
pub mod stackoverflow;
use serde::Serialize;
use serde_json::Value;
@ -48,9 +56,17 @@ pub fn list() -> Vec<ExtractorInfo> {
reddit::INFO,
hackernews::INFO,
github_repo::INFO,
github_pr::INFO,
github_release::INFO,
pypi::INFO,
npm::INFO,
crates_io::INFO,
huggingface_model::INFO,
huggingface_dataset::INFO,
arxiv::INFO,
docker_hub::INFO,
dev_to::INFO,
stackoverflow::INFO,
]
}
@ -92,6 +108,27 @@ pub async fn dispatch_by_url(
if npm::matches(url) {
return Some(npm::extract(client, url).await.map(|v| (npm::INFO.name, v)));
}
if github_pr::matches(url) {
return Some(
github_pr::extract(client, url)
.await
.map(|v| (github_pr::INFO.name, v)),
);
}
if github_release::matches(url) {
return Some(
github_release::extract(client, url)
.await
.map(|v| (github_release::INFO.name, v)),
);
}
if crates_io::matches(url) {
return Some(
crates_io::extract(client, url)
.await
.map(|v| (crates_io::INFO.name, v)),
);
}
if huggingface_model::matches(url) {
return Some(
huggingface_model::extract(client, url)
@ -99,6 +136,41 @@ pub async fn dispatch_by_url(
.map(|v| (huggingface_model::INFO.name, v)),
);
}
if huggingface_dataset::matches(url) {
return Some(
huggingface_dataset::extract(client, url)
.await
.map(|v| (huggingface_dataset::INFO.name, v)),
);
}
if arxiv::matches(url) {
return Some(
arxiv::extract(client, url)
.await
.map(|v| (arxiv::INFO.name, v)),
);
}
if docker_hub::matches(url) {
return Some(
docker_hub::extract(client, url)
.await
.map(|v| (docker_hub::INFO.name, v)),
);
}
if dev_to::matches(url) {
return Some(
dev_to::extract(client, url)
.await
.map(|v| (dev_to::INFO.name, v)),
);
}
if stackoverflow::matches(url) {
return Some(
stackoverflow::extract(client, url)
.await
.map(|v| (stackoverflow::INFO.name, v)),
);
}
None
}
@ -136,12 +208,57 @@ pub async fn dispatch_by_name(
n if n == npm::INFO.name => {
run_or_mismatch(npm::matches(url), n, url, || npm::extract(client, url)).await
}
n if n == github_pr::INFO.name => {
run_or_mismatch(github_pr::matches(url), n, url, || {
github_pr::extract(client, url)
})
.await
}
n if n == github_release::INFO.name => {
run_or_mismatch(github_release::matches(url), n, url, || {
github_release::extract(client, url)
})
.await
}
n if n == crates_io::INFO.name => {
run_or_mismatch(crates_io::matches(url), n, url, || {
crates_io::extract(client, url)
})
.await
}
n if n == huggingface_model::INFO.name => {
run_or_mismatch(huggingface_model::matches(url), n, url, || {
huggingface_model::extract(client, url)
})
.await
}
n if n == huggingface_dataset::INFO.name => {
run_or_mismatch(huggingface_dataset::matches(url), n, url, || {
huggingface_dataset::extract(client, url)
})
.await
}
n if n == arxiv::INFO.name => {
run_or_mismatch(arxiv::matches(url), n, url, || arxiv::extract(client, url)).await
}
n if n == docker_hub::INFO.name => {
run_or_mismatch(docker_hub::matches(url), n, url, || {
docker_hub::extract(client, url)
})
.await
}
n if n == dev_to::INFO.name => {
run_or_mismatch(dev_to::matches(url), n, url, || {
dev_to::extract(client, url)
})
.await
}
n if n == stackoverflow::INFO.name => {
run_or_mismatch(stackoverflow::matches(url), n, url, || {
stackoverflow::extract(client, url)
})
.await
}
_ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
}
}

View file

@ -0,0 +1,216 @@
//! Stack Overflow Q&A structured extractor.
//!
//! Uses the Stack Exchange API at `api.stackexchange.com/2.3/questions/{id}`
//! with `site=stackoverflow`. Two calls: one for the question, one for
//! its answers. Both come pre-filtered to include the rendered HTML body
//! so we don't re-parse the question page itself.
//!
//! Anonymous access caps at 300 requests per IP per day. Production
//! cloud should set `STACKAPPS_KEY` to lift to 10,000/day, but we don't
//! require it to work out of the box.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Static catalog metadata for this extractor (surfaced by the
/// extractors listing).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "stackoverflow",
    label: "Stack Overflow Q&A",
    description: "Returns question + answers: title, body, tags, votes, accepted answer, top answers.",
    url_patterns: &["https://stackoverflow.com/questions/{id}/{slug}"],
};
/// A URL matches when its host is stackoverflow.com and the path carries
/// a parseable numeric question id (`/questions/{id}/...`).
pub fn matches(url: &str) -> bool {
    matches!(host_of(url), "stackoverflow.com" | "www.stackoverflow.com")
        && parse_question_id(url).is_some()
}
/// Fetch a Stack Overflow question plus its answers and flatten them
/// into a single JSON object.
///
/// Makes two Stack Exchange API calls: one for the question, one for its
/// answers (sorted by votes, descending). The `withbody` filter makes
/// both responses include the rendered HTML body.
///
/// # Errors
/// `FetchError::Build` for unparseable URLs, missing questions and
/// non-200 question responses; `FetchError::BodyDecode` for malformed
/// JSON. A non-200 *answers* response is tolerated: the question is
/// still returned with an empty answer list.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let id = parse_question_id(url).ok_or_else(|| {
        FetchError::Build(format!(
            "stackoverflow: cannot parse question id from '{url}'"
        ))
    })?;
    // Filter `withbody` includes the rendered HTML body for both questions
    // and answers. Stack Exchange's filter system is documented at
    // api.stackexchange.com/docs/filters.
    let q_url = format!(
        "https://api.stackexchange.com/2.3/questions/{id}?site=stackoverflow&filter=withbody"
    );
    let q_resp = client.fetch(&q_url).await?;
    if q_resp.status != 200 {
        return Err(FetchError::Build(format!(
            "stackexchange api returned status {}",
            q_resp.status
        )));
    }
    let q_body: QResponse = serde_json::from_str(&q_resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("stackoverflow q parse: {e}")))?;
    let q = q_body
        .items
        .first()
        .ok_or_else(|| FetchError::Build(format!("stackoverflow: question {id} not found")))?;
    // `pagesize=100` (the API maximum) instead of the default 30, so
    // heavily answered questions return more than the first page and the
    // accepted answer is far less likely to fall outside the window.
    let a_url = format!(
        "https://api.stackexchange.com/2.3/questions/{id}/answers?site=stackoverflow&filter=withbody&order=desc&sort=votes&pagesize=100"
    );
    let a_resp = client.fetch(&a_url).await?;
    let answers = if a_resp.status == 200 {
        let a_body: AResponse = serde_json::from_str(&a_resp.html)
            .map_err(|e| FetchError::BodyDecode(format!("stackoverflow a parse: {e}")))?;
        a_body
            .items
            .iter()
            .map(|a| {
                json!({
                    "answer_id": a.answer_id,
                    "is_accepted": a.is_accepted,
                    "score": a.score,
                    "body": a.body,
                    "creation_date": a.creation_date,
                    "last_edit_date": a.last_edit_date,
                    "author": a.owner.as_ref().and_then(|o| o.display_name.clone()),
                    "author_rep": a.owner.as_ref().and_then(|o| o.reputation),
                })
            })
            .collect::<Vec<_>>()
    } else {
        // Best-effort: a failed answers call still yields the question.
        Vec::new()
    };
    // The accepted answer (if any) is duplicated under its own key so
    // callers don't have to scan `top_answers` for it. It can still be
    // null if the question has >100 answers and the accepted one is not
    // among the top-voted page.
    let accepted = answers
        .iter()
        .find(|a| {
            a.get("is_accepted")
                .and_then(|v| v.as_bool())
                .unwrap_or(false)
        })
        .cloned();
    Ok(json!({
        "url": url,
        "question_id": q.question_id,
        "title": q.title,
        "body": q.body,
        "tags": q.tags,
        "score": q.score,
        "view_count": q.view_count,
        "answer_count": q.answer_count,
        "is_answered": q.is_answered,
        "accepted_answer_id": q.accepted_answer_id,
        "creation_date": q.creation_date,
        "last_activity_date": q.last_activity_date,
        "author": q.owner.as_ref().and_then(|o| o.display_name.clone()),
        "author_rep": q.owner.as_ref().and_then(|o| o.reputation),
        "link": q.link,
        "accepted_answer": accepted,
        "top_answers": answers,
    }))
}
/// Hostname of `url` (port retained, scheme optional): the text between
/// the `://` separator — if present — and the first `/`.
fn host_of(url: &str) -> &str {
    let rest = url.split("://").nth(1).unwrap_or(url);
    match rest.split('/').next() {
        Some(host) => host,
        None => "",
    }
}
/// Parse the numeric question id out of a `/questions/{id}/{slug}` URL.
/// Returns `None` when the path has no `/questions/` segment or the text
/// that follows it is not a valid integer.
fn parse_question_id(url: &str) -> Option<u64> {
    let (_, tail) = url.split_once("/questions/")?;
    // The id runs until the next path separator, query or fragment.
    let id_segment = tail.split(['?', '#', '/']).next().unwrap_or("");
    id_segment.parse::<u64>().ok()
}
// ---------------------------------------------------------------------------
// Stack Exchange API types
// ---------------------------------------------------------------------------
/// Envelope for the question endpoint; `items` holds at most the one
/// requested question. Missing key deserializes as an empty vec.
#[derive(Deserialize)]
struct QResponse {
    #[serde(default)]
    items: Vec<Question>,
}
/// Subset of the Stack Exchange question object that we surface.
/// All fields optional so partial responses still deserialize.
#[derive(Deserialize)]
struct Question {
    question_id: Option<u64>,
    title: Option<String>,
    // Rendered HTML (present because we request `filter=withbody`).
    body: Option<String>,
    #[serde(default)]
    tags: Vec<String>,
    score: Option<i64>,
    view_count: Option<i64>,
    answer_count: Option<i64>,
    is_answered: Option<bool>,
    accepted_answer_id: Option<u64>,
    // NOTE(review): presumably Unix epoch seconds, per SE API convention — confirm.
    creation_date: Option<i64>,
    last_activity_date: Option<i64>,
    owner: Option<Owner>,
    link: Option<String>,
}
/// Envelope for the answers endpoint.
#[derive(Deserialize)]
struct AResponse {
    #[serde(default)]
    items: Vec<Answer>,
}
/// Subset of the Stack Exchange answer object that we surface.
#[derive(Deserialize)]
struct Answer {
    answer_id: Option<u64>,
    is_accepted: Option<bool>,
    score: Option<i64>,
    // Rendered HTML (present because we request `filter=withbody`).
    body: Option<String>,
    creation_date: Option<i64>,
    last_edit_date: Option<i64>,
    owner: Option<Owner>,
}
/// Post author — only display name and reputation are consumed.
#[derive(Deserialize)]
struct Owner {
    display_name: Option<String>,
    reputation: Option<i64>,
}
#[cfg(test)]
mod tests {
    use super::*;
    // URL gate: question pages (with or without query string) match; the
    // site root, the bare /questions index, user pages and foreign hosts
    // must all be rejected.
    #[test]
    fn matches_question_urls() {
        assert!(matches(
            "https://stackoverflow.com/questions/12345/some-slug"
        ));
        assert!(matches(
            "https://stackoverflow.com/questions/12345/some-slug?answertab=votes"
        ));
        assert!(!matches("https://stackoverflow.com/"));
        assert!(!matches("https://stackoverflow.com/questions"));
        assert!(!matches("https://stackoverflow.com/users/100"));
        assert!(!matches("https://example.com/questions/12345/x"));
    }
    // Id extraction survives slugs, query strings and non-question paths.
    #[test]
    fn parse_question_id_handles_slug_and_query() {
        assert_eq!(
            parse_question_id("https://stackoverflow.com/questions/12345/some-slug"),
            Some(12345)
        );
        assert_eq!(
            parse_question_id("https://stackoverflow.com/questions/12345/some-slug?tab=newest"),
            Some(12345)
        );
        assert_eq!(parse_question_id("https://stackoverflow.com/foo"), None);
    }
}