diff --git a/crates/webclaw-fetch/src/extractors/github_issue.rs b/crates/webclaw-fetch/src/extractors/github_issue.rs new file mode 100644 index 0000000..436faa9 --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/github_issue.rs @@ -0,0 +1,172 @@ +//! GitHub issue structured extractor. +//! +//! Mirror of `github_pr` but on `/issues/{number}`. Uses +//! `api.github.com/repos/{owner}/{repo}/issues/{number}`. Returns the +//! issue body + comment count + labels + milestone + author / +//! assignees. Full per-comment bodies would be another call; kept for +//! a follow-up. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "github_issue", + label: "GitHub issue", + description: "Returns issue metadata: title, body, state, author, labels, assignees, milestone, comment count.", + url_patterns: &["https://github.com/{owner}/{repo}/issues/{number}"], +}; + +pub fn matches(url: &str) -> bool { + let host = url + .split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or(""); + if host != "github.com" && host != "www.github.com" { + return false; + } + parse_issue(url).is_some() +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let (owner, repo, number) = parse_issue(url).ok_or_else(|| { + FetchError::Build(format!("github_issue: cannot parse issue URL '{url}'")) + })?; + + let api_url = format!("https://api.github.com/repos/{owner}/{repo}/issues/{number}"); + let resp = client.fetch(&api_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "github_issue: issue '{owner}/{repo}#{number}' not found" + ))); + } + if resp.status == 403 { + return Err(FetchError::Build( + "github_issue: rate limited (60/hour unauth). 
Set GITHUB_TOKEN for 5,000/hour.".into(), + )); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "github api returned status {}", + resp.status + ))); + } + + let issue: Issue = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("github issue parse: {e}")))?; + + // The same endpoint returns PRs too; reject if we got one so the caller + // uses /v1/scrape/github_pr instead of getting a half-shaped payload. + if issue.pull_request.is_some() { + return Err(FetchError::Build(format!( + "github_issue: '{owner}/{repo}#{number}' is a pull request, use /v1/scrape/github_pr" + ))); + } + + Ok(json!({ + "url": url, + "owner": owner, + "repo": repo, + "number": issue.number, + "title": issue.title, + "body": issue.body, + "state": issue.state, + "state_reason":issue.state_reason, + "author": issue.user.as_ref().and_then(|u| u.login.clone()), + "labels": issue.labels.iter().filter_map(|l| l.name.clone()).collect::>(), + "assignees": issue.assignees.iter().filter_map(|u| u.login.clone()).collect::>(), + "milestone": issue.milestone.as_ref().and_then(|m| m.title.clone()), + "comments": issue.comments, + "locked": issue.locked, + "created_at": issue.created_at, + "updated_at": issue.updated_at, + "closed_at": issue.closed_at, + "html_url": issue.html_url, + })) +} + +fn parse_issue(url: &str) -> Option<(String, String, u64)> { + let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?; + let stripped = path.split(['?', '#']).next()?.trim_end_matches('/'); + let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect(); + if segs.len() < 4 || segs[2] != "issues" { + return None; + } + let number: u64 = segs[3].parse().ok()?; + Some((segs[0].to_string(), segs[1].to_string(), number)) +} + +// --------------------------------------------------------------------------- +// GitHub issue API types +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] 
+struct Issue {
+    number: Option<u64>,
+    title: Option<String>,
+    body: Option<String>,
+    state: Option<String>,
+    state_reason: Option<String>,
+    locked: Option<bool>,
+    comments: Option<u64>,
+    created_at: Option<String>,
+    updated_at: Option<String>,
+    closed_at: Option<String>,
+    html_url: Option<String>,
+    user: Option<UserRef>,
+    #[serde(default)]
+    labels: Vec<LabelRef>,
+    #[serde(default)]
+    assignees: Vec<UserRef>,
+    milestone: Option<Milestone>,
+    /// Present when this "issue" is actually a pull request. The REST
+    /// API overloads the issues endpoint for PRs.
+    pull_request: Option<Value>,
+}
+
+#[derive(Deserialize)]
+struct UserRef {
+    login: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct LabelRef {
+    name: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct Milestone {
+    title: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_issue_urls() {
+        assert!(matches("https://github.com/rust-lang/rust/issues/100"));
+        assert!(matches("https://github.com/rust-lang/rust/issues/100/"));
+        assert!(!matches("https://github.com/rust-lang/rust"));
+        assert!(!matches("https://github.com/rust-lang/rust/pull/100"));
+        assert!(!matches("https://github.com/rust-lang/rust/issues"));
+    }
+
+    #[test]
+    fn parse_issue_extracts_owner_repo_number() {
+        assert_eq!(
+            parse_issue("https://github.com/rust-lang/rust/issues/100"),
+            Some(("rust-lang".into(), "rust".into(), 100))
+        );
+        assert_eq!(
+            parse_issue("https://github.com/rust-lang/rust/issues/100/?foo=bar"),
+            Some(("rust-lang".into(), "rust".into(), 100))
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/mod.rs b/crates/webclaw-fetch/src/extractors/mod.rs
index 5cf0993..510adc0 100644
--- a/crates/webclaw-fetch/src/extractors/mod.rs
+++ b/crates/webclaw-fetch/src/extractors/mod.rs
@@ -21,6 +21,7 @@ pub mod dev_to;
 pub mod docker_hub;
 pub mod ebay_listing;
 pub mod ecommerce_product;
+pub mod github_issue;
 pub mod github_pr;
 pub mod github_release;
 pub mod github_repo;
@@ -33,9 +34,13 @@ pub mod linkedin_post;
 pub mod npm;
 pub mod pypi;
 pub mod reddit;
+pub mod shopify_collection;
 pub mod shopify_product;
 pub mod stackoverflow;
+pub mod substack_post;
 pub mod trustpilot_reviews;
+pub mod woocommerce_product;
+pub mod youtube_video;
 
 use serde::Serialize;
 use serde_json::Value;
@@ -65,6 +70,7 @@ pub fn list() -> Vec<ExtractorInfo> {
         hackernews::INFO,
         github_repo::INFO,
         github_pr::INFO,
+        github_issue::INFO,
         github_release::INFO,
         pypi::INFO,
         npm::INFO,
@@ -75,11 +81,15 @@ pub fn list() -> Vec<ExtractorInfo> {
         docker_hub::INFO,
         dev_to::INFO,
         stackoverflow::INFO,
+        substack_post::INFO,
+        youtube_video::INFO,
         linkedin_post::INFO,
         instagram_post::INFO,
         instagram_profile::INFO,
         shopify_product::INFO,
+        shopify_collection::INFO,
         ecommerce_product::INFO,
+        woocommerce_product::INFO,
         amazon_product::INFO,
         ebay_listing::INFO,
         trustpilot_reviews::INFO,
@@ -131,6 +141,13 @@ pub async fn dispatch_by_url(
             .map(|v| (github_pr::INFO.name, v)),
         );
     }
+    if github_issue::matches(url) {
+        return Some(
+            github_issue::extract(client, url)
+                .await
+                .map(|v| (github_issue::INFO.name, v)),
+        );
+    }
     if github_release::matches(url) {
         return Some(
             github_release::extract(client, url)
@@ -233,7 +250,15 @@ pub async fn dispatch_by_url(
             .map(|v| (trustpilot_reviews::INFO.name, v)),
         );
     }
-    // NOTE: shopify_product and ecommerce_product are intentionally NOT
+    if youtube_video::matches(url) {
+        return Some(
+            youtube_video::extract(client, url)
+                .await
+                .map(|v| (youtube_video::INFO.name, v)),
+        );
+    }
+    // NOTE: shopify_product, shopify_collection, ecommerce_product,
+    // woocommerce_product, and substack_post are intentionally NOT
     // in auto-dispatch. Their `matches()` functions are permissive
     // (any URL with `/products/`, `/product/`, `/p/`, etc.) and
and // claiming those generically would steal URLs from the default @@ -282,6 +307,12 @@ pub async fn dispatch_by_name( }) .await } + n if n == github_issue::INFO.name => { + run_or_mismatch(github_issue::matches(url), n, url, || { + github_issue::extract(client, url) + }) + .await + } n if n == github_release::INFO.name => { run_or_mismatch(github_release::matches(url), n, url, || { github_release::extract(client, url) @@ -375,6 +406,30 @@ pub async fn dispatch_by_name( }) .await } + n if n == youtube_video::INFO.name => { + run_or_mismatch(youtube_video::matches(url), n, url, || { + youtube_video::extract(client, url) + }) + .await + } + n if n == substack_post::INFO.name => { + run_or_mismatch(substack_post::matches(url), n, url, || { + substack_post::extract(client, url) + }) + .await + } + n if n == shopify_collection::INFO.name => { + run_or_mismatch(shopify_collection::matches(url), n, url, || { + shopify_collection::extract(client, url) + }) + .await + } + n if n == woocommerce_product::INFO.name => { + run_or_mismatch(woocommerce_product::matches(url), n, url, || { + woocommerce_product::extract(client, url) + }) + .await + } _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())), } } diff --git a/crates/webclaw-fetch/src/extractors/shopify_collection.rs b/crates/webclaw-fetch/src/extractors/shopify_collection.rs new file mode 100644 index 0000000..095f7dd --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/shopify_collection.rs @@ -0,0 +1,242 @@ +//! Shopify collection structured extractor. +//! +//! Every Shopify store exposes `/collections/{handle}.json` and +//! `/collections/{handle}/products.json` on the public surface. This +//! extractor hits `.json` (collection metadata) and falls through to +//! `/products.json` for the first page of products. Same caveat as +//! `shopify_product`: stores with Cloudflare in front of the shop +//! will 403 the public path. +//! +//! Explicit-call only (like `shopify_product`). 
`/collections/{slug}` +//! is a URL shape used by non-Shopify stores too, so auto-dispatch +//! would claim too many URLs. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "shopify_collection", + label: "Shopify collection", + description: "Returns collection metadata + first page of products (handle, title, vendor, price, available) on ANY Shopify store via /collections/{handle}.json + /products.json.", + url_patterns: &[ + "https://{shop}/collections/{handle}", + "https://{shop}.myshopify.com/collections/{handle}", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host.is_empty() || NON_SHOPIFY_HOSTS.iter().any(|h| host.ends_with(h)) { + return false; + } + url.contains("/collections/") && !url.ends_with("/collections/") +} + +const NON_SHOPIFY_HOSTS: &[&str] = &[ + "amazon.com", + "amazon.co.uk", + "amazon.de", + "ebay.com", + "etsy.com", + "walmart.com", + "target.com", + "aliexpress.com", + "huggingface.co", // has /collections/ for models + "github.com", +]; + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let (coll_meta_url, coll_products_url) = build_json_urls(url); + + // Step 1: collection metadata. Shopify returns 200 on missing + // collections sometimes; check "collection" key below. + let meta_resp = client.fetch(&coll_meta_url).await?; + if meta_resp.status == 404 { + return Err(FetchError::Build(format!( + "shopify_collection: '{url}' not found" + ))); + } + if meta_resp.status == 403 { + return Err(FetchError::Build(format!( + "shopify_collection: {coll_meta_url} returned 403. The store has antibot in front of the .json endpoint. Use /v1/scrape/ecommerce_product or api.webclaw.io for this store." 
+ ))); + } + if meta_resp.status != 200 { + return Err(FetchError::Build(format!( + "shopify returned status {} for {coll_meta_url}", + meta_resp.status + ))); + } + + let meta: MetaWrapper = serde_json::from_str(&meta_resp.html).map_err(|e| { + FetchError::BodyDecode(format!( + "shopify_collection: '{url}' didn't return Shopify JSON, likely not a Shopify store ({e})" + )) + })?; + + // Step 2: first page of products for this collection. + let products = match client.fetch(&coll_products_url).await { + Ok(r) if r.status == 200 => serde_json::from_str::(&r.html) + .ok() + .map(|pw| pw.products) + .unwrap_or_default(), + _ => Vec::new(), + }; + + let product_summaries: Vec = products + .iter() + .map(|p| { + let first_variant = p.variants.first(); + json!({ + "id": p.id, + "handle": p.handle, + "title": p.title, + "vendor": p.vendor, + "product_type": p.product_type, + "price": first_variant.and_then(|v| v.price.clone()), + "compare_at_price":first_variant.and_then(|v| v.compare_at_price.clone()), + "available": p.variants.iter().any(|v| v.available.unwrap_or(false)), + "variant_count": p.variants.len(), + "image": p.images.first().and_then(|i| i.src.clone()), + "created_at": p.created_at, + "updated_at": p.updated_at, + }) + }) + .collect(); + + let c = meta.collection; + Ok(json!({ + "url": url, + "meta_json_url": coll_meta_url, + "products_json_url": coll_products_url, + "collection_id": c.id, + "handle": c.handle, + "title": c.title, + "description_html": c.body_html, + "published_at": c.published_at, + "updated_at": c.updated_at, + "sort_order": c.sort_order, + "products_in_page": product_summaries.len(), + "products": product_summaries, + })) +} + +// --------------------------------------------------------------------------- +// URL helpers +// --------------------------------------------------------------------------- + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +/// Build 
`(collection.json, collection/products.json)` from a user URL. +fn build_json_urls(url: &str) -> (String, String) { + let (path_part, _query_part) = match url.split_once('?') { + Some((a, b)) => (a, Some(b)), + None => (url, None), + }; + let clean = path_part.trim_end_matches('/').trim_end_matches(".json"); + ( + format!("{clean}.json"), + format!("{clean}/products.json?limit=50"), + ) +} + +// --------------------------------------------------------------------------- +// Shopify collection + product JSON shapes (subsets) +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct MetaWrapper { + collection: Collection, +} + +#[derive(Deserialize)] +struct Collection { + id: Option, + handle: Option, + title: Option, + body_html: Option, + published_at: Option, + updated_at: Option, + sort_order: Option, +} + +#[derive(Deserialize)] +struct ProductsWrapper { + #[serde(default)] + products: Vec, +} + +#[derive(Deserialize)] +struct ProductSummary { + id: Option, + handle: Option, + title: Option, + vendor: Option, + product_type: Option, + created_at: Option, + updated_at: Option, + #[serde(default)] + variants: Vec, + #[serde(default)] + images: Vec, +} + +#[derive(Deserialize)] +struct VariantSummary { + price: Option, + compare_at_price: Option, + available: Option, +} + +#[derive(Deserialize)] +struct ImageSummary { + src: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_shopify_collection_urls() { + assert!(matches("https://www.allbirds.com/collections/mens")); + assert!(matches( + "https://shop.example.com/collections/new-arrivals?page=2" + )); + } + + #[test] + fn rejects_non_shopify() { + assert!(!matches("https://github.com/collections/foo")); + assert!(!matches("https://huggingface.co/collections/foo")); + assert!(!matches("https://example.com/")); + assert!(!matches("https://example.com/collections/")); + } + + #[test] + fn build_json_urls_derives_both_paths() { + let 
(meta, products) = build_json_urls("https://shop.example.com/collections/mens");
+        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
+        assert_eq!(
+            products,
+            "https://shop.example.com/collections/mens/products.json?limit=50"
+        );
+    }
+
+    #[test]
+    fn build_json_urls_handles_trailing_slash() {
+        let (meta, _) = build_json_urls("https://shop.example.com/collections/mens/");
+        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/substack_post.rs b/crates/webclaw-fetch/src/extractors/substack_post.rs
new file mode 100644
index 0000000..03ccbe8
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/substack_post.rs
@@ -0,0 +1,213 @@
+//! Substack post extractor.
+//!
+//! Every Substack publication exposes `/api/v1/posts/{slug}` that
+//! returns the full post as JSON: body HTML, cover image, author,
+//! publication info, reactions, paywall state. No auth on public
+//! posts.
+//!
+//! Works on both `*.substack.com` subdomains and custom domains
+//! (e.g. `simonwillison.net` uses Substack too). Detection is
+//! "URL has `/p/{slug}`" because that's the canonical Substack post
+//! path. Explicit-call only because the `/p/{slug}` URL shape is
+//! used by non-Substack sites too.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "substack_post",
+    label: "Substack post",
+    description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.",
+    url_patterns: &[
+        "https://{pub}.substack.com/p/{slug}",
+        "https://{custom-domain}/p/{slug}",
+    ],
+};
+
+/// Permissive: any http(s) URL with `/p/` in the path.
+pub fn matches(url: &str) -> bool {
+    if !(url.starts_with("http://") || url.starts_with("https://")) {
+        return false;
+    }
+    url.contains("/p/")
+}
+
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let slug = parse_slug(url).ok_or_else(|| {
+        FetchError::Build(format!("substack_post: cannot parse slug from '{url}'"))
+    })?;
+    let host = host_of(url);
+    if host.is_empty() {
+        return Err(FetchError::Build(format!(
+            "substack_post: empty host in '{url}'"
+        )));
+    }
+    let scheme = if url.starts_with("http://") {
+        "http"
+    } else {
+        "https"
+    };
+    let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
+    let resp = client.fetch(&api_url).await?;
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "substack_post: '{slug}' not found on {host} (got 404). \
+             If the publication isn't actually on Substack, use /v1/scrape instead."
+        )));
+    }
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "substack returned status {} for {api_url}",
+            resp.status
+        )));
+    }
+
+    let p: Post = serde_json::from_str(&resp.html).map_err(|e| {
+        FetchError::BodyDecode(format!(
+            "substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})"
+        ))
+    })?;
+
+    Ok(json!({
+        "url": url,
+        "api_url": api_url,
+        "id": p.id,
+        "type": p.r#type,
+        "slug": p.slug,
+        "title": p.title,
+        "subtitle": p.subtitle,
+        "description": p.description,
+        "canonical_url": p.canonical_url,
+        "post_date": p.post_date,
+        "updated_at": p.updated_at,
+        "audience": p.audience,
+        "has_paywall": matches!(p.audience.as_deref(), Some("only_paid") | Some("founding")),
+        "is_free_preview": p.is_free_preview,
+        "cover_image": p.cover_image,
+        "word_count": p.wordcount,
+        "reactions": p.reactions,
+        "comment_count": p.comment_count,
+        "body_html": p.body_html,
+        "body_text": p.truncated_body_text.or(p.body_text),
+        "publication": json!({
+            "id": p.publication.as_ref().and_then(|pub_| pub_.id),
+            "name": p.publication.as_ref().and_then(|pub_| pub_.name.clone()),
+            "subdomain": p.publication.as_ref().and_then(|pub_| pub_.subdomain.clone()),
+            "custom_domain": p.publication.as_ref().and_then(|pub_| pub_.custom_domain.clone()),
+        }),
+        "authors": p.published_bylines.iter().map(|a| json!({
+            "id": a.id,
+            "name": a.name,
+            "handle": a.handle,
+            "photo": a.photo_url,
+        })).collect::<Vec<_>>(),
+    }))
+}
+
+// ---------------------------------------------------------------------------
+// URL helpers
+// ---------------------------------------------------------------------------
+
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("")
+}
+
+/// Slug is the first path segment after `/p/`, stripped of query,
+/// fragment, and trailing slash.
+fn parse_slug(url: &str) -> Option<String> {
+    let after = url.split("/p/").nth(1)?;
+    let stripped = after
+        .split(['?', '#'])
+        .next()?
+        .trim_end_matches('/')
+        .split('/')
+        .next()
+        .unwrap_or("");
+    if stripped.is_empty() {
+        None
+    } else {
+        Some(stripped.to_string())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Substack API types (subset)
+// ---------------------------------------------------------------------------
+
+#[derive(Deserialize)]
+struct Post {
+    id: Option<u64>,
+    r#type: Option<String>,
+    slug: Option<String>,
+    title: Option<String>,
+    subtitle: Option<String>,
+    description: Option<String>,
+    canonical_url: Option<String>,
+    post_date: Option<String>,
+    updated_at: Option<String>,
+    audience: Option<String>,
+    is_free_preview: Option<bool>,
+    cover_image: Option<String>,
+    wordcount: Option<u64>,
+    reactions: Option<Value>,
+    comment_count: Option<u64>,
+    body_html: Option<String>,
+    body_text: Option<String>,
+    truncated_body_text: Option<String>,
+    publication: Option<Publication>,
+    #[serde(default, rename = "publishedBylines")]
+    published_bylines: Vec<Byline>,
+}
+
+#[derive(Deserialize)]
+struct Publication {
+    id: Option<u64>,
+    name: Option<String>,
+    subdomain: Option<String>,
+    custom_domain: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct Byline {
+    id: Option<u64>,
+    name: Option<String>,
+    handle: Option<String>,
+    photo_url: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_post_urls() {
+        assert!(matches(
+            "https://stratechery.substack.com/p/the-tech-letter"
+        ));
+        assert!(matches("https://simonwillison.net/p/2024-08-01-something"));
+        assert!(!matches("https://example.com/"));
+        assert!(!matches("ftp://example.com/p/foo"));
+    }
+
+    #[test]
+    fn parse_slug_strips_query_and_trailing_slash() {
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post"),
+            Some("my-post".into())
+        );
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post/"),
+            Some("my-post".into())
+        );
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post?ref=123"),
+            Some("my-post".into())
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/woocommerce_product.rs b/crates/webclaw-fetch/src/extractors/woocommerce_product.rs
new file mode 100644
index
0000000..73f1109 --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/woocommerce_product.rs @@ -0,0 +1,237 @@ +//! WooCommerce product structured extractor. +//! +//! Targets WooCommerce's Store API: `/wp-json/wc/store/v1/products?slug={slug}`. +//! About 30-50% of WooCommerce stores expose this endpoint publicly +//! (it's on by default, but common security plugins disable it). +//! When it's off, the server returns 404 at /wp-json. We surface a +//! clean error and point callers at `/v1/scrape/ecommerce_product` +//! which works on any store with Schema.org JSON-LD. +//! +//! Explicit-call only. `/product/{slug}` is the default permalink for +//! WooCommerce but custom stores use every variation imaginable, so +//! auto-dispatch is unreliable. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "woocommerce_product", + label: "WooCommerce product", + description: "Returns product via the WooCommerce Store REST API (requires the /wp-json/wc/store endpoint to be enabled on the target store).", + url_patterns: &[ + "https://{shop}/product/{slug}", + "https://{shop}/shop/{slug}", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host.is_empty() { + return false; + } + // Permissive: WooCommerce stores use custom domains + custom + // permalinks. The extractor's API probe is what confirms it's + // really WooCommerce. 
+ url.contains("/product/") + || url.contains("/shop/") + || url.contains("/producto/") // common es locale + || url.contains("/produit/") // common fr locale +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let slug = parse_slug(url).ok_or_else(|| { + FetchError::Build(format!( + "woocommerce_product: cannot parse slug from '{url}'" + )) + })?; + let host = host_of(url); + if host.is_empty() { + return Err(FetchError::Build(format!( + "woocommerce_product: empty host in '{url}'" + ))); + } + let scheme = if url.starts_with("http://") { + "http" + } else { + "https" + }; + let api_url = format!("{scheme}://{host}/wp-json/wc/store/v1/products?slug={slug}&per_page=1"); + let resp = client.fetch(&api_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "woocommerce_product: {host} does not expose /wp-json/wc/store (404). \ + Use /v1/scrape/ecommerce_product for JSON-LD fallback." + ))); + } + if resp.status == 401 || resp.status == 403 { + return Err(FetchError::Build(format!( + "woocommerce_product: {host} requires auth for /wp-json/wc/store ({}). 
\ + Use /v1/scrape/ecommerce_product for the public JSON-LD fallback.", + resp.status + ))); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "woocommerce api returned status {} for {api_url}", + resp.status + ))); + } + + let products: Vec = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("woocommerce parse: {e}")))?; + let p = products.into_iter().next().ok_or_else(|| { + FetchError::Build(format!( + "woocommerce_product: no product found for slug '{slug}' on {host}" + )) + })?; + + let images: Vec = p + .images + .iter() + .map(|i| json!({"src": i.src, "thumbnail": i.thumbnail, "alt": i.alt})) + .collect(); + let variations_count = p.variations.as_ref().map(|v| v.len()).unwrap_or(0); + + Ok(json!({ + "url": url, + "api_url": api_url, + "product_id": p.id, + "name": p.name, + "slug": p.slug, + "sku": p.sku, + "permalink": p.permalink, + "on_sale": p.on_sale, + "in_stock": p.is_in_stock, + "is_purchasable": p.is_purchasable, + "price": p.prices.as_ref().and_then(|pr| pr.price.clone()), + "regular_price": p.prices.as_ref().and_then(|pr| pr.regular_price.clone()), + "sale_price": p.prices.as_ref().and_then(|pr| pr.sale_price.clone()), + "currency": p.prices.as_ref().and_then(|pr| pr.currency_code.clone()), + "currency_minor": p.prices.as_ref().and_then(|pr| pr.currency_minor_unit), + "price_range": p.prices.as_ref().and_then(|pr| pr.price_range.clone()), + "average_rating": p.average_rating, + "review_count": p.review_count, + "description": p.description, + "short_description": p.short_description, + "categories": p.categories.iter().filter_map(|c| c.name.clone()).collect::>(), + "tags": p.tags.iter().filter_map(|t| t.name.clone()).collect::>(), + "variation_count": variations_count, + "image_count": images.len(), + "images": images, + })) +} + +// --------------------------------------------------------------------------- +// URL helpers +// 
--------------------------------------------------------------------------- + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +/// Extract the product slug from common WooCommerce permalinks. +fn parse_slug(url: &str) -> Option { + for needle in ["/product/", "/shop/", "/producto/", "/produit/"] { + if let Some(after) = url.split(needle).nth(1) { + let stripped = after + .split(['?', '#']) + .next()? + .trim_end_matches('/') + .split('/') + .next() + .unwrap_or(""); + if !stripped.is_empty() { + return Some(stripped.to_string()); + } + } + } + None +} + +// --------------------------------------------------------------------------- +// Store API types (subset of the full response) +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct Product { + id: Option, + name: Option, + slug: Option, + sku: Option, + permalink: Option, + description: Option, + short_description: Option, + on_sale: Option, + is_in_stock: Option, + is_purchasable: Option, + average_rating: Option, // string or number + review_count: Option, + prices: Option, + #[serde(default)] + categories: Vec, + #[serde(default)] + tags: Vec, + #[serde(default)] + images: Vec, + variations: Option>, +} + +#[derive(Deserialize)] +struct Prices { + price: Option, + regular_price: Option, + sale_price: Option, + currency_code: Option, + currency_minor_unit: Option, + price_range: Option, +} + +#[derive(Deserialize)] +struct Term { + name: Option, +} + +#[derive(Deserialize)] +struct Img { + src: Option, + thumbnail: Option, + alt: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_common_permalinks() { + assert!(matches("https://shop.example.com/product/cool-widget")); + assert!(matches("https://shop.example.com/shop/cool-widget")); + assert!(matches("https://tienda.example.com/producto/cosa")); + 
+        assert!(matches("https://boutique.example.com/produit/chose"));
+    }
+
+    #[test]
+    fn parse_slug_handles_locale_and_suffix() {
+        assert_eq!(
+            parse_slug("https://shop.example.com/product/cool-widget"),
+            Some("cool-widget".into())
+        );
+        assert_eq!(
+            parse_slug("https://shop.example.com/product/cool-widget/?attr=red"),
+            Some("cool-widget".into())
+        );
+        assert_eq!(
+            parse_slug("https://tienda.example.com/producto/cosa/"),
+            Some("cosa".into())
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/youtube_video.rs b/crates/webclaw-fetch/src/extractors/youtube_video.rs
new file mode 100644
index 0000000..c37230a
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/youtube_video.rs
@@ -0,0 +1,255 @@
+//! YouTube video structured extractor.
+//!
+//! YouTube embeds the full player configuration in a
+//! `ytInitialPlayerResponse` JavaScript assignment at the top of
+//! every `/watch`, `/shorts`, and `youtu.be` HTML page. We reuse the
+//! core crate's already-proven regex + parse to surface typed JSON
+//! from it: video id, title, author + channel id, view count,
+//! duration, upload date, keywords, thumbnails, caption-track URLs.
+//!
+//! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/`
+//! shape is stable.
+
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "youtube_video",
+    label: "YouTube video",
+    description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs.",
+    url_patterns: &[
+        "https://www.youtube.com/watch?v={id}",
+        "https://youtu.be/{id}",
+        "https://www.youtube.com/shorts/{id}",
+    ],
+};
+
+pub fn matches(url: &str) -> bool {
+    webclaw_core::youtube::is_youtube_url(url)
+        || url.contains("youtube.com/shorts/")
+        || url.contains("youtube-nocookie.com/embed/")
+}
+
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let video_id = parse_video_id(url).ok_or_else(|| {
+        FetchError::Build(format!("youtube_video: cannot parse video id from '{url}'"))
+    })?;
+
+    // Always fetch the canonical /watch URL. /shorts/ and youtu.be
+    // sometimes serve a thinner page without the player blob.
+    let canonical = format!("https://www.youtube.com/watch?v={video_id}");
+    let resp = client.fetch(&canonical).await?;
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "youtube returned status {} for {canonical}",
+            resp.status
+        )));
+    }
+
+    let player = extract_player_response(&resp.html).ok_or_else(|| {
+        FetchError::BodyDecode(format!(
+            "youtube_video: no ytInitialPlayerResponse on {canonical} (video may be private, region-blocked, or removed)"
+        ))
+    })?;
+
+    let video_details = player.get("videoDetails");
+    let microformat = player
+        .get("microformat")
+        .and_then(|m| m.get("playerMicroformatRenderer"));
+
+    let thumbnails: Vec<Value> = video_details
+        .and_then(|vd| vd.get("thumbnail"))
+        .and_then(|t| t.get("thumbnails"))
+        .and_then(|t| t.as_array())
+        .cloned()
+        .unwrap_or_default();
+
+    let keywords: Vec<Value> = video_details
+        .and_then(|vd| vd.get("keywords"))
+        .and_then(|k| k.as_array())
+        .cloned()
+        .unwrap_or_default();
+
+    let caption_tracks = webclaw_core::youtube::extract_caption_tracks(&resp.html);
+    let captions: Vec<Value> = caption_tracks
+        .iter()
+        .map(|c| {
+            json!({
+                "url": c.url,
+                "lang": c.lang,
+                "name": c.name,
+            })
+        })
+        .collect();
+
+    Ok(json!({
+        "url": url,
+        "canonical_url": canonical,
+        "video_id": video_id,
+        "title": get_str(video_details, "title"),
+        "description": get_str(video_details, "shortDescription"),
+        "author": get_str(video_details, "author"),
+        "channel_id": get_str(video_details, "channelId"),
+        "channel_url": get_str(microformat, "ownerProfileUrl"),
+        "view_count": get_int(video_details, "viewCount"),
+        "length_seconds": get_int(video_details, "lengthSeconds"),
+        "is_live": video_details.and_then(|vd| vd.get("isLiveContent")).and_then(|v| v.as_bool()),
+        "is_private": video_details.and_then(|vd| vd.get("isPrivate")).and_then(|v| v.as_bool()),
+        "is_unlisted": microformat.and_then(|m| m.get("isUnlisted")).and_then(|v| v.as_bool()),
+        "allow_ratings": video_details.and_then(|vd| vd.get("allowRatings")).and_then(|v| v.as_bool()),
+        "category": get_str(microformat, "category"),
+        "upload_date": get_str(microformat, "uploadDate"),
+        "publish_date": get_str(microformat, "publishDate"),
+        "keywords": keywords,
+        "thumbnails": thumbnails,
+        "caption_tracks": captions,
+    }))
+}
+
+// ---------------------------------------------------------------------------
+// URL helpers
+// ---------------------------------------------------------------------------
+
+fn parse_video_id(url: &str) -> Option<String> {
+    // youtu.be/{id}
+    if let Some(after) = url.split("youtu.be/").nth(1) {
+        let id = after
+            .split(['?', '#', '/'])
+            .next()
+            .unwrap_or("")
+            .trim_end_matches('/');
+        if !id.is_empty() {
+            return Some(id.to_string());
+        }
+    }
+    // youtube.com/shorts/{id}
+    if let Some(after) = url.split("youtube.com/shorts/").nth(1) {
+        let id = after
+            .split(['?', '#', '/'])
+            .next()
+            .unwrap_or("")
+            .trim_end_matches('/');
+        if !id.is_empty() {
+            return Some(id.to_string());
+        }
+    }
+    // youtube-nocookie.com/embed/{id}
+    if let Some(after) = url.split("/embed/").nth(1) {
+        let id = after
+            .split(['?', '#', '/'])
+            .next()
+            .unwrap_or("")
+            .trim_end_matches('/');
+        if !id.is_empty() {
+            return Some(id.to_string());
+        }
+    }
+    // youtube.com/watch?v={id} (also matches youtube.com/watch?foo=bar&v={id})
+    if let Some(q) = url.split_once('?').map(|(_, q)| q)
+        && let Some(id) = q
+            .split('&')
+            .find_map(|p| p.strip_prefix("v=").map(|v| v.to_string()))
+    {
+        let id = id.split(['#', '/']).next().unwrap_or(&id).to_string();
+        if !id.is_empty() {
+            return Some(id);
+        }
+    }
+    None
+}
+
+// ---------------------------------------------------------------------------
+// Player-response parsing
+// ---------------------------------------------------------------------------
+
+fn extract_player_response(html: &str) -> Option<Value> {
+    use regex::Regex;
+    use std::sync::OnceLock;
+    // Same regex as webclaw_core::youtube. Duplicated here because
+    // core's regex is module-private. Kept in lockstep; changes are
+    // rare and we cover with tests in both places.
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re = RE
+        .get_or_init(|| Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap());
+    let json_str = re.captures(html)?.get(1)?.as_str();
+    serde_json::from_str(json_str).ok()
+}
+
+fn get_str(v: Option<&Value>, key: &str) -> Option<String> {
+    v.and_then(|x| x.get(key))
+        .and_then(|x| x.as_str().map(String::from))
+}
+
+fn get_int(v: Option<&Value>, key: &str) -> Option<i64> {
+    v.and_then(|x| x.get(key)).and_then(|x| {
+        x.as_i64()
+            .or_else(|| x.as_str().and_then(|s| s.parse::<i64>().ok()))
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_watch_urls() {
+        assert!(matches("https://www.youtube.com/watch?v=dQw4w9WgXcQ"));
+        assert!(matches("https://youtu.be/dQw4w9WgXcQ"));
+        assert!(matches("https://www.youtube.com/shorts/abc123"));
+        assert!(matches(
+            "https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ"
+        ));
+    }
+
+    #[test]
+    fn rejects_non_video_urls() {
+        assert!(!matches("https://www.youtube.com/"));
+        assert!(!matches("https://www.youtube.com/channel/abc"));
+        assert!(!matches("https://example.com/watch?v=abc"));
+    }
+
+    #[test]
+    fn parse_video_id_from_each_shape() {
+        assert_eq!(
+            parse_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://www.youtube.com/watch?feature=share&v=dQw4w9WgXcQ"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://youtu.be/dQw4w9WgXcQ"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://youtu.be/dQw4w9WgXcQ?t=30"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://www.youtube.com/shorts/abc123"),
+            Some("abc123".into())
+        );
+    }
+
+    #[test]
+    fn
extract_player_response_happy_path() { + let html = r#" + + + +"#; + let v = extract_player_response(html).unwrap(); + let vd = v.get("videoDetails").unwrap(); + assert_eq!(vd.get("title").unwrap().as_str(), Some("T")); + } +}