feat(extractors): wave 6a, 5 easy verticals (27 total)

Adds 5 structured extractors that hit public APIs with stable shapes:

- github_issue: /repos/{o}/{r}/issues/{n} (rejects PRs, points to github_pr)
- shopify_collection: /collections/{handle}.json + products.json
- woocommerce_product: /wp-json/wc/store/v1/products?slug={slug}
- substack_post: /api/v1/posts/{slug} (works on custom domains too)
- youtube_video: ytInitialPlayerResponse blob from /watch HTML

Auto-dispatched: github_issue, youtube_video (unique hosts and stable
URL shapes). Explicit-call: shopify_collection, woocommerce_product,
substack_post (URL shapes overlap with non-target sites).

Tests: 82 total passing in webclaw-fetch (12 new), clippy clean.
This commit is contained in:
Valerio 2026-04-22 16:33:35 +02:00
parent d8c9274a9c
commit 8cc727c2f2
6 changed files with 1175 additions and 1 deletions

View file

@ -0,0 +1,172 @@
//! GitHub issue structured extractor.
//!
//! Mirror of `github_pr` but on `/issues/{number}`. Uses
//! `api.github.com/repos/{owner}/{repo}/issues/{number}`. Returns the
//! issue body + comment count + labels + milestone + author /
//! assignees. Full per-comment bodies would be another call; kept for
//! a follow-up.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor: the vertical name used by
/// `/v1/scrape/{name}`, a human-readable label, and the URL shape it
/// handles (shown to API consumers in the extractor listing).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "github_issue",
    label: "GitHub issue",
    description: "Returns issue metadata: title, body, state, author, labels, assignees, milestone, comment count.",
    url_patterns: &["https://github.com/{owner}/{repo}/issues/{number}"],
};
/// Returns `true` when `url` points at a GitHub issue page.
///
/// Two conditions: the host must be exactly `github.com` (optionally
/// with the `www.` prefix), and the path must parse as
/// `/{owner}/{repo}/issues/{number}` (see [`parse_issue`]).
pub fn matches(url: &str) -> bool {
    let without_scheme = url.split("://").nth(1).unwrap_or(url);
    let host = without_scheme.split('/').next().unwrap_or("");
    let on_github = host == "github.com" || host == "www.github.com";
    on_github && parse_issue(url).is_some()
}
/// Fetch issue `{owner}/{repo}#{number}` from the GitHub REST API and
/// return a flat JSON summary of it.
///
/// # Errors
/// - `FetchError::Build` when the URL does not parse as an issue URL,
///   the issue does not exist (404), the unauthenticated rate limit is
///   hit (403), the API returns any other non-200 status, or the
///   number resolves to a pull request (callers should use the
///   `github_pr` vertical instead).
/// - `FetchError::BodyDecode` when the response body is not the
///   expected issue JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, repo, number) = parse_issue(url).ok_or_else(|| {
        FetchError::Build(format!("github_issue: cannot parse issue URL '{url}'"))
    })?;
    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/issues/{number}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "github_issue: issue '{owner}/{repo}#{number}' not found"
        )));
    }
    // NOTE(review): 403 is assumed to be the unauthenticated rate
    // limit; GitHub can also 403 for blocked/forbidden repos — the
    // message is a best-effort hint, not a guarantee.
    if resp.status == 403 {
        return Err(FetchError::Build(
            "github_issue: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
        ));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "github api returned status {}",
            resp.status
        )));
    }
    let issue: Issue = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("github issue parse: {e}")))?;
    // The same endpoint returns PRs too; reject if we got one so the caller
    // uses /v1/scrape/github_pr instead of getting a half-shaped payload.
    if issue.pull_request.is_some() {
        return Err(FetchError::Build(format!(
            "github_issue: '{owner}/{repo}#{number}' is a pull request, use /v1/scrape/github_pr"
        )));
    }
    // Flatten the nested API shape: author/assignee logins, label
    // names, and the milestone title are lifted to plain strings.
    Ok(json!({
        "url": url,
        "owner": owner,
        "repo": repo,
        "number": issue.number,
        "title": issue.title,
        "body": issue.body,
        "state": issue.state,
        "state_reason": issue.state_reason,
        "author": issue.user.as_ref().and_then(|u| u.login.clone()),
        "labels": issue.labels.iter().filter_map(|l| l.name.clone()).collect::<Vec<_>>(),
        "assignees": issue.assignees.iter().filter_map(|u| u.login.clone()).collect::<Vec<_>>(),
        "milestone": issue.milestone.as_ref().and_then(|m| m.title.clone()),
        "comments": issue.comments,
        "locked": issue.locked,
        "created_at": issue.created_at,
        "updated_at": issue.updated_at,
        "closed_at": issue.closed_at,
        "html_url": issue.html_url,
    }))
}
/// Parse `(owner, repo, number)` out of a GitHub issue URL.
///
/// Requires an explicit scheme (`…://`), ignores the query string,
/// fragment, and a trailing slash, and tolerates extra path segments
/// after the issue number.
fn parse_issue(url: &str) -> Option<(String, String, u64)> {
    // Drop "scheme://host", keeping only the path.
    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
    // Cut at `?` / `#`, then strip a trailing slash.
    let path = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segs = path.split('/').filter(|s| !s.is_empty());
    let owner = segs.next()?;
    let repo = segs.next()?;
    if segs.next()? != "issues" {
        return None;
    }
    let number: u64 = segs.next()?.parse().ok()?;
    Some((owner.to_owned(), repo.to_owned(), number))
}
// ---------------------------------------------------------------------------
// GitHub issue API types
// ---------------------------------------------------------------------------
/// Subset of the GitHub issue payload we consume. Every scalar field
/// is `Option` so absent/null JSON fields deserialize to `None`
/// instead of failing the whole parse.
#[derive(Deserialize)]
struct Issue {
    number: Option<i64>,
    title: Option<String>,
    body: Option<String>,
    state: Option<String>,
    state_reason: Option<String>,
    locked: Option<bool>,
    comments: Option<i64>,
    created_at: Option<String>,
    updated_at: Option<String>,
    closed_at: Option<String>,
    html_url: Option<String>,
    user: Option<UserRef>,
    // `default` so a missing array becomes empty rather than an error.
    #[serde(default)]
    labels: Vec<LabelRef>,
    #[serde(default)]
    assignees: Vec<UserRef>,
    milestone: Option<Milestone>,
    /// Present when this "issue" is actually a pull request. The REST
    /// API overloads the issues endpoint for PRs.
    pull_request: Option<serde_json::Value>,
}
/// Minimal user reference — only the login is surfaced.
#[derive(Deserialize)]
struct UserRef {
    login: Option<String>,
}
/// Minimal label reference — only the name is surfaced.
#[derive(Deserialize)]
struct LabelRef {
    name: Option<String>,
}
/// Minimal milestone reference — only the title is surfaced.
#[derive(Deserialize)]
struct Milestone {
    title: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_issue_urls() {
        // Accepted shapes.
        for url in [
            "https://github.com/rust-lang/rust/issues/100",
            "https://github.com/rust-lang/rust/issues/100/",
        ] {
            assert!(matches(url), "should match: {url}");
        }
        // Rejected shapes: repo root, PRs, and a missing number.
        for url in [
            "https://github.com/rust-lang/rust",
            "https://github.com/rust-lang/rust/pull/100",
            "https://github.com/rust-lang/rust/issues",
        ] {
            assert!(!matches(url), "should not match: {url}");
        }
    }

    #[test]
    fn parse_issue_extracts_owner_repo_number() {
        let expected = Some(("rust-lang".to_string(), "rust".to_string(), 100));
        assert_eq!(
            parse_issue("https://github.com/rust-lang/rust/issues/100"),
            expected
        );
        assert_eq!(
            parse_issue("https://github.com/rust-lang/rust/issues/100/?foo=bar"),
            expected
        );
    }
}

View file

@ -21,6 +21,7 @@ pub mod dev_to;
pub mod docker_hub; pub mod docker_hub;
pub mod ebay_listing; pub mod ebay_listing;
pub mod ecommerce_product; pub mod ecommerce_product;
pub mod github_issue;
pub mod github_pr; pub mod github_pr;
pub mod github_release; pub mod github_release;
pub mod github_repo; pub mod github_repo;
@ -33,9 +34,13 @@ pub mod linkedin_post;
pub mod npm; pub mod npm;
pub mod pypi; pub mod pypi;
pub mod reddit; pub mod reddit;
pub mod shopify_collection;
pub mod shopify_product; pub mod shopify_product;
pub mod stackoverflow; pub mod stackoverflow;
pub mod substack_post;
pub mod trustpilot_reviews; pub mod trustpilot_reviews;
pub mod woocommerce_product;
pub mod youtube_video;
use serde::Serialize; use serde::Serialize;
use serde_json::Value; use serde_json::Value;
@ -65,6 +70,7 @@ pub fn list() -> Vec<ExtractorInfo> {
hackernews::INFO, hackernews::INFO,
github_repo::INFO, github_repo::INFO,
github_pr::INFO, github_pr::INFO,
github_issue::INFO,
github_release::INFO, github_release::INFO,
pypi::INFO, pypi::INFO,
npm::INFO, npm::INFO,
@ -75,11 +81,15 @@ pub fn list() -> Vec<ExtractorInfo> {
docker_hub::INFO, docker_hub::INFO,
dev_to::INFO, dev_to::INFO,
stackoverflow::INFO, stackoverflow::INFO,
substack_post::INFO,
youtube_video::INFO,
linkedin_post::INFO, linkedin_post::INFO,
instagram_post::INFO, instagram_post::INFO,
instagram_profile::INFO, instagram_profile::INFO,
shopify_product::INFO, shopify_product::INFO,
shopify_collection::INFO,
ecommerce_product::INFO, ecommerce_product::INFO,
woocommerce_product::INFO,
amazon_product::INFO, amazon_product::INFO,
ebay_listing::INFO, ebay_listing::INFO,
trustpilot_reviews::INFO, trustpilot_reviews::INFO,
@ -131,6 +141,13 @@ pub async fn dispatch_by_url(
.map(|v| (github_pr::INFO.name, v)), .map(|v| (github_pr::INFO.name, v)),
); );
} }
if github_issue::matches(url) {
return Some(
github_issue::extract(client, url)
.await
.map(|v| (github_issue::INFO.name, v)),
);
}
if github_release::matches(url) { if github_release::matches(url) {
return Some( return Some(
github_release::extract(client, url) github_release::extract(client, url)
@ -233,7 +250,15 @@ pub async fn dispatch_by_url(
.map(|v| (trustpilot_reviews::INFO.name, v)), .map(|v| (trustpilot_reviews::INFO.name, v)),
); );
} }
// NOTE: shopify_product and ecommerce_product are intentionally NOT if youtube_video::matches(url) {
return Some(
youtube_video::extract(client, url)
.await
.map(|v| (youtube_video::INFO.name, v)),
);
}
// NOTE: shopify_product, shopify_collection, ecommerce_product,
// woocommerce_product, and substack_post are intentionally NOT
// in auto-dispatch. Their `matches()` functions are permissive // in auto-dispatch. Their `matches()` functions are permissive
// (any URL with `/products/`, `/product/`, `/p/`, etc.) and // (any URL with `/products/`, `/product/`, `/p/`, etc.) and
// claiming those generically would steal URLs from the default // claiming those generically would steal URLs from the default
@ -282,6 +307,12 @@ pub async fn dispatch_by_name(
}) })
.await .await
} }
n if n == github_issue::INFO.name => {
run_or_mismatch(github_issue::matches(url), n, url, || {
github_issue::extract(client, url)
})
.await
}
n if n == github_release::INFO.name => { n if n == github_release::INFO.name => {
run_or_mismatch(github_release::matches(url), n, url, || { run_or_mismatch(github_release::matches(url), n, url, || {
github_release::extract(client, url) github_release::extract(client, url)
@ -375,6 +406,30 @@ pub async fn dispatch_by_name(
}) })
.await .await
} }
n if n == youtube_video::INFO.name => {
run_or_mismatch(youtube_video::matches(url), n, url, || {
youtube_video::extract(client, url)
})
.await
}
n if n == substack_post::INFO.name => {
run_or_mismatch(substack_post::matches(url), n, url, || {
substack_post::extract(client, url)
})
.await
}
n if n == shopify_collection::INFO.name => {
run_or_mismatch(shopify_collection::matches(url), n, url, || {
shopify_collection::extract(client, url)
})
.await
}
n if n == woocommerce_product::INFO.name => {
run_or_mismatch(woocommerce_product::matches(url), n, url, || {
woocommerce_product::extract(client, url)
})
.await
}
_ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())), _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
} }
} }

View file

@ -0,0 +1,242 @@
//! Shopify collection structured extractor.
//!
//! Every Shopify store exposes `/collections/{handle}.json` and
//! `/collections/{handle}/products.json` on the public surface. This
//! extractor hits `.json` (collection metadata) and falls through to
//! `/products.json` for the first page of products. Same caveat as
//! `shopify_product`: stores with Cloudflare in front of the shop
//! will 403 the public path.
//!
//! Explicit-call only (like `shopify_product`). `/collections/{slug}`
//! is a URL shape used by non-Shopify stores too, so auto-dispatch
//! would claim too many URLs.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor. Listed in `/v1/scrape` but
/// not auto-dispatched — see the module docs for why the
/// `/collections/{handle}` URL shape is too generic to claim.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "shopify_collection",
    label: "Shopify collection",
    description: "Returns collection metadata + first page of products (handle, title, vendor, price, available) on ANY Shopify store via /collections/{handle}.json + /products.json.",
    url_patterns: &[
        "https://{shop}/collections/{handle}",
        "https://{shop}.myshopify.com/collections/{handle}",
    ],
};
/// Permissive Shopify-collection URL check (explicit-call only).
///
/// Rejects known non-Shopify hosts, then requires a non-empty
/// collection handle after `/collections/`. The query string and
/// fragment are stripped first, so handle-less URLs such as
/// `/collections/?page=2` or `/collections/#top` are rejected —
/// previously only the bare trailing `/collections/` form was,
/// letting those slip through to `extract` with no handle.
pub fn matches(url: &str) -> bool {
    let host = host_of(url);
    if host.is_empty() || NON_SHOPIFY_HOSTS.iter().any(|h| host.ends_with(h)) {
        return false;
    }
    // Only the path counts toward the handle check.
    let path = url.split(['?', '#']).next().unwrap_or(url);
    match path.split_once("/collections/") {
        Some((_, handle)) => !handle.trim_end_matches('/').is_empty(),
        None => false,
    }
}
/// Hosts that commonly carry `/collections/` URLs but are known not
/// to be Shopify storefronts. Matched by suffix against the URL host
/// in `matches`, so subdomains are covered too.
const NON_SHOPIFY_HOSTS: &[&str] = &[
    "amazon.com",
    "amazon.co.uk",
    "amazon.de",
    "ebay.com",
    "etsy.com",
    "walmart.com",
    "target.com",
    "aliexpress.com",
    "huggingface.co", // has /collections/ for models
    "github.com",
];
/// Fetch collection metadata plus the first page of products from the
/// public Shopify JSON surface.
///
/// Two requests: `/collections/{handle}.json` (required — 404/403 or
/// non-JSON aborts with a descriptive error) and
/// `/collections/{handle}/products.json?limit=50` (best-effort — any
/// failure there degrades to an empty product list rather than
/// erroring, since the metadata alone is still useful).
///
/// # Errors
/// - `FetchError::Build` for 404 (missing collection), 403 (antibot
///   in front of the store), or any other non-200 metadata status.
/// - `FetchError::BodyDecode` when the metadata body is not Shopify
///   collection JSON (likely not a Shopify store at all).
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (coll_meta_url, coll_products_url) = build_json_urls(url);
    // Step 1: collection metadata. Shopify returns 200 on missing
    // collections sometimes; check "collection" key below.
    let meta_resp = client.fetch(&coll_meta_url).await?;
    if meta_resp.status == 404 {
        return Err(FetchError::Build(format!(
            "shopify_collection: '{url}' not found"
        )));
    }
    if meta_resp.status == 403 {
        return Err(FetchError::Build(format!(
            "shopify_collection: {coll_meta_url} returned 403. The store has antibot in front of the .json endpoint. Use /v1/scrape/ecommerce_product or api.webclaw.io for this store."
        )));
    }
    if meta_resp.status != 200 {
        return Err(FetchError::Build(format!(
            "shopify returned status {} for {coll_meta_url}",
            meta_resp.status
        )));
    }
    let meta: MetaWrapper = serde_json::from_str(&meta_resp.html).map_err(|e| {
        FetchError::BodyDecode(format!(
            "shopify_collection: '{url}' didn't return Shopify JSON, likely not a Shopify store ({e})"
        ))
    })?;
    // Step 2: first page of products for this collection. Best-effort:
    // non-200 or unparseable bodies yield an empty list, not an error.
    let products = match client.fetch(&coll_products_url).await {
        Ok(r) if r.status == 200 => serde_json::from_str::<ProductsWrapper>(&r.html)
            .ok()
            .map(|pw| pw.products)
            .unwrap_or_default(),
        _ => Vec::new(),
    };
    // Per-product summary: price comes from the first variant,
    // availability is true when ANY variant is available.
    let product_summaries: Vec<Value> = products
        .iter()
        .map(|p| {
            let first_variant = p.variants.first();
            json!({
                "id": p.id,
                "handle": p.handle,
                "title": p.title,
                "vendor": p.vendor,
                "product_type": p.product_type,
                "price": first_variant.and_then(|v| v.price.clone()),
                "compare_at_price": first_variant.and_then(|v| v.compare_at_price.clone()),
                "available": p.variants.iter().any(|v| v.available.unwrap_or(false)),
                "variant_count": p.variants.len(),
                "image": p.images.first().and_then(|i| i.src.clone()),
                "created_at": p.created_at,
                "updated_at": p.updated_at,
            })
        })
        .collect();
    let c = meta.collection;
    Ok(json!({
        "url": url,
        "meta_json_url": coll_meta_url,
        "products_json_url": coll_products_url,
        "collection_id": c.id,
        "handle": c.handle,
        "title": c.title,
        "description_html": c.body_html,
        "published_at": c.published_at,
        "updated_at": c.updated_at,
        "sort_order": c.sort_order,
        "products_in_page": product_summaries.len(),
        "products": product_summaries,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Host portion of `url` — everything between `://` and the first
/// `/` — or the empty string when the URL is empty.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    after_scheme.split('/').next().unwrap_or("")
}
/// Build `(collection.json, collection/products.json)` from a user URL.
///
/// Strips both the query string AND the fragment before deriving the
/// API paths — the previous version only cut at `?`, so a URL like
/// `/collections/mens#reviews` produced the broken
/// `/collections/mens#reviews.json`. Also tolerates a trailing slash
/// or an explicit `.json` suffix already present in the input.
fn build_json_urls(url: &str) -> (String, String) {
    // Keep scheme + host + path only; drop `?query` and `#fragment`.
    let path_part = url.split(['?', '#']).next().unwrap_or(url);
    let clean = path_part.trim_end_matches('/').trim_end_matches(".json");
    (
        format!("{clean}.json"),
        format!("{clean}/products.json?limit=50"),
    )
}
// ---------------------------------------------------------------------------
// Shopify collection + product JSON shapes (subsets)
// ---------------------------------------------------------------------------
/// `/collections/{handle}.json` wraps the payload in a top-level
/// `collection` object; this is required (no `Option`) so a non-Shopify
/// JSON body fails deserialization and produces the clean error above.
#[derive(Deserialize)]
struct MetaWrapper {
    collection: Collection,
}
/// Collection metadata subset. All fields optional so partial
/// payloads still parse.
#[derive(Deserialize)]
struct Collection {
    id: Option<i64>,
    handle: Option<String>,
    title: Option<String>,
    body_html: Option<String>,
    published_at: Option<String>,
    updated_at: Option<String>,
    sort_order: Option<String>,
}
/// `/products.json` wrapper; `default` so a missing array is empty.
#[derive(Deserialize)]
struct ProductsWrapper {
    #[serde(default)]
    products: Vec<ProductSummary>,
}
/// Per-product subset used to build the summary JSON.
#[derive(Deserialize)]
struct ProductSummary {
    id: Option<i64>,
    handle: Option<String>,
    title: Option<String>,
    vendor: Option<String>,
    product_type: Option<String>,
    created_at: Option<String>,
    updated_at: Option<String>,
    #[serde(default)]
    variants: Vec<VariantSummary>,
    #[serde(default)]
    images: Vec<ImageSummary>,
}
/// Variant subset: price fields arrive as strings from Shopify.
#[derive(Deserialize)]
struct VariantSummary {
    price: Option<String>,
    compare_at_price: Option<String>,
    available: Option<bool>,
}
/// Image subset — only the source URL is surfaced.
#[derive(Deserialize)]
struct ImageSummary {
    src: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_shopify_collection_urls() {
        for url in [
            "https://www.allbirds.com/collections/mens",
            "https://shop.example.com/collections/new-arrivals?page=2",
        ] {
            assert!(matches(url), "expected match: {url}");
        }
    }

    #[test]
    fn rejects_non_shopify() {
        for url in [
            "https://github.com/collections/foo",
            "https://huggingface.co/collections/foo",
            "https://example.com/",
            "https://example.com/collections/",
        ] {
            assert!(!matches(url), "expected no match: {url}");
        }
    }

    #[test]
    fn build_json_urls_derives_both_paths() {
        let (meta, products) = build_json_urls("https://shop.example.com/collections/mens");
        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
        assert_eq!(
            products,
            "https://shop.example.com/collections/mens/products.json?limit=50"
        );
    }

    #[test]
    fn build_json_urls_handles_trailing_slash() {
        let (meta, _) = build_json_urls("https://shop.example.com/collections/mens/");
        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
    }
}

View file

@ -0,0 +1,213 @@
//! Substack post extractor.
//!
//! Every Substack publication exposes `/api/v1/posts/{slug}` that
//! returns the full post as JSON: body HTML, cover image, author,
//! publication info, reactions, paywall state. No auth on public
//! posts.
//!
//! Works on both `*.substack.com` subdomains and custom domains
//! (e.g. `simonwillison.net` uses Substack too). Detection is
//! "URL has `/p/{slug}`" because that's the canonical Substack post
//! path. Explicit-call only because the `/p/{slug}` URL shape is
//! used by non-Substack sites too.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor. Explicit-call only — the
/// `/p/{slug}` shape is used by non-Substack sites too (see module
/// docs).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "substack_post",
    label: "Substack post",
    description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.",
    url_patterns: &[
        "https://{pub}.substack.com/p/{slug}",
        "https://{custom-domain}/p/{slug}",
    ],
};
pub fn matches(url: &str) -> bool {
if !(url.starts_with("http://") || url.starts_with("https://")) {
return false;
}
url.contains("/p/")
}
/// Fetch a post from the publication's own `/api/v1/posts/{slug}`
/// endpoint (works on `*.substack.com` and custom domains alike) and
/// return a flat JSON summary.
///
/// The request is made against the ORIGINAL URL's host and scheme, so
/// custom-domain publications resolve naturally.
///
/// # Errors
/// - `FetchError::Build` when no slug can be parsed, the host is
///   empty, the post is missing (404), or the API returns any other
///   non-200 status.
/// - `FetchError::BodyDecode` when the body is not Substack post JSON
///   (likely not a Substack publication).
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let slug = parse_slug(url).ok_or_else(|| {
        FetchError::Build(format!("substack_post: cannot parse slug from '{url}'"))
    })?;
    let host = host_of(url);
    if host.is_empty() {
        return Err(FetchError::Build(format!(
            "substack_post: empty host in '{url}'"
        )));
    }
    // Preserve the caller's scheme; anything that isn't explicit
    // http:// is requested over https.
    let scheme = if url.starts_with("http://") {
        "http"
    } else {
        "https"
    };
    let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "substack_post: '{slug}' not found on {host} (got 404). \
             If the publication isn't actually on Substack, use /v1/scrape instead."
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "substack returned status {} for {api_url}",
            resp.status
        )));
    }
    let p: Post = serde_json::from_str(&resp.html).map_err(|e| {
        FetchError::BodyDecode(format!(
            "substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})"
        ))
    })?;
    Ok(json!({
        "url": url,
        "api_url": api_url,
        "id": p.id,
        "type": p.r#type,
        "slug": p.slug,
        "title": p.title,
        "subtitle": p.subtitle,
        "description": p.description,
        "canonical_url": p.canonical_url,
        "post_date": p.post_date,
        "updated_at": p.updated_at,
        "audience": p.audience,
        // Derived flag: paid-only audiences imply a paywall.
        "has_paywall": matches!(p.audience.as_deref(), Some("only_paid") | Some("founding")),
        "is_free_preview": p.is_free_preview,
        "cover_image": p.cover_image,
        "word_count": p.wordcount,
        "reactions": p.reactions,
        "comment_count": p.comment_count,
        "body_html": p.body_html,
        // Prefer the truncated text when present (paywalled posts),
        // falling back to the full body text.
        "body_text": p.truncated_body_text.or(p.body_text),
        "publication": json!({
            "id": p.publication.as_ref().and_then(|pub_| pub_.id),
            "name": p.publication.as_ref().and_then(|pub_| pub_.name.clone()),
            "subdomain": p.publication.as_ref().and_then(|pub_| pub_.subdomain.clone()),
            "custom_domain": p.publication.as_ref().and_then(|pub_| pub_.custom_domain.clone()),
        }),
        "authors": p.published_bylines.iter().map(|a| json!({
            "id": a.id,
            "name": a.name,
            "handle": a.handle,
            "photo": a.photo_url,
        })).collect::<Vec<_>>(),
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Host portion of `url`, or `""` when none can be found.
fn host_of(url: &str) -> &str {
    let tail = url.split("://").nth(1).unwrap_or(url);
    match tail.find('/') {
        Some(i) => &tail[..i],
        None => tail,
    }
}
/// Slug component after `/p/`, with query string, fragment, and any
/// further path segments removed. `None` when absent or empty.
fn parse_slug(url: &str) -> Option<String> {
    let (_, tail) = url.split_once("/p/")?;
    // The slug ends at the first of `?`, `#`, or `/`.
    let slug = tail.split(['?', '#', '/']).next().unwrap_or("");
    (!slug.is_empty()).then(|| slug.to_string())
}
// ---------------------------------------------------------------------------
// Substack API types (subset)
// ---------------------------------------------------------------------------
/// Subset of the Substack post payload. All scalars `Option` so
/// missing/null fields never fail the parse; `r#type` escapes the
/// Rust keyword while keeping the JSON key `type`.
#[derive(Deserialize)]
struct Post {
    id: Option<i64>,
    r#type: Option<String>,
    slug: Option<String>,
    title: Option<String>,
    subtitle: Option<String>,
    description: Option<String>,
    canonical_url: Option<String>,
    post_date: Option<String>,
    updated_at: Option<String>,
    audience: Option<String>,
    is_free_preview: Option<bool>,
    cover_image: Option<String>,
    wordcount: Option<i64>,
    // Kept as raw JSON: the reactions shape is passed through as-is.
    reactions: Option<serde_json::Value>,
    comment_count: Option<i64>,
    body_html: Option<String>,
    body_text: Option<String>,
    truncated_body_text: Option<String>,
    publication: Option<Publication>,
    // Substack uses camelCase for this one field; rename + default
    // so a missing array is simply empty.
    #[serde(default, rename = "publishedBylines")]
    published_bylines: Vec<Byline>,
}
/// Publication subset: identity and the two domain fields.
#[derive(Deserialize)]
struct Publication {
    id: Option<i64>,
    name: Option<String>,
    subdomain: Option<String>,
    custom_domain: Option<String>,
}
/// Author byline subset.
#[derive(Deserialize)]
struct Byline {
    id: Option<i64>,
    name: Option<String>,
    handle: Option<String>,
    photo_url: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_post_urls() {
        for url in [
            "https://stratechery.substack.com/p/the-tech-letter",
            "https://simonwillison.net/p/2024-08-01-something",
        ] {
            assert!(matches(url), "expected match: {url}");
        }
        // No /p/ path, and a non-http scheme, must be rejected.
        for url in ["https://example.com/", "ftp://example.com/p/foo"] {
            assert!(!matches(url), "expected no match: {url}");
        }
    }

    #[test]
    fn parse_slug_strips_query_and_trailing_slash() {
        let expected = Some("my-post".to_string());
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post"),
            expected
        );
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post/"),
            expected
        );
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post?ref=123"),
            expected
        );
    }
}

View file

@ -0,0 +1,237 @@
//! WooCommerce product structured extractor.
//!
//! Targets WooCommerce's Store API: `/wp-json/wc/store/v1/products?slug={slug}`.
//! About 30-50% of WooCommerce stores expose this endpoint publicly
//! (it's on by default, but common security plugins disable it).
//! When it's off, the server returns 404 at /wp-json. We surface a
//! clean error and point callers at `/v1/scrape/ecommerce_product`
//! which works on any store with Schema.org JSON-LD.
//!
//! Explicit-call only. `/product/{slug}` is the default permalink for
//! WooCommerce but custom stores use every variation imaginable, so
//! auto-dispatch is unreliable.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor. Explicit-call only — see the
/// module docs for why the permalink shapes are too generic to
/// auto-dispatch.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "woocommerce_product",
    label: "WooCommerce product",
    description: "Returns product via the WooCommerce Store REST API (requires the /wp-json/wc/store endpoint to be enabled on the target store).",
    url_patterns: &[
        "https://{shop}/product/{slug}",
        "https://{shop}/shop/{slug}",
    ],
};
/// Loose permalink check for WooCommerce product URLs.
///
/// Deliberately permissive: stores use custom domains and custom
/// permalinks, so the Store-API probe in `extract` is the real
/// confirmation that the target is WooCommerce.
pub fn matches(url: &str) -> bool {
    if host_of(url).is_empty() {
        return false;
    }
    // Default permalink, the shop base, and the common es/fr locales.
    ["/product/", "/shop/", "/producto/", "/produit/"]
        .iter()
        .any(|needle| url.contains(needle))
}
/// Look up a product by slug via the WooCommerce Store API
/// (`/wp-json/wc/store/v1/products?slug={slug}&per_page=1`) on the
/// original URL's host and return a flat JSON summary.
///
/// # Errors
/// - `FetchError::Build` when no slug can be parsed, the host is
///   empty, the store does not expose the Store API (404), the
///   endpoint requires auth (401/403), the API returns any other
///   non-200 status, or the slug matches no product.
/// - `FetchError::BodyDecode` when the body is not the expected
///   product array JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let slug = parse_slug(url).ok_or_else(|| {
        FetchError::Build(format!(
            "woocommerce_product: cannot parse slug from '{url}'"
        ))
    })?;
    let host = host_of(url);
    if host.is_empty() {
        return Err(FetchError::Build(format!(
            "woocommerce_product: empty host in '{url}'"
        )));
    }
    // Preserve the caller's scheme; default to https.
    let scheme = if url.starts_with("http://") {
        "http"
    } else {
        "https"
    };
    let api_url = format!("{scheme}://{host}/wp-json/wc/store/v1/products?slug={slug}&per_page=1");
    let resp = client.fetch(&api_url).await?;
    // 404 here means the whole Store API surface is missing/disabled,
    // not that the product is absent — point callers at the JSON-LD
    // fallback extractor.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "woocommerce_product: {host} does not expose /wp-json/wc/store (404). \
             Use /v1/scrape/ecommerce_product for JSON-LD fallback."
        )));
    }
    if resp.status == 401 || resp.status == 403 {
        return Err(FetchError::Build(format!(
            "woocommerce_product: {host} requires auth for /wp-json/wc/store ({}). \
             Use /v1/scrape/ecommerce_product for the public JSON-LD fallback.",
            resp.status
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "woocommerce api returned status {} for {api_url}",
            resp.status
        )));
    }
    // The slug query returns an array; a matching store yields 0 or 1
    // entries (per_page=1).
    let products: Vec<Product> = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("woocommerce parse: {e}")))?;
    let p = products.into_iter().next().ok_or_else(|| {
        FetchError::Build(format!(
            "woocommerce_product: no product found for slug '{slug}' on {host}"
        ))
    })?;
    let images: Vec<Value> = p
        .images
        .iter()
        .map(|i| json!({"src": i.src, "thumbnail": i.thumbnail, "alt": i.alt}))
        .collect();
    let variations_count = p.variations.as_ref().map(|v| v.len()).unwrap_or(0);
    Ok(json!({
        "url": url,
        "api_url": api_url,
        "product_id": p.id,
        "name": p.name,
        "slug": p.slug,
        "sku": p.sku,
        "permalink": p.permalink,
        "on_sale": p.on_sale,
        "in_stock": p.is_in_stock,
        "is_purchasable": p.is_purchasable,
        "price": p.prices.as_ref().and_then(|pr| pr.price.clone()),
        "regular_price": p.prices.as_ref().and_then(|pr| pr.regular_price.clone()),
        "sale_price": p.prices.as_ref().and_then(|pr| pr.sale_price.clone()),
        "currency": p.prices.as_ref().and_then(|pr| pr.currency_code.clone()),
        "currency_minor": p.prices.as_ref().and_then(|pr| pr.currency_minor_unit),
        "price_range": p.prices.as_ref().and_then(|pr| pr.price_range.clone()),
        "average_rating": p.average_rating,
        "review_count": p.review_count,
        "description": p.description,
        "short_description": p.short_description,
        "categories": p.categories.iter().filter_map(|c| c.name.clone()).collect::<Vec<_>>(),
        "tags": p.tags.iter().filter_map(|t| t.name.clone()).collect::<Vec<_>>(),
        "variation_count": variations_count,
        "image_count": images.len(),
        "images": images,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Host portion of `url` — the text between `://` and the first `/` —
/// or `""` when the URL is empty.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let rest = pieces.next().unwrap_or(url);
    rest.split('/').next().unwrap_or("")
}
/// Extract the product slug from common WooCommerce permalinks.
///
/// Tries each known permalink base in order and returns the first
/// non-empty path segment that follows it (query string, fragment,
/// trailing slash, and deeper segments stripped). A base whose slug
/// turns out empty does not abort the search — later bases still get
/// a chance.
fn parse_slug(url: &str) -> Option<String> {
    const BASES: [&str; 4] = ["/product/", "/shop/", "/producto/", "/produit/"];
    BASES.iter().find_map(|base| {
        let tail = url.split(base).nth(1)?;
        let slug = tail
            .split(['?', '#'])
            .next()?
            .trim_end_matches('/')
            .split('/')
            .next()
            .unwrap_or("");
        (!slug.is_empty()).then(|| slug.to_string())
    })
}
// ---------------------------------------------------------------------------
// Store API types (subset of the full response)
// ---------------------------------------------------------------------------
/// Subset of the Store API product payload. All scalars `Option` so
/// partial payloads parse; arrays default to empty.
#[derive(Deserialize)]
struct Product {
    id: Option<i64>,
    name: Option<String>,
    slug: Option<String>,
    sku: Option<String>,
    permalink: Option<String>,
    description: Option<String>,
    short_description: Option<String>,
    on_sale: Option<bool>,
    is_in_stock: Option<bool>,
    is_purchasable: Option<bool>,
    average_rating: Option<serde_json::Value>, // string or number
    review_count: Option<i64>,
    prices: Option<Prices>,
    #[serde(default)]
    categories: Vec<Term>,
    #[serde(default)]
    tags: Vec<Term>,
    #[serde(default)]
    images: Vec<Img>,
    // Only the count is surfaced, so the entries stay untyped.
    variations: Option<Vec<serde_json::Value>>,
}
/// Price block: amounts arrive as strings in minor units alongside
/// `currency_minor_unit`; `price_range` is passed through untyped.
#[derive(Deserialize)]
struct Prices {
    price: Option<String>,
    regular_price: Option<String>,
    sale_price: Option<String>,
    currency_code: Option<String>,
    currency_minor_unit: Option<i64>,
    price_range: Option<serde_json::Value>,
}
/// Category/tag reference — only the name is surfaced.
#[derive(Deserialize)]
struct Term {
    name: Option<String>,
}
/// Image reference subset.
#[derive(Deserialize)]
struct Img {
    src: Option<String>,
    thumbnail: Option<String>,
    alt: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_common_permalinks() {
        for url in [
            "https://shop.example.com/product/cool-widget",
            "https://shop.example.com/shop/cool-widget",
            "https://tienda.example.com/producto/cosa",
            "https://boutique.example.com/produit/chose",
        ] {
            assert!(matches(url), "expected match: {url}");
        }
    }

    #[test]
    fn parse_slug_handles_locale_and_suffix() {
        assert_eq!(
            parse_slug("https://shop.example.com/product/cool-widget"),
            Some("cool-widget".into())
        );
        assert_eq!(
            parse_slug("https://shop.example.com/product/cool-widget/?attr=red"),
            Some("cool-widget".into())
        );
        assert_eq!(
            parse_slug("https://tienda.example.com/producto/cosa/"),
            Some("cosa".into())
        );
    }
}

View file

@ -0,0 +1,255 @@
//! YouTube video structured extractor.
//!
//! YouTube embeds the full player configuration in a
//! `ytInitialPlayerResponse` JavaScript assignment at the top of
//! every `/watch`, `/shorts`, and `youtu.be` HTML page. We reuse the
//! core crate's already-proven regex + parse to surface typed JSON
//! from it: video id, title, author + channel id, view count,
//! duration, upload date, keywords, thumbnails, caption-track URLs.
//!
//! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/`
//! shape is stable.
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor. Auto-dispatched: the YouTube
/// host is unique and the URL shapes are stable.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "youtube_video",
    label: "YouTube video",
    description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs.",
    url_patterns: &[
        "https://www.youtube.com/watch?v={id}",
        "https://youtu.be/{id}",
        "https://www.youtube.com/shorts/{id}",
    ],
};
/// True for anything this extractor can handle: whatever the core
/// crate's YouTube predicate accepts, plus Shorts URLs and
/// youtube-nocookie embeds.
pub fn matches(url: &str) -> bool {
    if webclaw_core::youtube::is_youtube_url(url) {
        return true;
    }
    url.contains("youtube.com/shorts/") || url.contains("youtube-nocookie.com/embed/")
}
/// Fetch the canonical `/watch` page for the video and surface typed
/// JSON from its embedded `ytInitialPlayerResponse` blob.
///
/// # Errors
/// - `FetchError::Build` when no video id can be parsed or the watch
///   page returns a non-200 status.
/// - `FetchError::BodyDecode` when the page has no player-response
///   blob (private, region-blocked, or removed videos).
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let video_id = parse_video_id(url).ok_or_else(|| {
        FetchError::Build(format!("youtube_video: cannot parse video id from '{url}'"))
    })?;
    // Always fetch the canonical /watch URL. /shorts/ and youtu.be
    // sometimes serve a thinner page without the player blob.
    let canonical = format!("https://www.youtube.com/watch?v={video_id}");
    let resp = client.fetch(&canonical).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "youtube returned status {} for {canonical}",
            resp.status
        )));
    }
    let player = extract_player_response(&resp.html).ok_or_else(|| {
        FetchError::BodyDecode(format!(
            "youtube_video: no ytInitialPlayerResponse on {canonical} (video may be private, region-blocked, or removed)"
        ))
    })?;
    // The two sub-objects we read from the blob: `videoDetails` for
    // core metadata, `microformat.playerMicroformatRenderer` for
    // category/dates/unlisted state.
    let video_details = player.get("videoDetails");
    let microformat = player
        .get("microformat")
        .and_then(|m| m.get("playerMicroformatRenderer"));
    let thumbnails: Vec<Value> = video_details
        .and_then(|vd| vd.get("thumbnail"))
        .and_then(|t| t.get("thumbnails"))
        .and_then(|t| t.as_array())
        .cloned()
        .unwrap_or_default();
    let keywords: Vec<Value> = video_details
        .and_then(|vd| vd.get("keywords"))
        .and_then(|k| k.as_array())
        .cloned()
        .unwrap_or_default();
    // Caption-track discovery is delegated to the core crate, which
    // parses the same HTML.
    let caption_tracks = webclaw_core::youtube::extract_caption_tracks(&resp.html);
    let captions: Vec<Value> = caption_tracks
        .iter()
        .map(|c| {
            json!({
                "url": c.url,
                "lang": c.lang,
                "name": c.name,
            })
        })
        .collect();
    Ok(json!({
        "url": url,
        "canonical_url": canonical,
        "video_id": video_id,
        "title": get_str(video_details, "title"),
        "description": get_str(video_details, "shortDescription"),
        "author": get_str(video_details, "author"),
        "channel_id": get_str(video_details, "channelId"),
        "channel_url": get_str(microformat, "ownerProfileUrl"),
        "view_count": get_int(video_details, "viewCount"),
        "length_seconds": get_int(video_details, "lengthSeconds"),
        "is_live": video_details.and_then(|vd| vd.get("isLiveContent")).and_then(|v| v.as_bool()),
        "is_private": video_details.and_then(|vd| vd.get("isPrivate")).and_then(|v| v.as_bool()),
        "is_unlisted": microformat.and_then(|m| m.get("isUnlisted")).and_then(|v| v.as_bool()),
        "allow_ratings": video_details.and_then(|vd| vd.get("allowRatings")).and_then(|v| v.as_bool()),
        "category": get_str(microformat, "category"),
        "upload_date": get_str(microformat, "uploadDate"),
        "publish_date": get_str(microformat, "publishDate"),
        "keywords": keywords,
        "thumbnails": thumbnails,
        "caption_tracks": captions,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Extracts the video id from any supported YouTube URL shape:
///
/// - `youtu.be/{id}`
/// - `youtube.com/shorts/{id}`
/// - `{host}/embed/{id}` (youtube.com and youtube-nocookie.com)
/// - `youtube.com/watch?v={id}` (`v` may sit anywhere in the query string)
///
/// Returns `None` when no non-empty id is present. Host filtering is not
/// done here — `matches` rejects non-YouTube hosts before this runs.
fn parse_video_id(url: &str) -> Option<String> {
    // Path-based shapes: the id immediately follows a known marker.
    // (Previously this split/trim logic was repeated three times, each
    // copy with a dead `trim_end_matches('/')` — the segment is cut at
    // the first '/', so it can never end with one.)
    for marker in ["youtu.be/", "youtube.com/shorts/", "/embed/"] {
        if let Some(after) = url.split(marker).nth(1)
            && let Some(id) = first_path_segment(after)
        {
            return Some(id);
        }
    }
    // Query-based shape: youtube.com/watch?v={id} (also matches
    // youtube.com/watch?foo=bar&v={id}).
    let query = url.split_once('?').map(|(_, q)| q)?;
    let raw = query.split('&').find_map(|p| p.strip_prefix("v="))?;
    // A fragment may trail the value (`?v=abc#t=10`); cut it off.
    let id = raw.split(['#', '/']).next().unwrap_or(raw);
    (!id.is_empty()).then(|| id.to_string())
}

/// Leading path segment of `rest`: everything before the first `?`, `#`,
/// or `/`. Returns `None` when that segment is empty.
fn first_path_segment(rest: &str) -> Option<String> {
    let seg = rest.split(['?', '#', '/']).next().unwrap_or("");
    (!seg.is_empty()).then(|| seg.to_string())
}
// ---------------------------------------------------------------------------
// Player-response parsing
// ---------------------------------------------------------------------------
/// Pulls the `ytInitialPlayerResponse` JSON blob out of a watch-page HTML
/// document and parses it into a [`Value`].
///
/// Same regex as `webclaw_core::youtube`. Duplicated here because core's
/// regex is module-private. Kept in lockstep; changes are rare and we
/// cover with tests in both places.
///
/// Returns `None` when the blob is absent or does not parse as JSON.
fn extract_player_response(html: &str) -> Option<Value> {
    use regex::Regex;
    use std::sync::OnceLock;
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| {
        Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap()
    });
    let caps = re.captures(html)?;
    let blob = caps.get(1)?.as_str();
    serde_json::from_str(blob).ok()
}
/// Reads `key` from an optional JSON object as an owned string. `None`
/// when the container is absent, the key is missing, or the value is not
/// a JSON string.
fn get_str(v: Option<&Value>, key: &str) -> Option<String> {
    let field = v?.get(key)?;
    field.as_str().map(String::from)
}
/// Reads `key` from an optional JSON object as an `i64`, accepting either
/// a JSON number or a numeric string (YouTube serializes counts such as
/// `viewCount` as strings — see the test fixture).
fn get_int(v: Option<&Value>, key: &str) -> Option<i64> {
    let field = v?.get(key)?;
    if let Some(n) = field.as_i64() {
        return Some(n);
    }
    field.as_str()?.parse::<i64>().ok()
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_watch_urls() {
        // Every supported URL shape should be accepted.
        let accepted = [
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtu.be/dQw4w9WgXcQ",
            "https://www.youtube.com/shorts/abc123",
            "https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ",
        ];
        for url in accepted {
            assert!(matches(url));
        }
    }

    #[test]
    fn rejects_non_video_urls() {
        // Non-video YouTube pages and foreign hosts must not dispatch here.
        let rejected = [
            "https://www.youtube.com/",
            "https://www.youtube.com/channel/abc",
            "https://example.com/watch?v=abc",
        ];
        for url in rejected {
            assert!(!matches(url));
        }
    }

    #[test]
    fn parse_video_id_from_each_shape() {
        let cases = [
            ("https://www.youtube.com/watch?v=dQw4w9WgXcQ", "dQw4w9WgXcQ"),
            ("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s", "dQw4w9WgXcQ"),
            (
                "https://www.youtube.com/watch?feature=share&v=dQw4w9WgXcQ",
                "dQw4w9WgXcQ",
            ),
            ("https://youtu.be/dQw4w9WgXcQ", "dQw4w9WgXcQ"),
            ("https://youtu.be/dQw4w9WgXcQ?t=30", "dQw4w9WgXcQ"),
            ("https://www.youtube.com/shorts/abc123", "abc123"),
        ];
        for (url, expected) in cases {
            assert_eq!(parse_video_id(url), Some(expected.to_string()));
        }
    }

    #[test]
    fn extract_player_response_happy_path() {
        let html = r#"
<html><body>
<script>
var ytInitialPlayerResponse = {"videoDetails":{"videoId":"abc","title":"T","author":"A","viewCount":"100","lengthSeconds":"60","shortDescription":"d"}};
</script>
</body></html>
"#;
        let player = extract_player_response(html).unwrap();
        let details = player.get("videoDetails").unwrap();
        assert_eq!(details.get("title").unwrap().as_str(), Some("T"));
    }
}