feat(extractors): wave 6b, etsy_listing + HTML fallbacks for substack/youtube

Adds etsy_listing and hardens two existing extractors with HTML fallbacks
so transient API failures still return useful data.

New:
- etsy_listing: /listing/{id}(/slug) with Schema.org Product JSON-LD +
  OG fallback. Antibot-gated, routes through cloud::smart_fetch_html
  like amazon_product and ebay_listing. Auto-dispatched (etsy host is
  unique).

Hardened:
- substack_post: when /api/v1/posts/{slug} returns non-200 (rate limit,
  403 on hardened custom domains, 5xx), fall back to HTML fetch and
  parse OG tags + Article JSON-LD. Response shape is stable across
  both paths, with a `data_source` field of "api" or "html_fallback".
- youtube_video: when ytInitialPlayerResponse is missing (EU-consent
  interstitial, age-gated, some live pre-shows), fall back to OG tags
  for title/description/thumbnail. `data_source` now "player_response"
  or "og_fallback".

Tests: 91 passing in webclaw-fetch (9 new), clippy clean.
This commit is contained in:
Valerio 2026-04-22 16:44:51 +02:00
parent 8cc727c2f2
commit 7f5eb93b65
4 changed files with 910 additions and 29 deletions

View file

@ -0,0 +1,391 @@
//! Etsy listing extractor.
//!
//! Etsy product pages at `etsy.com/listing/{id}` (and a sluggy variant
//! `etsy.com/listing/{id}/{slug}`) ship a Schema.org `Product` JSON-LD
//! block with title, price, currency, availability, shop seller, and
//! an `AggregateRating` for the listing.
//!
//! Etsy puts Cloudflare + custom WAF in front of product pages with a
//! high variance: the Firefox profile gets clean HTML most of the time
//! but some listings return a CF interstitial. We route through
//! `cloud::smart_fetch_html` so both paths resolve to the same parser,
//! same as `ebay_listing`.
use std::sync::OnceLock;
use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
/// Registry metadata for this extractor: stable name (used by
/// `dispatch_by_name`), UI label, capability description, and the URL
/// shapes it handles (shown in docs/help output).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "etsy_listing",
    label: "Etsy listing",
    description: "Returns listing title, price, currency, availability, shop, rating, and image. Heavy listings may need WEBCLAW_API_KEY for antibot.",
    url_patterns: &[
        "https://www.etsy.com/listing/{id}",
        "https://www.etsy.com/listing/{id}/{slug}",
        "https://www.etsy.com/{locale}/listing/{id}",
    ],
};
/// True when `url` points at an Etsy host and carries a numeric
/// `/listing/{id}` segment — the gate for auto-dispatch.
pub fn matches(url: &str) -> bool {
    is_etsy_host(host_of(url)) && parse_listing_id(url).is_some()
}
/// Fetch an Etsy listing page and parse it into a JSON payload.
///
/// Routes through `cloud::smart_fetch_html` so antibot-gated pages can
/// escalate to the cloud fetcher; both paths feed the same `parse`.
/// A `data_source` field ("local" or "cloud") records which fetch path
/// produced the HTML.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    // The listing id is part of the payload; bail early if the URL has
    // none (shouldn't happen when the call went through `matches`).
    let listing_id = parse_listing_id(url)
        .ok_or_else(|| FetchError::Build(format!("etsy_listing: no listing id in '{url}'")))?;
    let fetched = cloud::smart_fetch_html(client, client.cloud(), url)
        .await
        .map_err(cloud_to_fetch_err)?;
    let mut data = parse(&fetched.html, url, &listing_id);
    // Annotate the payload with the fetch path after parsing so `parse`
    // itself stays pure and fixture-testable.
    if let Some(obj) = data.as_object_mut() {
        obj.insert(
            "data_source".into(),
            match fetched.source {
                cloud::FetchSource::Local => json!("local"),
                cloud::FetchSource::Cloud => json!("cloud"),
            },
        );
    }
    Ok(data)
}
/// Pure HTML parser for a listing page. Kept public so tests can feed
/// it fixtures without a network fetch.
///
/// Field sources: Schema.org `Product` JSON-LD first, OG meta tags as
/// fallback for title/description/image. Price, currency, availability,
/// shop, and rating come only from the JSON-LD offer, so OG-only pages
/// yield nulls for those fields.
pub fn parse(html: &str, url: &str, listing_id: &str) -> Value {
    let jsonld = find_product_jsonld(html);
    let title = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "name"))
        .or_else(|| og(html, "title"));
    let description = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "description"))
        .or_else(|| og(html, "description"));
    let image = jsonld
        .as_ref()
        .and_then(get_first_image)
        .or_else(|| og(html, "image"));
    let brand = jsonld.as_ref().and_then(get_brand);
    // Etsy listings often ship either a single Offer or an
    // AggregateOffer when the listing has variants with different prices.
    let offer = jsonld.as_ref().and_then(first_offer);
    // Single-price listings populate `price`; variant listings populate
    // the low/high pair instead. Callers should check both.
    let (low_price, high_price, single_price) = match offer.as_ref() {
        Some(o) => (
            get_text(o, "lowPrice"),
            get_text(o, "highPrice"),
            get_text(o, "price"),
        ),
        None => (None, None, None),
    };
    let currency = offer.as_ref().and_then(|o| get_text(o, "priceCurrency"));
    // Availability / condition arrive as schema.org URLs; strip them to
    // bare tokens ("InStock", "NewCondition") for a stable payload.
    let availability = offer
        .as_ref()
        .and_then(|o| get_text(o, "availability").map(strip_schema_prefix));
    let item_condition = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "itemCondition"))
        .map(strip_schema_prefix);
    // Shop name lives under offers[0].seller.name on Etsy.
    let shop = offer.as_ref().and_then(|o| {
        o.get("seller")
            .and_then(|s| s.get("name"))
            .and_then(|n| n.as_str())
            .map(String::from)
    });
    let shop_url = shop_url_from_html(html);
    let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating);
    json!({
        "url": url,
        "listing_id": listing_id,
        "title": title,
        "description": description,
        "image": image,
        "brand": brand,
        "price": single_price,
        "low_price": low_price,
        "high_price": high_price,
        "currency": currency,
        "availability": availability,
        "item_condition": item_condition,
        "shop": shop,
        "shop_url": shop_url,
        "aggregate_rating": aggregate_rating,
    })
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Return the bare hostname of `url`.
///
/// Slices off the scheme when present, cuts the authority at the first
/// path/query/fragment separator, then drops any userinfo ("user@host")
/// and port (":443") so host comparisons in `is_etsy_host` always see
/// the plain hostname. The original version left ports, userinfo, and
/// `?`/`#`-delimited hosts intact, which made `matches` reject
/// otherwise-valid listing URLs.
fn host_of(url: &str) -> &str {
    let authority = url.split("://").nth(1).unwrap_or(url);
    let host_port = authority.split(['/', '?', '#']).next().unwrap_or("");
    // Userinfo is everything before the last '@' in the authority.
    let host_port = host_port.rsplit_once('@').map_or(host_port, |(_, h)| h);
    host_port.split(':').next().unwrap_or(host_port)
}
/// Accept the apex, the www host, and any etsy.com subdomain (locale
/// mirrors etc.).
fn is_etsy_host(host: &str) -> bool {
    matches!(host, "etsy.com" | "www.etsy.com") || host.ends_with(".etsy.com")
}
/// Extract the numeric listing id. Etsy ids are 9-11 digits today but
/// we accept any digit run of six or more right after `/listing/`.
///
/// Handles `/listing/{id}`, `/listing/{id}/{slug}`, and the localised
/// `/{locale}/listing/{id}` shape (e.g. `/fr/listing/...`).
fn parse_listing_id(url: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    // The trailing group demands a separator or end-of-string so a
    // trailing slug/query never bleeds into the id.
    let re = RE.get_or_init(|| Regex::new(r"/listing/(\d{6,})(?:[/?#]|$)").unwrap());
    let id = re.captures(url)?.get(1)?;
    Some(id.as_str().to_owned())
}
// ---------------------------------------------------------------------------
// JSON-LD walkers (same shape as ebay_listing; kept separate so the two
// extractors can diverge without cross-impact)
// ---------------------------------------------------------------------------
/// Scan every JSON-LD block on the page and return the first node that
/// resolves to a Product (possibly nested in `@graph` or an array).
fn find_product_jsonld(html: &str) -> Option<Value> {
    webclaw_core::structured_data::extract_json_ld(html)
        .into_iter()
        .find_map(|block| find_product_in(&block))
}
fn find_product_in(v: &Value) -> Option<Value> {
if is_product_type(v) {
return Some(v.clone());
}
if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) {
for item in graph {
if let Some(found) = find_product_in(item) {
return Some(found);
}
}
}
if let Some(arr) = v.as_array() {
for item in arr {
if let Some(found) = find_product_in(item) {
return Some(found);
}
}
}
None
}
/// A node counts as a product when its `@type` (string or array of
/// strings) names one of the Schema.org product types.
fn is_product_type(v: &Value) -> bool {
    fn named_product(s: &str) -> bool {
        matches!(s, "Product" | "ProductGroup" | "IndividualProduct")
    }
    match v.get("@type") {
        Some(Value::String(s)) => named_product(s),
        Some(Value::Array(types)) => types
            .iter()
            .filter_map(Value::as_str)
            .any(named_product),
        _ => false,
    }
}
/// Read `key` as text; numbers are stringified (prices sometimes ship
/// as JSON numbers), everything else is None.
fn get_text(v: &Value, key: &str) -> Option<String> {
    match v.get(key)? {
        Value::String(s) => Some(s.to_owned()),
        Value::Number(num) => Some(num.to_string()),
        _ => None,
    }
}
fn get_brand(v: &Value) -> Option<String> {
let brand = v.get("brand")?;
if let Some(s) = brand.as_str() {
return Some(s.to_string());
}
brand
.as_object()
.and_then(|o| o.get("name"))
.and_then(|n| n.as_str())
.map(String::from)
}
/// First usable image URL: accepts a bare string, an object carrying
/// `url`, or an array mixing both shapes.
fn get_first_image(v: &Value) -> Option<String> {
    let image = v.get("image")?;
    match image {
        Value::String(s) => Some(s.clone()),
        Value::Object(obj) => obj.get("url").and_then(Value::as_str).map(String::from),
        Value::Array(items) => items.iter().find_map(|item| match item {
            Value::String(s) => Some(s.clone()),
            Value::Object(obj) => obj.get("url").and_then(Value::as_str).map(String::from),
            _ => None,
        }),
        _ => None,
    }
}
/// `offers` is either one object or an array of them; normalise to the
/// first (or only) offer.
fn first_offer(v: &Value) -> Option<Value> {
    match v.get("offers")? {
        Value::Array(list) => list.first().cloned(),
        single @ Value::Object(_) => Some(single.clone()),
        _ => None,
    }
}
fn get_aggregate_rating(v: &Value) -> Option<Value> {
let r = v.get("aggregateRating")?;
Some(json!({
"rating_value": get_text(r, "ratingValue"),
"review_count": get_text(r, "reviewCount"),
"best_rating": get_text(r, "bestRating"),
}))
}
/// Turn a schema.org enumeration URL ("https://schema.org/InStock")
/// into its bare token ("InStock"); both URL schemes are handled.
fn strip_schema_prefix(s: String) -> String {
    ["http://schema.org/", "https://schema.org/"]
        .into_iter()
        .fold(s, |value, prefix| value.replace(prefix, ""))
}
/// First `<meta property="og:{prop}" content="...">` value, if any.
/// Assumes the `property` attribute precedes `content` — the order
/// Etsy's markup uses.
fn og(html: &str, prop: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    re.captures_iter(html)
        .filter(|c| c.get(1).is_some_and(|m| m.as_str() == prop))
        .find_map(|c| c.get(2).map(|m| m.as_str().to_string()))
}
/// Etsy links the owning shop with a canonical anchor like
/// `<a href="/shop/ShopName" ...>`. Take the first such anchor in the
/// document and absolutize it against www.etsy.com.
fn shop_url_from_html(html: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| Regex::new(r#"href="(/shop/[A-Za-z0-9_-]+)""#).unwrap());
    let path = re.captures(html)?.get(1)?.as_str();
    Some(format!("https://www.etsy.com{path}"))
}
/// Flatten a cloud-fetch error into the extractor's shared error type.
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
    FetchError::Build(e.to_string())
}
#[cfg(test)]
mod tests {
    //! Fixture-driven tests: URL matching, listing-id parsing, and the
    //! JSON-LD / AggregateOffer / OG-fallback branches of `parse`.
    use super::*;

    // Host + id gate: only Etsy hosts with a numeric /listing/{id} match.
    #[test]
    fn matches_etsy_listing_urls() {
        assert!(matches("https://www.etsy.com/listing/123456789"));
        assert!(matches(
            "https://www.etsy.com/listing/123456789/vintage-typewriter"
        ));
        assert!(matches(
            "https://www.etsy.com/fr/listing/123456789/vintage-typewriter"
        ));
        assert!(!matches("https://www.etsy.com/"));
        assert!(!matches("https://www.etsy.com/shop/SomeShop"));
        assert!(!matches("https://example.com/listing/123456789"));
    }

    // Id extraction must survive slugs, locale prefixes, and queries.
    #[test]
    fn parse_listing_id_handles_slug_and_locale() {
        assert_eq!(
            parse_listing_id("https://www.etsy.com/listing/123456789"),
            Some("123456789".into())
        );
        assert_eq!(
            parse_listing_id("https://www.etsy.com/listing/123456789/slug-here"),
            Some("123456789".into())
        );
        assert_eq!(
            parse_listing_id("https://www.etsy.com/fr/listing/123456789/slug"),
            Some("123456789".into())
        );
        assert_eq!(
            parse_listing_id("https://www.etsy.com/listing/123456789?ref=foo"),
            Some("123456789".into())
        );
    }

    // Happy path: full Product JSON-LD with a single Offer and rating.
    #[test]
    fn parse_extracts_from_fixture_jsonld() {
        let html = r##"
<html><head>
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"Product",
"name":"Handmade Ceramic Mug","sku":"MUG-001",
"brand":{"@type":"Brand","name":"Studio Clay"},
"image":["https://i.etsystatic.com/abc.jpg","https://i.etsystatic.com/xyz.jpg"],
"itemCondition":"https://schema.org/NewCondition",
"offers":{"@type":"Offer","price":"24.00","priceCurrency":"USD",
"availability":"https://schema.org/InStock",
"seller":{"@type":"Organization","name":"StudioClay"}},
"aggregateRating":{"@type":"AggregateRating","ratingValue":"4.9","reviewCount":"127","bestRating":"5"}}
</script>
<a href="/shop/StudioClay" class="wt-text-link">StudioClay</a>
</head></html>"##;
        let v = parse(html, "https://www.etsy.com/listing/1", "1");
        assert_eq!(v["title"], "Handmade Ceramic Mug");
        assert_eq!(v["price"], "24.00");
        assert_eq!(v["currency"], "USD");
        assert_eq!(v["availability"], "InStock");
        assert_eq!(v["item_condition"], "NewCondition");
        assert_eq!(v["shop"], "StudioClay");
        assert_eq!(v["shop_url"], "https://www.etsy.com/shop/StudioClay");
        assert_eq!(v["brand"], "Studio Clay");
        assert_eq!(v["aggregate_rating"]["rating_value"], "4.9");
        assert_eq!(v["aggregate_rating"]["review_count"], "127");
    }

    // Variant listings: AggregateOffer yields the low/high price pair.
    #[test]
    fn parse_handles_aggregate_offer_price_range() {
        let html = r##"
<script type="application/ld+json">
{"@type":"Product","name":"Mug Set",
"offers":{"@type":"AggregateOffer",
"lowPrice":"18.00","highPrice":"36.00","priceCurrency":"USD"}}
</script>
"##;
        let v = parse(html, "https://www.etsy.com/listing/2", "2");
        assert_eq!(v["low_price"], "18.00");
        assert_eq!(v["high_price"], "36.00");
        assert_eq!(v["currency"], "USD");
    }

    // Degraded path: no JSON-LD at all, OG tags carry the basics.
    #[test]
    fn parse_falls_back_to_og_when_no_jsonld() {
        let html = r#"
<html><head>
<meta property="og:title" content="Minimal Fallback Item">
<meta property="og:description" content="OG-only extraction test.">
<meta property="og:image" content="https://i.etsystatic.com/fallback.jpg">
</head></html>"#;
        let v = parse(html, "https://www.etsy.com/listing/3", "3");
        assert_eq!(v["title"], "Minimal Fallback Item");
        assert_eq!(v["description"], "OG-only extraction test.");
        assert_eq!(v["image"], "https://i.etsystatic.com/fallback.jpg");
        // No price fields when we only have OG.
        assert!(v["price"].is_null());
    }
}

View file

@ -21,6 +21,7 @@ pub mod dev_to;
pub mod docker_hub;
pub mod ebay_listing;
pub mod ecommerce_product;
pub mod etsy_listing;
pub mod github_issue;
pub mod github_pr;
pub mod github_release;
@ -92,6 +93,7 @@ pub fn list() -> Vec<ExtractorInfo> {
woocommerce_product::INFO,
amazon_product::INFO,
ebay_listing::INFO,
etsy_listing::INFO,
trustpilot_reviews::INFO,
]
}
@ -243,6 +245,13 @@ pub async fn dispatch_by_url(
.map(|v| (ebay_listing::INFO.name, v)),
);
}
if etsy_listing::matches(url) {
return Some(
etsy_listing::extract(client, url)
.await
.map(|v| (etsy_listing::INFO.name, v)),
);
}
if trustpilot_reviews::matches(url) {
return Some(
trustpilot_reviews::extract(client, url)
@ -400,6 +409,12 @@ pub async fn dispatch_by_name(
})
.await
}
n if n == etsy_listing::INFO.name => {
run_or_mismatch(etsy_listing::matches(url), n, url, || {
etsy_listing::extract(client, url)
})
.await
}
n if n == trustpilot_reviews::INFO.name => {
run_or_mismatch(trustpilot_reviews::matches(url), n, url, || {
trustpilot_reviews::extract(client, url)

View file

@ -10,18 +10,32 @@
//! "URL has `/p/{slug}`" because that's the canonical Substack post
//! path. Explicit-call only because the `/p/{slug}` URL shape is
//! used by non-Substack sites too.
//!
//! ## Fallback
//!
//! The API endpoint is rate-limited aggressively on popular publications
//! and occasionally returns 403 on custom domains with Cloudflare in
//! front. When that happens we escalate to an HTML fetch (via
//! `smart_fetch_html`, so antibot-protected custom domains still work)
//! and extract OG tags + Article JSON-LD for a degraded-but-useful
//! payload. The response shape stays stable across both paths; a
//! `data_source` field tells the caller which branch ran.
use std::sync::OnceLock;
use regex::Regex;
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::cloud::{self, CloudError};
use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "substack_post",
label: "Substack post",
description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.",
description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API. Falls back to OG + JSON-LD HTML parsing when the API is rate-limited.",
url_patterns: &[
"https://{pub}.substack.com/p/{slug}",
"https://{custom-domain}/p/{slug}",
@ -51,32 +65,55 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
"https"
};
let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
// 1. Try the public API. 200 = full payload; 404 = real miss; any
// other status hands off to the HTML fallback so a transient rate
// limit or a hardened custom domain doesn't fail the whole call.
let resp = client.fetch(&api_url).await?;
if resp.status == 404 {
return Err(FetchError::Build(format!(
match resp.status {
200 => match serde_json::from_str::<Post>(&resp.html) {
Ok(p) => Ok(build_api_payload(url, &api_url, &slug, p)),
Err(e) => {
// API returned 200 but the body isn't the Post shape we
// expect. Could be a custom-domain site that exposes
// something else at /api/v1/posts/. Fall back to HTML
// rather than hard-failing.
html_fallback(
client,
url,
&api_url,
&slug,
Some(format!(
"api returned 200 but body was not Substack JSON ({e})"
)),
)
.await
}
},
404 => Err(FetchError::Build(format!(
"substack_post: '{slug}' not found on {host} (got 404). \
If the publication isn't actually on Substack, use /v1/scrape instead."
)));
))),
_ => {
// Rate limit, 403, 5xx, whatever: try HTML.
let reason = format!("api returned status {} for {api_url}", resp.status);
html_fallback(client, url, &api_url, &slug, Some(reason)).await
}
if resp.status != 200 {
return Err(FetchError::Build(format!(
"substack returned status {} for {api_url}",
resp.status
)));
}
}
let p: Post = serde_json::from_str(&resp.html).map_err(|e| {
FetchError::BodyDecode(format!(
"substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})"
))
})?;
// ---------------------------------------------------------------------------
// API-path payload builder
// ---------------------------------------------------------------------------
Ok(json!({
fn build_api_payload(url: &str, api_url: &str, slug: &str, p: Post) -> Value {
json!({
"url": url,
"api_url": api_url,
"data_source": "api",
"id": p.id,
"type": p.r#type,
"slug": p.slug,
"slug": p.slug.or_else(|| Some(slug.to_string())),
"title": p.title,
"subtitle": p.subtitle,
"description": p.description,
@ -104,7 +141,117 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
"handle": a.handle,
"photo": a.photo_url,
})).collect::<Vec<_>>(),
})
}
// ---------------------------------------------------------------------------
// HTML fallback: OG + Article JSON-LD
// ---------------------------------------------------------------------------
/// Fetch the post page as HTML and parse OG + Article JSON-LD out of it.
///
/// Used when `/api/v1/posts/{slug}` is unusable (non-200, or a 200 with
/// a non-Substack body). Adds a `fetch_source` field ("local"/"cloud")
/// recording which fetch path produced the HTML, and — when supplied —
/// a `fallback_reason` explaining why the API path was abandoned.
async fn html_fallback(
    client: &FetchClient,
    url: &str,
    api_url: &str,
    slug: &str,
    fallback_reason: Option<String>,
) -> Result<Value, FetchError> {
    // smart_fetch_html escalates to the cloud fetcher when the local
    // fetch is blocked, so hardened custom domains still resolve.
    let fetched = cloud::smart_fetch_html(client, client.cloud(), url)
        .await
        .map_err(cloud_to_fetch_err)?;
    let mut data = parse_html(&fetched.html, url, api_url, slug);
    if let Some(obj) = data.as_object_mut() {
        obj.insert(
            "fetch_source".into(),
            match fetched.source {
                cloud::FetchSource::Local => json!("local"),
                cloud::FetchSource::Cloud => json!("cloud"),
            },
        );
        if let Some(reason) = fallback_reason {
            obj.insert("fallback_reason".into(), json!(reason));
        }
    }
    Ok(data)
}
/// Pure HTML parser. Pulls title, subtitle, description, cover image,
/// publish date, and authors from OG tags and Article JSON-LD. Kept
/// public so tests can exercise it with fixtures.
///
/// The payload mirrors the API path's shape; fields the HTML cannot
/// provide (body, counts, paywall flags) are explicitly null so the
/// response shape is stable across both paths.
pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
    let article = find_article_jsonld(html);
    // JSON-LD wins over OG whenever both carry the same field.
    let title = article
        .as_ref()
        .and_then(|v| get_text(v, "headline"))
        .or_else(|| og(html, "title"));
    let description = article
        .as_ref()
        .and_then(|v| get_text(v, "description"))
        .or_else(|| og(html, "description"));
    let cover_image = article
        .as_ref()
        .and_then(get_first_image)
        .or_else(|| og(html, "image"));
    let post_date = article
        .as_ref()
        .and_then(|v| get_text(v, "datePublished"))
        .or_else(|| meta_property(html, "article:published_time"));
    let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
    let publication_name = og(html, "site_name");
    let authors = article.as_ref().map(extract_authors).unwrap_or_default();
    json!({
        "url": url,
        "api_url": api_url,
        "data_source": "html_fallback",
        "slug": slug,
        "title": title,
        "subtitle": None::<String>,
        "description": description,
        // Prefer the page's own canonical link; fall back to the input URL.
        "canonical_url": canonical_url(html).or_else(|| Some(url.to_string())),
        "post_date": post_date,
        "updated_at": updated_at,
        "cover_image": cover_image,
        "body_html": None::<String>,
        "body_text": None::<String>,
        "word_count": None::<i64>,
        "comment_count": None::<i64>,
        "reactions": Value::Null,
        "has_paywall": None::<bool>,
        "is_free_preview": None::<bool>,
        "publication": json!({
            "name": publication_name,
        }),
        "authors": authors,
    })
}
/// Normalise the JSON-LD `author` field (string, object, or array of
/// either) into `{name, handle?}` objects.
fn extract_authors(v: &Value) -> Vec<Value> {
    let Some(author) = v.get("author") else {
        return Vec::new();
    };
    // One author value → one payload object (or None when unusable).
    fn normalize(val: &Value) -> Option<Value> {
        match val {
            Value::String(name) => Some(json!({ "name": name })),
            Value::Object(_) => {
                let name = val.get("name")?.as_str()?;
                let handle = val
                    .get("url")
                    .and_then(Value::as_str)
                    .and_then(handle_from_author_url);
                Some(json!({
                    "name": name,
                    "handle": handle,
                }))
            }
            _ => None,
        }
    }
    match author {
        Value::Array(list) => list.iter().filter_map(normalize).collect(),
        single => normalize(single).into_iter().collect(),
    }
}
// ---------------------------------------------------------------------------
@ -136,6 +283,139 @@ fn parse_slug(url: &str) -> Option<String> {
}
}
/// Extract the Substack handle from an author URL like
/// `https://substack.com/@handle` or `https://pub.substack.com/@handle`.
///
/// Returns `None` when the URL has no `@` segment (e.g. a non-Substack
/// author page) so we don't synthesise a fake handle.
fn handle_from_author_url(u: &str) -> Option<String> {
    let (_, tail) = u.rsplit_once('@')?;
    // Trim anything after the handle: path, query, or fragment.
    let handle = tail.split(['/', '?', '#']).next().unwrap_or("");
    (!handle.is_empty()).then(|| handle.to_owned())
}
// ---------------------------------------------------------------------------
// HTML tag helpers
// ---------------------------------------------------------------------------
/// First `<meta property="og:{prop}" content="...">` value, if any.
/// Assumes the attribute order `property` then `content`.
fn og(html: &str, prop: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    re.captures_iter(html)
        .filter(|c| c.get(1).is_some_and(|m| m.as_str() == prop))
        .find_map(|c| Some(c.get(2)?.as_str().to_string()))
}
/// Pull `<meta property="article:published_time" content="...">` and
/// similar structured meta tags by their full property name.
fn meta_property(html: &str, prop: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="([^"]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    let hit = re
        .captures_iter(html)
        .find(|c| c.get(1).is_some_and(|m| m.as_str() == prop))?;
    hit.get(2).map(|m| m.as_str().to_string())
}
/// Value of the page's `<link rel="canonical" href="...">`, if present.
fn canonical_url(html: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE
        .get_or_init(|| Regex::new(r#"(?i)<link[^>]+rel="canonical"[^>]+href="([^"]+)""#).unwrap());
    let caps = re.captures(html)?;
    Some(caps.get(1)?.as_str().to_owned())
}
// ---------------------------------------------------------------------------
// JSON-LD walkers (Article / NewsArticle)
// ---------------------------------------------------------------------------
/// Scan every JSON-LD block on the page for the first Article-like
/// node (possibly nested in `@graph` or an array).
fn find_article_jsonld(html: &str) -> Option<Value> {
    webclaw_core::structured_data::extract_json_ld(html)
        .into_iter()
        .find_map(|block| find_article_in(&block))
}
fn find_article_in(v: &Value) -> Option<Value> {
if is_article_type(v) {
return Some(v.clone());
}
if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) {
for item in graph {
if let Some(found) = find_article_in(item) {
return Some(found);
}
}
}
if let Some(arr) = v.as_array() {
for item in arr {
if let Some(found) = find_article_in(item) {
return Some(found);
}
}
}
None
}
/// A node counts as an article when its `@type` (string or array of
/// strings) names one of the Schema.org article types.
fn is_article_type(v: &Value) -> bool {
    fn article_name(s: &str) -> bool {
        matches!(
            s,
            "Article" | "NewsArticle" | "BlogPosting" | "SocialMediaPosting"
        )
    }
    match v.get("@type") {
        Some(Value::String(s)) => article_name(s),
        Some(Value::Array(types)) => types
            .iter()
            .filter_map(Value::as_str)
            .any(article_name),
        _ => false,
    }
}
/// Read `key` as text; numbers are stringified, everything else None.
fn get_text(v: &Value, key: &str) -> Option<String> {
    match v.get(key)? {
        Value::String(s) => Some(s.to_owned()),
        Value::Number(num) => Some(num.to_string()),
        _ => None,
    }
}
/// First usable image URL: accepts a bare string, an object carrying
/// `url`, or an array mixing both shapes.
fn get_first_image(v: &Value) -> Option<String> {
    fn image_url(item: &Value) -> Option<String> {
        match item {
            Value::String(s) => Some(s.clone()),
            Value::Object(obj) => obj.get("url").and_then(Value::as_str).map(String::from),
            _ => None,
        }
    }
    match v.get("image")? {
        Value::Array(items) => items.iter().find_map(image_url),
        single => image_url(single),
    }
}
/// Flatten a cloud-fetch error into the extractor's shared error type.
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
    FetchError::Build(e.to_string())
}
// ---------------------------------------------------------------------------
// Substack API types (subset)
// ---------------------------------------------------------------------------
@ -210,4 +490,76 @@ mod tests {
Some("my-post".into())
);
}
// OG-only fixture: exercises the meta-tag branch of the HTML fallback.
#[test]
fn parse_html_extracts_from_og_tags() {
    let html = r##"
<html><head>
<meta property="og:title" content="My Great Post">
<meta property="og:description" content="A short summary.">
<meta property="og:image" content="https://cdn.substack.com/cover.jpg">
<meta property="og:site_name" content="My Publication">
<meta property="article:published_time" content="2025-09-01T10:00:00Z">
<link rel="canonical" href="https://mypub.substack.com/p/my-post">
</head></html>"##;
    let v = parse_html(
        html,
        "https://mypub.substack.com/p/my-post",
        "https://mypub.substack.com/api/v1/posts/my-post",
        "my-post",
    );
    assert_eq!(v["data_source"], "html_fallback");
    assert_eq!(v["title"], "My Great Post");
    assert_eq!(v["description"], "A short summary.");
    assert_eq!(v["cover_image"], "https://cdn.substack.com/cover.jpg");
    assert_eq!(v["post_date"], "2025-09-01T10:00:00Z");
    assert_eq!(v["publication"]["name"], "My Publication");
    assert_eq!(v["canonical_url"], "https://mypub.substack.com/p/my-post");
}

// When OG and JSON-LD disagree, JSON-LD must win for every field.
#[test]
fn parse_html_prefers_jsonld_when_present() {
    let html = r##"
<html><head>
<meta property="og:title" content="OG Title">
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"NewsArticle",
"headline":"JSON-LD Title",
"description":"JSON-LD desc.",
"image":"https://cdn.substack.com/hero.jpg",
"datePublished":"2025-10-12T08:30:00Z",
"dateModified":"2025-10-12T09:00:00Z",
"author":[{"@type":"Person","name":"Alice Author","url":"https://substack.com/@alice"}]}
</script>
</head></html>"##;
    let v = parse_html(
        html,
        "https://example.com/p/a",
        "https://example.com/api/v1/posts/a",
        "a",
    );
    assert_eq!(v["title"], "JSON-LD Title");
    assert_eq!(v["description"], "JSON-LD desc.");
    assert_eq!(v["cover_image"], "https://cdn.substack.com/hero.jpg");
    assert_eq!(v["post_date"], "2025-10-12T08:30:00Z");
    assert_eq!(v["updated_at"], "2025-10-12T09:00:00Z");
    assert_eq!(v["authors"][0]["name"], "Alice Author");
    assert_eq!(v["authors"][0]["handle"], "alice");
}

// Handle extraction: @-segment URLs yield a handle, others yield None.
#[test]
fn handle_from_author_url_pulls_handle() {
    assert_eq!(
        handle_from_author_url("https://substack.com/@alice"),
        Some("alice".into())
    );
    assert_eq!(
        handle_from_author_url("https://mypub.substack.com/@bob/"),
        Some("bob".into())
    );
    assert_eq!(
        handle_from_author_url("https://not-substack.com/author/carol"),
        None
    );
}
}

View file

@ -9,7 +9,19 @@
//!
//! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/`
//! shape is stable.
//!
//! ## Fallback
//!
//! `ytInitialPlayerResponse` is missing on EU-consent interstitials,
//! some live-stream pre-show pages, and age-gated videos. In those
//! cases we drop down to OG tags for `title`, `description`,
//! `thumbnail`, and `channel`, and return a `data_source:
//! "og_fallback"` payload so the caller can tell they got a degraded
//! shape (no view count, duration, captions).
use std::sync::OnceLock;
use regex::Regex;
use serde_json::{Value, json};
use super::ExtractorInfo;
@ -19,7 +31,7 @@ use crate::error::FetchError;
pub const INFO: ExtractorInfo = ExtractorInfo {
name: "youtube_video",
label: "YouTube video",
description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs.",
description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs. Falls back to OG metadata on consent / age-gate pages.",
url_patterns: &[
"https://www.youtube.com/watch?v={id}",
"https://youtu.be/{id}",
@ -49,12 +61,28 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
)));
}
let player = extract_player_response(&resp.html).ok_or_else(|| {
FetchError::BodyDecode(format!(
"youtube_video: no ytInitialPlayerResponse on {canonical} (video may be private, region-blocked, or removed)"
))
})?;
if let Some(player) = extract_player_response(&resp.html) {
return Ok(build_player_payload(
&player, &resp.html, url, &canonical, &video_id,
));
}
// No player blob. Fall back to OG tags so the call still returns
// something useful for consent / age-gate pages.
Ok(build_og_fallback(&resp.html, url, &canonical, &video_id))
}
// ---------------------------------------------------------------------------
// Player-blob path (rich payload)
// ---------------------------------------------------------------------------
fn build_player_payload(
player: &Value,
html: &str,
url: &str,
canonical: &str,
video_id: &str,
) -> Value {
let video_details = player.get("videoDetails");
let microformat = player
.get("microformat")
@ -73,7 +101,7 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
.cloned()
.unwrap_or_default();
let caption_tracks = webclaw_core::youtube::extract_caption_tracks(&resp.html);
let caption_tracks = webclaw_core::youtube::extract_caption_tracks(html);
let captions: Vec<Value> = caption_tracks
.iter()
.map(|c| {
@ -85,9 +113,10 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
})
.collect();
Ok(json!({
json!({
"url": url,
"canonical_url":canonical,
"data_source": "player_response",
"video_id": video_id,
"title": get_str(video_details, "title"),
"description": get_str(video_details, "shortDescription"),
@ -106,7 +135,46 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
"keywords": keywords,
"thumbnails": thumbnails,
"caption_tracks": captions,
}))
})
}
// ---------------------------------------------------------------------------
// OG fallback path (degraded payload)
// ---------------------------------------------------------------------------
/// Build the degraded payload from OG meta tags when no
/// `ytInitialPlayerResponse` blob is present. The `<meta name="author">`
/// tag carries the channel name on those pages.
fn build_og_fallback(html: &str, url: &str, canonical: &str, video_id: &str) -> Value {
    let thumbnails: Vec<Value> = match og(html, "image") {
        Some(thumb) => vec![json!({ "url": thumb })],
        None => Vec::new(),
    };
    json!({
        "url": url,
        "canonical_url": canonical,
        "data_source": "og_fallback",
        "video_id": video_id,
        "title": og(html, "title"),
        "description": og(html, "description"),
        "author": meta_name(html, "author"),
        // Everything the player blob would have provided is explicitly
        // null / empty so the caller doesn't have to guess at missing keys.
        "channel_id": None::<String>,
        "channel_url": None::<String>,
        "view_count": None::<i64>,
        "length_seconds": None::<i64>,
        "is_live": None::<bool>,
        "is_private": None::<bool>,
        "is_unlisted": None::<bool>,
        "allow_ratings": None::<bool>,
        "category": None::<String>,
        "upload_date": None::<String>,
        "publish_date": None::<String>,
        "keywords": Vec::<Value>::new(),
        "thumbnails": thumbnails,
        "caption_tracks": Vec::<Value>::new(),
    })
}
// ---------------------------------------------------------------------------
@ -166,8 +234,6 @@ fn parse_video_id(url: &str) -> Option<String> {
// ---------------------------------------------------------------------------
fn extract_player_response(html: &str) -> Option<Value> {
use regex::Regex;
use std::sync::OnceLock;
// Same regex as webclaw_core::youtube. Duplicated here because
// core's regex is module-private. Kept in lockstep; changes are
// rare and we cover with tests in both places.
@ -178,6 +244,36 @@ fn extract_player_response(html: &str) -> Option<Value> {
serde_json::from_str(json_str).ok()
}
// ---------------------------------------------------------------------------
// Meta-tag helpers (for OG fallback)
// ---------------------------------------------------------------------------
/// First `<meta property="og:{prop}" content="...">` value, if any.
/// Assumes the attribute order `property` then `content`.
fn og(html: &str, prop: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    re.captures_iter(html)
        .find(|caps| caps.get(1).is_some_and(|m| m.as_str() == prop))
        .and_then(|caps| caps.get(2).map(|m| m.as_str().to_string()))
}
/// First `<meta name="{name}" content="...">` value, if any.
fn meta_name(html: &str, name: &str) -> Option<String> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| {
        Regex::new(r#"(?i)<meta[^>]+name="([^"]+)"[^>]+content="([^"]+)""#).unwrap()
    });
    for caps in re.captures_iter(html) {
        let name_matches = caps.get(1).is_some_and(|m| m.as_str() == name);
        if name_matches {
            return caps.get(2).map(|m| m.as_str().to_string());
        }
    }
    None
}
fn get_str(v: Option<&Value>, key: &str) -> Option<String> {
v.and_then(|x| x.get(key))
.and_then(|x| x.as_str().map(String::from))
@ -252,4 +348,31 @@ var ytInitialPlayerResponse = {"videoDetails":{"videoId":"abc","title":"T","auth
let vd = v.get("videoDetails").unwrap();
assert_eq!(vd.get("title").unwrap().as_str(), Some("T"));
}
// Degraded path: OG tags + <meta name="author"> populate the basics,
// while player-blob-only fields stay null / empty.
#[test]
fn og_fallback_extracts_basics_from_meta_tags() {
    let html = r##"
<html><head>
<meta property="og:title" content="Example Video Title">
<meta property="og:description" content="A cool video description.">
<meta property="og:image" content="https://i.ytimg.com/vi/abc/maxresdefault.jpg">
<meta name="author" content="Example Channel">
</head></html>"##;
    let v = build_og_fallback(
        html,
        "https://www.youtube.com/watch?v=abc",
        "https://www.youtube.com/watch?v=abc",
        "abc",
    );
    assert_eq!(v["data_source"], "og_fallback");
    assert_eq!(v["title"], "Example Video Title");
    assert_eq!(v["description"], "A cool video description.");
    assert_eq!(v["author"], "Example Channel");
    assert_eq!(
        v["thumbnails"][0]["url"],
        "https://i.ytimg.com/vi/abc/maxresdefault.jpg"
    );
    assert!(v["view_count"].is_null());
    assert!(v["caption_tracks"].as_array().unwrap().is_empty());
}
}