diff --git a/crates/webclaw-fetch/src/extractors/etsy_listing.rs b/crates/webclaw-fetch/src/extractors/etsy_listing.rs new file mode 100644 index 0000000..bb7cc97 --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/etsy_listing.rs @@ -0,0 +1,391 @@ +//! Etsy listing extractor. +//! +//! Etsy product pages at `etsy.com/listing/{id}` (and a sluggy variant +//! `etsy.com/listing/{id}/{slug}`) ship a Schema.org `Product` JSON-LD +//! block with title, price, currency, availability, shop seller, and +//! an `AggregateRating` for the listing. +//! +//! Etsy puts Cloudflare + custom WAF in front of product pages with a +//! high variance: the Firefox profile gets clean HTML most of the time +//! but some listings return a CF interstitial. We route through +//! `cloud::smart_fetch_html` so both paths resolve to the same parser, +//! same as `ebay_listing`. + +use std::sync::OnceLock; + +use regex::Regex; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::cloud::{self, CloudError}; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "etsy_listing", + label: "Etsy listing", + description: "Returns listing title, price, currency, availability, shop, rating, and image. Heavy listings may need WEBCLAW_API_KEY for antibot.", + url_patterns: &[ + "https://www.etsy.com/listing/{id}", + "https://www.etsy.com/listing/{id}/{slug}", + "https://www.etsy.com/{locale}/listing/{id}", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if !is_etsy_host(host) { + return false; + } + parse_listing_id(url).is_some() +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let listing_id = parse_listing_id(url) + .ok_or_else(|| FetchError::Build(format!("etsy_listing: no listing id in '{url}'")))?; + + let fetched = cloud::smart_fetch_html(client, client.cloud(), url) + .await + .map_err(cloud_to_fetch_err)?; + + let mut data = parse(&fetched.html, url, &listing_id); + if let Some(obj) = data.as_object_mut() { + obj.insert( + "data_source".into(), + match fetched.source { + cloud::FetchSource::Local => json!("local"), + cloud::FetchSource::Cloud => json!("cloud"), + }, + ); + } + Ok(data) +} + +pub fn parse(html: &str, url: &str, listing_id: &str) -> Value { + let jsonld = find_product_jsonld(html); + + let title = jsonld + .as_ref() + .and_then(|v| get_text(v, "name")) + .or_else(|| og(html, "title")); + let description = jsonld + .as_ref() + .and_then(|v| get_text(v, "description")) + .or_else(|| og(html, "description")); + let image = jsonld + .as_ref() + .and_then(get_first_image) + .or_else(|| og(html, "image")); + let brand = jsonld.as_ref().and_then(get_brand); + + // Etsy listings often ship either a single Offer or an + // AggregateOffer when the listing has variants with different prices. + let offer = jsonld.as_ref().and_then(first_offer); + let (low_price, high_price, single_price) = match offer.as_ref() { + Some(o) => ( + get_text(o, "lowPrice"), + get_text(o, "highPrice"), + get_text(o, "price"), + ), + None => (None, None, None), + }; + let currency = offer.as_ref().and_then(|o| get_text(o, "priceCurrency")); + let availability = offer + .as_ref() + .and_then(|o| get_text(o, "availability").map(strip_schema_prefix)); + let item_condition = jsonld + .as_ref() + .and_then(|v| get_text(v, "itemCondition")) + .map(strip_schema_prefix); + + // Shop name lives under offers[0].seller.name on Etsy. + let shop = offer.as_ref().and_then(|o| { + o.get("seller") + .and_then(|s| s.get("name")) + .and_then(|n| n.as_str()) + .map(String::from) + }); + let shop_url = shop_url_from_html(html); + + let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating); + + json!({ + "url": url, + "listing_id": listing_id, + "title": title, + "description": description, + "image": image, + "brand": brand, + "price": single_price, + "low_price": low_price, + "high_price": high_price, + "currency": currency, + "availability": availability, + "item_condition": item_condition, + "shop": shop, + "shop_url": shop_url, + "aggregate_rating": aggregate_rating, + }) +} + +// --------------------------------------------------------------------------- +// URL helpers +// --------------------------------------------------------------------------- + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +fn is_etsy_host(host: &str) -> bool { + host == "etsy.com" || host == "www.etsy.com" || host.ends_with(".etsy.com") +} + +/// Extract the numeric listing id. Etsy ids are 9-11 digits today but +/// we accept any all-digit segment right after `/listing/`. +/// +/// Handles `/listing/{id}`, `/listing/{id}/{slug}`, and the localised +/// `/{locale}/listing/{id}` shape (e.g. `/fr/listing/...`). +fn parse_listing_id(url: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r"/listing/(\d{6,})(?:[/?#]|$)").unwrap()); + re.captures(url) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) +} + +// --------------------------------------------------------------------------- +// JSON-LD walkers (same shape as ebay_listing; kept separate so the two +// extractors can diverge without cross-impact) +// --------------------------------------------------------------------------- + +fn find_product_jsonld(html: &str) -> Option { + let blocks = webclaw_core::structured_data::extract_json_ld(html); + for b in blocks { + if let Some(found) = find_product_in(&b) { + return Some(found); + } + } + None +} + +fn find_product_in(v: &Value) -> Option { + if is_product_type(v) { + return Some(v.clone()); + } + if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) { + for item in graph { + if let Some(found) = find_product_in(item) { + return Some(found); + } + } + } + if let Some(arr) = v.as_array() { + for item in arr { + if let Some(found) = find_product_in(item) { + return Some(found); + } + } + } + None +} + +fn is_product_type(v: &Value) -> bool { + let Some(t) = v.get("@type") else { + return false; + }; + let is_prod = |s: &str| matches!(s, "Product" | "ProductGroup" | "IndividualProduct"); + match t { + Value::String(s) => is_prod(s), + Value::Array(arr) => arr.iter().any(|x| x.as_str().is_some_and(is_prod)), + _ => false, + } +} + +fn get_text(v: &Value, key: &str) -> Option { + v.get(key).and_then(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Number(n) => Some(n.to_string()), + _ => None, + }) +} + +fn get_brand(v: &Value) -> Option { + let brand = v.get("brand")?; + if let Some(s) = brand.as_str() { + return Some(s.to_string()); + } + brand + .as_object() + .and_then(|o| o.get("name")) + .and_then(|n| n.as_str()) + .map(String::from) +} + +fn get_first_image(v: &Value) -> Option { + match v.get("image")? { + Value::String(s) => Some(s.clone()), + Value::Array(arr) => arr.iter().find_map(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Object(_) => x.get("url").and_then(|u| u.as_str()).map(String::from), + _ => None, + }), + Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from), + _ => None, + } +} + +fn first_offer(v: &Value) -> Option { + let offers = v.get("offers")?; + match offers { + Value::Array(arr) => arr.first().cloned(), + Value::Object(_) => Some(offers.clone()), + _ => None, + } +} + +fn get_aggregate_rating(v: &Value) -> Option { + let r = v.get("aggregateRating")?; + Some(json!({ + "rating_value": get_text(r, "ratingValue"), + "review_count": get_text(r, "reviewCount"), + "best_rating": get_text(r, "bestRating"), + })) +} + +fn strip_schema_prefix(s: String) -> String { + s.replace("http://schema.org/", "") + .replace("https://schema.org/", "") +} + +fn og(html: &str, prop: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r#"(?i)]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap() + }); + for c in re.captures_iter(html) { + if c.get(1).is_some_and(|m| m.as_str() == prop) { + return c.get(2).map(|m| m.as_str().to_string()); + } + } + None +} + +/// Etsy links the owning shop with a canonical anchor like +/// ``. Grab the first one after the +/// breadcrumb boundary. +fn shop_url_from_html(html: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r#"href="(/shop/[A-Za-z0-9_-]+)""#).unwrap()); + re.captures(html) + .and_then(|c| c.get(1)) + .map(|m| format!("https://www.etsy.com{}", m.as_str())) +} + +fn cloud_to_fetch_err(e: CloudError) -> FetchError { + FetchError::Build(e.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_etsy_listing_urls() { + assert!(matches("https://www.etsy.com/listing/123456789")); + assert!(matches( + "https://www.etsy.com/listing/123456789/vintage-typewriter" + )); + assert!(matches( + "https://www.etsy.com/fr/listing/123456789/vintage-typewriter" + )); + assert!(!matches("https://www.etsy.com/")); + assert!(!matches("https://www.etsy.com/shop/SomeShop")); + assert!(!matches("https://example.com/listing/123456789")); + } + + #[test] + fn parse_listing_id_handles_slug_and_locale() { + assert_eq!( + parse_listing_id("https://www.etsy.com/listing/123456789"), + Some("123456789".into()) + ); + assert_eq!( + parse_listing_id("https://www.etsy.com/listing/123456789/slug-here"), + Some("123456789".into()) + ); + assert_eq!( + parse_listing_id("https://www.etsy.com/fr/listing/123456789/slug"), + Some("123456789".into()) + ); + assert_eq!( + parse_listing_id("https://www.etsy.com/listing/123456789?ref=foo"), + Some("123456789".into()) + ); + } + + #[test] + fn parse_extracts_from_fixture_jsonld() { + let html = r##" + + +StudioClay +"##; + let v = parse(html, "https://www.etsy.com/listing/1", "1"); + assert_eq!(v["title"], "Handmade Ceramic Mug"); + assert_eq!(v["price"], "24.00"); + assert_eq!(v["currency"], "USD"); + assert_eq!(v["availability"], "InStock"); + assert_eq!(v["item_condition"], "NewCondition"); + assert_eq!(v["shop"], "StudioClay"); + assert_eq!(v["shop_url"], "https://www.etsy.com/shop/StudioClay"); + assert_eq!(v["brand"], "Studio Clay"); + assert_eq!(v["aggregate_rating"]["rating_value"], "4.9"); + assert_eq!(v["aggregate_rating"]["review_count"], "127"); + } + + #[test] + fn parse_handles_aggregate_offer_price_range() { + let html = r##" + +"##; + let v = parse(html, "https://www.etsy.com/listing/2", "2"); + assert_eq!(v["low_price"], "18.00"); + assert_eq!(v["high_price"], "36.00"); + assert_eq!(v["currency"], "USD"); + } + + #[test] + fn parse_falls_back_to_og_when_no_jsonld() { + let html = r#" + + + + +"#; + let v = parse(html, "https://www.etsy.com/listing/3", "3"); + assert_eq!(v["title"], "Minimal Fallback Item"); + assert_eq!(v["description"], "OG-only extraction test."); + assert_eq!(v["image"], "https://i.etsystatic.com/fallback.jpg"); + // No price fields when we only have OG. + assert!(v["price"].is_null()); + } +} diff --git a/crates/webclaw-fetch/src/extractors/mod.rs b/crates/webclaw-fetch/src/extractors/mod.rs index 510adc0..5d06158 100644 --- a/crates/webclaw-fetch/src/extractors/mod.rs +++ b/crates/webclaw-fetch/src/extractors/mod.rs @@ -21,6 +21,7 @@ pub mod dev_to; pub mod docker_hub; pub mod ebay_listing; pub mod ecommerce_product; +pub mod etsy_listing; pub mod github_issue; pub mod github_pr; pub mod github_release; @@ -92,6 +93,7 @@ pub fn list() -> Vec { woocommerce_product::INFO, amazon_product::INFO, ebay_listing::INFO, + etsy_listing::INFO, trustpilot_reviews::INFO, ] } @@ -243,6 +245,13 @@ pub async fn dispatch_by_url( .map(|v| (ebay_listing::INFO.name, v)), ); } + if etsy_listing::matches(url) { + return Some( + etsy_listing::extract(client, url) + .await + .map(|v| (etsy_listing::INFO.name, v)), + ); + } if trustpilot_reviews::matches(url) { return Some( trustpilot_reviews::extract(client, url) @@ -400,6 +409,12 @@ pub async fn dispatch_by_name( }) .await } + n if n == etsy_listing::INFO.name => { + run_or_mismatch(etsy_listing::matches(url), n, url, || { + etsy_listing::extract(client, url) + }) + .await + } n if n == trustpilot_reviews::INFO.name => { run_or_mismatch(trustpilot_reviews::matches(url), n, url, || { trustpilot_reviews::extract(client, url) diff --git a/crates/webclaw-fetch/src/extractors/substack_post.rs b/crates/webclaw-fetch/src/extractors/substack_post.rs index 03ccbe8..0571f3d 100644 --- a/crates/webclaw-fetch/src/extractors/substack_post.rs +++ b/crates/webclaw-fetch/src/extractors/substack_post.rs @@ -10,18 +10,32 @@ //! "URL has `/p/{slug}`" because that's the canonical Substack post //! path. Explicit-call only because the `/p/{slug}` URL shape is //! used by non-Substack sites too. +//! +//! ## Fallback +//! +//! The API endpoint is rate-limited aggressively on popular publications +//! and occasionally returns 403 on custom domains with Cloudflare in +//! front. When that happens we escalate to an HTML fetch (via +//! `smart_fetch_html`, so antibot-protected custom domains still work) +//! and extract OG tags + Article JSON-LD for a degraded-but-useful +//! payload. The response shape stays stable across both paths; a +//! `data_source` field tells the caller which branch ran. +use std::sync::OnceLock; + +use regex::Regex; use serde::Deserialize; use serde_json::{Value, json}; use super::ExtractorInfo; use crate::client::FetchClient; +use crate::cloud::{self, CloudError}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { name: "substack_post", label: "Substack post", - description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.", + description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API. Falls back to OG + JSON-LD HTML parsing when the API is rate-limited.", url_patterns: &[ "https://{pub}.substack.com/p/{slug}", "https://{custom-domain}/p/{slug}", @@ -51,32 +65,55 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result match serde_json::from_str::(&resp.html) { + Ok(p) => Ok(build_api_payload(url, &api_url, &slug, p)), + Err(e) => { + // API returned 200 but the body isn't the Post shape we + // expect. Could be a custom-domain site that exposes + // something else at /api/v1/posts/. Fall back to HTML + // rather than hard-failing. + html_fallback( + client, + url, + &api_url, + &slug, + Some(format!( + "api returned 200 but body was not Substack JSON ({e})" + )), + ) + .await + } + }, + 404 => Err(FetchError::Build(format!( "substack_post: '{slug}' not found on {host} (got 404). \ If the publication isn't actually on Substack, use /v1/scrape instead." - ))); - } - if resp.status != 200 { - return Err(FetchError::Build(format!( - "substack returned status {} for {api_url}", - resp.status - ))); + ))), + _ => { + // Rate limit, 403, 5xx, whatever: try HTML. + let reason = format!("api returned status {} for {api_url}", resp.status); + html_fallback(client, url, &api_url, &slug, Some(reason)).await + } } +} - let p: Post = serde_json::from_str(&resp.html).map_err(|e| { - FetchError::BodyDecode(format!( - "substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})" - )) - })?; +// --------------------------------------------------------------------------- +// API-path payload builder +// --------------------------------------------------------------------------- - Ok(json!({ +fn build_api_payload(url: &str, api_url: &str, slug: &str, p: Post) -> Value { + json!({ "url": url, "api_url": api_url, + "data_source": "api", "id": p.id, "type": p.r#type, - "slug": p.slug, + "slug": p.slug.or_else(|| Some(slug.to_string())), "title": p.title, "subtitle": p.subtitle, "description": p.description, @@ -104,7 +141,117 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result>(), - })) + }) +} + +// --------------------------------------------------------------------------- +// HTML fallback: OG + Article JSON-LD +// --------------------------------------------------------------------------- + +async fn html_fallback( + client: &FetchClient, + url: &str, + api_url: &str, + slug: &str, + fallback_reason: Option, +) -> Result { + let fetched = cloud::smart_fetch_html(client, client.cloud(), url) + .await + .map_err(cloud_to_fetch_err)?; + + let mut data = parse_html(&fetched.html, url, api_url, slug); + if let Some(obj) = data.as_object_mut() { + obj.insert( + "fetch_source".into(), + match fetched.source { + cloud::FetchSource::Local => json!("local"), + cloud::FetchSource::Cloud => json!("cloud"), + }, + ); + if let Some(reason) = fallback_reason { + obj.insert("fallback_reason".into(), json!(reason)); + } + } + Ok(data) +} + +/// Pure HTML parser. Pulls title, subtitle, description, cover image, +/// publish date, and authors from OG tags and Article JSON-LD. Kept +/// public so tests can exercise it with fixtures. +pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value { + let article = find_article_jsonld(html); + + let title = article + .as_ref() + .and_then(|v| get_text(v, "headline")) + .or_else(|| og(html, "title")); + let description = article + .as_ref() + .and_then(|v| get_text(v, "description")) + .or_else(|| og(html, "description")); + let cover_image = article + .as_ref() + .and_then(get_first_image) + .or_else(|| og(html, "image")); + let post_date = article + .as_ref() + .and_then(|v| get_text(v, "datePublished")) + .or_else(|| meta_property(html, "article:published_time")); + let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified")); + let publication_name = og(html, "site_name"); + let authors = article.as_ref().map(extract_authors).unwrap_or_default(); + + json!({ + "url": url, + "api_url": api_url, + "data_source": "html_fallback", + "slug": slug, + "title": title, + "subtitle": None::, + "description": description, + "canonical_url": canonical_url(html).or_else(|| Some(url.to_string())), + "post_date": post_date, + "updated_at": updated_at, + "cover_image": cover_image, + "body_html": None::, + "body_text": None::, + "word_count": None::, + "comment_count": None::, + "reactions": Value::Null, + "has_paywall": None::, + "is_free_preview": None::, + "publication": json!({ + "name": publication_name, + }), + "authors": authors, + }) +} + +fn extract_authors(v: &Value) -> Vec { + let Some(a) = v.get("author") else { + return Vec::new(); + }; + let one = |val: &Value| -> Option { + match val { + Value::String(s) => Some(json!({"name": s})), + Value::Object(_) => { + let name = val.get("name").and_then(|n| n.as_str())?; + let handle = val + .get("url") + .and_then(|u| u.as_str()) + .and_then(handle_from_author_url); + Some(json!({ + "name": name, + "handle": handle, + })) + } + _ => None, + } + }; + match a { + Value::Array(arr) => arr.iter().filter_map(one).collect(), + _ => one(a).into_iter().collect(), + } } // --------------------------------------------------------------------------- @@ -136,6 +283,139 @@ fn parse_slug(url: &str) -> Option { } } +/// Extract the Substack handle from an author URL like +/// `https://substack.com/@handle` or `https://pub.substack.com/@handle`. +/// +/// Returns `None` when the URL has no `@` segment (e.g. a non-Substack +/// author page) so we don't synthesise a fake handle. +fn handle_from_author_url(u: &str) -> Option { + let after = u.rsplit_once('@').map(|(_, tail)| tail)?; + let clean = after.split(['/', '?', '#']).next()?; + if clean.is_empty() { + None + } else { + Some(clean.to_string()) + } +} + +// --------------------------------------------------------------------------- +// HTML tag helpers +// --------------------------------------------------------------------------- + +fn og(html: &str, prop: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r#"(?i)]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap() + }); + for c in re.captures_iter(html) { + if c.get(1).is_some_and(|m| m.as_str() == prop) { + return c.get(2).map(|m| m.as_str().to_string()); + } + } + None +} + +/// Pull `` and +/// similar structured meta tags. +fn meta_property(html: &str, prop: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r#"(?i)]+property="([^"]+)"[^>]+content="([^"]+)""#).unwrap() + }); + for c in re.captures_iter(html) { + if c.get(1).is_some_and(|m| m.as_str() == prop) { + return c.get(2).map(|m| m.as_str().to_string()); + } + } + None +} + +fn canonical_url(html: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE + .get_or_init(|| Regex::new(r#"(?i)]+rel="canonical"[^>]+href="([^"]+)""#).unwrap()); + re.captures(html) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) +} + +// --------------------------------------------------------------------------- +// JSON-LD walkers (Article / NewsArticle) +// --------------------------------------------------------------------------- + +fn find_article_jsonld(html: &str) -> Option { + let blocks = webclaw_core::structured_data::extract_json_ld(html); + for b in blocks { + if let Some(found) = find_article_in(&b) { + return Some(found); + } + } + None +} + +fn find_article_in(v: &Value) -> Option { + if is_article_type(v) { + return Some(v.clone()); + } + if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) { + for item in graph { + if let Some(found) = find_article_in(item) { + return Some(found); + } + } + } + if let Some(arr) = v.as_array() { + for item in arr { + if let Some(found) = find_article_in(item) { + return Some(found); + } + } + } + None +} + +fn is_article_type(v: &Value) -> bool { + let Some(t) = v.get("@type") else { + return false; + }; + let is_art = |s: &str| { + matches!( + s, + "Article" | "NewsArticle" | "BlogPosting" | "SocialMediaPosting" + ) + }; + match t { + Value::String(s) => is_art(s), + Value::Array(arr) => arr.iter().any(|x| x.as_str().is_some_and(is_art)), + _ => false, + } +} + +fn get_text(v: &Value, key: &str) -> Option { + v.get(key).and_then(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Number(n) => Some(n.to_string()), + _ => None, + }) +} + +fn get_first_image(v: &Value) -> Option { + match v.get("image")? { + Value::String(s) => Some(s.clone()), + Value::Array(arr) => arr.iter().find_map(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Object(_) => x.get("url").and_then(|u| u.as_str()).map(String::from), + _ => None, + }), + Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from), + _ => None, + } +} + +fn cloud_to_fetch_err(e: CloudError) -> FetchError { + FetchError::Build(e.to_string()) +} + // --------------------------------------------------------------------------- // Substack API types (subset) // --------------------------------------------------------------------------- @@ -210,4 +490,76 @@ mod tests { Some("my-post".into()) ); } + + #[test] + fn parse_html_extracts_from_og_tags() { + let html = r##" + + + + + + + +"##; + let v = parse_html( + html, + "https://mypub.substack.com/p/my-post", + "https://mypub.substack.com/api/v1/posts/my-post", + "my-post", + ); + assert_eq!(v["data_source"], "html_fallback"); + assert_eq!(v["title"], "My Great Post"); + assert_eq!(v["description"], "A short summary."); + assert_eq!(v["cover_image"], "https://cdn.substack.com/cover.jpg"); + assert_eq!(v["post_date"], "2025-09-01T10:00:00Z"); + assert_eq!(v["publication"]["name"], "My Publication"); + assert_eq!(v["canonical_url"], "https://mypub.substack.com/p/my-post"); + } + + #[test] + fn parse_html_prefers_jsonld_when_present() { + let html = r##" + + + +"##; + let v = parse_html( + html, + "https://example.com/p/a", + "https://example.com/api/v1/posts/a", + "a", + ); + assert_eq!(v["title"], "JSON-LD Title"); + assert_eq!(v["description"], "JSON-LD desc."); + assert_eq!(v["cover_image"], "https://cdn.substack.com/hero.jpg"); + assert_eq!(v["post_date"], "2025-10-12T08:30:00Z"); + assert_eq!(v["updated_at"], "2025-10-12T09:00:00Z"); + assert_eq!(v["authors"][0]["name"], "Alice Author"); + assert_eq!(v["authors"][0]["handle"], "alice"); + } + + #[test] + fn handle_from_author_url_pulls_handle() { + assert_eq!( + handle_from_author_url("https://substack.com/@alice"), + Some("alice".into()) + ); + assert_eq!( + handle_from_author_url("https://mypub.substack.com/@bob/"), + Some("bob".into()) + ); + assert_eq!( + handle_from_author_url("https://not-substack.com/author/carol"), + None + ); + } } diff --git a/crates/webclaw-fetch/src/extractors/youtube_video.rs b/crates/webclaw-fetch/src/extractors/youtube_video.rs index c37230a..81079f4 100644 --- a/crates/webclaw-fetch/src/extractors/youtube_video.rs +++ b/crates/webclaw-fetch/src/extractors/youtube_video.rs @@ -9,7 +9,19 @@ //! //! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/` //! shape is stable. +//! +//! ## Fallback +//! +//! `ytInitialPlayerResponse` is missing on EU-consent interstitials, +//! some live-stream pre-show pages, and age-gated videos. In those +//! cases we drop down to OG tags for `title`, `description`, +//! `thumbnail`, and `channel`, and return a `data_source: +//! "og_fallback"` payload so the caller can tell they got a degraded +//! shape (no view count, duration, captions). +use std::sync::OnceLock; + +use regex::Regex; use serde_json::{Value, json}; use super::ExtractorInfo; @@ -19,7 +31,7 @@ use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { name: "youtube_video", label: "YouTube video", - description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs.", + description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs. Falls back to OG metadata on consent / age-gate pages.", url_patterns: &[ "https://www.youtube.com/watch?v={id}", "https://youtu.be/{id}", @@ -49,12 +61,28 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result Value { let video_details = player.get("videoDetails"); let microformat = player .get("microformat") @@ -73,7 +101,7 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result = caption_tracks .iter() .map(|c| { @@ -85,9 +113,10 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result Result Value { + let title = og(html, "title"); + let description = og(html, "description"); + let thumbnail = og(html, "image"); + // YouTube sets `` on some pages but + // OG-only pages reliably carry `og:video:tag` and the channel in + // ``. We keep this lean: just what's stable. + let channel = meta_name(html, "author"); + + json!({ + "url": url, + "canonical_url":canonical, + "data_source": "og_fallback", + "video_id": video_id, + "title": title, + "description": description, + "author": channel, + // OG path: these are null so the caller doesn't have to guess. + "channel_id": None::, + "channel_url": None::, + "view_count": None::, + "length_seconds": None::, + "is_live": None::, + "is_private": None::, + "is_unlisted": None::, + "allow_ratings":None::, + "category": None::, + "upload_date": None::, + "publish_date": None::, + "keywords": Vec::::new(), + "thumbnails": thumbnail.as_ref().map(|t| vec![json!({"url": t})]).unwrap_or_default(), + "caption_tracks": Vec::::new(), + }) } // --------------------------------------------------------------------------- @@ -166,8 +234,6 @@ fn parse_video_id(url: &str) -> Option { // --------------------------------------------------------------------------- fn extract_player_response(html: &str) -> Option { - use regex::Regex; - use std::sync::OnceLock; // Same regex as webclaw_core::youtube. Duplicated here because // core's regex is module-private. Kept in lockstep; changes are // rare and we cover with tests in both places. @@ -178,6 +244,36 @@ fn extract_player_response(html: &str) -> Option { serde_json::from_str(json_str).ok() } +// --------------------------------------------------------------------------- +// Meta-tag helpers (for OG fallback) +// --------------------------------------------------------------------------- + +fn og(html: &str, prop: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r#"(?i)]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap() + }); + for c in re.captures_iter(html) { + if c.get(1).is_some_and(|m| m.as_str() == prop) { + return c.get(2).map(|m| m.as_str().to_string()); + } + } + None +} + +fn meta_name(html: &str, name: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r#"(?i)]+name="([^"]+)"[^>]+content="([^"]+)""#).unwrap() + }); + for c in re.captures_iter(html) { + if c.get(1).is_some_and(|m| m.as_str() == name) { + return c.get(2).map(|m| m.as_str().to_string()); + } + } + None +} + fn get_str(v: Option<&Value>, key: &str) -> Option { v.and_then(|x| x.get(key)) .and_then(|x| x.as_str().map(String::from)) @@ -252,4 +348,31 @@ var ytInitialPlayerResponse = {"videoDetails":{"videoId":"abc","title":"T","auth let vd = v.get("videoDetails").unwrap(); assert_eq!(vd.get("title").unwrap().as_str(), Some("T")); } + + #[test] + fn og_fallback_extracts_basics_from_meta_tags() { + let html = r##" + + + + + +"##; + let v = build_og_fallback( + html, + "https://www.youtube.com/watch?v=abc", + "https://www.youtube.com/watch?v=abc", + "abc", + ); + assert_eq!(v["data_source"], "og_fallback"); + assert_eq!(v["title"], "Example Video Title"); + assert_eq!(v["description"], "A cool video description."); + assert_eq!(v["author"], "Example Channel"); + assert_eq!( + v["thumbnails"][0]["url"], + "https://i.ytimg.com/vi/abc/maxresdefault.jpg" + ); + assert!(v["view_count"].is_null()); + assert!(v["caption_tracks"].as_array().unwrap().is_empty()); + } }