mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat(extractors): wave 6b, etsy_listing + HTML fallbacks for substack/youtube
Adds etsy_listing and hardens two existing extractors with HTML fallbacks
so transient API failures still return useful data.
New:
- etsy_listing: /listing/{id}(/slug) with Schema.org Product JSON-LD +
OG fallback. Antibot-gated, routes through cloud::smart_fetch_html
like amazon_product and ebay_listing. Auto-dispatched (etsy host is
unique).
Hardened:
- substack_post: when /api/v1/posts/{slug} returns non-200 (rate limit,
403 on hardened custom domains, 5xx), fall back to HTML fetch and
parse OG tags + Article JSON-LD. Response shape is stable across
both paths, with a `data_source` field of "api" or "html_fallback".
- youtube_video: when ytInitialPlayerResponse is missing (EU-consent
interstitial, age-gated, some live pre-shows), fall back to OG tags
for title/description/thumbnail. `data_source` now "player_response"
or "og_fallback".
Tests: 91 passing in webclaw-fetch (9 new), clippy clean.
This commit is contained in:
parent
8cc727c2f2
commit
7f5eb93b65
4 changed files with 910 additions and 29 deletions
391
crates/webclaw-fetch/src/extractors/etsy_listing.rs
Normal file
391
crates/webclaw-fetch/src/extractors/etsy_listing.rs
Normal file
|
|
@ -0,0 +1,391 @@
|
||||||
|
//! Etsy listing extractor.
|
||||||
|
//!
|
||||||
|
//! Etsy product pages at `etsy.com/listing/{id}` (and a sluggy variant
|
||||||
|
//! `etsy.com/listing/{id}/{slug}`) ship a Schema.org `Product` JSON-LD
|
||||||
|
//! block with title, price, currency, availability, shop seller, and
|
||||||
|
//! an `AggregateRating` for the listing.
|
||||||
|
//!
|
||||||
|
//! Etsy puts Cloudflare + custom WAF in front of product pages with a
|
||||||
|
//! high variance: the Firefox profile gets clean HTML most of the time
|
||||||
|
//! but some listings return a CF interstitial. We route through
|
||||||
|
//! `cloud::smart_fetch_html` so both paths resolve to the same parser,
|
||||||
|
//! same as `ebay_listing`.
|
||||||
|
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
|
use super::ExtractorInfo;
|
||||||
|
use crate::client::FetchClient;
|
||||||
|
use crate::cloud::{self, CloudError};
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
pub const INFO: ExtractorInfo = ExtractorInfo {
|
||||||
|
name: "etsy_listing",
|
||||||
|
label: "Etsy listing",
|
||||||
|
description: "Returns listing title, price, currency, availability, shop, rating, and image. Heavy listings may need WEBCLAW_API_KEY for antibot.",
|
||||||
|
url_patterns: &[
|
||||||
|
"https://www.etsy.com/listing/{id}",
|
||||||
|
"https://www.etsy.com/listing/{id}/{slug}",
|
||||||
|
"https://www.etsy.com/{locale}/listing/{id}",
|
||||||
|
],
|
||||||
|
};
|
||||||
|
|
||||||
|
pub fn matches(url: &str) -> bool {
|
||||||
|
let host = host_of(url);
|
||||||
|
if !is_etsy_host(host) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
parse_listing_id(url).is_some()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
|
||||||
|
let listing_id = parse_listing_id(url)
|
||||||
|
.ok_or_else(|| FetchError::Build(format!("etsy_listing: no listing id in '{url}'")))?;
|
||||||
|
|
||||||
|
let fetched = cloud::smart_fetch_html(client, client.cloud(), url)
|
||||||
|
.await
|
||||||
|
.map_err(cloud_to_fetch_err)?;
|
||||||
|
|
||||||
|
let mut data = parse(&fetched.html, url, &listing_id);
|
||||||
|
if let Some(obj) = data.as_object_mut() {
|
||||||
|
obj.insert(
|
||||||
|
"data_source".into(),
|
||||||
|
match fetched.source {
|
||||||
|
cloud::FetchSource::Local => json!("local"),
|
||||||
|
cloud::FetchSource::Cloud => json!("cloud"),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pure HTML parser for an Etsy listing page.
///
/// Prefers the Schema.org `Product` JSON-LD block; each of title,
/// description, and image independently falls back to the matching OG
/// meta tag when missing from the JSON-LD. Kept public so tests can
/// exercise it with HTML fixtures.
pub fn parse(html: &str, url: &str, listing_id: &str) -> Value {
    let jsonld = find_product_jsonld(html);

    let title = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "name"))
        .or_else(|| og(html, "title"));
    let description = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "description"))
        .or_else(|| og(html, "description"));
    let image = jsonld
        .as_ref()
        .and_then(get_first_image)
        .or_else(|| og(html, "image"));
    let brand = jsonld.as_ref().and_then(get_brand);

    // Etsy listings often ship either a single Offer or an
    // AggregateOffer when the listing has variants with different prices.
    // A single Offer populates `price`; an AggregateOffer populates the
    // lowPrice/highPrice pair instead — callers get whichever exists.
    let offer = jsonld.as_ref().and_then(first_offer);
    let (low_price, high_price, single_price) = match offer.as_ref() {
        Some(o) => (
            get_text(o, "lowPrice"),
            get_text(o, "highPrice"),
            get_text(o, "price"),
        ),
        None => (None, None, None),
    };
    let currency = offer.as_ref().and_then(|o| get_text(o, "priceCurrency"));
    // Availability / condition arrive as schema.org URLs
    // (e.g. "https://schema.org/InStock"); keep only the bare token.
    let availability = offer
        .as_ref()
        .and_then(|o| get_text(o, "availability").map(strip_schema_prefix));
    let item_condition = jsonld
        .as_ref()
        .and_then(|v| get_text(v, "itemCondition"))
        .map(strip_schema_prefix);

    // Shop name lives under offers[0].seller.name on Etsy.
    let shop = offer.as_ref().and_then(|o| {
        o.get("seller")
            .and_then(|s| s.get("name"))
            .and_then(|n| n.as_str())
            .map(String::from)
    });
    // Shop URL is not in the JSON-LD; scrape the first /shop/ anchor instead.
    let shop_url = shop_url_from_html(html);

    let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating);

    // Fields absent from the page serialize as JSON null, so the response
    // shape is stable regardless of which fallbacks fired.
    json!({
        "url": url,
        "listing_id": listing_id,
        "title": title,
        "description": description,
        "image": image,
        "brand": brand,
        "price": single_price,
        "low_price": low_price,
        "high_price": high_price,
        "currency": currency,
        "availability": availability,
        "item_condition": item_condition,
        "shop": shop,
        "shop_url": shop_url,
        "aggregate_rating": aggregate_rating,
    })
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// URL helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Return the host portion of `url`: everything after the first `://`
/// (or the whole string when no scheme is present) up to the first `/`.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let after_scheme = pieces.nth(1).unwrap_or(url);
    after_scheme.split('/').next().unwrap_or("")
}
|
||||||
|
|
||||||
|
/// True for the bare apex (`etsy.com`) and any subdomain of it.
///
/// The original also tested `host == "www.etsy.com"` explicitly, but that
/// case is already covered by `ends_with(".etsy.com")`, so the redundant
/// clause is dropped. Behavior is unchanged.
fn is_etsy_host(host: &str) -> bool {
    host == "etsy.com" || host.ends_with(".etsy.com")
}
|
||||||
|
|
||||||
|
/// Extract the numeric listing id. Etsy ids are 9-11 digits today but
|
||||||
|
/// we accept any all-digit segment right after `/listing/`.
|
||||||
|
///
|
||||||
|
/// Handles `/listing/{id}`, `/listing/{id}/{slug}`, and the localised
|
||||||
|
/// `/{locale}/listing/{id}` shape (e.g. `/fr/listing/...`).
|
||||||
|
fn parse_listing_id(url: &str) -> Option<String> {
|
||||||
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||||||
|
let re = RE.get_or_init(|| Regex::new(r"/listing/(\d{6,})(?:[/?#]|$)").unwrap());
|
||||||
|
re.captures(url)
|
||||||
|
.and_then(|c| c.get(1))
|
||||||
|
.map(|m| m.as_str().to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// JSON-LD walkers (same shape as ebay_listing; kept separate so the two
|
||||||
|
// extractors can diverge without cross-impact)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn find_product_jsonld(html: &str) -> Option<Value> {
|
||||||
|
let blocks = webclaw_core::structured_data::extract_json_ld(html);
|
||||||
|
for b in blocks {
|
||||||
|
if let Some(found) = find_product_in(&b) {
|
||||||
|
return Some(found);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_product_in(v: &Value) -> Option<Value> {
|
||||||
|
if is_product_type(v) {
|
||||||
|
return Some(v.clone());
|
||||||
|
}
|
||||||
|
if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) {
|
||||||
|
for item in graph {
|
||||||
|
if let Some(found) = find_product_in(item) {
|
||||||
|
return Some(found);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(arr) = v.as_array() {
|
||||||
|
for item in arr {
|
||||||
|
if let Some(found) = find_product_in(item) {
|
||||||
|
return Some(found);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_product_type(v: &Value) -> bool {
|
||||||
|
let Some(t) = v.get("@type") else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
let is_prod = |s: &str| matches!(s, "Product" | "ProductGroup" | "IndividualProduct");
|
||||||
|
match t {
|
||||||
|
Value::String(s) => is_prod(s),
|
||||||
|
Value::Array(arr) => arr.iter().any(|x| x.as_str().is_some_and(is_prod)),
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_text(v: &Value, key: &str) -> Option<String> {
|
||||||
|
v.get(key).and_then(|x| match x {
|
||||||
|
Value::String(s) => Some(s.clone()),
|
||||||
|
Value::Number(n) => Some(n.to_string()),
|
||||||
|
_ => None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_brand(v: &Value) -> Option<String> {
|
||||||
|
let brand = v.get("brand")?;
|
||||||
|
if let Some(s) = brand.as_str() {
|
||||||
|
return Some(s.to_string());
|
||||||
|
}
|
||||||
|
brand
|
||||||
|
.as_object()
|
||||||
|
.and_then(|o| o.get("name"))
|
||||||
|
.and_then(|n| n.as_str())
|
||||||
|
.map(String::from)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_first_image(v: &Value) -> Option<String> {
|
||||||
|
match v.get("image")? {
|
||||||
|
Value::String(s) => Some(s.clone()),
|
||||||
|
Value::Array(arr) => arr.iter().find_map(|x| match x {
|
||||||
|
Value::String(s) => Some(s.clone()),
|
||||||
|
Value::Object(_) => x.get("url").and_then(|u| u.as_str()).map(String::from),
|
||||||
|
_ => None,
|
||||||
|
}),
|
||||||
|
Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn first_offer(v: &Value) -> Option<Value> {
|
||||||
|
let offers = v.get("offers")?;
|
||||||
|
match offers {
|
||||||
|
Value::Array(arr) => arr.first().cloned(),
|
||||||
|
Value::Object(_) => Some(offers.clone()),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_aggregate_rating(v: &Value) -> Option<Value> {
|
||||||
|
let r = v.get("aggregateRating")?;
|
||||||
|
Some(json!({
|
||||||
|
"rating_value": get_text(r, "ratingValue"),
|
||||||
|
"review_count": get_text(r, "reviewCount"),
|
||||||
|
"best_rating": get_text(r, "bestRating"),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drop the schema.org URL prefix from enum-ish values so callers see
/// bare tokens like `InStock` instead of `https://schema.org/InStock`.
fn strip_schema_prefix(s: String) -> String {
    ["http://schema.org/", "https://schema.org/"]
        .iter()
        .fold(s, |acc, prefix| acc.replace(prefix, ""))
}
|
||||||
|
|
||||||
|
fn og(html: &str, prop: &str) -> Option<String> {
|
||||||
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||||||
|
let re = RE.get_or_init(|| {
|
||||||
|
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||||
|
});
|
||||||
|
for c in re.captures_iter(html) {
|
||||||
|
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||||
|
return c.get(2).map(|m| m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Etsy links the owning shop with a canonical anchor like
|
||||||
|
/// `<a href="/shop/ShopName" ...>`. Grab the first one after the
|
||||||
|
/// breadcrumb boundary.
|
||||||
|
fn shop_url_from_html(html: &str) -> Option<String> {
|
||||||
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||||||
|
let re = RE.get_or_init(|| Regex::new(r#"href="(/shop/[A-Za-z0-9_-]+)""#).unwrap());
|
||||||
|
re.captures(html)
|
||||||
|
.and_then(|c| c.get(1))
|
||||||
|
.map(|m| format!("https://www.etsy.com{}", m.as_str()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
|
||||||
|
FetchError::Build(e.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // URL gating: only etsy.com hosts with a numeric /listing/{id}
    // segment should be claimed by this extractor.
    #[test]
    fn matches_etsy_listing_urls() {
        assert!(matches("https://www.etsy.com/listing/123456789"));
        assert!(matches(
            "https://www.etsy.com/listing/123456789/vintage-typewriter"
        ));
        assert!(matches(
            "https://www.etsy.com/fr/listing/123456789/vintage-typewriter"
        ));
        assert!(!matches("https://www.etsy.com/"));
        assert!(!matches("https://www.etsy.com/shop/SomeShop"));
        assert!(!matches("https://example.com/listing/123456789"));
    }

    // The id must survive trailing slugs, locale prefixes, and query strings.
    #[test]
    fn parse_listing_id_handles_slug_and_locale() {
        assert_eq!(
            parse_listing_id("https://www.etsy.com/listing/123456789"),
            Some("123456789".into())
        );
        assert_eq!(
            parse_listing_id("https://www.etsy.com/listing/123456789/slug-here"),
            Some("123456789".into())
        );
        assert_eq!(
            parse_listing_id("https://www.etsy.com/fr/listing/123456789/slug"),
            Some("123456789".into())
        );
        assert_eq!(
            parse_listing_id("https://www.etsy.com/listing/123456789?ref=foo"),
            Some("123456789".into())
        );
    }

    // Happy path: a single-Offer Product JSON-LD block plus a /shop/ anchor.
    #[test]
    fn parse_extracts_from_fixture_jsonld() {
        let html = r##"
<html><head>
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"Product",
"name":"Handmade Ceramic Mug","sku":"MUG-001",
"brand":{"@type":"Brand","name":"Studio Clay"},
"image":["https://i.etsystatic.com/abc.jpg","https://i.etsystatic.com/xyz.jpg"],
"itemCondition":"https://schema.org/NewCondition",
"offers":{"@type":"Offer","price":"24.00","priceCurrency":"USD",
"availability":"https://schema.org/InStock",
"seller":{"@type":"Organization","name":"StudioClay"}},
"aggregateRating":{"@type":"AggregateRating","ratingValue":"4.9","reviewCount":"127","bestRating":"5"}}
</script>
<a href="/shop/StudioClay" class="wt-text-link">StudioClay</a>
</head></html>"##;
        let v = parse(html, "https://www.etsy.com/listing/1", "1");
        assert_eq!(v["title"], "Handmade Ceramic Mug");
        assert_eq!(v["price"], "24.00");
        assert_eq!(v["currency"], "USD");
        // Schema.org URL prefixes must be stripped down to bare tokens.
        assert_eq!(v["availability"], "InStock");
        assert_eq!(v["item_condition"], "NewCondition");
        assert_eq!(v["shop"], "StudioClay");
        assert_eq!(v["shop_url"], "https://www.etsy.com/shop/StudioClay");
        assert_eq!(v["brand"], "Studio Clay");
        assert_eq!(v["aggregate_rating"]["rating_value"], "4.9");
        assert_eq!(v["aggregate_rating"]["review_count"], "127");
    }

    // Variant listings ship an AggregateOffer with a low/high price range
    // instead of a single price.
    #[test]
    fn parse_handles_aggregate_offer_price_range() {
        let html = r##"
<script type="application/ld+json">
{"@type":"Product","name":"Mug Set",
"offers":{"@type":"AggregateOffer",
"lowPrice":"18.00","highPrice":"36.00","priceCurrency":"USD"}}
</script>
"##;
        let v = parse(html, "https://www.etsy.com/listing/2", "2");
        assert_eq!(v["low_price"], "18.00");
        assert_eq!(v["high_price"], "36.00");
        assert_eq!(v["currency"], "USD");
    }

    // Without JSON-LD, title/description/image fall back to OG tags and
    // price fields stay null.
    #[test]
    fn parse_falls_back_to_og_when_no_jsonld() {
        let html = r#"
<html><head>
<meta property="og:title" content="Minimal Fallback Item">
<meta property="og:description" content="OG-only extraction test.">
<meta property="og:image" content="https://i.etsystatic.com/fallback.jpg">
</head></html>"#;
        let v = parse(html, "https://www.etsy.com/listing/3", "3");
        assert_eq!(v["title"], "Minimal Fallback Item");
        assert_eq!(v["description"], "OG-only extraction test.");
        assert_eq!(v["image"], "https://i.etsystatic.com/fallback.jpg");
        // No price fields when we only have OG.
        assert!(v["price"].is_null());
    }
}
|
||||||
|
|
@ -21,6 +21,7 @@ pub mod dev_to;
|
||||||
pub mod docker_hub;
|
pub mod docker_hub;
|
||||||
pub mod ebay_listing;
|
pub mod ebay_listing;
|
||||||
pub mod ecommerce_product;
|
pub mod ecommerce_product;
|
||||||
|
pub mod etsy_listing;
|
||||||
pub mod github_issue;
|
pub mod github_issue;
|
||||||
pub mod github_pr;
|
pub mod github_pr;
|
||||||
pub mod github_release;
|
pub mod github_release;
|
||||||
|
|
@ -92,6 +93,7 @@ pub fn list() -> Vec<ExtractorInfo> {
|
||||||
woocommerce_product::INFO,
|
woocommerce_product::INFO,
|
||||||
amazon_product::INFO,
|
amazon_product::INFO,
|
||||||
ebay_listing::INFO,
|
ebay_listing::INFO,
|
||||||
|
etsy_listing::INFO,
|
||||||
trustpilot_reviews::INFO,
|
trustpilot_reviews::INFO,
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
@ -243,6 +245,13 @@ pub async fn dispatch_by_url(
|
||||||
.map(|v| (ebay_listing::INFO.name, v)),
|
.map(|v| (ebay_listing::INFO.name, v)),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if etsy_listing::matches(url) {
|
||||||
|
return Some(
|
||||||
|
etsy_listing::extract(client, url)
|
||||||
|
.await
|
||||||
|
.map(|v| (etsy_listing::INFO.name, v)),
|
||||||
|
);
|
||||||
|
}
|
||||||
if trustpilot_reviews::matches(url) {
|
if trustpilot_reviews::matches(url) {
|
||||||
return Some(
|
return Some(
|
||||||
trustpilot_reviews::extract(client, url)
|
trustpilot_reviews::extract(client, url)
|
||||||
|
|
@ -400,6 +409,12 @@ pub async fn dispatch_by_name(
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
n if n == etsy_listing::INFO.name => {
|
||||||
|
run_or_mismatch(etsy_listing::matches(url), n, url, || {
|
||||||
|
etsy_listing::extract(client, url)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
n if n == trustpilot_reviews::INFO.name => {
|
n if n == trustpilot_reviews::INFO.name => {
|
||||||
run_or_mismatch(trustpilot_reviews::matches(url), n, url, || {
|
run_or_mismatch(trustpilot_reviews::matches(url), n, url, || {
|
||||||
trustpilot_reviews::extract(client, url)
|
trustpilot_reviews::extract(client, url)
|
||||||
|
|
|
||||||
|
|
@ -10,18 +10,32 @@
|
||||||
//! "URL has `/p/{slug}`" because that's the canonical Substack post
|
//! "URL has `/p/{slug}`" because that's the canonical Substack post
|
||||||
//! path. Explicit-call only because the `/p/{slug}` URL shape is
|
//! path. Explicit-call only because the `/p/{slug}` URL shape is
|
||||||
//! used by non-Substack sites too.
|
//! used by non-Substack sites too.
|
||||||
|
//!
|
||||||
|
//! ## Fallback
|
||||||
|
//!
|
||||||
|
//! The API endpoint is rate-limited aggressively on popular publications
|
||||||
|
//! and occasionally returns 403 on custom domains with Cloudflare in
|
||||||
|
//! front. When that happens we escalate to an HTML fetch (via
|
||||||
|
//! `smart_fetch_html`, so antibot-protected custom domains still work)
|
||||||
|
//! and extract OG tags + Article JSON-LD for a degraded-but-useful
|
||||||
|
//! payload. The response shape stays stable across both paths; a
|
||||||
|
//! `data_source` field tells the caller which branch ran.
|
||||||
|
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use serde_json::{Value, json};
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
use super::ExtractorInfo;
|
use super::ExtractorInfo;
|
||||||
use crate::client::FetchClient;
|
use crate::client::FetchClient;
|
||||||
|
use crate::cloud::{self, CloudError};
|
||||||
use crate::error::FetchError;
|
use crate::error::FetchError;
|
||||||
|
|
||||||
pub const INFO: ExtractorInfo = ExtractorInfo {
|
pub const INFO: ExtractorInfo = ExtractorInfo {
|
||||||
name: "substack_post",
|
name: "substack_post",
|
||||||
label: "Substack post",
|
label: "Substack post",
|
||||||
description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.",
|
description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API. Falls back to OG + JSON-LD HTML parsing when the API is rate-limited.",
|
||||||
url_patterns: &[
|
url_patterns: &[
|
||||||
"https://{pub}.substack.com/p/{slug}",
|
"https://{pub}.substack.com/p/{slug}",
|
||||||
"https://{custom-domain}/p/{slug}",
|
"https://{custom-domain}/p/{slug}",
|
||||||
|
|
@ -51,32 +65,55 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
|
||||||
"https"
|
"https"
|
||||||
};
|
};
|
||||||
let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
|
let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
|
||||||
|
|
||||||
|
// 1. Try the public API. 200 = full payload; 404 = real miss; any
|
||||||
|
// other status hands off to the HTML fallback so a transient rate
|
||||||
|
// limit or a hardened custom domain doesn't fail the whole call.
|
||||||
let resp = client.fetch(&api_url).await?;
|
let resp = client.fetch(&api_url).await?;
|
||||||
if resp.status == 404 {
|
match resp.status {
|
||||||
return Err(FetchError::Build(format!(
|
200 => match serde_json::from_str::<Post>(&resp.html) {
|
||||||
|
Ok(p) => Ok(build_api_payload(url, &api_url, &slug, p)),
|
||||||
|
Err(e) => {
|
||||||
|
// API returned 200 but the body isn't the Post shape we
|
||||||
|
// expect. Could be a custom-domain site that exposes
|
||||||
|
// something else at /api/v1/posts/. Fall back to HTML
|
||||||
|
// rather than hard-failing.
|
||||||
|
html_fallback(
|
||||||
|
client,
|
||||||
|
url,
|
||||||
|
&api_url,
|
||||||
|
&slug,
|
||||||
|
Some(format!(
|
||||||
|
"api returned 200 but body was not Substack JSON ({e})"
|
||||||
|
)),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
},
|
||||||
|
404 => Err(FetchError::Build(format!(
|
||||||
"substack_post: '{slug}' not found on {host} (got 404). \
|
"substack_post: '{slug}' not found on {host} (got 404). \
|
||||||
If the publication isn't actually on Substack, use /v1/scrape instead."
|
If the publication isn't actually on Substack, use /v1/scrape instead."
|
||||||
)));
|
))),
|
||||||
}
|
_ => {
|
||||||
if resp.status != 200 {
|
// Rate limit, 403, 5xx, whatever: try HTML.
|
||||||
return Err(FetchError::Build(format!(
|
let reason = format!("api returned status {} for {api_url}", resp.status);
|
||||||
"substack returned status {} for {api_url}",
|
html_fallback(client, url, &api_url, &slug, Some(reason)).await
|
||||||
resp.status
|
}
|
||||||
)));
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let p: Post = serde_json::from_str(&resp.html).map_err(|e| {
|
// ---------------------------------------------------------------------------
|
||||||
FetchError::BodyDecode(format!(
|
// API-path payload builder
|
||||||
"substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})"
|
// ---------------------------------------------------------------------------
|
||||||
))
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(json!({
|
fn build_api_payload(url: &str, api_url: &str, slug: &str, p: Post) -> Value {
|
||||||
|
json!({
|
||||||
"url": url,
|
"url": url,
|
||||||
"api_url": api_url,
|
"api_url": api_url,
|
||||||
|
"data_source": "api",
|
||||||
"id": p.id,
|
"id": p.id,
|
||||||
"type": p.r#type,
|
"type": p.r#type,
|
||||||
"slug": p.slug,
|
"slug": p.slug.or_else(|| Some(slug.to_string())),
|
||||||
"title": p.title,
|
"title": p.title,
|
||||||
"subtitle": p.subtitle,
|
"subtitle": p.subtitle,
|
||||||
"description": p.description,
|
"description": p.description,
|
||||||
|
|
@ -104,7 +141,117 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
|
||||||
"handle": a.handle,
|
"handle": a.handle,
|
||||||
"photo": a.photo_url,
|
"photo": a.photo_url,
|
||||||
})).collect::<Vec<_>>(),
|
})).collect::<Vec<_>>(),
|
||||||
}))
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// HTML fallback: OG + Article JSON-LD
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Fetch the post page as HTML (via `smart_fetch_html`, so antibot-gated
/// custom domains still resolve) and build the degraded OG/JSON-LD payload.
///
/// `fallback_reason`, when present, is surfaced as a `fallback_reason`
/// field so callers can see why the API path was abandoned.
async fn html_fallback(
    client: &FetchClient,
    url: &str,
    api_url: &str,
    slug: &str,
    fallback_reason: Option<String>,
) -> Result<Value, FetchError> {
    let fetched = cloud::smart_fetch_html(client, client.cloud(), url)
        .await
        .map_err(cloud_to_fetch_err)?;

    let mut data = parse_html(&fetched.html, url, api_url, slug);
    if let Some(obj) = data.as_object_mut() {
        // Record which fetch path produced the HTML (local client vs cloud
        // relay) alongside the parse result.
        obj.insert(
            "fetch_source".into(),
            match fetched.source {
                cloud::FetchSource::Local => json!("local"),
                cloud::FetchSource::Cloud => json!("cloud"),
            },
        );
        if let Some(reason) = fallback_reason {
            obj.insert("fallback_reason".into(), json!(reason));
        }
    }
    Ok(data)
}
|
||||||
|
|
||||||
|
/// Pure HTML parser. Pulls title, subtitle, description, cover image,
/// publish date, and authors from OG tags and Article JSON-LD. Kept
/// public so tests can exercise it with fixtures.
///
/// The payload mirrors the API path's shape: fields the HTML cannot
/// provide (body, counts, reactions, paywall flags) are explicitly null
/// so callers see a stable schema, and `data_source` is always
/// `"html_fallback"` on this path.
pub fn parse_html(html: &str, url: &str, api_url: &str, slug: &str) -> Value {
    let article = find_article_jsonld(html);

    // Each field prefers the Article JSON-LD, then falls back to the
    // matching meta tag independently.
    let title = article
        .as_ref()
        .and_then(|v| get_text(v, "headline"))
        .or_else(|| og(html, "title"));
    let description = article
        .as_ref()
        .and_then(|v| get_text(v, "description"))
        .or_else(|| og(html, "description"));
    let cover_image = article
        .as_ref()
        .and_then(get_first_image)
        .or_else(|| og(html, "image"));
    let post_date = article
        .as_ref()
        .and_then(|v| get_text(v, "datePublished"))
        .or_else(|| meta_property(html, "article:published_time"));
    let updated_at = article.as_ref().and_then(|v| get_text(v, "dateModified"));
    let publication_name = og(html, "site_name");
    let authors = article.as_ref().map(extract_authors).unwrap_or_default();

    json!({
        "url": url,
        "api_url": api_url,
        "data_source": "html_fallback",
        "slug": slug,
        "title": title,
        "subtitle": None::<String>,
        "description": description,
        // Prefer the page's own canonical link; fall back to the request URL.
        "canonical_url": canonical_url(html).or_else(|| Some(url.to_string())),
        "post_date": post_date,
        "updated_at": updated_at,
        "cover_image": cover_image,
        "body_html": None::<String>,
        "body_text": None::<String>,
        "word_count": None::<i64>,
        "comment_count": None::<i64>,
        "reactions": Value::Null,
        "has_paywall": None::<bool>,
        "is_free_preview": None::<bool>,
        "publication": json!({
            "name": publication_name,
        }),
        "authors": authors,
    })
}
|
||||||
|
|
||||||
|
fn extract_authors(v: &Value) -> Vec<Value> {
|
||||||
|
let Some(a) = v.get("author") else {
|
||||||
|
return Vec::new();
|
||||||
|
};
|
||||||
|
let one = |val: &Value| -> Option<Value> {
|
||||||
|
match val {
|
||||||
|
Value::String(s) => Some(json!({"name": s})),
|
||||||
|
Value::Object(_) => {
|
||||||
|
let name = val.get("name").and_then(|n| n.as_str())?;
|
||||||
|
let handle = val
|
||||||
|
.get("url")
|
||||||
|
.and_then(|u| u.as_str())
|
||||||
|
.and_then(handle_from_author_url);
|
||||||
|
Some(json!({
|
||||||
|
"name": name,
|
||||||
|
"handle": handle,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
match a {
|
||||||
|
Value::Array(arr) => arr.iter().filter_map(one).collect(),
|
||||||
|
_ => one(a).into_iter().collect(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
@ -136,6 +283,139 @@ fn parse_slug(url: &str) -> Option<String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extract the Substack handle from an author URL like
/// `https://substack.com/@handle` or `https://pub.substack.com/@handle`.
///
/// Returns `None` when the URL has no `@` segment (e.g. a non-Substack
/// author page) or the segment is empty, so we never synthesise a fake
/// handle.
fn handle_from_author_url(u: &str) -> Option<String> {
    let (_, tail) = u.rsplit_once('@')?;
    let handle = tail.split(['/', '?', '#']).next()?;
    (!handle.is_empty()).then(|| handle.to_string())
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// HTML tag helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn og(html: &str, prop: &str) -> Option<String> {
|
||||||
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||||||
|
let re = RE.get_or_init(|| {
|
||||||
|
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||||
|
});
|
||||||
|
for c in re.captures_iter(html) {
|
||||||
|
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||||
|
return c.get(2).map(|m| m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pull `<meta property="article:published_time" content="...">` and
|
||||||
|
/// similar structured meta tags.
|
||||||
|
fn meta_property(html: &str, prop: &str) -> Option<String> {
|
||||||
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||||||
|
let re = RE.get_or_init(|| {
|
||||||
|
Regex::new(r#"(?i)<meta[^>]+property="([^"]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||||
|
});
|
||||||
|
for c in re.captures_iter(html) {
|
||||||
|
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||||
|
return c.get(2).map(|m| m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
fn canonical_url(html: &str) -> Option<String> {
|
||||||
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||||||
|
let re = RE
|
||||||
|
.get_or_init(|| Regex::new(r#"(?i)<link[^>]+rel="canonical"[^>]+href="([^"]+)""#).unwrap());
|
||||||
|
re.captures(html)
|
||||||
|
.and_then(|c| c.get(1))
|
||||||
|
.map(|m| m.as_str().to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// JSON-LD walkers (Article / NewsArticle)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn find_article_jsonld(html: &str) -> Option<Value> {
|
||||||
|
let blocks = webclaw_core::structured_data::extract_json_ld(html);
|
||||||
|
for b in blocks {
|
||||||
|
if let Some(found) = find_article_in(&b) {
|
||||||
|
return Some(found);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_article_in(v: &Value) -> Option<Value> {
|
||||||
|
if is_article_type(v) {
|
||||||
|
return Some(v.clone());
|
||||||
|
}
|
||||||
|
if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) {
|
||||||
|
for item in graph {
|
||||||
|
if let Some(found) = find_article_in(item) {
|
||||||
|
return Some(found);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(arr) = v.as_array() {
|
||||||
|
for item in arr {
|
||||||
|
if let Some(found) = find_article_in(item) {
|
||||||
|
return Some(found);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_article_type(v: &Value) -> bool {
|
||||||
|
let Some(t) = v.get("@type") else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
let is_art = |s: &str| {
|
||||||
|
matches!(
|
||||||
|
s,
|
||||||
|
"Article" | "NewsArticle" | "BlogPosting" | "SocialMediaPosting"
|
||||||
|
)
|
||||||
|
};
|
||||||
|
match t {
|
||||||
|
Value::String(s) => is_art(s),
|
||||||
|
Value::Array(arr) => arr.iter().any(|x| x.as_str().is_some_and(is_art)),
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_text(v: &Value, key: &str) -> Option<String> {
|
||||||
|
v.get(key).and_then(|x| match x {
|
||||||
|
Value::String(s) => Some(s.clone()),
|
||||||
|
Value::Number(n) => Some(n.to_string()),
|
||||||
|
_ => None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_first_image(v: &Value) -> Option<String> {
|
||||||
|
match v.get("image")? {
|
||||||
|
Value::String(s) => Some(s.clone()),
|
||||||
|
Value::Array(arr) => arr.iter().find_map(|x| match x {
|
||||||
|
Value::String(s) => Some(s.clone()),
|
||||||
|
Value::Object(_) => x.get("url").and_then(|u| u.as_str()).map(String::from),
|
||||||
|
_ => None,
|
||||||
|
}),
|
||||||
|
Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cloud_to_fetch_err(e: CloudError) -> FetchError {
|
||||||
|
FetchError::Build(e.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Substack API types (subset)
|
// Substack API types (subset)
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
@ -210,4 +490,76 @@ mod tests {
|
||||||
Some("my-post".into())
|
Some("my-post".into())
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn parse_html_extracts_from_og_tags() {
    // Pure OG-tag page: no JSON-LD, so every field must come from meta tags.
    let page = r##"
<html><head>
<meta property="og:title" content="My Great Post">
<meta property="og:description" content="A short summary.">
<meta property="og:image" content="https://cdn.substack.com/cover.jpg">
<meta property="og:site_name" content="My Publication">
<meta property="article:published_time" content="2025-09-01T10:00:00Z">
<link rel="canonical" href="https://mypub.substack.com/p/my-post">
</head></html>"##;
    let out = parse_html(
        page,
        "https://mypub.substack.com/p/my-post",
        "https://mypub.substack.com/api/v1/posts/my-post",
        "my-post",
    );
    assert_eq!(out["data_source"], "html_fallback");
    assert_eq!(out["title"], "My Great Post");
    assert_eq!(out["description"], "A short summary.");
    assert_eq!(out["cover_image"], "https://cdn.substack.com/cover.jpg");
    assert_eq!(out["post_date"], "2025-09-01T10:00:00Z");
    assert_eq!(out["publication"]["name"], "My Publication");
    assert_eq!(out["canonical_url"], "https://mypub.substack.com/p/my-post");
}
|
||||||
|
|
||||||
|
#[test]
fn parse_html_prefers_jsonld_when_present() {
    // Page carries both an OG title and a NewsArticle JSON-LD block;
    // the structured data must win for every overlapping field.
    let page = r##"
<html><head>
<meta property="og:title" content="OG Title">
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"NewsArticle",
"headline":"JSON-LD Title",
"description":"JSON-LD desc.",
"image":"https://cdn.substack.com/hero.jpg",
"datePublished":"2025-10-12T08:30:00Z",
"dateModified":"2025-10-12T09:00:00Z",
"author":[{"@type":"Person","name":"Alice Author","url":"https://substack.com/@alice"}]}
</script>
</head></html>"##;
    let out = parse_html(
        page,
        "https://example.com/p/a",
        "https://example.com/api/v1/posts/a",
        "a",
    );
    assert_eq!(out["title"], "JSON-LD Title");
    assert_eq!(out["description"], "JSON-LD desc.");
    assert_eq!(out["cover_image"], "https://cdn.substack.com/hero.jpg");
    assert_eq!(out["post_date"], "2025-10-12T08:30:00Z");
    assert_eq!(out["updated_at"], "2025-10-12T09:00:00Z");
    assert_eq!(out["authors"][0]["name"], "Alice Author");
    assert_eq!(out["authors"][0]["handle"], "alice");
}
|
||||||
|
|
||||||
|
#[test]
fn handle_from_author_url_pulls_handle() {
    // Table-driven: substack hosts (bare and subdomain, with or without a
    // trailing slash) yield a handle; foreign hosts yield None.
    let cases = [
        ("https://substack.com/@alice", Some("alice".to_string())),
        ("https://mypub.substack.com/@bob/", Some("bob".to_string())),
        ("https://not-substack.com/author/carol", None),
    ];
    for (url, expected) in cases {
        assert_eq!(handle_from_author_url(url), expected);
    }
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,19 @@
|
||||||
//!
|
//!
|
||||||
//! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/`
|
//! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/`
|
||||||
//! shape is stable.
|
//! shape is stable.
|
||||||
|
//!
|
||||||
|
//! ## Fallback
|
||||||
|
//!
|
||||||
|
//! `ytInitialPlayerResponse` is missing on EU-consent interstitials,
|
||||||
|
//! some live-stream pre-show pages, and age-gated videos. In those
|
||||||
|
//! cases we drop down to OG tags for `title`, `description`,
|
||||||
|
//! `thumbnail`, and `channel`, and return a `data_source:
|
||||||
|
//! "og_fallback"` payload so the caller can tell they got a degraded
|
||||||
|
//! shape (no view count, duration, captions).
|
||||||
|
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
use serde_json::{Value, json};
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
use super::ExtractorInfo;
|
use super::ExtractorInfo;
|
||||||
|
|
@ -19,7 +31,7 @@ use crate::error::FetchError;
|
||||||
pub const INFO: ExtractorInfo = ExtractorInfo {
|
pub const INFO: ExtractorInfo = ExtractorInfo {
|
||||||
name: "youtube_video",
|
name: "youtube_video",
|
||||||
label: "YouTube video",
|
label: "YouTube video",
|
||||||
description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs.",
|
description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs. Falls back to OG metadata on consent / age-gate pages.",
|
||||||
url_patterns: &[
|
url_patterns: &[
|
||||||
"https://www.youtube.com/watch?v={id}",
|
"https://www.youtube.com/watch?v={id}",
|
||||||
"https://youtu.be/{id}",
|
"https://youtu.be/{id}",
|
||||||
|
|
@ -49,12 +61,28 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
let player = extract_player_response(&resp.html).ok_or_else(|| {
|
if let Some(player) = extract_player_response(&resp.html) {
|
||||||
FetchError::BodyDecode(format!(
|
return Ok(build_player_payload(
|
||||||
"youtube_video: no ytInitialPlayerResponse on {canonical} (video may be private, region-blocked, or removed)"
|
&player, &resp.html, url, &canonical, &video_id,
|
||||||
))
|
));
|
||||||
})?;
|
}
|
||||||
|
|
||||||
|
// No player blob. Fall back to OG tags so the call still returns
|
||||||
|
// something useful for consent / age-gate pages.
|
||||||
|
Ok(build_og_fallback(&resp.html, url, &canonical, &video_id))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Player-blob path (rich payload)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn build_player_payload(
|
||||||
|
player: &Value,
|
||||||
|
html: &str,
|
||||||
|
url: &str,
|
||||||
|
canonical: &str,
|
||||||
|
video_id: &str,
|
||||||
|
) -> Value {
|
||||||
let video_details = player.get("videoDetails");
|
let video_details = player.get("videoDetails");
|
||||||
let microformat = player
|
let microformat = player
|
||||||
.get("microformat")
|
.get("microformat")
|
||||||
|
|
@ -73,7 +101,7 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
|
||||||
.cloned()
|
.cloned()
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
|
|
||||||
let caption_tracks = webclaw_core::youtube::extract_caption_tracks(&resp.html);
|
let caption_tracks = webclaw_core::youtube::extract_caption_tracks(html);
|
||||||
let captions: Vec<Value> = caption_tracks
|
let captions: Vec<Value> = caption_tracks
|
||||||
.iter()
|
.iter()
|
||||||
.map(|c| {
|
.map(|c| {
|
||||||
|
|
@ -85,9 +113,10 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
Ok(json!({
|
json!({
|
||||||
"url": url,
|
"url": url,
|
||||||
"canonical_url":canonical,
|
"canonical_url":canonical,
|
||||||
|
"data_source": "player_response",
|
||||||
"video_id": video_id,
|
"video_id": video_id,
|
||||||
"title": get_str(video_details, "title"),
|
"title": get_str(video_details, "title"),
|
||||||
"description": get_str(video_details, "shortDescription"),
|
"description": get_str(video_details, "shortDescription"),
|
||||||
|
|
@ -106,7 +135,46 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
|
||||||
"keywords": keywords,
|
"keywords": keywords,
|
||||||
"thumbnails": thumbnails,
|
"thumbnails": thumbnails,
|
||||||
"caption_tracks": captions,
|
"caption_tracks": captions,
|
||||||
}))
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// OG fallback path (degraded payload)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn build_og_fallback(html: &str, url: &str, canonical: &str, video_id: &str) -> Value {
|
||||||
|
let title = og(html, "title");
|
||||||
|
let description = og(html, "description");
|
||||||
|
let thumbnail = og(html, "image");
|
||||||
|
// YouTube sets `<meta name="channel_name" ...>` on some pages but
|
||||||
|
// OG-only pages reliably carry `og:video:tag` and the channel in
|
||||||
|
// `<link itemprop="name">`. We keep this lean: just what's stable.
|
||||||
|
let channel = meta_name(html, "author");
|
||||||
|
|
||||||
|
json!({
|
||||||
|
"url": url,
|
||||||
|
"canonical_url":canonical,
|
||||||
|
"data_source": "og_fallback",
|
||||||
|
"video_id": video_id,
|
||||||
|
"title": title,
|
||||||
|
"description": description,
|
||||||
|
"author": channel,
|
||||||
|
// OG path: these are null so the caller doesn't have to guess.
|
||||||
|
"channel_id": None::<String>,
|
||||||
|
"channel_url": None::<String>,
|
||||||
|
"view_count": None::<i64>,
|
||||||
|
"length_seconds": None::<i64>,
|
||||||
|
"is_live": None::<bool>,
|
||||||
|
"is_private": None::<bool>,
|
||||||
|
"is_unlisted": None::<bool>,
|
||||||
|
"allow_ratings":None::<bool>,
|
||||||
|
"category": None::<String>,
|
||||||
|
"upload_date": None::<String>,
|
||||||
|
"publish_date": None::<String>,
|
||||||
|
"keywords": Vec::<Value>::new(),
|
||||||
|
"thumbnails": thumbnail.as_ref().map(|t| vec![json!({"url": t})]).unwrap_or_default(),
|
||||||
|
"caption_tracks": Vec::<Value>::new(),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
@ -166,8 +234,6 @@ fn parse_video_id(url: &str) -> Option<String> {
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
fn extract_player_response(html: &str) -> Option<Value> {
|
fn extract_player_response(html: &str) -> Option<Value> {
|
||||||
use regex::Regex;
|
|
||||||
use std::sync::OnceLock;
|
|
||||||
// Same regex as webclaw_core::youtube. Duplicated here because
|
// Same regex as webclaw_core::youtube. Duplicated here because
|
||||||
// core's regex is module-private. Kept in lockstep; changes are
|
// core's regex is module-private. Kept in lockstep; changes are
|
||||||
// rare and we cover with tests in both places.
|
// rare and we cover with tests in both places.
|
||||||
|
|
@ -178,6 +244,36 @@ fn extract_player_response(html: &str) -> Option<Value> {
|
||||||
serde_json::from_str(json_str).ok()
|
serde_json::from_str(json_str).ok()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Meta-tag helpers (for OG fallback)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn og(html: &str, prop: &str) -> Option<String> {
|
||||||
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||||||
|
let re = RE.get_or_init(|| {
|
||||||
|
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||||
|
});
|
||||||
|
for c in re.captures_iter(html) {
|
||||||
|
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||||
|
return c.get(2).map(|m| m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
fn meta_name(html: &str, name: &str) -> Option<String> {
|
||||||
|
static RE: OnceLock<Regex> = OnceLock::new();
|
||||||
|
let re = RE.get_or_init(|| {
|
||||||
|
Regex::new(r#"(?i)<meta[^>]+name="([^"]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||||
|
});
|
||||||
|
for c in re.captures_iter(html) {
|
||||||
|
if c.get(1).is_some_and(|m| m.as_str() == name) {
|
||||||
|
return c.get(2).map(|m| m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
fn get_str(v: Option<&Value>, key: &str) -> Option<String> {
|
fn get_str(v: Option<&Value>, key: &str) -> Option<String> {
|
||||||
v.and_then(|x| x.get(key))
|
v.and_then(|x| x.get(key))
|
||||||
.and_then(|x| x.as_str().map(String::from))
|
.and_then(|x| x.as_str().map(String::from))
|
||||||
|
|
@ -252,4 +348,31 @@ var ytInitialPlayerResponse = {"videoDetails":{"videoId":"abc","title":"T","auth
|
||||||
let vd = v.get("videoDetails").unwrap();
|
let vd = v.get("videoDetails").unwrap();
|
||||||
assert_eq!(vd.get("title").unwrap().as_str(), Some("T"));
|
assert_eq!(vd.get("title").unwrap().as_str(), Some("T"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn og_fallback_extracts_basics_from_meta_tags() {
    // OG-only page (no player blob): basics populated, rich fields null.
    let page = r##"
<html><head>
<meta property="og:title" content="Example Video Title">
<meta property="og:description" content="A cool video description.">
<meta property="og:image" content="https://i.ytimg.com/vi/abc/maxresdefault.jpg">
<meta name="author" content="Example Channel">
</head></html>"##;
    let watch_url = "https://www.youtube.com/watch?v=abc";
    let out = build_og_fallback(page, watch_url, watch_url, "abc");
    assert_eq!(out["data_source"], "og_fallback");
    assert_eq!(out["title"], "Example Video Title");
    assert_eq!(out["description"], "A cool video description.");
    assert_eq!(out["author"], "Example Channel");
    assert_eq!(
        out["thumbnails"][0]["url"],
        "https://i.ytimg.com/vi/abc/maxresdefault.jpg"
    );
    assert!(out["view_count"].is_null());
    assert!(out["caption_tracks"].as_array().unwrap().is_empty());
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue