diff --git a/crates/webclaw-fetch/src/extractors/github_issue.rs b/crates/webclaw-fetch/src/extractors/github_issue.rs new file mode 100644 index 0000000..436faa9 --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/github_issue.rs @@ -0,0 +1,172 @@ +//! GitHub issue structured extractor. +//! +//! Mirror of `github_pr` but on `/issues/{number}`. Uses +//! `api.github.com/repos/{owner}/{repo}/issues/{number}`. Returns the +//! issue body + comment count + labels + milestone + author / +//! assignees. Full per-comment bodies would be another call; kept for +//! a follow-up. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "github_issue", + label: "GitHub issue", + description: "Returns issue metadata: title, body, state, author, labels, assignees, milestone, comment count.", + url_patterns: &["https://github.com/{owner}/{repo}/issues/{number}"], +}; + +pub fn matches(url: &str) -> bool { + let host = url + .split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or(""); + if host != "github.com" && host != "www.github.com" { + return false; + } + parse_issue(url).is_some() +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let (owner, repo, number) = parse_issue(url).ok_or_else(|| { + FetchError::Build(format!("github_issue: cannot parse issue URL '{url}'")) + })?; + + let api_url = format!("https://api.github.com/repos/{owner}/{repo}/issues/{number}"); + let resp = client.fetch(&api_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "github_issue: issue '{owner}/{repo}#{number}' not found" + ))); + } + if resp.status == 403 { + return Err(FetchError::Build( + "github_issue: rate limited (60/hour unauth). 
Set GITHUB_TOKEN for 5,000/hour.".into(), + )); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "github api returned status {}", + resp.status + ))); + } + + let issue: Issue = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("github issue parse: {e}")))?; + + // The same endpoint returns PRs too; reject if we got one so the caller + // uses /v1/scrape/github_pr instead of getting a half-shaped payload. + if issue.pull_request.is_some() { + return Err(FetchError::Build(format!( + "github_issue: '{owner}/{repo}#{number}' is a pull request, use /v1/scrape/github_pr" + ))); + } + + Ok(json!({ + "url": url, + "owner": owner, + "repo": repo, + "number": issue.number, + "title": issue.title, + "body": issue.body, + "state": issue.state, + "state_reason":issue.state_reason, + "author": issue.user.as_ref().and_then(|u| u.login.clone()), + "labels": issue.labels.iter().filter_map(|l| l.name.clone()).collect::>(), + "assignees": issue.assignees.iter().filter_map(|u| u.login.clone()).collect::>(), + "milestone": issue.milestone.as_ref().and_then(|m| m.title.clone()), + "comments": issue.comments, + "locked": issue.locked, + "created_at": issue.created_at, + "updated_at": issue.updated_at, + "closed_at": issue.closed_at, + "html_url": issue.html_url, + })) +} + +fn parse_issue(url: &str) -> Option<(String, String, u64)> { + let path = url.split("://").nth(1)?.split_once('/').map(|(_, p)| p)?; + let stripped = path.split(['?', '#']).next()?.trim_end_matches('/'); + let segs: Vec<&str> = stripped.split('/').filter(|s| !s.is_empty()).collect(); + if segs.len() < 4 || segs[2] != "issues" { + return None; + } + let number: u64 = segs[3].parse().ok()?; + Some((segs[0].to_string(), segs[1].to_string(), number)) +} + +// --------------------------------------------------------------------------- +// GitHub issue API types +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] 
+struct Issue {
+    number: Option<u64>,
+    title: Option<String>,
+    body: Option<String>,
+    state: Option<String>,
+    state_reason: Option<String>,
+    locked: Option<bool>,
+    comments: Option<u64>,
+    created_at: Option<String>,
+    updated_at: Option<String>,
+    closed_at: Option<String>,
+    html_url: Option<String>,
+    user: Option<UserRef>,
+    #[serde(default)]
+    labels: Vec<LabelRef>,
+    #[serde(default)]
+    assignees: Vec<UserRef>,
+    milestone: Option<Milestone>,
+    /// Present when this "issue" is actually a pull request. The REST
+    /// API overloads the issues endpoint for PRs.
+    pull_request: Option<Value>,
+}
+
+#[derive(Deserialize)]
+struct UserRef {
+    login: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct LabelRef {
+    name: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct Milestone {
+    title: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_issue_urls() {
+        assert!(matches("https://github.com/rust-lang/rust/issues/100"));
+        assert!(matches("https://github.com/rust-lang/rust/issues/100/"));
+        assert!(!matches("https://github.com/rust-lang/rust"));
+        assert!(!matches("https://github.com/rust-lang/rust/pull/100"));
+        assert!(!matches("https://github.com/rust-lang/rust/issues"));
+    }
+
+    #[test]
+    fn parse_issue_extracts_owner_repo_number() {
+        assert_eq!(
+            parse_issue("https://github.com/rust-lang/rust/issues/100"),
+            Some(("rust-lang".into(), "rust".into(), 100))
+        );
+        assert_eq!(
+            parse_issue("https://github.com/rust-lang/rust/issues/100/?foo=bar"),
+            Some(("rust-lang".into(), "rust".into(), 100))
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/mod.rs b/crates/webclaw-fetch/src/extractors/mod.rs
index 5cf0993..510adc0 100644
--- a/crates/webclaw-fetch/src/extractors/mod.rs
+++ b/crates/webclaw-fetch/src/extractors/mod.rs
@@ -21,6 +21,7 @@ pub mod dev_to;
 pub mod docker_hub;
 pub mod ebay_listing;
 pub mod ecommerce_product;
+pub mod github_issue;
 pub mod github_pr;
 pub mod github_release;
 pub mod github_repo;
@@ -33,9 +34,13 @@ pub mod linkedin_post;
 pub mod npm;
 pub mod pypi;
 pub mod reddit;
+pub mod shopify_collection;
 pub mod shopify_product;
 pub mod stackoverflow;
+pub mod substack_post;
 pub mod trustpilot_reviews;
+pub mod woocommerce_product;
+pub mod youtube_video;
 
 use serde::Serialize;
 use serde_json::Value;
@@ -65,6 +70,7 @@ pub fn list() -> Vec<ExtractorInfo> {
         hackernews::INFO,
         github_repo::INFO,
         github_pr::INFO,
+        github_issue::INFO,
         github_release::INFO,
         pypi::INFO,
         npm::INFO,
@@ -75,11 +81,15 @@ pub fn list() -> Vec<ExtractorInfo> {
         docker_hub::INFO,
         dev_to::INFO,
         stackoverflow::INFO,
+        substack_post::INFO,
+        youtube_video::INFO,
         linkedin_post::INFO,
         instagram_post::INFO,
         instagram_profile::INFO,
         shopify_product::INFO,
+        shopify_collection::INFO,
         ecommerce_product::INFO,
+        woocommerce_product::INFO,
         amazon_product::INFO,
         ebay_listing::INFO,
         trustpilot_reviews::INFO,
@@ -131,6 +141,13 @@ pub async fn dispatch_by_url(
             .map(|v| (github_pr::INFO.name, v)),
         );
     }
+    if github_issue::matches(url) {
+        return Some(
+            github_issue::extract(client, url)
+                .await
+                .map(|v| (github_issue::INFO.name, v)),
+        );
+    }
     if github_release::matches(url) {
         return Some(
             github_release::extract(client, url)
@@ -233,7 +250,15 @@ pub async fn dispatch_by_url(
             .map(|v| (trustpilot_reviews::INFO.name, v)),
         );
     }
-    // NOTE: shopify_product and ecommerce_product are intentionally NOT
+    if youtube_video::matches(url) {
+        return Some(
+            youtube_video::extract(client, url)
+                .await
+                .map(|v| (youtube_video::INFO.name, v)),
+        );
+    }
+    // NOTE: shopify_product, shopify_collection, ecommerce_product,
+    // woocommerce_product, and substack_post are intentionally NOT
     // in auto-dispatch. Their `matches()` functions are permissive
     // (any URL with `/products/`, `/product/`, `/p/`, etc.) and
and // claiming those generically would steal URLs from the default @@ -282,6 +307,12 @@ pub async fn dispatch_by_name( }) .await } + n if n == github_issue::INFO.name => { + run_or_mismatch(github_issue::matches(url), n, url, || { + github_issue::extract(client, url) + }) + .await + } n if n == github_release::INFO.name => { run_or_mismatch(github_release::matches(url), n, url, || { github_release::extract(client, url) @@ -375,6 +406,30 @@ pub async fn dispatch_by_name( }) .await } + n if n == youtube_video::INFO.name => { + run_or_mismatch(youtube_video::matches(url), n, url, || { + youtube_video::extract(client, url) + }) + .await + } + n if n == substack_post::INFO.name => { + run_or_mismatch(substack_post::matches(url), n, url, || { + substack_post::extract(client, url) + }) + .await + } + n if n == shopify_collection::INFO.name => { + run_or_mismatch(shopify_collection::matches(url), n, url, || { + shopify_collection::extract(client, url) + }) + .await + } + n if n == woocommerce_product::INFO.name => { + run_or_mismatch(woocommerce_product::matches(url), n, url, || { + woocommerce_product::extract(client, url) + }) + .await + } _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())), } } diff --git a/crates/webclaw-fetch/src/extractors/shopify_collection.rs b/crates/webclaw-fetch/src/extractors/shopify_collection.rs new file mode 100644 index 0000000..095f7dd --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/shopify_collection.rs @@ -0,0 +1,242 @@ +//! Shopify collection structured extractor. +//! +//! Every Shopify store exposes `/collections/{handle}.json` and +//! `/collections/{handle}/products.json` on the public surface. This +//! extractor hits `.json` (collection metadata) and falls through to +//! `/products.json` for the first page of products. Same caveat as +//! `shopify_product`: stores with Cloudflare in front of the shop +//! will 403 the public path. +//! +//! Explicit-call only (like `shopify_product`). 
`/collections/{slug}` +//! is a URL shape used by non-Shopify stores too, so auto-dispatch +//! would claim too many URLs. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "shopify_collection", + label: "Shopify collection", + description: "Returns collection metadata + first page of products (handle, title, vendor, price, available) on ANY Shopify store via /collections/{handle}.json + /products.json.", + url_patterns: &[ + "https://{shop}/collections/{handle}", + "https://{shop}.myshopify.com/collections/{handle}", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host.is_empty() || NON_SHOPIFY_HOSTS.iter().any(|h| host.ends_with(h)) { + return false; + } + url.contains("/collections/") && !url.ends_with("/collections/") +} + +const NON_SHOPIFY_HOSTS: &[&str] = &[ + "amazon.com", + "amazon.co.uk", + "amazon.de", + "ebay.com", + "etsy.com", + "walmart.com", + "target.com", + "aliexpress.com", + "huggingface.co", // has /collections/ for models + "github.com", +]; + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let (coll_meta_url, coll_products_url) = build_json_urls(url); + + // Step 1: collection metadata. Shopify returns 200 on missing + // collections sometimes; check "collection" key below. + let meta_resp = client.fetch(&coll_meta_url).await?; + if meta_resp.status == 404 { + return Err(FetchError::Build(format!( + "shopify_collection: '{url}' not found" + ))); + } + if meta_resp.status == 403 { + return Err(FetchError::Build(format!( + "shopify_collection: {coll_meta_url} returned 403. The store has antibot in front of the .json endpoint. Use /v1/scrape/ecommerce_product or api.webclaw.io for this store." 
+ ))); + } + if meta_resp.status != 200 { + return Err(FetchError::Build(format!( + "shopify returned status {} for {coll_meta_url}", + meta_resp.status + ))); + } + + let meta: MetaWrapper = serde_json::from_str(&meta_resp.html).map_err(|e| { + FetchError::BodyDecode(format!( + "shopify_collection: '{url}' didn't return Shopify JSON, likely not a Shopify store ({e})" + )) + })?; + + // Step 2: first page of products for this collection. + let products = match client.fetch(&coll_products_url).await { + Ok(r) if r.status == 200 => serde_json::from_str::(&r.html) + .ok() + .map(|pw| pw.products) + .unwrap_or_default(), + _ => Vec::new(), + }; + + let product_summaries: Vec = products + .iter() + .map(|p| { + let first_variant = p.variants.first(); + json!({ + "id": p.id, + "handle": p.handle, + "title": p.title, + "vendor": p.vendor, + "product_type": p.product_type, + "price": first_variant.and_then(|v| v.price.clone()), + "compare_at_price":first_variant.and_then(|v| v.compare_at_price.clone()), + "available": p.variants.iter().any(|v| v.available.unwrap_or(false)), + "variant_count": p.variants.len(), + "image": p.images.first().and_then(|i| i.src.clone()), + "created_at": p.created_at, + "updated_at": p.updated_at, + }) + }) + .collect(); + + let c = meta.collection; + Ok(json!({ + "url": url, + "meta_json_url": coll_meta_url, + "products_json_url": coll_products_url, + "collection_id": c.id, + "handle": c.handle, + "title": c.title, + "description_html": c.body_html, + "published_at": c.published_at, + "updated_at": c.updated_at, + "sort_order": c.sort_order, + "products_in_page": product_summaries.len(), + "products": product_summaries, + })) +} + +// --------------------------------------------------------------------------- +// URL helpers +// --------------------------------------------------------------------------- + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +/// Build 
`(collection.json, collection/products.json)` from a user URL. +fn build_json_urls(url: &str) -> (String, String) { + let (path_part, _query_part) = match url.split_once('?') { + Some((a, b)) => (a, Some(b)), + None => (url, None), + }; + let clean = path_part.trim_end_matches('/').trim_end_matches(".json"); + ( + format!("{clean}.json"), + format!("{clean}/products.json?limit=50"), + ) +} + +// --------------------------------------------------------------------------- +// Shopify collection + product JSON shapes (subsets) +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct MetaWrapper { + collection: Collection, +} + +#[derive(Deserialize)] +struct Collection { + id: Option, + handle: Option, + title: Option, + body_html: Option, + published_at: Option, + updated_at: Option, + sort_order: Option, +} + +#[derive(Deserialize)] +struct ProductsWrapper { + #[serde(default)] + products: Vec, +} + +#[derive(Deserialize)] +struct ProductSummary { + id: Option, + handle: Option, + title: Option, + vendor: Option, + product_type: Option, + created_at: Option, + updated_at: Option, + #[serde(default)] + variants: Vec, + #[serde(default)] + images: Vec, +} + +#[derive(Deserialize)] +struct VariantSummary { + price: Option, + compare_at_price: Option, + available: Option, +} + +#[derive(Deserialize)] +struct ImageSummary { + src: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_shopify_collection_urls() { + assert!(matches("https://www.allbirds.com/collections/mens")); + assert!(matches( + "https://shop.example.com/collections/new-arrivals?page=2" + )); + } + + #[test] + fn rejects_non_shopify() { + assert!(!matches("https://github.com/collections/foo")); + assert!(!matches("https://huggingface.co/collections/foo")); + assert!(!matches("https://example.com/")); + assert!(!matches("https://example.com/collections/")); + } + + #[test] + fn build_json_urls_derives_both_paths() { + let 
(meta, products) = build_json_urls("https://shop.example.com/collections/mens");
+        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
+        assert_eq!(
+            products,
+            "https://shop.example.com/collections/mens/products.json?limit=50"
+        );
+    }
+
+    #[test]
+    fn build_json_urls_handles_trailing_slash() {
+        let (meta, _) = build_json_urls("https://shop.example.com/collections/mens/");
+        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/substack_post.rs b/crates/webclaw-fetch/src/extractors/substack_post.rs
new file mode 100644
index 0000000..03ccbe8
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/substack_post.rs
@@ -0,0 +1,213 @@
+//! Substack post extractor.
+//!
+//! Every Substack publication exposes `/api/v1/posts/{slug}` that
+//! returns the full post as JSON: body HTML, cover image, author,
+//! publication info, reactions, paywall state. No auth on public
+//! posts.
+//!
+//! Works on both `*.substack.com` subdomains and custom domains
+//! (e.g. `simonwillison.net` uses Substack too). Detection is
+//! "URL has `/p/{slug}`" because that's the canonical Substack post
+//! path. Explicit-call only because the `/p/{slug}` URL shape is
+//! used by non-Substack sites too.
+
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "substack_post",
+    label: "Substack post",
+    description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.",
+    url_patterns: &[
+        "https://{pub}.substack.com/p/{slug}",
+        "https://{custom-domain}/p/{slug}",
+    ],
+};
+
+/// Permissive: any http(s) URL with `/p/` in the path.
+pub fn matches(url: &str) -> bool {
+    if !(url.starts_with("http://") || url.starts_with("https://")) {
+        return false;
+    }
+    url.contains("/p/")
+}
+
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let slug = parse_slug(url).ok_or_else(|| {
+        FetchError::Build(format!("substack_post: cannot parse slug from '{url}'"))
+    })?;
+    let host = host_of(url);
+    if host.is_empty() {
+        return Err(FetchError::Build(format!(
+            "substack_post: empty host in '{url}'"
+        )));
+    }
+    let scheme = if url.starts_with("http://") {
+        "http"
+    } else {
+        "https"
+    };
+    let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
+    let resp = client.fetch(&api_url).await?;
+    if resp.status == 404 {
+        return Err(FetchError::Build(format!(
+            "substack_post: '{slug}' not found on {host} (got 404). \
+             If the publication isn't actually on Substack, use /v1/scrape instead."
+        )));
+    }
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "substack returned status {} for {api_url}",
+            resp.status
+        )));
+    }
+
+    let p: Post = serde_json::from_str(&resp.html).map_err(|e| {
+        FetchError::BodyDecode(format!(
+            "substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})"
+        ))
+    })?;
+
+    Ok(json!({
+        "url": url,
+        "api_url": api_url,
+        "id": p.id,
+        "type": p.r#type,
+        "slug": p.slug,
+        "title": p.title,
+        "subtitle": p.subtitle,
+        "description": p.description,
+        "canonical_url": p.canonical_url,
+        "post_date": p.post_date,
+        "updated_at": p.updated_at,
+        "audience": p.audience,
+        "has_paywall": matches!(p.audience.as_deref(), Some("only_paid") | Some("founding")),
+        "is_free_preview": p.is_free_preview,
+        "cover_image": p.cover_image,
+        "word_count": p.wordcount,
+        "reactions": p.reactions,
+        "comment_count": p.comment_count,
+        "body_html": p.body_html,
+        "body_text": p.truncated_body_text.or(p.body_text),
+        "publication": json!({
+            "id": p.publication.as_ref().and_then(|pub_| pub_.id),
+            "name": p.publication.as_ref().and_then(|pub_| pub_.name.clone()),
+            "subdomain": p.publication.as_ref().and_then(|pub_| pub_.subdomain.clone()),
+            "custom_domain": p.publication.as_ref().and_then(|pub_| pub_.custom_domain.clone()),
+        }),
+        "authors": p.published_bylines.iter().map(|a| json!({
+            "id": a.id,
+            "name": a.name,
+            "handle": a.handle,
+            "photo": a.photo_url,
+        })).collect::<Vec<_>>(),
+    }))
+}
+
+// ---------------------------------------------------------------------------
+// URL helpers
+// ---------------------------------------------------------------------------
+
+fn host_of(url: &str) -> &str {
+    url.split("://")
+        .nth(1)
+        .unwrap_or(url)
+        .split('/')
+        .next()
+        .unwrap_or("")
+}
+
+/// Slug is the first path segment after `/p/`, stripped of query,
+/// fragment, and trailing slash.
+fn parse_slug(url: &str) -> Option<String> {
+    let after = url.split("/p/").nth(1)?;
+    let stripped = after
+        .split(['?', '#'])
+        .next()?
+        .trim_end_matches('/')
+        .split('/')
+        .next()
+        .unwrap_or("");
+    if stripped.is_empty() {
+        None
+    } else {
+        Some(stripped.to_string())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Substack API types (subset)
+// ---------------------------------------------------------------------------
+
+#[derive(Deserialize)]
+struct Post {
+    id: Option<u64>,
+    r#type: Option<String>,
+    slug: Option<String>,
+    title: Option<String>,
+    subtitle: Option<String>,
+    description: Option<String>,
+    canonical_url: Option<String>,
+    post_date: Option<String>,
+    updated_at: Option<String>,
+    audience: Option<String>,
+    is_free_preview: Option<bool>,
+    cover_image: Option<String>,
+    wordcount: Option<u64>,
+    reactions: Option<Value>,
+    comment_count: Option<u64>,
+    body_html: Option<String>,
+    body_text: Option<String>,
+    truncated_body_text: Option<String>,
+    publication: Option<Publication>,
+    #[serde(default, rename = "publishedBylines")]
+    published_bylines: Vec<Byline>,
+}
+
+#[derive(Deserialize)]
+struct Publication {
+    id: Option<u64>,
+    name: Option<String>,
+    subdomain: Option<String>,
+    custom_domain: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct Byline {
+    id: Option<u64>,
+    name: Option<String>,
+    handle: Option<String>,
+    photo_url: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_post_urls() {
+        assert!(matches(
+            "https://stratechery.substack.com/p/the-tech-letter"
+        ));
+        assert!(matches("https://simonwillison.net/p/2024-08-01-something"));
+        assert!(!matches("https://example.com/"));
+        assert!(!matches("ftp://example.com/p/foo"));
+    }
+
+    #[test]
+    fn parse_slug_strips_query_and_trailing_slash() {
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post"),
+            Some("my-post".into())
+        );
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post/"),
+            Some("my-post".into())
+        );
+        assert_eq!(
+            parse_slug("https://example.substack.com/p/my-post?ref=123"),
+            Some("my-post".into())
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/woocommerce_product.rs b/crates/webclaw-fetch/src/extractors/woocommerce_product.rs
new file mode 100644
index
0000000..73f1109 --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/woocommerce_product.rs @@ -0,0 +1,237 @@ +//! WooCommerce product structured extractor. +//! +//! Targets WooCommerce's Store API: `/wp-json/wc/store/v1/products?slug={slug}`. +//! About 30-50% of WooCommerce stores expose this endpoint publicly +//! (it's on by default, but common security plugins disable it). +//! When it's off, the server returns 404 at /wp-json. We surface a +//! clean error and point callers at `/v1/scrape/ecommerce_product` +//! which works on any store with Schema.org JSON-LD. +//! +//! Explicit-call only. `/product/{slug}` is the default permalink for +//! WooCommerce but custom stores use every variation imaginable, so +//! auto-dispatch is unreliable. + +use serde::Deserialize; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "woocommerce_product", + label: "WooCommerce product", + description: "Returns product via the WooCommerce Store REST API (requires the /wp-json/wc/store endpoint to be enabled on the target store).", + url_patterns: &[ + "https://{shop}/product/{slug}", + "https://{shop}/shop/{slug}", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if host.is_empty() { + return false; + } + // Permissive: WooCommerce stores use custom domains + custom + // permalinks. The extractor's API probe is what confirms it's + // really WooCommerce. 
+ url.contains("/product/") + || url.contains("/shop/") + || url.contains("/producto/") // common es locale + || url.contains("/produit/") // common fr locale +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let slug = parse_slug(url).ok_or_else(|| { + FetchError::Build(format!( + "woocommerce_product: cannot parse slug from '{url}'" + )) + })?; + let host = host_of(url); + if host.is_empty() { + return Err(FetchError::Build(format!( + "woocommerce_product: empty host in '{url}'" + ))); + } + let scheme = if url.starts_with("http://") { + "http" + } else { + "https" + }; + let api_url = format!("{scheme}://{host}/wp-json/wc/store/v1/products?slug={slug}&per_page=1"); + let resp = client.fetch(&api_url).await?; + if resp.status == 404 { + return Err(FetchError::Build(format!( + "woocommerce_product: {host} does not expose /wp-json/wc/store (404). \ + Use /v1/scrape/ecommerce_product for JSON-LD fallback." + ))); + } + if resp.status == 401 || resp.status == 403 { + return Err(FetchError::Build(format!( + "woocommerce_product: {host} requires auth for /wp-json/wc/store ({}). 
\ + Use /v1/scrape/ecommerce_product for the public JSON-LD fallback.", + resp.status + ))); + } + if resp.status != 200 { + return Err(FetchError::Build(format!( + "woocommerce api returned status {} for {api_url}", + resp.status + ))); + } + + let products: Vec = serde_json::from_str(&resp.html) + .map_err(|e| FetchError::BodyDecode(format!("woocommerce parse: {e}")))?; + let p = products.into_iter().next().ok_or_else(|| { + FetchError::Build(format!( + "woocommerce_product: no product found for slug '{slug}' on {host}" + )) + })?; + + let images: Vec = p + .images + .iter() + .map(|i| json!({"src": i.src, "thumbnail": i.thumbnail, "alt": i.alt})) + .collect(); + let variations_count = p.variations.as_ref().map(|v| v.len()).unwrap_or(0); + + Ok(json!({ + "url": url, + "api_url": api_url, + "product_id": p.id, + "name": p.name, + "slug": p.slug, + "sku": p.sku, + "permalink": p.permalink, + "on_sale": p.on_sale, + "in_stock": p.is_in_stock, + "is_purchasable": p.is_purchasable, + "price": p.prices.as_ref().and_then(|pr| pr.price.clone()), + "regular_price": p.prices.as_ref().and_then(|pr| pr.regular_price.clone()), + "sale_price": p.prices.as_ref().and_then(|pr| pr.sale_price.clone()), + "currency": p.prices.as_ref().and_then(|pr| pr.currency_code.clone()), + "currency_minor": p.prices.as_ref().and_then(|pr| pr.currency_minor_unit), + "price_range": p.prices.as_ref().and_then(|pr| pr.price_range.clone()), + "average_rating": p.average_rating, + "review_count": p.review_count, + "description": p.description, + "short_description": p.short_description, + "categories": p.categories.iter().filter_map(|c| c.name.clone()).collect::>(), + "tags": p.tags.iter().filter_map(|t| t.name.clone()).collect::>(), + "variation_count": variations_count, + "image_count": images.len(), + "images": images, + })) +} + +// --------------------------------------------------------------------------- +// URL helpers +// 
--------------------------------------------------------------------------- + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +/// Extract the product slug from common WooCommerce permalinks. +fn parse_slug(url: &str) -> Option { + for needle in ["/product/", "/shop/", "/producto/", "/produit/"] { + if let Some(after) = url.split(needle).nth(1) { + let stripped = after + .split(['?', '#']) + .next()? + .trim_end_matches('/') + .split('/') + .next() + .unwrap_or(""); + if !stripped.is_empty() { + return Some(stripped.to_string()); + } + } + } + None +} + +// --------------------------------------------------------------------------- +// Store API types (subset of the full response) +// --------------------------------------------------------------------------- + +#[derive(Deserialize)] +struct Product { + id: Option, + name: Option, + slug: Option, + sku: Option, + permalink: Option, + description: Option, + short_description: Option, + on_sale: Option, + is_in_stock: Option, + is_purchasable: Option, + average_rating: Option, // string or number + review_count: Option, + prices: Option, + #[serde(default)] + categories: Vec, + #[serde(default)] + tags: Vec, + #[serde(default)] + images: Vec, + variations: Option>, +} + +#[derive(Deserialize)] +struct Prices { + price: Option, + regular_price: Option, + sale_price: Option, + currency_code: Option, + currency_minor_unit: Option, + price_range: Option, +} + +#[derive(Deserialize)] +struct Term { + name: Option, +} + +#[derive(Deserialize)] +struct Img { + src: Option, + thumbnail: Option, + alt: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_common_permalinks() { + assert!(matches("https://shop.example.com/product/cool-widget")); + assert!(matches("https://shop.example.com/shop/cool-widget")); + assert!(matches("https://tienda.example.com/producto/cosa")); + 
+        assert!(matches("https://boutique.example.com/produit/chose"));
+    }
+
+    #[test]
+    fn parse_slug_handles_locale_and_suffix() {
+        assert_eq!(
+            parse_slug("https://shop.example.com/product/cool-widget"),
+            Some("cool-widget".into())
+        );
+        assert_eq!(
+            parse_slug("https://shop.example.com/product/cool-widget/?attr=red"),
+            Some("cool-widget".into())
+        );
+        assert_eq!(
+            parse_slug("https://tienda.example.com/producto/cosa/"),
+            Some("cosa".into())
+        );
+    }
+}
diff --git a/crates/webclaw-fetch/src/extractors/youtube_video.rs b/crates/webclaw-fetch/src/extractors/youtube_video.rs
new file mode 100644
index 0000000..c37230a
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/youtube_video.rs
@@ -0,0 +1,255 @@
+//! YouTube video structured extractor.
+//!
+//! YouTube embeds the full player configuration in a
+//! `ytInitialPlayerResponse` JavaScript assignment at the top of
+//! every `/watch`, `/shorts`, and `youtu.be` HTML page. We reuse the
+//! core crate's already-proven regex + parse to surface typed JSON
+//! from it: video id, title, author + channel id, view count,
+//! duration, upload date, keywords, thumbnails, caption-track URLs.
+//!
+//! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/`
+//! shape is stable.
+
+use serde_json::{Value, json};
+
+use super::ExtractorInfo;
+use crate::client::FetchClient;
+use crate::error::FetchError;
+
+pub const INFO: ExtractorInfo = ExtractorInfo {
+    name: "youtube_video",
+    label: "YouTube video",
+    description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs.",
+    url_patterns: &[
+        "https://www.youtube.com/watch?v={id}",
+        "https://youtu.be/{id}",
+        "https://www.youtube.com/shorts/{id}",
+    ],
+};
+
+pub fn matches(url: &str) -> bool {
+    webclaw_core::youtube::is_youtube_url(url)
+        || url.contains("youtube.com/shorts/")
+        || url.contains("youtube-nocookie.com/embed/")
+}
+
+pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
+    let video_id = parse_video_id(url).ok_or_else(|| {
+        FetchError::Build(format!("youtube_video: cannot parse video id from '{url}'"))
+    })?;
+
+    // Always fetch the canonical /watch URL. /shorts/ and youtu.be
+    // sometimes serve a thinner page without the player blob.
+    let canonical = format!("https://www.youtube.com/watch?v={video_id}");
+    let resp = client.fetch(&canonical).await?;
+    if resp.status != 200 {
+        return Err(FetchError::Build(format!(
+            "youtube returned status {} for {canonical}",
+            resp.status
+        )));
+    }
+
+    let player = extract_player_response(&resp.html).ok_or_else(|| {
+        FetchError::BodyDecode(format!(
+            "youtube_video: no ytInitialPlayerResponse on {canonical} (video may be private, region-blocked, or removed)"
+        ))
+    })?;
+
+    let video_details = player.get("videoDetails");
+    let microformat = player
+        .get("microformat")
+        .and_then(|m| m.get("playerMicroformatRenderer"));
+
+    let thumbnails: Vec<Value> = video_details
+        .and_then(|vd| vd.get("thumbnail"))
+        .and_then(|t| t.get("thumbnails"))
+        .and_then(|t| t.as_array())
+        .cloned()
+        .unwrap_or_default();
+
+    let keywords: Vec<Value> = video_details
+        .and_then(|vd| vd.get("keywords"))
+        .and_then(|k| k.as_array())
+        .cloned()
+        .unwrap_or_default();
+
+    let caption_tracks = webclaw_core::youtube::extract_caption_tracks(&resp.html);
+    let captions: Vec<Value> = caption_tracks
+        .iter()
+        .map(|c| {
+            json!({
+                "url": c.url,
+                "lang": c.lang,
+                "name": c.name,
+            })
+        })
+        .collect();
+
+    Ok(json!({
+        "url": url,
+        "canonical_url": canonical,
+        "video_id": video_id,
+        "title": get_str(video_details, "title"),
+        "description": get_str(video_details, "shortDescription"),
+        "author": get_str(video_details, "author"),
+        "channel_id": get_str(video_details, "channelId"),
+        "channel_url": get_str(microformat, "ownerProfileUrl"),
+        "view_count": get_int(video_details, "viewCount"),
+        "length_seconds": get_int(video_details, "lengthSeconds"),
+        "is_live": video_details.and_then(|vd| vd.get("isLiveContent")).and_then(|v| v.as_bool()),
+        "is_private": video_details.and_then(|vd| vd.get("isPrivate")).and_then(|v| v.as_bool()),
+        "is_unlisted": microformat.and_then(|m| m.get("isUnlisted")).and_then(|v| v.as_bool()),
+        "allow_ratings": video_details.and_then(|vd| vd.get("allowRatings")).and_then(|v| v.as_bool()),
+        "category": get_str(microformat, "category"),
+        "upload_date": get_str(microformat, "uploadDate"),
+        "publish_date": get_str(microformat, "publishDate"),
+        "keywords": keywords,
+        "thumbnails": thumbnails,
+        "caption_tracks": captions,
+    }))
+}
+
+// ---------------------------------------------------------------------------
+// URL helpers
+// ---------------------------------------------------------------------------
+
+fn parse_video_id(url: &str) -> Option<String> {
+    // youtu.be/{id}
+    if let Some(after) = url.split("youtu.be/").nth(1) {
+        let id = after
+            .split(['?', '#', '/'])
+            .next()
+            .unwrap_or("")
+            .trim_end_matches('/');
+        if !id.is_empty() {
+            return Some(id.to_string());
+        }
+    }
+    // youtube.com/shorts/{id}
+    if let Some(after) = url.split("youtube.com/shorts/").nth(1) {
+        let id = after
+            .split(['?', '#', '/'])
+            .next()
+            .unwrap_or("")
+            .trim_end_matches('/');
+        if !id.is_empty() {
+            return Some(id.to_string());
+        }
+    }
+    // youtube-nocookie.com/embed/{id}
+    if let Some(after) = url.split("/embed/").nth(1) {
+        let id = after
+            .split(['?', '#', '/'])
+            .next()
+            .unwrap_or("")
+            .trim_end_matches('/');
+        if !id.is_empty() {
+            return Some(id.to_string());
+        }
+    }
+    // youtube.com/watch?v={id} (also matches youtube.com/watch?foo=bar&v={id})
+    if let Some(q) = url.split_once('?').map(|(_, q)| q)
+        && let Some(id) = q
+            .split('&')
+            .find_map(|p| p.strip_prefix("v=").map(|v| v.to_string()))
+    {
+        let id = id.split(['#', '/']).next().unwrap_or(&id).to_string();
+        if !id.is_empty() {
+            return Some(id);
+        }
+    }
+    None
+}
+
+// ---------------------------------------------------------------------------
+// Player-response parsing
+// ---------------------------------------------------------------------------
+
+fn extract_player_response(html: &str) -> Option<Value> {
+    use regex::Regex;
+    use std::sync::OnceLock;
+    // Same regex as webclaw_core::youtube. Duplicated here because
+    // core's regex is module-private. Kept in lockstep; changes are
+    // rare and we cover with tests in both places.
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re = RE
+        .get_or_init(|| Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap());
+    let json_str = re.captures(html)?.get(1)?.as_str();
+    serde_json::from_str(json_str).ok()
+}
+
+fn get_str(v: Option<&Value>, key: &str) -> Option<String> {
+    v.and_then(|x| x.get(key))
+        .and_then(|x| x.as_str().map(String::from))
+}
+
+fn get_int(v: Option<&Value>, key: &str) -> Option<i64> {
+    v.and_then(|x| x.get(key)).and_then(|x| {
+        x.as_i64()
+            .or_else(|| x.as_str().and_then(|s| s.parse::<i64>().ok()))
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_watch_urls() {
+        assert!(matches("https://www.youtube.com/watch?v=dQw4w9WgXcQ"));
+        assert!(matches("https://youtu.be/dQw4w9WgXcQ"));
+        assert!(matches("https://www.youtube.com/shorts/abc123"));
+        assert!(matches(
+            "https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ"
+        ));
+    }
+
+    #[test]
+    fn rejects_non_video_urls() {
+        assert!(!matches("https://www.youtube.com/"));
+        assert!(!matches("https://www.youtube.com/channel/abc"));
+        assert!(!matches("https://example.com/watch?v=abc"));
+    }
+
+    #[test]
+    fn parse_video_id_from_each_shape() {
+        assert_eq!(
+            parse_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://www.youtube.com/watch?feature=share&v=dQw4w9WgXcQ"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://youtu.be/dQw4w9WgXcQ"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://youtu.be/dQw4w9WgXcQ?t=30"),
+            Some("dQw4w9WgXcQ".into())
+        );
+        assert_eq!(
+            parse_video_id("https://www.youtube.com/shorts/abc123"),
+            Some("abc123".into())
+        );
+    }
+
+    #[test]
+    fn
extract_player_response_happy_path() { + let html = r#" + + + +"#; + let v = extract_player_response(html).unwrap(); + let vd = v.get("videoDetails").unwrap(); + assert_eq!(vd.get("title").unwrap().as_str(), Some("T")); + } +}