feat(extractors): wave 6a, 5 easy verticals (27 total)

Adds 5 structured extractors that hit public APIs with stable shapes:

- github_issue: /repos/{o}/{r}/issues/{n} (rejects PRs, points to github_pr)
- shopify_collection: /collections/{handle}.json + products.json
- woocommerce_product: /wp-json/wc/store/v1/products?slug={slug}
- substack_post: /api/v1/posts/{slug} (works on custom domains too)
- youtube_video: ytInitialPlayerResponse blob from /watch HTML

Auto-dispatched: github_issue, youtube_video (unique hosts and stable
URL shapes). Explicit-call: shopify_collection, woocommerce_product,
substack_post (URL shapes overlap with non-target sites).

Tests: 82 total passing in webclaw-fetch (12 new), clippy clean.
This commit is contained in:
Valerio 2026-04-22 16:33:35 +02:00
parent d8c9274a9c
commit 8cc727c2f2
6 changed files with 1175 additions and 1 deletions

View file

@ -0,0 +1,172 @@
//! GitHub issue structured extractor.
//!
//! Mirror of `github_pr` but on `/issues/{number}`. Uses
//! `api.github.com/repos/{owner}/{repo}/issues/{number}`. Returns the
//! issue body + comment count + labels + milestone + author /
//! assignees. Full per-comment bodies would be another call; kept for
//! a follow-up.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor: the vertical name used by
/// `/v1/scrape/{name}`, a human-readable label, and the URL shape it
/// handles (shown to API consumers in the extractor listing).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "github_issue",
    label: "GitHub issue",
    description: "Returns issue metadata: title, body, state, author, labels, assignees, milestone, comment count.",
    url_patterns: &["https://github.com/{owner}/{repo}/issues/{number}"],
};
/// Returns `true` when `url` points at a GitHub issue page.
///
/// Two conditions: the host must be exactly `github.com` (optionally
/// with the `www.` prefix), and the path must parse as
/// `/{owner}/{repo}/issues/{number}` (see [`parse_issue`]).
pub fn matches(url: &str) -> bool {
    let without_scheme = url.split("://").nth(1).unwrap_or(url);
    let host = without_scheme.split('/').next().unwrap_or("");
    let on_github = host == "github.com" || host == "www.github.com";
    on_github && parse_issue(url).is_some()
}
/// Fetch issue `{owner}/{repo}#{number}` from the GitHub REST API and
/// return a flat JSON summary of it.
///
/// # Errors
/// - `FetchError::Build` when the URL does not parse as an issue URL,
///   the issue does not exist (404), the unauthenticated rate limit is
///   hit (403), the API returns any other non-200 status, or the
///   number resolves to a pull request (callers should use the
///   `github_pr` vertical instead).
/// - `FetchError::BodyDecode` when the response body is not the
///   expected issue JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, repo, number) = parse_issue(url).ok_or_else(|| {
        FetchError::Build(format!("github_issue: cannot parse issue URL '{url}'"))
    })?;
    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/issues/{number}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "github_issue: issue '{owner}/{repo}#{number}' not found"
        )));
    }
    // NOTE(review): 403 is assumed to be the unauthenticated rate
    // limit; GitHub can also 403 for blocked/forbidden repos — the
    // message is a best-effort hint, not a guarantee.
    if resp.status == 403 {
        return Err(FetchError::Build(
            "github_issue: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
        ));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "github api returned status {}",
            resp.status
        )));
    }
    let issue: Issue = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("github issue parse: {e}")))?;
    // The same endpoint returns PRs too; reject if we got one so the caller
    // uses /v1/scrape/github_pr instead of getting a half-shaped payload.
    if issue.pull_request.is_some() {
        return Err(FetchError::Build(format!(
            "github_issue: '{owner}/{repo}#{number}' is a pull request, use /v1/scrape/github_pr"
        )));
    }
    // Flatten the nested API shape: author/assignee logins, label
    // names, and the milestone title are lifted to plain strings.
    Ok(json!({
        "url": url,
        "owner": owner,
        "repo": repo,
        "number": issue.number,
        "title": issue.title,
        "body": issue.body,
        "state": issue.state,
        "state_reason": issue.state_reason,
        "author": issue.user.as_ref().and_then(|u| u.login.clone()),
        "labels": issue.labels.iter().filter_map(|l| l.name.clone()).collect::<Vec<_>>(),
        "assignees": issue.assignees.iter().filter_map(|u| u.login.clone()).collect::<Vec<_>>(),
        "milestone": issue.milestone.as_ref().and_then(|m| m.title.clone()),
        "comments": issue.comments,
        "locked": issue.locked,
        "created_at": issue.created_at,
        "updated_at": issue.updated_at,
        "closed_at": issue.closed_at,
        "html_url": issue.html_url,
    }))
}
/// Parse `(owner, repo, number)` out of a GitHub issue URL.
///
/// Requires an explicit scheme (`…://`), ignores the query string,
/// fragment, and a trailing slash, and tolerates extra path segments
/// after the issue number.
fn parse_issue(url: &str) -> Option<(String, String, u64)> {
    // Drop "scheme://host", keeping only the path.
    let (_, path) = url.split("://").nth(1)?.split_once('/')?;
    // Cut at `?` / `#`, then strip a trailing slash.
    let path = path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut segs = path.split('/').filter(|s| !s.is_empty());
    let owner = segs.next()?;
    let repo = segs.next()?;
    if segs.next()? != "issues" {
        return None;
    }
    let number: u64 = segs.next()?.parse().ok()?;
    Some((owner.to_owned(), repo.to_owned(), number))
}
// ---------------------------------------------------------------------------
// GitHub issue API types
// ---------------------------------------------------------------------------
/// Subset of the GitHub issue payload we consume. Every scalar field
/// is `Option` so absent/null JSON fields deserialize to `None`
/// instead of failing the whole parse.
#[derive(Deserialize)]
struct Issue {
    number: Option<i64>,
    title: Option<String>,
    body: Option<String>,
    state: Option<String>,
    state_reason: Option<String>,
    locked: Option<bool>,
    comments: Option<i64>,
    created_at: Option<String>,
    updated_at: Option<String>,
    closed_at: Option<String>,
    html_url: Option<String>,
    user: Option<UserRef>,
    // `default` so a missing array becomes empty rather than an error.
    #[serde(default)]
    labels: Vec<LabelRef>,
    #[serde(default)]
    assignees: Vec<UserRef>,
    milestone: Option<Milestone>,
    /// Present when this "issue" is actually a pull request. The REST
    /// API overloads the issues endpoint for PRs.
    pull_request: Option<serde_json::Value>,
}
/// Minimal user reference — only the login is surfaced.
#[derive(Deserialize)]
struct UserRef {
    login: Option<String>,
}
/// Minimal label reference — only the name is surfaced.
#[derive(Deserialize)]
struct LabelRef {
    name: Option<String>,
}
/// Minimal milestone reference — only the title is surfaced.
#[derive(Deserialize)]
struct Milestone {
    title: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_issue_urls() {
        // Accepted shapes.
        for url in [
            "https://github.com/rust-lang/rust/issues/100",
            "https://github.com/rust-lang/rust/issues/100/",
        ] {
            assert!(matches(url), "should match: {url}");
        }
        // Rejected shapes: repo root, PRs, and a missing number.
        for url in [
            "https://github.com/rust-lang/rust",
            "https://github.com/rust-lang/rust/pull/100",
            "https://github.com/rust-lang/rust/issues",
        ] {
            assert!(!matches(url), "should not match: {url}");
        }
    }

    #[test]
    fn parse_issue_extracts_owner_repo_number() {
        let expected = Some(("rust-lang".to_string(), "rust".to_string(), 100));
        assert_eq!(
            parse_issue("https://github.com/rust-lang/rust/issues/100"),
            expected
        );
        assert_eq!(
            parse_issue("https://github.com/rust-lang/rust/issues/100/?foo=bar"),
            expected
        );
    }
}

View file

@ -21,6 +21,7 @@ pub mod dev_to;
pub mod docker_hub; pub mod docker_hub;
pub mod ebay_listing; pub mod ebay_listing;
pub mod ecommerce_product; pub mod ecommerce_product;
pub mod github_issue;
pub mod github_pr; pub mod github_pr;
pub mod github_release; pub mod github_release;
pub mod github_repo; pub mod github_repo;
@ -33,9 +34,13 @@ pub mod linkedin_post;
pub mod npm; pub mod npm;
pub mod pypi; pub mod pypi;
pub mod reddit; pub mod reddit;
pub mod shopify_collection;
pub mod shopify_product; pub mod shopify_product;
pub mod stackoverflow; pub mod stackoverflow;
pub mod substack_post;
pub mod trustpilot_reviews; pub mod trustpilot_reviews;
pub mod woocommerce_product;
pub mod youtube_video;
use serde::Serialize; use serde::Serialize;
use serde_json::Value; use serde_json::Value;
@ -65,6 +70,7 @@ pub fn list() -> Vec<ExtractorInfo> {
hackernews::INFO, hackernews::INFO,
github_repo::INFO, github_repo::INFO,
github_pr::INFO, github_pr::INFO,
github_issue::INFO,
github_release::INFO, github_release::INFO,
pypi::INFO, pypi::INFO,
npm::INFO, npm::INFO,
@ -75,11 +81,15 @@ pub fn list() -> Vec<ExtractorInfo> {
docker_hub::INFO, docker_hub::INFO,
dev_to::INFO, dev_to::INFO,
stackoverflow::INFO, stackoverflow::INFO,
substack_post::INFO,
youtube_video::INFO,
linkedin_post::INFO, linkedin_post::INFO,
instagram_post::INFO, instagram_post::INFO,
instagram_profile::INFO, instagram_profile::INFO,
shopify_product::INFO, shopify_product::INFO,
shopify_collection::INFO,
ecommerce_product::INFO, ecommerce_product::INFO,
woocommerce_product::INFO,
amazon_product::INFO, amazon_product::INFO,
ebay_listing::INFO, ebay_listing::INFO,
trustpilot_reviews::INFO, trustpilot_reviews::INFO,
@ -131,6 +141,13 @@ pub async fn dispatch_by_url(
.map(|v| (github_pr::INFO.name, v)), .map(|v| (github_pr::INFO.name, v)),
); );
} }
if github_issue::matches(url) {
return Some(
github_issue::extract(client, url)
.await
.map(|v| (github_issue::INFO.name, v)),
);
}
if github_release::matches(url) { if github_release::matches(url) {
return Some( return Some(
github_release::extract(client, url) github_release::extract(client, url)
@ -233,7 +250,15 @@ pub async fn dispatch_by_url(
.map(|v| (trustpilot_reviews::INFO.name, v)), .map(|v| (trustpilot_reviews::INFO.name, v)),
); );
} }
// NOTE: shopify_product and ecommerce_product are intentionally NOT if youtube_video::matches(url) {
return Some(
youtube_video::extract(client, url)
.await
.map(|v| (youtube_video::INFO.name, v)),
);
}
// NOTE: shopify_product, shopify_collection, ecommerce_product,
// woocommerce_product, and substack_post are intentionally NOT
// in auto-dispatch. Their `matches()` functions are permissive // in auto-dispatch. Their `matches()` functions are permissive
// (any URL with `/products/`, `/product/`, `/p/`, etc.) and // (any URL with `/products/`, `/product/`, `/p/`, etc.) and
// claiming those generically would steal URLs from the default // claiming those generically would steal URLs from the default
@ -282,6 +307,12 @@ pub async fn dispatch_by_name(
}) })
.await .await
} }
n if n == github_issue::INFO.name => {
run_or_mismatch(github_issue::matches(url), n, url, || {
github_issue::extract(client, url)
})
.await
}
n if n == github_release::INFO.name => { n if n == github_release::INFO.name => {
run_or_mismatch(github_release::matches(url), n, url, || { run_or_mismatch(github_release::matches(url), n, url, || {
github_release::extract(client, url) github_release::extract(client, url)
@ -375,6 +406,30 @@ pub async fn dispatch_by_name(
}) })
.await .await
} }
n if n == youtube_video::INFO.name => {
run_or_mismatch(youtube_video::matches(url), n, url, || {
youtube_video::extract(client, url)
})
.await
}
n if n == substack_post::INFO.name => {
run_or_mismatch(substack_post::matches(url), n, url, || {
substack_post::extract(client, url)
})
.await
}
n if n == shopify_collection::INFO.name => {
run_or_mismatch(shopify_collection::matches(url), n, url, || {
shopify_collection::extract(client, url)
})
.await
}
n if n == woocommerce_product::INFO.name => {
run_or_mismatch(woocommerce_product::matches(url), n, url, || {
woocommerce_product::extract(client, url)
})
.await
}
_ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())), _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
} }
} }

View file

@ -0,0 +1,242 @@
//! Shopify collection structured extractor.
//!
//! Every Shopify store exposes `/collections/{handle}.json` and
//! `/collections/{handle}/products.json` on the public surface. This
//! extractor hits `.json` (collection metadata) and falls through to
//! `/products.json` for the first page of products. Same caveat as
//! `shopify_product`: stores with Cloudflare in front of the shop
//! will 403 the public path.
//!
//! Explicit-call only (like `shopify_product`). `/collections/{slug}`
//! is a URL shape used by non-Shopify stores too, so auto-dispatch
//! would claim too many URLs.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor. Listed in `/v1/scrape` but
/// not auto-dispatched — see the module docs for why the
/// `/collections/{handle}` URL shape is too generic to claim.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "shopify_collection",
    label: "Shopify collection",
    description: "Returns collection metadata + first page of products (handle, title, vendor, price, available) on ANY Shopify store via /collections/{handle}.json + /products.json.",
    url_patterns: &[
        "https://{shop}/collections/{handle}",
        "https://{shop}.myshopify.com/collections/{handle}",
    ],
};
/// Permissive Shopify-collection URL check (explicit-call only).
///
/// Rejects known non-Shopify hosts, then requires a non-empty
/// collection handle after `/collections/`. The query string and
/// fragment are stripped first, so handle-less URLs such as
/// `/collections/?page=2` or `/collections/#top` are rejected —
/// previously only the bare trailing `/collections/` form was,
/// letting those slip through to `extract` with no handle.
pub fn matches(url: &str) -> bool {
    let host = host_of(url);
    if host.is_empty() || NON_SHOPIFY_HOSTS.iter().any(|h| host.ends_with(h)) {
        return false;
    }
    // Only the path counts toward the handle check.
    let path = url.split(['?', '#']).next().unwrap_or(url);
    match path.split_once("/collections/") {
        Some((_, handle)) => !handle.trim_end_matches('/').is_empty(),
        None => false,
    }
}
/// Hosts that commonly carry `/collections/` URLs but are known not
/// to be Shopify storefronts. Matched by suffix against the URL host
/// in `matches`, so subdomains are covered too.
const NON_SHOPIFY_HOSTS: &[&str] = &[
    "amazon.com",
    "amazon.co.uk",
    "amazon.de",
    "ebay.com",
    "etsy.com",
    "walmart.com",
    "target.com",
    "aliexpress.com",
    "huggingface.co", // has /collections/ for models
    "github.com",
];
/// Fetch collection metadata plus the first page of products from the
/// public Shopify JSON surface.
///
/// Two requests: `/collections/{handle}.json` (required — 404/403 or
/// non-JSON aborts with a descriptive error) and
/// `/collections/{handle}/products.json?limit=50` (best-effort — any
/// failure there degrades to an empty product list rather than
/// erroring, since the metadata alone is still useful).
///
/// # Errors
/// - `FetchError::Build` for 404 (missing collection), 403 (antibot
///   in front of the store), or any other non-200 metadata status.
/// - `FetchError::BodyDecode` when the metadata body is not Shopify
///   collection JSON (likely not a Shopify store at all).
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (coll_meta_url, coll_products_url) = build_json_urls(url);
    // Step 1: collection metadata. Shopify returns 200 on missing
    // collections sometimes; check "collection" key below.
    let meta_resp = client.fetch(&coll_meta_url).await?;
    if meta_resp.status == 404 {
        return Err(FetchError::Build(format!(
            "shopify_collection: '{url}' not found"
        )));
    }
    if meta_resp.status == 403 {
        return Err(FetchError::Build(format!(
            "shopify_collection: {coll_meta_url} returned 403. The store has antibot in front of the .json endpoint. Use /v1/scrape/ecommerce_product or api.webclaw.io for this store."
        )));
    }
    if meta_resp.status != 200 {
        return Err(FetchError::Build(format!(
            "shopify returned status {} for {coll_meta_url}",
            meta_resp.status
        )));
    }
    let meta: MetaWrapper = serde_json::from_str(&meta_resp.html).map_err(|e| {
        FetchError::BodyDecode(format!(
            "shopify_collection: '{url}' didn't return Shopify JSON, likely not a Shopify store ({e})"
        ))
    })?;
    // Step 2: first page of products for this collection. Best-effort:
    // non-200 or unparseable bodies yield an empty list, not an error.
    let products = match client.fetch(&coll_products_url).await {
        Ok(r) if r.status == 200 => serde_json::from_str::<ProductsWrapper>(&r.html)
            .ok()
            .map(|pw| pw.products)
            .unwrap_or_default(),
        _ => Vec::new(),
    };
    // Per-product summary: price comes from the first variant,
    // availability is true when ANY variant is available.
    let product_summaries: Vec<Value> = products
        .iter()
        .map(|p| {
            let first_variant = p.variants.first();
            json!({
                "id": p.id,
                "handle": p.handle,
                "title": p.title,
                "vendor": p.vendor,
                "product_type": p.product_type,
                "price": first_variant.and_then(|v| v.price.clone()),
                "compare_at_price": first_variant.and_then(|v| v.compare_at_price.clone()),
                "available": p.variants.iter().any(|v| v.available.unwrap_or(false)),
                "variant_count": p.variants.len(),
                "image": p.images.first().and_then(|i| i.src.clone()),
                "created_at": p.created_at,
                "updated_at": p.updated_at,
            })
        })
        .collect();
    let c = meta.collection;
    Ok(json!({
        "url": url,
        "meta_json_url": coll_meta_url,
        "products_json_url": coll_products_url,
        "collection_id": c.id,
        "handle": c.handle,
        "title": c.title,
        "description_html": c.body_html,
        "published_at": c.published_at,
        "updated_at": c.updated_at,
        "sort_order": c.sort_order,
        "products_in_page": product_summaries.len(),
        "products": product_summaries,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Host portion of `url` — everything between `://` and the first
/// `/` — or the empty string when the URL is empty.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    after_scheme.split('/').next().unwrap_or("")
}
/// Build `(collection.json, collection/products.json)` from a user URL.
///
/// Strips both the query string AND the fragment before deriving the
/// API paths — the previous version only cut at `?`, so a URL like
/// `/collections/mens#reviews` produced the broken
/// `/collections/mens#reviews.json`. Also tolerates a trailing slash
/// or an explicit `.json` suffix already present in the input.
fn build_json_urls(url: &str) -> (String, String) {
    // Keep scheme + host + path only; drop `?query` and `#fragment`.
    let path_part = url.split(['?', '#']).next().unwrap_or(url);
    let clean = path_part.trim_end_matches('/').trim_end_matches(".json");
    (
        format!("{clean}.json"),
        format!("{clean}/products.json?limit=50"),
    )
}
// ---------------------------------------------------------------------------
// Shopify collection + product JSON shapes (subsets)
// ---------------------------------------------------------------------------
/// `/collections/{handle}.json` wraps the payload in a top-level
/// `collection` object; this is required (no `Option`) so a non-Shopify
/// JSON body fails deserialization and produces the clean error above.
#[derive(Deserialize)]
struct MetaWrapper {
    collection: Collection,
}
/// Collection metadata subset. All fields optional so partial
/// payloads still parse.
#[derive(Deserialize)]
struct Collection {
    id: Option<i64>,
    handle: Option<String>,
    title: Option<String>,
    body_html: Option<String>,
    published_at: Option<String>,
    updated_at: Option<String>,
    sort_order: Option<String>,
}
/// `/products.json` wrapper; `default` so a missing array is empty.
#[derive(Deserialize)]
struct ProductsWrapper {
    #[serde(default)]
    products: Vec<ProductSummary>,
}
/// Per-product subset used to build the summary JSON.
#[derive(Deserialize)]
struct ProductSummary {
    id: Option<i64>,
    handle: Option<String>,
    title: Option<String>,
    vendor: Option<String>,
    product_type: Option<String>,
    created_at: Option<String>,
    updated_at: Option<String>,
    #[serde(default)]
    variants: Vec<VariantSummary>,
    #[serde(default)]
    images: Vec<ImageSummary>,
}
/// Variant subset: price fields arrive as strings from Shopify.
#[derive(Deserialize)]
struct VariantSummary {
    price: Option<String>,
    compare_at_price: Option<String>,
    available: Option<bool>,
}
/// Image subset — only the source URL is surfaced.
#[derive(Deserialize)]
struct ImageSummary {
    src: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_shopify_collection_urls() {
        for url in [
            "https://www.allbirds.com/collections/mens",
            "https://shop.example.com/collections/new-arrivals?page=2",
        ] {
            assert!(matches(url), "expected match: {url}");
        }
    }

    #[test]
    fn rejects_non_shopify() {
        for url in [
            "https://github.com/collections/foo",
            "https://huggingface.co/collections/foo",
            "https://example.com/",
            "https://example.com/collections/",
        ] {
            assert!(!matches(url), "expected no match: {url}");
        }
    }

    #[test]
    fn build_json_urls_derives_both_paths() {
        let (meta, products) = build_json_urls("https://shop.example.com/collections/mens");
        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
        assert_eq!(
            products,
            "https://shop.example.com/collections/mens/products.json?limit=50"
        );
    }

    #[test]
    fn build_json_urls_handles_trailing_slash() {
        let (meta, _) = build_json_urls("https://shop.example.com/collections/mens/");
        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
    }
}

View file

@ -0,0 +1,213 @@
//! Substack post extractor.
//!
//! Every Substack publication exposes `/api/v1/posts/{slug}` that
//! returns the full post as JSON: body HTML, cover image, author,
//! publication info, reactions, paywall state. No auth on public
//! posts.
//!
//! Works on both `*.substack.com` subdomains and custom domains
//! (e.g. `simonwillison.net` uses Substack too). Detection is
//! "URL has `/p/{slug}`" because that's the canonical Substack post
//! path. Explicit-call only because the `/p/{slug}` URL shape is
//! used by non-Substack sites too.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor. Explicit-call only — the
/// `/p/{slug}` shape is used by non-Substack sites too (see module
/// docs).
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "substack_post",
    label: "Substack post",
    description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.",
    url_patterns: &[
        "https://{pub}.substack.com/p/{slug}",
        "https://{custom-domain}/p/{slug}",
    ],
};
pub fn matches(url: &str) -> bool {
if !(url.starts_with("http://") || url.starts_with("https://")) {
return false;
}
url.contains("/p/")
}
/// Fetch a post from the publication's own `/api/v1/posts/{slug}`
/// endpoint (works on `*.substack.com` and custom domains alike) and
/// return a flat JSON summary.
///
/// The request is made against the ORIGINAL URL's host and scheme, so
/// custom-domain publications resolve naturally.
///
/// # Errors
/// - `FetchError::Build` when no slug can be parsed, the host is
///   empty, the post is missing (404), or the API returns any other
///   non-200 status.
/// - `FetchError::BodyDecode` when the body is not Substack post JSON
///   (likely not a Substack publication).
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let slug = parse_slug(url).ok_or_else(|| {
        FetchError::Build(format!("substack_post: cannot parse slug from '{url}'"))
    })?;
    let host = host_of(url);
    if host.is_empty() {
        return Err(FetchError::Build(format!(
            "substack_post: empty host in '{url}'"
        )));
    }
    // Preserve the caller's scheme; anything that isn't explicit
    // http:// is requested over https.
    let scheme = if url.starts_with("http://") {
        "http"
    } else {
        "https"
    };
    let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "substack_post: '{slug}' not found on {host} (got 404). \
             If the publication isn't actually on Substack, use /v1/scrape instead."
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "substack returned status {} for {api_url}",
            resp.status
        )));
    }
    let p: Post = serde_json::from_str(&resp.html).map_err(|e| {
        FetchError::BodyDecode(format!(
            "substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})"
        ))
    })?;
    Ok(json!({
        "url": url,
        "api_url": api_url,
        "id": p.id,
        "type": p.r#type,
        "slug": p.slug,
        "title": p.title,
        "subtitle": p.subtitle,
        "description": p.description,
        "canonical_url": p.canonical_url,
        "post_date": p.post_date,
        "updated_at": p.updated_at,
        "audience": p.audience,
        // Derived flag: paid-only audiences imply a paywall.
        "has_paywall": matches!(p.audience.as_deref(), Some("only_paid") | Some("founding")),
        "is_free_preview": p.is_free_preview,
        "cover_image": p.cover_image,
        "word_count": p.wordcount,
        "reactions": p.reactions,
        "comment_count": p.comment_count,
        "body_html": p.body_html,
        // Prefer the truncated text when present (paywalled posts),
        // falling back to the full body text.
        "body_text": p.truncated_body_text.or(p.body_text),
        "publication": json!({
            "id": p.publication.as_ref().and_then(|pub_| pub_.id),
            "name": p.publication.as_ref().and_then(|pub_| pub_.name.clone()),
            "subdomain": p.publication.as_ref().and_then(|pub_| pub_.subdomain.clone()),
            "custom_domain": p.publication.as_ref().and_then(|pub_| pub_.custom_domain.clone()),
        }),
        "authors": p.published_bylines.iter().map(|a| json!({
            "id": a.id,
            "name": a.name,
            "handle": a.handle,
            "photo": a.photo_url,
        })).collect::<Vec<_>>(),
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Host portion of `url`, or `""` when none can be found.
fn host_of(url: &str) -> &str {
    let tail = url.split("://").nth(1).unwrap_or(url);
    match tail.find('/') {
        Some(i) => &tail[..i],
        None => tail,
    }
}
/// Slug component after `/p/`, with query string, fragment, and any
/// further path segments removed. `None` when absent or empty.
fn parse_slug(url: &str) -> Option<String> {
    let (_, tail) = url.split_once("/p/")?;
    // The slug ends at the first of `?`, `#`, or `/`.
    let slug = tail.split(['?', '#', '/']).next().unwrap_or("");
    (!slug.is_empty()).then(|| slug.to_string())
}
// ---------------------------------------------------------------------------
// Substack API types (subset)
// ---------------------------------------------------------------------------
/// Subset of the Substack post payload. All scalars `Option` so
/// missing/null fields never fail the parse; `r#type` escapes the
/// Rust keyword while keeping the JSON key `type`.
#[derive(Deserialize)]
struct Post {
    id: Option<i64>,
    r#type: Option<String>,
    slug: Option<String>,
    title: Option<String>,
    subtitle: Option<String>,
    description: Option<String>,
    canonical_url: Option<String>,
    post_date: Option<String>,
    updated_at: Option<String>,
    audience: Option<String>,
    is_free_preview: Option<bool>,
    cover_image: Option<String>,
    wordcount: Option<i64>,
    // Kept as raw JSON: the reactions shape is passed through as-is.
    reactions: Option<serde_json::Value>,
    comment_count: Option<i64>,
    body_html: Option<String>,
    body_text: Option<String>,
    truncated_body_text: Option<String>,
    publication: Option<Publication>,
    // Substack uses camelCase for this one field; rename + default
    // so a missing array is simply empty.
    #[serde(default, rename = "publishedBylines")]
    published_bylines: Vec<Byline>,
}
/// Publication subset: identity and the two domain fields.
#[derive(Deserialize)]
struct Publication {
    id: Option<i64>,
    name: Option<String>,
    subdomain: Option<String>,
    custom_domain: Option<String>,
}
/// Author byline subset.
#[derive(Deserialize)]
struct Byline {
    id: Option<i64>,
    name: Option<String>,
    handle: Option<String>,
    photo_url: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_post_urls() {
        for url in [
            "https://stratechery.substack.com/p/the-tech-letter",
            "https://simonwillison.net/p/2024-08-01-something",
        ] {
            assert!(matches(url), "expected match: {url}");
        }
        // No /p/ path, and a non-http scheme, must be rejected.
        for url in ["https://example.com/", "ftp://example.com/p/foo"] {
            assert!(!matches(url), "expected no match: {url}");
        }
    }

    #[test]
    fn parse_slug_strips_query_and_trailing_slash() {
        let expected = Some("my-post".to_string());
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post"),
            expected
        );
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post/"),
            expected
        );
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post?ref=123"),
            expected
        );
    }
}

View file

@ -0,0 +1,237 @@
//! WooCommerce product structured extractor.
//!
//! Targets WooCommerce's Store API: `/wp-json/wc/store/v1/products?slug={slug}`.
//! About 30-50% of WooCommerce stores expose this endpoint publicly
//! (it's on by default, but common security plugins disable it).
//! When it's off, the server returns 404 at /wp-json. We surface a
//! clean error and point callers at `/v1/scrape/ecommerce_product`
//! which works on any store with Schema.org JSON-LD.
//!
//! Explicit-call only. `/product/{slug}` is the default permalink for
//! WooCommerce but custom stores use every variation imaginable, so
//! auto-dispatch is unreliable.
use serde::Deserialize;
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor. Explicit-call only — see the
/// module docs for why the permalink shapes are too generic to
/// auto-dispatch.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "woocommerce_product",
    label: "WooCommerce product",
    description: "Returns product via the WooCommerce Store REST API (requires the /wp-json/wc/store endpoint to be enabled on the target store).",
    url_patterns: &[
        "https://{shop}/product/{slug}",
        "https://{shop}/shop/{slug}",
    ],
};
/// Loose permalink check for WooCommerce product URLs.
///
/// Deliberately permissive: stores use custom domains and custom
/// permalinks, so the Store-API probe in `extract` is the real
/// confirmation that the target is WooCommerce.
pub fn matches(url: &str) -> bool {
    if host_of(url).is_empty() {
        return false;
    }
    // Default permalink, the shop base, and the common es/fr locales.
    ["/product/", "/shop/", "/producto/", "/produit/"]
        .iter()
        .any(|needle| url.contains(needle))
}
/// Look up a product by slug via the WooCommerce Store API
/// (`/wp-json/wc/store/v1/products?slug={slug}&per_page=1`) on the
/// original URL's host and return a flat JSON summary.
///
/// # Errors
/// - `FetchError::Build` when no slug can be parsed, the host is
///   empty, the store does not expose the Store API (404), the
///   endpoint requires auth (401/403), the API returns any other
///   non-200 status, or the slug matches no product.
/// - `FetchError::BodyDecode` when the body is not the expected
///   product array JSON.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let slug = parse_slug(url).ok_or_else(|| {
        FetchError::Build(format!(
            "woocommerce_product: cannot parse slug from '{url}'"
        ))
    })?;
    let host = host_of(url);
    if host.is_empty() {
        return Err(FetchError::Build(format!(
            "woocommerce_product: empty host in '{url}'"
        )));
    }
    // Preserve the caller's scheme; default to https.
    let scheme = if url.starts_with("http://") {
        "http"
    } else {
        "https"
    };
    let api_url = format!("{scheme}://{host}/wp-json/wc/store/v1/products?slug={slug}&per_page=1");
    let resp = client.fetch(&api_url).await?;
    // 404 here means the whole Store API surface is missing/disabled,
    // not that the product is absent — point callers at the JSON-LD
    // fallback extractor.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "woocommerce_product: {host} does not expose /wp-json/wc/store (404). \
             Use /v1/scrape/ecommerce_product for JSON-LD fallback."
        )));
    }
    if resp.status == 401 || resp.status == 403 {
        return Err(FetchError::Build(format!(
            "woocommerce_product: {host} requires auth for /wp-json/wc/store ({}). \
             Use /v1/scrape/ecommerce_product for the public JSON-LD fallback.",
            resp.status
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "woocommerce api returned status {} for {api_url}",
            resp.status
        )));
    }
    // The slug query returns an array; a matching store yields 0 or 1
    // entries (per_page=1).
    let products: Vec<Product> = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("woocommerce parse: {e}")))?;
    let p = products.into_iter().next().ok_or_else(|| {
        FetchError::Build(format!(
            "woocommerce_product: no product found for slug '{slug}' on {host}"
        ))
    })?;
    let images: Vec<Value> = p
        .images
        .iter()
        .map(|i| json!({"src": i.src, "thumbnail": i.thumbnail, "alt": i.alt}))
        .collect();
    let variations_count = p.variations.as_ref().map(|v| v.len()).unwrap_or(0);
    Ok(json!({
        "url": url,
        "api_url": api_url,
        "product_id": p.id,
        "name": p.name,
        "slug": p.slug,
        "sku": p.sku,
        "permalink": p.permalink,
        "on_sale": p.on_sale,
        "in_stock": p.is_in_stock,
        "is_purchasable": p.is_purchasable,
        "price": p.prices.as_ref().and_then(|pr| pr.price.clone()),
        "regular_price": p.prices.as_ref().and_then(|pr| pr.regular_price.clone()),
        "sale_price": p.prices.as_ref().and_then(|pr| pr.sale_price.clone()),
        "currency": p.prices.as_ref().and_then(|pr| pr.currency_code.clone()),
        "currency_minor": p.prices.as_ref().and_then(|pr| pr.currency_minor_unit),
        "price_range": p.prices.as_ref().and_then(|pr| pr.price_range.clone()),
        "average_rating": p.average_rating,
        "review_count": p.review_count,
        "description": p.description,
        "short_description": p.short_description,
        "categories": p.categories.iter().filter_map(|c| c.name.clone()).collect::<Vec<_>>(),
        "tags": p.tags.iter().filter_map(|t| t.name.clone()).collect::<Vec<_>>(),
        "variation_count": variations_count,
        "image_count": images.len(),
        "images": images,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Host portion of `url` — the text between `://` and the first `/` —
/// or `""` when the URL is empty.
fn host_of(url: &str) -> &str {
    let mut pieces = url.split("://");
    let _scheme = pieces.next();
    let rest = pieces.next().unwrap_or(url);
    rest.split('/').next().unwrap_or("")
}
/// Extract the product slug from common WooCommerce permalinks.
///
/// Tries each known permalink base in order and returns the first
/// non-empty path segment that follows it (query string, fragment,
/// trailing slash, and deeper segments stripped). A base whose slug
/// turns out empty does not abort the search — later bases still get
/// a chance.
fn parse_slug(url: &str) -> Option<String> {
    const BASES: [&str; 4] = ["/product/", "/shop/", "/producto/", "/produit/"];
    BASES.iter().find_map(|base| {
        let tail = url.split(base).nth(1)?;
        let slug = tail
            .split(['?', '#'])
            .next()?
            .trim_end_matches('/')
            .split('/')
            .next()
            .unwrap_or("");
        (!slug.is_empty()).then(|| slug.to_string())
    })
}
// ---------------------------------------------------------------------------
// Store API types (subset of the full response)
// ---------------------------------------------------------------------------
/// Subset of the Store API product payload. All scalars `Option` so
/// partial payloads parse; arrays default to empty.
#[derive(Deserialize)]
struct Product {
    id: Option<i64>,
    name: Option<String>,
    slug: Option<String>,
    sku: Option<String>,
    permalink: Option<String>,
    description: Option<String>,
    short_description: Option<String>,
    on_sale: Option<bool>,
    is_in_stock: Option<bool>,
    is_purchasable: Option<bool>,
    average_rating: Option<serde_json::Value>, // string or number
    review_count: Option<i64>,
    prices: Option<Prices>,
    #[serde(default)]
    categories: Vec<Term>,
    #[serde(default)]
    tags: Vec<Term>,
    #[serde(default)]
    images: Vec<Img>,
    // Only the count is surfaced, so the entries stay untyped.
    variations: Option<Vec<serde_json::Value>>,
}
/// Price block: amounts arrive as strings in minor units alongside
/// `currency_minor_unit`; `price_range` is passed through untyped.
#[derive(Deserialize)]
struct Prices {
    price: Option<String>,
    regular_price: Option<String>,
    sale_price: Option<String>,
    currency_code: Option<String>,
    currency_minor_unit: Option<i64>,
    price_range: Option<serde_json::Value>,
}
/// Category/tag reference — only the name is surfaced.
#[derive(Deserialize)]
struct Term {
    name: Option<String>,
}
/// Image reference subset.
#[derive(Deserialize)]
struct Img {
    src: Option<String>,
    thumbnail: Option<String>,
    alt: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_common_permalinks() {
        for url in [
            "https://shop.example.com/product/cool-widget",
            "https://shop.example.com/shop/cool-widget",
            "https://tienda.example.com/producto/cosa",
            "https://boutique.example.com/produit/chose",
        ] {
            assert!(matches(url), "expected match: {url}");
        }
    }

    #[test]
    fn parse_slug_handles_locale_and_suffix() {
        assert_eq!(
            parse_slug("https://shop.example.com/product/cool-widget"),
            Some("cool-widget".into())
        );
        assert_eq!(
            parse_slug("https://shop.example.com/product/cool-widget/?attr=red"),
            Some("cool-widget".into())
        );
        assert_eq!(
            parse_slug("https://tienda.example.com/producto/cosa/"),
            Some("cosa".into())
        );
    }
}

View file

@ -0,0 +1,255 @@
//! YouTube video structured extractor.
//!
//! YouTube embeds the full player configuration in a
//! `ytInitialPlayerResponse` JavaScript assignment at the top of
//! every `/watch`, `/shorts`, and `youtu.be` HTML page. We reuse the
//! core crate's already-proven regex + parse to surface typed JSON
//! from it: video id, title, author + channel id, view count,
//! duration, upload date, keywords, thumbnails, caption-track URLs.
//!
//! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/`
//! shape is stable.
use serde_json::{Value, json};
use super::ExtractorInfo;
use crate::client::FetchClient;
use crate::error::FetchError;
/// Registry metadata for this extractor. Auto-dispatched: the YouTube
/// host is unique and the URL shapes are stable.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "youtube_video",
    label: "YouTube video",
    description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs.",
    url_patterns: &[
        "https://www.youtube.com/watch?v={id}",
        "https://youtu.be/{id}",
        "https://www.youtube.com/shorts/{id}",
    ],
};
/// True for anything this extractor can handle: whatever the core
/// crate's YouTube predicate accepts, plus Shorts URLs and
/// youtube-nocookie embeds.
pub fn matches(url: &str) -> bool {
    if webclaw_core::youtube::is_youtube_url(url) {
        return true;
    }
    url.contains("youtube.com/shorts/") || url.contains("youtube-nocookie.com/embed/")
}
/// Fetch the canonical `/watch` page for the video and surface typed
/// JSON from its embedded `ytInitialPlayerResponse` blob.
///
/// # Errors
/// - `FetchError::Build` when no video id can be parsed or the watch
///   page returns a non-200 status.
/// - `FetchError::BodyDecode` when the page has no player-response
///   blob (private, region-blocked, or removed videos).
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let video_id = parse_video_id(url).ok_or_else(|| {
        FetchError::Build(format!("youtube_video: cannot parse video id from '{url}'"))
    })?;
    // Always fetch the canonical /watch URL. /shorts/ and youtu.be
    // sometimes serve a thinner page without the player blob.
    let canonical = format!("https://www.youtube.com/watch?v={video_id}");
    let resp = client.fetch(&canonical).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "youtube returned status {} for {canonical}",
            resp.status
        )));
    }
    let player = extract_player_response(&resp.html).ok_or_else(|| {
        FetchError::BodyDecode(format!(
            "youtube_video: no ytInitialPlayerResponse on {canonical} (video may be private, region-blocked, or removed)"
        ))
    })?;
    // The two sub-objects we read from the blob: `videoDetails` for
    // core metadata, `microformat.playerMicroformatRenderer` for
    // category/dates/unlisted state.
    let video_details = player.get("videoDetails");
    let microformat = player
        .get("microformat")
        .and_then(|m| m.get("playerMicroformatRenderer"));
    let thumbnails: Vec<Value> = video_details
        .and_then(|vd| vd.get("thumbnail"))
        .and_then(|t| t.get("thumbnails"))
        .and_then(|t| t.as_array())
        .cloned()
        .unwrap_or_default();
    let keywords: Vec<Value> = video_details
        .and_then(|vd| vd.get("keywords"))
        .and_then(|k| k.as_array())
        .cloned()
        .unwrap_or_default();
    // Caption-track discovery is delegated to the core crate, which
    // parses the same HTML.
    let caption_tracks = webclaw_core::youtube::extract_caption_tracks(&resp.html);
    let captions: Vec<Value> = caption_tracks
        .iter()
        .map(|c| {
            json!({
                "url": c.url,
                "lang": c.lang,
                "name": c.name,
            })
        })
        .collect();
    Ok(json!({
        "url": url,
        "canonical_url": canonical,
        "video_id": video_id,
        "title": get_str(video_details, "title"),
        "description": get_str(video_details, "shortDescription"),
        "author": get_str(video_details, "author"),
        "channel_id": get_str(video_details, "channelId"),
        "channel_url": get_str(microformat, "ownerProfileUrl"),
        "view_count": get_int(video_details, "viewCount"),
        "length_seconds": get_int(video_details, "lengthSeconds"),
        "is_live": video_details.and_then(|vd| vd.get("isLiveContent")).and_then(|v| v.as_bool()),
        "is_private": video_details.and_then(|vd| vd.get("isPrivate")).and_then(|v| v.as_bool()),
        "is_unlisted": microformat.and_then(|m| m.get("isUnlisted")).and_then(|v| v.as_bool()),
        "allow_ratings": video_details.and_then(|vd| vd.get("allowRatings")).and_then(|v| v.as_bool()),
        "category": get_str(microformat, "category"),
        "upload_date": get_str(microformat, "uploadDate"),
        "publish_date": get_str(microformat, "publishDate"),
        "keywords": keywords,
        "thumbnails": thumbnails,
        "caption_tracks": captions,
    }))
}
// ---------------------------------------------------------------------------
// URL helpers
// ---------------------------------------------------------------------------
/// Extracts the video id from any supported YouTube URL shape:
///
/// - `youtu.be/{id}`
/// - `youtube.com/shorts/{id}`
/// - `{host}/embed/{id}` (youtube.com and youtube-nocookie.com)
/// - `youtube.com/watch?v={id}` (`v` may sit anywhere in the query string)
///
/// Returns `None` when no non-empty id is present. Host filtering is not
/// done here — `matches` rejects non-YouTube hosts before this runs.
fn parse_video_id(url: &str) -> Option<String> {
    // Path-based shapes: the id immediately follows a known marker.
    // (Previously this split/trim logic was repeated three times, each
    // copy with a dead `trim_end_matches('/')` — the segment is cut at
    // the first '/', so it can never end with one.)
    for marker in ["youtu.be/", "youtube.com/shorts/", "/embed/"] {
        if let Some(after) = url.split(marker).nth(1)
            && let Some(id) = first_path_segment(after)
        {
            return Some(id);
        }
    }
    // Query-based shape: youtube.com/watch?v={id} (also matches
    // youtube.com/watch?foo=bar&v={id}).
    let query = url.split_once('?').map(|(_, q)| q)?;
    let raw = query.split('&').find_map(|p| p.strip_prefix("v="))?;
    // A fragment may trail the value (`?v=abc#t=10`); cut it off.
    let id = raw.split(['#', '/']).next().unwrap_or(raw);
    (!id.is_empty()).then(|| id.to_string())
}

/// Leading path segment of `rest`: everything before the first `?`, `#`,
/// or `/`. Returns `None` when that segment is empty.
fn first_path_segment(rest: &str) -> Option<String> {
    let seg = rest.split(['?', '#', '/']).next().unwrap_or("");
    (!seg.is_empty()).then(|| seg.to_string())
}
// ---------------------------------------------------------------------------
// Player-response parsing
// ---------------------------------------------------------------------------
/// Pulls the `ytInitialPlayerResponse` JSON blob out of a watch-page HTML
/// document and parses it into a [`Value`].
///
/// Same regex as `webclaw_core::youtube`. Duplicated here because core's
/// regex is module-private. Kept in lockstep; changes are rare and we
/// cover with tests in both places.
///
/// Returns `None` when the blob is absent or does not parse as JSON.
fn extract_player_response(html: &str) -> Option<Value> {
    use regex::Regex;
    use std::sync::OnceLock;
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| {
        Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap()
    });
    let caps = re.captures(html)?;
    let blob = caps.get(1)?.as_str();
    serde_json::from_str(blob).ok()
}
/// Reads `key` from an optional JSON object as an owned string. `None`
/// when the container is absent, the key is missing, or the value is not
/// a JSON string.
fn get_str(v: Option<&Value>, key: &str) -> Option<String> {
    let field = v?.get(key)?;
    field.as_str().map(String::from)
}
/// Reads `key` from an optional JSON object as an `i64`, accepting either
/// a JSON number or a numeric string (YouTube serializes counts such as
/// `viewCount` as strings — see the test fixture).
fn get_int(v: Option<&Value>, key: &str) -> Option<i64> {
    let field = v?.get(key)?;
    if let Some(n) = field.as_i64() {
        return Some(n);
    }
    field.as_str()?.parse::<i64>().ok()
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_watch_urls() {
        // Every supported URL shape should be accepted.
        let accepted = [
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtu.be/dQw4w9WgXcQ",
            "https://www.youtube.com/shorts/abc123",
            "https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ",
        ];
        for url in accepted {
            assert!(matches(url));
        }
    }

    #[test]
    fn rejects_non_video_urls() {
        // Non-video YouTube pages and foreign hosts must not dispatch here.
        let rejected = [
            "https://www.youtube.com/",
            "https://www.youtube.com/channel/abc",
            "https://example.com/watch?v=abc",
        ];
        for url in rejected {
            assert!(!matches(url));
        }
    }

    #[test]
    fn parse_video_id_from_each_shape() {
        let cases = [
            ("https://www.youtube.com/watch?v=dQw4w9WgXcQ", "dQw4w9WgXcQ"),
            ("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s", "dQw4w9WgXcQ"),
            (
                "https://www.youtube.com/watch?feature=share&v=dQw4w9WgXcQ",
                "dQw4w9WgXcQ",
            ),
            ("https://youtu.be/dQw4w9WgXcQ", "dQw4w9WgXcQ"),
            ("https://youtu.be/dQw4w9WgXcQ?t=30", "dQw4w9WgXcQ"),
            ("https://www.youtube.com/shorts/abc123", "abc123"),
        ];
        for (url, expected) in cases {
            assert_eq!(parse_video_id(url), Some(expected.to_string()));
        }
    }

    #[test]
    fn extract_player_response_happy_path() {
        let html = r#"
<html><body>
<script>
var ytInitialPlayerResponse = {"videoDetails":{"videoId":"abc","title":"T","author":"A","viewCount":"100","lengthSeconds":"60","shortDescription":"d"}};
</script>
</body></html>
"#;
        let player = extract_player_response(html).unwrap();
        let details = player.get("videoDetails").unwrap();
        assert_eq!(details.get("title").unwrap().as_str(), Some("T"));
    }
}