mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat(extractors): wave 6a, 5 easy verticals (27 total)
Adds 5 structured extractors that hit public APIs with stable shapes:
- github_issue: /repos/{o}/{r}/issues/{n} (rejects PRs, points to github_pr)
- shopify_collection: /collections/{handle}.json + products.json
- woocommerce_product: /wp-json/wc/store/v1/products?slug={slug}
- substack_post: /api/v1/posts/{slug} (works on custom domains too)
- youtube_video: ytInitialPlayerResponse blob from /watch HTML
Auto-dispatched: github_issue, youtube_video (unique hosts and stable
URL shapes). Explicit-call: shopify_collection, woocommerce_product,
substack_post (URL shapes overlap with non-target sites).
Tests: 82 total passing in webclaw-fetch (12 new), clippy clean.
This commit is contained in:
parent
d8c9274a9c
commit
8cc727c2f2
6 changed files with 1175 additions and 1 deletions
172
crates/webclaw-fetch/src/extractors/github_issue.rs
Normal file
172
crates/webclaw-fetch/src/extractors/github_issue.rs
Normal file
|
|
@ -0,0 +1,172 @@
|
||||||
|
//! GitHub issue structured extractor.
|
||||||
|
//!
|
||||||
|
//! Mirror of `github_pr` but on `/issues/{number}`. Uses
|
||||||
|
//! `api.github.com/repos/{owner}/{repo}/issues/{number}`. Returns the
|
||||||
|
//! issue body + comment count + labels + milestone + author /
|
||||||
|
//! assignees. Full per-comment bodies would be another call; kept for
|
||||||
|
//! a follow-up.
|
||||||
|
|
||||||
|
use serde::Deserialize;
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
|
use super::ExtractorInfo;
|
||||||
|
use crate::client::FetchClient;
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
/// Static metadata for the `github_issue` extractor; surfaced by
/// `extractors::list()` and used as the vertical name in dispatch.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "github_issue",
    label: "GitHub issue",
    description: "Returns issue metadata: title, body, state, author, labels, assignees, milestone, comment count.",
    url_patterns: &["https://github.com/{owner}/{repo}/issues/{number}"],
};
|
||||||
|
|
||||||
|
pub fn matches(url: &str) -> bool {
|
||||||
|
let host = url
|
||||||
|
.split("://")
|
||||||
|
.nth(1)
|
||||||
|
.unwrap_or(url)
|
||||||
|
.split('/')
|
||||||
|
.next()
|
||||||
|
.unwrap_or("");
|
||||||
|
if host != "github.com" && host != "www.github.com" {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
parse_issue(url).is_some()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch a single GitHub issue through the REST API and flatten it
/// into a JSON payload.
///
/// Maps 404 and 403 (rate limit) onto descriptive `FetchError::Build`
/// messages, and rejects pull requests (the issues endpoint serves
/// them too) so callers use the `github_pr` vertical instead.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (owner, repo, number) = parse_issue(url).ok_or_else(|| {
        FetchError::Build(format!("github_issue: cannot parse issue URL '{url}'"))
    })?;

    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/issues/{number}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "github_issue: issue '{owner}/{repo}#{number}' not found"
        )));
    }
    if resp.status == 403 {
        // GitHub signals an exhausted unauthenticated quota with 403;
        // point the caller at the token-based higher limit.
        return Err(FetchError::Build(
            "github_issue: rate limited (60/hour unauth). Set GITHUB_TOKEN for 5,000/hour.".into(),
        ));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "github api returned status {}",
            resp.status
        )));
    }

    // resp.html holds the raw response body (JSON here despite the name).
    let issue: Issue = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("github issue parse: {e}")))?;

    // The same endpoint returns PRs too; reject if we got one so the caller
    // uses /v1/scrape/github_pr instead of getting a half-shaped payload.
    if issue.pull_request.is_some() {
        return Err(FetchError::Build(format!(
            "github_issue: '{owner}/{repo}#{number}' is a pull request, use /v1/scrape/github_pr"
        )));
    }

    // Flatten nested refs (user/labels/assignees/milestone) down to the
    // scalar fields callers actually want; missing fields become null.
    Ok(json!({
        "url": url,
        "owner": owner,
        "repo": repo,
        "number": issue.number,
        "title": issue.title,
        "body": issue.body,
        "state": issue.state,
        "state_reason": issue.state_reason,
        "author": issue.user.as_ref().and_then(|u| u.login.clone()),
        "labels": issue.labels.iter().filter_map(|l| l.name.clone()).collect::<Vec<_>>(),
        "assignees": issue.assignees.iter().filter_map(|u| u.login.clone()).collect::<Vec<_>>(),
        "milestone": issue.milestone.as_ref().and_then(|m| m.title.clone()),
        "comments": issue.comments,
        "locked": issue.locked,
        "created_at": issue.created_at,
        "updated_at": issue.updated_at,
        "closed_at": issue.closed_at,
        "html_url": issue.html_url,
    }))
}
|
||||||
|
|
||||||
|
/// Pull `(owner, repo, issue number)` out of a GitHub issue URL.
/// Query string, fragment, and any trailing slash are ignored; extra
/// path segments after the number are tolerated.
fn parse_issue(url: &str) -> Option<(String, String, u64)> {
    let after_scheme = url.split("://").nth(1)?;
    let (_, raw_path) = after_scheme.split_once('/')?;
    let clean = raw_path.split(['?', '#']).next()?.trim_end_matches('/');
    let mut parts = clean.split('/').filter(|seg| !seg.is_empty());
    let owner = parts.next()?;
    let repo = parts.next()?;
    if parts.next()? != "issues" {
        return None;
    }
    let number = parts.next()?.parse::<u64>().ok()?;
    Some((owner.to_owned(), repo.to_owned(), number))
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// GitHub issue API types
// ---------------------------------------------------------------------------

/// Subset of the GitHub REST issue object. Every field is optional so
/// upstream schema drift degrades to `null`s in our output instead of
/// a deserialization failure.
#[derive(Deserialize)]
struct Issue {
    number: Option<i64>,
    title: Option<String>,
    body: Option<String>,
    state: Option<String>,
    state_reason: Option<String>,
    locked: Option<bool>,
    comments: Option<i64>,
    created_at: Option<String>,
    updated_at: Option<String>,
    closed_at: Option<String>,
    html_url: Option<String>,
    user: Option<UserRef>,
    #[serde(default)]
    labels: Vec<LabelRef>,
    #[serde(default)]
    assignees: Vec<UserRef>,
    milestone: Option<Milestone>,
    /// Present when this "issue" is actually a pull request. The REST
    /// API overloads the issues endpoint for PRs.
    pull_request: Option<serde_json::Value>,
}

/// Minimal user reference; only the login is surfaced.
#[derive(Deserialize)]
struct UserRef {
    login: Option<String>,
}

/// Minimal label reference; only the name is surfaced.
#[derive(Deserialize)]
struct LabelRef {
    name: Option<String>,
}

/// Minimal milestone reference; only the title is surfaced.
#[derive(Deserialize)]
struct Milestone {
    title: Option<String>,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Only `/issues/{number}` pages on github.com should match;
    /// repo roots, PRs, and the bare issues index must not.
    #[test]
    fn matches_issue_urls() {
        assert!(matches("https://github.com/rust-lang/rust/issues/100"));
        assert!(matches("https://github.com/rust-lang/rust/issues/100/"));
        assert!(!matches("https://github.com/rust-lang/rust"));
        assert!(!matches("https://github.com/rust-lang/rust/pull/100"));
        assert!(!matches("https://github.com/rust-lang/rust/issues"));
    }

    /// Parsing tolerates a trailing slash and a query string.
    #[test]
    fn parse_issue_extracts_owner_repo_number() {
        assert_eq!(
            parse_issue("https://github.com/rust-lang/rust/issues/100"),
            Some(("rust-lang".into(), "rust".into(), 100))
        );
        assert_eq!(
            parse_issue("https://github.com/rust-lang/rust/issues/100/?foo=bar"),
            Some(("rust-lang".into(), "rust".into(), 100))
        );
    }
}
|
||||||
|
|
@ -21,6 +21,7 @@ pub mod dev_to;
|
||||||
pub mod docker_hub;
|
pub mod docker_hub;
|
||||||
pub mod ebay_listing;
|
pub mod ebay_listing;
|
||||||
pub mod ecommerce_product;
|
pub mod ecommerce_product;
|
||||||
|
pub mod github_issue;
|
||||||
pub mod github_pr;
|
pub mod github_pr;
|
||||||
pub mod github_release;
|
pub mod github_release;
|
||||||
pub mod github_repo;
|
pub mod github_repo;
|
||||||
|
|
@ -33,9 +34,13 @@ pub mod linkedin_post;
|
||||||
pub mod npm;
|
pub mod npm;
|
||||||
pub mod pypi;
|
pub mod pypi;
|
||||||
pub mod reddit;
|
pub mod reddit;
|
||||||
|
pub mod shopify_collection;
|
||||||
pub mod shopify_product;
|
pub mod shopify_product;
|
||||||
pub mod stackoverflow;
|
pub mod stackoverflow;
|
||||||
|
pub mod substack_post;
|
||||||
pub mod trustpilot_reviews;
|
pub mod trustpilot_reviews;
|
||||||
|
pub mod woocommerce_product;
|
||||||
|
pub mod youtube_video;
|
||||||
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
@ -65,6 +70,7 @@ pub fn list() -> Vec<ExtractorInfo> {
|
||||||
hackernews::INFO,
|
hackernews::INFO,
|
||||||
github_repo::INFO,
|
github_repo::INFO,
|
||||||
github_pr::INFO,
|
github_pr::INFO,
|
||||||
|
github_issue::INFO,
|
||||||
github_release::INFO,
|
github_release::INFO,
|
||||||
pypi::INFO,
|
pypi::INFO,
|
||||||
npm::INFO,
|
npm::INFO,
|
||||||
|
|
@ -75,11 +81,15 @@ pub fn list() -> Vec<ExtractorInfo> {
|
||||||
docker_hub::INFO,
|
docker_hub::INFO,
|
||||||
dev_to::INFO,
|
dev_to::INFO,
|
||||||
stackoverflow::INFO,
|
stackoverflow::INFO,
|
||||||
|
substack_post::INFO,
|
||||||
|
youtube_video::INFO,
|
||||||
linkedin_post::INFO,
|
linkedin_post::INFO,
|
||||||
instagram_post::INFO,
|
instagram_post::INFO,
|
||||||
instagram_profile::INFO,
|
instagram_profile::INFO,
|
||||||
shopify_product::INFO,
|
shopify_product::INFO,
|
||||||
|
shopify_collection::INFO,
|
||||||
ecommerce_product::INFO,
|
ecommerce_product::INFO,
|
||||||
|
woocommerce_product::INFO,
|
||||||
amazon_product::INFO,
|
amazon_product::INFO,
|
||||||
ebay_listing::INFO,
|
ebay_listing::INFO,
|
||||||
trustpilot_reviews::INFO,
|
trustpilot_reviews::INFO,
|
||||||
|
|
@ -131,6 +141,13 @@ pub async fn dispatch_by_url(
|
||||||
.map(|v| (github_pr::INFO.name, v)),
|
.map(|v| (github_pr::INFO.name, v)),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if github_issue::matches(url) {
|
||||||
|
return Some(
|
||||||
|
github_issue::extract(client, url)
|
||||||
|
.await
|
||||||
|
.map(|v| (github_issue::INFO.name, v)),
|
||||||
|
);
|
||||||
|
}
|
||||||
if github_release::matches(url) {
|
if github_release::matches(url) {
|
||||||
return Some(
|
return Some(
|
||||||
github_release::extract(client, url)
|
github_release::extract(client, url)
|
||||||
|
|
@ -233,7 +250,15 @@ pub async fn dispatch_by_url(
|
||||||
.map(|v| (trustpilot_reviews::INFO.name, v)),
|
.map(|v| (trustpilot_reviews::INFO.name, v)),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
// NOTE: shopify_product and ecommerce_product are intentionally NOT
|
if youtube_video::matches(url) {
|
||||||
|
return Some(
|
||||||
|
youtube_video::extract(client, url)
|
||||||
|
.await
|
||||||
|
.map(|v| (youtube_video::INFO.name, v)),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
// NOTE: shopify_product, shopify_collection, ecommerce_product,
|
||||||
|
// woocommerce_product, and substack_post are intentionally NOT
|
||||||
// in auto-dispatch. Their `matches()` functions are permissive
|
// in auto-dispatch. Their `matches()` functions are permissive
|
||||||
// (any URL with `/products/`, `/product/`, `/p/`, etc.) and
|
// (any URL with `/products/`, `/product/`, `/p/`, etc.) and
|
||||||
// claiming those generically would steal URLs from the default
|
// claiming those generically would steal URLs from the default
|
||||||
|
|
@ -282,6 +307,12 @@ pub async fn dispatch_by_name(
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
n if n == github_issue::INFO.name => {
|
||||||
|
run_or_mismatch(github_issue::matches(url), n, url, || {
|
||||||
|
github_issue::extract(client, url)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
n if n == github_release::INFO.name => {
|
n if n == github_release::INFO.name => {
|
||||||
run_or_mismatch(github_release::matches(url), n, url, || {
|
run_or_mismatch(github_release::matches(url), n, url, || {
|
||||||
github_release::extract(client, url)
|
github_release::extract(client, url)
|
||||||
|
|
@ -375,6 +406,30 @@ pub async fn dispatch_by_name(
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
n if n == youtube_video::INFO.name => {
|
||||||
|
run_or_mismatch(youtube_video::matches(url), n, url, || {
|
||||||
|
youtube_video::extract(client, url)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
n if n == substack_post::INFO.name => {
|
||||||
|
run_or_mismatch(substack_post::matches(url), n, url, || {
|
||||||
|
substack_post::extract(client, url)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
n if n == shopify_collection::INFO.name => {
|
||||||
|
run_or_mismatch(shopify_collection::matches(url), n, url, || {
|
||||||
|
shopify_collection::extract(client, url)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
n if n == woocommerce_product::INFO.name => {
|
||||||
|
run_or_mismatch(woocommerce_product::matches(url), n, url, || {
|
||||||
|
woocommerce_product::extract(client, url)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
_ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
|
_ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
242
crates/webclaw-fetch/src/extractors/shopify_collection.rs
Normal file
242
crates/webclaw-fetch/src/extractors/shopify_collection.rs
Normal file
|
|
@ -0,0 +1,242 @@
|
||||||
|
//! Shopify collection structured extractor.
|
||||||
|
//!
|
||||||
|
//! Every Shopify store exposes `/collections/{handle}.json` and
|
||||||
|
//! `/collections/{handle}/products.json` on the public surface. This
|
||||||
|
//! extractor hits `.json` (collection metadata) and falls through to
|
||||||
|
//! `/products.json` for the first page of products. Same caveat as
|
||||||
|
//! `shopify_product`: stores with Cloudflare in front of the shop
|
||||||
|
//! will 403 the public path.
|
||||||
|
//!
|
||||||
|
//! Explicit-call only (like `shopify_product`). `/collections/{slug}`
|
||||||
|
//! is a URL shape used by non-Shopify stores too, so auto-dispatch
|
||||||
|
//! would claim too many URLs.
|
||||||
|
|
||||||
|
use serde::Deserialize;
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
|
use super::ExtractorInfo;
|
||||||
|
use crate::client::FetchClient;
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
/// Static metadata for the `shopify_collection` extractor. Explicit-call
/// only (not auto-dispatched): `/collections/{handle}` overlaps with
/// non-Shopify sites.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "shopify_collection",
    label: "Shopify collection",
    description: "Returns collection metadata + first page of products (handle, title, vendor, price, available) on ANY Shopify store via /collections/{handle}.json + /products.json.",
    url_patterns: &[
        "https://{shop}/collections/{handle}",
        "https://{shop}.myshopify.com/collections/{handle}",
    ],
};
|
||||||
|
|
||||||
|
pub fn matches(url: &str) -> bool {
|
||||||
|
let host = host_of(url);
|
||||||
|
if host.is_empty() || NON_SHOPIFY_HOSTS.iter().any(|h| host.ends_with(h)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
url.contains("/collections/") && !url.ends_with("/collections/")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Hosts that expose `/collections/` paths but are definitely not
/// Shopify stores; `matches` rejects these up front so the extractor
/// never claims their URLs.
const NON_SHOPIFY_HOSTS: &[&str] = &[
    "amazon.com",
    "amazon.co.uk",
    "amazon.de",
    "ebay.com",
    "etsy.com",
    "walmart.com",
    "target.com",
    "aliexpress.com",
    "huggingface.co", // has /collections/ for models
    "github.com",
];
|
||||||
|
|
||||||
|
/// Fetch collection metadata plus the first page of products.
///
/// Two requests: `/collections/{handle}.json` for the collection
/// record, then `/collections/{handle}/products.json?limit=50` for up
/// to 50 product summaries. The products call is best-effort — any
/// failure there degrades to an empty product list rather than an
/// error, since the metadata alone is still useful.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let (coll_meta_url, coll_products_url) = build_json_urls(url);

    // Step 1: collection metadata. Shopify returns 200 on missing
    // collections sometimes; check "collection" key below.
    let meta_resp = client.fetch(&coll_meta_url).await?;
    if meta_resp.status == 404 {
        return Err(FetchError::Build(format!(
            "shopify_collection: '{url}' not found"
        )));
    }
    if meta_resp.status == 403 {
        // Antibot (commonly Cloudflare) in front of the shop blocks the
        // public .json surface; nothing more we can do from here.
        return Err(FetchError::Build(format!(
            "shopify_collection: {coll_meta_url} returned 403. The store has antibot in front of the .json endpoint. Use /v1/scrape/ecommerce_product or api.webclaw.io for this store."
        )));
    }
    if meta_resp.status != 200 {
        return Err(FetchError::Build(format!(
            "shopify returned status {} for {coll_meta_url}",
            meta_resp.status
        )));
    }

    // Non-Shopify sites serve HTML at this path; a JSON parse failure
    // is the signal that the store isn't Shopify at all.
    let meta: MetaWrapper = serde_json::from_str(&meta_resp.html).map_err(|e| {
        FetchError::BodyDecode(format!(
            "shopify_collection: '{url}' didn't return Shopify JSON, likely not a Shopify store ({e})"
        ))
    })?;

    // Step 2: first page of products for this collection.
    let products = match client.fetch(&coll_products_url).await {
        Ok(r) if r.status == 200 => serde_json::from_str::<ProductsWrapper>(&r.html)
            .ok()
            .map(|pw| pw.products)
            .unwrap_or_default(),
        _ => Vec::new(),
    };

    // Flatten each product to the fields callers filter on; price is
    // taken from the first variant.
    let product_summaries: Vec<Value> = products
        .iter()
        .map(|p| {
            let first_variant = p.variants.first();
            json!({
                "id": p.id,
                "handle": p.handle,
                "title": p.title,
                "vendor": p.vendor,
                "product_type": p.product_type,
                "price": first_variant.and_then(|v| v.price.clone()),
                "compare_at_price": first_variant.and_then(|v| v.compare_at_price.clone()),
                // Available if ANY variant is in stock.
                "available": p.variants.iter().any(|v| v.available.unwrap_or(false)),
                "variant_count": p.variants.len(),
                "image": p.images.first().and_then(|i| i.src.clone()),
                "created_at": p.created_at,
                "updated_at": p.updated_at,
            })
        })
        .collect();

    let c = meta.collection;
    Ok(json!({
        "url": url,
        "meta_json_url": coll_meta_url,
        "products_json_url": coll_products_url,
        "collection_id": c.id,
        "handle": c.handle,
        "title": c.title,
        "description_html": c.body_html,
        "published_at": c.published_at,
        "updated_at": c.updated_at,
        "sort_order": c.sort_order,
        "products_in_page": product_summaries.len(),
        "products": product_summaries,
    }))
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// URL helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Host portion of `url`: the text between `scheme://` and the first
/// `/`. When there is no scheme separator the whole string is treated
/// as starting at the authority.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    match after_scheme.find('/') {
        Some(idx) => &after_scheme[..idx],
        None => after_scheme,
    }
}
|
||||||
|
|
||||||
|
/// Build `(collection.json, collection/products.json)` from a user URL.
///
/// Strips the query string AND the fragment before deriving the JSON
/// endpoints. The original only cut at `?` (`split_once('?')`), so a
/// URL like `/collections/mens#reviews` produced `mens#reviews.json`
/// — the `.json` suffix ends up inside the fragment and the request
/// hits `/collections/mens` instead of the JSON endpoint.
fn build_json_urls(url: &str) -> (String, String) {
    // Cut at '?' or '#', whichever comes first; both are decoration as
    // far as the Shopify JSON endpoints are concerned.
    let path_part = url.split(['?', '#']).next().unwrap_or(url);
    // Normalize: drop trailing slash, and a ".json" the caller may
    // already have appended, so we never emit ".json.json".
    let clean = path_part.trim_end_matches('/').trim_end_matches(".json");
    (
        format!("{clean}.json"),
        format!("{clean}/products.json?limit=50"),
    )
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// Shopify collection + product JSON shapes (subsets)
// ---------------------------------------------------------------------------

/// `/collections/{handle}.json` wraps the record in a "collection" key.
/// Non-optional on purpose: its absence means "not a Shopify store"
/// and should fail the deserialize in `extract`.
#[derive(Deserialize)]
struct MetaWrapper {
    collection: Collection,
}

/// Collection record subset; all fields optional so schema drift
/// degrades to `null`s.
#[derive(Deserialize)]
struct Collection {
    id: Option<i64>,
    handle: Option<String>,
    title: Option<String>,
    body_html: Option<String>,
    published_at: Option<String>,
    updated_at: Option<String>,
    sort_order: Option<String>,
}

/// `/collections/{handle}/products.json` wrapper; defaults to an empty
/// list when the key is missing.
#[derive(Deserialize)]
struct ProductsWrapper {
    #[serde(default)]
    products: Vec<ProductSummary>,
}

/// Per-product subset surfaced in the collection listing.
#[derive(Deserialize)]
struct ProductSummary {
    id: Option<i64>,
    handle: Option<String>,
    title: Option<String>,
    vendor: Option<String>,
    product_type: Option<String>,
    created_at: Option<String>,
    updated_at: Option<String>,
    #[serde(default)]
    variants: Vec<VariantSummary>,
    #[serde(default)]
    images: Vec<ImageSummary>,
}

/// Variant subset: pricing and stock state only.
#[derive(Deserialize)]
struct VariantSummary {
    price: Option<String>,
    compare_at_price: Option<String>,
    available: Option<bool>,
}

/// Image subset: source URL only.
#[derive(Deserialize)]
struct ImageSummary {
    src: Option<String>,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Permissive matching: any non-denylisted host with a handle
    /// after /collections/ counts.
    #[test]
    fn matches_shopify_collection_urls() {
        assert!(matches("https://www.allbirds.com/collections/mens"));
        assert!(matches(
            "https://shop.example.com/collections/new-arrivals?page=2"
        ));
    }

    /// Denylisted hosts and handle-less paths are rejected.
    #[test]
    fn rejects_non_shopify() {
        assert!(!matches("https://github.com/collections/foo"));
        assert!(!matches("https://huggingface.co/collections/foo"));
        assert!(!matches("https://example.com/"));
        assert!(!matches("https://example.com/collections/"));
    }

    /// Both JSON endpoints derive from the same cleaned path.
    #[test]
    fn build_json_urls_derives_both_paths() {
        let (meta, products) = build_json_urls("https://shop.example.com/collections/mens");
        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
        assert_eq!(
            products,
            "https://shop.example.com/collections/mens/products.json?limit=50"
        );
    }

    /// A trailing slash must not leak into the derived endpoint.
    #[test]
    fn build_json_urls_handles_trailing_slash() {
        let (meta, _) = build_json_urls("https://shop.example.com/collections/mens/");
        assert_eq!(meta, "https://shop.example.com/collections/mens.json");
    }
}
|
||||||
213
crates/webclaw-fetch/src/extractors/substack_post.rs
Normal file
213
crates/webclaw-fetch/src/extractors/substack_post.rs
Normal file
|
|
@ -0,0 +1,213 @@
|
||||||
|
//! Substack post extractor.
|
||||||
|
//!
|
||||||
|
//! Every Substack publication exposes `/api/v1/posts/{slug}` that
|
||||||
|
//! returns the full post as JSON: body HTML, cover image, author,
|
||||||
|
//! publication info, reactions, paywall state. No auth on public
|
||||||
|
//! posts.
|
||||||
|
//!
|
||||||
|
//! Works on both `*.substack.com` subdomains and custom domains
|
||||||
|
//! (e.g. `simonwillison.net` uses Substack too). Detection is
|
||||||
|
//! "URL has `/p/{slug}`" because that's the canonical Substack post
|
||||||
|
//! path. Explicit-call only because the `/p/{slug}` URL shape is
|
||||||
|
//! used by non-Substack sites too.
|
||||||
|
|
||||||
|
use serde::Deserialize;
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
|
use super::ExtractorInfo;
|
||||||
|
use crate::client::FetchClient;
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
/// Static metadata for the `substack_post` extractor. Explicit-call
/// only: the `/p/{slug}` shape is used by non-Substack sites too.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "substack_post",
    label: "Substack post",
    description: "Returns post HTML, title, subtitle, author, publication, reactions, paywall status via the Substack public API.",
    url_patterns: &[
        "https://{pub}.substack.com/p/{slug}",
        "https://{custom-domain}/p/{slug}",
    ],
};
|
||||||
|
|
||||||
|
/// True when `url` looks like a canonical Substack post path
/// (`/p/{slug}`). Deliberately host-agnostic — custom domains are
/// common — so only the http(s) scheme and the `/p/` segment are
/// checked; `extract`'s API probe confirms it's really a Substack.
pub fn matches(url: &str) -> bool {
    let has_http_scheme = url.starts_with("http://") || url.starts_with("https://");
    has_http_scheme && url.contains("/p/")
}
|
||||||
|
|
||||||
|
/// Fetch a Substack post through `/api/v1/posts/{slug}` on the post's
/// own host (works for `*.substack.com` subdomains and custom domains
/// alike) and flatten it into a JSON payload.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let slug = parse_slug(url).ok_or_else(|| {
        FetchError::Build(format!("substack_post: cannot parse slug from '{url}'"))
    })?;
    let host = host_of(url);
    if host.is_empty() {
        return Err(FetchError::Build(format!(
            "substack_post: empty host in '{url}'"
        )));
    }
    // Preserve the caller's scheme; anything that isn't explicit
    // http:// is requested over https.
    let scheme = if url.starts_with("http://") {
        "http"
    } else {
        "https"
    };
    let api_url = format!("{scheme}://{host}/api/v1/posts/{slug}");
    let resp = client.fetch(&api_url).await?;
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "substack_post: '{slug}' not found on {host} (got 404). \
             If the publication isn't actually on Substack, use /v1/scrape instead."
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "substack returned status {} for {api_url}",
            resp.status
        )));
    }

    // Non-Substack sites serve HTML (or nothing) at /api/v1/posts/…;
    // a JSON parse failure means this host is not a Substack.
    let p: Post = serde_json::from_str(&resp.html).map_err(|e| {
        FetchError::BodyDecode(format!(
            "substack_post: '{host}' didn't return Substack JSON, likely not a Substack ({e})"
        ))
    })?;

    Ok(json!({
        "url": url,
        "api_url": api_url,
        "id": p.id,
        "type": p.r#type,
        "slug": p.slug,
        "title": p.title,
        "subtitle": p.subtitle,
        "description": p.description,
        "canonical_url": p.canonical_url,
        "post_date": p.post_date,
        "updated_at": p.updated_at,
        "audience": p.audience,
        // "only_paid"/"founding" audiences are the paywalled ones.
        "has_paywall": matches!(p.audience.as_deref(), Some("only_paid") | Some("founding")),
        "is_free_preview": p.is_free_preview,
        "cover_image": p.cover_image,
        "word_count": p.wordcount,
        "reactions": p.reactions,
        "comment_count": p.comment_count,
        "body_html": p.body_html,
        // Prefer truncated_body_text when present (presumably the
        // paywall preview — confirm against a paid post).
        "body_text": p.truncated_body_text.or(p.body_text),
        "publication": json!({
            "id": p.publication.as_ref().and_then(|pub_| pub_.id),
            "name": p.publication.as_ref().and_then(|pub_| pub_.name.clone()),
            "subdomain": p.publication.as_ref().and_then(|pub_| pub_.subdomain.clone()),
            "custom_domain": p.publication.as_ref().and_then(|pub_| pub_.custom_domain.clone()),
        }),
        "authors": p.published_bylines.iter().map(|a| json!({
            "id": a.id,
            "name": a.name,
            "handle": a.handle,
            "photo": a.photo_url,
        })).collect::<Vec<_>>(),
    }))
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// URL helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Authority (host) component of `url`: the text between `scheme://`
/// and the first `/`; the whole string is treated as starting at the
/// authority when no scheme separator is present.
fn host_of(url: &str) -> &str {
    let after_scheme = url.split("://").nth(1).unwrap_or(url);
    match after_scheme.find('/') {
        Some(idx) => &after_scheme[..idx],
        None => after_scheme,
    }
}
|
||||||
|
|
||||||
|
/// Slug after the first `/p/` segment, with query string, fragment,
/// trailing slash, and any deeper path segments stripped. `None` when
/// there is no `/p/` or the slug is empty.
fn parse_slug(url: &str) -> Option<String> {
    let tail = url.split("/p/").nth(1)?;
    let slug = tail
        .split(['?', '#'])
        .next()?
        .trim_end_matches('/')
        .split('/')
        .next()
        .unwrap_or("");
    (!slug.is_empty()).then(|| slug.to_owned())
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// Substack API types (subset)
// ---------------------------------------------------------------------------

/// Subset of the Substack post payload. All fields optional so schema
/// drift degrades to `null`s rather than a deserialize error.
#[derive(Deserialize)]
struct Post {
    id: Option<i64>,
    // `type` is a keyword; raw identifier keeps the wire name.
    r#type: Option<String>,
    slug: Option<String>,
    title: Option<String>,
    subtitle: Option<String>,
    description: Option<String>,
    canonical_url: Option<String>,
    post_date: Option<String>,
    updated_at: Option<String>,
    // Post visibility; "only_paid"/"founding" are treated as paywalled
    // by `extract`.
    audience: Option<String>,
    is_free_preview: Option<bool>,
    cover_image: Option<String>,
    wordcount: Option<i64>,
    // Opaque reaction map; passed through to the output untouched.
    reactions: Option<serde_json::Value>,
    comment_count: Option<i64>,
    body_html: Option<String>,
    body_text: Option<String>,
    truncated_body_text: Option<String>,
    publication: Option<Publication>,
    // camelCase on the wire, hence the rename.
    #[serde(default, rename = "publishedBylines")]
    published_bylines: Vec<Byline>,
}

/// Publication subset: identity plus domain routing fields.
#[derive(Deserialize)]
struct Publication {
    id: Option<i64>,
    name: Option<String>,
    subdomain: Option<String>,
    custom_domain: Option<String>,
}

/// Author byline subset.
#[derive(Deserialize)]
struct Byline {
    id: Option<i64>,
    name: Option<String>,
    handle: Option<String>,
    photo_url: Option<String>,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Matching requires an http(s) scheme and a `/p/` segment; the
    /// host is irrelevant (custom domains are supported).
    #[test]
    fn matches_post_urls() {
        assert!(matches(
            "https://stratechery.substack.com/p/the-tech-letter"
        ));
        assert!(matches("https://simonwillison.net/p/2024-08-01-something"));
        assert!(!matches("https://example.com/"));
        assert!(!matches("ftp://example.com/p/foo"));
    }

    /// Slug parsing tolerates trailing slash and query string.
    #[test]
    fn parse_slug_strips_query_and_trailing_slash() {
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post"),
            Some("my-post".into())
        );
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post/"),
            Some("my-post".into())
        );
        assert_eq!(
            parse_slug("https://example.substack.com/p/my-post?ref=123"),
            Some("my-post".into())
        );
    }
}
|
||||||
237
crates/webclaw-fetch/src/extractors/woocommerce_product.rs
Normal file
237
crates/webclaw-fetch/src/extractors/woocommerce_product.rs
Normal file
|
|
@ -0,0 +1,237 @@
|
||||||
|
//! WooCommerce product structured extractor.
|
||||||
|
//!
|
||||||
|
//! Targets WooCommerce's Store API: `/wp-json/wc/store/v1/products?slug={slug}`.
|
||||||
|
//! About 30-50% of WooCommerce stores expose this endpoint publicly
|
||||||
|
//! (it's on by default, but common security plugins disable it).
|
||||||
|
//! When it's off, the server returns 404 at /wp-json. We surface a
|
||||||
|
//! clean error and point callers at `/v1/scrape/ecommerce_product`
|
||||||
|
//! which works on any store with Schema.org JSON-LD.
|
||||||
|
//!
|
||||||
|
//! Explicit-call only. `/product/{slug}` is the default permalink for
|
||||||
|
//! WooCommerce but custom stores use every variation imaginable, so
|
||||||
|
//! auto-dispatch is unreliable.
|
||||||
|
|
||||||
|
use serde::Deserialize;
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
|
use super::ExtractorInfo;
|
||||||
|
use crate::client::FetchClient;
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
/// Static registry metadata for this extractor: machine name, human
/// label, one-line description, and representative URL shapes.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "woocommerce_product",
    label: "WooCommerce product",
    description: "Returns product via the WooCommerce Store REST API (requires the /wp-json/wc/store endpoint to be enabled on the target store).",
    // NOTE(review): `matches` below also accepts the /producto/ and
    // /produit/ locale permalinks, which are not listed here.
    url_patterns: &[
        "https://{shop}/product/{slug}",
        "https://{shop}/shop/{slug}",
    ],
};
|
||||||
|
|
||||||
|
pub fn matches(url: &str) -> bool {
|
||||||
|
let host = host_of(url);
|
||||||
|
if host.is_empty() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Permissive: WooCommerce stores use custom domains + custom
|
||||||
|
// permalinks. The extractor's API probe is what confirms it's
|
||||||
|
// really WooCommerce.
|
||||||
|
url.contains("/product/")
|
||||||
|
|| url.contains("/shop/")
|
||||||
|
|| url.contains("/producto/") // common es locale
|
||||||
|
|| url.contains("/produit/") // common fr locale
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch one product by permalink via the WooCommerce Store API.
///
/// Derives `{slug}` and `{host}` from `url`, calls
/// `/wp-json/wc/store/v1/products?slug={slug}&per_page=1`, and flattens
/// the first matching product into a single JSON object (identity,
/// pricing, stock flags, taxonomy names, images, variation count).
///
/// # Errors
/// - `FetchError::Build` when the slug or host cannot be parsed, the
///   Store API is disabled (404), requires auth (401/403), returns any
///   other non-200 status, or no product matches the slug.
/// - `FetchError::BodyDecode` when the body is not the expected JSON
///   array of products.
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let slug = parse_slug(url).ok_or_else(|| {
        FetchError::Build(format!(
            "woocommerce_product: cannot parse slug from '{url}'"
        ))
    })?;
    let host = host_of(url);
    if host.is_empty() {
        return Err(FetchError::Build(format!(
            "woocommerce_product: empty host in '{url}'"
        )));
    }
    // Preserve a plain-http store's scheme; default everything else to https.
    let scheme = if url.starts_with("http://") {
        "http"
    } else {
        "https"
    };
    let api_url = format!("{scheme}://{host}/wp-json/wc/store/v1/products?slug={slug}&per_page=1");
    let resp = client.fetch(&api_url).await?;
    // 404 usually means the Store API is disabled (security plugins do
    // this); steer callers to the JSON-LD based generic extractor.
    if resp.status == 404 {
        return Err(FetchError::Build(format!(
            "woocommerce_product: {host} does not expose /wp-json/wc/store (404). \
            Use /v1/scrape/ecommerce_product for JSON-LD fallback."
        )));
    }
    if resp.status == 401 || resp.status == 403 {
        return Err(FetchError::Build(format!(
            "woocommerce_product: {host} requires auth for /wp-json/wc/store ({}). \
            Use /v1/scrape/ecommerce_product for the public JSON-LD fallback.",
            resp.status
        )));
    }
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "woocommerce api returned status {} for {api_url}",
            resp.status
        )));
    }

    // The endpoint returns a JSON array; with per_page=1 we expect at
    // most one element.
    let products: Vec<Product> = serde_json::from_str(&resp.html)
        .map_err(|e| FetchError::BodyDecode(format!("woocommerce parse: {e}")))?;
    let p = products.into_iter().next().ok_or_else(|| {
        FetchError::Build(format!(
            "woocommerce_product: no product found for slug '{slug}' on {host}"
        ))
    })?;

    let images: Vec<Value> = p
        .images
        .iter()
        .map(|i| json!({"src": i.src, "thumbnail": i.thumbnail, "alt": i.alt}))
        .collect();
    // Only the count is surfaced; variation payloads are large and vary
    // by store configuration.
    let variations_count = p.variations.as_ref().map(|v| v.len()).unwrap_or(0);

    Ok(json!({
        "url": url,
        "api_url": api_url,
        "product_id": p.id,
        "name": p.name,
        "slug": p.slug,
        "sku": p.sku,
        "permalink": p.permalink,
        "on_sale": p.on_sale,
        "in_stock": p.is_in_stock,
        "is_purchasable": p.is_purchasable,
        // Prices come back as minor-unit strings; `currency_minor`
        // carries the decimal scale needed to interpret them.
        "price": p.prices.as_ref().and_then(|pr| pr.price.clone()),
        "regular_price": p.prices.as_ref().and_then(|pr| pr.regular_price.clone()),
        "sale_price": p.prices.as_ref().and_then(|pr| pr.sale_price.clone()),
        "currency": p.prices.as_ref().and_then(|pr| pr.currency_code.clone()),
        "currency_minor": p.prices.as_ref().and_then(|pr| pr.currency_minor_unit),
        "price_range": p.prices.as_ref().and_then(|pr| pr.price_range.clone()),
        "average_rating": p.average_rating,
        "review_count": p.review_count,
        "description": p.description,
        "short_description": p.short_description,
        "categories": p.categories.iter().filter_map(|c| c.name.clone()).collect::<Vec<_>>(),
        "tags": p.tags.iter().filter_map(|t| t.name.clone()).collect::<Vec<_>>(),
        "variation_count": variations_count,
        "image_count": images.len(),
        "images": images,
    }))
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// URL helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Best-effort host extraction without pulling in a URL parser.
///
/// Takes everything after the first `://` (or the whole string when no
/// scheme is present) up to the first `/`, `?`, or `#`. Any port is
/// kept, since it must be reproduced verbatim when rebuilding the API
/// URL. Returns `""` only for the empty string.
fn host_of(url: &str) -> &str {
    url.split("://")
        .nth(1)
        .unwrap_or(url)
        // Stop at '?' / '#' too, so `https://shop.com?ref=1` does not
        // leak the query string into the host (the previous version
        // only split on '/').
        .split(['/', '?', '#'])
        .next()
        .unwrap_or("")
}
|
||||||
|
|
||||||
|
/// Extract the product slug from common WooCommerce permalinks.
///
/// Scans the known path markers in order (`/product/`, `/shop/`, plus
/// the es/fr locale variants) and returns the path segment that
/// immediately follows the first one found, with any query string,
/// fragment, and trailing slash removed. `None` when no marker yields
/// a non-empty slug.
fn parse_slug(url: &str) -> Option<String> {
    ["/product/", "/shop/", "/producto/", "/produit/"]
        .into_iter()
        .find_map(|marker| {
            let tail = url.split(marker).nth(1)?;
            let segment = tail
                .split(['?', '#'])
                .next()?
                .trim_end_matches('/')
                .split('/')
                .next()
                .unwrap_or("");
            (!segment.is_empty()).then(|| segment.to_string())
        })
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Store API types (subset of the full response)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Subset of the Store API product payload that we surface.
///
/// Every field is `Option` or `#[serde(default)]` so schema drift
/// between WooCommerce versions degrades to missing values instead of
/// a deserialization error.
#[derive(Deserialize)]
struct Product {
    id: Option<i64>,
    name: Option<String>,
    slug: Option<String>,
    sku: Option<String>,
    permalink: Option<String>,
    // HTML fragments as returned by the store; not sanitized here.
    description: Option<String>,
    short_description: Option<String>,
    on_sale: Option<bool>,
    is_in_stock: Option<bool>,
    is_purchasable: Option<bool>,
    average_rating: Option<serde_json::Value>, // string or number
    review_count: Option<i64>,
    prices: Option<Prices>,
    #[serde(default)]
    categories: Vec<Term>,
    #[serde(default)]
    tags: Vec<Term>,
    #[serde(default)]
    images: Vec<Img>,
    // Only the length is surfaced (variation_count in the output).
    variations: Option<Vec<serde_json::Value>>,
}
|
||||||
|
|
||||||
|
/// Price block of the Store API payload.
///
/// Amounts are strings in the currency's minor unit (e.g. cents);
/// `currency_minor_unit` gives the decimal scale to interpret them.
#[derive(Deserialize)]
struct Prices {
    price: Option<String>,
    regular_price: Option<String>,
    sale_price: Option<String>,
    currency_code: Option<String>,
    currency_minor_unit: Option<i64>,
    // Present for variable products; passed through untyped.
    price_range: Option<serde_json::Value>,
}
|
||||||
|
|
||||||
|
/// A taxonomy term (category or tag); only the display name is used.
#[derive(Deserialize)]
struct Term {
    name: Option<String>,
}
|
||||||
|
|
||||||
|
/// A product image: full-size URL, thumbnail URL, and alt text.
#[derive(Deserialize)]
struct Img {
    src: Option<String>,
    thumbnail: Option<String>,
    alt: Option<String>,
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_common_permalinks() {
        // Default permalink plus the shop/es/fr variants all match.
        let urls = [
            "https://shop.example.com/product/cool-widget",
            "https://shop.example.com/shop/cool-widget",
            "https://tienda.example.com/producto/cosa",
            "https://boutique.example.com/produit/chose",
        ];
        for url in urls {
            assert!(matches(url), "expected match: {url}");
        }
    }

    #[test]
    fn parse_slug_handles_locale_and_suffix() {
        // (input URL, expected slug) — query strings, trailing slashes,
        // and locale markers must all normalize away.
        let cases = [
            ("https://shop.example.com/product/cool-widget", "cool-widget"),
            (
                "https://shop.example.com/product/cool-widget/?attr=red",
                "cool-widget",
            ),
            ("https://tienda.example.com/producto/cosa/", "cosa"),
        ];
        for (url, slug) in cases {
            assert_eq!(parse_slug(url), Some(slug.into()), "url: {url}");
        }
    }
}
|
||||||
255
crates/webclaw-fetch/src/extractors/youtube_video.rs
Normal file
255
crates/webclaw-fetch/src/extractors/youtube_video.rs
Normal file
|
|
@ -0,0 +1,255 @@
|
||||||
|
//! YouTube video structured extractor.
|
||||||
|
//!
|
||||||
|
//! YouTube embeds the full player configuration in a
|
||||||
|
//! `ytInitialPlayerResponse` JavaScript assignment at the top of
|
||||||
|
//! every `/watch`, `/shorts`, and `youtu.be` HTML page. We reuse the
|
||||||
|
//! core crate's already-proven regex + parse to surface typed JSON
|
||||||
|
//! from it: video id, title, author + channel id, view count,
|
||||||
|
//! duration, upload date, keywords, thumbnails, caption-track URLs.
|
||||||
|
//!
|
||||||
|
//! Auto-dispatched: YouTube host is unique and the `v=` or `/shorts/`
|
||||||
|
//! shape is stable.
|
||||||
|
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
|
||||||
|
use super::ExtractorInfo;
|
||||||
|
use crate::client::FetchClient;
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
/// Static registry metadata for this extractor: machine name, human
/// label, one-line description, and supported URL shapes.
pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "youtube_video",
    label: "YouTube video",
    description: "Returns video id, title, channel, view count, duration, upload date, thumbnails, keywords, and caption-track URLs.",
    url_patterns: &[
        "https://www.youtube.com/watch?v={id}",
        "https://youtu.be/{id}",
        "https://www.youtube.com/shorts/{id}",
    ],
};
|
||||||
|
|
||||||
|
pub fn matches(url: &str) -> bool {
|
||||||
|
webclaw_core::youtube::is_youtube_url(url)
|
||||||
|
|| url.contains("youtube.com/shorts/")
|
||||||
|
|| url.contains("youtube-nocookie.com/embed/")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch a YouTube watch page and surface typed JSON from its
/// `ytInitialPlayerResponse` blob: id, title, channel, view count,
/// duration, upload/publish dates, flags, keywords, thumbnails, and
/// caption-track URLs.
///
/// # Errors
/// - `FetchError::Build` when no video id can be parsed from `url` or
///   the watch page returns a non-200 status.
/// - `FetchError::BodyDecode` when the page has no player blob (the
///   video may be private, region-blocked, or removed).
pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchError> {
    let video_id = parse_video_id(url).ok_or_else(|| {
        FetchError::Build(format!("youtube_video: cannot parse video id from '{url}'"))
    })?;

    // Always fetch the canonical /watch URL. /shorts/ and youtu.be
    // sometimes serve a thinner page without the player blob.
    let canonical = format!("https://www.youtube.com/watch?v={video_id}");
    let resp = client.fetch(&canonical).await?;
    if resp.status != 200 {
        return Err(FetchError::Build(format!(
            "youtube returned status {} for {canonical}",
            resp.status
        )));
    }

    let player = extract_player_response(&resp.html).ok_or_else(|| {
        FetchError::BodyDecode(format!(
            "youtube_video: no ytInitialPlayerResponse on {canonical} (video may be private, region-blocked, or removed)"
        ))
    })?;

    // The two sub-objects most fields live under; both may be absent,
    // so everything below is Option-chained.
    let video_details = player.get("videoDetails");
    let microformat = player
        .get("microformat")
        .and_then(|m| m.get("playerMicroformatRenderer"));

    let thumbnails: Vec<Value> = video_details
        .and_then(|vd| vd.get("thumbnail"))
        .and_then(|t| t.get("thumbnails"))
        .and_then(|t| t.as_array())
        .cloned()
        .unwrap_or_default();

    let keywords: Vec<Value> = video_details
        .and_then(|vd| vd.get("keywords"))
        .and_then(|k| k.as_array())
        .cloned()
        .unwrap_or_default();

    // Caption tracks come from the raw HTML, not the parsed blob; the
    // core crate already knows how to find them.
    let caption_tracks = webclaw_core::youtube::extract_caption_tracks(&resp.html);
    let captions: Vec<Value> = caption_tracks
        .iter()
        .map(|c| {
            json!({
                "url": c.url,
                "lang": c.lang,
                "name": c.name,
            })
        })
        .collect();

    Ok(json!({
        "url": url,
        "canonical_url":canonical,
        "video_id": video_id,
        "title": get_str(video_details, "title"),
        "description": get_str(video_details, "shortDescription"),
        "author": get_str(video_details, "author"),
        "channel_id": get_str(video_details, "channelId"),
        "channel_url": get_str(microformat, "ownerProfileUrl"),
        // viewCount / lengthSeconds arrive as strings; get_int parses them.
        "view_count": get_int(video_details, "viewCount"),
        "length_seconds": get_int(video_details, "lengthSeconds"),
        "is_live": video_details.and_then(|vd| vd.get("isLiveContent")).and_then(|v| v.as_bool()),
        "is_private": video_details.and_then(|vd| vd.get("isPrivate")).and_then(|v| v.as_bool()),
        "is_unlisted": microformat.and_then(|m| m.get("isUnlisted")).and_then(|v| v.as_bool()),
        "allow_ratings":video_details.and_then(|vd| vd.get("allowRatings")).and_then(|v| v.as_bool()),
        "category": get_str(microformat, "category"),
        "upload_date": get_str(microformat, "uploadDate"),
        "publish_date": get_str(microformat, "publishDate"),
        "keywords": keywords,
        "thumbnails": thumbnails,
        "caption_tracks": captions,
    }))
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// URL helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Extract the video id from any supported YouTube URL shape.
///
/// Checked in order: `youtu.be/{id}`, `youtube.com/shorts/{id}`,
/// `…/embed/{id}`, then a `v=` query parameter in any position.
/// Returns `None` when none of them yields a non-empty id.
///
/// The original version repeated the same segment-extraction logic
/// three times; it is now shared via `id_after_marker`.
fn parse_video_id(url: &str) -> Option<String> {
    // Path-based shapes: the id is the path segment right after the marker.
    for marker in ["youtu.be/", "youtube.com/shorts/", "/embed/"] {
        if let Some(id) = id_after_marker(url, marker) {
            return Some(id);
        }
    }
    // Query-based shape: youtube.com/watch?v={id}
    // (also matches youtube.com/watch?foo=bar&v={id}).
    let (_, query) = url.split_once('?')?;
    let raw = query.split('&').find_map(|p| p.strip_prefix("v="))?;
    // Defensive: stop at fragment/path junk trailing the value.
    let id = raw.split(['#', '/']).next().unwrap_or(raw);
    if id.is_empty() {
        None
    } else {
        Some(id.to_string())
    }
}

/// The non-empty path segment that follows `marker` in `url`, with any
/// query string, fragment, extra path, and trailing slash stripped.
fn id_after_marker(url: &str, marker: &str) -> Option<String> {
    let tail = url.split(marker).nth(1)?;
    let id = tail
        .split(['?', '#', '/'])
        .next()
        .unwrap_or("")
        .trim_end_matches('/');
    if id.is_empty() {
        None
    } else {
        Some(id.to_string())
    }
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Player-response parsing
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Pull the `ytInitialPlayerResponse` JSON object out of watch-page HTML.
///
/// Returns `None` when the assignment is absent (private,
/// region-blocked, or removed videos) or when the captured text fails
/// to parse as JSON.
fn extract_player_response(html: &str) -> Option<Value> {
    use regex::Regex;
    use std::sync::OnceLock;
    // Same regex as webclaw_core::youtube. Duplicated here because
    // core's regex is module-private. Kept in lockstep; changes are
    // rare and we cover with tests in both places.
    //
    // NOTE(review): the lazy `\{.+?\}` capture ends at the first `};`
    // after the assignment, so a literal "};" inside a JSON string
    // would truncate the blob. Appears fine for YouTube's current
    // output — worth revisiting if parsing ever starts failing.
    // Compiled once per process via OnceLock.
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE
        .get_or_init(|| Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap());
    let json_str = re.captures(html)?.get(1)?.as_str();
    serde_json::from_str(json_str).ok()
}
|
||||||
|
|
||||||
|
fn get_str(v: Option<&Value>, key: &str) -> Option<String> {
|
||||||
|
v.and_then(|x| x.get(key))
|
||||||
|
.and_then(|x| x.as_str().map(String::from))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_int(v: Option<&Value>, key: &str) -> Option<i64> {
|
||||||
|
v.and_then(|x| x.get(key)).and_then(|x| {
|
||||||
|
x.as_i64()
|
||||||
|
.or_else(|| x.as_str().and_then(|s| s.parse::<i64>().ok()))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn matches_watch_urls() {
        // Every supported shape: watch, short link, shorts, nocookie embed.
        let accepted = [
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtu.be/dQw4w9WgXcQ",
            "https://www.youtube.com/shorts/abc123",
            "https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ",
        ];
        for url in accepted {
            assert!(matches(url), "expected match: {url}");
        }
    }

    #[test]
    fn rejects_non_video_urls() {
        // YouTube non-video pages and foreign hosts must not match.
        let rejected = [
            "https://www.youtube.com/",
            "https://www.youtube.com/channel/abc",
            "https://example.com/watch?v=abc",
        ];
        for url in rejected {
            assert!(!matches(url), "expected no match: {url}");
        }
    }

    #[test]
    fn parse_video_id_from_each_shape() {
        // (input URL, expected id) — covers every shape plus extra
        // query parameters before and after v=.
        let cases = [
            ("https://www.youtube.com/watch?v=dQw4w9WgXcQ", "dQw4w9WgXcQ"),
            (
                "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s",
                "dQw4w9WgXcQ",
            ),
            (
                "https://www.youtube.com/watch?feature=share&v=dQw4w9WgXcQ",
                "dQw4w9WgXcQ",
            ),
            ("https://youtu.be/dQw4w9WgXcQ", "dQw4w9WgXcQ"),
            ("https://youtu.be/dQw4w9WgXcQ?t=30", "dQw4w9WgXcQ"),
            ("https://www.youtube.com/shorts/abc123", "abc123"),
        ];
        for (url, id) in cases {
            assert_eq!(parse_video_id(url), Some(id.into()), "url: {url}");
        }
    }

    #[test]
    fn extract_player_response_happy_path() {
        let html = r#"
<html><body>
<script>
var ytInitialPlayerResponse = {"videoDetails":{"videoId":"abc","title":"T","author":"A","viewCount":"100","lengthSeconds":"60","shortDescription":"d"}};
</script>
</body></html>
"#;
        let parsed = extract_player_response(html).expect("blob should parse");
        let details = parsed.get("videoDetails").expect("videoDetails present");
        assert_eq!(details.get("title").and_then(|t| t.as_str()), Some("T"));
    }
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue