diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index a4f6dd5..7ce16d7 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -177,6 +177,11 @@ enum ClientPool {
 pub struct FetchClient {
     pool: ClientPool,
     pdf_mode: PdfMode,
+    /// Optional cloud-fallback client. Extractors that need to
+    /// escalate past bot protection call `client.cloud()` to get this
+    /// out. Stored as `Arc` so cloning a `FetchClient` (common in
+    /// axum state) doesn't clone the underlying reqwest pool.
+    cloud: Option<std::sync::Arc<crate::cloud::CloudClient>>,
 }
 
 impl FetchClient {
@@ -225,7 +230,35 @@ impl FetchClient {
             ClientPool::Rotating { clients }
         };
 
-        Ok(Self { pool, pdf_mode })
+        Ok(Self {
+            pool,
+            pdf_mode,
+            cloud: None,
+        })
+    }
+
+    /// Attach a cloud-fallback client. Returns `self` so it composes in
+    /// a builder-ish way:
+    ///
+    /// ```ignore
+    /// let client = FetchClient::new(config)?
+    ///     .with_cloud(CloudClient::from_env()?);
+    /// ```
+    ///
+    /// Extractors that can escalate past bot protection will call
+    /// `client.cloud()` internally. Sets the field regardless of
+    /// whether `cloud` is configured to bypass anything specific —
+    /// attachment is cheap (just wraps in `Arc`).
+    pub fn with_cloud(mut self, cloud: crate::cloud::CloudClient) -> Self {
+        self.cloud = Some(std::sync::Arc::new(cloud));
+        self
+    }
+
+    /// Optional cloud-fallback client, if one was attached via
+    /// [`Self::with_cloud`]. Extractors that handle antibot sites
+    /// pass this into `cloud::smart_fetch_html`.
+    pub fn cloud(&self) -> Option<&crate::cloud::CloudClient> {
+        self.cloud.as_deref()
     }
 
     /// Fetch a URL and return the raw HTML + response metadata.
diff --git a/crates/webclaw-fetch/src/extractors/amazon_product.rs b/crates/webclaw-fetch/src/extractors/amazon_product.rs
new file mode 100644
index 0000000..3c96385
--- /dev/null
+++ b/crates/webclaw-fetch/src/extractors/amazon_product.rs
@@ -0,0 +1,361 @@
+//! Amazon product detail page extractor.
+//!
+//!
Amazon product pages (`/dp/{ASIN}/` on every locale) always return +//! a "Sorry, we need to verify you're human" interstitial to any +//! client without a warm Amazon session + residential IP. Detection +//! fires immediately in [`cloud::is_bot_protected`] via the dedicated +//! Amazon heuristic, so this extractor always hits the cloud fallback +//! path in practice. +//! +//! Parsing logic works on the final HTML, local or cloud-sourced. We +//! read the product details primarily from JSON-LD `Product` blocks +//! (Amazon exposes a solid subset for SEO) plus a couple of Amazon- +//! specific DOM IDs picked up with cheap regex. +//! +//! Auto-dispatch: we accept any amazon.* host with a `/dp/{ASIN}/` +//! path. ASINs are a stable Amazon identifier so we extract that as +//! part of the response even when everything else is empty (tells +//! callers the URL was at least recognised). + +use std::sync::OnceLock; + +use regex::Regex; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::cloud::{self, CloudError}; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "amazon_product", + label: "Amazon product", + description: "Returns product detail: title, brand, price, currency, availability, rating, image, ASIN. 
Requires WEBCLAW_API_KEY — Amazon's antibot means we always go through the cloud.", + url_patterns: &[ + "https://www.amazon.com/dp/{ASIN}", + "https://www.amazon.co.uk/dp/{ASIN}", + "https://www.amazon.de/dp/{ASIN}", + "https://www.amazon.fr/dp/{ASIN}", + "https://www.amazon.it/dp/{ASIN}", + "https://www.amazon.es/dp/{ASIN}", + "https://www.amazon.co.jp/dp/{ASIN}", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if !is_amazon_host(host) { + return false; + } + parse_asin(url).is_some() +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let asin = parse_asin(url) + .ok_or_else(|| FetchError::Build(format!("amazon_product: no ASIN in '{url}'")))?; + + let fetched = cloud::smart_fetch_html(client, client.cloud(), url) + .await + .map_err(cloud_to_fetch_err)?; + + let mut data = parse(&fetched.html, url, &asin); + if let Some(obj) = data.as_object_mut() { + obj.insert( + "data_source".into(), + match fetched.source { + cloud::FetchSource::Local => json!("local"), + cloud::FetchSource::Cloud => json!("cloud"), + }, + ); + } + Ok(data) +} + +/// Pure parser. Given HTML (from anywhere — direct, cloud, or a fixture +/// file) and the source URL, extract Amazon product detail. Returns a +/// `Value` rather than a typed struct so callers can pass it through +/// without carrying webclaw_fetch types. 
+pub fn parse(html: &str, url: &str, asin: &str) -> Value { + let jsonld = find_product_jsonld(html); + let title = jsonld + .as_ref() + .and_then(|v| get_text(v, "name")) + .or_else(|| dom_title(html)); + let image = jsonld + .as_ref() + .and_then(get_first_image) + .or_else(|| dom_image(html)); + let brand = jsonld.as_ref().and_then(get_brand); + let description = jsonld.as_ref().and_then(|v| get_text(v, "description")); + let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating); + let offer = jsonld.as_ref().and_then(first_offer); + + let sku = jsonld.as_ref().and_then(|v| get_text(v, "sku")); + let mpn = jsonld.as_ref().and_then(|v| get_text(v, "mpn")); + + json!({ + "url": url, + "asin": asin, + "title": title, + "brand": brand, + "description": description, + "image": image, + "price": offer.as_ref().and_then(|o| get_text(o, "price")), + "currency": offer.as_ref().and_then(|o| get_text(o, "priceCurrency")), + "availability": offer.as_ref().and_then(|o| { + get_text(o, "availability").map(|s| + s.replace("http://schema.org/", "").replace("https://schema.org/", "")) + }), + "condition": offer.as_ref().and_then(|o| { + get_text(o, "itemCondition").map(|s| + s.replace("http://schema.org/", "").replace("https://schema.org/", "")) + }), + "sku": sku, + "mpn": mpn, + "aggregate_rating": aggregate_rating, + }) +} + +// --------------------------------------------------------------------------- +// URL helpers +// --------------------------------------------------------------------------- + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +fn is_amazon_host(host: &str) -> bool { + host.starts_with("www.amazon.") || host.starts_with("amazon.") +} + +/// Pull a 10-char ASIN out of any recognised Amazon URL shape: +/// - /dp/{ASIN} +/// - /gp/product/{ASIN} +/// - /product/{ASIN} +/// - /exec/obidos/ASIN/{ASIN} +fn parse_asin(url: &str) -> Option { + static RE: OnceLock = 
OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r"/(?:dp|gp/product|product|ASIN)/([A-Z0-9]{10})(?:[/?#]|$)").unwrap() + }); + re.captures(url) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) +} + +// --------------------------------------------------------------------------- +// JSON-LD walkers — light reuse of ecommerce_product's style +// --------------------------------------------------------------------------- + +fn find_product_jsonld(html: &str) -> Option { + let blocks = webclaw_core::structured_data::extract_json_ld(html); + for b in blocks { + if let Some(found) = find_product_in(&b) { + return Some(found); + } + } + None +} + +fn find_product_in(v: &Value) -> Option { + if is_product_type(v) { + return Some(v.clone()); + } + if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) { + for item in graph { + if let Some(found) = find_product_in(item) { + return Some(found); + } + } + } + if let Some(arr) = v.as_array() { + for item in arr { + if let Some(found) = find_product_in(item) { + return Some(found); + } + } + } + None +} + +fn is_product_type(v: &Value) -> bool { + let Some(t) = v.get("@type") else { + return false; + }; + let is_prod = |s: &str| matches!(s, "Product" | "ProductGroup" | "IndividualProduct"); + match t { + Value::String(s) => is_prod(s), + Value::Array(arr) => arr.iter().any(|x| x.as_str().is_some_and(is_prod)), + _ => false, + } +} + +fn get_text(v: &Value, key: &str) -> Option { + v.get(key).and_then(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Number(n) => Some(n.to_string()), + _ => None, + }) +} + +fn get_brand(v: &Value) -> Option { + let brand = v.get("brand")?; + if let Some(s) = brand.as_str() { + return Some(s.to_string()); + } + brand + .as_object() + .and_then(|o| o.get("name")) + .and_then(|n| n.as_str()) + .map(String::from) +} + +fn get_first_image(v: &Value) -> Option { + match v.get("image")? 
{ + Value::String(s) => Some(s.clone()), + Value::Array(arr) => arr.iter().find_map(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Object(_) => x.get("url").and_then(|u| u.as_str()).map(String::from), + _ => None, + }), + Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from), + _ => None, + } +} + +fn first_offer(v: &Value) -> Option { + let offers = v.get("offers")?; + match offers { + Value::Array(arr) => arr.first().cloned(), + Value::Object(_) => Some(offers.clone()), + _ => None, + } +} + +fn get_aggregate_rating(v: &Value) -> Option { + let r = v.get("aggregateRating")?; + Some(json!({ + "rating_value": get_text(r, "ratingValue"), + "review_count": get_text(r, "reviewCount"), + "best_rating": get_text(r, "bestRating"), + })) +} + +// --------------------------------------------------------------------------- +// DOM fallbacks — cheap regex for the two fields most likely to be +// missing from JSON-LD on Amazon. +// --------------------------------------------------------------------------- + +fn dom_title(html: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r#"(?s)id="productTitle"[^>]*>([^<]+)<"#).unwrap()); + re.captures(html) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().trim().to_string()) +} + +fn dom_image(html: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r#"id="landingImage"[^>]+src="([^"]+)""#).unwrap()); + re.captures(html) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) +} + +fn cloud_to_fetch_err(e: CloudError) -> FetchError { + FetchError::Build(e.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_multi_locale() { + assert!(matches("https://www.amazon.com/dp/B0CHX1W1XY")); + assert!(matches("https://www.amazon.co.uk/dp/B0CHX1W1XY/")); + assert!(matches("https://www.amazon.de/dp/B0CHX1W1XY?psc=1")); + assert!(matches( + 
"https://www.amazon.com/gp/product/B0CHX1W1XY/ref=foo" + )); + } + + #[test] + fn rejects_non_product_urls() { + assert!(!matches("https://www.amazon.com/")); + assert!(!matches("https://www.amazon.com/gp/cart")); + assert!(!matches("https://example.com/dp/B0CHX1W1XY")); + } + + #[test] + fn parse_asin_extracts_from_multiple_shapes() { + assert_eq!( + parse_asin("https://www.amazon.com/dp/B0CHX1W1XY"), + Some("B0CHX1W1XY".into()) + ); + assert_eq!( + parse_asin("https://www.amazon.com/dp/B0CHX1W1XY/"), + Some("B0CHX1W1XY".into()) + ); + assert_eq!( + parse_asin("https://www.amazon.com/dp/B0CHX1W1XY?psc=1"), + Some("B0CHX1W1XY".into()) + ); + assert_eq!( + parse_asin("https://www.amazon.com/gp/product/B0CHX1W1XY/ref=bar"), + Some("B0CHX1W1XY".into()) + ); + assert_eq!( + parse_asin("https://www.amazon.com/exec/obidos/ASIN/B0CHX1W1XY/baz"), + Some("B0CHX1W1XY".into()) + ); + assert_eq!(parse_asin("https://www.amazon.com/"), None); + } + + #[test] + fn parse_extracts_from_fixture_jsonld() { + // Minimal Amazon-style fixture with a Product JSON-LD block. 
+ let html = r##" + + +"##; + let v = parse(html, "https://www.amazon.com/dp/B0CHX1W1XY", "B0CHX1W1XY"); + assert_eq!(v["asin"], "B0CHX1W1XY"); + assert_eq!(v["title"], "ACME Widget"); + assert_eq!(v["brand"], "ACME"); + assert_eq!(v["price"], "19.99"); + assert_eq!(v["currency"], "USD"); + assert_eq!(v["availability"], "InStock"); + assert_eq!(v["aggregate_rating"]["rating_value"], "4.6"); + assert_eq!(v["aggregate_rating"]["review_count"], "1234"); + } + + #[test] + fn parse_falls_back_to_dom_when_jsonld_missing_fields() { + let html = r#" + +Fallback Title + + +"#; + let v = parse(html, "https://www.amazon.com/dp/B0CHX1W1XY", "B0CHX1W1XY"); + assert_eq!(v["title"], "Fallback Title"); + assert_eq!( + v["image"], + "https://m.media-amazon.com/images/I/fallback.jpg" + ); + } +} diff --git a/crates/webclaw-fetch/src/extractors/ebay_listing.rs b/crates/webclaw-fetch/src/extractors/ebay_listing.rs new file mode 100644 index 0000000..14c36ef --- /dev/null +++ b/crates/webclaw-fetch/src/extractors/ebay_listing.rs @@ -0,0 +1,337 @@ +//! eBay listing extractor. +//! +//! eBay item pages at `ebay.com/itm/{id}` and international variants +//! usually ship a `Product` JSON-LD block with title, price, currency, +//! condition, and an `AggregateOffer` when bidding. eBay applies +//! Cloudflare + custom WAF selectively — some item IDs return normal +//! HTML to the Firefox profile, others 403 / get the "Pardon our +//! interruption" page. We route through `cloud::smart_fetch_html` so +//! both paths resolve to the same parser. + +use std::sync::OnceLock; + +use regex::Regex; +use serde_json::{Value, json}; + +use super::ExtractorInfo; +use crate::client::FetchClient; +use crate::cloud::{self, CloudError}; +use crate::error::FetchError; + +pub const INFO: ExtractorInfo = ExtractorInfo { + name: "ebay_listing", + label: "eBay listing", + description: "Returns item title, price, currency, condition, seller, shipping, and bid info. 
Heavy listings may need WEBCLAW_API_KEY for antibot.", + url_patterns: &[ + "https://www.ebay.com/itm/{id}", + "https://www.ebay.co.uk/itm/{id}", + "https://www.ebay.de/itm/{id}", + "https://www.ebay.fr/itm/{id}", + "https://www.ebay.it/itm/{id}", + ], +}; + +pub fn matches(url: &str) -> bool { + let host = host_of(url); + if !is_ebay_host(host) { + return false; + } + parse_item_id(url).is_some() +} + +pub async fn extract(client: &FetchClient, url: &str) -> Result { + let item_id = parse_item_id(url) + .ok_or_else(|| FetchError::Build(format!("ebay_listing: no item id in '{url}'")))?; + + let fetched = cloud::smart_fetch_html(client, client.cloud(), url) + .await + .map_err(cloud_to_fetch_err)?; + + let mut data = parse(&fetched.html, url, &item_id); + if let Some(obj) = data.as_object_mut() { + obj.insert( + "data_source".into(), + match fetched.source { + cloud::FetchSource::Local => json!("local"), + cloud::FetchSource::Cloud => json!("cloud"), + }, + ); + } + Ok(data) +} + +pub fn parse(html: &str, url: &str, item_id: &str) -> Value { + let jsonld = find_product_jsonld(html); + let title = jsonld + .as_ref() + .and_then(|v| get_text(v, "name")) + .or_else(|| og(html, "title")); + let image = jsonld + .as_ref() + .and_then(get_first_image) + .or_else(|| og(html, "image")); + let brand = jsonld.as_ref().and_then(get_brand); + let description = jsonld + .as_ref() + .and_then(|v| get_text(v, "description")) + .or_else(|| og(html, "description")); + let offer = jsonld.as_ref().and_then(first_offer); + + // eBay's AggregateOffer uses lowPrice/highPrice. Offer uses price. 
+ let (low_price, high_price, single_price) = match offer.as_ref() { + Some(o) => ( + get_text(o, "lowPrice"), + get_text(o, "highPrice"), + get_text(o, "price"), + ), + None => (None, None, None), + }; + let offer_count = offer.as_ref().and_then(|o| get_text(o, "offerCount")); + + let aggregate_rating = jsonld.as_ref().and_then(get_aggregate_rating); + + json!({ + "url": url, + "item_id": item_id, + "title": title, + "brand": brand, + "description": description, + "image": image, + "price": single_price, + "low_price": low_price, + "high_price": high_price, + "offer_count": offer_count, + "currency": offer.as_ref().and_then(|o| get_text(o, "priceCurrency")), + "availability": offer.as_ref().and_then(|o| { + get_text(o, "availability").map(|s| + s.replace("http://schema.org/", "").replace("https://schema.org/", "")) + }), + "condition": offer.as_ref().and_then(|o| { + get_text(o, "itemCondition").map(|s| + s.replace("http://schema.org/", "").replace("https://schema.org/", "")) + }), + "seller": offer.as_ref().and_then(|o| + o.get("seller").and_then(|s| s.get("name")).and_then(|n| n.as_str()).map(String::from)), + "aggregate_rating": aggregate_rating, + }) +} + +// --------------------------------------------------------------------------- +// URL helpers +// --------------------------------------------------------------------------- + +fn host_of(url: &str) -> &str { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("") +} + +fn is_ebay_host(host: &str) -> bool { + host.starts_with("www.ebay.") || host.starts_with("ebay.") +} + +/// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}` +/// URLs. IDs are 10-15 digits today, but we accept any all-digit +/// trailing segment so the extractor stays forward-compatible. 
+fn parse_item_id(url: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + // /itm/(optional-slug/)?(digits)([/?#]|end) + Regex::new(r"/itm/(?:[^/]+/)?(\d{8,})(?:[/?#]|$)").unwrap() + }); + re.captures(url) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) +} + +// --------------------------------------------------------------------------- +// JSON-LD walkers +// --------------------------------------------------------------------------- + +fn find_product_jsonld(html: &str) -> Option { + let blocks = webclaw_core::structured_data::extract_json_ld(html); + for b in blocks { + if let Some(found) = find_product_in(&b) { + return Some(found); + } + } + None +} + +fn find_product_in(v: &Value) -> Option { + if is_product_type(v) { + return Some(v.clone()); + } + if let Some(graph) = v.get("@graph").and_then(|g| g.as_array()) { + for item in graph { + if let Some(found) = find_product_in(item) { + return Some(found); + } + } + } + if let Some(arr) = v.as_array() { + for item in arr { + if let Some(found) = find_product_in(item) { + return Some(found); + } + } + } + None +} + +fn is_product_type(v: &Value) -> bool { + let Some(t) = v.get("@type") else { + return false; + }; + let is_prod = |s: &str| matches!(s, "Product" | "ProductGroup" | "IndividualProduct"); + match t { + Value::String(s) => is_prod(s), + Value::Array(arr) => arr.iter().any(|x| x.as_str().is_some_and(is_prod)), + _ => false, + } +} + +fn get_text(v: &Value, key: &str) -> Option { + v.get(key).and_then(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Number(n) => Some(n.to_string()), + _ => None, + }) +} + +fn get_brand(v: &Value) -> Option { + let brand = v.get("brand")?; + if let Some(s) = brand.as_str() { + return Some(s.to_string()); + } + brand + .as_object() + .and_then(|o| o.get("name")) + .and_then(|n| n.as_str()) + .map(String::from) +} + +fn get_first_image(v: &Value) -> Option { + match v.get("image")? 
{ + Value::String(s) => Some(s.clone()), + Value::Array(arr) => arr.iter().find_map(|x| match x { + Value::String(s) => Some(s.clone()), + Value::Object(_) => x.get("url").and_then(|u| u.as_str()).map(String::from), + _ => None, + }), + Value::Object(o) => o.get("url").and_then(|u| u.as_str()).map(String::from), + _ => None, + } +} + +fn first_offer(v: &Value) -> Option { + let offers = v.get("offers")?; + match offers { + Value::Array(arr) => arr.first().cloned(), + Value::Object(_) => Some(offers.clone()), + _ => None, + } +} + +fn get_aggregate_rating(v: &Value) -> Option { + let r = v.get("aggregateRating")?; + Some(json!({ + "rating_value": get_text(r, "ratingValue"), + "review_count": get_text(r, "reviewCount"), + "best_rating": get_text(r, "bestRating"), + })) +} + +fn og(html: &str, prop: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r#"(?i)]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap() + }); + for c in re.captures_iter(html) { + if c.get(1).is_some_and(|m| m.as_str() == prop) { + return c.get(2).map(|m| m.as_str().to_string()); + } + } + None +} + +fn cloud_to_fetch_err(e: CloudError) -> FetchError { + FetchError::Build(e.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_ebay_item_urls() { + assert!(matches("https://www.ebay.com/itm/325478156234")); + assert!(matches( + "https://www.ebay.com/itm/vintage-typewriter/325478156234" + )); + assert!(matches("https://www.ebay.co.uk/itm/325478156234")); + assert!(!matches("https://www.ebay.com/")); + assert!(!matches("https://www.ebay.com/sch/foo")); + assert!(!matches("https://example.com/itm/325478156234")); + } + + #[test] + fn parse_item_id_handles_slugged_urls() { + assert_eq!( + parse_item_id("https://www.ebay.com/itm/325478156234"), + Some("325478156234".into()) + ); + assert_eq!( + parse_item_id("https://www.ebay.com/itm/vintage-typewriter/325478156234"), + Some("325478156234".into()) + ); + 
assert_eq!( + parse_item_id("https://www.ebay.com/itm/325478156234?hash=abc"), + Some("325478156234".into()) + ); + } + + #[test] + fn parse_extracts_from_fixture_jsonld() { + let html = r##" + + +"##; + let v = parse(html, "https://www.ebay.co.uk/itm/325", "325"); + assert_eq!(v["title"], "Vintage Typewriter"); + assert_eq!(v["price"], "79.99"); + assert_eq!(v["currency"], "GBP"); + assert_eq!(v["availability"], "InStock"); + assert_eq!(v["condition"], "UsedCondition"); + assert_eq!(v["seller"], "vintage_seller_99"); + assert_eq!(v["brand"], "Olivetti"); + } + + #[test] + fn parse_handles_aggregate_offer_price_range() { + let html = r##" + +"##; + let v = parse(html, "https://www.ebay.com/itm/1", "1"); + assert_eq!(v["low_price"], "10.00"); + assert_eq!(v["high_price"], "50.00"); + assert_eq!(v["offer_count"], "5"); + assert_eq!(v["currency"], "USD"); + } +} diff --git a/crates/webclaw-fetch/src/extractors/mod.rs b/crates/webclaw-fetch/src/extractors/mod.rs index ea273e6..5cf0993 100644 --- a/crates/webclaw-fetch/src/extractors/mod.rs +++ b/crates/webclaw-fetch/src/extractors/mod.rs @@ -14,10 +14,12 @@ //! exists (Reddit, HN/Algolia, PyPI, npm, GitHub, HuggingFace all have //! one). HTML extraction is the fallback for sites that don't. +pub mod amazon_product; pub mod arxiv; pub mod crates_io; pub mod dev_to; pub mod docker_hub; +pub mod ebay_listing; pub mod ecommerce_product; pub mod github_pr; pub mod github_release; @@ -33,12 +35,6 @@ pub mod pypi; pub mod reddit; pub mod shopify_product; pub mod stackoverflow; -// `trustpilot_reviews` code lives in the tree but is not wired into the -// catalog or dispatch: Cloudflare turnstile blocks our client at the TLS -// layer (all browser profiles tried, all UAs, mobile + desktop). Shipping -// it would return 403 more often than not — bad UX. When the cloud tier -// has residential proxies or a CDP renderer, flip this back on. 
-#[allow(dead_code)] pub mod trustpilot_reviews; use serde::Serialize; @@ -84,6 +80,9 @@ pub fn list() -> Vec { instagram_profile::INFO, shopify_product::INFO, ecommerce_product::INFO, + amazon_product::INFO, + ebay_listing::INFO, + trustpilot_reviews::INFO, ] } @@ -209,6 +208,31 @@ pub async fn dispatch_by_url( .map(|v| (instagram_profile::INFO.name, v)), ); } + // Antibot-gated verticals with unique hosts: safe to auto-dispatch + // because the matcher can't confuse the URL for anything else. The + // extractor's smart_fetch_html path handles the blocked-without- + // API-key case with a clear actionable error. + if amazon_product::matches(url) { + return Some( + amazon_product::extract(client, url) + .await + .map(|v| (amazon_product::INFO.name, v)), + ); + } + if ebay_listing::matches(url) { + return Some( + ebay_listing::extract(client, url) + .await + .map(|v| (ebay_listing::INFO.name, v)), + ); + } + if trustpilot_reviews::matches(url) { + return Some( + trustpilot_reviews::extract(client, url) + .await + .map(|v| (trustpilot_reviews::INFO.name, v)), + ); + } // NOTE: shopify_product and ecommerce_product are intentionally NOT // in auto-dispatch. Their `matches()` functions are permissive // (any URL with `/products/`, `/product/`, `/p/`, etc.) 
and @@ -333,6 +357,24 @@ pub async fn dispatch_by_name( }) .await } + n if n == amazon_product::INFO.name => { + run_or_mismatch(amazon_product::matches(url), n, url, || { + amazon_product::extract(client, url) + }) + .await + } + n if n == ebay_listing::INFO.name => { + run_or_mismatch(ebay_listing::matches(url), n, url, || { + ebay_listing::extract(client, url) + }) + .await + } + n if n == trustpilot_reviews::INFO.name => { + run_or_mismatch(trustpilot_reviews::matches(url), n, url, || { + trustpilot_reviews::extract(client, url) + }) + .await + } _ => Err(ExtractorDispatchError::UnknownVertical(name.to_string())), } } diff --git a/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs b/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs index 41f40d4..a5e1e48 100644 --- a/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs +++ b/crates/webclaw-fetch/src/extractors/trustpilot_reviews.rs @@ -1,16 +1,18 @@ //! Trustpilot company reviews extractor. //! -//! Trustpilot pages at `trustpilot.com/review/{domain}` embed a rich -//! JSON-LD `LocalBusiness` / `Organization` block with aggregate -//! rating + up to 20 recent reviews. No auth, no antibot for the -//! page HTML itself. -//! -//! Auto-dispatch safe because the host is unique. +//! `trustpilot.com/review/{domain}` pages embed a JSON-LD +//! `Organization` / `LocalBusiness` block with aggregate rating + up +//! to 20 recent reviews. The page HTML itself is usually behind AWS +//! WAF's "Verifying Connection" interstitial — so this extractor +//! always uses [`cloud::smart_fetch_html`] and only returns data when +//! the caller has `WEBCLAW_API_KEY` set (cloud handles the bypass). +//! OSS users without a key get a clear error pointing at signup. 
use serde_json::{Value, json}; use super::ExtractorInfo; use crate::client::FetchClient; +use crate::cloud::{self, CloudError}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -29,15 +31,22 @@ pub fn matches(url: &str) -> bool { } pub async fn extract(client: &FetchClient, url: &str) -> Result { - let resp = client.fetch(url).await?; - if !(200..300).contains(&resp.status) { - return Err(FetchError::Build(format!( - "trustpilot_reviews: status {} for {url}", - resp.status - ))); - } + // Trustpilot is always behind AWS WAF, so we go through smart_fetch + // which tries local first (which will hit the challenge interstitial), + // detects it, and escalates to cloud /v1/scrape for the real HTML. + let fetched = cloud::smart_fetch_html(client, client.cloud(), url) + .await + .map_err(cloud_to_fetch_err)?; - let blocks = webclaw_core::structured_data::extract_json_ld(&resp.html); + let html = parse(&fetched.html, url)?; + Ok(html_with_source(html, fetched.source)) +} + +/// Run the pure parser on already-fetched HTML. Split out so the cloud +/// pipeline can call it directly after its own antibot-aware fetch +/// without going through [`extract`]. +pub fn parse(html: &str, url: &str) -> Result { + let blocks = webclaw_core::structured_data::extract_json_ld(html); let business = find_business(&blocks).ok_or_else(|| { FetchError::BodyDecode(format!( "trustpilot_reviews: no Organization/LocalBusiness JSON-LD on {url}" @@ -94,6 +103,26 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result FetchError { + FetchError::Build(e.to_string()) +} + +/// Stamp `data_source` onto the parser output so callers can tell at a +/// glance whether this row came from local or cloud. Useful for UX and +/// for pricing-aware pipelines. 
+fn html_with_source(mut v: Value, source: cloud::FetchSource) -> Value { + if let Some(obj) = v.as_object_mut() { + obj.insert( + "data_source".into(), + match source { + cloud::FetchSource::Local => json!("local"), + cloud::FetchSource::Cloud => json!("cloud"), + }, + ); + } + v +} + // --------------------------------------------------------------------------- // JSON-LD walker — same pattern as ecommerce_product // --------------------------------------------------------------------------- diff --git a/crates/webclaw-server/src/state.rs b/crates/webclaw-server/src/state.rs index d7f151b..6c2e8f7 100644 --- a/crates/webclaw-server/src/state.rs +++ b/crates/webclaw-server/src/state.rs @@ -1,7 +1,24 @@ //! Shared application state. Cheap to clone via Arc; held by the axum //! Router for the life of the process. +//! +//! Two unrelated keys get carried here: +//! +//! 1. [`AppState::api_key`] — the **bearer token clients must present** +//! to call this server. Set via `WEBCLAW_API_KEY` / `--api-key`. +//! Unset = open mode. +//! 2. The inner [`webclaw_fetch::cloud::CloudClient`] (if any) — our +//! **outbound** credential for api.webclaw.io, used by extractors +//! that escalate on antibot. Set via `WEBCLAW_CLOUD_API_KEY`. +//! Unset = hard-site extractors return a "set WEBCLAW_CLOUD_API_KEY" +//! error with a signup link. +//! +//! Different variables on purpose: conflating the two means operators +//! who want their server behind an auth token can't also enable cloud +//! fallback, and vice versa. use std::sync::Arc; +use tracing::info; +use webclaw_fetch::cloud::CloudClient; use webclaw_fetch::{BrowserProfile, FetchClient, FetchConfig}; /// Single-process state shared across all request handlers. @@ -17,6 +34,7 @@ struct Inner { /// auto-deref `&Arc` -> `&FetchClient`, so this costs /// them nothing. pub fetch: Arc, + /// Inbound bearer-auth token for this server's own `/v1/*` surface. 
pub api_key: Option, } @@ -24,17 +42,34 @@ impl AppState { /// Build the application state. The fetch client is constructed once /// and shared across requests so connection pools + browser profile /// state don't churn per request. - pub fn new(api_key: Option) -> anyhow::Result { + /// + /// `inbound_api_key` is the bearer token clients must present; + /// cloud-fallback credentials come from the env (checked here). + pub fn new(inbound_api_key: Option) -> anyhow::Result { let config = FetchConfig { browser: BrowserProfile::Firefox, ..FetchConfig::default() }; - let fetch = FetchClient::new(config) + let mut fetch = FetchClient::new(config) .map_err(|e| anyhow::anyhow!("failed to build fetch client: {e}"))?; + + // Cloud fallback: only activates when the operator has provided + // an api.webclaw.io key. Supports both WEBCLAW_CLOUD_API_KEY + // (preferred, disambiguates from the inbound-auth key) and + // WEBCLAW_API_KEY as a fallback when there's no inbound key + // configured (backwards compat with MCP / CLI conventions). + if let Some(cloud) = build_cloud_client(inbound_api_key.as_deref()) { + info!( + base = cloud.base_url(), + "cloud fallback enabled — antibot-protected sites will escalate via api.webclaw.io" + ); + fetch = fetch.with_cloud(cloud); + } + Ok(Self { inner: Arc::new(Inner { fetch: Arc::new(fetch), - api_key, + api_key: inbound_api_key, }), }) } @@ -47,3 +82,26 @@ impl AppState { self.inner.api_key.as_deref() } } + +/// Resolve the outbound cloud key. Prefers `WEBCLAW_CLOUD_API_KEY`; +/// falls back to `WEBCLAW_API_KEY` *only* when no inbound key is +/// configured (i.e. open mode — the same env var can't mean two +/// things to one process). 
+fn build_cloud_client(inbound_api_key: Option<&str>) -> Option<CloudClient> {
+    let cloud_key = std::env::var("WEBCLAW_CLOUD_API_KEY").ok();
+    if let Some(k) = cloud_key.as_deref()
+        && !k.trim().is_empty()
+    {
+        return Some(CloudClient::with_key(k));
+    }
+    // Reuse WEBCLAW_API_KEY only when not also acting as our own
+    // inbound-auth token — otherwise we'd be telling the operator
+    // they can't have both.
+    if inbound_api_key.is_none()
+        && let Ok(k) = std::env::var("WEBCLAW_API_KEY")
+        && !k.trim().is_empty()
+    {
+        return Some(CloudClient::with_key(k));
+    }
+    None
+}