From a53578e45c096dc141b7ebb65a5992451e638e63 Mon Sep 17 00:00:00 2001 From: Valerio Date: Wed, 22 Apr 2026 17:07:31 +0200 Subject: [PATCH] fix(extractors): detect AWS WAF verifying-connection page, add OG fallback to ecommerce_product Two targeted fixes surfaced by the manual extractor smoke test. cloud::is_bot_protected: - Trustpilot serves a ~565-byte AWS WAF interstitial with the string "Verifying your connection..." and an `interstitial-spinner` div. That pattern was not in our detector, so local fetch returned the challenge page, JSON-LD parsing found nothing, and the extractor emitted a confusing "no Organization/LocalBusiness JSON-LD" error. - Added the pattern plus a <10KB size gate so real articles that happen to mention the phrase aren't misclassified. Two new tests cover positive + negative cases. - With the fix, trustpilot_reviews now correctly escalates via smart_fetch_html and returns the clean "Set WEBCLAW_API_KEY" actionable error without a key, or cloud-bypassed HTML with one. ecommerce_product: - Previously hard-failed when a page had no Product JSON-LD, and produced an empty `offers` list when JSON-LD was present but its `offers` node was missing or empty. Many sites (Patagonia-style catalog pages, smaller Squarespace stores) ship one or the other of OG / JSON-LD but not both with price data. - Added OG meta-tag fallback that handles: * no JSON-LD at all -> build minimal payload from og:title, og:image, og:description, product:price:amount, product:price:currency, product:availability, product:brand * JSON-LD present but offers empty -> augment with an OG-derived offer so price comes through - New `data_source` field: "jsonld", "jsonld+og", or "og_fallback" so callers can tell which branch populated the data. - `has_og_product_signal()` requires og:type=product or a price tag so blog posts don't get mis-classified as products. Tests: 197 passing in webclaw-fetch (6 new), clippy clean. 
--- crates/webclaw-fetch/src/cloud.rs | 32 ++ .../src/extractors/ecommerce_product.rs | 295 ++++++++++++++++-- 2 files changed, 299 insertions(+), 28 deletions(-) diff --git a/crates/webclaw-fetch/src/cloud.rs b/crates/webclaw-fetch/src/cloud.rs index 3e1110a..ecce934 100644 --- a/crates/webclaw-fetch/src/cloud.rs +++ b/crates/webclaw-fetch/src/cloud.rs @@ -325,6 +325,18 @@ pub fn is_bot_protected(html: &str, headers: &HeaderMap) -> bool { return true; } + // AWS WAF "Verifying your connection" interstitial (used by Trustpilot). + // Distinct from the captcha-branded path above: the challenge page is + // a tiny HTML shell with an `interstitial-spinner` div and no content. + // Gating on html.len() keeps false-positives off long pages that + // happen to mention the phrase in an unrelated context. + if html_lower.contains("interstitial-spinner") + && html_lower.contains("verifying your connection") + && html.len() < 10_000 + { + return true; + } + // hCaptcha *blocking* page (not just an embedded widget). if html_lower.contains("hcaptcha.com") && html_lower.contains("h-captcha") @@ -564,6 +576,26 @@ mod tests { assert!(!is_bot_protected(&html, &empty_headers())); } + #[test] + fn is_bot_protected_detects_aws_waf_verifying_connection() { + // The exact shape Trustpilot serves under AWS WAF. + let html = r#"
+
+

Verifying your connection...

"#; + assert!(is_bot_protected(html, &empty_headers())); + } + + #[test] + fn is_bot_protected_ignores_phrase_on_real_content() { + // A real article that happens to mention the phrase in prose + // should not trigger the short-page detector. + let html = format!( + "{}

Verifying your connection is tricky.

", + "article text ".repeat(2_000) + ); + assert!(!is_bot_protected(&html, &empty_headers())); + } + #[test] fn needs_js_rendering_flags_spa_skeleton() { let html = format!( diff --git a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs index bad2f9b..099a8fb 100644 --- a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs +++ b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs @@ -7,7 +7,7 @@ //! BigCommerce, WooCommerce, Squarespace, Magento, custom storefronts, //! and anything else that follows Schema.org. //! -//! **Explicit-call only** — `/v1/scrape/ecommerce_product`. Not in the +//! **Explicit-call only** (`/v1/scrape/ecommerce_product`). Not in the //! auto-dispatch because we can't identify "this is a product page" //! from the URL alone. When the caller knows they have a product URL, //! this is the reliable fallback for stores where shopify_product @@ -17,7 +17,28 @@ //! so JSON-LD parsing is shared with the rest of the extraction //! pipeline. We walk all blocks looking for `@type: Product`, //! `ProductGroup`, or an `ItemList` whose first entry is a Product. +//! +//! ## OG fallback +//! +//! Two real-world cases JSON-LD alone can't cover: +//! +//! 1. Site has no Product JSON-LD at all (smaller Squarespace / custom +//! storefronts, many European shops). +//! 2. Site has Product JSON-LD but the `offers` block is empty (seen on +//! Patagonia and other catalog-style sites that split price onto a +//! separate widget). +//! +//! For case 1 we build a minimal payload from OG / product meta tags +//! (`og:title`, `og:image`, `og:description`, `product:price:amount`, +//! `product:price:currency`, `product:availability`, `product:brand`). +//! For case 2 we augment the JSON-LD offers list with an OG-derived +//! offer so callers get a price either way. A `data_source` field +//! (`"jsonld"` / `"jsonld+og"` / `"og_fallback"`) tells the caller +//! which branch produced the data. 
+use std::sync::OnceLock; + +use regex::Regex; use serde_json::{Value, json}; use super::ExtractorInfo; @@ -56,38 +77,104 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result Option { // Reuse the core JSON-LD parser so we benefit from whatever // robustness it gains over time (handling @graph, arrays, etc.). - let blocks = webclaw_core::structured_data::extract_json_ld(&resp.html); - let product = find_product(&blocks).ok_or_else(|| { - FetchError::BodyDecode(format!( - "ecommerce_product: no Schema.org Product found in JSON-LD on {url}" - )) - })?; + let blocks = webclaw_core::structured_data::extract_json_ld(html); + let product = find_product(&blocks); - Ok(json!({ + if let Some(p) = product { + Some(build_jsonld_payload(&p, html, url)) + } else if has_og_product_signal(html) { + Some(build_og_payload(html, url)) + } else { + None + } +} + +/// Build the rich payload from a Product JSON-LD node. Augments the +/// `offers` array with an OG-derived offer when JSON-LD offers is empty +/// so callers get a price on sites like Patagonia. 
+fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value { + let mut offers = collect_offers(product); + let mut data_source = "jsonld"; + if offers.is_empty() + && let Some(og_offer) = build_og_offer(html) + { + offers.push(og_offer); + data_source = "jsonld+og"; + } + + json!({ "url": url, - "name": get_text(&product, "name"), - "description": get_text(&product, "description"), - "brand": get_brand(&product), - "sku": get_text(&product, "sku"), - "mpn": get_text(&product, "mpn"), - "gtin": get_text(&product, "gtin") - .or_else(|| get_text(&product, "gtin13")) - .or_else(|| get_text(&product, "gtin12")) - .or_else(|| get_text(&product, "gtin8")), - "product_id": get_text(&product, "productID"), - "category": get_text(&product, "category"), - "color": get_text(&product, "color"), - "material": get_text(&product, "material"), - "images": collect_images(&product), - "offers": collect_offers(&product), - "aggregate_rating": get_aggregate_rating(&product), - "review_count": get_review_count(&product), - "raw_schema_type": get_text(&product, "@type"), - "raw_jsonld": product, - })) + "data_source": data_source, + "name": get_text(product, "name").or_else(|| og(html, "title")), + "description": get_text(product, "description").or_else(|| og(html, "description")), + "brand": get_brand(product).or_else(|| meta_property(html, "product:brand")), + "sku": get_text(product, "sku"), + "mpn": get_text(product, "mpn"), + "gtin": get_text(product, "gtin") + .or_else(|| get_text(product, "gtin13")) + .or_else(|| get_text(product, "gtin12")) + .or_else(|| get_text(product, "gtin8")), + "product_id": get_text(product, "productID"), + "category": get_text(product, "category"), + "color": get_text(product, "color"), + "material": get_text(product, "material"), + "images": nonempty_or_og(collect_images(product), html), + "offers": offers, + "aggregate_rating": get_aggregate_rating(product), + "review_count": get_review_count(product), + "raw_schema_type": 
get_text(product, "@type"), + "raw_jsonld": product.clone(), + }) +} + +/// Build a minimal payload from OG / product meta tags. Used when a +/// page has no Product JSON-LD at all. +fn build_og_payload(html: &str, url: &str) -> Value { + let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default(); + let image = og(html, "image"); + let images: Vec = image.map(|i| vec![Value::String(i)]).unwrap_or_default(); + + json!({ + "url": url, + "data_source": "og_fallback", + "name": og(html, "title"), + "description": og(html, "description"), + "brand": meta_property(html, "product:brand"), + "sku": None::, + "mpn": None::, + "gtin": None::, + "product_id": None::, + "category": None::, + "color": None::, + "material": None::, + "images": images, + "offers": offers, + "aggregate_rating": Value::Null, + "review_count": None::, + "raw_schema_type": None::, + "raw_jsonld": Value::Null, + }) +} + +fn nonempty_or_og(imgs: Vec, html: &str) -> Vec { + if !imgs.is_empty() { + return imgs; + } + og(html, "image") + .map(|s| vec![Value::String(s)]) + .unwrap_or_default() } // --------------------------------------------------------------------------- @@ -236,6 +323,81 @@ fn host_of(url: &str) -> &str { .unwrap_or("") } +// --------------------------------------------------------------------------- +// OG / product meta-tag helpers +// --------------------------------------------------------------------------- + +/// True when the HTML has enough OG / product meta tags to justify +/// building a fallback payload. A single `og:title` isn't enough on its +/// own — every blog post has that. We require either a product price +/// tag or at least an `og:type` of `product`/`og:product` to avoid +/// mis-classifying articles as products. 
+fn has_og_product_signal(html: &str) -> bool { + let has_price = meta_property(html, "product:price:amount").is_some() + || meta_property(html, "og:price:amount").is_some(); + if has_price { + return true; + } + // `` is the Schema.org OG + // marker for product pages. + let og_type = og(html, "type").unwrap_or_default().to_lowercase(); + matches!(og_type.as_str(), "product" | "og:product" | "product.item") +} + +/// Build a single Offer-shaped Value from OG / product meta tags, or +/// `None` if there's no price info at all. +fn build_og_offer(html: &str) -> Option { + let price = meta_property(html, "product:price:amount") + .or_else(|| meta_property(html, "og:price:amount")); + let currency = meta_property(html, "product:price:currency") + .or_else(|| meta_property(html, "og:price:currency")); + let availability = meta_property(html, "product:availability") + .or_else(|| meta_property(html, "og:availability")); + price.as_ref()?; + Some(json!({ + "price": price, + "low_price": None::, + "high_price": None::, + "currency": currency, + "availability": availability, + "item_condition": None::, + "valid_until": None::, + "url": None::, + "seller": None::, + "offer_count": None::, + })) +} + +/// Pull the value of ``. +fn og(html: &str, prop: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r#"(?i)]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap() + }); + for c in re.captures_iter(html) { + if c.get(1).is_some_and(|m| m.as_str() == prop) { + return c.get(2).map(|m| m.as_str().to_string()); + } + } + None +} + +/// Pull the value of any `` tag. +/// Needed for namespaced OG variants like `product:price:amount` that +/// the simple `og:*` matcher above doesn't cover. 
+fn meta_property(html: &str, prop: &str) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r#"(?i)]+property="([^"]+)"[^>]+content="([^"]+)""#).unwrap() + }); + for c in re.captures_iter(html) { + if c.get(1).is_some_and(|m| m.as_str() == prop) { + return c.get(2).map(|m| m.as_str().to_string()); + } + } + None +} + #[cfg(test)] mod tests { use super::*; @@ -311,4 +473,81 @@ mod tests { Some("InStock") ); } + + // --- OG fallback -------------------------------------------------------- + + #[test] + fn has_og_product_signal_accepts_product_type_or_price() { + let type_only = r#""#; + let price_only = r#""#; + let neither = r#""#; + assert!(has_og_product_signal(type_only)); + assert!(has_og_product_signal(price_only)); + assert!(!has_og_product_signal(neither)); + } + + #[test] + fn og_fallback_builds_payload_without_jsonld() { + let html = r##" + + + + + + + + + "##; + let v = parse(html, "https://example.com/p/candle").unwrap(); + assert_eq!(v["data_source"], "og_fallback"); + assert_eq!(v["name"], "Handmade Candle"); + assert_eq!(v["description"], "Small-batch soy candle."); + assert_eq!(v["brand"], "Little Studio"); + assert_eq!(v["offers"][0]["price"], "18.00"); + assert_eq!(v["offers"][0]["currency"], "USD"); + assert_eq!(v["offers"][0]["availability"], "in stock"); + assert_eq!(v["images"][0], "https://cdn.example.com/candle.jpg"); + } + + #[test] + fn jsonld_augments_empty_offers_with_og_price() { + // Patagonia-shaped page: Product JSON-LD without an Offer, plus + // product:price:* OG tags. We should merge. 
+ let html = r##" + + + + "##; + let v = parse(html, "https://patagonia.com/p/x").unwrap(); + assert_eq!(v["data_source"], "jsonld+og"); + assert_eq!(v["name"], "Better Sweater"); + assert_eq!(v["offers"].as_array().unwrap().len(), 1); + assert_eq!(v["offers"][0]["price"], "139.00"); + } + + #[test] + fn jsonld_only_stays_pure_jsonld() { + let html = r##" + + "##; + let v = parse(html, "https://example.com/p/w").unwrap(); + assert_eq!(v["data_source"], "jsonld"); + assert_eq!(v["offers"][0]["price"], "9.99"); + } + + #[test] + fn parse_returns_none_on_no_product_signals() { + let html = r#" + + + "#; + assert!(parse(html, "https://blog.example.com/post").is_none()); + } }