From a53578e45c096dc141b7ebb65a5992451e638e63 Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Wed, 22 Apr 2026 17:07:31 +0200
Subject: [PATCH] fix(extractors): detect AWS WAF verifying-connection page,
 add OG fallback to ecommerce_product

Two targeted fixes surfaced by the manual extractor smoke test.

cloud::is_bot_protected:
- Trustpilot serves a ~565-byte AWS WAF interstitial with the string
  "Verifying your connection..." and an `interstitial-spinner` div.
  That pattern was not in our detector, so local fetch returned the
  challenge page, JSON-LD parsing found nothing, and the extractor
  emitted a confusing "no Organization/LocalBusiness JSON-LD" error.
- Added the pattern plus a <10KB size gate so real articles that
  happen to mention the phrase aren't misclassified. Two new tests
  cover positive + negative cases.
- With the fix, trustpilot_reviews now correctly escalates via
  smart_fetch_html and returns the clean "Set WEBCLAW_API_KEY"
  actionable error without a key, or cloud-bypassed HTML with one.

ecommerce_product:
- Previously hard-failed when a page had no Product JSON-LD, and
  produced an empty `offers` list when JSON-LD was present but its
  `offers` node was. Many sites (Patagonia-style catalog pages,
  smaller Squarespace stores) ship one or the other of OG / JSON-LD
  but not both with price data.
- Added OG meta-tag fallback that handles:
  * no JSON-LD at all -> build minimal payload from og:title,
    og:image, og:description, product:price:amount,
    product:price:currency, product:availability, product:brand
  * JSON-LD present but offers empty -> augment with an OG-derived
    offer so price comes through
- New `data_source` field: "jsonld", "jsonld+og", or "og_fallback"
  so callers can tell which branch populated the data.
- `has_og_product_signal()` requires og:type=product or a price tag
  so blog posts don't get mis-classified as products.

Tests: 197 passing in webclaw-fetch (6 new), clippy clean.
---
 crates/webclaw-fetch/src/cloud.rs             |  32 ++
 .../src/extractors/ecommerce_product.rs       | 295 ++++++++++++++++--
 2 files changed, 299 insertions(+), 28 deletions(-)
diff --git a/crates/webclaw-fetch/src/cloud.rs b/crates/webclaw-fetch/src/cloud.rs
index 3e1110a..ecce934 100644
--- a/crates/webclaw-fetch/src/cloud.rs
+++ b/crates/webclaw-fetch/src/cloud.rs
@@ -325,6 +325,18 @@ pub fn is_bot_protected(html: &str, headers: &HeaderMap) -> bool {
         return true;
     }
 
+    // AWS WAF "Verifying your connection" interstitial (used by Trustpilot).
+    // Distinct from the captcha-branded path above: the challenge page is
+    // a tiny HTML shell with an `interstitial-spinner` div and no content.
+    // Gating on html.len() keeps false-positives off long pages that
+    // happen to mention the phrase in an unrelated context.
+    if html_lower.contains("interstitial-spinner")
+        && html_lower.contains("verifying your connection")
+        && html.len() < 10_000
+    {
+        return true;
+    }
+
     // hCaptcha *blocking* page (not just an embedded widget).
     if html_lower.contains("hcaptcha.com")
         && html_lower.contains("h-captcha")
@@ -564,6 +576,26 @@ mod tests {
         assert!(!is_bot_protected(&html, &empty_headers()));
     }
 
+    #[test]
+    fn is_bot_protected_detects_aws_waf_verifying_connection() {
+        // The exact shape Trustpilot serves under AWS WAF.
+        let html = r#"<div class="container"><div id="loading-state">
+            <div class="interstitial-spinner" id="spinner"></div>
+            <h1>Verifying your connection...</h1></div></div>"#;
+        assert!(is_bot_protected(html, &empty_headers()));
+    }
+
+    #[test]
+    fn is_bot_protected_ignores_phrase_on_real_content() {
+        // A real article that happens to mention the phrase in prose
+        // should not trigger the short-page detector.
+        let html = format!(
+            "<html><body>{}<p>Verifying your connection is tricky.</p></body></html>",
+            "article text ".repeat(2_000)
+        );
+        assert!(!is_bot_protected(&html, &empty_headers()));
+    }
+
     #[test]
     fn needs_js_rendering_flags_spa_skeleton() {
         let html = format!(
diff --git a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs
index bad2f9b..099a8fb 100644
--- a/crates/webclaw-fetch/src/extractors/ecommerce_product.rs
+++ b/crates/webclaw-fetch/src/extractors/ecommerce_product.rs
@@ -7,7 +7,7 @@
 //! BigCommerce, WooCommerce, Squarespace, Magento, custom storefronts,
 //! and anything else that follows Schema.org.
 //!
-//! **Explicit-call only** — `/v1/scrape/ecommerce_product`. Not in the
+//! **Explicit-call only** (`/v1/scrape/ecommerce_product`). Not in the
 //! auto-dispatch because we can't identify "this is a product page"
 //! from the URL alone. When the caller knows they have a product URL,
 //! this is the reliable fallback for stores where shopify_product
@@ -17,7 +17,28 @@
 //! so JSON-LD parsing is shared with the rest of the extraction
 //! pipeline. We walk all blocks looking for `@type: Product`,
 //! `ProductGroup`, or an `ItemList` whose first entry is a Product.
+//!
+//! ## OG fallback
+//!
+//! Two real-world cases JSON-LD alone can't cover:
+//!
+//! 1. Site has no Product JSON-LD at all (smaller Squarespace / custom
+//!    storefronts, many European shops).
+//! 2. Site has Product JSON-LD but the `offers` block is empty (seen on
+//!    Patagonia and other catalog-style sites that split price onto a
+//!    separate widget).
+//!
+//! For case 1 we build a minimal payload from OG / product meta tags
+//! (`og:title`, `og:image`, `og:description`, `product:price:amount`,
+//! `product:price:currency`, `product:availability`, `product:brand`).
+//! For case 2 we augment the JSON-LD offers list with an OG-derived
+//! offer so callers get a price either way. A `data_source` field
+//! (`"jsonld"` / `"jsonld+og"` / `"og_fallback"`) tells the caller
+//! which branch produced the data.
 
+use std::sync::OnceLock;
+
+use regex::Regex;
 use serde_json::{Value, json};
 
 use super::ExtractorInfo;
@@ -56,38 +77,104 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
             resp.status
         )));
     }
+    parse(&resp.html, url).ok_or_else(|| {
+        FetchError::BodyDecode(format!(
+            "ecommerce_product: no Schema.org Product JSON-LD and no OG product tags on {url}"
+        ))
+    })
+}
 
+/// Pure parser: try JSON-LD first, fall back to OG meta tags. Returns
+/// `None` when neither path has enough to say "this is a product page".
+pub fn parse(html: &str, url: &str) -> Option<Value> {
     // Reuse the core JSON-LD parser so we benefit from whatever
     // robustness it gains over time (handling @graph, arrays, etc.).
-    let blocks = webclaw_core::structured_data::extract_json_ld(&resp.html);
-    let product = find_product(&blocks).ok_or_else(|| {
-        FetchError::BodyDecode(format!(
-            "ecommerce_product: no Schema.org Product found in JSON-LD on {url}"
-        ))
-    })?;
+    let blocks = webclaw_core::structured_data::extract_json_ld(html);
+    let product = find_product(&blocks);
 
-    Ok(json!({
+    if let Some(p) = product {
+        Some(build_jsonld_payload(&p, html, url))
+    } else if has_og_product_signal(html) {
+        Some(build_og_payload(html, url))
+    } else {
+        None
+    }
+}
+
+/// Build the rich payload from a Product JSON-LD node. Augments the
+/// `offers` array with an OG-derived offer when JSON-LD offers is empty
+/// so callers get a price on sites like Patagonia.
+fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value {
+    let mut offers = collect_offers(product);
+    let mut data_source = "jsonld";
+    if offers.is_empty()
+        && let Some(og_offer) = build_og_offer(html)
+    {
+        offers.push(og_offer);
+        data_source = "jsonld+og";
+    }
+
+    json!({
         "url":                url,
-        "name":               get_text(&product, "name"),
-        "description":        get_text(&product, "description"),
-        "brand":              get_brand(&product),
-        "sku":                get_text(&product, "sku"),
-        "mpn":                get_text(&product, "mpn"),
-        "gtin":               get_text(&product, "gtin")
-                                 .or_else(|| get_text(&product, "gtin13"))
-                                 .or_else(|| get_text(&product, "gtin12"))
-                                 .or_else(|| get_text(&product, "gtin8")),
-        "product_id":         get_text(&product, "productID"),
-        "category":           get_text(&product, "category"),
-        "color":              get_text(&product, "color"),
-        "material":           get_text(&product, "material"),
-        "images":             collect_images(&product),
-        "offers":             collect_offers(&product),
-        "aggregate_rating":   get_aggregate_rating(&product),
-        "review_count":       get_review_count(&product),
-        "raw_schema_type":    get_text(&product, "@type"),
-        "raw_jsonld":         product,
-    }))
+        "data_source":        data_source,
+        "name":               get_text(product, "name").or_else(|| og(html, "title")),
+        "description":        get_text(product, "description").or_else(|| og(html, "description")),
+        "brand":              get_brand(product).or_else(|| meta_property(html, "product:brand")),
+        "sku":                get_text(product, "sku"),
+        "mpn":                get_text(product, "mpn"),
+        "gtin":               get_text(product, "gtin")
+                                 .or_else(|| get_text(product, "gtin13"))
+                                 .or_else(|| get_text(product, "gtin12"))
+                                 .or_else(|| get_text(product, "gtin8")),
+        "product_id":         get_text(product, "productID"),
+        "category":           get_text(product, "category"),
+        "color":              get_text(product, "color"),
+        "material":           get_text(product, "material"),
+        "images":             nonempty_or_og(collect_images(product), html),
+        "offers":             offers,
+        "aggregate_rating":   get_aggregate_rating(product),
+        "review_count":       get_review_count(product),
+        "raw_schema_type":    get_text(product, "@type"),
+        "raw_jsonld":         product.clone(),
+    })
+}
+
+/// Build a minimal payload from OG / product meta tags. Used when a
+/// page has no Product JSON-LD at all.
+fn build_og_payload(html: &str, url: &str) -> Value {
+    let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default();
+    let image = og(html, "image");
+    let images: Vec<Value> = image.map(|i| vec![Value::String(i)]).unwrap_or_default();
+
+    json!({
+        "url":                url,
+        "data_source":        "og_fallback",
+        "name":               og(html, "title"),
+        "description":        og(html, "description"),
+        "brand":              meta_property(html, "product:brand"),
+        "sku":                None::<String>,
+        "mpn":                None::<String>,
+        "gtin":               None::<String>,
+        "product_id":         None::<String>,
+        "category":           None::<String>,
+        "color":              None::<String>,
+        "material":           None::<String>,
+        "images":             images,
+        "offers":             offers,
+        "aggregate_rating":   Value::Null,
+        "review_count":       None::<String>,
+        "raw_schema_type":    None::<String>,
+        "raw_jsonld":         Value::Null,
+    })
+}
+
+fn nonempty_or_og(imgs: Vec<Value>, html: &str) -> Vec<Value> {
+    if !imgs.is_empty() {
+        return imgs;
+    }
+    og(html, "image")
+        .map(|s| vec![Value::String(s)])
+        .unwrap_or_default()
 }
 
 // ---------------------------------------------------------------------------
@@ -236,6 +323,81 @@ fn host_of(url: &str) -> &str {
         .unwrap_or("")
 }
 
+// ---------------------------------------------------------------------------
+// OG / product meta-tag helpers
+// ---------------------------------------------------------------------------
+
+/// True when the HTML has enough OG / product meta tags to justify
+/// building a fallback payload. A single `og:title` isn't enough on its
+/// own — every blog post has that. We require either a product price
+/// tag or at least an `og:type` of `product`/`og:product` to avoid
+/// mis-classifying articles as products.
+fn has_og_product_signal(html: &str) -> bool {
+    let has_price = meta_property(html, "product:price:amount").is_some()
+        || meta_property(html, "og:price:amount").is_some();
+    if has_price {
+        return true;
+    }
+    // `<meta property="og:type" content="product">` is the Schema.org OG
+    // marker for product pages.
+    let og_type = og(html, "type").unwrap_or_default().to_lowercase();
+    matches!(og_type.as_str(), "product" | "og:product" | "product.item")
+}
+
+/// Build a single Offer-shaped Value from OG / product meta tags, or
+/// `None` if there's no price info at all.
+fn build_og_offer(html: &str) -> Option<Value> {
+    let price = meta_property(html, "product:price:amount")
+        .or_else(|| meta_property(html, "og:price:amount"));
+    let currency = meta_property(html, "product:price:currency")
+        .or_else(|| meta_property(html, "og:price:currency"));
+    let availability = meta_property(html, "product:availability")
+        .or_else(|| meta_property(html, "og:availability"));
+    price.as_ref()?;
+    Some(json!({
+        "price":            price,
+        "low_price":        None::<String>,
+        "high_price":       None::<String>,
+        "currency":         currency,
+        "availability":     availability,
+        "item_condition":   None::<String>,
+        "valid_until":      None::<String>,
+        "url":              None::<String>,
+        "seller":           None::<String>,
+        "offer_count":      None::<String>,
+    }))
+}
+
+/// Pull the value of `<meta property="og:{prop}" content="...">`.
+fn og(html: &str, prop: &str) -> Option<String> {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re = RE.get_or_init(|| {
+        Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
+    });
+    for c in re.captures_iter(html) {
+        if c.get(1).is_some_and(|m| m.as_str() == prop) {
+            return c.get(2).map(|m| m.as_str().to_string());
+        }
+    }
+    None
+}
+
+/// Pull the value of any `<meta property="..." content="...">` tag.
+/// Needed for namespaced OG variants like `product:price:amount` that
+/// the simple `og:*` matcher above doesn't cover.
+fn meta_property(html: &str, prop: &str) -> Option<String> {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re = RE.get_or_init(|| {
+        Regex::new(r#"(?i)<meta[^>]+property="([^"]+)"[^>]+content="([^"]+)""#).unwrap()
+    });
+    for c in re.captures_iter(html) {
+        if c.get(1).is_some_and(|m| m.as_str() == prop) {
+            return c.get(2).map(|m| m.as_str().to_string());
+        }
+    }
+    None
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -311,4 +473,81 @@ mod tests {
             Some("InStock")
         );
     }
+
+    // --- OG fallback --------------------------------------------------------
+
+    #[test]
+    fn has_og_product_signal_accepts_product_type_or_price() {
+        let type_only = r#"<meta property="og:type" content="product">"#;
+        let price_only = r#"<meta property="product:price:amount" content="49.00">"#;
+        let neither = r#"<meta property="og:title" content="My Article"><meta property="og:type" content="article">"#;
+        assert!(has_og_product_signal(type_only));
+        assert!(has_og_product_signal(price_only));
+        assert!(!has_og_product_signal(neither));
+    }
+
+    #[test]
+    fn og_fallback_builds_payload_without_jsonld() {
+        let html = r##"<html><head>
+            <meta property="og:type" content="product">
+            <meta property="og:title" content="Handmade Candle">
+            <meta property="og:image" content="https://cdn.example.com/candle.jpg">
+            <meta property="og:description" content="Small-batch soy candle.">
+            <meta property="product:price:amount" content="18.00">
+            <meta property="product:price:currency" content="USD">
+            <meta property="product:availability" content="in stock">
+            <meta property="product:brand" content="Little Studio">
+        </head></html>"##;
+        let v = parse(html, "https://example.com/p/candle").unwrap();
+        assert_eq!(v["data_source"], "og_fallback");
+        assert_eq!(v["name"], "Handmade Candle");
+        assert_eq!(v["description"], "Small-batch soy candle.");
+        assert_eq!(v["brand"], "Little Studio");
+        assert_eq!(v["offers"][0]["price"], "18.00");
+        assert_eq!(v["offers"][0]["currency"], "USD");
+        assert_eq!(v["offers"][0]["availability"], "in stock");
+        assert_eq!(v["images"][0], "https://cdn.example.com/candle.jpg");
+    }
+
+    #[test]
+    fn jsonld_augments_empty_offers_with_og_price() {
+        // Patagonia-shaped page: Product JSON-LD without an Offer, plus
+        // product:price:* OG tags. We should merge.
+        let html = r##"<html><head>
+            <script type="application/ld+json">
+            {"@context":"https://schema.org","@type":"Product",
+             "name":"Better Sweater","brand":"Patagonia",
+             "aggregateRating":{"@type":"AggregateRating","ratingValue":"4.4","reviewCount":"1142"}}
+            </script>
+            <meta property="product:price:amount" content="139.00">
+            <meta property="product:price:currency" content="USD">
+        </head></html>"##;
+        let v = parse(html, "https://patagonia.com/p/x").unwrap();
+        assert_eq!(v["data_source"], "jsonld+og");
+        assert_eq!(v["name"], "Better Sweater");
+        assert_eq!(v["offers"].as_array().unwrap().len(), 1);
+        assert_eq!(v["offers"][0]["price"], "139.00");
+    }
+
+    #[test]
+    fn jsonld_only_stays_pure_jsonld() {
+        let html = r##"<html><head>
+            <script type="application/ld+json">
+            {"@type":"Product","name":"Widget",
+             "offers":{"@type":"Offer","price":"9.99","priceCurrency":"USD"}}
+            </script>
+        </head></html>"##;
+        let v = parse(html, "https://example.com/p/w").unwrap();
+        assert_eq!(v["data_source"], "jsonld");
+        assert_eq!(v["offers"][0]["price"], "9.99");
+    }
+
+    #[test]
+    fn parse_returns_none_on_no_product_signals() {
+        let html = r#"<html><head>
+            <meta property="og:title" content="My Blog Post">
+            <meta property="og:type" content="article">
+        </head></html>"#;
+        assert!(parse(html, "https://blog.example.com/post").is_none());
+    }
 }