mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix(extractors): detect AWS WAF verifying-connection page, add OG fallback to ecommerce_product
Two targeted fixes surfaced by the manual extractor smoke test.
cloud::is_bot_protected:
- Trustpilot serves a ~565-byte AWS WAF interstitial with the string
"Verifying your connection..." and an `interstitial-spinner` div.
That pattern was not in our detector, so local fetch returned the
challenge page, JSON-LD parsing found nothing, and the extractor
emitted a confusing "no Organization/LocalBusiness JSON-LD" error.
- Added the pattern plus a <10KB size gate so real articles that
happen to mention the phrase aren't misclassified. Two new tests
cover positive + negative cases.
- With the fix, trustpilot_reviews now correctly escalates via
smart_fetch_html and returns the clean "Set WEBCLAW_API_KEY"
actionable error without a key, or cloud-bypassed HTML with one.
ecommerce_product:
- Previously hard-failed when a page had no Product JSON-LD, and
produced an empty `offers` list when JSON-LD was present but its
`offers` node was. Many sites (Patagonia-style catalog pages,
smaller Squarespace stores) ship one or the other of OG / JSON-LD
but not both with price data.
- Added OG meta-tag fallback that handles:
* no JSON-LD at all -> build minimal payload from og:title,
og:image, og:description, product:price:amount,
product:price:currency, product:availability, product:brand
* JSON-LD present but offers empty -> augment with an OG-derived
offer so price comes through
- New `data_source` field: "jsonld", "jsonld+og", or "og_fallback"
so callers can tell which branch populated the data.
- `has_og_product_signal()` requires og:type=product or a price tag
so blog posts don't get mis-classified as products.
Tests: 197 passing in webclaw-fetch (6 new), clippy clean.
This commit is contained in:
parent
7f5eb93b65
commit
a53578e45c
2 changed files with 299 additions and 28 deletions
|
|
@ -325,6 +325,18 @@ pub fn is_bot_protected(html: &str, headers: &HeaderMap) -> bool {
|
|||
return true;
|
||||
}
|
||||
|
||||
// AWS WAF "Verifying your connection" interstitial (used by Trustpilot).
|
||||
// Distinct from the captcha-branded path above: the challenge page is
|
||||
// a tiny HTML shell with an `interstitial-spinner` div and no content.
|
||||
// Gating on html.len() keeps false-positives off long pages that
|
||||
// happen to mention the phrase in an unrelated context.
|
||||
if html_lower.contains("interstitial-spinner")
|
||||
&& html_lower.contains("verifying your connection")
|
||||
&& html.len() < 10_000
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// hCaptcha *blocking* page (not just an embedded widget).
|
||||
if html_lower.contains("hcaptcha.com")
|
||||
&& html_lower.contains("h-captcha")
|
||||
|
|
@ -564,6 +576,26 @@ mod tests {
|
|||
assert!(!is_bot_protected(&html, &empty_headers()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn is_bot_protected_detects_aws_waf_verifying_connection() {
|
||||
// The exact shape Trustpilot serves under AWS WAF.
|
||||
let html = r#"<div class="container"><div id="loading-state">
|
||||
<div class="interstitial-spinner" id="spinner"></div>
|
||||
<h1>Verifying your connection...</h1></div></div>"#;
|
||||
assert!(is_bot_protected(html, &empty_headers()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn is_bot_protected_ignores_phrase_on_real_content() {
|
||||
// A real article that happens to mention the phrase in prose
|
||||
// should not trigger the short-page detector.
|
||||
let html = format!(
|
||||
"<html><body>{}<p>Verifying your connection is tricky.</p></body></html>",
|
||||
"article text ".repeat(2_000)
|
||||
);
|
||||
assert!(!is_bot_protected(&html, &empty_headers()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn needs_js_rendering_flags_spa_skeleton() {
|
||||
let html = format!(
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@
|
|||
//! BigCommerce, WooCommerce, Squarespace, Magento, custom storefronts,
|
||||
//! and anything else that follows Schema.org.
|
||||
//!
|
||||
//! **Explicit-call only** — `/v1/scrape/ecommerce_product`. Not in the
|
||||
//! **Explicit-call only** (`/v1/scrape/ecommerce_product`). Not in the
|
||||
//! auto-dispatch because we can't identify "this is a product page"
|
||||
//! from the URL alone. When the caller knows they have a product URL,
|
||||
//! this is the reliable fallback for stores where shopify_product
|
||||
|
|
@ -17,7 +17,28 @@
|
|||
//! so JSON-LD parsing is shared with the rest of the extraction
|
||||
//! pipeline. We walk all blocks looking for `@type: Product`,
|
||||
//! `ProductGroup`, or an `ItemList` whose first entry is a Product.
|
||||
//!
|
||||
//! ## OG fallback
|
||||
//!
|
||||
//! Two real-world cases JSON-LD alone can't cover:
|
||||
//!
|
||||
//! 1. Site has no Product JSON-LD at all (smaller Squarespace / custom
|
||||
//! storefronts, many European shops).
|
||||
//! 2. Site has Product JSON-LD but the `offers` block is empty (seen on
|
||||
//! Patagonia and other catalog-style sites that split price onto a
|
||||
//! separate widget).
|
||||
//!
|
||||
//! For case 1 we build a minimal payload from OG / product meta tags
|
||||
//! (`og:title`, `og:image`, `og:description`, `product:price:amount`,
|
||||
//! `product:price:currency`, `product:availability`, `product:brand`).
|
||||
//! For case 2 we augment the JSON-LD offers list with an OG-derived
|
||||
//! offer so callers get a price either way. A `data_source` field
|
||||
//! (`"jsonld"` / `"jsonld+og"` / `"og_fallback"`) tells the caller
|
||||
//! which branch produced the data.
|
||||
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use regex::Regex;
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use super::ExtractorInfo;
|
||||
|
|
@ -56,38 +77,104 @@ pub async fn extract(client: &FetchClient, url: &str) -> Result<Value, FetchErro
|
|||
resp.status
|
||||
)));
|
||||
}
|
||||
parse(&resp.html, url).ok_or_else(|| {
|
||||
FetchError::BodyDecode(format!(
|
||||
"ecommerce_product: no Schema.org Product JSON-LD and no OG product tags on {url}"
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
/// Pure parser: try JSON-LD first, fall back to OG meta tags. Returns
|
||||
/// `None` when neither path has enough to say "this is a product page".
|
||||
pub fn parse(html: &str, url: &str) -> Option<Value> {
|
||||
// Reuse the core JSON-LD parser so we benefit from whatever
|
||||
// robustness it gains over time (handling @graph, arrays, etc.).
|
||||
let blocks = webclaw_core::structured_data::extract_json_ld(&resp.html);
|
||||
let product = find_product(&blocks).ok_or_else(|| {
|
||||
FetchError::BodyDecode(format!(
|
||||
"ecommerce_product: no Schema.org Product found in JSON-LD on {url}"
|
||||
))
|
||||
})?;
|
||||
let blocks = webclaw_core::structured_data::extract_json_ld(html);
|
||||
let product = find_product(&blocks);
|
||||
|
||||
Ok(json!({
|
||||
if let Some(p) = product {
|
||||
Some(build_jsonld_payload(&p, html, url))
|
||||
} else if has_og_product_signal(html) {
|
||||
Some(build_og_payload(html, url))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the rich payload from a Product JSON-LD node. Augments the
|
||||
/// `offers` array with an OG-derived offer when JSON-LD offers is empty
|
||||
/// so callers get a price on sites like Patagonia.
|
||||
fn build_jsonld_payload(product: &Value, html: &str, url: &str) -> Value {
|
||||
let mut offers = collect_offers(product);
|
||||
let mut data_source = "jsonld";
|
||||
if offers.is_empty()
|
||||
&& let Some(og_offer) = build_og_offer(html)
|
||||
{
|
||||
offers.push(og_offer);
|
||||
data_source = "jsonld+og";
|
||||
}
|
||||
|
||||
json!({
|
||||
"url": url,
|
||||
"name": get_text(&product, "name"),
|
||||
"description": get_text(&product, "description"),
|
||||
"brand": get_brand(&product),
|
||||
"sku": get_text(&product, "sku"),
|
||||
"mpn": get_text(&product, "mpn"),
|
||||
"gtin": get_text(&product, "gtin")
|
||||
.or_else(|| get_text(&product, "gtin13"))
|
||||
.or_else(|| get_text(&product, "gtin12"))
|
||||
.or_else(|| get_text(&product, "gtin8")),
|
||||
"product_id": get_text(&product, "productID"),
|
||||
"category": get_text(&product, "category"),
|
||||
"color": get_text(&product, "color"),
|
||||
"material": get_text(&product, "material"),
|
||||
"images": collect_images(&product),
|
||||
"offers": collect_offers(&product),
|
||||
"aggregate_rating": get_aggregate_rating(&product),
|
||||
"review_count": get_review_count(&product),
|
||||
"raw_schema_type": get_text(&product, "@type"),
|
||||
"raw_jsonld": product,
|
||||
}))
|
||||
"data_source": data_source,
|
||||
"name": get_text(product, "name").or_else(|| og(html, "title")),
|
||||
"description": get_text(product, "description").or_else(|| og(html, "description")),
|
||||
"brand": get_brand(product).or_else(|| meta_property(html, "product:brand")),
|
||||
"sku": get_text(product, "sku"),
|
||||
"mpn": get_text(product, "mpn"),
|
||||
"gtin": get_text(product, "gtin")
|
||||
.or_else(|| get_text(product, "gtin13"))
|
||||
.or_else(|| get_text(product, "gtin12"))
|
||||
.or_else(|| get_text(product, "gtin8")),
|
||||
"product_id": get_text(product, "productID"),
|
||||
"category": get_text(product, "category"),
|
||||
"color": get_text(product, "color"),
|
||||
"material": get_text(product, "material"),
|
||||
"images": nonempty_or_og(collect_images(product), html),
|
||||
"offers": offers,
|
||||
"aggregate_rating": get_aggregate_rating(product),
|
||||
"review_count": get_review_count(product),
|
||||
"raw_schema_type": get_text(product, "@type"),
|
||||
"raw_jsonld": product.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Build a minimal payload from OG / product meta tags. Used when a
|
||||
/// page has no Product JSON-LD at all.
|
||||
fn build_og_payload(html: &str, url: &str) -> Value {
|
||||
let offers = build_og_offer(html).map(|o| vec![o]).unwrap_or_default();
|
||||
let image = og(html, "image");
|
||||
let images: Vec<Value> = image.map(|i| vec![Value::String(i)]).unwrap_or_default();
|
||||
|
||||
json!({
|
||||
"url": url,
|
||||
"data_source": "og_fallback",
|
||||
"name": og(html, "title"),
|
||||
"description": og(html, "description"),
|
||||
"brand": meta_property(html, "product:brand"),
|
||||
"sku": None::<String>,
|
||||
"mpn": None::<String>,
|
||||
"gtin": None::<String>,
|
||||
"product_id": None::<String>,
|
||||
"category": None::<String>,
|
||||
"color": None::<String>,
|
||||
"material": None::<String>,
|
||||
"images": images,
|
||||
"offers": offers,
|
||||
"aggregate_rating": Value::Null,
|
||||
"review_count": None::<String>,
|
||||
"raw_schema_type": None::<String>,
|
||||
"raw_jsonld": Value::Null,
|
||||
})
|
||||
}
|
||||
|
||||
fn nonempty_or_og(imgs: Vec<Value>, html: &str) -> Vec<Value> {
|
||||
if !imgs.is_empty() {
|
||||
return imgs;
|
||||
}
|
||||
og(html, "image")
|
||||
.map(|s| vec![Value::String(s)])
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -236,6 +323,81 @@ fn host_of(url: &str) -> &str {
|
|||
.unwrap_or("")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// OG / product meta-tag helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// True when the HTML has enough OG / product meta tags to justify
|
||||
/// building a fallback payload. A single `og:title` isn't enough on its
|
||||
/// own — every blog post has that. We require either a product price
|
||||
/// tag or at least an `og:type` of `product`/`og:product` to avoid
|
||||
/// mis-classifying articles as products.
|
||||
fn has_og_product_signal(html: &str) -> bool {
|
||||
let has_price = meta_property(html, "product:price:amount").is_some()
|
||||
|| meta_property(html, "og:price:amount").is_some();
|
||||
if has_price {
|
||||
return true;
|
||||
}
|
||||
// `<meta property="og:type" content="product">` is the Schema.org OG
|
||||
// marker for product pages.
|
||||
let og_type = og(html, "type").unwrap_or_default().to_lowercase();
|
||||
matches!(og_type.as_str(), "product" | "og:product" | "product.item")
|
||||
}
|
||||
|
||||
/// Build a single Offer-shaped Value from OG / product meta tags, or
|
||||
/// `None` if there's no price info at all.
|
||||
fn build_og_offer(html: &str) -> Option<Value> {
|
||||
let price = meta_property(html, "product:price:amount")
|
||||
.or_else(|| meta_property(html, "og:price:amount"));
|
||||
let currency = meta_property(html, "product:price:currency")
|
||||
.or_else(|| meta_property(html, "og:price:currency"));
|
||||
let availability = meta_property(html, "product:availability")
|
||||
.or_else(|| meta_property(html, "og:availability"));
|
||||
price.as_ref()?;
|
||||
Some(json!({
|
||||
"price": price,
|
||||
"low_price": None::<String>,
|
||||
"high_price": None::<String>,
|
||||
"currency": currency,
|
||||
"availability": availability,
|
||||
"item_condition": None::<String>,
|
||||
"valid_until": None::<String>,
|
||||
"url": None::<String>,
|
||||
"seller": None::<String>,
|
||||
"offer_count": None::<String>,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Pull the value of `<meta property="og:{prop}" content="...">`.
|
||||
fn og(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| m.as_str().to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Pull the value of any `<meta property="..." content="...">` tag.
|
||||
/// Needed for namespaced OG variants like `product:price:amount` that
|
||||
/// the simple `og:*` matcher above doesn't cover.
|
||||
fn meta_property(html: &str, prop: &str) -> Option<String> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| {
|
||||
Regex::new(r#"(?i)<meta[^>]+property="([^"]+)"[^>]+content="([^"]+)""#).unwrap()
|
||||
});
|
||||
for c in re.captures_iter(html) {
|
||||
if c.get(1).is_some_and(|m| m.as_str() == prop) {
|
||||
return c.get(2).map(|m| m.as_str().to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
@ -311,4 +473,81 @@ mod tests {
|
|||
Some("InStock")
|
||||
);
|
||||
}
|
||||
|
||||
// --- OG fallback --------------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn has_og_product_signal_accepts_product_type_or_price() {
|
||||
let type_only = r#"<meta property="og:type" content="product">"#;
|
||||
let price_only = r#"<meta property="product:price:amount" content="49.00">"#;
|
||||
let neither = r#"<meta property="og:title" content="My Article"><meta property="og:type" content="article">"#;
|
||||
assert!(has_og_product_signal(type_only));
|
||||
assert!(has_og_product_signal(price_only));
|
||||
assert!(!has_og_product_signal(neither));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn og_fallback_builds_payload_without_jsonld() {
|
||||
let html = r##"<html><head>
|
||||
<meta property="og:type" content="product">
|
||||
<meta property="og:title" content="Handmade Candle">
|
||||
<meta property="og:image" content="https://cdn.example.com/candle.jpg">
|
||||
<meta property="og:description" content="Small-batch soy candle.">
|
||||
<meta property="product:price:amount" content="18.00">
|
||||
<meta property="product:price:currency" content="USD">
|
||||
<meta property="product:availability" content="in stock">
|
||||
<meta property="product:brand" content="Little Studio">
|
||||
</head></html>"##;
|
||||
let v = parse(html, "https://example.com/p/candle").unwrap();
|
||||
assert_eq!(v["data_source"], "og_fallback");
|
||||
assert_eq!(v["name"], "Handmade Candle");
|
||||
assert_eq!(v["description"], "Small-batch soy candle.");
|
||||
assert_eq!(v["brand"], "Little Studio");
|
||||
assert_eq!(v["offers"][0]["price"], "18.00");
|
||||
assert_eq!(v["offers"][0]["currency"], "USD");
|
||||
assert_eq!(v["offers"][0]["availability"], "in stock");
|
||||
assert_eq!(v["images"][0], "https://cdn.example.com/candle.jpg");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn jsonld_augments_empty_offers_with_og_price() {
|
||||
// Patagonia-shaped page: Product JSON-LD without an Offer, plus
|
||||
// product:price:* OG tags. We should merge.
|
||||
let html = r##"<html><head>
|
||||
<script type="application/ld+json">
|
||||
{"@context":"https://schema.org","@type":"Product",
|
||||
"name":"Better Sweater","brand":"Patagonia",
|
||||
"aggregateRating":{"@type":"AggregateRating","ratingValue":"4.4","reviewCount":"1142"}}
|
||||
</script>
|
||||
<meta property="product:price:amount" content="139.00">
|
||||
<meta property="product:price:currency" content="USD">
|
||||
</head></html>"##;
|
||||
let v = parse(html, "https://patagonia.com/p/x").unwrap();
|
||||
assert_eq!(v["data_source"], "jsonld+og");
|
||||
assert_eq!(v["name"], "Better Sweater");
|
||||
assert_eq!(v["offers"].as_array().unwrap().len(), 1);
|
||||
assert_eq!(v["offers"][0]["price"], "139.00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn jsonld_only_stays_pure_jsonld() {
|
||||
let html = r##"<html><head>
|
||||
<script type="application/ld+json">
|
||||
{"@type":"Product","name":"Widget",
|
||||
"offers":{"@type":"Offer","price":"9.99","priceCurrency":"USD"}}
|
||||
</script>
|
||||
</head></html>"##;
|
||||
let v = parse(html, "https://example.com/p/w").unwrap();
|
||||
assert_eq!(v["data_source"], "jsonld");
|
||||
assert_eq!(v["offers"][0]["price"], "9.99");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_returns_none_on_no_product_signals() {
|
||||
let html = r#"<html><head>
|
||||
<meta property="og:title" content="My Blog Post">
|
||||
<meta property="og:type" content="article">
|
||||
</head></html>"#;
|
||||
assert!(parse(html, "https://blog.example.com/post").is_none());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue