diff --git a/crates/webclaw-fetch/src/cloud.rs b/crates/webclaw-fetch/src/cloud.rs index dad7519..c70a75e 100644 --- a/crates/webclaw-fetch/src/cloud.rs +++ b/crates/webclaw-fetch/src/cloud.rs @@ -24,6 +24,37 @@ //! parser on it. Returns the typed [`CloudError`] so extractors can //! emit precise "upgrade your plan" / "invalid key" messages. //! +//! ## Cloud response shape and [`synthesize_html`] +//! +//! `api.webclaw.io/v1/scrape` deliberately does **not** return a +//! `html` field even when `formats=["html"]` is requested. By design +//! the cloud API returns a parsed bundle: +//! +//! ```text +//! { +//! "url": "https://...", +//! "metadata": { title, description, image, site_name, ... }, // OG / meta tags +//! "structured_data": [ { "@type": "...", ... }, ... ], // JSON-LD blocks +//! "markdown": "# Page Title\n\n...", // cleaned markdown +//! "antibot": { engine, path, user_agent }, // bypass telemetry +//! "cache": { status, age_seconds } +//! } +//! ``` +//! +//! [`CloudClient::fetch_html`] reassembles that bundle back into a +//! minimal synthetic HTML document so the existing local extractor +//! parsers (JSON-LD walkers, OG regex, DOM-regex) run unchanged over +//! cloud output. Each `structured_data` entry becomes a +//! ` + + +"##; + let v = parse(html, "https://www.trustpilot.com/review/anthropic.com").unwrap(); + assert_eq!(v["domain"], "anthropic.com"); + assert_eq!(v["business_name"], "Anthropic"); + assert_eq!(v["rating_label"], "Bad"); + assert_eq!(v["review_count"], 226); + assert_eq!(v["rating_distribution"]["one_star"]["count"], 196); + assert_eq!(v["rating_distribution"]["total"]["count"], 226); + assert_eq!(v["ai_summary"], "Mixed reviews."); + assert_eq!(v["recent_reviews"].as_array().unwrap().len(), 1); + assert_eq!(v["recent_reviews"][0]["author"], "W.FRH"); + assert_eq!(v["recent_reviews"][0]["rating"], 1); + assert_eq!(v["recent_reviews"][0]["title"], "Bad"); + } + + #[test] + fn parse_falls_back_to_og_when_no_jsonld() { + let html = r#" +"#; + let v = parse(html, "https://www.trustpilot.com/review/anthropic.com").unwrap(); + assert_eq!(v["domain"], "anthropic.com"); + assert_eq!(v["business_name"], "Anthropic"); + assert_eq!(v["average_rating"], "1.5"); + assert_eq!(v["review_count"], 226); + assert_eq!(v["rating_label"], "Bad"); + } + + #[test] + fn parse_returns_ok_with_url_domain_when_nothing_else() { + let v = parse( + "", + "https://www.trustpilot.com/review/example.com", + ) + .unwrap(); + assert_eq!(v["domain"], "example.com"); + assert_eq!(v["business_name"], "example.com"); } }