fix(cloud): synthesize HTML from cloud response instead of requesting raw html

api.webclaw.io/v1/scrape does not return a `html` field even when
`formats=["html"]` is requested, by design: the cloud API returns
pre-parsed `structured_data` (JSON-LD blocks), `metadata` (OG tags,
title, description, image, site_name), and `markdown`.

Our CloudClient::fetch_html helper was premised on the API returning
raw HTML. Without a key set, the error message was hidden behind
CloudError::NotConfigured so the bug never surfaced. With a key set,
every extractor that escalated to cloud (trustpilot_reviews,
etsy_listing, amazon_product, ebay_listing, substack_post HTML
fallback) got back "cloud /v1/scrape returned no html field".

Fix: reassemble a minimal synthetic HTML document from the cloud's
parsed output. Each JSON-LD block goes back into a
`<script type="application/ld+json">` tag, metadata fields become OG
`<meta>` tags, and the markdown body lands in a `<pre>` tag. Existing
local extractor parsers (find_product_jsonld, find_business,
og() regex) see the same shapes they'd see from a real page, so no
per-extractor changes needed.

Verified end-to-end with WEBCLAW_CLOUD_API_KEY set:
- trustpilot_reviews: escalates, returns Organization JSON-LD data
  (parser picks Trustpilot site-level Org not the reviewed business;
  tracked as a follow-up to update Trustpilot schema handling)
- etsy_listing: escalates via antibot render path; listing-specific
  data depends on target listing having JSON-LD (many Etsy listings
  don't)
- amazon_product, ebay_listing: stay local because their pages ship
  enough content not to trigger bot-detection escalation
- The other 24 extractors unchanged (local path, zero cloud credits)

Tests: 200 passing in webclaw-fetch (3 new), clippy clean.
This commit is contained in:
Valerio 2026-04-22 17:24:50 +02:00
parent a53578e45c
commit e10066f527

View file

@ -252,22 +252,93 @@ impl CloudClient {
self.post("scrape", body).await
}
/// Get antibot-bypassed page data back as a synthetic HTML string.
///
/// `api.webclaw.io/v1/scrape` intentionally does not return raw
/// HTML: it returns pre-parsed `structured_data` (JSON-LD blocks)
/// plus `metadata` (title, description, OG tags, image) plus a
/// `markdown` body. We reassemble those into a minimal HTML doc
/// that looks enough like the real page for our local extractor
/// parsers to run unchanged: each JSON-LD block gets emitted as a
/// `<script type="application/ld+json">` tag, metadata gets
/// emitted as OG `<meta>` tags, and the markdown lands in the
/// body. Extractors that walk JSON-LD (ecommerce_product,
/// trustpilot_reviews, ebay_listing, etsy_listing, amazon_product)
/// see exactly the same shapes they'd see from a live HTML fetch.
///
/// # Errors
///
/// Propagates any [`CloudError`] from the underlying `/v1/scrape`
/// call (network failure, non-success status, missing API key).
pub async fn fetch_html(&self, url: &str) -> Result<String, CloudError> {
    // Only `markdown` is requested explicitly; `structured_data`
    // and `metadata` come back on every scrape response.
    let resp = self.scrape(url, &["markdown"], &[], &[], false).await?;
    Ok(synthesize_html(&resp))
}
}
/// Reassemble a minimal HTML document from a cloud `/v1/scrape`
/// response so existing HTML-based extractor parsers can run against
/// cloud output without a separate code path.
fn synthesize_html(resp: &Value) -> String {
let mut out = String::with_capacity(8_192);
out.push_str("<html><head>\n");
// Metadata → OG meta tags. Keep keys stable with what local
// extractors read: og:title, og:description, og:image, og:site_name.
if let Some(meta) = resp.get("metadata").and_then(|m| m.as_object()) {
for (src_key, og_key) in [
("title", "title"),
("description", "description"),
("image", "image"),
("site_name", "site_name"),
] {
if let Some(val) = meta.get(src_key).and_then(|v| v.as_str())
&& !val.is_empty()
{
out.push_str(&format!(
"<meta property=\"og:{og_key}\" content=\"{}\">\n",
html_escape_attr(val)
));
}
}
}
// Structured data blocks → <script type="application/ld+json">.
// Serialise losslessly so extract_json_ld's parser gets the same
// shape it would get from a real page.
if let Some(blocks) = resp.get("structured_data").and_then(|v| v.as_array()) {
for block in blocks {
if let Ok(s) = serde_json::to_string(block) {
out.push_str("<script type=\"application/ld+json\">");
out.push_str(&s);
out.push_str("</script>\n");
}
}
}
out.push_str("</head><body>\n");
// Markdown body → plaintext in <body>. Extractors that regex over
// <div> IDs won't hit here, but they won't hit on local cloud
// bypass either. OK to keep minimal.
if let Some(md) = resp.get("markdown").and_then(|v| v.as_str()) {
out.push_str("<pre>");
out.push_str(&html_escape_text(md));
out.push_str("</pre>\n");
}
out.push_str("</body></html>");
out
}
/// Escape a string for use inside a double-quoted HTML attribute:
/// `&`, `"`, `<`, and `>` become their entity forms. Single pass,
/// one allocation.
fn html_escape_attr(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '"' => escaped.push_str("&quot;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Escape a string for use as HTML text content: `&`, `<`, and `>`
/// become entities. Quotes are left alone — they are only special
/// inside attributes.
fn html_escape_text(s: &str) -> String {
    s.chars().fold(String::with_capacity(s.len()), |mut acc, ch| {
        match ch {
            '&' => acc.push_str("&amp;"),
            '<' => acc.push_str("&lt;"),
            '>' => acc.push_str("&gt;"),
            other => acc.push(other),
        }
        acc
    })
}
async fn parse_cloud_response(resp: reqwest::Response) -> Result<Value, CloudError> {
let status = resp.status();
if status.is_success() {
@ -585,6 +656,54 @@ mod tests {
assert!(is_bot_protected(html, &empty_headers()));
}
#[test]
fn synthesize_html_embeds_jsonld_and_og_tags() {
    let cloud_resp = json!({
        "url": "https://example.com/p/1",
        "metadata": {
            "title": "My Product",
            "description": "A nice thing.",
            "image": "https://cdn.example.com/1.jpg",
            "site_name": "Example Shop"
        },
        "structured_data": [
            {"@context":"https://schema.org","@type":"Product",
             "name":"Widget","offers":{"@type":"Offer","price":"9.99","priceCurrency":"USD"}}
        ],
        "markdown": "# Widget\n\nA nice widget."
    });

    let doc = synthesize_html(&cloud_resp);

    // Metadata surfaces as OG meta tags.
    assert!(doc.contains(r#"<meta property="og:title" content="My Product">"#));
    assert!(
        doc.contains(r#"<meta property="og:image" content="https://cdn.example.com/1.jpg">"#)
    );
    // Structured data round-trips as an inline JSON-LD script.
    assert!(doc.contains(r#"<script type="application/ld+json">"#));
    assert!(doc.contains(r#""@type":"Product""#));
    assert!(doc.contains(r#""price":"9.99""#));
    // Markdown ends up in the body.
    assert!(doc.contains("A nice widget."));
}
#[test]
fn synthesize_html_handles_missing_fields_gracefully() {
    // No structured_data, no markdown, empty metadata.
    let minimal = json!({"url": "https://example.com", "metadata": {}});
    let doc = synthesize_html(&minimal);
    // Still a well-formed document: opens and closes cleanly.
    assert!(doc.starts_with("<html><head>"));
    assert!(doc.ends_with("</body></html>"));
}
#[test]
fn synthesize_html_escapes_attribute_quotes() {
    let with_quotes = json!({
        "metadata": {"title": r#"She said "hi""#}
    });
    let doc = synthesize_html(&with_quotes);
    // Embedded quotes must not break out of the content attribute.
    assert!(doc.contains(r#"og:title" content="She said &quot;hi&quot;""#));
}
#[test]
fn is_bot_protected_ignores_phrase_on_real_content() {
// A real article that happens to mention the phrase in prose