fix(cloud): synthesize HTML from cloud response instead of requesting raw html

api.webclaw.io/v1/scrape does not return a `html` field even when
`formats=["html"]` is requested, by design: the cloud API returns
pre-parsed `structured_data` (JSON-LD blocks), `metadata` (OG tags,
title, description, image, site_name), and `markdown`.

Our CloudClient::fetch_html helper was premised on the API returning
raw HTML. Without a key set, the error message was hidden behind
CloudError::NotConfigured so the bug never surfaced. With a key set,
every extractor that escalated to cloud (trustpilot_reviews,
etsy_listing, amazon_product, ebay_listing, substack_post HTML
fallback) got back "cloud /v1/scrape returned no html field".

Fix: reassemble a minimal synthetic HTML document from the cloud's
parsed output. Each JSON-LD block goes back into a
`<script type="application/ld+json">` tag, metadata fields become OG
`<meta>` tags, and the markdown body lands in a `<pre>` tag. Existing
local extractor parsers (find_product_jsonld, find_business,
og() regex) see the same shapes they'd see from a real page, so no
per-extractor changes needed.

Verified end-to-end with WEBCLAW_CLOUD_API_KEY set:
- trustpilot_reviews: escalates, returns Organization JSON-LD data
  (parser picks Trustpilot site-level Org not the reviewed business;
  tracked as a follow-up to update Trustpilot schema handling)
- etsy_listing: escalates via antibot render path; listing-specific
  data depends on target listing having JSON-LD (many Etsy listings
  don't)
- amazon_product, ebay_listing: stay local because their pages ship
  enough content not to trigger bot-detection escalation
- The other 24 extractors unchanged (local path, zero cloud credits)

Tests: 200 passing in webclaw-fetch (3 new), clippy clean.
This commit is contained in:
Valerio 2026-04-22 17:24:50 +02:00
parent a53578e45c
commit e10066f527

View file

@ -252,22 +252,93 @@ impl CloudClient {
self.post("scrape", body).await
}
/// Get antibot-bypassed page data back as a synthetic HTML string.
///
/// `api.webclaw.io/v1/scrape` intentionally does not return raw
/// HTML: it returns pre-parsed `structured_data` (JSON-LD blocks)
/// plus `metadata` (title, description, OG tags, image) plus a
/// `markdown` body. We reassemble those into a minimal HTML doc
/// that looks enough like the real page for our local extractor
/// parsers to run unchanged: each JSON-LD block gets emitted as a
/// `<script type="application/ld+json">` tag, metadata gets
/// emitted as OG `<meta>` tags, and the markdown lands in the
/// body. Extractors that walk JSON-LD (ecommerce_product,
/// trustpilot_reviews, ebay_listing, etsy_listing, amazon_product)
/// see exactly the same shapes they'd see from a live HTML fetch.
///
/// # Errors
///
/// Propagates any [`CloudError`] from the underlying `/v1/scrape`
/// call (network failure, non-success status, missing API key).
pub async fn fetch_html(&self, url: &str) -> Result<String, CloudError> {
    // Only `markdown` is requested explicitly; `structured_data`
    // and `metadata` come back on every scrape response.
    let resp = self.scrape(url, &["markdown"], &[], &[], false).await?;
    Ok(synthesize_html(&resp))
}
}
/// Reassemble a minimal HTML document from a cloud `/v1/scrape`
/// response so existing HTML-based extractor parsers can run against
/// cloud output without a separate code path.
fn synthesize_html(resp: &Value) -> String {
let mut out = String::with_capacity(8_192);
out.push_str("<html><head>\n");
// Metadata → OG meta tags. Keep keys stable with what local
// extractors read: og:title, og:description, og:image, og:site_name.
if let Some(meta) = resp.get("metadata").and_then(|m| m.as_object()) {
for (src_key, og_key) in [
("title", "title"),
("description", "description"),
("image", "image"),
("site_name", "site_name"),
] {
if let Some(val) = meta.get(src_key).and_then(|v| v.as_str())
&& !val.is_empty()
{
out.push_str(&format!(
"<meta property=\"og:{og_key}\" content=\"{}\">\n",
html_escape_attr(val)
));
}
}
}
// Structured data blocks → <script type="application/ld+json">.
// Serialise losslessly so extract_json_ld's parser gets the same
// shape it would get from a real page.
if let Some(blocks) = resp.get("structured_data").and_then(|v| v.as_array()) {
for block in blocks {
if let Ok(s) = serde_json::to_string(block) {
out.push_str("<script type=\"application/ld+json\">");
out.push_str(&s);
out.push_str("</script>\n");
}
}
}
out.push_str("</head><body>\n");
// Markdown body → plaintext in <body>. Extractors that regex over
// <div> IDs won't hit here, but they won't hit on local cloud
// bypass either. OK to keep minimal.
if let Some(md) = resp.get("markdown").and_then(|v| v.as_str()) {
out.push_str("<pre>");
out.push_str(&html_escape_text(md));
out.push_str("</pre>\n");
}
out.push_str("</body></html>");
out
}
/// Escape a string for use inside a double-quoted HTML attribute:
/// `&`, `"`, `<`, and `>` become their entity forms. Single pass,
/// one allocation.
fn html_escape_attr(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '"' => escaped.push_str("&quot;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Escape a string for use as HTML text content: `&`, `<`, and `>`
/// become entities. Quotes are left alone — they are only special
/// inside attributes.
fn html_escape_text(s: &str) -> String {
    s.chars().fold(String::with_capacity(s.len()), |mut acc, ch| {
        match ch {
            '&' => acc.push_str("&amp;"),
            '<' => acc.push_str("&lt;"),
            '>' => acc.push_str("&gt;"),
            other => acc.push(other),
        }
        acc
    })
}
async fn parse_cloud_response(resp: reqwest::Response) -> Result<Value, CloudError> {
let status = resp.status();
if status.is_success() {
@ -585,6 +656,54 @@ mod tests {
assert!(is_bot_protected(html, &empty_headers()));
}
#[test]
fn synthesize_html_embeds_jsonld_and_og_tags() {
    let cloud_resp = json!({
        "url": "https://example.com/p/1",
        "metadata": {
            "title": "My Product",
            "description": "A nice thing.",
            "image": "https://cdn.example.com/1.jpg",
            "site_name": "Example Shop"
        },
        "structured_data": [
            {"@context":"https://schema.org","@type":"Product",
             "name":"Widget","offers":{"@type":"Offer","price":"9.99","priceCurrency":"USD"}}
        ],
        "markdown": "# Widget\n\nA nice widget."
    });

    let doc = synthesize_html(&cloud_resp);

    // Metadata surfaces as OG meta tags.
    assert!(doc.contains(r#"<meta property="og:title" content="My Product">"#));
    assert!(
        doc.contains(r#"<meta property="og:image" content="https://cdn.example.com/1.jpg">"#)
    );
    // Structured data round-trips as an inline JSON-LD script.
    assert!(doc.contains(r#"<script type="application/ld+json">"#));
    assert!(doc.contains(r#""@type":"Product""#));
    assert!(doc.contains(r#""price":"9.99""#));
    // Markdown ends up in the body.
    assert!(doc.contains("A nice widget."));
}
#[test]
fn synthesize_html_handles_missing_fields_gracefully() {
    // No structured_data, no markdown, empty metadata.
    let minimal = json!({"url": "https://example.com", "metadata": {}});
    let doc = synthesize_html(&minimal);
    // Still a well-formed document: opens and closes cleanly.
    assert!(doc.starts_with("<html><head>"));
    assert!(doc.ends_with("</body></html>"));
}
#[test]
fn synthesize_html_escapes_attribute_quotes() {
    let with_quotes = json!({
        "metadata": {"title": r#"She said "hi""#}
    });
    let doc = synthesize_html(&with_quotes);
    // Embedded quotes must not break out of the content attribute.
    assert!(doc.contains(r#"og:title" content="She said &quot;hi&quot;""#));
}
#[test]
fn is_bot_protected_ignores_phrase_on_real_content() {
// A real article that happens to mention the phrase in prose