From 2373162c811b82301147ea6cfb6aa1dcb237814f Mon Sep 17 00:00:00 2001
From: Valerio
Date: Wed, 22 Apr 2026 20:59:01 +0200
Subject: [PATCH] chore: release v0.5.0 (28 vertical extractors + cloud integration)

See CHANGELOG.md for the full entry. Headline: 28 site-specific extractors returning typed JSON, five with automatic antibot cloud-escalation via api.webclaw.io, `POST /v1/scrape/{vertical}` + `GET /v1/extractors` on webclaw-server.
---
 CHANGELOG.md | 23 +++++++++++++++++++++++
 Cargo.lock | 14 +++++++-------
 Cargo.toml | 2 +-
 3 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4069d54..938a0b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,29 @@
 All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.5.0] — 2026-04-22
+
+### Added
+- **28 vertical extractors that return typed JSON instead of generic markdown.** New `webclaw_fetch::extractors` module with one extractor per site. Dev: reddit, hackernews, github_repo / github_pr / github_issue / github_release, crates_io, pypi, npm. AI/ML: huggingface_model, huggingface_dataset, arxiv, docker_hub. Writing: dev_to, stackoverflow, youtube_video. Social: linkedin_post, instagram_post, instagram_profile. Ecommerce: shopify_product, shopify_collection, ecommerce_product (generic Schema.org), woocommerce_product, amazon_product, ebay_listing, etsy_listing. Reviews: trustpilot_reviews, substack_post. Each extractor claims a URL pattern via a public `matches()` fn and returns a typed JSON payload with the fields callers actually want (title, price, author, rating, review count, etc.) rather than a markdown blob.
+- **`POST /v1/scrape/{vertical}` on `webclaw-server` for explicit vertical routing.** Picks the parser by name, validates that the URL plausibly belongs to that vertical, and returns the same shape as `POST /v1/scrape`, but typed.
23 of 28 verticals also auto-dispatch from a plain `POST /v1/scrape` because their URL shapes are unique enough to claim safely; the remaining 5 (`shopify_product`, `shopify_collection`, `ecommerce_product`, `woocommerce_product`, `substack_post`) use patterns that non-target sites share, so callers opt in via the `{vertical}` route. +- **`GET /v1/extractors` on `webclaw-server`.** Returns the full catalog as `{"extractors": [{"name": "...", "label": "...", "description": "...", "url_patterns": [...]}, ...]}` so clients can build tooling / autocomplete / user-facing docs off a live source. +- **Antibot cloud-escalation for 5 ecommerce + reviews verticals.** Amazon, eBay, Etsy, Trustpilot, and Substack (as HTML fallback) go through `cloud::smart_fetch_html`: try local fetch first; on bot-protection detection (Cloudflare challenge, DataDome, AWS WAF "Verifying your connection", etc.) escalate to `api.webclaw.io/v1/scrape`. Without `WEBCLAW_API_KEY` / `WEBCLAW_CLOUD_API_KEY` the extractor returns a typed `CloudError::NotConfigured` with an actionable signup link. With a key set, escalation is automatic. Every extractor stamps a `data_source: "local" | "cloud"` field on the response so callers can tell which path ran. +- **`cloud::synthesize_html` for cloud-bypassed extraction.** `api.webclaw.io/v1/scrape` deliberately does not return raw HTML; it returns a parsed bundle (`structured_data` JSON-LD blocks + `metadata` OG/meta tags + `markdown`). The new helper reassembles that bundle back into a minimal synthetic HTML doc (JSON-LD as `