refactor(cloud): consolidate CloudClient + smart_fetch into webclaw-fetch

The local-first / cloud-fallback flow was duplicated in two places:
- webclaw-mcp/src/cloud.rs (302 lines, canonical)
- webclaw-cli/src/cloud.rs (80 lines, minimal subset kept to avoid
  pulling rmcp as a dep)

Move to the shared crate where all vertical extractors and the new
webclaw-server can also reach it.

## New module: webclaw-fetch/src/cloud.rs

Single canonical home. Consolidates both previous versions and
promotes the error type from stringy to typed:

- `CloudError` enum with dedicated variants for the four HTTP
  outcomes callers act on differently — 401 (key rejected),
  402 (insufficient plan), 429 (rate limited), plus ServerError /
  Network / ParseFailed. Each variant's Display message ends with
  an actionable URL (signup / pricing / dashboard) so API consumers
  can surface it verbatim.

- `From<CloudError> for String` bridge so the dozen existing
  `.await?` call sites in MCP / CLI that expected `Result<_, String>`
  keep compiling. We can migrate them to the typed error per-site
  later without a churn commit.

- `CloudClient::new(Option<&str>)` matches the CLI's `--api-key`
  flag pattern (explicit key wins, env fallback, None when empty).
  `::from_env()` kept for MCP-style call sites.

- `with_key_and_base` for staging / integration tests.

- `scrape / post / get / fetch_html` — `fetch_html` is new, a
  convenience that calls /v1/scrape with formats=["html"] and
  returns the raw HTML string so vertical extractors can plug
  antibot-bypassed HTML straight into their parsers.

- `is_bot_protected` + `needs_js_rendering` detectors moved
  over verbatim. Detection patterns are public (CF / DataDome /
  AWS WAF challenge-page signatures) — no moat leak.

- `smart_fetch` kept on the original `Result<_, String>`
  signature so MCP's six call sites compile unchanged.

- `smart_fetch_html` is new: the local-first-then-cloud flow
  for the vertical-extractor pattern, returning the typed
  `CloudError` so extractors can emit precise upgrade-path
  messages.

## Cleanup

- Deleted webclaw-mcp/src/cloud.rs — all imports now resolve to
  `webclaw_fetch::cloud::*`. Dropped reqwest as a direct dep of
  webclaw-mcp (it only used it for the old cloud client).
- Deleted webclaw-cli/src/cloud.rs. CLI keeps reqwest for its
  webhook / on-change / research HTTP calls.
- webclaw-fetch now has reqwest as a direct dep. It was already
  transitively pulled in by webclaw-llm; this just makes the
  dependency relationship explicit at the call site.

## Tests

16 new unit tests cover:
- CloudError status mapping (401/402/429/5xx)
- NotConfigured error includes signup URL
- CloudClient::new explicit-key-wins-over-env + empty-string = None
- base_url strips trailing slash
- Detector matrix (CF challenge / Turnstile / real content with
  embedded Turnstile / SPA skeleton / real article with script tags)
- truncate respects char boundaries (don't slice inside UTF-8)

Full workspace test suite still passes (~500 tests). fmt + clippy
clean. No behavior change for existing MCP / CLI call sites.
This commit is contained in:
Valerio 2026-04-22 16:05:44 +02:00
parent 0221c151dc
commit 0ab891bd6b
10 changed files with 675 additions and 388 deletions

View file

@ -1,302 +0,0 @@
/// Cloud API fallback for protected sites.
///
/// When local fetch returns a challenge page, this module retries
/// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set.
use std::time::Duration;
use serde_json::{Value, json};
use tracing::info;
/// Base URL of the cloud API, including the `/v1` version segment.
/// Endpoint paths are appended to it as `{API_BASE}/{endpoint}`.
const API_BASE: &str = "https://api.webclaw.io/v1";
/// Lightweight client for the webclaw cloud API.
///
/// Bundles the API key with a reusable HTTP client so every request is sent
/// with the same credentials and client configuration.
pub struct CloudClient {
    /// Key sent as the `Authorization: Bearer …` header on each request.
    api_key: String,
    /// Shared reqwest client used to issue the requests.
    http: reqwest::Client,
}
impl CloudClient {
    /// Build a client from the `WEBCLAW_API_KEY` environment variable.
    ///
    /// Returns `None` when the variable cannot be read or holds an empty string.
    pub fn from_env() -> Option<Self> {
        let api_key = std::env::var("WEBCLAW_API_KEY")
            .ok()
            .filter(|key| !key.is_empty())?;

        // Requests are capped at 60 seconds. If the builder fails, fall back to
        // reqwest's default client rather than refusing to construct.
        let http = reqwest::Client::builder()
            .timeout(Duration::from_secs(60))
            .build()
            .unwrap_or_default();

        Some(Self { api_key, http })
    }

    /// Scrape `url` through the cloud `scrape` endpoint and return the response JSON.
    ///
    /// `url` and `formats` are always sent. `only_main_content` is sent only when
    /// `true`, and each selector list is sent only when it is non-empty.
    ///
    /// # Errors
    ///
    /// Propagates any error string produced by [`CloudClient::post`].
    pub async fn scrape(
        &self,
        url: &str,
        formats: &[&str],
        include_selectors: &[String],
        exclude_selectors: &[String],
        only_main_content: bool,
    ) -> Result<Value, String> {
        let mut payload = json!({
            "url": url,
            "formats": formats,
        });

        if only_main_content {
            payload["only_main_content"] = json!(true);
        }
        if !include_selectors.is_empty() {
            payload["include_selectors"] = json!(include_selectors);
        }
        if !exclude_selectors.is_empty() {
            payload["exclude_selectors"] = json!(exclude_selectors);
        }

        self.post("scrape", payload).await
    }

    /// POST `body` as JSON to `{API_BASE}/{endpoint}` and return the parsed response.
    ///
    /// # Errors
    ///
    /// Returns a message when the request cannot be sent, when the status is not
    /// a success (the response body is truncated in the message), or when the
    /// response is not valid JSON.
    pub async fn post(&self, endpoint: &str, body: Value) -> Result<Value, String> {
        let request = self
            .authorized(self.http.post(Self::endpoint_url(endpoint)))
            .json(&body);
        Self::execute(request).await
    }

    /// GET `{API_BASE}/{endpoint}` and return the parsed response.
    ///
    /// # Errors
    ///
    /// Same failure modes and messages as [`CloudClient::post`].
    pub async fn get(&self, endpoint: &str) -> Result<Value, String> {
        let request = self.authorized(self.http.get(Self::endpoint_url(endpoint)));
        Self::execute(request).await
    }

    /// Join an endpoint path onto the API base URL.
    fn endpoint_url(endpoint: &str) -> String {
        format!("{API_BASE}/{endpoint}")
    }

    /// Attach the bearer-token `Authorization` header to a request under construction.
    fn authorized(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
        request.header("Authorization", format!("Bearer {}", self.api_key))
    }

    /// Send a prepared request and decode the reply.
    ///
    /// Shared by `post` and `get` so both report transport, status and parse
    /// failures with identical wording.
    async fn execute(request: reqwest::RequestBuilder) -> Result<Value, String> {
        let response = request
            .send()
            .await
            .map_err(|e| format!("Cloud API request failed: {e}"))?;

        let status = response.status();
        if !status.is_success() {
            // An unreadable error body degrades to an empty string instead of
            // masking the status code.
            let text = response.text().await.unwrap_or_default();
            let truncated = truncate_error(&text);
            return Err(format!("Cloud API error {status}: {truncated}"));
        }

        response
            .json::<Value>()
            .await
            .map_err(|e| format!("Cloud API response parse failed: {e}"))
    }
}
/// Keep at most the first 500 characters of an error body.
///
/// Huge responses (for example whole HTML pages) would otherwise flood logs and
/// error messages. The cut is located by character, so the returned slice
/// always ends on a UTF-8 character boundary.
fn truncate_error(text: &str) -> &str {
    const MAX_LEN: usize = 500;
    // Byte offset where character number MAX_LEN starts, or the full length
    // when the text has no more than MAX_LEN characters.
    let cut = text
        .char_indices()
        .map(|(offset, _)| offset)
        .nth(MAX_LEN)
        .unwrap_or(text.len());
    &text[..cut]
}
/// Decide whether a fetched page looks like an anti-bot challenge rather than real content.
///
/// Substring checks run against a lowercased copy of `html`; the size limits
/// compare the byte length of the original `html`. The page is reported as
/// protected when any of these signatures matches:
///
/// - Cloudflare challenge markers (`_cf_chl_opt`, `challenge-platform`)
/// - Cloudflare interstitial wording together with `cf-spinner`
/// - Cloudflare Turnstile on a page under 100,000 bytes
/// - DataDome captcha delivery hosts
/// - AWS WAF captcha markers
/// - hCaptcha host plus widget class on a page under 50,000 bytes
/// - a `cf-ray` or `cf-mitigated` response header together with interstitial wording
pub fn is_bot_protected(html: &str, headers: &webclaw_fetch::HeaderMap) -> bool {
    let body = html.to_lowercase();
    let byte_len = html.len();
    let has = |needle: &str| body.contains(needle);

    // Wording shared by the spinner check and the header-based check.
    let interstitial_text = || has("just a moment") || has("checking your browser");

    let cloudflare_challenge = || has("_cf_chl_opt") || has("challenge-platform");
    let cloudflare_spinner = || interstitial_text() && has("cf-spinner");
    // Turnstile embedded in a large page is treated as real content, not a gate.
    let turnstile_gate = || {
        (has("cf-turnstile") || has("challenges.cloudflare.com/turnstile")) && byte_len < 100_000
    };
    let datadome = || has("geo.captcha-delivery.com") || has("captcha-delivery.com/captcha");
    let aws_waf = || has("awswaf-captcha") || has("aws-waf-client-browser");
    let hcaptcha_wall = || has("hcaptcha.com") && has("h-captcha") && byte_len < 50_000;
    let cloudflare_headers = || {
        (headers.get("cf-ray").is_some() || headers.get("cf-mitigated").is_some())
            && interstitial_text()
    };

    cloudflare_challenge()
        || cloudflare_spinner()
        || turnstile_gate()
        || datadome()
        || aws_waf()
        || hcaptcha_wall()
        || cloudflare_headers()
}
/// Guess whether a page must be rendered with JavaScript to yield its content.
///
/// `word_count` is the number of words extracted from `html`. A page with no
/// `<script` substring is never flagged. Otherwise it is flagged when either:
///
/// 1. fewer than 50 words came out of more than 5,000 bytes of HTML, or
/// 2. fewer than 800 words came out of more than 50,000 bytes of HTML and the
///    lowercased markup contains one of the SPA markers listed below.
pub fn needs_js_rendering(word_count: usize, html: &str) -> bool {
    // Substrings whose presence marks a page as a single-page-app shell.
    const SPA_MARKERS: [&str; 7] = [
        "react-app",
        "id=\"__next\"",
        "id=\"root\"",
        "id=\"app\"",
        "__next_data__",
        "nuxt",
        "ng-app",
    ];

    if !html.contains("<script") {
        return false;
    }

    let byte_len = html.len();

    // Tier 1: a sizeable page that produced almost no text.
    if word_count < 50 && byte_len > 5_000 {
        return true;
    }

    // Tier 2 applies only to large pages with a thin amount of text.
    if word_count >= 800 || byte_len <= 50_000 {
        return false;
    }

    let lowered = html.to_lowercase();
    SPA_MARKERS.iter().any(|marker| lowered.contains(marker))
}
/// Result of a smart fetch: either local extraction or cloud API response.
///
/// Callers match on the variant to learn which path produced the content.
pub enum SmartFetchResult {
    /// Successfully extracted locally. Boxed so the enum itself stays small.
    Local(Box<webclaw_core::ExtractionResult>),
    /// Fell back to cloud API. Contains the API response JSON.
    Cloud(Value),
}
/// Why the locally fetched page was rejected in favour of the cloud API.
///
/// Carried into [`cloud_fallback`] so that the "no API key configured" error
/// names the condition that was actually detected.
#[derive(Clone, Copy)]
enum FallbackReason {
    /// The page matched a bot-protection challenge signature.
    BotProtection,
    /// Local extraction produced too little text for the amount of HTML.
    JsRendering,
}

impl FallbackReason {
    /// Opening words of the error shown when no cloud client is available.
    fn headline(self) -> &'static str {
        match self {
            Self::BotProtection => "Bot protection detected",
            Self::JsRendering => "JS-rendered page detected",
        }
    }
}

/// Try a local fetch first and fall back to the cloud API when the page is
/// bot-protected or appears to need JavaScript rendering.
///
/// Returns [`SmartFetchResult::Local`] with the local extraction, or
/// [`SmartFetchResult::Cloud`] with the cloud API response JSON.
///
/// # Errors
///
/// Returns a message when the local fetch takes longer than 30 seconds or
/// fails, when local extraction fails, when the cloud request fails, or when a
/// fallback is required but `cloud` is `None`. In the last case the message
/// names the detected reason and explains how to obtain an API key.
pub async fn smart_fetch(
    client: &webclaw_fetch::FetchClient,
    cloud: Option<&CloudClient>,
    url: &str,
    include_selectors: &[String],
    exclude_selectors: &[String],
    only_main_content: bool,
    formats: &[&str],
) -> Result<SmartFetchResult, String> {
    // Step 1: local fetch, bounded so a slow server cannot stall the caller.
    let fetched = tokio::time::timeout(Duration::from_secs(30), client.fetch(url))
        .await
        .map_err(|_| format!("Fetch timed out after 30s for {url}"))?
        .map_err(|e| format!("Fetch failed: {e}"))?;

    // Step 2: a challenge page has nothing worth extracting, so go to the cloud.
    if is_bot_protected(&fetched.html, &fetched.headers) {
        info!(url, "bot protection detected, falling back to cloud API");
        return cloud_fallback(
            cloud,
            url,
            include_selectors,
            exclude_selectors,
            only_main_content,
            formats,
            FallbackReason::BotProtection,
        )
        .await;
    }

    // Step 3: extract locally.
    let options = webclaw_core::ExtractionOptions {
        include_selectors: include_selectors.to_vec(),
        exclude_selectors: exclude_selectors.to_vec(),
        only_main_content,
        include_raw_html: false,
    };
    let extraction =
        webclaw_core::extract_with_options(&fetched.html, Some(&fetched.url), &options)
            .map_err(|e| format!("Extraction failed: {e}"))?;

    // Step 4: very little text from a lot of HTML suggests a JS-rendered page.
    if needs_js_rendering(extraction.metadata.word_count, &fetched.html) {
        info!(
            url,
            word_count = extraction.metadata.word_count,
            html_len = fetched.html.len(),
            "JS-rendered page detected, falling back to cloud API"
        );
        return cloud_fallback(
            cloud,
            url,
            include_selectors,
            exclude_selectors,
            only_main_content,
            formats,
            FallbackReason::JsRendering,
        )
        .await;
    }

    Ok(SmartFetchResult::Local(Box::new(extraction)))
}

/// Scrape `url` through the cloud client, if one is configured.
///
/// # Errors
///
/// Propagates the cloud client's error string when the scrape fails. When
/// `cloud` is `None`, returns a message that starts with the headline for
/// `reason` and tells the user to set `WEBCLAW_API_KEY`.
async fn cloud_fallback(
    cloud: Option<&CloudClient>,
    url: &str,
    include_selectors: &[String],
    exclude_selectors: &[String],
    only_main_content: bool,
    formats: &[&str],
    reason: FallbackReason,
) -> Result<SmartFetchResult, String> {
    let Some(cloud_client) = cloud else {
        return Err(format!(
            "{} on {url}. Set WEBCLAW_API_KEY for automatic cloud bypass. \
             Get a key at https://webclaw.io",
            reason.headline()
        ));
    };

    let response = cloud_client
        .scrape(
            url,
            formats,
            include_selectors,
            exclude_selectors,
            only_main_content,
        )
        .await?;
    info!(url, "cloud API fallback successful");
    Ok(SmartFetchResult::Cloud(response))
}

View file

@ -1,7 +1,6 @@
/// webclaw-mcp: MCP (Model Context Protocol) server for webclaw.
/// Exposes web extraction tools over stdio transport for AI agents
/// like Claude Desktop, Claude Code, and other MCP clients.
mod cloud;
mod server;
mod tools;

View file

@ -15,7 +15,8 @@ use serde_json::json;
use tracing::{error, info, warn};
use url::Url;
use crate::cloud::{self, CloudClient, SmartFetchResult};
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
use crate::tools::*;
pub struct WebclawMcp {