refactor(cloud): consolidate CloudClient + smart_fetch into webclaw-fetch

The local-first / cloud-fallback flow was duplicated in two places: - webclaw-mcp/src/cloud.rs (302 lines, canonical) - webclaw-cli/src/cloud.rs (80 lines, minimal subset kept to avoid pulling rmcp as a dep) Move to the shared crate where all vertical extractors and the new webclaw-server can also reach it. ## New module: webclaw-fetch/src/cloud.rs Single canonical home. Consolidates both previous versions and promotes the error type from stringy to typed: - `CloudError` enum with dedicated variants for the four HTTP outcomes callers act on differently — 401 (key rejected), 402 (insufficient plan), 429 (rate limited), plus ServerError / Network / ParseFailed. Each variant's Display message ends with an actionable URL (signup / pricing / dashboard) so API consumers can surface it verbatim. - `From<CloudError> for String` bridge so the dozen existing `.await?` call sites in MCP / CLI that expected `Result<_, String>` keep compiling. We can migrate them to the typed error per-site later without a churn commit. - `CloudClient::new(Option<&str>)` matches the CLI's `--api-key` flag pattern (explicit key wins, env fallback, None when empty). `::from_env()` kept for MCP-style call sites. - `with_key_and_base` for staging / integration tests. - `scrape / post / get / fetch_html` — `fetch_html` is new, a convenience that calls /v1/scrape with formats=["html"] and returns the raw HTML string so vertical extractors can plug antibot-bypassed HTML straight into their parsers. - `is_bot_protected` + `needs_js_rendering` detectors moved over verbatim. Detection patterns are public (CF / DataDome / AWS WAF challenge-page signatures) — no moat leak. - `smart_fetch` kept on the original `Result<_, String>` signature so MCP's six call sites compile unchanged. - `smart_fetch_html` is new: the local-first-then-cloud flow for the vertical-extractor pattern, returning the typed `CloudError` so extractors can emit precise upgrade-path messages. 
## Cleanup - Deleted webclaw-mcp/src/cloud.rs — all imports now resolve to `webclaw_fetch::cloud::*`. Dropped reqwest as a direct dep of webclaw-mcp (it only used it for the old cloud client). - Deleted webclaw-cli/src/cloud.rs. CLI keeps reqwest for its webhook / on-change / research HTTP calls. - webclaw-fetch now has reqwest as a direct dep. It was already transitively pulled in by webclaw-llm; this just makes the dependency relationship explicit at the call site. ## Tests 16 new unit tests cover: - CloudError status mapping (401/402/429/5xx) - NotConfigured error includes signup URL - CloudClient::new explicit-key-wins-over-env + empty-string = None - base_url strips trailing slash - Detector matrix (CF challenge / Turnstile / real content with embedded Turnstile / SPA skeleton / real article with script tags) - truncate respects char boundaries (don't slice inside UTF-8) Full workspace test suite still passes (~500 tests). fmt + clippy clean. No behavior change for existing MCP / CLI call sites.
2026-06-07 22:15:12 +02:00 · 2026-04-22 16:05:44 +02:00 · 2026-04-22 16:05:44 +02:00 · 0ab891bd6b
commit 0ab891bd6b
parent 0221c151dc
10 changed files with 675 additions and 388 deletions
--- a/crates/webclaw-mcp/src/cloud.rs
+++ b/crates/webclaw-mcp/src/cloud.rs
@@ -1,302 +0,0 @@
-/// Cloud API fallback for protected sites.
-///
-/// When local fetch returns a challenge page, this module retries
-/// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set.
-use std::time::Duration;
-
-use serde_json::{Value, json};
-use tracing::info;
-
-const API_BASE: &str = "https://api.webclaw.io/v1";
-
-/// Lightweight client for the webclaw cloud API.
-pub struct CloudClient {
-    api_key: String,
-    http: reqwest::Client,
-}
-
-impl CloudClient {
-    /// Create a new cloud client from WEBCLAW_API_KEY env var.
-    /// Returns None if the key is not set.
-    pub fn from_env() -> Option<Self> {
-        let key = std::env::var("WEBCLAW_API_KEY").ok()?;
-        if key.is_empty() {
-            return None;
-        }
-        let http = reqwest::Client::builder()
-            .timeout(Duration::from_secs(60))
-            .build()
-            .unwrap_or_default();
-        Some(Self { api_key: key, http })
-    }
-
-    /// Scrape a URL via the cloud API. Returns the response JSON.
-    pub async fn scrape(
-        &self,
-        url: &str,
-        formats: &[&str],
-        include_selectors: &[String],
-        exclude_selectors: &[String],
-        only_main_content: bool,
-    ) -> Result<Value, String> {
-        let mut body = json!({
-            "url": url,
-            "formats": formats,
-        });
-
-        if only_main_content {
-            body["only_main_content"] = json!(true);
-        }
-        if !include_selectors.is_empty() {
-            body["include_selectors"] = json!(include_selectors);
-        }
-        if !exclude_selectors.is_empty() {
-            body["exclude_selectors"] = json!(exclude_selectors);
-        }
-
-        self.post("scrape", body).await
-    }
-
-    /// Generic POST to the cloud API.
-    pub async fn post(&self, endpoint: &str, body: Value) -> Result<Value, String> {
-        let resp = self
-            .http
-            .post(format!("{API_BASE}/{endpoint}"))
-            .header("Authorization", format!("Bearer {}", self.api_key))
-            .json(&body)
-            .send()
-            .await
-            .map_err(|e| format!("Cloud API request failed: {e}"))?;
-
-        let status = resp.status();
-        if !status.is_success() {
-            let text = resp.text().await.unwrap_or_default();
-            let truncated = truncate_error(&text);
-            return Err(format!("Cloud API error {status}: {truncated}"));
-        }
-
-        resp.json::<Value>()
-            .await
-            .map_err(|e| format!("Cloud API response parse failed: {e}"))
-    }
-
-    /// Generic GET from the cloud API.
-    pub async fn get(&self, endpoint: &str) -> Result<Value, String> {
-        let resp = self
-            .http
-            .get(format!("{API_BASE}/{endpoint}"))
-            .header("Authorization", format!("Bearer {}", self.api_key))
-            .send()
-            .await
-            .map_err(|e| format!("Cloud API request failed: {e}"))?;
-
-        let status = resp.status();
-        if !status.is_success() {
-            let text = resp.text().await.unwrap_or_default();
-            let truncated = truncate_error(&text);
-            return Err(format!("Cloud API error {status}: {truncated}"));
-        }
-
-        resp.json::<Value>()
-            .await
-            .map_err(|e| format!("Cloud API response parse failed: {e}"))
-    }
-}
-
-/// Truncate error body to avoid flooding logs with huge HTML responses.
-fn truncate_error(text: &str) -> &str {
-    const MAX_LEN: usize = 500;
-    match text.char_indices().nth(MAX_LEN) {
-        Some((byte_pos, _)) => &text[..byte_pos],
-        None => text,
-    }
-}
-
-/// Check if fetched HTML looks like a bot protection challenge page.
-/// Detects common bot protection challenge pages.
-pub fn is_bot_protected(html: &str, headers: &webclaw_fetch::HeaderMap) -> bool {
-    let html_lower = html.to_lowercase();
-
-    // Cloudflare challenge page
-    if html_lower.contains("_cf_chl_opt") || html_lower.contains("challenge-platform") {
-        return true;
-    }
-
-    // Cloudflare "checking your browser" spinner
-    if (html_lower.contains("just a moment") || html_lower.contains("checking your browser"))
-        && html_lower.contains("cf-spinner")
-    {
-        return true;
-    }
-
-    // Cloudflare Turnstile (only on short pages = challenge, not embedded on real content)
-    if (html_lower.contains("cf-turnstile")
-        || html_lower.contains("challenges.cloudflare.com/turnstile"))
-        && html.len() < 100_000
-    {
-        return true;
-    }
-
-    // DataDome
-    if html_lower.contains("geo.captcha-delivery.com")
-        || html_lower.contains("captcha-delivery.com/captcha")
-    {
-        return true;
-    }
-
-    // AWS WAF
-    if html_lower.contains("awswaf-captcha") || html_lower.contains("aws-waf-client-browser") {
-        return true;
-    }
-
-    // hCaptcha blocking page
-    if html_lower.contains("hcaptcha.com")
-        && html_lower.contains("h-captcha")
-        && html.len() < 50_000
-    {
-        return true;
-    }
-
-    // Cloudflare via headers + challenge body
-    let has_cf_headers = headers.get("cf-ray").is_some() || headers.get("cf-mitigated").is_some();
-    if has_cf_headers
-        && (html_lower.contains("just a moment") || html_lower.contains("checking your browser"))
-    {
-        return true;
-    }
-
-    false
-}
-
-/// Check if a page likely needs JS rendering (SPA with almost no text content).
-pub fn needs_js_rendering(word_count: usize, html: &str) -> bool {
-    let has_scripts = html.contains("<script");
-
-    // Tier 1: almost no extractable text from a large page
-    if word_count < 50 && html.len() > 5_000 && has_scripts {
-        return true;
-    }
-
-    // Tier 2: SPA framework detected with suspiciously low content-to-HTML ratio
-    if word_count < 800 && html.len() > 50_000 && has_scripts {
-        let html_lower = html.to_lowercase();
-        let has_spa_marker = html_lower.contains("react-app")
-            || html_lower.contains("id=\"__next\"")
-            || html_lower.contains("id=\"root\"")
-            || html_lower.contains("id=\"app\"")
-            || html_lower.contains("__next_data__")
-            || html_lower.contains("nuxt")
-            || html_lower.contains("ng-app");
-
-        if has_spa_marker {
-            return true;
-        }
-    }
-
-    false
-}
-
-/// Result of a smart fetch: either local extraction or cloud API response.
-pub enum SmartFetchResult {
-    /// Successfully extracted locally.
-    Local(Box<webclaw_core::ExtractionResult>),
-    /// Fell back to cloud API. Contains the API response JSON.
-    Cloud(Value),
-}
-
-/// Try local fetch first, fall back to cloud API if bot-protected or JS-rendered.
-///
-/// Returns the extraction result (local) or the cloud API response JSON.
-/// If no API key is configured and local fetch is blocked, returns an error
-/// with a helpful message.
-pub async fn smart_fetch(
-    client: &webclaw_fetch::FetchClient,
-    cloud: Option<&CloudClient>,
-    url: &str,
-    include_selectors: &[String],
-    exclude_selectors: &[String],
-    only_main_content: bool,
-    formats: &[&str],
-) -> Result<SmartFetchResult, String> {
-    // Step 1: Try local fetch (with timeout to avoid hanging on slow servers)
-    let fetch_result = tokio::time::timeout(Duration::from_secs(30), client.fetch(url))
-        .await
-        .map_err(|_| format!("Fetch timed out after 30s for {url}"))?
-        .map_err(|e| format!("Fetch failed: {e}"))?;
-
-    // Step 2: Check for bot protection
-    if is_bot_protected(&fetch_result.html, &fetch_result.headers) {
-        info!(url, "bot protection detected, falling back to cloud API");
-        return cloud_fallback(
-            cloud,
-            url,
-            include_selectors,
-            exclude_selectors,
-            only_main_content,
-            formats,
-        )
-        .await;
-    }
-
-    // Step 3: Extract locally
-    let options = webclaw_core::ExtractionOptions {
-        include_selectors: include_selectors.to_vec(),
-        exclude_selectors: exclude_selectors.to_vec(),
-        only_main_content,
-        include_raw_html: false,
-    };
-
-    let extraction =
-        webclaw_core::extract_with_options(&fetch_result.html, Some(&fetch_result.url), &options)
-            .map_err(|e| format!("Extraction failed: {e}"))?;
-
-    // Step 4: Check for JS-rendered pages (low content from large HTML)
-    if needs_js_rendering(extraction.metadata.word_count, &fetch_result.html) {
-        info!(
-            url,
-            word_count = extraction.metadata.word_count,
-            html_len = fetch_result.html.len(),
-            "JS-rendered page detected, falling back to cloud API"
-        );
-        return cloud_fallback(
-            cloud,
-            url,
-            include_selectors,
-            exclude_selectors,
-            only_main_content,
-            formats,
-        )
-        .await;
-    }
-
-    Ok(SmartFetchResult::Local(Box::new(extraction)))
-}
-
-async fn cloud_fallback(
-    cloud: Option<&CloudClient>,
-    url: &str,
-    include_selectors: &[String],
-    exclude_selectors: &[String],
-    only_main_content: bool,
-    formats: &[&str],
-) -> Result<SmartFetchResult, String> {
-    match cloud {
-        Some(c) => {
-            let resp = c
-                .scrape(
-                    url,
-                    formats,
-                    include_selectors,
-                    exclude_selectors,
-                    only_main_content,
-                )
-                .await?;
-            info!(url, "cloud API fallback successful");
-            Ok(SmartFetchResult::Cloud(resp))
-        }
-        None => Err(format!(
-            "Bot protection detected on {url}. Set WEBCLAW_API_KEY for automatic cloud bypass. \
-             Get a key at https://webclaw.io"
-        )),
-    }
-}
--- a/crates/webclaw-mcp/src/main.rs
+++ b/crates/webclaw-mcp/src/main.rs
@@ -1,7 +1,6 @@
 /// webclaw-mcp: MCP (Model Context Protocol) server for webclaw.
 /// Exposes web extraction tools over stdio transport for AI agents
 /// like Claude Desktop, Claude Code, and other MCP clients.
-mod cloud;
 mod server;
 mod tools;

--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@@ -15,7 +15,8 @@ use serde_json::json;
 use tracing::{error, info, warn};
 use url::Url;

-use crate::cloud::{self, CloudClient, SmartFetchResult};
+use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
+
 use crate::tools::*;

 pub struct WebclawMcp {