//! Cloud API fallback client for api.webclaw.io.
//!
//! When local fetch hits bot protection or a JS-only SPA, callers can
//! fall back to the hosted API which runs the full antibot / CDP
//! pipeline. This module is the shared home for that flow: previously
//! duplicated between `webclaw-mcp/src/cloud.rs` and
//! `webclaw-cli/src/cloud.rs`.
//!
//! ## Architecture
//!
//! - [`CloudClient`] — thin reqwest wrapper around the api.webclaw.io
//! REST surface. Typed errors for the four HTTP failures callers act
//! on differently (401 / 402 / 429 / other) plus network + parse.
//! - [`is_bot_protected`] / [`needs_js_rendering`] — pure detectors on
//! response bodies. The detection patterns are public (CF / DataDome
//! challenge-page signatures) so these live in OSS without leaking
//! any moat.
//! - [`smart_fetch`] — try-local-then-escalate flow returning an
//! [`ExtractionResult`] or raw cloud JSON. Kept on the original
//! `Result<_, String>` signature so the existing MCP / CLI call
//! sites work unchanged.
//! - [`smart_fetch_html`] — new convenience for the vertical-extractor
//! pattern: just give me antibot-bypassed HTML so I can run my own
//! parser on it. Returns the typed [`CloudError`] so extractors can
//! emit precise "upgrade your plan" / "invalid key" messages.
//!
//! ## Cloud response shape and [`synthesize_html`]
//!
//! `api.webclaw.io/v1/scrape` deliberately does **not** return an
//! `html` field even when `formats=["html"]` is requested. By design
//! the cloud API returns a parsed bundle:
//!
//! ```text
//! {
//! "url": "https://...",
//! "metadata": { title, description, image, site_name, ... }, // OG / meta tags
//! "structured_data": [ { "@type": "...", ... }, ... ], // JSON-LD blocks
//! "markdown": "# Page Title\n\n...", // cleaned markdown
//! "antibot": { engine, path, user_agent }, // bypass telemetry
//! "cache": { status, age_seconds }
//! }
//! ```
//!
//! [`CloudClient::fetch_html`] reassembles that bundle back into a
//! minimal synthetic HTML document so the existing local extractor
//! parsers (JSON-LD walkers, OG regex, DOM-regex) run unchanged over
//! cloud output. Each `structured_data` entry becomes a
//! `\n");
}
}
}
out.push_str("
\n");
// Markdown body → plaintext in . Extractors that regex over
// IDs won't hit here, but they won't hit on local cloud
// bypass either. OK to keep minimal.
if let Some(md) = resp.get("markdown").and_then(|v| v.as_str()) {
out.push_str("
");
out.push_str(&html_escape_text(md));
out.push_str("\n");
}
out.push_str("");
out
}
/// Escapes `s` for safe inclusion inside a double-quoted HTML attribute
/// value.
///
/// Rewrites `&`, `"`, `<` and `>` to their named entities and copies
/// every other character through unchanged. The input is walked once,
/// so an entity emitted for one character is never re-escaped.
fn html_escape_attr(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '"' => escaped.push_str("&quot;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Escapes `s` for safe inclusion as HTML text content.
///
/// Rewrites `&`, `<` and `>` to their named entities; quotes are left
/// alone because they carry no meaning outside attribute values. The
/// input is walked once, so an entity emitted for one character is
/// never re-escaped.
fn html_escape_text(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
async fn parse_cloud_response(resp: reqwest::Response) -> Result
{
let status = resp.status();
if status.is_success() {
return resp
.json()
.await
.map_err(|e| CloudError::ParseFailed(e.to_string()));
}
let body = resp.text().await.unwrap_or_default();
Err(CloudError::from_status_and_body(status.as_u16(), body))
}
// ---------------------------------------------------------------------------
// Detection
// ---------------------------------------------------------------------------
/// Reports whether `html` is a bot-protection challenge page instead of
/// the content that was requested.
///
/// Deliberately conservative: each rule targets signatures of a page
/// that is *entirely* a challenge. Signatures that also appear as
/// embedded widgets on real pages are gated on a small body size.
/// Matching is case-insensitive; size gates use the byte length of the
/// original body.
pub fn is_bot_protected(html: &str, headers: &HeaderMap) -> bool {
    let page = html.to_lowercase();
    let has = |needle: &str| page.contains(needle);
    let body_bytes = html.len();

    // Wording shared by the two Cloudflare interstitial rules below.
    let cf_interstitial_copy = || has("just a moment") || has("checking your browser");

    // Cloudflare challenge page.
    let cf_challenge = || has("_cf_chl_opt") || has("challenge-platform");

    // Cloudflare "Just a moment" / "Checking your browser" interstitial.
    let cf_spinner_page = || cf_interstitial_copy() && has("cf-spinner");

    // Cloudflare Turnstile: only a challenge when the page is small,
    // since legitimate pages embed it for signup forms etc.
    let cf_turnstile_page = || {
        (has("cf-turnstile") || has("challenges.cloudflare.com/turnstile"))
            && body_bytes < 100_000
    };

    // DataDome.
    let datadome = || has("geo.captcha-delivery.com") || has("captcha-delivery.com/captcha");

    // AWS WAF, captcha-branded path.
    let aws_waf_captcha = || has("awswaf-captcha") || has("aws-waf-client-browser");

    // AWS WAF "Verifying your connection" interstitial (used by
    // Trustpilot): a tiny HTML shell with an `interstitial-spinner` div.
    // The size gate keeps long pages that merely mention the phrase
    // from matching.
    let aws_waf_interstitial = || {
        has("interstitial-spinner") && has("verifying your connection") && body_bytes < 10_000
    };

    // hCaptcha *blocking* page (not just an embedded widget).
    let hcaptcha_wall = || has("hcaptcha.com") && has("h-captcha") && body_bytes < 50_000;

    // Cloudflare response headers combined with challenge wording.
    let cf_headers_with_copy = || {
        let served_by_cf =
            headers.get("cf-ray").is_some() || headers.get("cf-mitigated").is_some();
        served_by_cf && cf_interstitial_copy()
    };

    cf_challenge()
        || cf_spinner_page()
        || cf_turnstile_page()
        || datadome()
        || aws_waf_captcha()
        || aws_waf_interstitial()
        || hcaptcha_wall()
        || cf_headers_with_copy()
}
/// True when a page likely needs JS rendering — a large HTML document
/// with almost no extractable text + an SPA framework signature.
pub fn needs_js_rendering(word_count: usize, html: &str) -> bool {
let has_scripts = html.contains("".repeat(500)
);
assert!(needs_js_rendering(10, &html));
}
#[test]
fn needs_js_rendering_passes_real_article() {
    // A page reporting 5 000 words of text must not be flagged as
    // needing JS rendering.
    let article_body = "Real article text ".repeat(5_000);
    let html = format!("{}", article_body);
    let flagged = needs_js_rendering(5_000, &html);
    assert!(!flagged);
}
// --- CloudError mapping -------------------------------------------------
#[test]
fn cloud_error_maps_401() {
    // Status 401 must map to `Unauthorized`, and the rendered message
    // must contain `KEYS_URL`.
    let err = CloudError::from_status_and_body(401, String::from("invalid key"));
    let is_unauthorized = matches!(err, CloudError::Unauthorized);
    assert!(is_unauthorized);
    let rendered = err.to_string();
    assert!(rendered.contains(KEYS_URL));
}
#[test]
fn cloud_error_maps_402() {
    // Status 402 must map to `InsufficientPlan`, and the rendered
    // message must contain `PRICING_URL`.
    let err = CloudError::from_status_and_body(402, String::from("{}"));
    let is_insufficient_plan = matches!(err, CloudError::InsufficientPlan);
    assert!(is_insufficient_plan);
    let rendered = err.to_string();
    assert!(rendered.contains(PRICING_URL));
}
#[test]
fn cloud_error_maps_429() {
    // Status 429 must map to `RateLimited`, and the rendered message
    // must contain `PRICING_URL`.
    let err = CloudError::from_status_and_body(429, String::from("slow down"));
    let is_rate_limited = matches!(err, CloudError::RateLimited);
    assert!(is_rate_limited);
    let rendered = err.to_string();
    assert!(rendered.contains(PRICING_URL));
}
#[test]
fn cloud_error_maps_generic_5xx() {
    // A 503 with a 2 000-byte body must surface as `ServerError`
    // carrying the status and a body of at most 500 bytes.
    let oversized_body = "x".repeat(2000);
    let CloudError::ServerError { status, body } =
        CloudError::from_status_and_body(503, oversized_body)
    else {
        panic!("expected ServerError");
    };
    assert_eq!(status, 503);
    assert!(body.len() <= 500);
}
#[test]
fn not_configured_error_points_at_signup() {
    // The "not configured" message must contain both `SIGNUP_URL` and
    // the name of the environment variable to set.
    let rendered = CloudError::NotConfigured.to_string();
    let mentions_signup = rendered.contains(SIGNUP_URL);
    assert!(mentions_signup);
    let mentions_env_var = rendered.contains("WEBCLAW_API_KEY");
    assert!(mentions_env_var);
}
// --- CloudClient construction ------------------------------------------
// Serializes the tests that mutate `WEBCLAW_API_KEY`.
//
// The variable is process-global and cargo runs tests on parallel
// threads, so without this lock a test that sets the var can race a
// test asserting it is absent. Callers acquire it poison-tolerantly
// (`unwrap_or_else(|e| e.into_inner())`) so that one panicking test
// does not wedge the others.
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
#[test]
fn cloud_client_explicit_key_wins_over_env() {
    // An explicitly passed key must take precedence over the
    // `WEBCLAW_API_KEY` environment variable.
    let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
    // SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var
    // are unsafe on the 2024 toolchain.
    unsafe {
        std::env::set_var("WEBCLAW_API_KEY", "from-env");
    }
    let built = CloudClient::new(Some("from-flag"));
    // Restore the environment *before* any assertion can panic, so a
    // failure here never leaves "from-env" set for the rest of the
    // test process.
    // SAFETY: still holding ENV_LOCK, so no other env-mutating test runs.
    unsafe {
        std::env::remove_var("WEBCLAW_API_KEY");
    }
    let client = built.expect("client built");
    assert_eq!(client.api_key, "from-flag");
}
#[test]
fn cloud_client_none_when_empty() {
    // With no env var set, a missing, empty, or blank key must not
    // produce a client.
    let _env_guard = match ENV_LOCK.lock() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    };
    // SAFETY: env mutation is serialized by ENV_LOCK. Removing the var
    // (including any value inherited from the runner) is what makes
    // the assertions below deterministic.
    unsafe {
        std::env::remove_var("WEBCLAW_API_KEY");
    }
    let from_missing = CloudClient::new(None);
    assert!(from_missing.is_none());
    let from_empty = CloudClient::new(Some(""));
    assert!(from_empty.is_none());
    let from_blank = CloudClient::new(Some(" "));
    assert!(from_blank.is_none());
}
#[test]
fn cloud_client_base_url_strips_trailing_slash() {
    // A base URL given with a trailing slash must be reported without it.
    let client = CloudClient::with_key_and_base("k", "https://api.example.com/v1/");
    let normalized = client.base_url();
    assert_eq!(normalized, "https://api.example.com/v1");
}
#[test]
fn truncate_respects_char_boundaries() {
    // Ten ASCII bytes followed by a two-byte char: a limit of 11 must
    // not split the final character.
    let mut input = "a".repeat(10);
    input.push('é');
    let kept_chars = truncate(&input, 11).chars().count();
    assert_eq!(kept_chars, 11);
}
}