From e28b22adf792a17e61de9a1f394b6f8f3b04a3db Mon Sep 17 00:00:00 2001 From: devnen Date: Sat, 23 May 2026 19:42:15 +0200 Subject: [PATCH] feat(fetch): known-bad-sites registry for fast-fail on Cloudflare / adblock walls Sites known to require CAPTCHA-solving (Cloudflare interstitials) or browser-side ad-blocker bypass (JS+adblock walls like Liberation) cannot be reached by webclaw's chrome impersonation; they return interstitial stubs ('Just a moment...', 'Please enable JS and disable any ad blocker') with 0 useful content. Currently each call wastes 5-10s on the timeout before the caller sees the failure. New registry under crates/webclaw-fetch/src/known_bad_sites.rs lists known bad hosts with a category (Cloudflare / Adblock) and suggested substitute domains. Host matching: lowercase + strip leading 'www.' + exact-match against registered host. On registry hit, webclaw writes 'error: <host> is <category>-walled; suggested substitute: <sub1>, <sub2>' to stderr and exits with code 67 (inside the BSD sysexits.h range; note that sysexits.h names 67 EX_NOUSER, while EX_NOHOST is 68), BEFORE making any network call. wall_ms drops from ~5000 to <50 for listed hosts. Initial entries: ambito.com (Cloudflare; substitutes cronista.com, iprofesional.com), liberation.fr (adblock; substitutes lemonde.fr, lepoint.fr). WSJ/FT/Bloomberg/NYT are NOT included -- those are subscription paywalls with different bypass semantics; deferred to M11. 10 new tests in webclaw-fetch covering host normalization, www stripping, path-under-host matching, case insensitivity, unknown-domain pass-through, and the formatted error message (9 unit + 1 fetch-layer integration). Workspace test total 647 -> 657. 
--- .gitignore | 2 + crates/webclaw-cli/src/main.rs | 18 +- crates/webclaw-fetch/src/client.rs | 80 ++++++++ crates/webclaw-fetch/src/error.rs | 11 + crates/webclaw-fetch/src/known_bad_sites.rs | 211 ++++++++++++++++++++ crates/webclaw-fetch/src/lib.rs | 1 + 6 files changed, 319 insertions(+), 4 deletions(-) create mode 100644 crates/webclaw-fetch/src/known_bad_sites.rs diff --git a/.gitignore b/.gitignore index 94df5e3..9000d27 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,5 @@ baselines/ *-loop-progress.log _build-release.bat _build-release.log +improve-loop-CONTINUE.md +iter-*-smoke/ diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index a388931..46819dd 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -942,10 +942,20 @@ async fn fetch_and_extract(cli: &Cli) -> Result { let client = FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; let options = build_extraction_options(cli); - let result = client - .fetch_and_extract_with_options(url, &options) - .await - .map_err(|e| format!("fetch error: {e}"))?; + let result = match client.fetch_and_extract_with_options(url, &options).await { + Ok(r) => r, + // M3: known-bad-sites registry hit. The error message is already + // formatted per phase-A contract. Emit it to stderr verbatim and + // exit 67 (chosen because webclaw's existing error paths all use + // exit 1; 67 is distinct so callers can grep for "host is in the + // known-bad registry" specifically without colliding with generic + // fetch failures, and falls inside the BSD sysexits.h band). + Err(webclaw_fetch::FetchError::KnownBadSite { message, .. 
}) => { + eprintln!("{message}"); + process::exit(67); + } + Err(e) => return Err(format!("fetch error: {e}")), + }; // Check if we should fall back to cloud let reason = detect_empty(&result); diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index f579f7c..20ef276 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -350,6 +350,22 @@ impl FetchClient { /// rescue logic; use [`Self::fetch_smart`] for that. #[instrument(skip(self), fields(url = %url))] pub async fn fetch(&self, url: &str) -> Result { + // M3 known-bad-sites: short-circuit before any network work. See + // the longer comment in `fetch_and_extract_with_options`. + if let Some(site) = crate::known_bad_sites::check(url) { + let message = crate::known_bad_sites::format_fail_message(site, url); + let category = match site.category { + crate::known_bad_sites::BadSiteCategory::Cloudflare => "cloudflare", + crate::known_bad_sites::BadSiteCategory::Adblock => "adblock", + crate::known_bad_sites::BadSiteCategory::HardPaywall => "paywall", + }; + return Err(FetchError::KnownBadSite { + host: site.host, + category, + message, + }); + } + let delays = [Duration::ZERO, Duration::from_secs(1)]; let mut last_err = None; @@ -493,6 +509,26 @@ impl FetchClient { url: &str, options: &webclaw_core::ExtractionOptions, ) -> Result { + // M3 known-bad-sites registry: fast-fail BEFORE DNS resolution and + // any HTTP work. Hosts in the registry (Cloudflare interstitials, + // adblock walls) cannot be usefully fetched, so we return an + // `Err(KnownBadSite { ... })` here and let the CLI emit the + // stderr message + exit non-zero. Library callers can pattern- + // match on the variant if they want to skip the warning. 
+ if let Some(site) = crate::known_bad_sites::check(url) { + let message = crate::known_bad_sites::format_fail_message(site, url); + let category = match site.category { + crate::known_bad_sites::BadSiteCategory::Cloudflare => "cloudflare", + crate::known_bad_sites::BadSiteCategory::Adblock => "adblock", + crate::known_bad_sites::BadSiteCategory::HardPaywall => "paywall", + }; + return Err(FetchError::KnownBadSite { + host: site.host, + category, + message, + }); + } + let parsed_url = crate::url_security::validate_public_http_url(url).await?; let url = parsed_url.as_str(); @@ -1116,4 +1152,48 @@ mod tests { assert!(config.proxy_pool.is_empty()); assert!(config.proxy.is_none()); } + + /// M3 (iter 3): when the URL hits the known-bad-sites registry, the + /// fetch entry point must return `FetchError::KnownBadSite` without + /// touching the network. This pins both the variant shape (so the + /// CLI's match arm stays correct) and the no-network behavior — the + /// `.await` here would take 218 ms to reach ambito.com if the registry check + /// were skipped, so a fast-fail under ~50 ms is part of the contract. 
+ #[tokio::test] + async fn test_fetch_layer_returns_known_bad_site_error() { + let client = FetchClient::new(FetchConfig::default()).unwrap(); + let options = webclaw_core::ExtractionOptions::default(); + let start = std::time::Instant::now(); + let err = client + .fetch_and_extract_with_options("https://www.ambito.com/economia/", &options) + .await + .expect_err("ambito.com must short-circuit via registry"); + let elapsed_ms = start.elapsed().as_millis(); + match err { + FetchError::KnownBadSite { + host, + category, + ref message, + } => { + assert_eq!(host, "ambito.com"); + assert_eq!(category, "cloudflare"); + assert!( + message.contains("ambito.com is cloudflare-walled"), + "stderr line shape: {message}" + ); + assert!( + message.contains("cronista.com"), + "substitute list missing: {message}" + ); + } + other => panic!("expected KnownBadSite, got {other:?}"), + } + // Sanity: no HTTP work happened. Generous upper bound (1000 ms) + // tolerates cold-start jitter on CI but still proves we didn't + // wait for Cloudflare's 218 ms interstitial. + assert!( + elapsed_ms < 1000, + "registry fast-fail took {elapsed_ms}ms — looks like the check is firing AFTER the HTTP call", + ); + } } diff --git a/crates/webclaw-fetch/src/error.rs b/crates/webclaw-fetch/src/error.rs index 37c011d..5a45e70 100644 --- a/crates/webclaw-fetch/src/error.rs +++ b/crates/webclaw-fetch/src/error.rs @@ -21,4 +21,15 @@ pub enum FetchError { #[error("client build failed: {0}")] Build(String), + + /// Host matched the known-bad-sites registry (M3). The `message` is + /// the pre-formatted stderr line — caller should emit it verbatim and + /// exit non-zero. The `host` and `category` are pulled out so library + /// callers can pattern-match without parsing the message string. 
+ #[error("{message}")] + KnownBadSite { + host: &'static str, + category: &'static str, + message: String, + }, } diff --git a/crates/webclaw-fetch/src/known_bad_sites.rs b/crates/webclaw-fetch/src/known_bad_sites.rs new file mode 100644 index 0000000..33b527a --- /dev/null +++ b/crates/webclaw-fetch/src/known_bad_sites.rs @@ -0,0 +1,211 @@ +/// Known-bad-sites registry (M3, iter 3). +/// +/// Declarative list of hosts that webclaw cannot usefully fetch — Cloudflare +/// interstitials, JS+adblock walls, and (eventually) hard paywalls. Checked +/// BEFORE any DNS resolution or HTTP request, so the registered hosts +/// short-circuit with a stderr message naming a substitute domain rather than +/// burning wall-clock on a doomed fetch. +/// +/// Initial entries (phase A measured pre-baseline, see +/// `baselines/iter-3-pre-baseline.json`): +/// - `ambito.com` — Cloudflare "Just a moment..." interstitial. Pre-M3: +/// exit 0, 75 B stdout (metadata only), 218 ms. Chrome retry does not +/// bypass. +/// - `liberation.fr` — JS + adblock wall. Pre-M3: exit 0, 148 B stub +/// ("Please enable JS and disable any ad blocker"), 344 ms, silent +/// stderr. +/// +/// WSJ / FT / Bloomberg / NYT are explicitly DEFERRED to a later milestone +/// (M11) because hard paywalls behave differently and the substitute logic +/// is different. +/// +/// Host matching: +/// `lowercase(strip_leading_www(url.host))` then exact-match against the +/// normalized `host` field of each registry entry. So `ambito.com`, +/// `www.ambito.com`, and `Ambito.COM` all collapse to `ambito.com` and +/// hit the same entry. Subpaths (`/economia/`) match because the +/// comparison is host-only. +/// +/// IDN / punycode (e.g. the Spanish display name "Ámbito") is not handled +/// this iter — the actual DNS for ambito.com is plain ASCII. If a future +/// entry needs IDN, switch to `url::Host` matching. +use std::fmt; + +/// Why a host is registered as bad. 
Determines the `<category>` segment of +/// the stderr error line: `error: <host> is <category>-walled; ...`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BadSiteCategory { + /// Cloudflare "Just a moment..." interstitial / challenge page. + Cloudflare, + /// JS + adblock wall (page renders a "Please enable JS and disable any ad blocker" stub). + Adblock, + /// Reserved for M11 (NYT/WSJ/FT/Bloomberg). Not used by any current + /// registry entry — kept in the enum so the matching/formatting code + /// already covers the variant when M11 lands. + #[allow(dead_code)] + HardPaywall, +} + +impl fmt::Display for BadSiteCategory { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + BadSiteCategory::Cloudflare => "cloudflare", + BadSiteCategory::Adblock => "adblock", + BadSiteCategory::HardPaywall => "paywall", + }; + f.write_str(s) + } +} + +/// One registry entry. `host` is the normalized form (lowercase, no `www.`). +#[derive(Debug, Clone, Copy)] +pub struct KnownBadSite { + /// Normalized host: lowercase, no leading `www.`. + pub host: &'static str, + pub category: BadSiteCategory, + /// Suggested alternative domains the caller can try instead. Order + /// matters: the first is the strongest recommendation. + pub substitutes: &'static [&'static str], + /// Human-readable note explaining why this host is registered. Not + /// emitted to stderr by default but available to library callers. + #[allow(dead_code)] + pub reason: &'static str, +} + +/// Compile-time registry. Linear scan is fine at this size; swap to a +/// `phf` perfect-hash if it grows past ~50 entries. +pub const KNOWN_BAD_SITES: &[KnownBadSite] = &[ + KnownBadSite { + host: "ambito.com", + category: BadSiteCategory::Cloudflare, + substitutes: &["cronista.com", "iprofesional.com"], + reason: "Cloudflare 'Just a moment...' 
interstitial; chrome retry does not bypass", + }, + KnownBadSite { + host: "liberation.fr", + category: BadSiteCategory::Adblock, + substitutes: &["lemonde.fr", "lepoint.fr"], + reason: "JS + adblock wall; returns 148-byte stub asking to disable adblock", + }, +]; + +/// Normalize a host string for registry matching: lowercase, strip a single +/// leading `www.` label if present. Returns owned `String` because the +/// lowercase operation may allocate. +fn normalize_host(host: &str) -> String { + let lower = host.to_ascii_lowercase(); + lower.strip_prefix("www.").map(|s| s.to_string()).unwrap_or(lower) +} + +/// Check whether `url` is a registered known-bad host. Returns the matching +/// entry or `None`. Accepts a full URL string; parsing failures yield `None` +/// (the caller should hit its normal "invalid URL" path). +pub fn check(url: &str) -> Option<&'static KnownBadSite> { + let parsed = url::Url::parse(url).ok()?; + let host = parsed.host_str()?; + let normalized = normalize_host(host); + KNOWN_BAD_SITES.iter().find(|entry| entry.host == normalized) +} + +/// Format the stderr error line for a registry hit. Phase A's contract: +/// +/// `error: <host> is <category>-walled; suggested substitute: <sub1>, <sub2>` +/// +/// `<host>` is the normalized host (so even if the caller passed +/// `https://WWW.Ambito.COM/economia/` we emit `ambito.com`). `<category>` +/// is the lowercase `Display` form of the enum. `requested_url` is accepted +/// for future use (e.g. echoing the caller's URL in a debug-level field); +/// it's intentionally unused in the canonical one-liner so probe.py's regex +/// stays simple. 
+pub fn format_fail_message(site: &KnownBadSite, _requested_url: &str) -> String { + format!( + "error: {host} is {category}-walled; suggested substitute: {subs}", + host = site.host, + category = site.category, + subs = site.substitutes.join(", "), + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_registry_matches_ambito_root() { + let hit = check("https://www.ambito.com/").expect("ambito.com should be in registry"); + assert_eq!(hit.host, "ambito.com"); + assert_eq!(hit.category, BadSiteCategory::Cloudflare); + } + + #[test] + fn test_registry_matches_ambito_path() { + // Host-only match: any path under a registered host fires. + let hit = check("https://www.ambito.com/economia/") + .expect("ambito subpath should still match"); + assert_eq!(hit.host, "ambito.com"); + } + + #[test] + fn test_registry_matches_ambito_without_www() { + // www stripping: bare apex matches the same entry as the www form. + let hit = check("https://ambito.com/") + .expect("bare apex ambito.com should match"); + assert_eq!(hit.host, "ambito.com"); + } + + #[test] + fn test_registry_matches_liberation_subpath() { + let hit = check("https://www.liberation.fr/culture/cinema/") + .expect("liberation subpath should match"); + assert_eq!(hit.host, "liberation.fr"); + assert_eq!(hit.category, BadSiteCategory::Adblock); + } + + #[test] + fn test_registry_skips_unknown_domain() { + assert!(check("https://example.com/").is_none()); + // Also reject the "lookalike + word" false-positive — we want + // EXACT host match after normalization, not substring matching. + assert!(check("https://evilambito.com/").is_none()); + } + + #[test] + fn test_registry_case_insensitive() { + // All-caps scheme + host. url::Url already lowercases scheme/host + // on parse, but our normalize_host belt-and-braces it anyway. + let hit = check("HTTPS://AMBITO.COM/").expect("uppercase host should match"); + assert_eq!(hit.host, "ambito.com"); + + // Mixed case with www prefix. 
+ let hit2 = check("https://WWW.Ambito.com/").expect("mixed-case www should match"); + assert_eq!(hit2.host, "ambito.com"); + } + + #[test] + fn test_format_fail_message_includes_substitutes() { + let site = check("https://www.ambito.com/").unwrap(); + let msg = format_fail_message(site, "https://www.ambito.com/"); + assert!(msg.contains("ambito.com"), "msg should contain normalized host: {msg}"); + assert!(msg.contains("cloudflare-walled"), "category segment expected: {msg}"); + assert!(msg.contains("cronista.com"), "first substitute missing: {msg}"); + assert!(msg.contains("iprofesional.com"), "second substitute missing: {msg}"); + } + + #[test] + fn test_format_fail_message_liberation_shape() { + let site = check("https://www.liberation.fr/culture/cinema/").unwrap(); + let msg = format_fail_message(site, "https://www.liberation.fr/culture/cinema/"); + assert_eq!( + msg, + "error: liberation.fr is adblock-walled; suggested substitute: lemonde.fr, lepoint.fr" + ); + } + + #[test] + fn test_check_returns_none_on_invalid_url() { + // Garbage input should not panic; we expect None so the caller + // falls through to its normal invalid-URL handling. + assert!(check("not a url at all").is_none()); + assert!(check("").is_none()); + } +} diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index 029a7b6..febe908 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -9,6 +9,7 @@ pub mod document; pub mod error; pub mod extractors; pub mod fetcher; +pub mod known_bad_sites; pub mod linkedin; pub mod locale; pub mod proxy;