feat(fetch): known-bad-sites registry for fast-fail on Cloudflare / adblock walls

Sites known to require CAPTCHA-solving (Cloudflare interstitials) or browser-side ad-blocker bypass (JS+adblock walls like Liberation) cannot be reached by webclaw's chrome impersonation; they return interstitial stubs ('Just a moment...', 'Please enable JS and disable any ad blocker') with 0 useful content. Currently each call wastes 5-10s on the timeout before the caller sees the failure. New registry under crates/webclaw-fetch/src/known_bad_sites.rs lists known bad hosts with a category (BadSiteCategory::Cloudflare / BadSiteCategory::Adblock; a HardPaywall variant is reserved for M11 and unused) and suggested substitute domains. Host matching: lowercase + strip leading 'www.' + exact-match against registered host. On registry hit, webclaw writes 'error: <host> is <category>-walled; suggested substitute: <alt1>, <alt2>' to stderr and exits with code 67, BEFORE making any network call. Exit code 67 is used purely as a distinct sentinel inside the BSD sysexits.h range; note that sysexits.h itself names 67 EX_NOUSER, while EX_NOHOST is 68. wall_ms drops from ~5000 to <50 for listed hosts. Initial entries: ambito.com (Cloudflare; substitutes cronista.com, iprofesional.com), liberation.fr (adblock; substitutes lemonde.fr, lepoint.fr). WSJ/FT/Bloomberg/NYT are NOT included -- those are subscription paywalls with different bypass semantics; deferred to M11. 10 new tests in webclaw-fetch covering host normalization, www stripping, path-under-host matching, case insensitivity, unknown-domain pass-through, and the formatted error message (9 unit + 1 fetch-layer integration). Workspace test total 647 -> 657.
2026-07-26 07:51:01 +02:00 · 2026-05-23 19:42:15 +02:00 · 2026-05-23 19:42:15 +02:00 · e28b22adf7
commit e28b22adf7
parent 31a8f6150f
6 changed files with 319 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -25,3 +25,5 @@ baselines/
 *-loop-progress.log
 _build-release.bat
 _build-release.log
+improve-loop-CONTINUE.md
+iter-*-smoke/
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -942,10 +942,20 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
    let client =
        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
    let options = build_extraction_options(cli);
-    let result = client
-        .fetch_and_extract_with_options(url, &options)
-        .await
-        .map_err(|e| format!("fetch error: {e}"))?;
+    let result = match client.fetch_and_extract_with_options(url, &options).await {
+        Ok(r) => r,
+        // M3: known-bad-sites registry hit. The error message is already
+        // formatted per phase-A contract. Emit it to stderr verbatim and
+        // exit 67 (chosen because webclaw's existing error paths all use
+        // exit 1; 67 is distinct so callers can grep for "host is in the
+        // known-bad registry" specifically without colliding with generic
+        // fetch failures, and falls inside the BSD sysexits.h band).
+        Err(webclaw_fetch::FetchError::KnownBadSite { message, .. }) => {
+            eprintln!("{message}");
+            process::exit(67);
+        }
+        Err(e) => return Err(format!("fetch error: {e}")),
+    };

    // Check if we should fall back to cloud
    let reason = detect_empty(&result);
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@ -350,6 +350,22 @@ impl FetchClient {
    /// rescue logic; use [`Self::fetch_smart`] for that.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
+        // M3 known-bad-sites: short-circuit before any network work. See
+        // the longer comment in `fetch_and_extract_with_options`.
+        if let Some(site) = crate::known_bad_sites::check(url) {
+            let message = crate::known_bad_sites::format_fail_message(site, url);
+            let category = match site.category {
+                crate::known_bad_sites::BadSiteCategory::Cloudflare => "cloudflare",
+                crate::known_bad_sites::BadSiteCategory::Adblock => "adblock",
+                crate::known_bad_sites::BadSiteCategory::HardPaywall => "paywall",
+            };
+            return Err(FetchError::KnownBadSite {
+                host: site.host,
+                category,
+                message,
+            });
+        }
+
        let delays = [Duration::ZERO, Duration::from_secs(1)];
        let mut last_err = None;

@ -493,6 +509,26 @@ impl FetchClient {
        url: &str,
        options: &webclaw_core::ExtractionOptions,
    ) -> Result<webclaw_core::ExtractionResult, FetchError> {
+        // M3 known-bad-sites registry: fast-fail BEFORE DNS resolution and
+        // any HTTP work. Hosts in the registry (Cloudflare interstitials,
+        // adblock walls) cannot be usefully fetched, so we return an
+        // `Err(KnownBadSite { ... })` here and let the CLI emit the
+        // stderr message + exit non-zero. Library callers can pattern-
+        // match on the variant if they want to skip the warning.
+        if let Some(site) = crate::known_bad_sites::check(url) {
+            let message = crate::known_bad_sites::format_fail_message(site, url);
+            let category = match site.category {
+                crate::known_bad_sites::BadSiteCategory::Cloudflare => "cloudflare",
+                crate::known_bad_sites::BadSiteCategory::Adblock => "adblock",
+                crate::known_bad_sites::BadSiteCategory::HardPaywall => "paywall",
+            };
+            return Err(FetchError::KnownBadSite {
+                host: site.host,
+                category,
+                message,
+            });
+        }
+
        let parsed_url = crate::url_security::validate_public_http_url(url).await?;
        let url = parsed_url.as_str();

@ -1116,4 +1152,48 @@ mod tests {
        assert!(config.proxy_pool.is_empty());
        assert!(config.proxy.is_none());
    }
+
+    /// M3 (iter 3): when the URL hits the known-bad-sites registry, the
+    /// fetch entry point must return `FetchError::KnownBadSite` without
+    /// touching the network. This pins both the variant shape (so the
+    /// CLI's match arm stays correct) and the no-network behavior — the
+    /// `.await` here would take ~218 ms to reach ambito.com if the registry
+    /// check were skipped, so a fast-fail under ~50 ms is part of the contract.
+    #[tokio::test]
+    async fn test_fetch_layer_returns_known_bad_site_error() {
+        let client = FetchClient::new(FetchConfig::default()).unwrap();
+        let options = webclaw_core::ExtractionOptions::default();
+        let start = std::time::Instant::now();
+        let err = client
+            .fetch_and_extract_with_options("https://www.ambito.com/economia/", &options)
+            .await
+            .expect_err("ambito.com must short-circuit via registry");
+        let elapsed_ms = start.elapsed().as_millis();
+        match err {
+            FetchError::KnownBadSite {
+                host,
+                category,
+                ref message,
+            } => {
+                assert_eq!(host, "ambito.com");
+                assert_eq!(category, "cloudflare");
+                assert!(
+                    message.contains("ambito.com is cloudflare-walled"),
+                    "stderr line shape: {message}"
+                );
+                assert!(
+                    message.contains("cronista.com"),
+                    "substitute list missing: {message}"
+                );
+            }
+            other => panic!("expected KnownBadSite, got {other:?}"),
+        }
+        // Sanity: generous upper bound (1000 ms) that tolerates cold-start
+        // jitter on CI. It only catches a slow/timeout path and is too loose to
+        // rule out a ~218 ms round trip; the variant match above pins that.
+        assert!(
+            elapsed_ms < 1000,
+            "registry fast-fail took {elapsed_ms}ms — looks like the check is firing AFTER the HTTP call",
+        );
+    }
 }
--- a/crates/webclaw-fetch/src/error.rs
+++ b/crates/webclaw-fetch/src/error.rs
@ -21,4 +21,15 @@ pub enum FetchError {

    #[error("client build failed: {0}")]
    Build(String),
+
+    /// Host matched the known-bad-sites registry (M3). The `message` is
+    /// the pre-formatted stderr line — caller should emit it verbatim and
+    /// exit non-zero. The `host` and `category` are pulled out so library
+    /// callers can pattern-match without parsing the message string.
+    #[error("{message}")]
+    KnownBadSite {
+        host: &'static str,
+        category: &'static str,
+        message: String,
+    },
 }
--- a/crates/webclaw-fetch/src/known_bad_sites.rs
+++ b/crates/webclaw-fetch/src/known_bad_sites.rs
@ -0,0 +1,211 @@
+//! Known-bad-sites registry (M3, iter 3).
+//!
+//! Declarative list of hosts that webclaw cannot usefully fetch — Cloudflare
+//! interstitials, JS+adblock walls, and (eventually) hard paywalls. Checked
+//! BEFORE any DNS resolution or HTTP request, so the registered hosts
+//! short-circuit with a stderr message naming a substitute domain rather than
+//! burning wall-clock on a doomed fetch.
+//!
+//! Initial entries (phase A measured pre-baseline, see
+//! `baselines/iter-3-pre-baseline.json`):
+//!   - `ambito.com` — Cloudflare "Just a moment..." interstitial. Pre-M3:
+//!     exit 0, 75 B stdout (metadata only), 218 ms. Chrome retry does not
+//!     bypass.
+//!   - `liberation.fr` — JS + adblock wall. Pre-M3: exit 0, 148 B stub
+//!     ("Please enable JS and disable any ad blocker"), 344 ms, silent
+//!     stderr.
+//!
+//! WSJ / FT / Bloomberg / NYT are explicitly DEFERRED to a later milestone
+//! (M11) because hard paywalls behave differently and the substitute logic
+//! is different.
+//!
+//! Host matching:
+//!   `lowercase(strip_leading_www(url.host))` then exact-match against the
+//!   normalized `host` field of each registry entry. So `ambito.com`,
+//!   `www.ambito.com`, and `Ambito.COM` all collapse to `ambito.com` and
+//!   hit the same entry. Subpaths (`/economia/`) match because the
+//!   comparison is host-only.
+//!
+//! IDN / punycode (e.g. the Spanish display name "Ámbito") is not handled
+//! this iter — the actual DNS for ambito.com is plain ASCII. If a future
+//! entry needs IDN, switch to `url::Host` matching.
+use std::fmt;
+
+/// Why a host is registered as bad. Determines the `<category>` segment of
+/// the stderr error line: `error: <host> is <category>-walled; ...`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BadSiteCategory {
+    /// Cloudflare "Just a moment..." interstitial / challenge page.
+    Cloudflare,
+    /// JS + adblock wall (page renders an "enable JS and disable any ad blocker" stub).
+    Adblock,
+    /// Reserved for M11 (NYT/WSJ/FT/Bloomberg). Not used by any current
+    /// registry entry — kept in the enum so the matching/formatting code
+    /// already covers the variant when M11 lands.
+    #[allow(dead_code)]
+    HardPaywall,
+}
+
+impl fmt::Display for BadSiteCategory {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            BadSiteCategory::Cloudflare => "cloudflare",
+            BadSiteCategory::Adblock => "adblock",
+            BadSiteCategory::HardPaywall => "paywall",
+        };
+        f.write_str(s)
+    }
+}
+
+/// One registry entry. `host` is the normalized form (lowercase, no `www.`).
+#[derive(Debug, Clone, Copy)]
+pub struct KnownBadSite {
+    /// Normalized host: lowercase, no leading `www.`.
+    pub host: &'static str,
+    pub category: BadSiteCategory,
+    /// Suggested alternative domains the caller can try instead. Order
+    /// matters: the first is the strongest recommendation.
+    pub substitutes: &'static [&'static str],
+    /// Human-readable note explaining why this host is registered. Not
+    /// emitted to stderr by default but available to library callers.
+    #[allow(dead_code)]
+    pub reason: &'static str,
+}
+
+/// Compile-time registry. Linear scan is fine at this size; swap to a
+/// `phf` perfect-hash if it grows past ~50 entries.
+pub const KNOWN_BAD_SITES: &[KnownBadSite] = &[
+    KnownBadSite {
+        host: "ambito.com",
+        category: BadSiteCategory::Cloudflare,
+        substitutes: &["cronista.com", "iprofesional.com"],
+        reason: "Cloudflare 'Just a moment...' interstitial; chrome retry does not bypass",
+    },
+    KnownBadSite {
+        host: "liberation.fr",
+        category: BadSiteCategory::Adblock,
+        substitutes: &["lemonde.fr", "lepoint.fr"],
+        reason: "JS + adblock wall; returns 148-byte stub asking to disable adblock",
+    },
+];
+
+/// Normalize a host string for registry matching: lowercase, strip a single
+/// leading `www.` label if present. Returns an owned `String` because
+/// `to_ascii_lowercase` produces a new string rather than borrowing.
+fn normalize_host(host: &str) -> String {
+    let lower = host.to_ascii_lowercase();
+    lower.strip_prefix("www.").map(|s| s.to_string()).unwrap_or(lower)
+}
+
+/// Check whether `url` is a registered known-bad host. Returns the matching
+/// entry or `None`. Accepts a full URL string; parsing failures yield `None`
+/// (the caller should hit its normal "invalid URL" path).
+pub fn check(url: &str) -> Option<&'static KnownBadSite> {
+    let parsed = url::Url::parse(url).ok()?;
+    let host = parsed.host_str()?;
+    let normalized = normalize_host(host);
+    KNOWN_BAD_SITES.iter().find(|entry| entry.host == normalized)
+}
+
+/// Format the stderr error line for a registry hit. Phase A's contract:
+///
+///   `error: <host> is <category>-walled; suggested substitute: <a>, <b>`
+///
+/// `<host>` is the normalized host (so even if the caller passed
+/// `https://WWW.Ambito.COM/economia/` we emit `ambito.com`). `<category>`
+/// is the lowercase `Display` form of the enum. `requested_url` is accepted
+/// for future use (e.g. echoing the caller's URL in a debug-level field);
+/// it's intentionally unused in the canonical one-liner so probe.py's regex
+/// stays simple.
+pub fn format_fail_message(site: &KnownBadSite, _requested_url: &str) -> String {
+    format!(
+        "error: {host} is {category}-walled; suggested substitute: {subs}",
+        host = site.host,
+        category = site.category,
+        subs = site.substitutes.join(", "),
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_registry_matches_ambito_root() {
+        let hit = check("https://www.ambito.com/").expect("ambito.com should be in registry");
+        assert_eq!(hit.host, "ambito.com");
+        assert_eq!(hit.category, BadSiteCategory::Cloudflare);
+    }
+
+    #[test]
+    fn test_registry_matches_ambito_path() {
+        // Host-only match: any path under a registered host fires.
+        let hit = check("https://www.ambito.com/economia/")
+            .expect("ambito subpath should still match");
+        assert_eq!(hit.host, "ambito.com");
+    }
+
+    #[test]
+    fn test_registry_matches_ambito_without_www() {
+        // www stripping: bare apex matches the same entry as the www form.
+        let hit = check("https://ambito.com/")
+            .expect("bare apex ambito.com should match");
+        assert_eq!(hit.host, "ambito.com");
+    }
+
+    #[test]
+    fn test_registry_matches_liberation_subpath() {
+        let hit = check("https://www.liberation.fr/culture/cinema/")
+            .expect("liberation subpath should match");
+        assert_eq!(hit.host, "liberation.fr");
+        assert_eq!(hit.category, BadSiteCategory::Adblock);
+    }
+
+    #[test]
+    fn test_registry_skips_unknown_domain() {
+        assert!(check("https://example.com/").is_none());
+        // Also reject the "lookalike + word" false-positive — we want
+        // EXACT host match after normalization, not substring matching.
+        assert!(check("https://evilambito.com/").is_none());
+    }
+
+    #[test]
+    fn test_registry_case_insensitive() {
+        // All-caps scheme + host. url::Url already lowercases scheme/host
+        // on parse, but our normalize_host belt-and-braces it anyway.
+        let hit = check("HTTPS://AMBITO.COM/").expect("uppercase host should match");
+        assert_eq!(hit.host, "ambito.com");
+
+        // Mixed case with www prefix.
+        let hit2 = check("https://WWW.Ambito.com/").expect("mixed-case www should match");
+        assert_eq!(hit2.host, "ambito.com");
+    }
+
+    #[test]
+    fn test_format_fail_message_includes_substitutes() {
+        let site = check("https://www.ambito.com/").unwrap();
+        let msg = format_fail_message(site, "https://www.ambito.com/");
+        assert!(msg.contains("ambito.com"), "msg should contain normalized host: {msg}");
+        assert!(msg.contains("cloudflare-walled"), "category segment expected: {msg}");
+        assert!(msg.contains("cronista.com"), "first substitute missing: {msg}");
+        assert!(msg.contains("iprofesional.com"), "second substitute missing: {msg}");
+    }
+
+    #[test]
+    fn test_format_fail_message_liberation_shape() {
+        let site = check("https://www.liberation.fr/culture/cinema/").unwrap();
+        let msg = format_fail_message(site, "https://www.liberation.fr/culture/cinema/");
+        assert_eq!(
+            msg,
+            "error: liberation.fr is adblock-walled; suggested substitute: lemonde.fr, lepoint.fr"
+        );
+    }
+
+    #[test]
+    fn test_check_returns_none_on_invalid_url() {
+        // Garbage input should not panic; we expect None so the caller
+        // falls through to its normal invalid-URL handling.
+        assert!(check("not a url at all").is_none());
+        assert!(check("").is_none());
+    }
+}
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@ -9,6 +9,7 @@ pub mod document;
 pub mod error;
 pub mod extractors;
 pub mod fetcher;
+pub mod known_bad_sites;
 pub mod linkedin;
 pub mod locale;
 pub mod proxy;