feat(fetch): known-bad-sites registry for fast-fail on Cloudflare / adblock walls

Sites known to require CAPTCHA-solving (Cloudflare interstitials) or
browser-side ad-blocker bypass (JS+adblock walls like Liberation) cannot
be reached by webclaw's chrome impersonation; they return interstitial
stubs ('Just a moment...', 'Please enable JS and disable any ad blocker')
with 0 useful content. Currently each call wastes 5-10s on the timeout
before the caller sees the failure.

New registry under crates/webclaw-fetch/src/known_bad_sites.rs lists
known bad hosts with a category (Cloudflare / Adblock; HardPaywall is
reserved for M11) and suggested substitute domains. Host matching:
lowercase + strip leading 'www.' + exact-match against registered host.

On registry hit, webclaw writes 'error: <host> is <category>-walled;
suggested substitute: <alt1>, <alt2>' to stderr and exits with code 67
(inside the BSD sysexits.h range; note that sysexits.h names 67
EX_NOUSER, EX_NOHOST is 68), BEFORE making any network call. wall_ms
drops from ~5000 to <50 for listed hosts.

Initial entries: ambito.com (Cloudflare; substitutes cronista.com,
iprofesional.com), liberation.fr (adblock; substitutes lemonde.fr,
lepoint.fr). WSJ/FT/Bloomberg/NYT are NOT included -- those are
subscription paywalls with different bypass semantics; deferred to M11.

10 new tests in webclaw-fetch covering host normalization, www
stripping, path-under-host matching, case insensitivity, unknown-domain
pass-through, and the formatted error message (9 unit + 1 fetch-layer
integration). Workspace test total 647 -> 657.
This commit is contained in:
devnen 2026-05-23 19:42:15 +02:00
parent 31a8f6150f
commit e28b22adf7
6 changed files with 319 additions and 4 deletions

2
.gitignore vendored
View file

@ -25,3 +25,5 @@ baselines/
*-loop-progress.log
_build-release.bat
_build-release.log
improve-loop-CONTINUE.md
iter-*-smoke/

View file

@ -942,10 +942,20 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let result = client
.fetch_and_extract_with_options(url, &options)
.await
.map_err(|e| format!("fetch error: {e}"))?;
let result = match client.fetch_and_extract_with_options(url, &options).await {
Ok(r) => r,
// M3: known-bad-sites registry hit. The error message is already
// formatted per phase-A contract. Emit it to stderr verbatim and
// exit 67 (chosen because webclaw's existing error paths all use
// exit 1; 67 is distinct so callers can grep for "host is in the
// known-bad registry" specifically without colliding with generic
// fetch failures, and falls inside the BSD sysexits.h band).
Err(webclaw_fetch::FetchError::KnownBadSite { message, .. }) => {
eprintln!("{message}");
process::exit(67);
}
Err(e) => return Err(format!("fetch error: {e}")),
};
// Check if we should fall back to cloud
let reason = detect_empty(&result);

View file

@ -350,6 +350,22 @@ impl FetchClient {
/// rescue logic; use [`Self::fetch_smart`] for that.
#[instrument(skip(self), fields(url = %url))]
pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
// M3 known-bad-sites: short-circuit before any network work. See
// the longer comment in `fetch_and_extract_with_options`.
if let Some(site) = crate::known_bad_sites::check(url) {
let message = crate::known_bad_sites::format_fail_message(site, url);
let category = match site.category {
crate::known_bad_sites::BadSiteCategory::Cloudflare => "cloudflare",
crate::known_bad_sites::BadSiteCategory::Adblock => "adblock",
crate::known_bad_sites::BadSiteCategory::HardPaywall => "paywall",
};
return Err(FetchError::KnownBadSite {
host: site.host,
category,
message,
});
}
let delays = [Duration::ZERO, Duration::from_secs(1)];
let mut last_err = None;
@ -493,6 +509,26 @@ impl FetchClient {
url: &str,
options: &webclaw_core::ExtractionOptions,
) -> Result<webclaw_core::ExtractionResult, FetchError> {
// M3 known-bad-sites registry: fast-fail BEFORE DNS resolution and
// any HTTP work. Hosts in the registry (Cloudflare interstitials,
// adblock walls) cannot be usefully fetched, so we return an
// `Err(KnownBadSite { ... })` here and let the CLI emit the
// stderr message + exit non-zero. Library callers can pattern-
// match on the variant if they want to skip the warning.
if let Some(site) = crate::known_bad_sites::check(url) {
let message = crate::known_bad_sites::format_fail_message(site, url);
let category = match site.category {
crate::known_bad_sites::BadSiteCategory::Cloudflare => "cloudflare",
crate::known_bad_sites::BadSiteCategory::Adblock => "adblock",
crate::known_bad_sites::BadSiteCategory::HardPaywall => "paywall",
};
return Err(FetchError::KnownBadSite {
host: site.host,
category,
message,
});
}
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
let url = parsed_url.as_str();
@ -1116,4 +1152,48 @@ mod tests {
assert!(config.proxy_pool.is_empty());
assert!(config.proxy.is_none());
}
/// M3 (iter 3): a URL whose host is in the known-bad-sites registry must
/// come back from the fetch entry point as `FetchError::KnownBadSite`.
/// The assertions pin the variant's shape (the CLI's match arm depends on
/// it) and the content of the pre-formatted message, then put a coarse
/// ceiling on the elapsed wall-clock time.
///
/// The time ceiling is deliberately loose so CI cold-start jitter does not
/// make the test flaky; the variant assertion is the primary evidence
/// that the registry short-circuit fired.
#[tokio::test]
async fn test_fetch_layer_returns_known_bad_site_error() {
    let options = webclaw_core::ExtractionOptions::default();
    let client = FetchClient::new(FetchConfig::default()).unwrap();

    // Time only the call under test.
    let start = std::time::Instant::now();
    let outcome = client
        .fetch_and_extract_with_options("https://www.ambito.com/economia/", &options)
        .await;
    let elapsed_ms = start.elapsed().as_millis();

    let err = outcome.expect_err("ambito.com must short-circuit via registry");
    match &err {
        FetchError::KnownBadSite {
            host,
            category,
            message,
        } => {
            // Structured fields: normalized host and lowercase category label.
            assert_eq!(*host, "ambito.com");
            assert_eq!(*category, "cloudflare");
            // The message is the canonical one-liner the CLI prints verbatim.
            assert!(
                message.contains("ambito.com is cloudflare-walled"),
                "stderr line shape: {message}"
            );
            assert!(
                message.contains("cronista.com"),
                "substitute list missing: {message}"
            );
        }
        other => panic!("expected KnownBadSite, got {other:?}"),
    }

    // Coarse sanity bound on the fast-fail path (see the doc comment).
    assert!(
        elapsed_ms < 1000,
        "registry fast-fail took {elapsed_ms}ms — looks like the check is firing AFTER the HTTP call",
    );
}
}

View file

@ -21,4 +21,15 @@ pub enum FetchError {
#[error("client build failed: {0}")]
Build(String),
/// Host matched the known-bad-sites registry (M3). The `message` is
/// the pre-formatted stderr line — caller should emit it verbatim and
/// exit non-zero. The `host` and `category` are pulled out so library
/// callers can pattern-match without parsing the message string.
#[error("{message}")]
KnownBadSite {
host: &'static str,
category: &'static str,
message: String,
},
}

View file

@ -0,0 +1,211 @@
//! Known-bad-sites registry (M3, iter 3).
//!
//! Declarative list of hosts that webclaw cannot usefully fetch — Cloudflare
//! interstitials, JS+adblock walls, and (eventually) hard paywalls. Checked
//! BEFORE any DNS resolution or HTTP request, so the registered hosts
//! short-circuit with a stderr message naming a substitute domain rather than
//! burning wall-clock on a doomed fetch.
//!
//! Initial entries (phase A measured pre-baseline, see
//! `baselines/iter-3-pre-baseline.json`):
//! - `ambito.com` — Cloudflare "Just a moment..." interstitial. Pre-M3:
//!   exit 0, 75 B stdout (metadata only), 218 ms. Chrome retry does not
//!   bypass.
//! - `liberation.fr` — JS + adblock wall. Pre-M3: exit 0, 148 B stub
//!   ("Please enable JS and disable any ad blocker"), 344 ms, silent
//!   stderr.
//!
//! WSJ / FT / Bloomberg / NYT are explicitly DEFERRED to a later milestone
//! (M11) because hard paywalls behave differently and the substitute logic
//! is different.
//!
//! Host matching:
//!   `lowercase(strip_leading_www(url.host))` then exact-match against the
//!   normalized `host` field of each registry entry. So `ambito.com`,
//!   `www.ambito.com`, and `Ambito.COM` all collapse to `ambito.com` and
//!   hit the same entry. Subpaths (`/economia/`) match because the
//!   comparison is host-only.
//!
//! IDN / punycode (e.g. the Spanish display name "Ámbito") is not handled
//! in this iteration — the actual DNS name for ambito.com is plain ASCII.
//! If a future entry needs IDN, switch to `url::Host` matching.
use std::fmt;
/// Why a host is registered as bad. Determines the `<category>` segment of
/// the stderr error line: `error: <host> is <category>-walled; ...`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BadSiteCategory {
    /// Cloudflare "Just a moment..." interstitial / challenge page.
    Cloudflare,
    /// JS + adblock wall (page renders a "Please enable JS and disable any
    /// ad blocker" stub).
    Adblock,
    /// Reserved for M11 (NYT/WSJ/FT/Bloomberg). Not used by any current
    /// registry entry — kept in the enum so the matching/formatting code
    /// already covers the variant when M11 lands.
    #[allow(dead_code)]
    HardPaywall,
}

impl BadSiteCategory {
    /// Lowercase label for this category, as a `'static` string.
    ///
    /// This is the single source of truth for the `<category>` segment of
    /// the error line. [`fmt::Display`] delegates to it, and callers that
    /// need a `&'static str` (rather than an allocated `String` from
    /// `to_string()`) can call it directly instead of re-implementing the
    /// mapping.
    pub const fn as_str(self) -> &'static str {
        // Exhaustive on purpose: adding a variant must force a label choice.
        match self {
            BadSiteCategory::Cloudflare => "cloudflare",
            BadSiteCategory::Adblock => "adblock",
            BadSiteCategory::HardPaywall => "paywall",
        }
    }
}

impl fmt::Display for BadSiteCategory {
    /// Writes the lowercase label returned by [`BadSiteCategory::as_str`].
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.as_str())
    }
}
/// One registry entry. `host` is the normalized form (lowercase, no `www.`).
///
/// Every field is `'static` data, so entries can live in the compile-time
/// `KNOWN_BAD_SITES` table and be handed out by reference.
#[derive(Debug, Clone, Copy)]
pub struct KnownBadSite {
/// Normalized host: lowercase, no leading `www.`. Lookup compares this
/// with `==` against the normalized request host, so it must be stored
/// already normalized.
pub host: &'static str,
/// Why the host is registered; its `Display` form becomes the
/// `<category>` segment of the error line.
pub category: BadSiteCategory,
/// Suggested alternative domains the caller can try instead. Order
/// matters: the first is the strongest recommendation. The error line
/// lists them joined with `", "`.
pub substitutes: &'static [&'static str],
/// Human-readable note explaining why this host is registered. Not
/// emitted to stderr by default but available to library callers.
#[allow(dead_code)]
pub reason: &'static str,
}
/// Compile-time registry. Linear scan is fine at this size; swap to a
/// `phf` perfect-hash if it grows past ~50 entries.
///
/// Lookup returns the first entry whose `host` equals the normalized
/// request host, so each `host` here must already be lowercase and carry
/// no leading `www.`.
pub const KNOWN_BAD_SITES: &[KnownBadSite] = &[
// Cloudflare challenge page; substitutes listed strongest-first.
KnownBadSite {
host: "ambito.com",
category: BadSiteCategory::Cloudflare,
substitutes: &["cronista.com", "iprofesional.com"],
reason: "Cloudflare 'Just a moment...' interstitial; chrome retry does not bypass",
},
// JS + adblock wall; substitutes listed strongest-first.
KnownBadSite {
host: "liberation.fr",
category: BadSiteCategory::Adblock,
substitutes: &["lemonde.fr", "lepoint.fr"],
reason: "JS + adblock wall; returns 148-byte stub asking to disable adblock",
},
];
/// Normalize a host string for registry matching.
///
/// Steps, in order:
/// 1. drop a single trailing `.` (the DNS root label), so the
///    fully-qualified spelling `ambito.com.` maps to the same key as
///    `ambito.com` instead of slipping past the exact-match registry;
/// 2. ASCII-lowercase the host;
/// 3. strip a single leading `www.` label if present.
///
/// Returns an owned `String` because lowercasing may need to allocate.
fn normalize_host(host: &str) -> String {
    let without_root_dot = host.strip_suffix('.').unwrap_or(host);
    let mut normalized = without_root_dot.to_ascii_lowercase();
    if normalized.starts_with("www.") {
        // "www." is four ASCII bytes, so the cut is on a char boundary.
        normalized.replace_range(.."www.".len(), "");
    }
    normalized
}
/// Check whether `url` is a registered known-bad host. Returns the matching
/// entry or `None`. Accepts a full URL string; parsing failures yield `None`
/// (the caller should hit its normal "invalid URL" path).
pub fn check(url: &str) -> Option<&'static KnownBadSite> {
let parsed = url::Url::parse(url).ok()?;
let host = parsed.host_str()?;
let normalized = normalize_host(host);
KNOWN_BAD_SITES.iter().find(|entry| entry.host == normalized)
}
/// Format the stderr error line for a registry hit. Phase A's contract:
///
/// `error: <host> is <category>-walled; suggested substitute: <a>, <b>`
///
/// `<host>` is the normalized host (so even if the caller passed
/// `https://WWW.Ambito.COM/economia/` we emit `ambito.com`). `<category>`
/// is the lowercase `Display` form of the enum. `requested_url` is accepted
/// for future use (e.g. echoing the caller's URL in a debug-level field);
/// it's intentionally unused in the canonical one-liner so probe.py's regex
/// stays simple.
pub fn format_fail_message(site: &KnownBadSite, _requested_url: &str) -> String {
format!(
"error: {host} is {category}-walled; suggested substitute: {subs}",
host = site.host,
category = site.category,
subs = site.substitutes.join(", "),
)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `www.` root URL resolves to the ambito.com entry with its category.
    #[test]
    fn test_registry_matches_ambito_root() {
        let entry = check("https://www.ambito.com/").expect("ambito.com should be in registry");
        assert_eq!(
            (entry.host, entry.category),
            ("ambito.com", BadSiteCategory::Cloudflare)
        );
    }

    /// Matching is host-only, so any path under a registered host fires.
    #[test]
    fn test_registry_matches_ambito_path() {
        let url = "https://www.ambito.com/economia/";
        let entry = check(url).expect("ambito subpath should still match");
        assert_eq!(entry.host, "ambito.com");
    }

    /// The bare apex hits the same entry as the `www.` form.
    #[test]
    fn test_registry_matches_ambito_without_www() {
        let url = "https://ambito.com/";
        let entry = check(url).expect("bare apex ambito.com should match");
        assert_eq!(entry.host, "ambito.com");
    }

    /// Deep path under liberation.fr resolves to the adblock entry.
    #[test]
    fn test_registry_matches_liberation_subpath() {
        let url = "https://www.liberation.fr/culture/cinema/";
        let entry = check(url).expect("liberation subpath should match");
        assert_eq!(
            (entry.host, entry.category),
            ("liberation.fr", BadSiteCategory::Adblock)
        );
    }

    /// Unregistered hosts pass through, including a lookalike that merely
    /// contains a registered name: matching is exact after normalization,
    /// not substring-based.
    #[test]
    fn test_registry_skips_unknown_domain() {
        for url in ["https://example.com/", "https://evilambito.com/"] {
            assert!(check(url).is_none());
        }
    }

    /// Upper- and mixed-case spellings collapse to the same entry.
    /// `url::Url` already lowercases scheme/host on parse; `normalize_host`
    /// lowercases again as a belt-and-braces measure.
    #[test]
    fn test_registry_case_insensitive() {
        let cases = [
            ("HTTPS://AMBITO.COM/", "uppercase host should match"),
            ("https://WWW.Ambito.com/", "mixed-case www should match"),
        ];
        for (url, why) in cases {
            assert_eq!(check(url).expect(why).host, "ambito.com");
        }
    }

    /// The formatted line names the host, the category, and every substitute.
    #[test]
    fn test_format_fail_message_includes_substitutes() {
        let url = "https://www.ambito.com/";
        let msg = format_fail_message(check(url).unwrap(), url);
        assert!(
            msg.contains("ambito.com"),
            "msg should contain normalized host: {msg}"
        );
        assert!(
            msg.contains("cloudflare-walled"),
            "category segment expected: {msg}"
        );
        assert!(
            msg.contains("cronista.com"),
            "first substitute missing: {msg}"
        );
        assert!(
            msg.contains("iprofesional.com"),
            "second substitute missing: {msg}"
        );
    }

    /// Pins the exact canonical one-liner for the liberation.fr entry.
    #[test]
    fn test_format_fail_message_liberation_shape() {
        let url = "https://www.liberation.fr/culture/cinema/";
        let site = check(url).unwrap();
        let expected =
            "error: liberation.fr is adblock-walled; suggested substitute: lemonde.fr, lepoint.fr";
        assert_eq!(format_fail_message(site, url), expected);
    }

    /// Garbage input must not panic; `None` lets the caller fall through
    /// to its normal invalid-URL handling.
    #[test]
    fn test_check_returns_none_on_invalid_url() {
        assert!(["not a url at all", ""]
            .iter()
            .all(|input| check(input).is_none()));
    }
}

View file

@ -9,6 +9,7 @@ pub mod document;
pub mod error;
pub mod extractors;
pub mod fetcher;
pub mod known_bad_sites;
pub mod linkedin;
pub mod locale;
pub mod proxy;