From c37867309c675793cc49a8da40c034f348745336 Mon Sep 17 00:00:00 2001 From: devnen Date: Sun, 24 May 2026 08:55:54 +0200 Subject: [PATCH] feat(fetch): paywall HTML-signature detection + best-effort --paywall-bypass Detects NYT/WSJ/FT/Bloomberg/Substack paywall overlay markers in the fetched HTML and emits a stderr warning: # webclaw: warning: paywall detected on {publisher} ({host}); full article may not be accessible. Try --paywall-bypass or https://archive.is/{url} Detection uses a declarative signature registry (parallel to M3 known-bad-sites): per-host suffix gate + any-of substring scan of publisher-specific CSS classes / data-attributes / JSON-LD markers. NYT markers (vi-gateway-container, "isAccessibleForFree":false, meteredContent) were verified against a real live NYT article; other publishers use documented per-publisher overlay conventions. New --paywall-bypass flag attempts a soft bypass: injects a Googlebot User-Agent into the FetchConfig headers (some publishers serve full content to crawlers for SEO indexing). If the paywall is STILL detected post-Googlebot, the stderr warning switches to the bypass-aware variant naming the attempted strategy and pointing at https://archive.is/{url} as an external fallback. This is BEST-EFFORT. webclaw has no headless browser and cannot bypass paywalls requiring real session auth. Honest stderr language reflects that. Plumbing is minimal: webclaw-fetch gets a new `paywall` module + post-fetch detection hook in fetch_and_extract_with_options, and FetchClient gets a `with_paywall_bypass(bool)` builder method the CLI calls when the flag is set. 17 new tests (13 in paywall.rs covering host-gate / marker-gate / false-positive resistance / message formatting / Googlebot UA constant; 4 in webclaw-cli mod tests covering flag presence, default value, header injection wiring). Workspace 724 -> 741. 
Critical false-positive sentinels verified: p43 example.com 313 B byte-identical (stderr empty), p09 bbc.com 13K+ (stderr empty), p47 reuters.com 10K+ (stderr empty). Cyrillic p14 srbijagas 7777 B byte-identical (M15 sentinel preserved across 11 iters). M3 fast-fail on ambito.com exit 67 byte-identical. M14 truncation warning intact. No probe.py changes. No baseline modifications. No Cargo deps added. --- crates/webclaw-cli/src/main.rs | 93 ++++++++- crates/webclaw-fetch/src/client.rs | 48 +++++ crates/webclaw-fetch/src/lib.rs | 1 + crates/webclaw-fetch/src/paywall.rs | 304 ++++++++++++++++++++++++++++ 4 files changed, 444 insertions(+), 2 deletions(-) create mode 100644 crates/webclaw-fetch/src/paywall.rs diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 47ed127..ef20e91 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -175,6 +175,18 @@ struct Cli { #[arg(long)] url_encoded: bool, + /// Best-effort paywall bypass: fetches with a Googlebot User-Agent + /// (some publishers serve full content to crawlers for SEO indexing). + /// This is HEURISTIC ONLY — webclaw has no headless browser and cannot + /// bypass paywalls requiring real session auth, cookies, or JS + /// execution. If the paywall is still detected after the bypass + /// attempt, the stderr warning will suggest https://archive.is/ + /// as an external fallback. Paywall detection itself (without this + /// flag) runs by default on registered publisher hosts and emits an + /// advisory stderr warning. + #[arg(long)] + paywall_bypass: bool, + /// Output format (markdown, json, text, llm, html) #[arg(short, long, default_value = "markdown")] format: OutputFormat, @@ -591,6 +603,18 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig { } } + // M11 --paywall-bypass: override User-Agent with Googlebot so publishers + // that serve full content to crawlers for SEO will hand us the article + // body. 
Best-effort: many publishers verify the request actually comes + // from a Google-owned IP, in which case this header alone does nothing. + // Honest stderr language in the post-detect warning reflects that. + if cli.paywall_bypass { + headers.insert( + "User-Agent".to_string(), + webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT.to_string(), + ); + } + FetchConfig { browser: cli.browser.clone().into(), proxy, @@ -1061,8 +1085,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result { } // Normal path: try local first - let client = - FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; + let client = FetchClient::new(build_fetch_config(cli)) + .map_err(|e| format!("client error: {e}"))? + .with_paywall_bypass(cli.paywall_bypass); let options = build_extraction_options(cli); // M13: wrap with periodic stderr progress emitter. Fast fetches see // zero emissions (timer never fires in <10s); slow fetches get a @@ -3336,4 +3361,68 @@ mod tests { // Round-trips through formatting without panicking. let _ = format!("research-{slug}.json"); } + + // -------- M11 paywall-bypass flag -------- + + #[test] + fn paywall_bypass_flag_present_in_cli() { + // clap-level smoke: the flag parses and the field is reachable + // from the Cli struct. If the flag was renamed/removed this test + // fails to compile, which is the intended sentinel. + use clap::Parser; + let cli = Cli::try_parse_from([ + "webclaw", + "https://example.com/", + "--paywall-bypass", + ]) + .expect("--paywall-bypass should parse"); + assert!(cli.paywall_bypass, "--paywall-bypass should set the bool"); + } + + #[test] + fn paywall_bypass_default_false() { + // Sentinel: the flag is opt-in only. Default behavior must be + // unchanged on all existing probes. 
+ use clap::Parser; + let cli = Cli::try_parse_from(["webclaw", "https://example.com/"]) + .expect("default cli should parse"); + assert!(!cli.paywall_bypass, "paywall_bypass must default to false"); + } + + #[test] + fn paywall_bypass_injects_googlebot_ua() { + // The build_fetch_config path inserts the Googlebot UA header + // when cli.paywall_bypass is set. This guards against accidental + // removal of the header-injection wiring. + use clap::Parser; + let cli = Cli::try_parse_from([ + "webclaw", + "https://example.com/", + "--paywall-bypass", + ]) + .expect("--paywall-bypass should parse"); + let config = build_fetch_config(&cli); + let ua = config.headers.get("User-Agent").expect("UA header should be set"); + assert!(ua.contains("Googlebot"), "UA should be Googlebot, got: {ua}"); + assert_eq!(ua, webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT); + } + + #[test] + fn paywall_bypass_unset_leaves_default_ua() { + // Without the flag, build_fetch_config must NOT inject the + // Googlebot UA — preserves browser-profile fingerprinting that + // M1-M14 depend on. + use clap::Parser; + let cli = Cli::try_parse_from(["webclaw", "https://example.com/"]) + .expect("default cli should parse"); + let config = build_fetch_config(&cli); + // Either UA header is absent (most common; wreq supplies the + // browser-profile UA at the TLS layer) OR it's not Googlebot. + if let Some(ua) = config.headers.get("User-Agent") { + assert!( + !ua.contains("Googlebot"), + "default UA must not be Googlebot, got: {ua}" + ); + } + } } diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index f9338d1..dede9a8 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -201,6 +201,12 @@ pub struct FetchClient { /// out. Stored as `Arc` so cloning a `FetchClient` (common in /// axum state) doesn't clone the underlying reqwest pool. cloud: Option>, + /// M11 paywall-bypass flag. 
When true, the post-fetch paywall + /// detection (in `fetch_and_extract_with_options`) emits the + /// bypass-aware warning variant which acknowledges the + /// Googlebot-UA attempt and suggests archive.is as the next step. + /// Plumbed via [`Self::with_paywall_bypass`] from the CLI. + paywall_bypass_attempted: bool, } impl FetchClient { @@ -262,9 +268,22 @@ impl FetchClient { pool, pdf_mode, cloud: None, + paywall_bypass_attempted: false, }) } + /// M11: signal that the caller invoked the request with a + /// paywall-bypass attempt (Googlebot-UA override applied via + /// `FetchConfig.headers`). Affects the wording of the post-fetch + /// paywall-detection stderr warning emitted from + /// `fetch_and_extract_with_options` — the bypass-aware variant + /// names the bypass attempt and points at archive.is as the next + /// step. Returns `self` for builder-style chaining. + pub fn with_paywall_bypass(mut self, attempted: bool) -> Self { + self.paywall_bypass_attempted = attempted; + self + } + /// Attach a cloud-fallback client. Returns `self` so it composes in /// a builder-ish way: /// @@ -620,6 +639,35 @@ impl FetchClient { let elapsed = start.elapsed(); debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete"); + // M11 paywall detection: host-gated scan of the raw html for + // known paywall overlay markers (NYT/WSJ/FT/Bloomberg/Substack). + // Advisory only — we still hand the html to the extractor below + // so the user gets whatever the publisher served (often a + // teaser / first paragraph). The warning is informational so + // the caller knows why the body is thin. + // + // The Googlebot-UA bypass attempt (when --paywall-bypass is set) + // happens at the CLI layer by injecting a UA into FetchConfig + // headers BEFORE the fetch; if the marker still appears here, + // it means the soft bypass didn't clear it. 
The client's + // `paywall_bypass_attempted` flag (set via `with_paywall_bypass`) tells + // `format_warning` whether bypass was attempted, so the bypass-aware + // wording is selected here when it applies. + if let Ok(parsed) = url::Url::parse(&final_url) + && let Some(host) = parsed.host_str() + && let Some(sig) = crate::paywall::detect_in_html(host, &html) + { + eprintln!( + "{}", + crate::paywall::format_warning( + sig, + host, + &final_url, + self.paywall_bypass_attempted, + ) + ); + } + // LinkedIn: extract from embedded JSON blobs if crate::linkedin::is_linkedin_post(&final_url) { if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) { diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index 6cee844..19acd25 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -12,6 +12,7 @@ pub mod fetcher; pub mod known_bad_sites; pub mod linkedin; pub mod locale; +pub mod paywall; pub mod progress; pub mod proxy; pub mod reddit; diff --git a/crates/webclaw-fetch/src/paywall.rs b/crates/webclaw-fetch/src/paywall.rs new file mode 100644 index 0000000..197002d --- /dev/null +++ b/crates/webclaw-fetch/src/paywall.rs @@ -0,0 +1,304 @@ +/// Paywall HTML-signature detection (M11, iter 11). +/// +/// Declarative registry of known paywall overlay markers (CSS class names, +/// data-attributes, element IDs) for major publishers. Detection is +/// host-gated and runs AFTER a successful fetch, scanning the raw HTML for +/// any registered marker that belongs to the responding host's suffix. +/// +/// This is ADVISORY ONLY. Webclaw uses wreq for TLS impersonation and has +/// no headless browser, so true paywall bypass (cookie injection + JS +/// rendering + session auth) is not possible from this layer. When a +/// paywall is detected, the CLI emits a stderr warning: +/// +/// `# webclaw: warning: paywall detected on {name} ({host}); full article +/// may not be accessible. 
Try --paywall-bypass or https://archive.is/{url}` +/// +/// The `--paywall-bypass` flag is a best-effort attempt: it injects a +/// Googlebot User-Agent (some publishers serve full content to crawlers +/// for SEO). If detection still fires post-bypass, the stderr message +/// adds a note pointing the user at archive.is as an external fallback. +/// +/// Host matching: suffix-based on the normalized host (see +/// `detect_in_html`) — so +/// `www.nytimes.com`, `nytimes.com`, `cooking.nytimes.com` all match the +/// `nytimes.com` entry. This is intentionally suffix-based (not exact +/// like M3 known-bad-sites) because paywalls span subdomains uniformly +/// within a publisher. +/// +/// Marker matching: any-of substring scan on the html (case-sensitive, +/// since CSS class names and data-attribute values are spec'd case- +/// sensitive in HTML/CSS). +/// +/// False-positive resistance: critical sentinel — detection MUST NOT +/// fire on example.com, BBC, Reuters, AP News, or any non-registered +/// host. The host gate is checked FIRST; if it doesn't match, the html +/// is never scanned. See `test_signature_only_fires_for_correct_host`. + +/// One paywall signature entry. Static by construction. +#[derive(Debug, Clone, Copy)] +pub struct PaywallSignature { + /// Human-readable publisher name. Used in the stderr warning. + pub name: &'static str, + /// Host suffix to match against the responding host (after `www.` + /// stripping + lowercasing). Subdomain-tolerant: `nytimes.com` + /// matches `cooking.nytimes.com`. + pub host_suffix: &'static str, + /// CSS classes, data-attributes, and element IDs whose presence in + /// the response body indicates a paywall overlay. Any-of match: the + /// signature fires when at least one marker is present. + pub markers: &'static [&'static str], +} + +/// Compile-time registry. Linear scan is fine at this size. 
+pub const PAYWALL_SIGNATURES: &[PaywallSignature] = &[
+    PaywallSignature {
+        name: "New York Times",
+        host_suffix: "nytimes.com",
+        // Observed live on www.nytimes.com article (.html) pages:
+        //   - `vi-gateway-container` is the JS gateway div NYT injects
+        //     around paywall-eligible content (verified iter-11 phase B).
+        //   - `"isAccessibleForFree":false` is in the NewsArticle JSON-LD
+        //     for metered articles.
+        //   - `meteredContent` covers the CSS class + JSON-LD cssSelector
+        //     references; appears on metered articles only.
+        markers: &[
+            "vi-gateway-container",
+            "\"isAccessibleForFree\":false",
+            "meteredContent",
+        ],
+    },
+    PaywallSignature {
+        name: "Wall Street Journal",
+        host_suffix: "wsj.com",
+        markers: &[
+            "paywall-overlay",
+            "wsj-paywall",
+            "snippet-promotion",
+        ],
+    },
+    PaywallSignature {
+        name: "Financial Times",
+        host_suffix: "ft.com",
+        markers: &[
+            "js-paywall",
+            "subscribe-prompt",
+            "data-trackable=\"paywall\"",
+            "id=\"paywall-app\"",
+        ],
+    },
+    PaywallSignature {
+        name: "Bloomberg",
+        host_suffix: "bloomberg.com",
+        markers: &[
+            "paywall-inline",
+            "terminal-promo",
+            "paywall-inline-promo",
+        ],
+    },
+    PaywallSignature {
+        name: "Substack",
+        host_suffix: "substack.com",
+        markers: &[
+            "paywall-content",
+            "subscribe-widget--paywall",
+            "class=\"paywall\"",
+        ],
+    },
+];
+
+/// Normalize a host string for registry matching: ASCII-lowercase it and
+/// strip a single leading `www.` label if present.
+fn normalize_host(host: &str) -> String {
+    let lower = host.to_ascii_lowercase();
+    lower.strip_prefix("www.").map(|s| s.to_string()).unwrap_or(lower)
+}
+
+/// Returns `true` when `host` is exactly `suffix` or a subdomain of it.
+///
+/// A bare `ends_with` has no label boundary and would also accept
+/// unrelated hosts that merely share trailing characters:
+/// `microsoft.com` ends with `ft.com`, `notnytimes.com` ends with
+/// `nytimes.com`. Whatever precedes the suffix must therefore be empty
+/// or end in `.`.
+fn host_matches_suffix(host: &str, suffix: &str) -> bool {
+    host.strip_suffix(suffix)
+        .is_some_and(|prefix| prefix.is_empty() || prefix.ends_with('.'))
+}
+
+/// Detect a known paywall in the given html for the given host.
+///
+/// Returns the first matching `PaywallSignature` (registry order) or
+/// `None`. Two gates:
+///   1. Host gate: the normalized host must equal a registered
+///      `host_suffix` or be a subdomain of it (label-boundary match).
+///   2. Marker gate: html must contain at least one of the entry's markers
+///      (case-sensitive substring scan).
+///
+/// Both gates must pass. Pure function; no I/O.
+pub fn detect_in_html(host: &str, html: &str) -> Option<&'static PaywallSignature> {
+    let normalized = normalize_host(host);
+    PAYWALL_SIGNATURES.iter().find(|sig| {
+        // Host gate first: the html is only scanned for registered hosts.
+        host_matches_suffix(&normalized, sig.host_suffix)
+            && sig.markers.iter().any(|m| html.contains(m))
+    })
+}
+
+/// Googlebot User-Agent string used by `--paywall-bypass`. Some publishers
+/// serve full content to Googlebot for SEO indexing. This is a best-effort
+/// soft bypass — many publishers verify the request actually comes from a
+/// Google-owned IP, in which case this header alone does nothing.
+pub const GOOGLEBOT_USER_AGENT: &str =
+    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
+
+/// Format the stderr warning for a paywall detection. Phase A contract:
+///
+/// `# webclaw: warning: paywall detected on {name} ({host}); full article
+/// may not be accessible. Try --paywall-bypass or https://archive.is/{url}`
+///
+/// `{name}` is the signature's publisher name, `{host}` is the normalized
+/// host (lowercased, leading `www.` stripped) and `{url}` is appended
+/// verbatim after `https://archive.is/`.
+///
+/// `bypass_attempted=true` (called from the `--paywall-bypass` path when
+/// detection STILL fires) switches to the variant that names the Googlebot
+/// UA attempt and points at archive.is as the next step.
+pub fn format_warning(
+    sig: &PaywallSignature,
+    host: &str,
+    url: &str,
+    bypass_attempted: bool,
+) -> String {
+    let normalized = normalize_host(host);
+    if bypass_attempted {
+        format!(
+            "# webclaw: warning: paywall still detected on {name} ({host}) after --paywall-bypass attempt (Googlebot UA); try https://archive.is/{url}",
+            name = sig.name,
+            host = normalized,
+            url = url,
+        )
+    } else {
+        format!(
+            "# webclaw: warning: paywall detected on {name} ({host}); full article may not be accessible. Try --paywall-bypass or https://archive.is/{url}",
+            name = sig.name,
+            host = normalized,
+            url = url,
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_signature_matches_nyt_marker_in_html() {
+        // vi-gateway-container is the NYT JS paywall gateway div, observed
+        // live on www.nytimes.com article pages (iter-11 phase B).
+        let html = r#"<div id="vi-gateway-container"></div>"#;
+        let hit = detect_in_html("www.nytimes.com", html).expect("nyt should match");
+        assert_eq!(hit.name, "New York Times");
+        assert_eq!(hit.host_suffix, "nytimes.com");
+    }
+
+    #[test]
+    fn test_signature_matches_nyt_jsonld_marker() {
+        // "isAccessibleForFree":false is the schema.org JSON-LD signal NYT
+        // emits in NewsArticle structured data for metered articles.
+        // Critically, ":true" (the inverse — free content) must NOT match.
+        let html = r#"<script type="application/ld+json">{"@type":"NewsArticle","isAccessibleForFree":false}</script>"#;
+        let hit = detect_in_html("www.nytimes.com", html).expect("nyt jsonld marker should match");
+        assert_eq!(hit.name, "New York Times");
+
+        // Negative: explicit "isAccessibleForFree":true must NOT fire.
+        let free_html = r#"<script type="application/ld+json">{"@type":"NewsArticle","isAccessibleForFree":true}</script>"#;
+        assert!(
+            detect_in_html("www.nytimes.com", free_html).is_none(),
+            "free articles must not trigger the paywall marker"
+        );
+    }
+
+    #[test]
+    fn test_signature_matches_nyt_subdomain() {
+        // Subdomain coverage: cooking.nytimes.com should match the nytimes.com suffix.
+        let html = r#"<div class="meteredContent">limit</div>"#;
+        let hit = detect_in_html("cooking.nytimes.com", html).expect("nyt subdomain should match");
+        assert_eq!(hit.name, "New York Times");
+    }
+
+    #[test]
+    fn test_signature_matches_wsj_marker() {
+        let html = r#"<div class="paywall-overlay"></div>"#;
+        let hit = detect_in_html("www.wsj.com", html).expect("wsj should match");
+        assert_eq!(hit.name, "Wall Street Journal");
+    }
+
+    #[test]
+    fn test_signature_matches_ft_marker() {
+        let html = r#"<div class="js-paywall"></div>"#;
+        let hit = detect_in_html("www.ft.com", html).expect("ft should match");
+        assert_eq!(hit.name, "Financial Times");
+    }
+
+    #[test]
+    fn test_signature_matches_bloomberg_marker() {
+        let html = r#"<div class="paywall-inline">subscribe</div>"#;
+        let hit = detect_in_html("www.bloomberg.com", html).expect("bloomberg should match");
+        assert_eq!(hit.name, "Bloomberg");
+    }
+
+    #[test]
+    fn test_signature_matches_substack_per_publisher_subdomain() {
+        // Substack publishers use {publication}.substack.com — subdomain suffix coverage.
+        let html = r#"<div class="paywall-content">subscribe</div>"#;
+        let hit = detect_in_html("someblog.substack.com", html).expect("substack should match");
+        assert_eq!(hit.name, "Substack");
+    }
+
+    #[test]
+    fn test_signature_skips_clean_html() {
+        // example.com: registered NEITHER as host nor marker source.
+        let clean_html =
+            r#"<html><body><h1>Example Domain</h1><p>For use in examples.</p></body></html>"#;
+        assert!(detect_in_html("example.com", clean_html).is_none());
+        assert!(detect_in_html("www.example.com", clean_html).is_none());
+
+        // nytimes.com host but NO marker in html — host gate passes,
+        // marker gate fails. Must NOT fire.
+        assert!(detect_in_html("www.nytimes.com", clean_html).is_none());
+
+        // BBC: never registered. Must NOT fire even with the same generic html.
+        assert!(detect_in_html("www.bbc.com", clean_html).is_none());
+        // Reuters: never registered.
+        assert!(detect_in_html("www.reuters.com", clean_html).is_none());
+        // AP News: never registered.
+        assert!(detect_in_html("apnews.com", clean_html).is_none());
+    }
+
+    #[test]
+    fn test_signature_only_fires_for_correct_host() {
+        // CRITICAL false-positive sentinel: an html string containing a
+        // paywall marker for a NON-RESPONDING host must NOT trigger
+        // detection. The host gate is the first defense.
+        let html_with_nyt_marker = r#"<div id="vi-gateway-container">subscribe</div>"#;
+        assert!(detect_in_html("example.com", html_with_nyt_marker).is_none());
+        assert!(detect_in_html("www.bbc.com", html_with_nyt_marker).is_none());
+        assert!(detect_in_html("apnews.com", html_with_nyt_marker).is_none());
+
+        // And cross-publisher: a WSJ marker should not match an NYT host.
+        let html_with_wsj_marker = r#"<div class="wsj-paywall"></div>"#;
+        assert!(detect_in_html("www.nytimes.com", html_with_wsj_marker).is_none());
+
+        // Label-boundary sentinel: a host that merely ENDS with the same
+        // characters as a registered suffix is a different site.
+        // `microsoft.com` ends with `ft.com` but is not the Financial Times.
+        let html_with_ft_marker = r#"<div class="js-paywall"></div>"#;
+        assert!(detect_in_html("www.microsoft.com", html_with_ft_marker).is_none());
+        assert!(detect_in_html("notnytimes.com", html_with_nyt_marker).is_none());
+    }
+
+    #[test]
+    fn test_format_warning_message_shape() {
+        let sig = &PAYWALL_SIGNATURES[0]; // NYT
+        let msg = format_warning(sig, "www.nytimes.com", "https://www.nytimes.com/x", false);
+        assert!(msg.starts_with("# webclaw: warning: paywall detected on New York Times"), "msg: {msg}");
+        assert!(msg.contains("(nytimes.com)"), "normalized host expected: {msg}");
+        assert!(msg.contains("--paywall-bypass"), "bypass hint expected: {msg}");
+        assert!(msg.contains("https://archive.is/https://www.nytimes.com/x"), "archive.is suggestion expected: {msg}");
+    }
+
+    #[test]
+    fn test_format_warning_bypass_attempted_includes_archive_is() {
+        let sig = &PAYWALL_SIGNATURES[0]; // NYT
+        let msg = format_warning(sig, "www.nytimes.com", "https://www.nytimes.com/x", true);
+        assert!(msg.contains("after --paywall-bypass attempt"), "should note bypass attempt: {msg}");
+        assert!(msg.contains("Googlebot UA"), "should name the UA strategy: {msg}");
+        assert!(msg.contains("https://archive.is/https://www.nytimes.com/x"), "archive.is suggestion expected: {msg}");
+    }
+
+    #[test]
+    fn test_googlebot_ua_constant() {
+        // Pin the shape of the string so paywall_bypass_injects_googlebot_ua
+        // in the CLI tests has a known-good reference value.
+        assert!(GOOGLEBOT_USER_AGENT.contains("Googlebot/2.1"));
+        assert!(GOOGLEBOT_USER_AGENT.starts_with("Mozilla/5.0"));
+    }
+
+    #[test]
+    fn test_normalize_host_lowercases_and_strips_www() {
+        // Belt-and-braces: even if upstream code passes a non-normalized
+        // host, detection still works.
+        let html = r#"<div id="vi-gateway-container"></div>"#;
+        assert!(detect_in_html("WWW.NYTIMES.COM", html).is_some(), "all-caps with www should match");
+        assert!(detect_in_html("NYTimes.com", html).is_some(), "mixed-case should match");
+    }
+}