feat(fetch): paywall HTML-signature detection + best-effort --paywall-bypass

Detects NYT/WSJ/FT/Bloomberg/Substack paywall overlay markers in extracted HTML and emits a stderr warning: `# webclaw: warning: paywall detected on <name> (<host>); full article may not be accessible. Try --paywall-bypass or https://archive.is/<url>`. Detection uses a declarative signature registry (parallel to M3 known-bad-sites): a per-host suffix gate plus an any-of substring scan of publisher-specific CSS classes / data-attributes / JSON-LD markers. NYT markers (vi-gateway-container, "isAccessibleForFree":false, meteredContent) were verified against a real live NYT article; other publishers use documented per-publisher overlay conventions. The new --paywall-bypass flag attempts a soft bypass: it injects a Googlebot User-Agent into the FetchConfig headers (some publishers serve full content to crawlers for SEO indexing). If the paywall is STILL detected post-Googlebot, the stderr warning switches to the bypass-aware variant, naming the attempted strategy and pointing at https://archive.is/<url> as an external fallback. This is BEST-EFFORT: webclaw has no headless browser and cannot bypass paywalls requiring real session auth. Honest stderr language reflects that. Plumbing is minimal: webclaw-fetch gets a new `paywall` module + post-fetch detection hook in fetch_and_extract_with_options, and FetchClient gets a `with_paywall_bypass(bool)` builder method the CLI calls when the flag is set. 17 new tests (13 in paywall.rs covering host-gate / marker-gate / false-positive resistance / message formatting / Googlebot UA constant; 4 in webclaw-cli mod tests covering flag presence, default value, header-injection wiring, and default-UA preservation when the flag is unset). Workspace test count 724 -> 741. Critical false-positive sentinels verified: p43 example.com 313 B byte-identical (stderr empty), p09 bbc.com 13K+ (stderr empty), p47 reuters.com 10K+ (stderr empty). Cyrillic p14 srbijagas 7777 B byte-identical (M15 sentinel preserved across 11 iters). M3 fast-fail on ambito.com exit 67 byte-identical. M14 truncation warning intact.
No probe.py changes. No baseline modifications. No Cargo deps added.
2026-06-16 23:45:13 +02:00 · 2026-05-24 08:55:54 +02:00 · 2026-05-24 08:55:54 +02:00 · c37867309c
commit c37867309c
parent 4ef27fcd33
4 changed files with 444 additions and 2 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -175,6 +175,18 @@ struct Cli {
    #[arg(long)]
    url_encoded: bool,

+    /// Best-effort paywall bypass: re-fetches with a Googlebot User-Agent
+    /// (some publishers serve full content to crawlers for SEO indexing).
+    /// This is HEURISTIC ONLY — webclaw has no headless browser and cannot
+    /// bypass paywalls requiring real session auth, cookies, or JS
+    /// execution. If the paywall is still detected after the bypass
+    /// attempt, the stderr warning will suggest https://archive.is/<url>
+    /// as an external fallback. Paywall detection itself (without this
+    /// flag) runs by default on registered publisher hosts and emits an
+    /// advisory stderr warning.
+    #[arg(long)]
+    paywall_bypass: bool,
+
    /// Output format (markdown, json, text, llm, html)
    #[arg(short, long, default_value = "markdown")]
    format: OutputFormat,
@ -591,6 +603,18 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
        }
    }

+    // M11 --paywall-bypass: override User-Agent with Googlebot so publishers
+    // that serve full content to crawlers for SEO will hand us the article
+    // body. Best-effort: many publishers verify the request actually comes
+    // from a Google-owned IP, in which case this header alone does nothing.
+    // Honest stderr language in the post-detect warning reflects that.
+    if cli.paywall_bypass {
+        headers.insert(
+            "User-Agent".to_string(),
+            webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT.to_string(),
+        );
+    }
+
    FetchConfig {
        browser: cli.browser.clone().into(),
        proxy,
@ -1061,8 +1085,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
    }

    // Normal path: try local first
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    let client = FetchClient::new(build_fetch_config(cli))
+        .map_err(|e| format!("client error: {e}"))?
+        .with_paywall_bypass(cli.paywall_bypass);
    let options = build_extraction_options(cli);
    // M13: wrap with periodic stderr progress emitter. Fast fetches see
    // zero emissions (timer never fires in <10s); slow fetches get a
@ -3336,4 +3361,68 @@ mod tests {
        // Round-trips through formatting without panicking.
        let _ = format!("research-{slug}.json");
    }
+
+    // -------- M11 paywall-bypass flag --------
+
+    #[test]
+    fn paywall_bypass_flag_present_in_cli() {
+        // clap-level smoke: the flag parses and the field is reachable
+        // from the Cli struct. If the flag was renamed/removed this test
+        // fails to compile, which is the intended sentinel.
+        use clap::Parser;
+        let cli = Cli::try_parse_from([
+            "webclaw",
+            "https://example.com/",
+            "--paywall-bypass",
+        ])
+        .expect("--paywall-bypass should parse");
+        assert!(cli.paywall_bypass, "--paywall-bypass should set the bool");
+    }
+
+    #[test]
+    fn paywall_bypass_default_false() {
+        // Sentinel: the flag is opt-in only. Default behavior must be
+        // unchanged on all existing probes.
+        use clap::Parser;
+        let cli = Cli::try_parse_from(["webclaw", "https://example.com/"])
+            .expect("default cli should parse");
+        assert!(!cli.paywall_bypass, "paywall_bypass must default to false");
+    }
+
+    #[test]
+    fn paywall_bypass_injects_googlebot_ua() {
+        // The build_fetch_config path inserts the Googlebot UA header
+        // when cli.paywall_bypass is set. This guards against accidental
+        // removal of the header-injection wiring.
+        use clap::Parser;
+        let cli = Cli::try_parse_from([
+            "webclaw",
+            "https://example.com/",
+            "--paywall-bypass",
+        ])
+        .expect("--paywall-bypass should parse");
+        let config = build_fetch_config(&cli);
+        let ua = config.headers.get("User-Agent").expect("UA header should be set");
+        assert!(ua.contains("Googlebot"), "UA should be Googlebot, got: {ua}");
+        assert_eq!(ua, webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT);
+    }
+
+    #[test]
+    fn paywall_bypass_unset_leaves_default_ua() {
+        // Without the flag, build_fetch_config must NOT inject the
+        // Googlebot UA — preserves browser-profile fingerprinting that
+        // M1-M14 depend on.
+        use clap::Parser;
+        let cli = Cli::try_parse_from(["webclaw", "https://example.com/"])
+            .expect("default cli should parse");
+        let config = build_fetch_config(&cli);
+        // Either UA header is absent (most common; wreq supplies the
+        // browser-profile UA at the TLS layer) OR it's not Googlebot.
+        if let Some(ua) = config.headers.get("User-Agent") {
+            assert!(
+                !ua.contains("Googlebot"),
+                "default UA must not be Googlebot, got: {ua}"
+            );
+        }
+    }
 }