From c37867309c675793cc49a8da40c034f348745336 Mon Sep 17 00:00:00 2001
From: devnen <nenadoric@gmail.com>
Date: Sun, 24 May 2026 08:55:54 +0200
Subject: [PATCH] feat(fetch): paywall HTML-signature detection + best-effort
 --paywall-bypass

Detects NYT/WSJ/FT/Bloomberg/Substack paywall overlay markers in the raw
fetched HTML (before extraction) and emits a stderr warning:
  # webclaw: warning: paywall detected on <name> (<host>); full article
    may not be accessible. Try --paywall-bypass or https://archive.is/<url>

Detection uses a declarative signature registry (parallel to M3
known-bad-sites): per-host suffix gate + any-of substring scan of
publisher-specific CSS classes / data-attributes / JSON-LD markers.
NYT markers (vi-gateway-container, "isAccessibleForFree":false,
meteredContent) were verified against a real live NYT article;
other publishers use documented per-publisher overlay conventions.

New --paywall-bypass flag attempts a soft bypass: injects a Googlebot
User-Agent into the FetchConfig headers (some publishers serve full
content to crawlers for SEO indexing). If the paywall is STILL
detected post-Googlebot, the stderr warning switches to the
bypass-aware variant naming the attempted strategy and pointing at
https://archive.is/<url> as an external fallback.

This is BEST-EFFORT. webclaw has no headless browser and cannot
bypass paywalls requiring real session auth. Honest stderr language
reflects that. Plumbing is minimal: webclaw-fetch gets a new
`paywall` module + post-fetch detection hook in
fetch_and_extract_with_options, and FetchClient gets a
`with_paywall_bypass(bool)` builder method the CLI calls when the
flag is set.

17 new tests (13 in paywall.rs covering host-gate / marker-gate /
false-positive resistance / message formatting / Googlebot UA
constant; 4 in webclaw-cli mod tests covering flag presence,
default value, header injection wiring). Workspace 724 -> 741.

Critical false-positive sentinels verified: p43 example.com 313 B
byte-identical (stderr empty), p09 bbc.com 13K+ (stderr empty), p47
reuters.com 10K+ (stderr empty). Cyrillic p14 srbijagas 7777 B
byte-identical (M15 sentinel preserved across 11 iters). M3 fast-fail
on ambito.com exit 67 byte-identical. M14 truncation warning intact.

No probe.py changes. No baseline modifications. No Cargo deps added.
---
 crates/webclaw-cli/src/main.rs      |  93 ++++++++-
 crates/webclaw-fetch/src/client.rs  |  48 +++++
 crates/webclaw-fetch/src/lib.rs     |   1 +
 crates/webclaw-fetch/src/paywall.rs | 304 ++++++++++++++++++++++++++++
 4 files changed, 444 insertions(+), 2 deletions(-)
 create mode 100644 crates/webclaw-fetch/src/paywall.rs
diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index 47ed127..ef20e91 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -175,6 +175,18 @@ struct Cli {
     #[arg(long)]
     url_encoded: bool,
 
+    /// Best-effort paywall bypass: sends the request with a Googlebot
+    /// User-Agent (some publishers serve full content to crawlers for SEO).
+    /// This is HEURISTIC ONLY — webclaw has no headless browser and cannot
+    /// bypass paywalls requiring real session auth, cookies, or JS
+    /// execution. If the paywall is still detected after the bypass
+    /// attempt, the stderr warning will suggest https://archive.is/<url>
+    /// as an external fallback. Paywall detection itself (without this
+    /// flag) runs by default on registered publisher hosts and emits an
+    /// advisory stderr warning.
+    #[arg(long)]
+    paywall_bypass: bool,
+
     /// Output format (markdown, json, text, llm, html)
     #[arg(short, long, default_value = "markdown")]
     format: OutputFormat,
@@ -591,6 +603,18 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
         }
     }
 
+    // M11 --paywall-bypass: override User-Agent with Googlebot so publishers
+    // that serve full content to crawlers for SEO will hand us the article
+    // body. Best-effort: many publishers verify the request actually comes
+    // from a Google-owned IP, in which case this header alone does nothing.
+    // Honest stderr language in the post-detect warning reflects that.
+    if cli.paywall_bypass {
+        headers.insert(
+            "User-Agent".to_string(),
+            webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT.to_string(),
+        );
+    }
+
     FetchConfig {
         browser: cli.browser.clone().into(),
         proxy,
@@ -1061,8 +1085,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
     }
 
     // Normal path: try local first
-    let client =
-        FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
+    let client = FetchClient::new(build_fetch_config(cli))
+        .map_err(|e| format!("client error: {e}"))?
+        .with_paywall_bypass(cli.paywall_bypass);
     let options = build_extraction_options(cli);
     // M13: wrap with periodic stderr progress emitter. Fast fetches see
     // zero emissions (timer never fires in <10s); slow fetches get a
@@ -3336,4 +3361,68 @@ mod tests {
         // Round-trips through formatting without panicking.
         let _ = format!("research-{slug}.json");
     }
+
+    // -------- M11 paywall-bypass flag --------
+
+    #[test]
+    fn paywall_bypass_flag_present_in_cli() {
+        // clap-level smoke test: `--paywall-bypass` parses and sets the
+        // `paywall_bypass` field. Removing the field breaks compilation;
+        // renaming only the long flag makes `try_parse_from` fail instead.
+        use clap::Parser;
+        let cli = Cli::try_parse_from([
+            "webclaw",
+            "https://example.com/",
+            "--paywall-bypass",
+        ])
+        .expect("--paywall-bypass should parse");
+        assert!(cli.paywall_bypass, "--paywall-bypass should set the bool");
+    }
+
+    #[test]
+    fn paywall_bypass_default_false() {
+        // Sentinel: the bypass is strictly opt-in. Parsing a bare URL with
+        // no flags must leave `paywall_bypass` false.
+        use clap::Parser;
+        let cli = Cli::try_parse_from(["webclaw", "https://example.com/"])
+            .expect("default cli should parse");
+        assert!(!cli.paywall_bypass, "paywall_bypass must default to false");
+    }
+
+    #[test]
+    fn paywall_bypass_injects_googlebot_ua() {
+        // With `--paywall-bypass` set, `build_fetch_config` must place the
+        // exact `GOOGLEBOT_USER_AGENT` value under the "User-Agent" header
+        // key. Guards against the header-injection wiring being dropped.
+        use clap::Parser;
+        let cli = Cli::try_parse_from([
+            "webclaw",
+            "https://example.com/",
+            "--paywall-bypass",
+        ])
+        .expect("--paywall-bypass should parse");
+        let config = build_fetch_config(&cli);
+        let ua = config.headers.get("User-Agent").expect("UA header should be set");
+        assert!(ua.contains("Googlebot"), "UA should be Googlebot, got: {ua}");
+        assert_eq!(ua, webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT);
+    }
+
+    #[test]
+    fn paywall_bypass_unset_leaves_default_ua() {
+        // Without the flag, `build_fetch_config` must NOT inject the
+        // Googlebot UA, so the default browser-profile identity that
+        // earlier milestones (M1-M14) rely on stays untouched.
+        use clap::Parser;
+        let cli = Cli::try_parse_from(["webclaw", "https://example.com/"])
+            .expect("default cli should parse");
+        let config = build_fetch_config(&cli);
+        // Either no explicit "User-Agent" header is configured (presumably
+        // the HTTP client then supplies its own) or it is not Googlebot.
+        if let Some(ua) = config.headers.get("User-Agent") {
+            assert!(
+                !ua.contains("Googlebot"),
+                "default UA must not be Googlebot, got: {ua}"
+            );
+        }
+    }
 }
diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs
index f9338d1..dede9a8 100644
--- a/crates/webclaw-fetch/src/client.rs
+++ b/crates/webclaw-fetch/src/client.rs
@@ -201,6 +201,12 @@ pub struct FetchClient {
     /// out. Stored as `Arc` so cloning a `FetchClient` (common in
     /// axum state) doesn't clone the underlying reqwest pool.
     cloud: Option<std::sync::Arc<crate::cloud::CloudClient>>,
+    /// M11 paywall-bypass flag. When true, the post-fetch paywall
+    /// detection (in `fetch_and_extract_with_options`) emits the
+    /// bypass-aware warning variant which acknowledges the
+    /// Googlebot-UA attempt and suggests archive.is as the next step.
+    /// Plumbed via [`Self::with_paywall_bypass`] from the CLI.
+    paywall_bypass_attempted: bool,
 }
 
 impl FetchClient {
@@ -262,9 +268,22 @@ impl FetchClient {
             pool,
             pdf_mode,
             cloud: None,
+            paywall_bypass_attempted: false,
         })
     }
 
+    /// M11: record whether the caller attempted a paywall bypass (the
+    /// Googlebot-UA override itself is applied by the caller through
+    /// `FetchConfig.headers`; this method only stores the flag). The flag
+    /// selects the wording of the post-fetch paywall warning printed to
+    /// stderr: when `attempted` is true the bypass-aware variant is used,
+    /// which names the attempt and points at archive.is as the next step.
+    /// Consumes and returns `self` for builder-style chaining.
+    pub fn with_paywall_bypass(mut self, attempted: bool) -> Self {
+        self.paywall_bypass_attempted = attempted;
+        self
+    }
+
     /// Attach a cloud-fallback client. Returns `self` so it composes in
     /// a builder-ish way:
     ///
@@ -620,6 +639,35 @@ impl FetchClient {
             let elapsed = start.elapsed();
             debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
 
+            // M11 paywall detection: host-gated scan of the raw html for
+            // known paywall overlay markers (NYT/WSJ/FT/Bloomberg/Substack).
+            // Advisory only — we still hand the html to the extractor below
+            // so the user gets whatever the publisher served (often a
+            // teaser / first paragraph). The warning is informational so
+            // the caller knows why the body is thin.
+            //
+            // The Googlebot-UA bypass attempt (when --paywall-bypass is set)
+            // happens at the CLI layer by injecting a UA into FetchConfig
+            // headers BEFORE the fetch; if the marker still appears here,
+            // it means the soft bypass didn't clear it. The CLI records the
+            // attempt via `with_paywall_bypass`, and the stored
+            // `paywall_bypass_attempted` flag selects between the generic
+            // and the bypass-aware warning text below.
+            if let Ok(parsed) = url::Url::parse(&final_url)
+                && let Some(host) = parsed.host_str()
+                && let Some(sig) = crate::paywall::detect_in_html(host, &html)
+            {
+                eprintln!(
+                    "{}",
+                    crate::paywall::format_warning(
+                        sig,
+                        host,
+                        &final_url,
+                        self.paywall_bypass_attempted,
+                    )
+                );
+            }
+
             // LinkedIn: extract from embedded <code> JSON blobs
             if crate::linkedin::is_linkedin_post(&final_url) {
                 if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs
index 6cee844..19acd25 100644
--- a/crates/webclaw-fetch/src/lib.rs
+++ b/crates/webclaw-fetch/src/lib.rs
@@ -12,6 +12,7 @@ pub mod fetcher;
 pub mod known_bad_sites;
 pub mod linkedin;
 pub mod locale;
+pub mod paywall;
 pub mod progress;
 pub mod proxy;
 pub mod reddit;
diff --git a/crates/webclaw-fetch/src/paywall.rs b/crates/webclaw-fetch/src/paywall.rs
new file mode 100644
index 0000000..197002d
--- /dev/null
+++ b/crates/webclaw-fetch/src/paywall.rs
@@ -0,0 +1,304 @@
+//! Paywall HTML-signature detection (M11, iter 11).
+//!
+//! Declarative registry of known paywall overlay markers (CSS class names,
+//! data-attributes, element IDs, JSON-LD fragments) for major publishers.
+//! Detection is host-gated and runs AFTER a successful fetch, scanning the
+//! raw HTML for any marker registered for the responding host's suffix.
+//!
+//! This is ADVISORY ONLY. Webclaw uses wreq for TLS impersonation and has
+//! no headless browser, so true paywall bypass (cookie injection + JS
+//! rendering + session auth) is not possible from this layer. When a
+//! paywall is detected, a warning is printed to stderr:
+//!
+//!   `# webclaw: warning: paywall detected on <name> (<host>); full article
+//!    may not be accessible. Try --paywall-bypass or https://archive.is/<url>`
+//!
+//! The `--paywall-bypass` flag is a best-effort attempt: it injects a
+//! Googlebot User-Agent (some publishers serve full content to crawlers
+//! for SEO). If detection still fires post-bypass, the stderr message
+//! switches to a variant pointing at archive.is as an external fallback.
+//!
+//! Host matching: the normalized host is compared against each entry's
+//! `host_suffix` — so `www.nytimes.com`, `nytimes.com`, and
+//! `cooking.nytimes.com` all match the `nytimes.com` entry. This is
+//! intentionally suffix-based (not exact like M3 known-bad-sites) because
+//! paywalls span subdomains uniformly within a publisher.
+//!
+//! Marker matching: any-of substring scan on the html (case-sensitive,
+//! since CSS class names and data-attribute values are spec'd case-
+//! sensitive in HTML/CSS).
+//!
+//! False-positive resistance: critical sentinel — detection MUST NOT
+//! fire on example.com, BBC, Reuters, AP News, or any non-registered
+//! host. The host gate is checked FIRST; if it doesn't match, the html
+//! is never scanned. See `test_signature_only_fires_for_correct_host`.
+
+/// One registry entry: a publisher, its host suffix, and its markers.
+#[derive(Debug, Clone, Copy)]
+pub struct PaywallSignature {
+    /// Human-readable publisher name, interpolated into the warning text.
+    pub name: &'static str,
+    /// Lowercase host suffix compared against the normalized responding
+    /// host (lowercased, one leading `www.` removed). Subdomain-tolerant:
+    /// `nytimes.com` also covers `cooking.nytimes.com`.
+    pub host_suffix: &'static str,
+    /// Literal substrings (CSS classes, data-attributes, element IDs,
+    /// JSON-LD fragments) searched case-sensitively in the html. Any-of:
+    /// the signature fires when at least one marker is present.
+    pub markers: &'static [&'static str],
+}
+
+/// Compile-time registry of publisher signatures; linear scan suffices at this size.
+pub const PAYWALL_SIGNATURES: &[PaywallSignature] = &[
+    PaywallSignature {
+        name: "New York Times",
+        host_suffix: "nytimes.com",
+        // Observed live on www.nytimes.com/<date>/<slug>.html pages:
+        //   - `vi-gateway-container` is the JS gateway div NYT injects
+        //     around paywall-eligible content (verified iter-11 phase B).
+        //   - `"isAccessibleForFree":false` is in the NewsArticle JSON-LD
+        //     for metered articles (compact form only: no space after `:`).
+        //   - `meteredContent` covers the CSS class + JSON-LD cssSelector
+        //     references; appears on metered articles only.
+        markers: &[
+            "vi-gateway-container",
+            "\"isAccessibleForFree\":false",
+            "meteredContent",
+        ],
+    },
+    PaywallSignature {
+        name: "Wall Street Journal",
+        host_suffix: "wsj.com",
+        markers: &[
+            "paywall-overlay",
+            "wsj-paywall",
+            "snippet-promotion",
+        ],
+    },
+    PaywallSignature {
+        name: "Financial Times",
+        host_suffix: "ft.com", // NOTE(review): a bare ends_with gate also admits microsoft.com
+        markers: &[
+            "js-paywall",
+            "subscribe-prompt",
+            "data-trackable=\"paywall\"",
+            "id=\"paywall-app\"",
+        ],
+    },
+    PaywallSignature {
+        name: "Bloomberg",
+        host_suffix: "bloomberg.com",
+        markers: &[
+            "paywall-inline",
+            "terminal-promo",
+            "paywall-inline-promo", // redundant: contains "paywall-inline"
+        ],
+    },
+    PaywallSignature {
+        name: "Substack",
+        host_suffix: "substack.com",
+        markers: &[
+            "paywall-content",
+            "subscribe-widget--paywall",
+            "class=\"paywall\"",
+        ],
+    },
+];
+
+/// Normalize a host for registry matching: ASCII-lowercase it, then drop
+/// one leading `www.` prefix if present (a trailing dot is not trimmed).
+fn normalize_host(host: &str) -> String {
+    let lower = host.to_ascii_lowercase();
+    lower.strip_prefix("www.").map(|s| s.to_string()).unwrap_or(lower)
+}
+
+/// Detect a known paywall in the given html for the given host.
+///
+/// Returns the first matching `PaywallSignature` or `None`. Two gates:
+///   1. Host gate: normalized host equals a registered `host_suffix` or is
+///      a subdomain of it (label boundary: `microsoft.com` is not `ft.com`).
+///   2. Marker gate: html must contain at least one of the entry's markers.
+///
+/// Both gates must pass. Pure function; no I/O.
+pub fn detect_in_html(host: &str, html: &str) -> Option<&'static PaywallSignature> {
+    let normalized = normalize_host(host);
+    PAYWALL_SIGNATURES.iter().find(|sig| {
+        // What is left of the suffix must be empty or end in a dot.
+        let host_ok = normalized
+            .strip_suffix(sig.host_suffix)
+            .is_some_and(|rest| rest.is_empty() || rest.ends_with('.'));
+        host_ok && sig.markers.iter().any(|m| html.contains(m))
+    })
+}
+
+/// Googlebot User-Agent string sent when `--paywall-bypass` is set. Some
+/// publishers serve full content to Googlebot for SEO indexing. Best-effort
+/// only: a publisher that verifies the request really originates from a
+/// Google-owned IP will not be swayed by this header alone.
+pub const GOOGLEBOT_USER_AGENT: &str =
+    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
+
+/// Format the stderr warning for a paywall detection. Default variant:
+///
+///   `# webclaw: warning: paywall detected on <name> (<host>); full article
+///    may not be accessible. Try --paywall-bypass or https://archive.is/<url>`
+///
+/// With `bypass_attempted=true` a different single-line message is built:
+/// it says the paywall is "still detected" after the Googlebot-UA attempt
+/// and offers only archive.is. `host` is normalized; `url` is used verbatim.
+pub fn format_warning(sig: &PaywallSignature, host: &str, url: &str, bypass_attempted: bool) -> String {
+    let normalized = normalize_host(host);
+    if bypass_attempted {
+        format!(
+            "# webclaw: warning: paywall still detected on {name} ({host}) after --paywall-bypass attempt (Googlebot UA); try https://archive.is/{url}",
+            name = sig.name,
+            host = normalized,
+            url = url,
+        )
+    } else {
+        format!(
+            "# webclaw: warning: paywall detected on {name} ({host}); full article may not be accessible. Try --paywall-bypass or https://archive.is/{url}",
+            name = sig.name,
+            host = normalized,
+            url = url,
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_signature_matches_nyt_marker_in_html() {
+        // vi-gateway-container is the NYT JS paywall gateway div, observed
+        // live on www.nytimes.com/<date>/<slug>.html pages (iter-11 phase B).
+        let html = r#"<html><body><div class="vi-gateway-container" data-testid="vi-gateway-container"></div></body></html>"#;
+        let hit = detect_in_html("www.nytimes.com", html).expect("nyt should match");
+        assert_eq!(hit.name, "New York Times");
+        assert_eq!(hit.host_suffix, "nytimes.com");
+    }
+
+    #[test]
+    fn test_signature_matches_nyt_jsonld_marker() {
+        // "isAccessibleForFree":false is the schema.org JSON-LD signal NYT
+        // emits in NewsArticle structured data for metered articles.
+        // Critically, ":true" (the inverse — free content) must NOT match.
+        let html = r#"<script type="application/ld+json">{"@type":"NewsArticle","isAccessibleForFree":false}</script>"#;
+        let hit = detect_in_html("www.nytimes.com", html).expect("nyt jsonld marker should match");
+        assert_eq!(hit.name, "New York Times");
+
+        // Negative: explicit "isAccessibleForFree":true must NOT fire.
+        let free_html = r#"<script>{"isAccessibleForFree":true}</script>"#;
+        assert!(detect_in_html("www.nytimes.com", free_html).is_none(),
+            "free articles must not trigger the paywall marker");
+    }
+
+    #[test]
+    fn test_signature_matches_nyt_subdomain() {
+        // Subdomain coverage: cooking.nytimes.com should match the nytimes.com suffix.
+        let html = r#"<div class="meteredContent">limit</div>"#;
+        let hit = detect_in_html("cooking.nytimes.com", html).expect("nyt subdomain should match");
+        assert_eq!(hit.name, "New York Times");
+    }
+
+    #[test]
+    fn test_signature_matches_wsj_marker() {
+        let html = r#"<div class="wsj-paywall snippet-promotion"></div>"#;
+        let hit = detect_in_html("www.wsj.com", html).expect("wsj should match");
+        assert_eq!(hit.name, "Wall Street Journal");
+    }
+
+    #[test]
+    fn test_signature_matches_ft_marker() {
+        let html = r#"<div class="js-paywall"></div>"#;
+        let hit = detect_in_html("www.ft.com", html).expect("ft should match");
+        assert_eq!(hit.name, "Financial Times");
+    }
+
+    #[test]
+    fn test_signature_matches_bloomberg_marker() {
+        let html = r#"<div class="paywall-inline-promo">subscribe</div>"#;
+        let hit = detect_in_html("www.bloomberg.com", html).expect("bloomberg should match");
+        assert_eq!(hit.name, "Bloomberg");
+    }
+
+    #[test]
+    fn test_signature_matches_substack_per_publisher_subdomain() {
+        // Substack publishers use <name>.substack.com — subdomain suffix coverage.
+        let html = r#"<div class="paywall-content">subscribe</div>"#;
+        let hit = detect_in_html("someblog.substack.com", html).expect("substack should match");
+        assert_eq!(hit.name, "Substack");
+    }
+
+    #[test]
+    fn test_signature_skips_clean_html() {
+        // example.com: not a registered host, and the html has no marker.
+        let clean_html = r#"<html><body><h1>Example Domain</h1><p>For use in examples.</p></body></html>"#;
+        assert!(detect_in_html("example.com", clean_html).is_none());
+        assert!(detect_in_html("www.example.com", clean_html).is_none());
+
+        // nytimes.com host but NO marker in html — host gate passes,
+        // marker gate fails. Must NOT fire.
+        assert!(detect_in_html("www.nytimes.com", clean_html).is_none());
+
+        // BBC: never registered. Must NOT fire even with the same generic html.
+        assert!(detect_in_html("www.bbc.com", clean_html).is_none());
+        // Reuters: never registered.
+        assert!(detect_in_html("www.reuters.com", clean_html).is_none());
+        // AP News: never registered.
+        assert!(detect_in_html("apnews.com", clean_html).is_none());
+    }
+
+    #[test]
+    fn test_signature_only_fires_for_correct_host() {
+        // CRITICAL false-positive sentinel: html carrying a publisher's
+        // marker must NOT trigger detection when the responding host is
+        // not that publisher's. The host gate is the first defense.
+        let html_with_nyt_marker =
+            r#"<div class="vi-gateway-container">subscribe</div>"#;
+        assert!(detect_in_html("example.com", html_with_nyt_marker).is_none());
+        assert!(detect_in_html("www.bbc.com", html_with_nyt_marker).is_none());
+        assert!(detect_in_html("apnews.com", html_with_nyt_marker).is_none());
+
+        // And cross-publisher: a WSJ marker should not match an NYT host.
+        let html_with_wsj_marker = r#"<div class="paywall-overlay"></div>"#;
+        assert!(detect_in_html("www.nytimes.com", html_with_wsj_marker).is_none());
+    }
+
+    #[test]
+    fn test_format_warning_message_shape() {
+        let sig = &PAYWALL_SIGNATURES[0]; // NYT
+        let msg = format_warning(sig, "www.nytimes.com", "https://www.nytimes.com/x", false);
+        assert!(msg.starts_with("# webclaw: warning: paywall detected on New York Times"), "msg: {msg}");
+        assert!(msg.contains("(nytimes.com)"), "normalized host expected: {msg}");
+        assert!(msg.contains("--paywall-bypass"), "bypass hint expected: {msg}");
+        assert!(msg.contains("https://archive.is/https://www.nytimes.com/x"), "archive.is suggestion expected: {msg}");
+    }
+
+    #[test]
+    fn test_format_warning_bypass_attempted_includes_archive_is() {
+        let sig = &PAYWALL_SIGNATURES[0]; // NYT
+        let msg = format_warning(sig, "www.nytimes.com", "https://www.nytimes.com/x", true);
+        assert!(msg.contains("after --paywall-bypass attempt"), "should note bypass attempt: {msg}");
+        assert!(msg.contains("Googlebot UA"), "should name the UA strategy: {msg}");
+        assert!(msg.contains("https://archive.is/https://www.nytimes.com/x"), "archive.is suggestion expected: {msg}");
+    }
+
+    #[test]
+    fn test_googlebot_ua_constant() {
+        // Sanity-check the shape of the constant that the CLI test
+        // `paywall_bypass_injects_googlebot_ua` compares against.
+        assert!(GOOGLEBOT_USER_AGENT.contains("Googlebot/2.1"));
+        assert!(GOOGLEBOT_USER_AGENT.starts_with("Mozilla/5.0"));
+    }
+
+    #[test]
+    fn test_normalize_host_lowercases_and_strips_www() {
+        // Belt-and-braces: even if upstream code passes a non-normalized
+        // host, detection still works.
+        let html = r#"<div class="vi-gateway-container"></div>"#;
+        assert!(detect_in_html("WWW.NYTIMES.COM", html).is_some(), "all-caps with www should match");
+        assert!(detect_in_html("NYTimes.com", html).is_some(), "mixed-case should match");
+    }
+}