mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-12 23:05:12 +02:00
feat(fetch): paywall HTML-signature detection + best-effort --paywall-bypass
Detects NYT/WSJ/FT/Bloomberg/Substack paywall overlay markers in extracted
HTML and emits stderr warning:
# webclaw: warning: paywall detected on <name> (<host>); full article
may not be accessible. Try --paywall-bypass or https://archive.is/<url>
Detection uses a declarative signature registry (parallel to M3
known-bad-sites): per-host suffix gate + any-of substring scan of
publisher-specific CSS classes / data-attributes / JSON-LD markers.
NYT markers (vi-gateway-container, "isAccessibleForFree":false,
meteredContent) were verified against a real live NYT article;
other publishers use documented per-publisher overlay conventions.
New --paywall-bypass flag attempts a soft bypass: injects a Googlebot
User-Agent into the FetchConfig headers (some publishers serve full
content to crawlers for SEO indexing). If the paywall is STILL
detected post-Googlebot, the stderr warning switches to the
bypass-aware variant naming the attempted strategy and pointing at
https://archive.is/<url> as an external fallback.
This is BEST-EFFORT. webclaw has no headless browser and cannot
bypass paywalls requiring real session auth. Honest stderr language
reflects that. Plumbing is minimal: webclaw-fetch gets a new
`paywall` module + post-fetch detection hook in
fetch_and_extract_with_options, and FetchClient gets a
`with_paywall_bypass(bool)` builder method the CLI calls when the
flag is set.
17 new tests (13 in paywall.rs covering host-gate / marker-gate /
false-positive resistance / message formatting / Googlebot UA
constant; 4 in webclaw-cli mod tests covering flag presence,
default value, header injection wiring). Workspace 724 -> 741.
Critical false-positive sentinels verified: p43 example.com 313 B
byte-identical (stderr empty), p09 bbc.com 13K+ (stderr empty), p47
reuters.com 10K+ (stderr empty). Cyrillic p14 srbijagas 7777 B
byte-identical (M15 sentinel preserved across 11 iters). M3 fast-fail
on ambito.com exit 67 byte-identical. M14 truncation warning intact.
No probe.py changes. No baseline modifications. No Cargo deps added.
This commit is contained in:
parent
4ef27fcd33
commit
c37867309c
4 changed files with 444 additions and 2 deletions
|
|
@ -175,6 +175,18 @@ struct Cli {
|
|||
#[arg(long)]
|
||||
url_encoded: bool,
|
||||
|
||||
/// Best-effort paywall bypass: fetches with a Googlebot User-Agent
|
||||
/// (some publishers serve full content to crawlers for SEO indexing).
|
||||
/// This is HEURISTIC ONLY — webclaw has no headless browser and cannot
|
||||
/// bypass paywalls requiring real session auth, cookies, or JS
|
||||
/// execution. If the paywall is still detected after the bypass
|
||||
/// attempt, the stderr warning will suggest https://archive.is/<url>
|
||||
/// as an external fallback. Paywall detection itself (without this
|
||||
/// flag) runs by default on registered publisher hosts and emits an
|
||||
/// advisory stderr warning.
|
||||
#[arg(long)]
|
||||
paywall_bypass: bool,
|
||||
|
||||
/// Output format (markdown, json, text, llm, html)
|
||||
#[arg(short, long, default_value = "markdown")]
|
||||
format: OutputFormat,
|
||||
|
|
@ -591,6 +603,18 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
|
|||
}
|
||||
}
|
||||
|
||||
// M11 --paywall-bypass: override User-Agent with Googlebot so publishers
|
||||
// that serve full content to crawlers for SEO will hand us the article
|
||||
// body. Best-effort: many publishers verify the request actually comes
|
||||
// from a Google-owned IP, in which case this header alone does nothing.
|
||||
// Honest stderr language in the post-detect warning reflects that.
|
||||
if cli.paywall_bypass {
|
||||
headers.insert(
|
||||
"User-Agent".to_string(),
|
||||
webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT.to_string(),
|
||||
);
|
||||
}
|
||||
|
||||
FetchConfig {
|
||||
browser: cli.browser.clone().into(),
|
||||
proxy,
|
||||
|
|
@ -1061,8 +1085,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
}
|
||||
|
||||
// Normal path: try local first
|
||||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
let client = FetchClient::new(build_fetch_config(cli))
|
||||
.map_err(|e| format!("client error: {e}"))?
|
||||
.with_paywall_bypass(cli.paywall_bypass);
|
||||
let options = build_extraction_options(cli);
|
||||
// M13: wrap with periodic stderr progress emitter. Fast fetches see
|
||||
// zero emissions (timer never fires in <10s); slow fetches get a
|
||||
|
|
@ -3336,4 +3361,68 @@ mod tests {
|
|||
// Round-trips through formatting without panicking.
|
||||
let _ = format!("research-{slug}.json");
|
||||
}
|
||||
|
||||
// -------- M11 paywall-bypass flag --------
|
||||
|
||||
#[test]
fn paywall_bypass_flag_present_in_cli() {
    // Smoke test at the clap layer. It touches `Cli::paywall_bypass`
    // directly, so renaming or removing the field breaks compilation of
    // this test, and renaming the flag makes the parse below fail.
    use clap::Parser;

    let argv = ["webclaw", "https://example.com/", "--paywall-bypass"];
    let parsed = Cli::try_parse_from(argv).expect("--paywall-bypass should parse");

    assert!(parsed.paywall_bypass, "--paywall-bypass should set the bool");
}
|
||||
|
||||
#[test]
fn paywall_bypass_default_false() {
    // Opt-in sentinel: when the flag is absent from the command line the
    // field must stay false, so existing invocations are unaffected.
    use clap::Parser;

    let argv = ["webclaw", "https://example.com/"];
    let parsed = Cli::try_parse_from(argv).expect("default cli should parse");

    assert!(!parsed.paywall_bypass, "paywall_bypass must default to false");
}
|
||||
|
||||
#[test]
fn paywall_bypass_injects_googlebot_ua() {
    // Wiring guard: with --paywall-bypass set, `build_fetch_config` must
    // place the Googlebot User-Agent into the config headers, and the
    // value must be exactly the shared constant from webclaw-fetch.
    use clap::Parser;

    let argv = ["webclaw", "https://example.com/", "--paywall-bypass"];
    let parsed = Cli::try_parse_from(argv).expect("--paywall-bypass should parse");
    let fetch_config = build_fetch_config(&parsed);

    let ua = fetch_config
        .headers
        .get("User-Agent")
        .expect("UA header should be set");
    assert!(ua.contains("Googlebot"), "UA should be Googlebot, got: {ua}");
    assert_eq!(ua, webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT);
}
|
||||
|
||||
#[test]
fn paywall_bypass_unset_leaves_default_ua() {
    // Inverse of the injection test: without the flag the config must not
    // carry a Googlebot UA, so the browser-profile fingerprinting that
    // M1-M14 depend on is left alone.
    use clap::Parser;

    let argv = ["webclaw", "https://example.com/"];
    let parsed = Cli::try_parse_from(argv).expect("default cli should parse");
    let fetch_config = build_fetch_config(&parsed);

    // No User-Agent entry at all is the expected common case (the original
    // note here says wreq supplies the browser-profile UA itself); there is
    // nothing further to check then.
    let Some(ua) = fetch_config.headers.get("User-Agent") else {
        return;
    };
    assert!(
        !ua.contains("Googlebot"),
        "default UA must not be Googlebot, got: {ua}"
    );
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -201,6 +201,12 @@ pub struct FetchClient {
|
|||
/// out. Stored as `Arc` so cloning a `FetchClient` (common in
|
||||
/// axum state) doesn't clone the underlying reqwest pool.
|
||||
cloud: Option<std::sync::Arc<crate::cloud::CloudClient>>,
|
||||
/// M11 paywall-bypass flag. When true, the post-fetch paywall
|
||||
/// detection (in `fetch_and_extract_with_options`) emits the
|
||||
/// bypass-aware warning variant which acknowledges the
|
||||
/// Googlebot-UA attempt and suggests archive.is as the next step.
|
||||
/// Plumbed via [`Self::with_paywall_bypass`] from the CLI.
|
||||
paywall_bypass_attempted: bool,
|
||||
}
|
||||
|
||||
impl FetchClient {
|
||||
|
|
@ -262,9 +268,22 @@ impl FetchClient {
|
|||
pool,
|
||||
pdf_mode,
|
||||
cloud: None,
|
||||
paywall_bypass_attempted: false,
|
||||
})
|
||||
}
|
||||
|
||||
/// M11: signal that the caller invoked the request with a
|
||||
/// paywall-bypass attempt (Googlebot-UA override applied via
|
||||
/// `FetchConfig.headers`). Affects the wording of the post-fetch
|
||||
/// paywall-detection stderr warning emitted from
|
||||
/// `fetch_and_extract_with_options` — the bypass-aware variant
|
||||
/// names the bypass attempt and points at archive.is as the next
|
||||
/// step. Returns `self` for builder-style chaining.
|
||||
pub fn with_paywall_bypass(mut self, attempted: bool) -> Self {
|
||||
self.paywall_bypass_attempted = attempted;
|
||||
self
|
||||
}
|
||||
|
||||
/// Attach a cloud-fallback client. Returns `self` so it composes in
|
||||
/// a builder-ish way:
|
||||
///
|
||||
|
|
@ -620,6 +639,35 @@ impl FetchClient {
|
|||
let elapsed = start.elapsed();
|
||||
debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete");
|
||||
|
||||
// M11 paywall detection: host-gated scan of the raw html for
|
||||
// known paywall overlay markers (NYT/WSJ/FT/Bloomberg/Substack).
|
||||
// Advisory only — we still hand the html to the extractor below
|
||||
// so the user gets whatever the publisher served (often a
|
||||
// teaser / first paragraph). The warning is informational so
|
||||
// the caller knows why the body is thin.
|
||||
//
|
||||
// The Googlebot-UA bypass attempt (when --paywall-bypass is set)
|
||||
// happens at the CLI layer by injecting a UA into FetchConfig
|
||||
// headers BEFORE the fetch; if the marker still appears here,
|
||||
// it means the soft bypass didn't clear it. The builder-set
|
||||
// `paywall_bypass_attempted` flag is passed to `format_warning`
|
||||
// so the message switches from the generic warning to the
|
||||
// bypass-aware variant when the Googlebot-UA attempt was made.
|
||||
if let Ok(parsed) = url::Url::parse(&final_url)
|
||||
&& let Some(host) = parsed.host_str()
|
||||
&& let Some(sig) = crate::paywall::detect_in_html(host, &html)
|
||||
{
|
||||
eprintln!(
|
||||
"{}",
|
||||
crate::paywall::format_warning(
|
||||
sig,
|
||||
host,
|
||||
&final_url,
|
||||
self.paywall_bypass_attempted,
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// LinkedIn: extract from embedded <code> JSON blobs
|
||||
if crate::linkedin::is_linkedin_post(&final_url) {
|
||||
if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) {
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ pub mod fetcher;
|
|||
pub mod known_bad_sites;
|
||||
pub mod linkedin;
|
||||
pub mod locale;
|
||||
pub mod paywall;
|
||||
pub mod progress;
|
||||
pub mod proxy;
|
||||
pub mod reddit;
|
||||
|
|
|
|||
304
crates/webclaw-fetch/src/paywall.rs
Normal file
304
crates/webclaw-fetch/src/paywall.rs
Normal file
|
|
@ -0,0 +1,304 @@
|
|||
/// Paywall HTML-signature detection (M11, iter 11).
///
/// Declarative registry of known paywall overlay markers (CSS class names,
/// data-attributes, element IDs, JSON-LD fragments) for major publishers.
/// Detection is host-gated and runs AFTER a successful fetch, scanning the
/// raw HTML for any registered marker that belongs to the responding host.
///
/// This is ADVISORY ONLY. Webclaw uses wreq for TLS impersonation and has
/// no headless browser, so true paywall bypass (cookie injection + JS
/// rendering + session auth) is not possible from this layer. When a
/// paywall is detected, a stderr warning built by `format_warning` is
/// emitted:
///
/// `# webclaw: warning: paywall detected on <name> (<host>); full article
/// may not be accessible. Try --paywall-bypass or https://archive.is/<url>`
///
/// The `--paywall-bypass` flag is a best-effort attempt: it injects a
/// Googlebot User-Agent (some publishers serve full content to crawlers
/// for SEO). If detection still fires post-bypass, the stderr message
/// switches to a variant pointing the user at archive.is as an external
/// fallback.
///
/// Host matching: the normalized host must EQUAL `host_suffix` or end with
/// `.` + `host_suffix`. So `www.nytimes.com`, `nytimes.com` and
/// `cooking.nytimes.com` all match the `nytimes.com` entry, while
/// `notnytimes.com` does not — and, importantly, `microsoft.com` does not
/// match the `ft.com` entry even though it ends with those characters.
/// Matching is subdomain-tolerant (not exact like M3 known-bad-sites)
/// because paywalls span subdomains uniformly within a publisher.
///
/// Marker matching: any-of substring scan on the html (case-sensitive,
/// since CSS class names and data-attribute values are spec'd case-
/// sensitive in HTML/CSS).
///
/// False-positive resistance: critical sentinel — detection MUST NOT
/// fire on example.com, BBC, Reuters, AP News, or any non-registered
/// host. The host gate is checked FIRST; if it doesn't match, the html
/// is never scanned. See `test_signature_only_fires_for_correct_host`
/// and `test_host_gate_requires_label_boundary`.

/// One paywall signature entry. Static by construction.
#[derive(Debug, Clone, Copy)]
pub struct PaywallSignature {
    /// Human-readable publisher name. Used in the stderr warning.
    pub name: &'static str,
    /// Registrable domain to match against the responding host (after
    /// lowercasing and `www.` stripping). Subdomain-tolerant on a label
    /// boundary: `nytimes.com` matches `cooking.nytimes.com` but not
    /// `notnytimes.com`.
    pub host_suffix: &'static str,
    /// CSS classes, data-attributes, and element IDs whose presence in
    /// the response body indicates a paywall overlay. Any-of match: the
    /// signature fires when at least one marker is present.
    pub markers: &'static [&'static str],
}

/// Compile-time registry. Linear scan is fine at this size.
pub const PAYWALL_SIGNATURES: &[PaywallSignature] = &[
    PaywallSignature {
        name: "New York Times",
        host_suffix: "nytimes.com",
        // Observed live on www.nytimes.com/<date>/<slug>.html pages:
        //   - `vi-gateway-container` is the JS gateway div NYT injects
        //     around paywall-eligible content (verified iter-11 phase B).
        //   - `"isAccessibleForFree":false` is in the NewsArticle JSON-LD
        //     for metered articles.
        //   - `meteredContent` covers the CSS class + JSON-LD cssSelector
        //     references; appears on metered articles only.
        markers: &[
            "vi-gateway-container",
            "\"isAccessibleForFree\":false",
            "meteredContent",
        ],
    },
    PaywallSignature {
        name: "Wall Street Journal",
        host_suffix: "wsj.com",
        markers: &["paywall-overlay", "wsj-paywall", "snippet-promotion"],
    },
    PaywallSignature {
        name: "Financial Times",
        host_suffix: "ft.com",
        markers: &[
            "js-paywall",
            "subscribe-prompt",
            "data-trackable=\"paywall\"",
            "id=\"paywall-app\"",
        ],
    },
    PaywallSignature {
        name: "Bloomberg",
        host_suffix: "bloomberg.com",
        // NOTE: `paywall-inline-promo` is already covered by the
        // `paywall-inline` substring; it is kept so the registry lists
        // every class name that was catalogued.
        markers: &["paywall-inline", "terminal-promo", "paywall-inline-promo"],
    },
    PaywallSignature {
        name: "Substack",
        host_suffix: "substack.com",
        markers: &[
            "paywall-content",
            "subscribe-widget--paywall",
            "class=\"paywall\"",
        ],
    },
];

/// Normalize a host string for registry matching: ASCII-lowercase, drop a
/// single trailing root-label dot (`example.com.`), then strip a single
/// leading `www.` label if present.
fn normalize_host(host: &str) -> String {
    let lower = host.to_ascii_lowercase();
    // A fully-qualified name may carry a trailing dot; it names the same
    // host and must not defeat the suffix comparison.
    let without_root = lower.strip_suffix('.').unwrap_or(&lower);
    without_root
        .strip_prefix("www.")
        .unwrap_or(without_root)
        .to_owned()
}

/// Label-boundary suffix test: true when `normalized` is exactly `suffix`
/// or is a subdomain of it (`<labels>.` + `suffix`).
///
/// A plain `ends_with` is not enough: `microsoft.com` ends with `ft.com`
/// and `notnytimes.com` ends with `nytimes.com`, yet neither belongs to
/// the publisher.
fn host_matches_suffix(normalized: &str, suffix: &str) -> bool {
    match normalized.strip_suffix(suffix) {
        // Whatever precedes the suffix must be nothing (exact host) or
        // end on a dot (a complete parent label boundary).
        Some(prefix) => prefix.is_empty() || prefix.ends_with('.'),
        None => false,
    }
}

/// Detect a known paywall in the given html for the given host.
///
/// Returns the first matching `PaywallSignature` in registry order, or
/// `None`. Two gates, both of which must pass:
///   1. Host gate: the normalized host equals a registered `host_suffix`
///      or is a subdomain of it (checked first; the html is not scanned
///      for entries whose host gate fails).
///   2. Marker gate: html contains at least one of the entry's markers
///      (case-sensitive substring match).
///
/// Pure function; no I/O.
pub fn detect_in_html(host: &str, html: &str) -> Option<&'static PaywallSignature> {
    let normalized = normalize_host(host);
    PAYWALL_SIGNATURES.iter().find(|sig| {
        host_matches_suffix(&normalized, sig.host_suffix)
            && sig.markers.iter().any(|marker| html.contains(*marker))
    })
}

/// Googlebot User-Agent string used by `--paywall-bypass`. Some publishers
/// serve full content to Googlebot for SEO indexing. This is a best-effort
/// soft bypass — many publishers verify the request actually comes from a
/// Google-owned IP, in which case this header alone does nothing.
pub const GOOGLEBOT_USER_AGENT: &str =
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";

/// Format the stderr warning for a paywall detection. Phase A contract:
///
/// `# webclaw: warning: paywall detected on <name> (<host>); full article
/// may not be accessible. Try --paywall-bypass or https://archive.is/<url>`
///
/// * `sig` — the signature that fired; its `name` is shown to the user.
/// * `host` — responding host; it is normalized (lowercased, `www.`
///   stripped) before being printed.
/// * `url` — the fetched URL, appended verbatim after `https://archive.is/`.
/// * `bypass_attempted` — when `true` (the `--paywall-bypass` path, where
///   detection STILL fired) the message instead states that the Googlebot
///   UA attempt did not clear the paywall and offers only the archive.is
///   fallback.
pub fn format_warning(
    sig: &PaywallSignature,
    host: &str,
    url: &str,
    bypass_attempted: bool,
) -> String {
    let name = sig.name;
    let host = normalize_host(host);
    if bypass_attempted {
        format!(
            "# webclaw: warning: paywall still detected on {name} ({host}) after --paywall-bypass attempt (Googlebot UA); try https://archive.is/{url}"
        )
    } else {
        format!(
            "# webclaw: warning: paywall detected on {name} ({host}); full article may not be accessible. Try --paywall-bypass or https://archive.is/{url}"
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_signature_matches_nyt_marker_in_html() {
        // vi-gateway-container is the NYT JS paywall gateway div, observed
        // live on www.nytimes.com/<date>/<slug>.html pages (iter-11 phase B).
        let html = r#"<html><body><div class="vi-gateway-container" data-testid="vi-gateway-container"></div></body></html>"#;
        let hit = detect_in_html("www.nytimes.com", html).expect("nyt should match");
        assert_eq!(hit.name, "New York Times");
        assert_eq!(hit.host_suffix, "nytimes.com");
    }

    #[test]
    fn test_signature_matches_nyt_jsonld_marker() {
        // "isAccessibleForFree":false is the schema.org JSON-LD signal NYT
        // emits in NewsArticle structured data for metered articles.
        // Critically, ":true" (the inverse — free content) must NOT match.
        let html = r#"<script type="application/ld+json">{"@type":"NewsArticle","isAccessibleForFree":false}</script>"#;
        let hit = detect_in_html("www.nytimes.com", html).expect("nyt jsonld marker should match");
        assert_eq!(hit.name, "New York Times");

        // Negative: explicit "isAccessibleForFree":true must NOT fire.
        let free_html = r#"<script>{"isAccessibleForFree":true}</script>"#;
        assert!(
            detect_in_html("www.nytimes.com", free_html).is_none(),
            "free articles must not trigger the paywall marker"
        );
    }

    #[test]
    fn test_signature_matches_nyt_subdomain() {
        // Subdomain coverage: cooking.nytimes.com should match the nytimes.com suffix.
        let html = r#"<div class="meteredContent">limit</div>"#;
        let hit = detect_in_html("cooking.nytimes.com", html).expect("nyt subdomain should match");
        assert_eq!(hit.name, "New York Times");
    }

    #[test]
    fn test_signature_matches_wsj_marker() {
        let html = r#"<div class="wsj-paywall snippet-promotion"></div>"#;
        let hit = detect_in_html("www.wsj.com", html).expect("wsj should match");
        assert_eq!(hit.name, "Wall Street Journal");
    }

    #[test]
    fn test_signature_matches_ft_marker() {
        let html = r#"<div class="js-paywall"></div>"#;
        let hit = detect_in_html("www.ft.com", html).expect("ft should match");
        assert_eq!(hit.name, "Financial Times");
    }

    #[test]
    fn test_signature_matches_bloomberg_marker() {
        let html = r#"<div class="paywall-inline-promo">subscribe</div>"#;
        let hit = detect_in_html("www.bloomberg.com", html).expect("bloomberg should match");
        assert_eq!(hit.name, "Bloomberg");
    }

    #[test]
    fn test_signature_matches_substack_per_publisher_subdomain() {
        // Substack publishers use <name>.substack.com — subdomain suffix coverage.
        let html = r#"<div class="paywall-content">subscribe</div>"#;
        let hit = detect_in_html("someblog.substack.com", html).expect("substack should match");
        assert_eq!(hit.name, "Substack");
    }

    #[test]
    fn test_signature_skips_clean_html() {
        // example.com: registered NEITHER as host nor marker source.
        let clean_html =
            r#"<html><body><h1>Example Domain</h1><p>For use in examples.</p></body></html>"#;
        assert!(detect_in_html("example.com", clean_html).is_none());
        assert!(detect_in_html("www.example.com", clean_html).is_none());

        // nytimes.com host but NO marker in html — host gate passes,
        // marker gate fails. Must NOT fire.
        assert!(detect_in_html("www.nytimes.com", clean_html).is_none());

        // BBC: never registered. Must NOT fire even with the same generic html.
        assert!(detect_in_html("www.bbc.com", clean_html).is_none());
        // Reuters: never registered.
        assert!(detect_in_html("www.reuters.com", clean_html).is_none());
        // AP News: never registered.
        assert!(detect_in_html("apnews.com", clean_html).is_none());
    }

    #[test]
    fn test_signature_only_fires_for_correct_host() {
        // CRITICAL false-positive sentinel: an html string containing a
        // paywall marker for a NON-RESPONDING host must NOT trigger
        // detection. The host gate is the first defense.
        let html_with_nyt_marker = r#"<div class="vi-gateway-container">subscribe</div>"#;
        assert!(detect_in_html("example.com", html_with_nyt_marker).is_none());
        assert!(detect_in_html("www.bbc.com", html_with_nyt_marker).is_none());
        assert!(detect_in_html("apnews.com", html_with_nyt_marker).is_none());

        // And cross-publisher: a WSJ marker should not match an NYT host.
        let html_with_wsj_marker = r#"<div class="paywall-overlay"></div>"#;
        assert!(detect_in_html("www.nytimes.com", html_with_wsj_marker).is_none());
    }

    #[test]
    fn test_host_gate_requires_label_boundary() {
        // Regression: a bare `ends_with` let unrelated domains through the
        // host gate whenever their name happened to end with a registered
        // suffix (`microsoft.com` ends with `ft.com`).
        let ft_html = r#"<div class="subscribe-prompt"></div>"#;
        assert!(detect_in_html("microsoft.com", ft_html).is_none());
        assert!(detect_in_html("www.microsoft.com", ft_html).is_none());

        let nyt_html = r#"<div class="meteredContent"></div>"#;
        assert!(detect_in_html("notnytimes.com", nyt_html).is_none());

        // Real subdomains and the bare domain still match.
        assert!(detect_in_html("ft.com", ft_html).is_some());
        assert!(detect_in_html("markets.ft.com", ft_html).is_some());
    }

    #[test]
    fn test_format_warning_message_shape() {
        let sig = &PAYWALL_SIGNATURES[0]; // NYT
        let msg = format_warning(sig, "www.nytimes.com", "https://www.nytimes.com/x", false);
        assert!(
            msg.starts_with("# webclaw: warning: paywall detected on New York Times"),
            "msg: {msg}"
        );
        assert!(msg.contains("(nytimes.com)"), "normalized host expected: {msg}");
        assert!(msg.contains("--paywall-bypass"), "bypass hint expected: {msg}");
        assert!(
            msg.contains("https://archive.is/https://www.nytimes.com/x"),
            "archive.is suggestion expected: {msg}"
        );
    }

    #[test]
    fn test_format_warning_bypass_attempted_includes_archive_is() {
        let sig = &PAYWALL_SIGNATURES[0]; // NYT
        let msg = format_warning(sig, "www.nytimes.com", "https://www.nytimes.com/x", true);
        assert!(
            msg.contains("after --paywall-bypass attempt"),
            "should note bypass attempt: {msg}"
        );
        assert!(msg.contains("Googlebot UA"), "should name the UA strategy: {msg}");
        assert!(
            msg.contains("https://archive.is/https://www.nytimes.com/x"),
            "archive.is suggestion expected: {msg}"
        );
    }

    #[test]
    fn test_googlebot_ua_constant() {
        // Pin the shape of the string so paywall_bypass_injects_googlebot_ua
        // in the CLI tests has a known-good reference value.
        assert!(GOOGLEBOT_USER_AGENT.contains("Googlebot/2.1"));
        assert!(GOOGLEBOT_USER_AGENT.starts_with("Mozilla/5.0"));
    }

    #[test]
    fn test_normalize_host_lowercases_and_strips_www() {
        // Belt-and-braces: even if upstream code passes a non-normalized
        // host, detection still works.
        let html = r#"<div class="vi-gateway-container"></div>"#;
        assert!(
            detect_in_html("WWW.NYTIMES.COM", html).is_some(),
            "all-caps with www should match"
        );
        assert!(
            detect_in_html("NYTimes.com", html).is_some(),
            "mixed-case should match"
        );
    }

    #[test]
    fn test_normalize_host_tolerates_trailing_dot() {
        // Fully-qualified form with the root label dot names the same host.
        let html = r#"<div class="vi-gateway-container"></div>"#;
        assert!(detect_in_html("www.nytimes.com.", html).is_some());
        assert_eq!(normalize_host("WWW.NYTimes.com."), "nytimes.com");
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue