mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-16 23:45:13 +02:00
feat(fetch): paywall HTML-signature detection + best-effort --paywall-bypass
Detects NYT/WSJ/FT/Bloomberg/Substack paywall overlay markers in extracted
HTML and emits a stderr warning:
# webclaw: warning: paywall detected on <name> (<host>); full article
may not be accessible. Try --paywall-bypass or https://archive.is/<url>
Detection uses a declarative signature registry (parallel to M3
known-bad-sites): per-host suffix gate + any-of substring scan of
publisher-specific CSS classes / data-attributes / JSON-LD markers.
NYT markers (vi-gateway-container, "isAccessibleForFree":false,
meteredContent) were verified against a real live NYT article;
other publishers use documented per-publisher overlay conventions.
New --paywall-bypass flag attempts a soft bypass: injects a Googlebot
User-Agent into the FetchConfig headers (some publishers serve full
content to crawlers for SEO indexing). If the paywall is STILL
detected post-Googlebot, the stderr warning switches to the
bypass-aware variant naming the attempted strategy and pointing at
https://archive.is/<url> as an external fallback.
This is BEST-EFFORT. webclaw has no headless browser and cannot
bypass paywalls requiring real session auth. Honest stderr language
reflects that. Plumbing is minimal: webclaw-fetch gets a new
`paywall` module + post-fetch detection hook in
fetch_and_extract_with_options, and FetchClient gets a
`with_paywall_bypass(bool)` builder method the CLI calls when the
flag is set.
17 new tests (13 in paywall.rs covering host-gate / marker-gate /
false-positive resistance / message formatting / Googlebot UA
constant; 4 in webclaw-cli mod tests covering flag presence,
default value, header injection wiring, default-UA preservation). Workspace 724 -> 741.
Critical false-positive sentinels verified: p43 example.com 313 B
byte-identical (stderr empty), p09 bbc.com 13K+ (stderr empty), p47
reuters.com 10K+ (stderr empty). Cyrillic p14 srbijagas 7777 B
byte-identical (M15 sentinel preserved across 11 iters). M3 fast-fail
on ambito.com exit 67 byte-identical. M14 truncation warning intact.
No probe.py changes. No baseline modifications. No Cargo deps added.
This commit is contained in:
parent
4ef27fcd33
commit
c37867309c
4 changed files with 444 additions and 2 deletions
|
|
@@ -175,6 +175,18 @@ struct Cli {
|
|||
#[arg(long)]
|
||||
url_encoded: bool,
|
||||
|
||||
/// Best-effort paywall bypass: re-fetches with a Googlebot User-Agent
|
||||
/// (some publishers serve full content to crawlers for SEO indexing).
|
||||
/// This is HEURISTIC ONLY — webclaw has no headless browser and cannot
|
||||
/// bypass paywalls requiring real session auth, cookies, or JS
|
||||
/// execution. If the paywall is still detected after the bypass
|
||||
/// attempt, the stderr warning will suggest https://archive.is/<url>
|
||||
/// as an external fallback. Paywall detection itself (without this
|
||||
/// flag) runs by default on registered publisher hosts and emits an
|
||||
/// advisory stderr warning.
|
||||
#[arg(long)]
|
||||
paywall_bypass: bool,
|
||||
|
||||
/// Output format (markdown, json, text, llm, html)
|
||||
#[arg(short, long, default_value = "markdown")]
|
||||
format: OutputFormat,
|
||||
|
|
@@ -591,6 +603,18 @@ fn build_fetch_config(cli: &Cli) -> FetchConfig {
|
|||
}
|
||||
}
|
||||
|
||||
// M11 --paywall-bypass: override User-Agent with Googlebot so publishers
|
||||
// that serve full content to crawlers for SEO will hand us the article
|
||||
// body. Best-effort: many publishers verify the request actually comes
|
||||
// from a Google-owned IP, in which case this header alone does nothing.
|
||||
// Honest stderr language in the post-detect warning reflects that.
|
||||
if cli.paywall_bypass {
|
||||
headers.insert(
|
||||
"User-Agent".to_string(),
|
||||
webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT.to_string(),
|
||||
);
|
||||
}
|
||||
|
||||
FetchConfig {
|
||||
browser: cli.browser.clone().into(),
|
||||
proxy,
|
||||
|
|
@@ -1061,8 +1085,9 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
|
|||
}
|
||||
|
||||
// Normal path: try local first
|
||||
let client =
|
||||
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
|
||||
let client = FetchClient::new(build_fetch_config(cli))
|
||||
.map_err(|e| format!("client error: {e}"))?
|
||||
.with_paywall_bypass(cli.paywall_bypass);
|
||||
let options = build_extraction_options(cli);
|
||||
// M13: wrap with periodic stderr progress emitter. Fast fetches see
|
||||
// zero emissions (timer never fires in <10s); slow fetches get a
|
||||
|
|
@@ -3336,4 +3361,68 @@ mod tests {
|
|||
// Round-trips through formatting without panicking.
|
||||
let _ = format!("research-{slug}.json");
|
||||
}
|
||||
|
||||
// -------- M11 paywall-bypass flag --------
|
||||
|
||||
#[test]
|
||||
fn paywall_bypass_flag_present_in_cli() {
|
||||
// clap-level smoke: the flag parses and the field is reachable
|
||||
// from the Cli struct. If the flag was renamed/removed this test
|
||||
// fails to compile, which is the intended sentinel.
|
||||
use clap::Parser;
|
||||
let cli = Cli::try_parse_from([
|
||||
"webclaw",
|
||||
"https://example.com/",
|
||||
"--paywall-bypass",
|
||||
])
|
||||
.expect("--paywall-bypass should parse");
|
||||
assert!(cli.paywall_bypass, "--paywall-bypass should set the bool");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn paywall_bypass_default_false() {
|
||||
// Sentinel: the flag is opt-in only. Default behavior must be
|
||||
// unchanged on all existing probes.
|
||||
use clap::Parser;
|
||||
let cli = Cli::try_parse_from(["webclaw", "https://example.com/"])
|
||||
.expect("default cli should parse");
|
||||
assert!(!cli.paywall_bypass, "paywall_bypass must default to false");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn paywall_bypass_injects_googlebot_ua() {
|
||||
// The build_fetch_config path inserts the Googlebot UA header
|
||||
// when cli.paywall_bypass is set. This guards against accidental
|
||||
// removal of the header-injection wiring.
|
||||
use clap::Parser;
|
||||
let cli = Cli::try_parse_from([
|
||||
"webclaw",
|
||||
"https://example.com/",
|
||||
"--paywall-bypass",
|
||||
])
|
||||
.expect("--paywall-bypass should parse");
|
||||
let config = build_fetch_config(&cli);
|
||||
let ua = config.headers.get("User-Agent").expect("UA header should be set");
|
||||
assert!(ua.contains("Googlebot"), "UA should be Googlebot, got: {ua}");
|
||||
assert_eq!(ua, webclaw_fetch::paywall::GOOGLEBOT_USER_AGENT);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn paywall_bypass_unset_leaves_default_ua() {
|
||||
// Without the flag, build_fetch_config must NOT inject the
|
||||
// Googlebot UA — preserves browser-profile fingerprinting that
|
||||
// M1-M14 depend on.
|
||||
use clap::Parser;
|
||||
let cli = Cli::try_parse_from(["webclaw", "https://example.com/"])
|
||||
.expect("default cli should parse");
|
||||
let config = build_fetch_config(&cli);
|
||||
// Either UA header is absent (most common; wreq supplies the
|
||||
// browser-profile UA via its emulation profile) OR it's not Googlebot.
|
||||
if let Some(ua) = config.headers.get("User-Agent") {
|
||||
assert!(
|
||||
!ua.contains("Googlebot"),
|
||||
"default UA must not be Googlebot, got: {ua}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue