From 920d71f5615c421875892eb9332b8d472dc1cad9 Mon Sep 17 00:00:00 2001 From: devnen Date: Sat, 16 May 2026 18:55:28 +0200 Subject: [PATCH] Strip more llm output noise: consent walls, bare integers, JSON-LD body duplication - detect_empty: add ConsentWall variant for GDPR/cookie redirects (Yahoo, Google, EU news sites). Detect via final-URL host match (consent.*, /consent/, collectConsent) and warn with proxy/cookie remediation hint. - init_logging: silence html5ever / markup5ever / selectors WARNs by default (foster-parenting messages from malformed real-world HTML pollute stderr with dozens of lines per fetch; override via WEBCLAW_LOG). - cleanup: add strip_bare_number_lines for paragraphs that are just a short integer (news-index comment counts, page numbers); make is_ui_control_token case-insensitive and extend UI_CONTROLS with pagination chrome (next, prev, previous, older, newer) plus bare <=4-digit integers so '0 Next'-style glued lines are caught. - links: drop bare-integer link labels and #comment-stream / #comments / #disqus hrefs from the deduplicated Links section. - mod: scrub articleBody / body / text / description fields from JSON-LD structured-data emission when they would duplicate the rendered markdown body (always for articleBody; conditional >=500 chars for the others). All 292 core tests pass. --- crates/webclaw-cli/src/main.rs | 73 ++++++++++++++++++++++++-- crates/webclaw-core/src/llm/body.rs | 12 +++++ crates/webclaw-core/src/llm/cleanup.rs | 68 +++++++++++++++++++++++- crates/webclaw-core/src/llm/links.rs | 15 ++++++ crates/webclaw-core/src/llm/mod.rs | 47 ++++++++++++++++- 5 files changed, 210 insertions(+), 5 deletions(-) diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index a45bce8..3b32526 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -35,10 +35,38 @@ const ANTIBOT_TITLES: &[&str] = &[ "ddos protection", ]; -/// Detect why a page returned empty content. 
+/// URL host/path fragments that indicate a GDPR/cookie consent redirect. +/// Yahoo, Google, and several EU news sites redirect to a `consent.*` host +/// (or `/consent/` path) when the client is detected as being in the EU/EEA. +/// The resulting page has near-zero usable content and a locale-specific +/// title, so URL-shape detection is the most reliable signal. +const CONSENT_URL_FRAGMENTS: &[&str] = &[ + "://consent.", + "/consent?", + "/consent/", + "collectconsent", + "consentcheck", + "/cmp/", + "guce.advertising.com", +]; + +/// English consent-wall title prefixes. Many providers localise the page, +/// so this is a best-effort secondary signal — primary detection is the URL. +const CONSENT_TITLES: &[&str] = &[ + "before you continue", + "your privacy choices", + "we value your privacy", + "we care about your privacy", + "cookie consent", + "consent required", +]; + +/// Detect why a page returned empty (or near-empty) content. enum EmptyReason { /// Anti-bot challenge page (Cloudflare, Akamai, etc.) Antibot, + /// GDPR/cookie consent redirect — content is gated behind a consent form + ConsentWall, /// JS-only SPA that returns an empty shell without a browser JsRequired, /// Page has content — not empty @@ -46,6 +74,13 @@ enum EmptyReason { } fn detect_empty(result: &ExtractionResult) -> EmptyReason { + // Consent walls come back with a non-empty (but tiny) markdown body and a + // post-redirect URL pointing at a consent host. Check before the + // word-count short-circuit so the Yahoo / Google / EU case is caught. + if is_consent_wall(result) { + return EmptyReason::ConsentWall; + } + // Has real content — nothing to warn about if result.metadata.word_count > 50 || !result.content.markdown.is_empty() { return EmptyReason::None; @@ -67,6 +102,27 @@ fn detect_empty(result: &ExtractionResult) -> EmptyReason { EmptyReason::None } +/// A consent wall is identified by either: +/// 1. 
The final (post-redirect) URL pointing at a known consent host/path, OR +/// 2. A consent-wall title prefix, AND the body being very short (<= 50 words) +fn is_consent_wall(result: &ExtractionResult) -> bool { + if let Some(ref url) = result.metadata.url { + let lower = url.to_ascii_lowercase(); + if CONSENT_URL_FRAGMENTS.iter().any(|f| lower.contains(f)) { + return true; + } + } + if result.metadata.word_count <= 50 + && let Some(ref title) = result.metadata.title + { + let lower = title.to_lowercase(); + if CONSENT_TITLES.iter().any(|t| lower.starts_with(t)) { + return true; + } + } + false +} + fn warn_empty(url: &str, reason: &EmptyReason) { match reason { EmptyReason::Antibot => eprintln!( @@ -74,6 +130,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) { This site requires CAPTCHA solving or browser rendering.\n\ Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing" ), + EmptyReason::ConsentWall => eprintln!( + "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\ + The site redirected to a consent page and returned no usable content.\n\ + Try a non-EU proxy via --proxy, or pass a pre-accepted consent cookie\n\ + via --cookie / --cookie-file." + ), EmptyReason::JsRequired => eprintln!( "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\ This site requires JavaScript rendering (SPA).\n\ @@ -387,10 +449,15 @@ impl From for BrowserProfile { } fn init_logging(verbose: bool) { + // html5ever / markup5ever / selectors emit WARN on every page with real-world + // HTML quirks (foster-parenting, malformed tables). They are not actionable + // and pollute stderr with dozens of lines per fetch. Silence by default; users + // who need them can override via WEBCLAW_LOG. 
+ let default = "warn,html5ever=error,markup5ever=error,selectors=error"; let filter = if verbose { - EnvFilter::new("webclaw=debug") + EnvFilter::new("webclaw=debug,html5ever=error,markup5ever=error,selectors=error") } else { - EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new("warn")) + EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default)) }; tracing_subscriber::fmt().with_env_filter(filter).init(); diff --git a/crates/webclaw-core/src/llm/body.rs b/crates/webclaw-core/src/llm/body.rs index db2a011..0caafd1 100644 --- a/crates/webclaw-core/src/llm/body.rs +++ b/crates/webclaw-core/src/llm/body.rs @@ -73,6 +73,18 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody { // d. Extract links, replace inline `[text](url)` with just `text` let (text, extracted_links) = links::extract_and_strip_links(&text); + // d1b. Strip bare-integer paragraphs (news-index comment counts, page + // numbers). Must run AFTER link extraction so [0](#comment) collapses + // to "0" first, and BEFORE dedup so identical 0 lines don't muddle + // fingerprint dedup. + let text = cleanup::strip_bare_number_lines(&text); + + // d1c. Second UI-control pass: after link extraction, lines that were + // previously `[0](url) Next` now read as `0 Next` -- a pure control + // line that the pre-link c3 pass couldn't see through the link + // syntax. + let text = cleanup::strip_ui_control_text(&text); + // d2. 
Collapse repeated adjacent phrases on the same line // (responsive variants: "Read more Read more Read more" -> "Read more") let text = dedup_repeated_phrases(&text); diff --git a/crates/webclaw-core/src/llm/cleanup.rs b/crates/webclaw-core/src/llm/cleanup.rs index dc447a5..e805ab3 100644 --- a/crates/webclaw-core/src/llm/cleanup.rs +++ b/crates/webclaw-core/src/llm/cleanup.rs @@ -395,6 +395,9 @@ pub(crate) fn is_ui_control_line(line: &str) -> bool { /// Known UI control tokens from Material Icons ligatures, icon fonts, and /// common navigation elements that leak into text extraction. +/// +/// Match is case-insensitive: `Next`, `next`, and `NEXT` are all treated as +/// pagination chrome when alone on a line. fn is_ui_control_token(token: &str) -> bool { const UI_CONTROLS: &[&str] = &[ // Material Icons ligatures @@ -428,6 +431,12 @@ fn is_ui_control_token(token: &str) -> bool { "search", "menu", "share", + // Pagination chrome left over from rendered "Next | Previous" links + "next", + "previous", + "prev", + "older", + "newer", // Arrow/nav characters "\u{2190}", "\u{2192}", @@ -444,7 +453,64 @@ fn is_ui_control_token(token: &str) -> bool { "\u{00BB}", "\u{00AB}", ]; - UI_CONTROLS.contains(&token) + let lowered = token.to_ascii_lowercase(); + if UI_CONTROLS.contains(&lowered.as_str()) { + return true; + } + // Bare short integers (≤4 digits) on a line of otherwise pure control + // tokens are comment counts / page numbers glued to pagination chrome + // (e.g. "0 Next" at the bottom of news index pages). Counting these as + // controls lets `is_ui_control_line` strip the whole line. A legitimate + // line like "5 minutes" stays because "minutes" is not a control. + !token.is_empty() && token.len() <= 4 && token.chars().all(|c| c.is_ascii_digit()) +} + +/// Remove lines that are a bare short integer alone in their paragraph. 
+/// +/// News index pages often render comment counts (`0`, `42`) and pagination +/// page numbers (`1`, `2`) as standalone paragraphs after each article. These +/// add zero signal and confuse downstream readers, but they are real numbers +/// not control tokens, so [`strip_ui_control_text`] doesn't catch them. +/// +/// To stay safe, we only drop a line if BOTH conditions hold: +/// 1. The trimmed line parses cleanly as a non-negative integer ≤ 9999. +/// 2. The line is alone in its paragraph (surrounded by blank lines, or at +/// document edges). A `1.` or `- 1` list marker has different trim text and +/// is not affected. +pub(crate) fn strip_bare_number_lines(input: &str) -> String { + let lines: Vec<&str> = input.lines().collect(); + let mut out: Vec<&str> = Vec::with_capacity(lines.len()); + let mut in_code = false; + for (i, line) in lines.iter().enumerate() { + let trimmed = line.trim(); + if trimmed.starts_with("```") { + in_code = !in_code; + out.push(line); + continue; + } + if in_code { + out.push(line); + continue; + } + if is_bare_short_integer(trimmed) && is_isolated_in_paragraph(&lines, i) { + continue; + } + out.push(line); + } + out.join("\n") +} + +fn is_bare_short_integer(s: &str) -> bool { + if s.is_empty() || s.len() > 4 { + return false; + } + s.chars().all(|c| c.is_ascii_digit()) +} + +fn is_isolated_in_paragraph(lines: &[&str], i: usize) -> bool { + let prev_blank = i == 0 || lines[i - 1].trim().is_empty(); + let next_blank = i + 1 == lines.len() || lines[i + 1].trim().is_empty(); + prev_blank && next_blank } // --------------------------------------------------------------------------- diff --git a/crates/webclaw-core/src/llm/links.rs b/crates/webclaw-core/src/llm/links.rs index 3d25179..acde6fe 100644 --- a/crates/webclaw-core/src/llm/links.rs +++ b/crates/webclaw-core/src/llm/links.rs @@ -69,6 +69,21 @@ fn is_noise_link(text: &str, href: &str) -> bool { return true; } + // Bare-integer labels (comment counts, vote counts, page 
numbers). Always + // noise; the number alone tells the LLM nothing. Capped at 4 digits so a + // legitimately useful year-style label (`1999`) isn't dropped — but in + // practice bare 4-digit labels in a links section are page anchors, not + // articles, so this stays safe. + if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) { + return true; + } + + // In-page comment/discussion fragments that survived the bare-fragment check + // because the href is a full URL with `#comment-stream` (or similar) tail. + if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") { + return true; + } + // Internal user profile / action URLs (HN-style) if href.contains("/user?id=") || href.contains("/hide?id=") diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs index bc65be6..57b964e 100644 --- a/crates/webclaw-core/src/llm/mod.rs +++ b/crates/webclaw-core/src/llm/mod.rs @@ -51,12 +51,17 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String { // hydration blobs (Next.js pageProps full of ad-targeting flags, build // IDs, schedule paths) explode to hundreds of KB and drown the LLM in // noise — drop them rather than ship them. - let useful: Vec<_> = result + let mut useful: Vec<_> = result .structured_data .iter() .filter(|v| is_useful_structured_data(v)) .cloned() .collect(); + // Scrub body-duplicating fields so the article body doesn't ship twice + // (once as rendered markdown, again inside JSON-LD `articleBody`). + for v in &mut useful { + scrub_body_fields(v); + } if !useful.is_empty() { let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default(); const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024; @@ -113,6 +118,46 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool { serialized.len() <= 4 * 1024 } +/// Recursively remove fields whose value is a long body string already +/// rendered as markdown in the main output. 
+///
+/// Schema.org `NewsArticle` / `Article` records often embed the full article
+/// body inside `articleBody`. Emitting that alongside the rendered markdown
+/// ships the same content twice and wastes the LLM's context budget. We
+/// always strip `articleBody`, and additionally strip `body` / `text` /
+/// `description` when their value is ≥ 500 chars (short blurbs are kept
+/// because they carry signal that may not be in the rendered body).
+fn scrub_body_fields(v: &mut serde_json::Value) {
+    const BODY_KEYS: &[&str] = &["articleBody"];
+    const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
+    const LONG_THRESHOLD: usize = 500; // in chars (Unicode scalar values), not UTF-8 bytes
+    match v {
+        serde_json::Value::Object(map) => {
+            map.retain(|k, val| {
+                if BODY_KEYS.contains(&k.as_str()) {
+                    return false;
+                }
+                if LONG_BODY_KEYS.contains(&k.as_str())
+                    && let Some(s) = val.as_str()
+                    && s.chars().count() >= LONG_THRESHOLD
+                {
+                    return false;
+                }
+                true
+            });
+            for child in map.values_mut() {
+                scrub_body_fields(child);
+            }
+        }
+        serde_json::Value::Array(arr) => {
+            for child in arr.iter_mut() {
+                scrub_body_fields(child);
+            }
+        }
+        _ => {}
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Integration tests that exercise the full pipeline through to_llm_text
 // ---------------------------------------------------------------------------