From 920d71f5615c421875892eb9332b8d472dc1cad9 Mon Sep 17 00:00:00 2001
From: devnen <nenadoric@gmail.com>
Date: Sat, 16 May 2026 18:55:28 +0200
Subject: [PATCH] Strip more llm output noise: consent walls, bare integers,
 JSON-LD body duplication

- detect_empty: add ConsentWall variant for GDPR/cookie redirects (Yahoo,
  Google, EU news sites). Detect via final-URL substring match (consent.*,
  /consent/, collectConsent), or an English consent title on a short page,
  and warn with proxy/cookie remediation hint.
- init_logging: silence html5ever / markup5ever / selectors WARNs by
  default (foster-parenting messages from malformed real-world HTML
  pollute stderr with dozens of lines per fetch; override via WEBCLAW_LOG).
- cleanup: add strip_bare_number_lines for paragraphs that are just a
  short integer (news-index comment counts, page numbers); make
  is_ui_control_token case-insensitive and extend UI_CONTROLS with
  pagination chrome (next, prev, previous, older, newer) plus bare
  <=4-digit integers so '0 Next'-style glued lines are caught.
- links: drop bare-integer link labels and #comment-stream / #comments /
  #disqus hrefs from the deduplicated Links section.
- mod: scrub articleBody / body / text / description fields from JSON-LD
  structured-data emission, since they typically duplicate the rendered
  markdown body (always for articleBody; only when >=500 bytes for the others).

All 292 core tests pass.
---
 crates/webclaw-cli/src/main.rs         | 73 ++++++++++++++++++++++++--
 crates/webclaw-core/src/llm/body.rs    | 12 +++++
 crates/webclaw-core/src/llm/cleanup.rs | 68 +++++++++++++++++++++++-
 crates/webclaw-core/src/llm/links.rs   | 15 ++++++
 crates/webclaw-core/src/llm/mod.rs     | 47 ++++++++++++++++-
 5 files changed, 210 insertions(+), 5 deletions(-)

diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index a45bce8..3b32526 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -35,10 +35,38 @@ const ANTIBOT_TITLES: &[&str] = &[
     "ddos protection",
 ];
 
-/// Detect why a page returned empty content.
+/// URL host/path fragments that indicate a GDPR/cookie consent redirect.
+/// Yahoo, Google, and several EU news sites redirect to a `consent.*` host
+/// (or `/consent/` path) for EU/EEA clients. The resulting page has near-zero
+/// usable content and a locale-specific title, so URL shape is the most
+/// reliable signal. Entries must be lowercase: the URL is lowercased first.
+const CONSENT_URL_FRAGMENTS: &[&str] = &[
+    "://consent.",
+    "/consent?",
+    "/consent/",
+    "collectconsent",
+    "consentcheck",
+    "/cmp/",
+    "guce.advertising.com",
+];
+
+/// Lowercase English consent-wall title prefixes. Many providers localise the
+/// page, so this is a best-effort secondary signal — primary detection is the URL.
+const CONSENT_TITLES: &[&str] = &[
+    "before you continue",
+    "your privacy choices",
+    "we value your privacy",
+    "we care about your privacy",
+    "cookie consent",
+    "consent required",
+];
+
+/// Detect why a page returned empty (or near-empty) content.
 enum EmptyReason {
     /// Anti-bot challenge page (Cloudflare, Akamai, etc.)
     Antibot,
+    /// GDPR/cookie consent redirect — content is gated behind a consent form
+    ConsentWall,
     /// JS-only SPA that returns an empty shell without a browser
     JsRequired,
     /// Page has content — not empty
@@ -46,6 +74,13 @@ enum EmptyReason {
 }
 
 fn detect_empty(result: &ExtractionResult) -> EmptyReason {
+    // Consent walls come back with a non-empty (but tiny) markdown body and a
+    // post-redirect URL pointing at a consent host. Check before the
+    // word-count short-circuit so the Yahoo / Google / EU case is caught.
+    if is_consent_wall(result) {
+        return EmptyReason::ConsentWall;
+    }
+
     // Has real content — nothing to warn about
     if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
         return EmptyReason::None;
@@ -67,6 +102,27 @@ fn detect_empty(result: &ExtractionResult) -> EmptyReason {
     EmptyReason::None
 }
 
+/// A consent wall is identified by either:
+/// 1. The final (post-redirect) URL containing a known consent fragment, OR
+/// 2. A consent-wall title prefix, AND the body being very short (<= 50 words)
+fn is_consent_wall(result: &ExtractionResult) -> bool {
+    if let Some(ref url) = result.metadata.url {
+        let lower = url.to_ascii_lowercase();
+        if CONSENT_URL_FRAGMENTS.iter().any(|f| lower.contains(f)) {
+            return true;
+        }
+    }
+    if result.metadata.word_count <= 50
+        && let Some(ref title) = result.metadata.title
+    {
+        let lower = title.to_lowercase();
+        if CONSENT_TITLES.iter().any(|t| lower.starts_with(t)) {
+            return true;
+        }
+    }
+    false
+}
+
 fn warn_empty(url: &str, reason: &EmptyReason) {
     match reason {
         EmptyReason::Antibot => eprintln!(
@@ -74,6 +130,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
              This site requires CAPTCHA solving or browser rendering.\n\
              Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
         ),
+        EmptyReason::ConsentWall => eprintln!(
+            "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\
+             The site redirected to a consent page and returned no usable content.\n\
+             Try a non-EU proxy via --proxy, or pass a pre-accepted consent cookie\n\
+             via --cookie / --cookie-file."
+        ),
         EmptyReason::JsRequired => eprintln!(
             "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
              This site requires JavaScript rendering (SPA).\n\
@@ -387,10 +449,15 @@ impl From<Browser> for BrowserProfile {
 }
 
 fn init_logging(verbose: bool) {
+    // html5ever / markup5ever / selectors emit WARN on every page with real-world
+    // HTML quirks (foster-parenting, malformed tables). They are not actionable
+    // and pollute stderr with dozens of lines per fetch. Silence by default; users
+    // who need them can set WEBCLAW_LOG (only consulted when not in verbose mode).
+    let default = "warn,html5ever=error,markup5ever=error,selectors=error";
     let filter = if verbose {
-        EnvFilter::new("webclaw=debug")
+        EnvFilter::new("webclaw=debug,html5ever=error,markup5ever=error,selectors=error")
     } else {
-        EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
+        EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
     };
 
     tracing_subscriber::fmt().with_env_filter(filter).init();
diff --git a/crates/webclaw-core/src/llm/body.rs b/crates/webclaw-core/src/llm/body.rs
index db2a011..0caafd1 100644
--- a/crates/webclaw-core/src/llm/body.rs
+++ b/crates/webclaw-core/src/llm/body.rs
@@ -73,6 +73,18 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
     // d. Extract links, replace inline `[text](url)` with just `text`
     let (text, extracted_links) = links::extract_and_strip_links(&text);
 
+    // d1b. Strip bare-integer paragraphs (news-index comment counts, page
+    //      numbers). Must run AFTER link extraction so [0](#comment) collapses
+    //      to "0" first, and BEFORE dedup so identical 0 lines don't muddle
+    //      fingerprint dedup.
+    let text = cleanup::strip_bare_number_lines(&text);
+
+    // d1c. Second UI-control pass: after link extraction, lines that were
+    //      previously `[0](url) Next` now read as `0 Next` -- a pure control
+    //      line that the pre-link c3 pass couldn't see through the link
+    //      syntax.
+    let text = cleanup::strip_ui_control_text(&text);
+
     // d2. Collapse repeated adjacent phrases on the same line
     // (responsive variants: "Read more Read more Read more" -> "Read more")
     let text = dedup_repeated_phrases(&text);
diff --git a/crates/webclaw-core/src/llm/cleanup.rs b/crates/webclaw-core/src/llm/cleanup.rs
index dc447a5..e805ab3 100644
--- a/crates/webclaw-core/src/llm/cleanup.rs
+++ b/crates/webclaw-core/src/llm/cleanup.rs
@@ -395,6 +395,9 @@ pub(crate) fn is_ui_control_line(line: &str) -> bool {
 
 /// Known UI control tokens from Material Icons ligatures, icon fonts, and
 /// common navigation elements that leak into text extraction.
+///
+/// Match is case-insensitive: `Next`, `next`, and `NEXT` are all treated as
+/// pagination chrome when alone on a line.
 fn is_ui_control_token(token: &str) -> bool {
     const UI_CONTROLS: &[&str] = &[
         // Material Icons ligatures
@@ -428,6 +431,12 @@ fn is_ui_control_token(token: &str) -> bool {
         "search",
         "menu",
         "share",
+        // Pagination chrome left over from rendered "Next | Previous" links
+        "next",
+        "previous",
+        "prev",
+        "older",
+        "newer",
         // Arrow/nav characters
         "\u{2190}",
         "\u{2192}",
@@ -444,7 +453,64 @@ fn is_ui_control_token(token: &str) -> bool {
         "\u{00BB}",
         "\u{00AB}",
     ];
-    UI_CONTROLS.contains(&token)
+    let lowered = token.to_ascii_lowercase();
+    if UI_CONTROLS.contains(&lowered.as_str()) {
+        return true;
+    }
+    // Bare short integers (≤4 digits) on a line of otherwise pure control
+    // tokens are comment counts / page numbers glued to pagination chrome
+    // (e.g. "0 Next" at the bottom of news index pages). Counting these as
+    // controls lets `is_ui_control_line` strip the whole line. A legitimate
+    // line like "5 minutes" stays because "minutes" is not a control.
+    !token.is_empty() && token.len() <= 4 && token.chars().all(|c| c.is_ascii_digit())
+}
+
+/// Remove lines that are a bare short integer alone in their paragraph.
+///
+/// News index pages often render comment counts (`0`, `42`) and pagination
+/// page numbers (`1`, `2`) as standalone paragraphs after each article. These
+/// add zero signal and confuse downstream readers. NOTE(review): bare short
+/// integers now pass `is_ui_control_token` too; confirm [`strip_ui_control_text`] overlap.
+///
+/// To stay safe, we only drop a line if BOTH conditions hold:
+/// 1. The trimmed line parses cleanly as a non-negative integer ≤ 9999.
+/// 2. The line is alone in its paragraph (surrounded by blank lines, or at
+///    document edges). A `1.` or `- 1` list marker has different trim text and
+///    is not affected.
+pub(crate) fn strip_bare_number_lines(input: &str) -> String {
+    let lines: Vec<&str> = input.lines().collect();
+    let mut out: Vec<&str> = Vec::with_capacity(lines.len());
+    let mut in_code = false;
+    for (i, line) in lines.iter().enumerate() {
+        let trimmed = line.trim();
+        if trimmed.starts_with("```") {
+            in_code = !in_code;
+            out.push(line);
+            continue;
+        }
+        if in_code {
+            out.push(line);
+            continue;
+        }
+        if is_bare_short_integer(trimmed) && is_isolated_in_paragraph(&lines, i) {
+            continue;
+        }
+        out.push(line);
+    }
+    out.join("\n")
+}
+
+fn is_bare_short_integer(s: &str) -> bool {
+    if s.is_empty() || s.len() > 4 {
+        return false;
+    }
+    s.chars().all(|c| c.is_ascii_digit())
+}
+
+fn is_isolated_in_paragraph(lines: &[&str], i: usize) -> bool {
+    let prev_blank = i == 0 || lines[i - 1].trim().is_empty();
+    let next_blank = i + 1 == lines.len() || lines[i + 1].trim().is_empty();
+    prev_blank && next_blank
 }
 
 // ---------------------------------------------------------------------------
diff --git a/crates/webclaw-core/src/llm/links.rs b/crates/webclaw-core/src/llm/links.rs
index 3d25179..acde6fe 100644
--- a/crates/webclaw-core/src/llm/links.rs
+++ b/crates/webclaw-core/src/llm/links.rs
@@ -69,6 +69,21 @@ fn is_noise_link(text: &str, href: &str) -> bool {
         return true;
     }
 
+    // Bare-integer labels (comment counts, vote counts, page numbers). Always
+    // noise; the number alone tells the LLM nothing. Capped at 4 digits, so a
+    // year-style label (`1999`) IS dropped as well — in practice bare 4-digit
+    // labels in a links section are page anchors, not articles. Labels of 5+
+    // digits are not matched by this check.
+    if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
+        return true;
+    }
+
+    // In-page comment/discussion fragments that survived the bare-fragment check
+    // because the href is a full URL with `#comment-stream` (or similar) tail.
+    if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
+        return true;
+    }
+
     // Internal user profile / action URLs (HN-style)
     if href.contains("/user?id=")
         || href.contains("/hide?id=")
diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs
index bc65be6..57b964e 100644
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@@ -51,12 +51,17 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
     // hydration blobs (Next.js pageProps full of ad-targeting flags, build
     // IDs, schedule paths) explode to hundreds of KB and drown the LLM in
     // noise — drop them rather than ship them.
-    let useful: Vec<_> = result
+    let mut useful: Vec<_> = result
         .structured_data
         .iter()
         .filter(|v| is_useful_structured_data(v))
         .cloned()
         .collect();
+    // Scrub body-duplicating fields so the article body doesn't ship twice
+    // (once as rendered markdown, again inside JSON-LD `articleBody`).
+    for v in &mut useful {
+        scrub_body_fields(v);
+    }
     if !useful.is_empty() {
         let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
         const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
@@ -113,6 +118,46 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
     serialized.len() <= 4 * 1024
 }
 
+/// Recursively remove fields whose value is a long body string already
+/// rendered as markdown in the main output.
+///
+/// Schema.org `NewsArticle` / `Article` records often embed the full article
+/// body inside `articleBody`. Emitting that alongside the rendered markdown
+/// ships the same content twice and wastes the LLM's context budget. We
+/// always strip `articleBody`, and additionally strip `body` / `text` /
+/// `description` when the value is a string of ≥ 500 bytes (short blurbs are
+/// kept because they carry signal that may not be in the rendered body).
+fn scrub_body_fields(v: &mut serde_json::Value) {
+    const BODY_KEYS: &[&str] = &["articleBody"];
+    const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
+    const LONG_THRESHOLD: usize = 500;
+    match v {
+        serde_json::Value::Object(map) => {
+            map.retain(|k, val| {
+                if BODY_KEYS.contains(&k.as_str()) {
+                    return false;
+                }
+                if LONG_BODY_KEYS.contains(&k.as_str())
+                    && let Some(s) = val.as_str()
+                    && s.len() >= LONG_THRESHOLD
+                {
+                    return false;
+                }
+                true
+            });
+            for child in map.values_mut() {
+                scrub_body_fields(child);
+            }
+        }
+        serde_json::Value::Array(arr) => {
+            for child in arr.iter_mut() {
+                scrub_body_fields(child);
+            }
+        }
+        _ => {}
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Integration tests that exercise the full pipeline through to_llm_text
 // ---------------------------------------------------------------------------