fix: clean llm output noise

Port the valid PR #43 LLM cleanup fixes onto current main without stale branch regressions.

Includes comment-count link cleanup, bare numeric paragraph cleanup, pagination leftover cleanup, JSON-LD article body scrubbing, clearer CLI consent-wall warnings, and quieter parser logs by default.

Thanks to @devnen for the report and patch work.
2026-07-23 07:21:02 +02:00 · 2026-05-18 18:39:33 +02:00 · 2026-05-18 18:39:33 +02:00 · 3fabdc1d02
commit 3fabdc1d02
parent 5eef8358b0
8 changed files with 348 additions and 18 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,17 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).

+## [0.6.2] — 2026-05-18
+
+### Fixed
+- Cleaned up `--format llm` output on noisy news and documentation pages. Comment-count links, bare page-number paragraphs, pagination leftovers such as `0 Next`, and duplicated JSON-LD article bodies are now removed before they reach the LLM context.
+- The CLI now recognizes common cookie-consent redirects and prints a clearer warning when a page returns a consent wall instead of usable content.
+- The CLI keeps noisy parser warnings from real-world malformed HTML out of stderr by default. `WEBCLAW_LOG` still lets advanced users opt into deeper parser logs.
+
+Thanks to Nenad Oric (`@devnen`) for the report and patch work in PR #43.
+
+---
+
 ## [0.6.1] — 2026-05-12

 ### Fixed
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3219,7 +3219,7 @@ dependencies = [

 [[package]]
 name = "webclaw-cli"
-version = "0.6.1"
+version = "0.6.2"
 dependencies = [
 "clap",
 "dotenvy",
@ -3240,7 +3240,7 @@ dependencies = [

 [[package]]
 name = "webclaw-core"
-version = "0.6.1"
+version = "0.6.2"
 dependencies = [
 "ego-tree",
 "once_cell",
@ -3258,7 +3258,7 @@ dependencies = [

 [[package]]
 name = "webclaw-fetch"
-version = "0.6.1"
+version = "0.6.2"
 dependencies = [
 "async-trait",
 "bytes",
@ -3284,7 +3284,7 @@ dependencies = [

 [[package]]
 name = "webclaw-llm"
-version = "0.6.1"
+version = "0.6.2"
 dependencies = [
 "async-trait",
 "reqwest",
@ -3297,7 +3297,7 @@ dependencies = [

 [[package]]
 name = "webclaw-mcp"
-version = "0.6.1"
+version = "0.6.2"
 dependencies = [
 "dirs",
 "dotenvy",
@ -3317,7 +3317,7 @@ dependencies = [

 [[package]]
 name = "webclaw-pdf"
-version = "0.6.1"
+version = "0.6.2"
 dependencies = [
 "pdf-extract",
 "thiserror",
@ -3326,7 +3326,7 @@ dependencies = [

 [[package]]
 name = "webclaw-server"
-version = "0.6.1"
+version = "0.6.2"
 dependencies = [
 "anyhow",
 "axum",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]

 [workspace.package]
-version = "0.6.1"
+version = "0.6.2"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -35,18 +35,49 @@ const ANTIBOT_TITLES: &[&str] = &[
    "ddos protection",
 ];

-/// Detect why a page returned empty content.
+/// URL host/path fragments that indicate a GDPR/cookie consent redirect.
+const CONSENT_URL_FRAGMENTS: &[&str] = &[
+    "://consent.",
+    "/consent?",
+    "/consent/",
+    "collectconsent",
+    "consentcheck",
+    "/cmp/",
+    "guce.advertising.com",
+];
+
+/// English consent-wall title prefixes. Many providers localize this page, so
+/// this is a best-effort secondary signal. URL shape is the primary signal.
+const CONSENT_TITLES: &[&str] = &[
+    "before you continue",
+    "your privacy choices",
+    "we value your privacy",
+    "we care about your privacy",
+    "cookie consent",
+    "consent required",
+];
+
+/// Detect why a page returned empty or near-empty content.
+#[derive(Debug, PartialEq, Eq)]
 enum EmptyReason {
    /// Anti-bot challenge page (Cloudflare, Akamai, etc.)
    Antibot,
+    /// GDPR/cookie consent redirect.
+    ConsentWall,
    /// JS-only SPA that returns an empty shell without a browser
    JsRequired,
-    /// Page has content — not empty
+    /// Page has content.
    None,
 }

 fn detect_empty(result: &ExtractionResult) -> EmptyReason {
-    // Has real content — nothing to warn about
+    // Consent walls can have a tiny body, so check before the content
+    // short-circuit.
+    if is_consent_wall(result) {
+        return EmptyReason::ConsentWall;
+    }
+
+    // Has real content. Nothing to warn about.
    if result.metadata.word_count > 50 || !result.content.markdown.is_empty() {
        return EmptyReason::None;
    }
@ -67,6 +98,35 @@ fn detect_empty(result: &ExtractionResult) -> EmptyReason {
    EmptyReason::None
 }

+/// A consent wall is identified by either:
+/// 1. The final URL pointing at a known consent host/path, or
+/// 2. A consent-wall title prefix with a very small body.
+fn is_consent_wall(result: &ExtractionResult) -> bool {
+    if let Some(ref url) = result.metadata.url {
+        let lower = url.to_ascii_lowercase();
+        if CONSENT_URL_FRAGMENTS
+            .iter()
+            .any(|fragment| lower.contains(fragment))
+        {
+            return true;
+        }
+    }
+
+    if result.metadata.word_count <= 50
+        && let Some(ref title) = result.metadata.title
+    {
+        let lower = title.to_lowercase();
+        if CONSENT_TITLES
+            .iter()
+            .any(|prefix| lower.starts_with(prefix))
+        {
+            return true;
+        }
+    }
+
+    false
+}
+
 fn warn_empty(url: &str, reason: &EmptyReason) {
    match reason {
        EmptyReason::Antibot => eprintln!(
@ -74,6 +134,12 @@ fn warn_empty(url: &str, reason: &EmptyReason) {
             This site requires CAPTCHA solving or browser rendering.\n\
             Use the webclaw Cloud API for automatic bypass: https://webclaw.io/pricing"
        ),
+        EmptyReason::ConsentWall => eprintln!(
+            "\x1b[33mwarning:\x1b[0m GDPR/cookie consent wall detected on {url}\n\
+             The site redirected to a consent page and returned no usable content.\n\
+             Try a different region via --proxy, or pass a pre-accepted consent cookie\n\
+             via --cookie / --cookie-file."
+        ),
        EmptyReason::JsRequired => eprintln!(
            "\x1b[33mwarning:\x1b[0m No content extracted from {url}\n\
             This site requires JavaScript rendering (SPA).\n\
@ -387,10 +453,14 @@ impl From<Browser> for BrowserProfile {
 }

 fn init_logging(verbose: bool) {
+    // html5ever / markup5ever / selectors emit WARN on common real-world HTML
+    // quirks. They are rarely actionable for CLI users, so keep them quiet by
+    // default while still allowing WEBCLAW_LOG to override the filter.
+    let default = "warn,html5ever=error,markup5ever=error,selectors=error";
    let filter = if verbose {
-        EnvFilter::new("webclaw=debug")
+        EnvFilter::new("webclaw=debug,html5ever=error,markup5ever=error,selectors=error")
    } else {
-        EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new("warn"))
+        EnvFilter::try_from_env("WEBCLAW_LOG").unwrap_or_else(|_| EnvFilter::new(default))
    };

    tracing_subscriber::fmt().with_env_filter(filter).init();
@ -2547,6 +2617,64 @@ async fn main() {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use webclaw_core::Content;
+
+    fn empty_result(title: Option<&str>, url: Option<&str>, markdown: &str) -> ExtractionResult {
+        ExtractionResult {
+            metadata: Metadata {
+                title: title.map(str::to_string),
+                description: None,
+                author: None,
+                published_date: None,
+                language: None,
+                url: url.map(str::to_string),
+                site_name: None,
+                image: None,
+                favicon: None,
+                word_count: markdown.split_whitespace().count(),
+            },
+            content: Content {
+                markdown: markdown.to_string(),
+                plain_text: markdown.to_string(),
+                links: vec![],
+                images: vec![],
+                code_blocks: vec![],
+                raw_html: None,
+            },
+            domain_data: None,
+            structured_data: vec![],
+        }
+    }
+
+    #[test]
+    fn detect_empty_identifies_consent_redirect_url() {
+        let result = empty_result(
+            Some("Yahoo"),
+            Some("https://guce.advertising.com/collectIdentifiers?sessionId=abc"),
+            "Continue",
+        );
+        assert_eq!(detect_empty(&result), EmptyReason::ConsentWall);
+    }
+
+    #[test]
+    fn detect_empty_identifies_short_consent_title() {
+        let result = empty_result(
+            Some("Before you continue"),
+            Some("https://www.google.com/"),
+            "Review privacy options",
+        );
+        assert_eq!(detect_empty(&result), EmptyReason::ConsentWall);
+    }
+
+    #[test]
+    fn detect_empty_does_not_flag_real_content_with_consent_words() {
+        let result = empty_result(
+            Some("Cookie consent patterns explained"),
+            Some("https://example.com/blog"),
+            "This article explains cookie consent patterns for product teams with enough real body text to be useful. It covers consent banners, privacy controls, analytics configuration, regional requirements, product tradeoffs, implementation details, testing flows, debugging notes, accessibility needs, and operational lessons from real teams shipping public websites across multiple markets. It also explains measurement, rollout planning, copy review, support workflows, design constraints, release notes, and how to keep privacy choices understandable for users.",
+        );
+        assert_eq!(detect_empty(&result), EmptyReason::None);
+    }

    #[test]
    fn url_to_filename_root() {
--- a/crates/webclaw-core/src/llm/body.rs
+++ b/crates/webclaw-core/src/llm/body.rs
@ -73,7 +73,15 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
    // d. Extract links, replace inline `[text](url)` with just `text`
    let (text, extracted_links) = links::extract_and_strip_links(&text);

-    // d2. Collapse repeated adjacent phrases on the same line
+    // d1. Strip bare-integer paragraphs after link extraction, so
+    // `[0](#comments)` collapses to `0` before the paragraph-aware check.
+    let text = cleanup::strip_bare_number_lines(&text);
+
+    // d2. Run UI-control stripping again after link extraction. Lines like
+    // `[0](url) Next` become `0 Next`, which is pure pagination chrome.
+    let text = cleanup::strip_ui_control_text(&text);
+
+    // d3. Collapse repeated adjacent phrases on the same line
    // (responsive variants: "Read more Read more Read more" -> "Read more")
    let text = dedup_repeated_phrases(&text);

--- a/crates/webclaw-core/src/llm/cleanup.rs
+++ b/crates/webclaw-core/src/llm/cleanup.rs
@ -385,16 +385,33 @@ pub(crate) fn is_ui_control_line(line: &str) -> bool {
        return false;
    }

-    // Split by whitespace: every token must be a known UI control
+    // Split by whitespace: every token must be a known UI control or a short
+    // bare integer, and at least one named control token must be present.
    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
    if tokens.is_empty() {
        return false;
    }
-    tokens.iter().all(|t| is_ui_control_token(t))
+
+    let mut has_named_control = false;
+    for token in tokens {
+        if is_bare_short_integer(token) {
+            continue;
+        }
+        if is_ui_control_token(token) {
+            has_named_control = true;
+            continue;
+        }
+        return false;
+    }
+
+    has_named_control
 }

 /// Known UI control tokens from Material Icons ligatures, icon fonts, and
 /// common navigation elements that leak into text extraction.
+///
+/// Match is case-insensitive: `Next`, `next`, and `NEXT` are all treated as
+/// pagination chrome when alone on a line.
 fn is_ui_control_token(token: &str) -> bool {
    const UI_CONTROLS: &[&str] = &[
        // Material Icons ligatures
@ -428,6 +445,12 @@ fn is_ui_control_token(token: &str) -> bool {
        "search",
        "menu",
        "share",
+        // Pagination chrome left over from rendered "Next | Previous" links.
+        "next",
+        "previous",
+        "prev",
+        "older",
+        "newer",
        // Arrow/nav characters
        "\u{2190}",
        "\u{2192}",
@ -444,7 +467,56 @@ fn is_ui_control_token(token: &str) -> bool {
        "\u{00BB}",
        "\u{00AB}",
    ];
-    UI_CONTROLS.contains(&token)
+    let lowered = token.to_ascii_lowercase();
+    UI_CONTROLS.contains(&lowered.as_str())
+}
+
+/// Remove lines that are a bare short integer alone in their paragraph.
+///
+/// News index pages often render comment counts (`0`, `42`) and pagination
+/// page numbers (`1`, `2`) as standalone paragraphs after each article. These
+/// add zero signal and confuse downstream readers, but they are real numbers,
+/// not control tokens, so [`strip_ui_control_text`] does not catch them.
+///
+/// To stay safe, we only drop a line if both conditions hold:
+/// 1. The trimmed line is a non-negative integer <= 9999.
+/// 2. The line is alone in its paragraph, surrounded by blank lines or edges.
+pub(crate) fn strip_bare_number_lines(input: &str) -> String {
+    let lines: Vec<&str> = input.lines().collect();
+    let mut out: Vec<&str> = Vec::with_capacity(lines.len());
+    let mut in_code = false;
+
+    for (i, line) in lines.iter().enumerate() {
+        let trimmed = line.trim();
+        if trimmed.starts_with("```") {
+            in_code = !in_code;
+            out.push(line);
+            continue;
+        }
+        if in_code {
+            out.push(line);
+            continue;
+        }
+        if is_bare_short_integer(trimmed) && is_isolated_in_paragraph(&lines, i) {
+            continue;
+        }
+        out.push(line);
+    }
+
+    out.join("\n")
+}
+
+fn is_bare_short_integer(s: &str) -> bool {
+    if s.is_empty() || s.len() > 4 {
+        return false;
+    }
+    s.chars().all(|c| c.is_ascii_digit())
+}
+
+fn is_isolated_in_paragraph(lines: &[&str], i: usize) -> bool {
+    let prev_blank = i == 0 || lines[i - 1].trim().is_empty();
+    let next_blank = i + 1 == lines.len() || lines[i + 1].trim().is_empty();
+    prev_blank && next_blank
 }

 // ---------------------------------------------------------------------------
@ -1158,6 +1230,37 @@ mod tests {
        assert_eq!(strip_ui_control_text(input), "Hello\nWorld");
    }

+    #[test]
+    fn ui_control_strips_pagination_with_comment_count() {
+        assert!(is_ui_control_line("0 Next"));
+        assert!(is_ui_control_line("12 PREVIOUS"));
+        assert_eq!(strip_ui_control_text("Story\n0 Next\nMore"), "Story\nMore");
+    }
+
+    #[test]
+    fn ui_control_keeps_bare_numbers_for_context() {
+        assert!(!is_ui_control_line("2026"));
+        assert_eq!(
+            strip_ui_control_text("Revenue\n2026\nReport"),
+            "Revenue\n2026\nReport"
+        );
+    }
+
+    #[test]
+    fn bare_number_lines_strip_isolated_counts() {
+        let input = "Article title\n\n0\n\nNext article";
+        assert_eq!(
+            strip_bare_number_lines(input),
+            "Article title\n\n\nNext article"
+        );
+    }
+
+    #[test]
+    fn bare_number_lines_keep_lists_and_code() {
+        let input = "- 1\n\n1.\n\n```\n0\n```\n\nReal text";
+        assert_eq!(strip_bare_number_lines(input), input);
+    }
+
    // -- Long alt-text descriptions --

    #[test]
--- a/crates/webclaw-core/src/llm/links.rs
+++ b/crates/webclaw-core/src/llm/links.rs
@ -69,6 +69,18 @@ fn is_noise_link(text: &str, href: &str) -> bool {
        return true;
    }

+    // Bare integer labels are usually comment counts, vote counts, or page
+    // numbers. The label alone carries no useful link context for an LLM.
+    if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
+        return true;
+    }
+
+    // In-page comment/discussion fragments that survived the bare-fragment
+    // check because the href is a full URL with a comment fragment.
+    if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
+        return true;
+    }
+
    // Internal user profile / action URLs (HN-style)
    if href.contains("/user?id=")
        || href.contains("/hide?id=")
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -51,12 +51,15 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
    // hydration blobs (Next.js pageProps full of ad-targeting flags, build
    // IDs, schedule paths) explode to hundreds of KB and drown the LLM in
    // noise — drop them rather than ship them.
-    let useful: Vec<_> = result
+    let mut useful: Vec<_> = result
        .structured_data
        .iter()
        .filter(|v| is_useful_structured_data(v))
        .cloned()
        .collect();
+    for value in &mut useful {
+        scrub_body_fields(value);
+    }
    if !useful.is_empty() {
        let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
        const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
@ -113,6 +116,38 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
    serialized.len() <= 4 * 1024
 }

+/// Recursively drop `articleBody` and long body-like strings that duplicate the markdown body.
+fn scrub_body_fields(v: &mut serde_json::Value) {
+    const BODY_KEYS: &[&str] = &["articleBody"];
+    const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
+    const LONG_THRESHOLD: usize = 500;
+
+    match v {
+        serde_json::Value::Object(map) => {
+            map.retain(|key, value| {
+                if BODY_KEYS.contains(&key.as_str()) {
+                    return false;
+                }
+                if LONG_BODY_KEYS.contains(&key.as_str())
+                    && value.as_str().is_some_and(|s| s.len() >= LONG_THRESHOLD)
+                {
+                    return false;
+                }
+                true
+            });
+            for value in map.values_mut() {
+                scrub_body_fields(value);
+            }
+        }
+        serde_json::Value::Array(values) => {
+            for value in values {
+                scrub_body_fields(value);
+            }
+        }
+        _ => {}
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Integration tests that exercise the full pipeline through to_llm_text
 // ---------------------------------------------------------------------------
@ -797,6 +832,39 @@ mod tests {
        assert!(out.contains("Big news"));
    }

+    #[test]
+    fn structured_data_scrubs_duplicate_article_body() {
+        let body = "This is the rendered article body. ".repeat(40);
+        let r = make_result_with_structured(vec![serde_json::json!({
+            "@type": "NewsArticle",
+            "headline": "Big news",
+            "articleBody": body,
+            "description": "A short useful summary"
+        })]);
+        let out = to_llm_text(&r, None);
+        assert!(out.contains("Big news"));
+        assert!(out.contains("A short useful summary"));
+        assert!(
+            !out.contains("articleBody"),
+            "Duplicate article body leaked: {out}"
+        );
+    }
+
+    #[test]
+    fn llm_output_strips_comment_count_links_and_pagination() {
+        let md = "Lead paragraph.\n\n[0](https://example.com/#comment-stream) Next\n\n5 minutes read\n\n[Article](https://example.com/article)";
+        let result = make_result(md);
+        let out = to_llm_text(&result, None);
+        assert!(out.contains("Lead paragraph."));
+        assert!(out.contains("5 minutes read"));
+        assert!(out.contains("- Article: https://example.com/article"));
+        assert!(!out.contains("0 Next"), "Pagination leaked: {out}");
+        assert!(
+            !out.contains("comment-stream"),
+            "Comment link leaked: {out}"
+        );
+    }
+
    #[test]
    fn structured_data_drops_oversized_blob() {
        // 32KB pageProps-style blob with no @type — should be dropped.