fix: clean llm output noise

Port the valid PR #43 LLM cleanup fixes onto current main without stale branch regressions.

Includes comment-count link cleanup, bare numeric paragraph cleanup, pagination leftover cleanup, JSON-LD article body scrubbing, clearer CLI consent-wall warnings, and quieter parser logs by default.

Thanks to @devnen for the report and patch work.
2026-07-26 07:51:01 +02:00 · 2026-05-18 18:39:33 +02:00 · 2026-05-18 18:39:33 +02:00 · 3fabdc1d02
commit 3fabdc1d02
parent 5eef8358b0
8 changed files with 348 additions and 18 deletions
--- a/crates/webclaw-core/src/llm/body.rs
+++ b/crates/webclaw-core/src/llm/body.rs
@ -73,7 +73,15 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
    // d. Extract links, replace inline `[text](url)` with just `text`
    let (text, extracted_links) = links::extract_and_strip_links(&text);

-    // d2. Collapse repeated adjacent phrases on the same line
+    // d1. Strip bare-integer paragraphs after link extraction, so
+    // `[0](#comments)` collapses to `0` before the paragraph-aware check.
+    let text = cleanup::strip_bare_number_lines(&text);
+
+    // d2. Run UI-control stripping again after link extraction. Lines like
+    // `[0](url) Next` become `0 Next`, which is pure pagination chrome.
+    let text = cleanup::strip_ui_control_text(&text);
+
+    // d3. Collapse repeated adjacent phrases on the same line
    // (responsive variants: "Read more Read more Read more" -> "Read more")
    let text = dedup_repeated_phrases(&text);

--- a/crates/webclaw-core/src/llm/cleanup.rs
+++ b/crates/webclaw-core/src/llm/cleanup.rs
@ -385,16 +385,33 @@ pub(crate) fn is_ui_control_line(line: &str) -> bool {
        return false;
    }

-    // Split by whitespace: every token must be a known UI control
+    // Split by whitespace: every token must be a known UI control, with short
+    // numbers allowed only when paired with real pagination chrome.
    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
    if tokens.is_empty() {
        return false;
    }
-    tokens.iter().all(|t| is_ui_control_token(t))
+
+    let mut has_named_control = false;
+    for token in tokens {
+        if is_bare_short_integer(token) {
+            continue;
+        }
+        if is_ui_control_token(token) {
+            has_named_control = true;
+            continue;
+        }
+        return false;
+    }
+
+    has_named_control
 }

 /// Known UI control tokens from Material Icons ligatures, icon fonts, and
 /// common navigation elements that leak into text extraction.
+///
+/// Match is case-insensitive: `Next`, `next`, and `NEXT` are all treated as
+/// pagination chrome when alone on a line.
 fn is_ui_control_token(token: &str) -> bool {
    const UI_CONTROLS: &[&str] = &[
        // Material Icons ligatures
@ -428,6 +445,12 @@ fn is_ui_control_token(token: &str) -> bool {
        "search",
        "menu",
        "share",
+        // Pagination chrome left over from rendered "Next | Previous" links.
+        "next",
+        "previous",
+        "prev",
+        "older",
+        "newer",
        // Arrow/nav characters
        "\u{2190}",
        "\u{2192}",
@ -444,7 +467,56 @@ fn is_ui_control_token(token: &str) -> bool {
        "\u{00BB}",
        "\u{00AB}",
    ];
-    UI_CONTROLS.contains(&token)
+    let lowered = token.to_ascii_lowercase();
+    UI_CONTROLS.contains(&lowered.as_str())
+}
+
+/// Remove lines that are a bare short integer alone in their paragraph.
+///
+/// News index pages often render comment counts (`0`, `42`) and pagination
+/// page numbers (`1`, `2`) as standalone paragraphs after each article. These
+/// add zero signal and confuse downstream readers, but they are real numbers
+/// not control tokens, so [`strip_ui_control_text`] does not catch them.
+///
+/// To stay safe, we only drop a line if both conditions hold:
+/// 1. The trimmed line is a non-negative integer <= 9999.
+/// 2. The line is alone in its paragraph, surrounded by blank lines or edges.
+pub(crate) fn strip_bare_number_lines(input: &str) -> String {
+    let lines: Vec<&str> = input.lines().collect();
+    let mut out: Vec<&str> = Vec::with_capacity(lines.len());
+    let mut in_code = false;
+
+    for (i, line) in lines.iter().enumerate() {
+        let trimmed = line.trim();
+        if trimmed.starts_with("```") {
+            in_code = !in_code;
+            out.push(line);
+            continue;
+        }
+        if in_code {
+            out.push(line);
+            continue;
+        }
+        if is_bare_short_integer(trimmed) && is_isolated_in_paragraph(&lines, i) {
+            continue;
+        }
+        out.push(line);
+    }
+
+    out.join("\n")
+}
+
+fn is_bare_short_integer(s: &str) -> bool {
+    if s.is_empty() || s.len() > 4 {
+        return false;
+    }
+    s.chars().all(|c| c.is_ascii_digit())
+}
+
+fn is_isolated_in_paragraph(lines: &[&str], i: usize) -> bool {
+    let prev_blank = i == 0 || lines[i - 1].trim().is_empty();
+    let next_blank = i + 1 == lines.len() || lines[i + 1].trim().is_empty();
+    prev_blank && next_blank
 }

 // ---------------------------------------------------------------------------
@ -1158,6 +1230,37 @@ mod tests {
        assert_eq!(strip_ui_control_text(input), "Hello\nWorld");
    }

+    #[test]
+    fn ui_control_strips_pagination_with_comment_count() {
+        assert!(is_ui_control_line("0 Next"));
+        assert!(is_ui_control_line("12 PREVIOUS"));
+        assert_eq!(strip_ui_control_text("Story\n0 Next\nMore"), "Story\nMore");
+    }
+
+    #[test]
+    fn ui_control_keeps_bare_numbers_for_context() {
+        assert!(!is_ui_control_line("2026"));
+        assert_eq!(
+            strip_ui_control_text("Revenue\n2026\nReport"),
+            "Revenue\n2026\nReport"
+        );
+    }
+
+    #[test]
+    fn bare_number_lines_strip_isolated_counts() {
+        let input = "Article title\n\n0\n\nNext article";
+        assert_eq!(
+            strip_bare_number_lines(input),
+            "Article title\n\n\nNext article"
+        );
+    }
+
+    #[test]
+    fn bare_number_lines_keep_lists_and_code() {
+        let input = "- 1\n\n1.\n\n```\n0\n```\n\nReal text";
+        assert_eq!(strip_bare_number_lines(input), input);
+    }
+
    // -- Long alt-text descriptions --

    #[test]
--- a/crates/webclaw-core/src/llm/links.rs
+++ b/crates/webclaw-core/src/llm/links.rs
@ -69,6 +69,18 @@ fn is_noise_link(text: &str, href: &str) -> bool {
        return true;
    }

+    // Bare integer labels are usually comment counts, vote counts, or page
+    // numbers. The label alone carries no useful link context for an LLM.
+    if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
+        return true;
+    }
+
+    // In-page comment/discussion fragments that survived the bare-fragment
+    // check because the href is a full URL with a comment fragment.
+    if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
+        return true;
+    }
+
    // Internal user profile / action URLs (HN-style)
    if href.contains("/user?id=")
        || href.contains("/hide?id=")
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -51,12 +51,15 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
    // hydration blobs (Next.js pageProps full of ad-targeting flags, build
    // IDs, schedule paths) explode to hundreds of KB and drown the LLM in
    // noise — drop them rather than ship them.
-    let useful: Vec<_> = result
+    let mut useful: Vec<_> = result
        .structured_data
        .iter()
        .filter(|v| is_useful_structured_data(v))
        .cloned()
        .collect();
+    for value in &mut useful {
+        scrub_body_fields(value);
+    }
    if !useful.is_empty() {
        let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
        const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
@ -113,6 +116,38 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
    serialized.len() <= 4 * 1024
 }

+/// Recursively remove long fields that duplicate the rendered markdown body.
+fn scrub_body_fields(v: &mut serde_json::Value) {
+    const BODY_KEYS: &[&str] = &["articleBody"];
+    const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
+    const LONG_THRESHOLD: usize = 500;
+
+    match v {
+        serde_json::Value::Object(map) => {
+            map.retain(|key, value| {
+                if BODY_KEYS.contains(&key.as_str()) {
+                    return false;
+                }
+                if LONG_BODY_KEYS.contains(&key.as_str())
+                    && value.as_str().is_some_and(|s| s.len() >= LONG_THRESHOLD)
+                {
+                    return false;
+                }
+                true
+            });
+            for value in map.values_mut() {
+                scrub_body_fields(value);
+            }
+        }
+        serde_json::Value::Array(values) => {
+            for value in values {
+                scrub_body_fields(value);
+            }
+        }
+        _ => {}
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Integration tests that exercise the full pipeline through to_llm_text
 // ---------------------------------------------------------------------------
@ -797,6 +832,39 @@ mod tests {
        assert!(out.contains("Big news"));
    }

+    #[test]
+    fn structured_data_scrubs_duplicate_article_body() {
+        let body = "This is the rendered article body. ".repeat(40);
+        let r = make_result_with_structured(vec![serde_json::json!({
+            "@type": "NewsArticle",
+            "headline": "Big news",
+            "articleBody": body,
+            "description": "A short useful summary"
+        })]);
+        let out = to_llm_text(&r, None);
+        assert!(out.contains("Big news"));
+        assert!(out.contains("A short useful summary"));
+        assert!(
+            !out.contains("articleBody"),
+            "Duplicate article body leaked: {out}"
+        );
+    }
+
+    #[test]
+    fn llm_output_strips_comment_count_links_and_pagination() {
+        let md = "Lead paragraph.\n\n[0](https://example.com/#comment-stream) Next\n\n5 minutes read\n\n[Article](https://example.com/article)";
+        let result = make_result(md);
+        let out = to_llm_text(&result, None);
+        assert!(out.contains("Lead paragraph."));
+        assert!(out.contains("5 minutes read"));
+        assert!(out.contains("- Article: https://example.com/article"));
+        assert!(!out.contains("0 Next"), "Pagination leaked: {out}");
+        assert!(
+            !out.contains("comment-stream"),
+            "Comment link leaked: {out}"
+        );
+    }
+
    #[test]
    fn structured_data_drops_oversized_blob() {
        // 32KB pageProps-style blob with no @type — should be dropped.