fix: clean llm output noise

Port the valid PR #43 LLM cleanup fixes onto current main without stale branch regressions.

Includes comment-count link cleanup, bare numeric paragraph cleanup, pagination leftover cleanup, JSON-LD article body scrubbing, clearer CLI consent-wall warnings, and quieter parser logs by default.

Thanks to @devnen for the report and patch work.
2026-06-08 22:25:12 +02:00 · 2026-05-18 18:39:33 +02:00 · 2026-05-18 18:39:33 +02:00 · 3fabdc1d02
commit 3fabdc1d02
parent 5eef8358b0
8 changed files with 348 additions and 18 deletions
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -51,12 +51,15 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
    // hydration blobs (Next.js pageProps full of ad-targeting flags, build
    // IDs, schedule paths) explode to hundreds of KB and drown the LLM in
    // noise — drop them rather than ship them.
-    let useful: Vec<_> = result
+    let mut useful: Vec<_> = result
        .structured_data
        .iter()
        .filter(|v| is_useful_structured_data(v))
        .cloned()
        .collect();
+    for value in &mut useful {
+        scrub_body_fields(value);
+    }
    if !useful.is_empty() {
        let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
        const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
@ -113,6 +116,38 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
    serialized.len() <= 4 * 1024
 }

+/// Recursively drop every `articleBody` entry, plus `body`/`text`/`description` strings of 500+ bytes, as duplicates of the rendered markdown body.
+fn scrub_body_fields(v: &mut serde_json::Value) {
+    const BODY_KEYS: &[&str] = &["articleBody"]; // removed unconditionally, whatever the value's type or length
+    const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"]; // removed only when the value is a long string
+    const LONG_THRESHOLD: usize = 500; // compared against `str::len`, i.e. UTF-8 bytes rather than characters
+
+    match v {
+        serde_json::Value::Object(map) => {
+            map.retain(|key, value| { // returning `false` removes the entry from the object
+                if BODY_KEYS.contains(&key.as_str()) {
+                    return false;
+                }
+                if LONG_BODY_KEYS.contains(&key.as_str())
+                    && value.as_str().is_some_and(|s| s.len() >= LONG_THRESHOLD)
+                {
+                    return false;
+                }
+                true
+            });
+            for value in map.values_mut() { // descend only into the entries that survived the retain pass
+                scrub_body_fields(value);
+            }
+        }
+        serde_json::Value::Array(values) => { // arrays are never removed themselves; only their elements are scrubbed
+            for value in values {
+                scrub_body_fields(value);
+            }
+        }
+        _ => {} // scalars (null, bool, number, string) have no nested keys to scrub
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Integration tests that exercise the full pipeline through to_llm_text
 // ---------------------------------------------------------------------------
@ -797,6 +832,39 @@ mod tests {
        assert!(out.contains("Big news"));
    }

+    #[test]
+    fn structured_data_scrubs_duplicate_article_body() { // `articleBody` must vanish from the output while short fields stay
+        let body = "This is the rendered article body. ".repeat(40); // 35 bytes x 40 = 1400-byte stand-in for the article text
+        let r = make_result_with_structured(vec![serde_json::json!({
+            "@type": "NewsArticle",
+            "headline": "Big news",
+            "articleBody": body,
+            "description": "A short useful summary"
+        })]);
+        let out = to_llm_text(&r, None);
+        assert!(out.contains("Big news"));
+        assert!(out.contains("A short useful summary")); // a short `description` is expected to survive scrubbing
+        assert!(
+            !out.contains("articleBody"),
+            "Duplicate article body leaked: {out}"
+        );
+    }
+
+    #[test]
+    fn llm_output_strips_comment_count_links_and_pagination() { // end-to-end check through `to_llm_text` on a markdown fixture
+        let md = "Lead paragraph.\n\n[0](https://example.com/#comment-stream) Next\n\n5 minutes read\n\n[Article](https://example.com/article)";
+        let result = make_result(md);
+        let out = to_llm_text(&result, None);
+        assert!(out.contains("Lead paragraph."));
+        assert!(out.contains("5 minutes read")); // the reading-time line is expected to be kept
+        assert!(out.contains("- Article: https://example.com/article")); // the regular link is expected in `- label: url` form
+        assert!(!out.contains("0 Next"), "Pagination leaked: {out}"); // the comment-count link must not collapse into "0 Next"
+        assert!(
+            !out.contains("comment-stream"),
+            "Comment link leaked: {out}"
+        );
+    }
+
    #[test]
    fn structured_data_drops_oversized_blob() {
        // 32KB pageProps-style blob with no @type — should be dropped.