fix: clean llm output noise

Port the valid PR #43 LLM cleanup fixes onto current main without stale branch regressions.

Includes comment-count link cleanup, bare numeric paragraph cleanup, pagination leftover cleanup, JSON-LD article body scrubbing, clearer CLI consent-wall warnings, and quieter parser logs by default.

Thanks to @devnen for the report and patch work.
This commit is contained in:
Valerio 2026-05-18 18:39:33 +02:00 committed by GitHub
parent 5eef8358b0
commit 3fabdc1d02
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 348 additions and 18 deletions

View file

@ -73,7 +73,15 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
// d. Extract links, replace inline `[text](url)` with just `text`
let (text, extracted_links) = links::extract_and_strip_links(&text);
// d2. Collapse repeated adjacent phrases on the same line
// d1. Strip bare-integer paragraphs after link extraction, so
// `[0](#comments)` collapses to `0` before the paragraph-aware check.
let text = cleanup::strip_bare_number_lines(&text);
// d2. Run UI-control stripping again after link extraction. Lines like
// `[0](url) Next` become `0 Next`, which is pure pagination chrome.
let text = cleanup::strip_ui_control_text(&text);
// d3. Collapse repeated adjacent phrases on the same line
// (responsive variants: "Read more Read more Read more" -> "Read more")
let text = dedup_repeated_phrases(&text);

View file

@ -385,16 +385,33 @@ pub(crate) fn is_ui_control_line(line: &str) -> bool {
return false;
}
// Split by whitespace: every token must be a known UI control
// Split by whitespace: every token must be a known UI control, with short
// numbers allowed only when paired with real pagination chrome.
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
if tokens.is_empty() {
return false;
}
tokens.iter().all(|t| is_ui_control_token(t))
let mut has_named_control = false;
for token in tokens {
if is_bare_short_integer(token) {
continue;
}
if is_ui_control_token(token) {
has_named_control = true;
continue;
}
return false;
}
has_named_control
}
/// Known UI control tokens from Material Icons ligatures, icon fonts, and
/// common navigation elements that leak into text extraction.
///
/// Match is case-insensitive: `Next`, `next`, and `NEXT` are all treated as
/// pagination chrome when alone on a line.
fn is_ui_control_token(token: &str) -> bool {
const UI_CONTROLS: &[&str] = &[
// Material Icons ligatures
@ -428,6 +445,12 @@ fn is_ui_control_token(token: &str) -> bool {
"search",
"menu",
"share",
// Pagination chrome left over from rendered "Next | Previous" links.
"next",
"previous",
"prev",
"older",
"newer",
// Arrow/nav characters
"\u{2190}",
"\u{2192}",
@ -444,7 +467,56 @@ fn is_ui_control_token(token: &str) -> bool {
"\u{00BB}",
"\u{00AB}",
];
UI_CONTROLS.contains(&token)
let lowered = token.to_ascii_lowercase();
UI_CONTROLS.contains(&lowered.as_str())
}
/// Drop paragraphs that consist of nothing but a short bare integer.
///
/// Index-style pages tend to leave comment counts (`0`, `42`) and page
/// numbers (`1`, `2`) behind as one-line paragraphs. They are genuine numbers
/// rather than named control tokens, so this pass removes them separately.
///
/// A line is removed only when all of the following hold:
/// 1. It is outside a fenced code block. Any line whose trimmed text starts
///    with three backticks toggles the fence state and is always kept.
/// 2. Its trimmed text passes [`is_bare_short_integer`].
/// 3. [`is_isolated_in_paragraph`] reports that its neighbours are blank
///    lines or the edges of the input.
///
/// The blank lines around a removed number are left in place. The output is
/// rebuilt by joining the surviving lines with `\n`.
pub(crate) fn strip_bare_number_lines(input: &str) -> String {
    let lines: Vec<&str> = input.lines().collect();
    let mut inside_fence = false;

    let kept: Vec<&str> = lines
        .iter()
        .enumerate()
        .filter(|&(idx, raw)| {
            let text = raw.trim();
            if text.starts_with("```") {
                // Fence markers flip the state and always survive.
                inside_fence = !inside_fence;
                return true;
            }
            // Code content is never touched; elsewhere only isolated
            // short integers are dropped.
            inside_fence || !(is_bare_short_integer(text) && is_isolated_in_paragraph(&lines, idx))
        })
        .map(|(_, raw)| *raw)
        .collect();

    kept.join("\n")
}
/// Returns `true` when `s` is made up of one to four ASCII digits and
/// nothing else (no sign, whitespace, separators, or non-ASCII digits).
fn is_bare_short_integer(s: &str) -> bool {
    // Length is measured in bytes; any non-ASCII character contributes bytes
    // that are not ASCII digits, so it is rejected by the digit check.
    matches!(s.len(), 1..=4) && s.bytes().all(|b| b.is_ascii_digit())
}
/// Returns `true` when line `i` stands alone in its paragraph.
///
/// A line is isolated when the line before it and the line after it are each
/// either missing (start or end of `lines`) or blank after trimming.
///
/// An index that does not refer to an existing line (including any index
/// into an empty slice) yields `false` instead of panicking.
fn is_isolated_in_paragraph(lines: &[&str], i: usize) -> bool {
    if i >= lines.len() {
        return false;
    }
    // A missing neighbour counts as an edge, which is as good as a blank line.
    let blank_or_edge =
        |neighbour: Option<&&str>| neighbour.map_or(true, |line| line.trim().is_empty());

    let before = i.checked_sub(1).and_then(|prev| lines.get(prev));
    let after = lines.get(i + 1);
    blank_or_edge(before) && blank_or_edge(after)
}
// ---------------------------------------------------------------------------
@ -1158,6 +1230,37 @@ mod tests {
assert_eq!(strip_ui_control_text(input), "Hello\nWorld");
}
#[test]
fn ui_control_strips_pagination_with_comment_count() {
    // A short count paired with a named pagination control is pure chrome,
    // whatever the letter case of the control.
    let count_then_next = is_ui_control_line("0 Next");
    assert!(count_then_next);
    let count_then_upper_previous = is_ui_control_line("12 PREVIOUS");
    assert!(count_then_upper_previous);

    // The chrome line disappears while the surrounding content stays.
    let cleaned = strip_ui_control_text("Story\n0 Next\nMore");
    assert_eq!(cleaned, "Story\nMore");
}
#[test]
fn ui_control_keeps_bare_numbers_for_context() {
    // A number with no named control beside it is not treated as chrome.
    assert!(!is_ui_control_line("2026"));

    // The whole input must therefore come back unchanged.
    let input = "Revenue\n2026\nReport";
    let cleaned = strip_ui_control_text(input);
    assert_eq!(cleaned, input);
}
#[test]
fn bare_number_lines_strip_isolated_counts() {
    // Only the `0` line goes; both blank lines around it remain.
    let expected = "Article title\n\n\nNext article";
    let actual = strip_bare_number_lines("Article title\n\n0\n\nNext article");
    assert_eq!(actual, expected);
}
#[test]
fn bare_number_lines_keep_lists_and_code() {
    // List items (`- 1`, `1.`) are not bare integers, and the `0` sits inside
    // a fenced code block, so nothing may be removed.
    let untouched = "- 1\n\n1.\n\n```\n0\n```\n\nReal text";
    let output = strip_bare_number_lines(untouched);
    assert_eq!(output, untouched);
}
// -- Long alt-text descriptions --
#[test]

View file

@ -69,6 +69,18 @@ fn is_noise_link(text: &str, href: &str) -> bool {
return true;
}
// Bare integer labels are usually comment counts, vote counts, or page
// numbers. The label alone carries no useful link context for an LLM.
if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
return true;
}
// In-page comment/discussion fragments that survived the bare-fragment
// check because the href is a full URL with a comment fragment.
if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
return true;
}
// Internal user profile / action URLs (HN-style)
if href.contains("/user?id=")
|| href.contains("/hide?id=")

View file

@ -51,12 +51,15 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
// hydration blobs (Next.js pageProps full of ad-targeting flags, build
// IDs, schedule paths) explode to hundreds of KB and drown the LLM in
// noise — drop them rather than ship them.
let useful: Vec<_> = result
let mut useful: Vec<_> = result
.structured_data
.iter()
.filter(|v| is_useful_structured_data(v))
.cloned()
.collect();
for value in &mut useful {
scrub_body_fields(value);
}
if !useful.is_empty() {
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
@ -113,6 +116,38 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
serialized.len() <= 4 * 1024
}
/// Strip body-like fields from a structured-data value, walking the whole tree.
///
/// The intent is to avoid repeating content that the rendered markdown body
/// already carries. Within every JSON object:
/// - `articleBody` is removed unconditionally;
/// - `body`, `text`, and `description` are removed only when their value is a
///   string of at least 500 bytes.
///
/// Surviving object values and all array elements are then scrubbed
/// recursively. Scalars are left untouched.
fn scrub_body_fields(v: &mut serde_json::Value) {
    const BODY_KEYS: &[&str] = &["articleBody"];
    const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
    const LONG_THRESHOLD: usize = 500;

    match v {
        serde_json::Value::Object(fields) => {
            fields.retain(|name, field| {
                let name = name.as_str();
                let always_dropped = BODY_KEYS.contains(&name);
                let dropped_when_long = LONG_BODY_KEYS.contains(&name)
                    && matches!(field.as_str(), Some(s) if s.len() >= LONG_THRESHOLD);
                !(always_dropped || dropped_when_long)
            });
            // Recurse only into what survived the filter above.
            fields.values_mut().for_each(scrub_body_fields);
        }
        serde_json::Value::Array(items) => items.iter_mut().for_each(scrub_body_fields),
        _ => {}
    }
}
// ---------------------------------------------------------------------------
// Integration tests that exercise the full pipeline through to_llm_text
// ---------------------------------------------------------------------------
@ -797,6 +832,39 @@ mod tests {
assert!(out.contains("Big news"));
}
#[test]
fn structured_data_scrubs_duplicate_article_body() {
    // 40 repetitions make the body long enough to stand in for a real article.
    let long_body = "This is the rendered article body. ".repeat(40);
    let article = serde_json::json!({
        "@type": "NewsArticle",
        "headline": "Big news",
        "articleBody": long_body,
        "description": "A short useful summary"
    });
    let result = make_result_with_structured(vec![article]);

    let out = to_llm_text(&result, None);

    // Short, useful metadata must survive the scrub.
    assert!(out.contains("Big news"));
    assert!(out.contains("A short useful summary"));
    // The duplicated body field must not appear at all.
    assert!(
        !out.contains("articleBody"),
        "Duplicate article body leaked: {out}"
    );
}
#[test]
fn llm_output_strips_comment_count_links_and_pagination() {
    // One real paragraph, a comment-count link followed by "Next", a line
    // that merely begins with a number, and a genuine article link.
    let markdown = "Lead paragraph.\n\n[0](https://example.com/#comment-stream) Next\n\n5 minutes read\n\n[Article](https://example.com/article)";
    let extraction = make_result(markdown);

    let out = to_llm_text(&extraction, None);

    // Real content and the real link are preserved.
    assert!(out.contains("Lead paragraph."));
    assert!(out.contains("5 minutes read"));
    assert!(out.contains("- Article: https://example.com/article"));
    // Pagination chrome and the comment-count link are gone.
    assert!(!out.contains("0 Next"), "Pagination leaked: {out}");
    assert!(
        !out.contains("comment-stream"),
        "Comment link leaked: {out}"
    );
}
#[test]
fn structured_data_drops_oversized_blob() {
// 32KB pageProps-style blob with no @type — should be dropped.