mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: clean llm output noise
Port the valid PR #43 LLM cleanup fixes onto current main without stale branch regressions.

Includes comment-count link cleanup, bare numeric paragraph cleanup, pagination leftover cleanup, JSON-LD article body scrubbing, clearer CLI consent-wall warnings, and quieter parser logs by default.

Thanks to @devnen for the report and patch work.
This commit is contained in:
parent
5eef8358b0
commit
3fabdc1d02
8 changed files with 348 additions and 18 deletions
|
|
@ -73,7 +73,15 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
|
|||
// d. Extract links, replace inline `[text](url)` with just `text`
|
||||
let (text, extracted_links) = links::extract_and_strip_links(&text);
|
||||
|
||||
// d2. Collapse repeated adjacent phrases on the same line
|
||||
// d1. Strip bare-integer paragraphs after link extraction, so
|
||||
// `[0](#comments)` collapses to `0` before the paragraph-aware check.
|
||||
let text = cleanup::strip_bare_number_lines(&text);
|
||||
|
||||
// d2. Run UI-control stripping again after link extraction. Lines like
|
||||
// `[0](url) Next` become `0 Next`, which is pure pagination chrome.
|
||||
let text = cleanup::strip_ui_control_text(&text);
|
||||
|
||||
// d3. Collapse repeated adjacent phrases on the same line
|
||||
// (responsive variants: "Read more Read more Read more" -> "Read more")
|
||||
let text = dedup_repeated_phrases(&text);
|
||||
|
||||
|
|
|
|||
|
|
@ -385,16 +385,33 @@ pub(crate) fn is_ui_control_line(line: &str) -> bool {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Split by whitespace: every token must be a known UI control
|
||||
// Split by whitespace: every token must be a known UI control, with short
|
||||
// numbers allowed only when paired with real pagination chrome.
|
||||
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
|
||||
if tokens.is_empty() {
|
||||
return false;
|
||||
}
|
||||
tokens.iter().all(|t| is_ui_control_token(t))
|
||||
|
||||
let mut has_named_control = false;
|
||||
for token in tokens {
|
||||
if is_bare_short_integer(token) {
|
||||
continue;
|
||||
}
|
||||
if is_ui_control_token(token) {
|
||||
has_named_control = true;
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
has_named_control
|
||||
}
|
||||
|
||||
/// Known UI control tokens from Material Icons ligatures, icon fonts, and
|
||||
/// common navigation elements that leak into text extraction.
|
||||
///
|
||||
/// Match is case-insensitive: `Next`, `next`, and `NEXT` are all treated as
|
||||
/// pagination chrome when alone on a line.
|
||||
fn is_ui_control_token(token: &str) -> bool {
|
||||
const UI_CONTROLS: &[&str] = &[
|
||||
// Material Icons ligatures
|
||||
|
|
@ -428,6 +445,12 @@ fn is_ui_control_token(token: &str) -> bool {
|
|||
"search",
|
||||
"menu",
|
||||
"share",
|
||||
// Pagination chrome left over from rendered "Next | Previous" links.
|
||||
"next",
|
||||
"previous",
|
||||
"prev",
|
||||
"older",
|
||||
"newer",
|
||||
// Arrow/nav characters
|
||||
"\u{2190}",
|
||||
"\u{2192}",
|
||||
|
|
@ -444,7 +467,56 @@ fn is_ui_control_token(token: &str) -> bool {
|
|||
"\u{00BB}",
|
||||
"\u{00AB}",
|
||||
];
|
||||
UI_CONTROLS.contains(&token)
|
||||
let lowered = token.to_ascii_lowercase();
|
||||
UI_CONTROLS.contains(&lowered.as_str())
|
||||
}
|
||||
|
||||
/// Remove lines that are a bare short integer alone in their paragraph.
|
||||
///
|
||||
/// News index pages often render comment counts (`0`, `42`) and pagination
|
||||
/// page numbers (`1`, `2`) as standalone paragraphs after each article. These
|
||||
/// add zero signal and confuse downstream readers, but they are real numbers
|
||||
/// not control tokens, so [`strip_ui_control_text`] does not catch them.
|
||||
///
|
||||
/// To stay safe, we only drop a line if both conditions hold:
|
||||
/// 1. The trimmed line is a non-negative integer <= 9999.
|
||||
/// 2. The line is alone in its paragraph, surrounded by blank lines or edges.
|
||||
pub(crate) fn strip_bare_number_lines(input: &str) -> String {
|
||||
let lines: Vec<&str> = input.lines().collect();
|
||||
let mut out: Vec<&str> = Vec::with_capacity(lines.len());
|
||||
let mut in_code = false;
|
||||
|
||||
for (i, line) in lines.iter().enumerate() {
|
||||
let trimmed = line.trim();
|
||||
if trimmed.starts_with("```") {
|
||||
in_code = !in_code;
|
||||
out.push(line);
|
||||
continue;
|
||||
}
|
||||
if in_code {
|
||||
out.push(line);
|
||||
continue;
|
||||
}
|
||||
if is_bare_short_integer(trimmed) && is_isolated_in_paragraph(&lines, i) {
|
||||
continue;
|
||||
}
|
||||
out.push(line);
|
||||
}
|
||||
|
||||
out.join("\n")
|
||||
}
|
||||
|
||||
/// Returns `true` when `s` is one to four ASCII digits and nothing else.
///
/// No trimming is performed: surrounding whitespace, signs, decimal points,
/// and non-ASCII digits all make the check fail.
fn is_bare_short_integer(s: &str) -> bool {
    // Length is measured in bytes; any multi-byte character fails the digit
    // test below anyway because none of its bytes is an ASCII digit.
    matches!(s.len(), 1..=4) && s.bytes().all(|b| b.is_ascii_digit())
}
|
||||
|
||||
/// Returns `true` when line `i` forms a paragraph on its own.
///
/// The line qualifies when the line above is blank (or `i` is the first
/// line) and the line below is blank (or `i` is the last line). A line
/// counts as blank when it is empty after trimming whitespace.
fn is_isolated_in_paragraph(lines: &[&str], i: usize) -> bool {
    let blank_at = |idx: usize| lines[idx].trim().is_empty();

    let above_clear = match i {
        0 => true,
        _ => blank_at(i - 1),
    };
    let below_clear = if i + 1 == lines.len() {
        true
    } else {
        blank_at(i + 1)
    };

    above_clear && below_clear
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -1158,6 +1230,37 @@ mod tests {
|
|||
assert_eq!(strip_ui_control_text(input), "Hello\nWorld");
|
||||
}
|
||||
|
||||
/// A short count next to a pagination word is treated as a UI-control line,
/// regardless of the word's letter case, and is removed from the text.
#[test]
fn ui_control_strips_pagination_with_comment_count() {
    for chrome in ["0 Next", "12 PREVIOUS"] {
        assert!(is_ui_control_line(chrome));
    }

    let cleaned = strip_ui_control_text("Story\n0 Next\nMore");
    assert_eq!(cleaned, "Story\nMore");
}
|
||||
|
||||
/// A number with no named control beside it is not a UI-control line, so
/// UI-control stripping must leave it where it is.
#[test]
fn ui_control_keeps_bare_numbers_for_context() {
    let untouched = "Revenue\n2026\nReport";

    assert!(!is_ui_control_line("2026"));
    assert_eq!(strip_ui_control_text(untouched), untouched);
}
|
||||
|
||||
/// A lone `0` between two blank lines is dropped; the blank lines that
/// surrounded it stay in the output.
#[test]
fn bare_number_lines_strip_isolated_counts() {
    let cleaned = strip_bare_number_lines("Article title\n\n0\n\nNext article");
    let expected = "Article title\n\n\nNext article";

    assert_eq!(cleaned, expected);
}
|
||||
|
||||
/// List items (`- 1`, `1.`) and digits inside a fenced code block are not
/// bare-number paragraphs, so the text must come back unchanged.
#[test]
fn bare_number_lines_keep_lists_and_code() {
    let document = "- 1\n\n1.\n\n```\n0\n```\n\nReal text";
    let cleaned = strip_bare_number_lines(document);

    assert_eq!(cleaned, document);
}
|
||||
|
||||
// -- Long alt-text descriptions --
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -69,6 +69,18 @@ fn is_noise_link(text: &str, href: &str) -> bool {
|
|||
return true;
|
||||
}
|
||||
|
||||
// Bare integer labels are usually comment counts, vote counts, or page
|
||||
// numbers. The label alone carries no useful link context for an LLM.
|
||||
if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// In-page comment/discussion fragments that survived the bare-fragment
|
||||
// check because the href is a full URL with a comment fragment.
|
||||
if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Internal user profile / action URLs (HN-style)
|
||||
if href.contains("/user?id=")
|
||||
|| href.contains("/hide?id=")
|
||||
|
|
|
|||
|
|
@ -51,12 +51,15 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
|
|||
// hydration blobs (Next.js pageProps full of ad-targeting flags, build
|
||||
// IDs, schedule paths) explode to hundreds of KB and drown the LLM in
|
||||
// noise — drop them rather than ship them.
|
||||
let useful: Vec<_> = result
|
||||
let mut useful: Vec<_> = result
|
||||
.structured_data
|
||||
.iter()
|
||||
.filter(|v| is_useful_structured_data(v))
|
||||
.cloned()
|
||||
.collect();
|
||||
for value in &mut useful {
|
||||
scrub_body_fields(value);
|
||||
}
|
||||
if !useful.is_empty() {
|
||||
let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
|
||||
const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
|
||||
|
|
@ -113,6 +116,38 @@ fn is_useful_structured_data(v: &serde_json::Value) -> bool {
|
|||
serialized.len() <= 4 * 1024
|
||||
}
|
||||
|
||||
/// Recursively remove long fields that duplicate the rendered markdown body.
|
||||
fn scrub_body_fields(v: &mut serde_json::Value) {
|
||||
const BODY_KEYS: &[&str] = &["articleBody"];
|
||||
const LONG_BODY_KEYS: &[&str] = &["body", "text", "description"];
|
||||
const LONG_THRESHOLD: usize = 500;
|
||||
|
||||
match v {
|
||||
serde_json::Value::Object(map) => {
|
||||
map.retain(|key, value| {
|
||||
if BODY_KEYS.contains(&key.as_str()) {
|
||||
return false;
|
||||
}
|
||||
if LONG_BODY_KEYS.contains(&key.as_str())
|
||||
&& value.as_str().is_some_and(|s| s.len() >= LONG_THRESHOLD)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
});
|
||||
for value in map.values_mut() {
|
||||
scrub_body_fields(value);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(values) => {
|
||||
for value in values {
|
||||
scrub_body_fields(value);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Integration tests that exercise the full pipeline through to_llm_text
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -797,6 +832,39 @@ mod tests {
|
|||
assert!(out.contains("Big news"));
|
||||
}
|
||||
|
||||
/// The structured-data section keeps the headline and a short description
/// but must not carry an `articleBody` field into the LLM text.
#[test]
fn structured_data_scrubs_duplicate_article_body() {
    let long_body = "This is the rendered article body. ".repeat(40);
    let article = serde_json::json!({
        "@type": "NewsArticle",
        "headline": "Big news",
        "articleBody": long_body,
        "description": "A short useful summary"
    });
    let extraction = make_result_with_structured(vec![article]);

    let out = to_llm_text(&extraction, None);

    for kept in ["Big news", "A short useful summary"] {
        assert!(out.contains(kept));
    }
    assert!(
        !out.contains("articleBody"),
        "Duplicate article body leaked: {out}"
    );
}
|
||||
|
||||
/// End-to-end check: a comment-count link followed by `Next` disappears from
/// the output, while real prose and the real article link are kept.
#[test]
fn llm_output_strips_comment_count_links_and_pagination() {
    let markdown = "Lead paragraph.\n\n[0](https://example.com/#comment-stream) Next\n\n5 minutes read\n\n[Article](https://example.com/article)";
    let extraction = make_result(markdown);

    let out = to_llm_text(&extraction, None);

    for kept in [
        "Lead paragraph.",
        "5 minutes read",
        "- Article: https://example.com/article",
    ] {
        assert!(out.contains(kept));
    }
    assert!(!out.contains("0 Next"), "Pagination leaked: {out}");
    assert!(
        !out.contains("comment-stream"),
        "Comment link leaked: {out}"
    );
}
|
||||
|
||||
#[test]
|
||||
fn structured_data_drops_oversized_blob() {
|
||||
// 32KB pageProps-style blob with no @type — should be dropped.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue