fix: clean llm output noise

Port the valid PR #43 LLM cleanup fixes onto current main without stale branch regressions.

Includes comment-count link cleanup, bare numeric paragraph cleanup, pagination leftover cleanup, JSON-LD article body scrubbing, clearer CLI consent-wall warnings, and quieter parser logs by default.

Thanks to @devnen for the report and patch work.
This commit is contained in:
Valerio 2026-05-18 18:39:33 +02:00 committed by GitHub
parent 5eef8358b0
commit 3fabdc1d02
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 348 additions and 18 deletions

View file

@@ -69,6 +69,18 @@ fn is_noise_link(text: &str, href: &str) -> bool {
return true;
}
// Bare integer labels are usually comment counts, vote counts, or page
// numbers. The label alone carries no useful link context for an LLM.
if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
return true;
}
// In-page comment/discussion fragments that survived the bare-fragment
// check because the href is a full URL with a comment fragment.
if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
return true;
}
// Internal user profile / action URLs (HN-style)
if href.contains("/user?id=")
|| href.contains("/hide?id=")