mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-07 22:15:12 +02:00
fix: clean llm output noise
Port the valid PR #43 LLM cleanup fixes onto current main without stale branch regressions.

Includes comment-count link cleanup, bare numeric paragraph cleanup, pagination leftover cleanup, JSON-LD article body scrubbing, clearer CLI consent-wall warnings, and quieter parser logs by default.

Thanks to @devnen for the report and patch work.
This commit is contained in:
parent
5eef8358b0
commit
3fabdc1d02
8 changed files with 348 additions and 18 deletions
|
|
@@ -69,6 +69,18 @@ fn is_noise_link(text: &str, href: &str) -> bool {
|
|||
return true;
|
||||
}
|
||||
|
||||
// Bare integer labels are usually comment counts, vote counts, or page
|
||||
// numbers. The label alone carries no useful link context for an LLM.
|
||||
if !text.is_empty() && text.len() <= 4 && text.chars().all(|c| c.is_ascii_digit()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// In-page comment/discussion fragments that survived the bare-fragment
|
||||
// check because the href is a full URL with a comment fragment.
|
||||
if href.contains("#comment-stream") || href.contains("#comments") || href.contains("#disqus") {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Internal user profile / action URLs (HN-style)
|
||||
if href.contains("/user?id=")
|
||||
|| href.contains("/hide?id=")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue