fix: prevent noise filter from swallowing content in malformed HTML

Two related fixes for content being stripped by the noise filter:

1. Remove <form> from unconditional noise tags. ASP.NET and similar
   frameworks wrap entire pages in a <form> tag — these are not input
   forms. Forms with >500 chars of text are now treated as content
   wrappers, not noise.

2. Add safety valve for class/ID noise matching. When malformed HTML
   leaves a noise container unclosed (e.g., <div class="header"> missing
   its </div>), the HTML5 parser makes all subsequent siblings into
   children of that container. A header/nav/footer with >5000 chars of
   text is almost certainly a broken wrapper absorbing real content —
   exempt it from noise filtering.
This commit is contained in:
devnen 2026-04-04 01:33:11 +02:00
parent 74bac87435
commit 70c67f2ed6
2 changed files with 138 additions and 7 deletions

View file

@ -1484,3 +1484,56 @@ mod tests {
);
}
}
#[cfg(test)]
mod form_integration_tests {
    use super::*;

    /// A page whose whole body is wrapped in a single `<form>` (the
    /// ASP.NET-style `page.aspx` post-back form) must still surface the
    /// headings nested inside that form in the extracted markdown.
    #[test]
    fn aspnet_form_content_extraction() {
        // 600 characters of paragraph text, i.e. more than 500 characters
        // of text inside the form.
        let filler = "x".repeat(600);
        let page = format!(
            r#"<html><body>
<form method="post" action="./page.aspx" id="form1">
<div class="wrapper">
<div class="header"><a href="/">Logo</a></div>
<div class="content">
<h2>Section</h2>
<h3>Question?</h3>
<p>{content}</p>
</div>
</div>
</form>
</body></html>"#,
            content = filler
        );

        let doc = Html::parse_document(&page);
        let options = ExtractionOptions::default();
        let extracted = extract_content(&doc, None, &options);

        // Each pair is (text that must appear, message reported when it does not).
        let expectations = [
            ("Section", "h2 missing from markdown"),
            ("Question", "h3 missing from markdown"),
        ];
        for (needle, failure) in expectations {
            assert!(extracted.markdown.contains(needle), "{}", failure);
        }
    }

    /// Malformed markup: `div.header` never receives its closing tag, so the
    /// parser nests `div.content` underneath it. With more than 5000
    /// characters of text inside, the header is expected to be recognised as
    /// a broken wrapper and exempted from noise filtering, so the FAQ
    /// headings must survive extraction.
    #[test]
    fn unclosed_header_div_does_not_swallow_content() {
        // 28 characters repeated 300 times: 8400 characters of body text.
        let filler = "Lorem ipsum dolor sit amet. ".repeat(300);

        // Note the missing `</div>` for `div.header` below; the omission is
        // deliberate and is what this test exercises.
        let page = format!(
            r#"<html><body>
<div class="wrapper">
<div class="header"><a href="/">Logo</a>
<div class="content">
<h2>FAQ Section</h2>
<h3>First question?</h3>
<p>{faq}</p>
</div>
</div>
</body></html>"#,
            faq = filler
        );

        let doc = Html::parse_document(&page);
        let options = ExtractionOptions::default();
        let extracted = extract_content(&doc, None, &options);

        // Each pair is (text that must appear, message reported when it does not).
        let expectations = [
            ("FAQ Section", "h2 missing: header swallowed content"),
            ("First question", "h3 missing: header swallowed content"),
        ];
        for (needle, failure) in expectations {
            assert!(extracted.markdown.contains(needle), "{}", failure);
        }
    }
}