mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-08 22:25:12 +02:00
fix: prevent noise filter from swallowing content in malformed HTML
Two related fixes for content being stripped by the noise filter:

1. Remove <form> from unconditional noise tags. ASP.NET and similar frameworks wrap entire pages in a <form> tag — these are not input forms. Forms with >500 chars of text are now treated as content wrappers, not noise.

2. Add safety valve for class/ID noise matching. When malformed HTML leaves a noise container unclosed (e.g., <div class="header"> missing its </div>), the HTML5 parser makes all subsequent siblings into children of that container. A header/nav/footer with >5000 chars of text is almost certainly a broken wrapper absorbing real content — exempt it from noise filtering.
This commit is contained in:
parent
74bac87435
commit
70c67f2ed6
2 changed files with 138 additions and 7 deletions
|
|
@@ -1484,3 +1484,56 @@ mod tests {
|
|||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod form_integration_tests {
    use super::*;

    /// A page whose whole body sits inside one `<form>` element (the
    /// ASP.NET layout) must still expose its headings in the markdown output.
    #[test]
    fn aspnet_form_content_extraction() {
        // 600 filler characters: comfortably more than 500 chars of form text.
        let content = "x".repeat(600);
        let page = format!(
            r#"<html><body>
<form method="post" action="./page.aspx" id="form1">
<div class="wrapper">
<div class="header"><a href="/">Logo</a></div>
<div class="content">
<h2>Section</h2>
<h3>Question?</h3>
<p>{content}</p>
</div>
</div>
</form>
</body></html>"#
        );

        let parsed = Html::parse_document(&page);
        let extracted = extract_content(&parsed, None, &ExtractionOptions::default());
        let markdown = &extracted.markdown;

        assert!(markdown.contains("Section"), "h2 missing from markdown");
        assert!(markdown.contains("Question"), "h3 missing from markdown");
    }

    /// Malformed markup: `div.header` never gets its closing tag, so the
    /// parser nests `div.content` inside it. With well over 5000 chars of
    /// text inside, the header must be recognised as a broken wrapper and
    /// its noise class must not hide the nested headings.
    #[test]
    fn unclosed_header_div_does_not_swallow_content() {
        // 28 chars repeated 300 times = 8400 chars, above the 5000-char valve.
        let faq = "Lorem ipsum dolor sit amet. ".repeat(300);
        // Note the missing `</div>` for `div.header` below: that omission is
        // the whole point of the fixture, so do not "repair" the markup.
        let page = format!(
            r#"<html><body>
<div class="wrapper">
<div class="header"><a href="/">Logo</a>
<div class="content">
<h2>FAQ Section</h2>
<h3>First question?</h3>
<p>{faq}</p>
</div>
</div>
</body></html>"#
        );

        let parsed = Html::parse_document(&page);
        let extracted = extract_content(&parsed, None, &ExtractionOptions::default());
        let markdown = &extracted.markdown;

        assert!(
            markdown.contains("FAQ Section"),
            "h2 missing: header swallowed content"
        );
        assert!(
            markdown.contains("First question"),
            "h3 missing: header swallowed content"
        );
    }
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue