fix: prevent noise filter from swallowing content in malformed HTML

Two related fixes for content being stripped by the noise filter:

1. Remove <form> from unconditional noise tags. ASP.NET and similar frameworks wrap entire pages in a <form> tag — these are not input forms. Forms with >500 chars of text are now treated as content wrappers, not noise.

2. Add safety valve for class/ID noise matching. When malformed HTML leaves a noise container unclosed (e.g., <div class="header"> missing its </div>), the HTML5 parser makes all subsequent siblings into children of that container. A header/nav/footer with >5000 chars of text is almost certainly a broken wrapper absorbing real content — exempt it from noise filtering.
2026-06-08 22:25:12 +02:00 · 2026-04-04 01:33:11 +02:00 · 2026-04-04 01:33:11 +02:00 · 70c67f2ed6
commit 70c67f2ed6
parent 74bac87435
2 changed files with 138 additions and 7 deletions
--- a/crates/webclaw-core/src/extractor.rs
+++ b/crates/webclaw-core/src/extractor.rs
@@ -1484,3 +1484,56 @@ mod tests {
        );
    }
 }
+
+#[cfg(test)]
+mod form_integration_tests {
+    use super::*;
+
+    #[test]
+    fn aspnet_form_content_extraction() {
+        let content = "x".repeat(600); // 600 chars: above the 500-char cutoff past which a <form> counts as a content wrapper
+        let html = format!(r#"<html><body>
+            <form method="post" action="./page.aspx" id="form1">
+                <div class="wrapper">
+                    <div class="header"><a href="/">Logo</a></div>
+                    <div class="content">
+                        <h2>Section</h2>
+                        <h3>Question?</h3>
+                        <p>{content}</p>
+                    </div>
+                </div>
+            </form>
+        </body></html>"#);
+        let doc = Html::parse_document(&html);
+        let opts = ExtractionOptions::default();
+        let result = extract_content(&doc, None, &opts);
+        assert!(result.markdown.contains("Section"), "h2 missing from markdown");
+        assert!(result.markdown.contains("Question"), "h3 missing from markdown");
+    }
+
+    /// Regression test: an unclosed `<div class="header">` absorbs `div.content` as its child.
+    /// The header's noise class must NOT cause the absorbed content to be stripped,
+    /// because the safety valve exempts a header holding >5000 chars of text (a broken wrapper).
+    #[test]
+    fn unclosed_header_div_does_not_swallow_content() {
+        let faq = "Lorem ipsum dolor sit amet. ".repeat(300); // 28 chars x 300 = 8400 chars, above the 5000-char safety valve
+        // The header div is intentionally left unclosed, so the HTML parser nests
+        // div.content inside div.header. With >5000 chars of text inside it, the
+        // safety valve should keep div.header from being filtered out as noise.
+        let html = format!(r#"<html><body>
+            <div class="wrapper">
+                <div class="header"><a href="/">Logo</a>
+                <div class="content">
+                    <h2>FAQ Section</h2>
+                    <h3>First question?</h3>
+                    <p>{faq}</p>
+                </div>
+            </div>
+        </body></html>"#);
+        let doc = Html::parse_document(&html);
+        let opts = ExtractionOptions::default();
+        let result = extract_content(&doc, None, &opts);
+        assert!(result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content");
+        assert!(result.markdown.contains("First question"), "h3 missing: header swallowed content");
+    }
+}