fix: prevent noise filter from swallowing content in malformed HTML

Two related fixes for content being stripped by the noise filter:

1. Remove <form> from the unconditional noise tags. ASP.NET and similar frameworks wrap entire pages in a <form> tag — these are not input forms. Forms with at least 500 bytes of text are now treated as content wrappers, not noise, unless their class attribute marks them as a login/search/subscribe/signup/newsletter/contact form.

2. Add a safety valve for class/ID noise matching. When malformed HTML leaves a noise container unclosed (e.g., <div class="header"> missing its </div>), the HTML5 parser makes all subsequent siblings into children of that container. A header/nav/footer with more than 5000 bytes of text is almost certainly a broken wrapper absorbing real content — exempt it from noise filtering.
2026-04-25 00:06:21 +02:00 · 2026-04-04 01:33:11 +02:00 · 2026-04-04 01:33:11 +02:00 · 70c67f2ed6
commit 70c67f2ed6
parent 74bac87435
2 changed files with 138 additions and 7 deletions
--- a/crates/webclaw-core/src/extractor.rs
+++ b/crates/webclaw-core/src/extractor.rs
@ -1484,3 +1484,56 @@ mod tests {
        );
    }
 }
 #[cfg(test)]
 mod form_integration_tests {
    use super::*;

    /// A page whose entire body is wrapped in an ASP.NET-style `<form>` must
    /// still surface its headings in the extracted markdown.
    #[test]
    fn aspnet_form_content_extraction() {
        // 600 bytes of paragraph text keeps the form above the 500-byte
        // threshold below which a <form> is classified as noise.
        let filler = "x".repeat(600);
        let page = format!(
            r#"<html><body>
            <form method="post" action="./page.aspx" id="form1">
                <div class="wrapper">
                    <div class="header"><a href="/">Logo</a></div>
                    <div class="content">
                        <h2>Section</h2>
                        <h3>Question?</h3>
                        <p>{content}</p>
                    </div>
                </div>
            </form>
        </body></html>"#,
            content = filler
        );

        let document = Html::parse_document(&page);
        let options = ExtractionOptions::default();
        let extracted = extract_content(&document, None, &options);

        // Checked in document order: the <h2> first, then the <h3>.
        let expectations = [
            ("Section", "h2 missing from markdown"),
            ("Question", "h3 missing from markdown"),
        ];
        for (needle, failure) in expectations {
            assert!(extracted.markdown.contains(needle), "{}", failure);
        }
    }

    /// An unclosed `div.header` ends up as the parent of `div.content` once
    /// the HTML is parsed. Because the header then carries more than 5000
    /// bytes of text, the noise filter's safety valve must treat it as a
    /// broken wrapper and keep the absorbed content.
    #[test]
    fn unclosed_header_div_does_not_swallow_content() {
        // 28 bytes repeated 300 times: roughly 8400 bytes of body text.
        let body_text = "Lorem ipsum dolor sit amet. ".repeat(300);

        // NOTE: the header <div> below is deliberately left without its
        // closing tag so that the content <div> is nested inside it.
        let page = format!(
            r#"<html><body>
            <div class="wrapper">
                <div class="header"><a href="/">Logo</a>
                <div class="content">
                    <h2>FAQ Section</h2>
                    <h3>First question?</h3>
                    <p>{faq}</p>
                </div>
            </div>
        </body></html>"#,
            faq = body_text
        );

        let document = Html::parse_document(&page);
        let options = ExtractionOptions::default();
        let extracted = extract_content(&document, None, &options);

        let expectations = [
            ("FAQ Section", "h2 missing: header swallowed content"),
            ("First question", "h3 missing: header swallowed content"),
        ];
        for (needle, failure) in expectations {
            assert!(extracted.markdown.contains(needle), "{}", failure);
        }
    }
 }
--- a/crates/webclaw-core/src/noise.rs
+++ b/crates/webclaw-core/src/noise.rs
@ -7,9 +7,12 @@
 use scraper::ElementRef;
 const NOISE_TAGS: &[&str] = &[
-    "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "form",
+    "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video",
-    "video", "audio",
+    "audio", "canvas",
-    "canvas",
+    // NOTE: <form> removed from this list — ASP.NET and similar frameworks wrap the
    // entire page body in a single <form> tag that contains all real content.
    // Forms are now handled with a heuristic in is_noise() that distinguishes
    // small input forms (noise) from page-wrapping forms (not noise).
    // NOTE: <picture> removed — it's a responsive image container, not noise.
    // <picture> wraps <source> and <img> for responsive images.
 ];
@ -189,6 +192,28 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
        return true;
    }
    // <form> heuristic: ASP.NET wraps the entire page body in a single <form>.
    // These page-wrapping forms contain hundreds of words of real content.
    // Small forms (login, search, newsletter) are noise.
    if tag == "form" {
        let text_len = el.text().collect::<String>().len();
        // A form with substantial text (>= 500 bytes) is likely a page wrapper, not noise.
        // Small forms (login/search/subscribe) rarely exceed a few hundred chars.
        if text_len < 500 {
            return true;
        }
        // Also check noise classes/IDs — a big form with class="login-form" is still noise
        if let Some(class) = el.value().attr("class") {
            let cl = class.to_lowercase();
            if cl.contains("login") || cl.contains("search") || cl.contains("subscribe")
                || cl.contains("signup") || cl.contains("newsletter") || cl.contains("contact")
            {
                return true;
            }
        }
        return false;
    }
    // ARIA role-based noise
    if let Some(role) = el.value().attr("role")
        && NOISE_ROLES.contains(&role)
@ -200,10 +225,12 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
    // check each against the noise list. "free-modal-container" splits into
    // ["free-modal-container"] which does NOT match "modal".
    if let Some(class) = el.value().attr("class") {
        let mut class_matched = false;
        for token in class.split_whitespace() {
            let lower = token.to_lowercase();
            if NOISE_CLASSES.contains(&lower.as_str()) {
-                return true;
+                class_matched = true;
                break;
            }
            // Structural elements use compound names (FooterLinks, Header-nav, etc.)
            // These are always noise regardless of compound form.
@ -211,11 +238,24 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
                || lower.starts_with("header-")
                || lower.starts_with("nav-")
            {
-                return true;
+                class_matched = true;
                break;
            }
        }
-        // Also check for ad-specific patterns (standalone "ad" class)
+        if !class_matched {
-        if is_ad_class(class) {
+            class_matched = is_ad_class(class);
        }
        if class_matched {
            // Safety valve: malformed HTML can leave noise containers unclosed,
            // causing them to absorb the entire page content. A real header/nav/
            // footer rarely exceeds a few thousand characters of text. If a
            // noise-class element has massive text content, it's almost certainly
            // a broken wrapper — treat it as content, not noise.
            let text_len = el.text().collect::<String>().len();
            if text_len > 5000 {
                return false;
            }
            return true;
        }
    }
@ -224,6 +264,11 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
    if let Some(id) = el.value().attr("id") {
        let id_lower = id.to_lowercase();
        if NOISE_IDS.contains(&id_lower.as_str()) && !is_structural_id(&id_lower) {
            // Same safety valve for ID-matched noise elements
            let text_len = el.text().collect::<String>().len();
            if text_len > 5000 {
                return false;
            }
            return true;
        }
        // Cookie consent platform IDs (prefix match — these generate huge overlays)
@ -754,3 +799,36 @@ mod tests {
        ));
    }
 }
 #[cfg(test)]
 mod form_tests {
    use super::*;
    use scraper::{Html, Selector};

    /// A `<form>` that wraps a whole page of real text (ASP.NET style) has at
    /// least 500 bytes of text and must not be classified as noise.
    #[test]
    fn aspnet_page_wrapping_form_is_not_noise() {
        const ASPNET_PAGE: &str = r#"<html><body><form method="post" action="./page.aspx" id="form1"><div class="wrapper"><div class="content"><h1>Support</h1><h3>Question one?</h3><p>Long answer text that should definitely be captured by the extraction engine. This is real content with multiple sentences to ensure it passes any text length thresholds in the scoring algorithm. We need at least five hundred characters of actual text content here to exceed the threshold. Adding more sentences about various topics including data formats, historical prices, stock market analysis, technical indicators, and trading strategies. This paragraph discusses how intraday data can be used for backtesting quantitative models and developing automated trading systems.</p><h3>Question two?</h3><p>Another substantial answer paragraph with detailed information about the product features and capabilities.</p></div></div></form></body></html>"#;

        let document = Html::parse_document(ASPNET_PAGE);
        let form_selector = Selector::parse("form").unwrap();
        let form = document.select(&form_selector).next().unwrap();

        // Guard the fixture itself: total byte length of the form's text nodes.
        let text_len: usize = form.text().map(str::len).sum();
        assert!(text_len >= 500, "Form text should be >= 500 chars, got {text_len}");

        assert!(!is_noise(form), "ASP.NET page-wrapping form should NOT be noise");
    }

    /// A short login form with almost no text must still be classified as noise.
    #[test]
    fn small_login_form_is_noise() {
        let markup = r#"
        <html><body>
        <form action="/login">
            <input type="text" name="user" />
            <input type="password" name="pass" />
            <button>Login</button>
        </form>
        </body></html>
        "#;

        let document = Html::parse_document(markup);
        let form_selector = Selector::parse("form").unwrap();
        let form = document.select(&form_selector).next().unwrap();

        assert!(is_noise(form), "Small login form SHOULD be noise");
    }
 }