FAQ Section
+First question?
+{faq}
+diff --git a/crates/webclaw-core/src/extractor.rs b/crates/webclaw-core/src/extractor.rs index a26055f..3efe9e0 100644 --- a/crates/webclaw-core/src/extractor.rs +++ b/crates/webclaw-core/src/extractor.rs @@ -1484,3 +1484,56 @@ mod tests { ); } } + +#[cfg(test)] +mod form_integration_tests { + use super::*; + + #[test] + fn aspnet_form_content_extraction() { + let content = "x".repeat(600); // Ensure >500 chars + let html = format!(r#"
+ + "#); + let doc = Html::parse_document(&html); + let opts = ExtractionOptions::default(); + let result = extract_content(&doc, None, &opts); + assert!(result.markdown.contains("Section"), "h2 missing from markdown"); + assert!(result.markdown.contains("Question"), "h3 missing from markdown"); + } + + /// Simulate unclosed header div absorbing the content div. + /// The header's noise class should NOT propagate to the absorbed content + /// because the safety valve detects the header has >5000 chars (broken wrapper). + #[test] + fn unclosed_header_div_does_not_swallow_content() { + let faq = "Lorem ipsum dolor sit amet. ".repeat(300); // ~8400 chars + // The header div is intentionally NOT closed — the HTML parser makes + // div.content a child of div.header. The safety valve (>5000 chars) + // should prevent div.header from being treated as noise. + let html = format!(r#" +{faq}
+Deep content here
"); + for _ in 0..depth { + html.push_str("|
+ Column one first paragraph +Column one second paragraph + |
+
+ Column two content ++ Column two after rule + |
+
| + + + | ++ + | +
Above
Below
", None); diff --git a/crates/webclaw-core/src/noise.rs b/crates/webclaw-core/src/noise.rs index 46885f7..c9c9caf 100644 --- a/crates/webclaw-core/src/noise.rs +++ b/crates/webclaw-core/src/noise.rs @@ -7,9 +7,12 @@ use scraper::ElementRef; const NOISE_TAGS: &[&str] = &[ - "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "form", - "video", "audio", - "canvas", + "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video", + "audio", "canvas", + // NOTE: