FAQ Section
+First question?
+{faq}
+diff --git a/crates/webclaw-core/src/extractor.rs b/crates/webclaw-core/src/extractor.rs index a26055f..3efe9e0 100644 --- a/crates/webclaw-core/src/extractor.rs +++ b/crates/webclaw-core/src/extractor.rs @@ -1484,3 +1484,56 @@ mod tests { ); } } + +#[cfg(test)] +mod form_integration_tests { + use super::*; + + #[test] + fn aspnet_form_content_extraction() { + let content = "x".repeat(600); // Ensure >500 chars + let html = format!(r#"
+ + "#); + let doc = Html::parse_document(&html); + let opts = ExtractionOptions::default(); + let result = extract_content(&doc, None, &opts); + assert!(result.markdown.contains("Section"), "h2 missing from markdown"); + assert!(result.markdown.contains("Question"), "h3 missing from markdown"); + } + + /// Simulate unclosed header div absorbing the content div. + /// The header's noise class should NOT propagate to the absorbed content + /// because the safety valve detects the header has >5000 chars (broken wrapper). + #[test] + fn unclosed_header_div_does_not_swallow_content() { + let faq = "Lorem ipsum dolor sit amet. ".repeat(300); // ~8400 chars + // The header div is intentionally NOT closed — the HTML parser makes + // div.content a child of div.header. The safety valve (>5000 chars) + // should prevent div.header from being treated as noise. + let html = format!(r#" +{faq}
+