fix: prevent stack overflow on deeply nested HTML pages

Pages like Express.co.uk live blogs nest 200+ DOM levels deep, overflowing the default 1 MB main-thread stack on Windows during recursive markdown conversion. Two-layer fix: (1) markdown.rs: add a depth parameter to node_to_md/children_to_md/inline_text with a MAX_DOM_DEPTH=200 guard that falls back to plain-text collection at the limit; (2) lib.rs: wrap extract_with_options in a worker thread with an 8 MB stack so that html5ever parsing and extraction both have room on deeply nested pages. Tested with an Express.co.uk live blog (previously crashed, now extracts 2000+ lines of clean markdown) and drudgereport.com (still works correctly).
2026-06-07 22:15:12 +02:00 · 2026-04-03 23:45:19 +02:00 · 2026-04-03 23:45:19 +02:00 · 74bac87435
commit 74bac87435
parent 95a6681b02
3 changed files with 129 additions and 28 deletions
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -45,10 +45,35 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
 /// `html`    — raw HTML string to parse
 /// `url`     — optional source URL, used for resolving relative links and domain detection
 /// `options` — controls include/exclude selectors, main content mode, and raw HTML output
+///
+/// Runs the extraction on a dedicated worker thread with an 8 MB stack so that
+/// deeply nested HTML (e.g., Express.co.uk live blogs) does not overflow the
+/// default 1 MB main-thread stack on Windows.
+///
+/// # Errors
+///
+/// Returns whatever error the extraction itself produces. If the worker thread
+/// cannot be spawned, or if it panics, [`ExtractError::NoContent`] is returned.
 pub fn extract_with_options(
     html: &str,
     url: Option<&str>,
     options: &ExtractionOptions,
+) -> Result<ExtractionResult, ExtractError> {
+    // The default main-thread stack on Windows is 1 MB, which can overflow
+    // on deeply nested pages.  Run on a worker thread with 8 MB to be safe.
+    const STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB
+
+    // A scoped thread is guaranteed to finish before `scope` returns, so the
+    // worker can borrow `html`, `url` and `options` directly; the document
+    // and the options no longer have to be copied to satisfy a `'static` bound.
+    std::thread::scope(|scope| {
+        std::thread::Builder::new()
+            .stack_size(STACK_SIZE)
+            .spawn_scoped(scope, || extract_with_options_inner(html, url, options))
+            // The OS refused to create the thread: report it as "no content".
+            .map_err(|_| ExtractError::NoContent)?
+            .join()
+            // `join` only fails when the worker panicked; the panic is turned
+            // into an error value instead of unwinding into the caller.
+            .unwrap_or(Err(ExtractError::NoContent))
+    })
+}
+
+fn extract_with_options_inner(
+    html: &str,
+    url: Option<&str>,
+    options: &ExtractionOptions,
 ) -> Result<ExtractionResult, ExtractError> {
    if html.is_empty() {
        return Err(ExtractError::NoContent);
@ -530,4 +555,44 @@ mod tests {
            "raw_html should be absent from JSON when None"
        );
    }
+
+    #[test]
+    fn express_live_blog_no_stack_overflow() {
+        // Regression fixture: a real Express.co.uk live blog page that used to
+        // overflow the stack during extraction.
+        let page = include_str!("../testdata/express_test.html");
+        let source_url = "https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks";
+
+        let outcome = extract(page, Some(source_url));
+        assert!(outcome.is_ok(), "Should not stack overflow on Express.co.uk live blog");
+
+        // Beyond "did not crash", make sure a non-trivial amount of text came out.
+        let extracted = outcome.unwrap();
+        let words = extracted.metadata.word_count;
+        assert!(
+            words > 100,
+            "Should extract meaningful content, got {} words",
+            words
+        );
+    }
+
+    #[test]
+    fn deeply_nested_html_no_stack_overflow() {
+        // Synthetic stand-in for pages like Express.co.uk live blogs: a single
+        // paragraph wrapped in `depth` nested <div><span> pairs.
+        let depth = 500;
+        let opening = "<div><span>".repeat(depth);
+        let closing = "</span></div>".repeat(depth);
+        let html = [
+            "<html><body>",
+            opening.as_str(),
+            "<p>Deep content here</p>",
+            closing.as_str(),
+            "</body></html>",
+        ]
+        .concat();
+
+        let outcome = extract(&html, None);
+        assert!(outcome.is_ok(), "Should not stack overflow on deeply nested HTML");
+
+        // The innermost paragraph must still make it into the markdown output.
+        let extracted = outcome.unwrap();
+        assert!(
+            extracted.content.markdown.contains("Deep content"),
+            "Should extract content from deep nesting"
+        );
+    }
 }