fix: prevent stack overflow on deeply nested HTML pages

Pages like Express.co.uk live blogs nest 200+ DOM levels deep, overflowing the default 1 MB main-thread stack on Windows during recursive markdown conversion. Two-layer fix: (1) markdown.rs: add a depth parameter to node_to_md/children_to_md/inline_text with a MAX_DOM_DEPTH=200 guard that falls back to plain-text collection at the limit; (2) lib.rs: run extract_with_options on a worker thread with an 8 MB stack so html5ever parsing and extraction both have room on deeply nested pages. Tested with an Express.co.uk live blog (previously crashed, now extracts 2000+ lines of clean markdown) and drudgereport.com (still works correctly).
2026-04-25 00:06:21 +02:00 · 2026-04-03 23:45:19 +02:00 · 2026-04-03 23:45:19 +02:00 · 74bac87435
commit 74bac87435
parent 95a6681b02
3 changed files with 129 additions and 28 deletions
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -45,10 +45,35 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
 /// `html`    — raw HTML string to parse
 /// `url`     — optional source URL, used for resolving relative links and domain detection
 /// `options` — controls include/exclude selectors, main content mode, and raw HTML output
+///
+/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
+/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1 MB
+/// main-thread stack on Windows.
 pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
+) -> Result<ExtractionResult, ExtractError> {
+    // The default main-thread stack on Windows is 1 MB, which can overflow
+    // on deeply nested pages.  Spawn a worker thread with 8 MB to be safe.
+    const STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB
+
+    let html = html.to_string();
+    let url = url.map(|u| u.to_string());
+    let options = options.clone();
+    std::thread::Builder::new()
+        .stack_size(STACK_SIZE)
+        .spawn(move || extract_with_options_inner(&html, url.as_deref(), &options))
+        .map_err(|_| ExtractError::NoContent)?
+        .join()
+        // A worker panic is a bug, not "no content": re-raise it on the caller.
+        .unwrap_or_else(|payload| std::panic::resume_unwind(payload))
+}
+
+fn extract_with_options_inner(
+    html: &str,
+    url: Option<&str>,
+    options: &ExtractionOptions,
 ) -> Result<ExtractionResult, ExtractError> {
    if html.is_empty() {
        return Err(ExtractError::NoContent);
@ -530,4 +555,44 @@ mod tests {
            "raw_html should be absent from JSON when None"
        );
    }
+
+    #[test]
+    fn express_live_blog_no_stack_overflow() {
+        // Real-world Express.co.uk live blog that previously caused stack overflow
+        let html = include_str!("../testdata/express_test.html");
+        let result = extract(
+            html,
+            Some("https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks"),
+        );
+        assert!(result.is_ok(), "Should not stack overflow on Express.co.uk live blog");
+        let result = result.unwrap();
+        assert!(
+            result.metadata.word_count > 100,
+            "Should extract meaningful content, got {} words",
+            result.metadata.word_count
+        );
+    }
+
+    #[test]
+    fn deeply_nested_html_no_stack_overflow() {
+        // Simulate deeply nested HTML like Express.co.uk live blogs
+        let depth = 500;
+        let mut html = String::from("<html><body>");
+        for _ in 0..depth {
+            html.push_str("<div><span>");
+        }
+        html.push_str("<p>Deep content here</p>");
+        for _ in 0..depth {
+            html.push_str("</span></div>");
+        }
+        html.push_str("</body></html>");
+
+        let result = extract(&html, None);
+        assert!(result.is_ok(), "Should not stack overflow on deeply nested HTML");
+        let result = result.unwrap();
+        assert!(
+            result.content.markdown.contains("Deep content"),
+            "Should extract content from deep nesting"
+        );
+    }
 }
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@ -14,6 +14,12 @@ use crate::types::{CodeBlock, Image, Link};

 static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());

+/// Maximum recursion depth for DOM traversal.
+/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
+/// overflowing the default ~1 MB stack on Windows.  When we hit this limit
+/// we fall back to plain-text collection (which uses an iterator, not recursion).
+const MAX_DOM_DEPTH: usize = 200;
+
 /// Collected assets found during conversion.
 pub struct ConvertedAssets {
    pub links: Vec<Link>,
@ -34,7 +40,7 @@ pub fn convert(
        code_blocks: Vec::new(),
    };

-    let md = node_to_md(element, base_url, &mut assets, 0, exclude);
+    let md = node_to_md(element, base_url, &mut assets, 0, exclude, 0);
    let plain = strip_markdown(&md);
    let md = collapse_whitespace(&md);
    let plain = collapse_whitespace(&plain);
@ -49,11 +55,17 @@ fn node_to_md(
    assets: &mut ConvertedAssets,
    list_depth: usize,
    exclude: &HashSet<NodeId>,
+    depth: usize,
 ) -> String {
    if exclude.contains(&element.id()) {
        return String::new();
    }

+    // Guard against deeply nested DOM trees (e.g., Express.co.uk live blogs).
+    if depth > MAX_DOM_DEPTH {
+        return collect_text(element);
+    }
+
    if noise::is_noise(element) || noise::is_noise_descendant(element) {
        // Still collect images and links from noise elements — they're useful
        // metadata even though we don't include the noise text in markdown.
@ -67,38 +79,38 @@ fn node_to_md(
        // Headings
        "h1" => format!(
            "\n\n# {}\n\n",
-            inline_text(element, base_url, assets, exclude)
+            inline_text(element, base_url, assets, exclude, depth)
        ),
        "h2" => format!(
            "\n\n## {}\n\n",
-            inline_text(element, base_url, assets, exclude)
+            inline_text(element, base_url, assets, exclude, depth)
        ),
        "h3" => format!(
            "\n\n### {}\n\n",
-            inline_text(element, base_url, assets, exclude)
+            inline_text(element, base_url, assets, exclude, depth)
        ),
        "h4" => format!(
            "\n\n#### {}\n\n",
-            inline_text(element, base_url, assets, exclude)
+            inline_text(element, base_url, assets, exclude, depth)
        ),
        "h5" => format!(
            "\n\n##### {}\n\n",
-            inline_text(element, base_url, assets, exclude)
+            inline_text(element, base_url, assets, exclude, depth)
        ),
        "h6" => format!(
            "\n\n###### {}\n\n",
-            inline_text(element, base_url, assets, exclude)
+            inline_text(element, base_url, assets, exclude, depth)
        ),

        // Paragraph
        "p" => format!(
            "\n\n{}\n\n",
-            inline_text(element, base_url, assets, exclude)
+            inline_text(element, base_url, assets, exclude, depth)
        ),

        // Links
        "a" => {
-            let text = inline_text(element, base_url, assets, exclude);
+            let text = inline_text(element, base_url, assets, exclude, depth);
            let href = element
                .value()
                .attr("href")
@ -167,18 +179,18 @@ fn node_to_md(
        // in <b>), treat as a container instead of inline bold.
        "strong" | "b" => {
            if cell_has_block_content(element) {
-                children_to_md(element, base_url, assets, list_depth, exclude)
+                children_to_md(element, base_url, assets, list_depth, exclude, depth)
            } else {
-                format!("**{}**", inline_text(element, base_url, assets, exclude))
+                format!("**{}**", inline_text(element, base_url, assets, exclude, depth))
            }
        }

        // Italic — same block-content check as bold.
        "em" | "i" => {
            if cell_has_block_content(element) {
-                children_to_md(element, base_url, assets, list_depth, exclude)
+                children_to_md(element, base_url, assets, list_depth, exclude, depth)
            } else {
-                format!("*{}*", inline_text(element, base_url, assets, exclude))
+                format!("*{}*", inline_text(element, base_url, assets, exclude, depth))
            }
        }

@ -213,13 +225,13 @@ fn node_to_md(
                            .attr("class")
                            .and_then(extract_language_from_class)
                    });
-                (collect_preformatted_text(code_el), lang)
+                (collect_preformatted_text(code_el, depth), lang)
            } else {
                let lang = element
                    .value()
                    .attr("class")
                    .and_then(extract_language_from_class);
-                (collect_preformatted_text(element), lang)
+                (collect_preformatted_text(element, depth), lang)
            };

            let code = code.trim_matches('\n').to_string();
@ -234,7 +246,7 @@ fn node_to_md(

        // Blockquote
        "blockquote" => {
-            let inner = children_to_md(element, base_url, assets, list_depth, exclude);
+            let inner = children_to_md(element, base_url, assets, list_depth, exclude, depth);
            let quoted = inner
                .trim()
                .lines()
@ -246,19 +258,19 @@ fn node_to_md(

        // Unordered list
        "ul" => {
-            let items = list_items(element, base_url, assets, list_depth, false, exclude);
+            let items = list_items(element, base_url, assets, list_depth, false, exclude, depth);
            format!("\n\n{items}\n\n")
        }

        // Ordered list
        "ol" => {
-            let items = list_items(element, base_url, assets, list_depth, true, exclude);
+            let items = list_items(element, base_url, assets, list_depth, true, exclude, depth);
            format!("\n\n{items}\n\n")
        }

        // List item — handled by ul/ol parent, but if encountered standalone:
        "li" => {
-            let text = inline_text(element, base_url, assets, exclude);
+            let text = inline_text(element, base_url, assets, exclude, depth);
            format!("- {text}\n")
        }

@ -271,11 +283,11 @@ fn node_to_md(
        // Table
        "table" => format!(
            "\n\n{}\n\n",
-            table_to_md(element, base_url, assets, exclude)
+            table_to_md(element, base_url, assets, exclude, depth)
        ),

        // Divs and other containers — just recurse
-        _ => children_to_md(element, base_url, assets, list_depth, exclude),
+        _ => children_to_md(element, base_url, assets, list_depth, exclude, depth),
    }
 }

@ -286,13 +298,14 @@ fn children_to_md(
    assets: &mut ConvertedAssets,
    list_depth: usize,
    exclude: &HashSet<NodeId>,
+    depth: usize,
 ) -> String {
    let mut out = String::new();
    for child in element.children() {
        match child.value() {
            Node::Element(_) => {
                if let Some(child_el) = ElementRef::wrap(child) {
-                    let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude);
+                    let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1);
                    if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
                        out.push(' ');
                    }
@ -315,13 +328,14 @@ fn inline_text(
    base_url: Option<&Url>,
    assets: &mut ConvertedAssets,
    exclude: &HashSet<NodeId>,
+    depth: usize,
 ) -> String {
    let mut out = String::new();
    for child in element.children() {
        match child.value() {
            Node::Element(_) => {
                if let Some(child_el) = ElementRef::wrap(child) {
-                    let chunk = node_to_md(child_el, base_url, assets, 0, exclude);
+                    let chunk = node_to_md(child_el, base_url, assets, 0, exclude, depth + 1);
                    if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
                        out.push(' ');
                    }
@ -356,7 +370,10 @@ fn collect_text(element: ElementRef<'_>) -> String {
 /// Every text node is pushed verbatim -- no trimming, no collapsing.
 /// Handles `<br>` as newlines and inserts newlines between block-level children
 /// (e.g., `<div>` lines produced by some syntax highlighters).
-fn collect_preformatted_text(element: ElementRef<'_>) -> String {
+fn collect_preformatted_text(element: ElementRef<'_>, depth: usize) -> String {
+    if depth > MAX_DOM_DEPTH {
+        return element.text().collect::<String>();
+    }
    let mut out = String::new();
    for child in element.children() {
        match child.value() {
@ -370,12 +387,12 @@ fn collect_preformatted_text(element: ElementRef<'_>) -> String {
                        if !out.is_empty() && !out.ends_with('\n') {
                            out.push('\n');
                        }
-                        out.push_str(&collect_preformatted_text(child_el));
+                        out.push_str(&collect_preformatted_text(child_el, depth + 1));
                        if !out.ends_with('\n') {
                            out.push('\n');
                        }
                    } else {
-                        out.push_str(&collect_preformatted_text(child_el));
+                        out.push_str(&collect_preformatted_text(child_el, depth + 1));
                    }
                }
            }
@ -405,6 +422,7 @@ fn list_items(
    depth: usize,
    ordered: bool,
    exclude: &HashSet<NodeId>,
+    dom_depth: usize,
 ) -> String {
    let indent = "  ".repeat(depth);
    let mut out = String::new();
@ -443,6 +461,7 @@ fn list_items(
                                depth + 1,
                                child_tag == "ol",
                                exclude,
+                                dom_depth + 1,
                            ));
                        } else {
                            inline_parts.push_str(&node_to_md(
@ -451,6 +470,7 @@ fn list_items(
                                assets,
                                depth,
                                exclude,
+                                dom_depth + 1,
                            ));
                        }
                    } else if let Some(text) = li_child.value().as_text() {
@ -495,6 +515,7 @@ fn table_to_md(
    base_url: Option<&Url>,
    assets: &mut ConvertedAssets,
    exclude: &HashSet<NodeId>,
+    depth: usize,
 ) -> String {
    // Collect all <td>/<th> cells grouped by row, and detect layout tables
    let mut raw_rows: Vec<Vec<ElementRef<'_>>> = Vec::new();
@ -542,7 +563,7 @@ fn table_to_md(
        for row in &raw_rows {
            for cell in row {
                let content =
-                    children_to_md(*cell, base_url, assets, 0, exclude);
+                    children_to_md(*cell, base_url, assets, 0, exclude, depth);
                let content = content.trim();
                if !content.is_empty() {
                    if !out.is_empty() {
@ -560,7 +581,7 @@ fn table_to_md(
        .iter()
        .map(|row| {
            row.iter()
-                .map(|c| inline_text(*c, base_url, assets, exclude))
+                .map(|c| inline_text(*c, base_url, assets, exclude, depth))
                .collect()
        })
        .collect();
--- a/crates/webclaw-core/testdata/express_test.html
+++ b/crates/webclaw-core/testdata/express_test.html