fix: prevent stack overflow on deeply nested HTML pages

Pages like Express.co.uk live blogs nest 200+ DOM levels deep, overflowing
the default 1 MB main-thread stack on Windows during recursive markdown
conversion.

Two-layer fix:

1. markdown.rs: add depth parameter to node_to_md/children_to_md/inline_text
   with a MAX_DOM_DEPTH=200 guard — falls back to plain-text collection once
   nesting exceeds the limit

2. lib.rs: wrap extract_with_options in a worker thread with 8 MB stack so
   html5ever parsing and extraction both have room on deeply nested pages

Tested with Express.co.uk live blog (previously crashed, now extracts 2000+
lines of clean markdown) and drudgereport.com (still works correctly).
This commit is contained in:
devnen 2026-04-03 23:45:19 +02:00
parent 95a6681b02
commit 74bac87435
3 changed files with 129 additions and 28 deletions

View file

@ -45,10 +45,35 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
/// `html` — raw HTML string to parse
/// `url` — optional source URL, used for resolving relative links and domain detection
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
///
/// Extraction runs on a dedicated worker thread with an 8 MB stack so that deeply
/// nested HTML (e.g., Express.co.uk live blogs) does not overflow the 1 MB
/// main-thread stack that Windows provides by default.
///
/// If the worker thread cannot be spawned, extraction falls back to running on the
/// calling thread instead of failing outright. If the worker panics, the panic is
/// contained and reported as [`ExtractError::NoContent`].
pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
    const STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB

    // `Builder::spawn` requires a `'static` closure, so the worker gets owned copies.
    // They are bound to new names (rather than shadowing) so the borrowed originals
    // remain usable for the fallback path below.
    let owned_html = html.to_owned();
    let owned_url = url.map(str::to_owned);
    let owned_options = options.clone();

    let spawned = std::thread::Builder::new()
        .stack_size(STACK_SIZE)
        .spawn(move || {
            extract_with_options_inner(&owned_html, owned_url.as_deref(), &owned_options)
        });

    match spawned {
        // Best-effort: a panic inside the worker is contained and surfaced as
        // `NoContent` rather than unwinding into the caller.
        Ok(worker) => worker.join().unwrap_or(Err(ExtractError::NoContent)),
        // The OS refused to create the thread (resource exhaustion, or a target
        // without thread support). Running on the caller's stack is better than
        // reporting a misleading `NoContent` for a page that may extract fine.
        Err(_) => extract_with_options_inner(html, url, options),
    }
}
fn extract_with_options_inner(
html: &str,
url: Option<&str>,
options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
if html.is_empty() {
return Err(ExtractError::NoContent);
@ -530,4 +555,44 @@ mod tests {
"raw_html should be absent from JSON when None"
);
}
#[test]
fn express_live_blog_no_stack_overflow() {
    // Regression fixture: a saved Express.co.uk live blog page whose deep DOM
    // nesting used to overflow the stack during extraction.
    const PAGE_URL: &str =
        "https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks";
    let page = include_str!("../testdata/express_test.html");

    let outcome = extract(page, Some(PAGE_URL));
    assert!(outcome.is_ok(), "Should not stack overflow on Express.co.uk live blog");

    // Beyond merely surviving, the extraction must yield a real article body.
    let words = outcome.unwrap().metadata.word_count;
    assert!(
        words > 100,
        "Should extract meaningful content, got {} words",
        words
    );
}
#[test]
fn deeply_nested_html_no_stack_overflow() {
    // Synthetic stand-in for pages like Express.co.uk live blogs: one paragraph
    // wrapped in `levels` pairs of <div><span> containers.
    let levels = 500;
    let html = format!(
        "<html><body>{}<p>Deep content here</p>{}</body></html>",
        "<div><span>".repeat(levels),
        "</span></div>".repeat(levels),
    );

    let outcome = extract(&html, None);
    assert!(outcome.is_ok(), "Should not stack overflow on deeply nested HTML");

    // The innermost paragraph text must still reach the markdown output.
    let extracted = outcome.unwrap();
    assert!(
        extracted.content.markdown.contains("Deep content"),
        "Should extract content from deep nesting"
    );
}
}

View file

@ -14,6 +14,12 @@ use crate::types::{CodeBlock, Image, Link};
static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
/// Maximum recursion depth for DOM traversal during markdown conversion.
///
/// Express.co.uk live blogs and similar pages nest hundreds of levels deep,
/// which can overflow the default ~1 MB main-thread stack on Windows. Once the
/// traversal depth exceeds this limit (the check is strict, so depth 200 itself
/// is still converted normally), we stop recursing into that subtree and fall
/// back to plain-text collection (which uses an iterator, not recursion), so the
/// subtree's text is kept but its markdown structure is not.
const MAX_DOM_DEPTH: usize = 200;
/// Collected assets found during conversion.
pub struct ConvertedAssets {
pub links: Vec<Link>,
@ -34,7 +40,7 @@ pub fn convert(
code_blocks: Vec::new(),
};
let md = node_to_md(element, base_url, &mut assets, 0, exclude);
let md = node_to_md(element, base_url, &mut assets, 0, exclude, 0);
let plain = strip_markdown(&md);
let md = collapse_whitespace(&md);
let plain = collapse_whitespace(&plain);
@ -49,11 +55,17 @@ fn node_to_md(
assets: &mut ConvertedAssets,
list_depth: usize,
exclude: &HashSet<NodeId>,
depth: usize,
) -> String {
if exclude.contains(&element.id()) {
return String::new();
}
// Guard against deeply nested DOM trees (e.g., Express.co.uk live blogs).
if depth > MAX_DOM_DEPTH {
return collect_text(element);
}
if noise::is_noise(element) || noise::is_noise_descendant(element) {
// Still collect images and links from noise elements — they're useful
// metadata even though we don't include the noise text in markdown.
@ -67,38 +79,38 @@ fn node_to_md(
// Headings
"h1" => format!(
"\n\n# {}\n\n",
inline_text(element, base_url, assets, exclude)
inline_text(element, base_url, assets, exclude, depth)
),
"h2" => format!(
"\n\n## {}\n\n",
inline_text(element, base_url, assets, exclude)
inline_text(element, base_url, assets, exclude, depth)
),
"h3" => format!(
"\n\n### {}\n\n",
inline_text(element, base_url, assets, exclude)
inline_text(element, base_url, assets, exclude, depth)
),
"h4" => format!(
"\n\n#### {}\n\n",
inline_text(element, base_url, assets, exclude)
inline_text(element, base_url, assets, exclude, depth)
),
"h5" => format!(
"\n\n##### {}\n\n",
inline_text(element, base_url, assets, exclude)
inline_text(element, base_url, assets, exclude, depth)
),
"h6" => format!(
"\n\n###### {}\n\n",
inline_text(element, base_url, assets, exclude)
inline_text(element, base_url, assets, exclude, depth)
),
// Paragraph
"p" => format!(
"\n\n{}\n\n",
inline_text(element, base_url, assets, exclude)
inline_text(element, base_url, assets, exclude, depth)
),
// Links
"a" => {
let text = inline_text(element, base_url, assets, exclude);
let text = inline_text(element, base_url, assets, exclude, depth);
let href = element
.value()
.attr("href")
@ -167,18 +179,18 @@ fn node_to_md(
// in <b>), treat as a container instead of inline bold.
"strong" | "b" => {
if cell_has_block_content(element) {
children_to_md(element, base_url, assets, list_depth, exclude)
children_to_md(element, base_url, assets, list_depth, exclude, depth)
} else {
format!("**{}**", inline_text(element, base_url, assets, exclude))
format!("**{}**", inline_text(element, base_url, assets, exclude, depth))
}
}
// Italic — same block-content check as bold.
"em" | "i" => {
if cell_has_block_content(element) {
children_to_md(element, base_url, assets, list_depth, exclude)
children_to_md(element, base_url, assets, list_depth, exclude, depth)
} else {
format!("*{}*", inline_text(element, base_url, assets, exclude))
format!("*{}*", inline_text(element, base_url, assets, exclude, depth))
}
}
@ -213,13 +225,13 @@ fn node_to_md(
.attr("class")
.and_then(extract_language_from_class)
});
(collect_preformatted_text(code_el), lang)
(collect_preformatted_text(code_el, depth), lang)
} else {
let lang = element
.value()
.attr("class")
.and_then(extract_language_from_class);
(collect_preformatted_text(element), lang)
(collect_preformatted_text(element, depth), lang)
};
let code = code.trim_matches('\n').to_string();
@ -234,7 +246,7 @@ fn node_to_md(
// Blockquote
"blockquote" => {
let inner = children_to_md(element, base_url, assets, list_depth, exclude);
let inner = children_to_md(element, base_url, assets, list_depth, exclude, depth);
let quoted = inner
.trim()
.lines()
@ -246,19 +258,19 @@ fn node_to_md(
// Unordered list
"ul" => {
let items = list_items(element, base_url, assets, list_depth, false, exclude);
let items = list_items(element, base_url, assets, list_depth, false, exclude, depth);
format!("\n\n{items}\n\n")
}
// Ordered list
"ol" => {
let items = list_items(element, base_url, assets, list_depth, true, exclude);
let items = list_items(element, base_url, assets, list_depth, true, exclude, depth);
format!("\n\n{items}\n\n")
}
// List item — handled by ul/ol parent, but if encountered standalone:
"li" => {
let text = inline_text(element, base_url, assets, exclude);
let text = inline_text(element, base_url, assets, exclude, depth);
format!("- {text}\n")
}
@ -271,11 +283,11 @@ fn node_to_md(
// Table
"table" => format!(
"\n\n{}\n\n",
table_to_md(element, base_url, assets, exclude)
table_to_md(element, base_url, assets, exclude, depth)
),
// Divs and other containers — just recurse
_ => children_to_md(element, base_url, assets, list_depth, exclude),
_ => children_to_md(element, base_url, assets, list_depth, exclude, depth),
}
}
@ -286,13 +298,14 @@ fn children_to_md(
assets: &mut ConvertedAssets,
list_depth: usize,
exclude: &HashSet<NodeId>,
depth: usize,
) -> String {
let mut out = String::new();
for child in element.children() {
match child.value() {
Node::Element(_) => {
if let Some(child_el) = ElementRef::wrap(child) {
let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude);
let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1);
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
out.push(' ');
}
@ -315,13 +328,14 @@ fn inline_text(
base_url: Option<&Url>,
assets: &mut ConvertedAssets,
exclude: &HashSet<NodeId>,
depth: usize,
) -> String {
let mut out = String::new();
for child in element.children() {
match child.value() {
Node::Element(_) => {
if let Some(child_el) = ElementRef::wrap(child) {
let chunk = node_to_md(child_el, base_url, assets, 0, exclude);
let chunk = node_to_md(child_el, base_url, assets, 0, exclude, depth + 1);
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
out.push(' ');
}
@ -356,7 +370,10 @@ fn collect_text(element: ElementRef<'_>) -> String {
/// Every text node is pushed verbatim -- no trimming, no collapsing.
/// Handles `<br>` as newlines and inserts newlines between block-level children
/// (e.g., `<div>` lines produced by some syntax highlighters).
fn collect_preformatted_text(element: ElementRef<'_>) -> String {
fn collect_preformatted_text(element: ElementRef<'_>, depth: usize) -> String {
if depth > MAX_DOM_DEPTH {
return element.text().collect::<String>();
}
let mut out = String::new();
for child in element.children() {
match child.value() {
@ -370,12 +387,12 @@ fn collect_preformatted_text(element: ElementRef<'_>) -> String {
if !out.is_empty() && !out.ends_with('\n') {
out.push('\n');
}
out.push_str(&collect_preformatted_text(child_el));
out.push_str(&collect_preformatted_text(child_el, depth + 1));
if !out.ends_with('\n') {
out.push('\n');
}
} else {
out.push_str(&collect_preformatted_text(child_el));
out.push_str(&collect_preformatted_text(child_el, depth + 1));
}
}
}
@ -405,6 +422,7 @@ fn list_items(
depth: usize,
ordered: bool,
exclude: &HashSet<NodeId>,
dom_depth: usize,
) -> String {
let indent = " ".repeat(depth);
let mut out = String::new();
@ -443,6 +461,7 @@ fn list_items(
depth + 1,
child_tag == "ol",
exclude,
dom_depth + 1,
));
} else {
inline_parts.push_str(&node_to_md(
@ -451,6 +470,7 @@ fn list_items(
assets,
depth,
exclude,
dom_depth + 1,
));
}
} else if let Some(text) = li_child.value().as_text() {
@ -495,6 +515,7 @@ fn table_to_md(
base_url: Option<&Url>,
assets: &mut ConvertedAssets,
exclude: &HashSet<NodeId>,
depth: usize,
) -> String {
// Collect all <td>/<th> cells grouped by row, and detect layout tables
let mut raw_rows: Vec<Vec<ElementRef<'_>>> = Vec::new();
@ -542,7 +563,7 @@ fn table_to_md(
for row in &raw_rows {
for cell in row {
let content =
children_to_md(*cell, base_url, assets, 0, exclude);
children_to_md(*cell, base_url, assets, 0, exclude, depth);
let content = content.trim();
if !content.is_empty() {
if !out.is_empty() {
@ -560,7 +581,7 @@ fn table_to_md(
.iter()
.map(|row| {
row.iter()
.map(|c| inline_text(*c, base_url, assets, exclude))
.map(|c| inline_text(*c, base_url, assets, exclude, depth))
.collect()
})
.collect();

File diff suppressed because one or more lines are too long