mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix: prevent stack overflow on deeply nested HTML pages
Pages like Express.co.uk live blogs nest 200+ DOM levels deep, overflowing the default 1 MB main-thread stack on Windows during recursive markdown conversion.

Two-layer fix:
1. markdown.rs: add a depth parameter to node_to_md/children_to_md/inline_text with a MAX_DOM_DEPTH=200 guard that falls back to plain-text collection once the limit is exceeded.
2. lib.rs: run extract_with_options on a worker thread with an 8 MB stack so html5ever parsing and extraction both have room on deeply nested pages.

Tested with an Express.co.uk live blog (previously crashed, now extracts 2000+ lines of clean markdown) and drudgereport.com (still works correctly).
This commit is contained in:
parent
95a6681b02
commit
74bac87435
3 changed files with 129 additions and 28 deletions
|
|
@ -45,10 +45,35 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
|
|||
/// `html` — raw HTML string to parse
|
||||
/// `url` — optional source URL, used for resolving relative links and domain detection
|
||||
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
||||
///
|
||||
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
|
||||
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1 MB
|
||||
/// main-thread stack on Windows.
|
||||
pub fn extract_with_options(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
options: &ExtractionOptions,
|
||||
) -> Result<ExtractionResult, ExtractError> {
|
||||
// The default main-thread stack on Windows is 1 MB, which can overflow
|
||||
// on deeply nested pages. Spawn a worker thread with 8 MB to be safe.
|
||||
const STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB
|
||||
|
||||
let html = html.to_string();
|
||||
let url = url.map(|u| u.to_string());
|
||||
let options = options.clone();
|
||||
|
||||
std::thread::Builder::new()
|
||||
.stack_size(STACK_SIZE)
|
||||
.spawn(move || extract_with_options_inner(&html, url.as_deref(), &options))
|
||||
.map_err(|_| ExtractError::NoContent)?
|
||||
.join()
|
||||
.unwrap_or(Err(ExtractError::NoContent))
|
||||
}
|
||||
|
||||
fn extract_with_options_inner(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
options: &ExtractionOptions,
|
||||
) -> Result<ExtractionResult, ExtractError> {
|
||||
if html.is_empty() {
|
||||
return Err(ExtractError::NoContent);
|
||||
|
|
@ -530,4 +555,44 @@ mod tests {
|
|||
"raw_html should be absent from JSON when None"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn express_live_blog_no_stack_overflow() {
    // Regression fixture: a saved Express.co.uk live blog page whose nesting
    // used to overflow the stack during extraction.
    const PAGE_URL: &str = "https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks";
    let page = include_str!("../testdata/express_test.html");

    let outcome = extract(page, Some(PAGE_URL));
    assert!(outcome.is_ok(), "Should not stack overflow on Express.co.uk live blog");

    // Surviving the page is not enough; real article text must come out.
    let words = outcome.unwrap().metadata.word_count;
    assert!(
        words > 100,
        "Should extract meaningful content, got {} words",
        words
    );
}
|
||||
|
||||
#[test]
fn deeply_nested_html_no_stack_overflow() {
    // Synthetic stand-in for pages like Express.co.uk live blogs: one
    // paragraph wrapped in NESTING pairs of <div><span> elements.
    const NESTING: usize = 500;
    let opening = "<div><span>".repeat(NESTING);
    let closing = "</span></div>".repeat(NESTING);
    let html = [
        "<html><body>",
        opening.as_str(),
        "<p>Deep content here</p>",
        closing.as_str(),
        "</body></html>",
    ]
    .concat();

    let outcome = extract(&html, None);
    assert!(outcome.is_ok(), "Should not stack overflow on deeply nested HTML");

    // The innermost paragraph must still appear in the markdown output.
    let extracted = outcome.unwrap();
    assert!(
        extracted.content.markdown.contains("Deep content"),
        "Should extract content from deep nesting"
    );
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,12 @@ use crate::types::{CodeBlock, Image, Link};
|
|||
|
||||
static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
|
||||
|
||||
/// Maximum recursion depth for DOM traversal.
|
||||
/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
|
||||
/// overflowing the default ~1 MB stack on Windows. When we exceed this limit
|
||||
/// we fall back to plain-text collection (which uses an iterator, not recursion).
|
||||
const MAX_DOM_DEPTH: usize = 200;
|
||||
|
||||
/// Collected assets found during conversion.
|
||||
pub struct ConvertedAssets {
|
||||
pub links: Vec<Link>,
|
||||
|
|
@ -34,7 +40,7 @@ pub fn convert(
|
|||
code_blocks: Vec::new(),
|
||||
};
|
||||
|
||||
let md = node_to_md(element, base_url, &mut assets, 0, exclude);
|
||||
let md = node_to_md(element, base_url, &mut assets, 0, exclude, 0);
|
||||
let plain = strip_markdown(&md);
|
||||
let md = collapse_whitespace(&md);
|
||||
let plain = collapse_whitespace(&plain);
|
||||
|
|
@ -49,11 +55,17 @@ fn node_to_md(
|
|||
assets: &mut ConvertedAssets,
|
||||
list_depth: usize,
|
||||
exclude: &HashSet<NodeId>,
|
||||
depth: usize,
|
||||
) -> String {
|
||||
if exclude.contains(&element.id()) {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
// Guard against deeply nested DOM trees (e.g., Express.co.uk live blogs).
|
||||
if depth > MAX_DOM_DEPTH {
|
||||
return collect_text(element);
|
||||
}
|
||||
|
||||
if noise::is_noise(element) || noise::is_noise_descendant(element) {
|
||||
// Still collect images and links from noise elements — they're useful
|
||||
// metadata even though we don't include the noise text in markdown.
|
||||
|
|
@ -67,38 +79,38 @@ fn node_to_md(
|
|||
// Headings
|
||||
"h1" => format!(
|
||||
"\n\n# {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h2" => format!(
|
||||
"\n\n## {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h3" => format!(
|
||||
"\n\n### {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h4" => format!(
|
||||
"\n\n#### {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h5" => format!(
|
||||
"\n\n##### {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h6" => format!(
|
||||
"\n\n###### {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
|
||||
// Paragraph
|
||||
"p" => format!(
|
||||
"\n\n{}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
|
||||
// Links
|
||||
"a" => {
|
||||
let text = inline_text(element, base_url, assets, exclude);
|
||||
let text = inline_text(element, base_url, assets, exclude, depth);
|
||||
let href = element
|
||||
.value()
|
||||
.attr("href")
|
||||
|
|
@ -167,18 +179,18 @@ fn node_to_md(
|
|||
// in <b>), treat as a container instead of inline bold.
|
||||
"strong" | "b" => {
|
||||
if cell_has_block_content(element) {
|
||||
children_to_md(element, base_url, assets, list_depth, exclude)
|
||||
children_to_md(element, base_url, assets, list_depth, exclude, depth)
|
||||
} else {
|
||||
format!("**{}**", inline_text(element, base_url, assets, exclude))
|
||||
format!("**{}**", inline_text(element, base_url, assets, exclude, depth))
|
||||
}
|
||||
}
|
||||
|
||||
// Italic — same block-content check as bold.
|
||||
"em" | "i" => {
|
||||
if cell_has_block_content(element) {
|
||||
children_to_md(element, base_url, assets, list_depth, exclude)
|
||||
children_to_md(element, base_url, assets, list_depth, exclude, depth)
|
||||
} else {
|
||||
format!("*{}*", inline_text(element, base_url, assets, exclude))
|
||||
format!("*{}*", inline_text(element, base_url, assets, exclude, depth))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -213,13 +225,13 @@ fn node_to_md(
|
|||
.attr("class")
|
||||
.and_then(extract_language_from_class)
|
||||
});
|
||||
(collect_preformatted_text(code_el), lang)
|
||||
(collect_preformatted_text(code_el, depth), lang)
|
||||
} else {
|
||||
let lang = element
|
||||
.value()
|
||||
.attr("class")
|
||||
.and_then(extract_language_from_class);
|
||||
(collect_preformatted_text(element), lang)
|
||||
(collect_preformatted_text(element, depth), lang)
|
||||
};
|
||||
|
||||
let code = code.trim_matches('\n').to_string();
|
||||
|
|
@ -234,7 +246,7 @@ fn node_to_md(
|
|||
|
||||
// Blockquote
|
||||
"blockquote" => {
|
||||
let inner = children_to_md(element, base_url, assets, list_depth, exclude);
|
||||
let inner = children_to_md(element, base_url, assets, list_depth, exclude, depth);
|
||||
let quoted = inner
|
||||
.trim()
|
||||
.lines()
|
||||
|
|
@ -246,19 +258,19 @@ fn node_to_md(
|
|||
|
||||
// Unordered list
|
||||
"ul" => {
|
||||
let items = list_items(element, base_url, assets, list_depth, false, exclude);
|
||||
let items = list_items(element, base_url, assets, list_depth, false, exclude, depth);
|
||||
format!("\n\n{items}\n\n")
|
||||
}
|
||||
|
||||
// Ordered list
|
||||
"ol" => {
|
||||
let items = list_items(element, base_url, assets, list_depth, true, exclude);
|
||||
let items = list_items(element, base_url, assets, list_depth, true, exclude, depth);
|
||||
format!("\n\n{items}\n\n")
|
||||
}
|
||||
|
||||
// List item — handled by ul/ol parent, but if encountered standalone:
|
||||
"li" => {
|
||||
let text = inline_text(element, base_url, assets, exclude);
|
||||
let text = inline_text(element, base_url, assets, exclude, depth);
|
||||
format!("- {text}\n")
|
||||
}
|
||||
|
||||
|
|
@ -271,11 +283,11 @@ fn node_to_md(
|
|||
// Table
|
||||
"table" => format!(
|
||||
"\n\n{}\n\n",
|
||||
table_to_md(element, base_url, assets, exclude)
|
||||
table_to_md(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
|
||||
// Divs and other containers — just recurse
|
||||
_ => children_to_md(element, base_url, assets, list_depth, exclude),
|
||||
_ => children_to_md(element, base_url, assets, list_depth, exclude, depth),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -286,13 +298,14 @@ fn children_to_md(
|
|||
assets: &mut ConvertedAssets,
|
||||
list_depth: usize,
|
||||
exclude: &HashSet<NodeId>,
|
||||
depth: usize,
|
||||
) -> String {
|
||||
let mut out = String::new();
|
||||
for child in element.children() {
|
||||
match child.value() {
|
||||
Node::Element(_) => {
|
||||
if let Some(child_el) = ElementRef::wrap(child) {
|
||||
let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude);
|
||||
let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1);
|
||||
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
|
||||
out.push(' ');
|
||||
}
|
||||
|
|
@ -315,13 +328,14 @@ fn inline_text(
|
|||
base_url: Option<&Url>,
|
||||
assets: &mut ConvertedAssets,
|
||||
exclude: &HashSet<NodeId>,
|
||||
depth: usize,
|
||||
) -> String {
|
||||
let mut out = String::new();
|
||||
for child in element.children() {
|
||||
match child.value() {
|
||||
Node::Element(_) => {
|
||||
if let Some(child_el) = ElementRef::wrap(child) {
|
||||
let chunk = node_to_md(child_el, base_url, assets, 0, exclude);
|
||||
let chunk = node_to_md(child_el, base_url, assets, 0, exclude, depth + 1);
|
||||
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
|
||||
out.push(' ');
|
||||
}
|
||||
|
|
@ -356,7 +370,10 @@ fn collect_text(element: ElementRef<'_>) -> String {
|
|||
/// Every text node is pushed verbatim -- no trimming, no collapsing.
|
||||
/// Handles `<br>` as newlines and inserts newlines between block-level children
|
||||
/// (e.g., `<div>` lines produced by some syntax highlighters).
|
||||
fn collect_preformatted_text(element: ElementRef<'_>) -> String {
|
||||
fn collect_preformatted_text(element: ElementRef<'_>, depth: usize) -> String {
|
||||
if depth > MAX_DOM_DEPTH {
|
||||
return element.text().collect::<String>();
|
||||
}
|
||||
let mut out = String::new();
|
||||
for child in element.children() {
|
||||
match child.value() {
|
||||
|
|
@ -370,12 +387,12 @@ fn collect_preformatted_text(element: ElementRef<'_>) -> String {
|
|||
if !out.is_empty() && !out.ends_with('\n') {
|
||||
out.push('\n');
|
||||
}
|
||||
out.push_str(&collect_preformatted_text(child_el));
|
||||
out.push_str(&collect_preformatted_text(child_el, depth + 1));
|
||||
if !out.ends_with('\n') {
|
||||
out.push('\n');
|
||||
}
|
||||
} else {
|
||||
out.push_str(&collect_preformatted_text(child_el));
|
||||
out.push_str(&collect_preformatted_text(child_el, depth + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -405,6 +422,7 @@ fn list_items(
|
|||
depth: usize,
|
||||
ordered: bool,
|
||||
exclude: &HashSet<NodeId>,
|
||||
dom_depth: usize,
|
||||
) -> String {
|
||||
let indent = " ".repeat(depth);
|
||||
let mut out = String::new();
|
||||
|
|
@ -443,6 +461,7 @@ fn list_items(
|
|||
depth + 1,
|
||||
child_tag == "ol",
|
||||
exclude,
|
||||
dom_depth + 1,
|
||||
));
|
||||
} else {
|
||||
inline_parts.push_str(&node_to_md(
|
||||
|
|
@ -451,6 +470,7 @@ fn list_items(
|
|||
assets,
|
||||
depth,
|
||||
exclude,
|
||||
dom_depth + 1,
|
||||
));
|
||||
}
|
||||
} else if let Some(text) = li_child.value().as_text() {
|
||||
|
|
@ -495,6 +515,7 @@ fn table_to_md(
|
|||
base_url: Option<&Url>,
|
||||
assets: &mut ConvertedAssets,
|
||||
exclude: &HashSet<NodeId>,
|
||||
depth: usize,
|
||||
) -> String {
|
||||
// Collect all <td>/<th> cells grouped by row, and detect layout tables
|
||||
let mut raw_rows: Vec<Vec<ElementRef<'_>>> = Vec::new();
|
||||
|
|
@ -542,7 +563,7 @@ fn table_to_md(
|
|||
for row in &raw_rows {
|
||||
for cell in row {
|
||||
let content =
|
||||
children_to_md(*cell, base_url, assets, 0, exclude);
|
||||
children_to_md(*cell, base_url, assets, 0, exclude, depth);
|
||||
let content = content.trim();
|
||||
if !content.is_empty() {
|
||||
if !out.is_empty() {
|
||||
|
|
@ -560,7 +581,7 @@ fn table_to_md(
|
|||
.iter()
|
||||
.map(|row| {
|
||||
row.iter()
|
||||
.map(|c| inline_text(*c, base_url, assets, exclude))
|
||||
.map(|c| inline_text(*c, base_url, assets, exclude, depth))
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
|
|
|||
15
crates/webclaw-core/testdata/express_test.html
vendored
Normal file
15
crates/webclaw-core/testdata/express_test.html
vendored
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue