diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f202bb..ce4a703 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,18 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
+## [0.3.9] — 2026-04-04
+
+### Fixed
+- **Layout tables rendered as sections**: tables used for page layout (i.e., tables whose cells contain block-level elements) are now rendered as standalone sections instead of pipe-delimited markdown tables. Fixes Drudge Report and similar sites where all content was flattened into a single unreadable line. (by [@devnen](https://github.com/devnen) in #14)
+- **Stack overflow on deeply nested HTML**: pages with 200+ DOM nesting levels (e.g., Express.co.uk live blogs) no longer overflow the stack. Two-layer fix: depth guard in markdown.rs falls back to iterator-based text collection at depth 200, and `extract_with_options()` spawns an 8 MB worker thread for safety on Windows. (by [@devnen](https://github.com/devnen) in #14)
+- **Noise filter swallowing content in malformed HTML**: `
- "#);
+ "#
+ );
let doc = Html::parse_document(&html);
let opts = ExtractionOptions::default();
let result = extract_content(&doc, None, &opts);
- assert!(result.markdown.contains("Section"), "h2 missing from markdown");
- assert!(result.markdown.contains("Question"), "h3 missing from markdown");
+ assert!(
+ result.markdown.contains("Section"),
+ "h2 missing from markdown"
+ );
+ assert!(
+ result.markdown.contains("Question"),
+ "h3 missing from markdown"
+ );
}
/// Simulate unclosed header div absorbing the content div.
@@ -1520,7 +1528,8 @@ mod form_integration_tests {
// The header div is intentionally NOT closed — the HTML parser makes
// div.content a child of div.header. The safety valve (>5000 chars)
// should prevent div.header from being treated as noise.
- let html = format!(r#"
+ let html = format!(
+ r#"
- "#);
+ "#
+ );
let doc = Html::parse_document(&html);
let opts = ExtractionOptions::default();
let result = extract_content(&doc, None, &opts);
- assert!(result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content");
- assert!(result.markdown.contains("First question"), "h3 missing: header swallowed content");
+ assert!(
+ result.markdown.contains("FAQ Section"),
+ "h2 missing: header swallowed content"
+ );
+ assert!(
+ result.markdown.contains("First question"),
+ "h3 missing: header swallowed content"
+ );
}
}
diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs
index bbd6525..80dbb5c 100644
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@@ -562,9 +562,14 @@ mod tests {
let html = include_str!("../testdata/express_test.html");
let result = extract(
html,
- Some("https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks"),
+ Some(
+ "https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks",
+ ),
+ );
+ assert!(
+ result.is_ok(),
+ "Should not stack overflow on Express.co.uk live blog"
);
- assert!(result.is_ok(), "Should not stack overflow on Express.co.uk live blog");
let result = result.unwrap();
assert!(
result.metadata.word_count > 100,
@@ -588,7 +593,10 @@ mod tests {
html.push_str("