chore: bump to 0.3.9, fix formatting from #14

Version bump for layout table, stack overflow, and noise filter fixes
contributed by @devnen. Also fixes cargo fmt issues that caused CI lint
failure on the merge commit.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-04-04 15:24:17 +02:00
parent 87ecf4241f
commit 3cf9dbaf2a
7 changed files with 137 additions and 39 deletions

View file

@ -3,6 +3,18 @@
All notable changes to webclaw are documented here. All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/). Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.3.9] — 2026-04-04
### Fixed
- **Layout tables rendered as sections**: tables used for page layout (containing block elements like `<p>`, `<div>`, `<hr>`) are now rendered as standalone sections instead of pipe-delimited markdown tables. Fixes Drudge Report and similar sites where all content was flattened into a single unreadable line. (by [@devnen](https://github.com/devnen) in #14)
- **Stack overflow on deeply nested HTML**: pages with 200+ DOM nesting levels (e.g., Express.co.uk live blogs) no longer overflow the stack. Two-layer fix: a depth guard in `markdown.rs` falls back to iterator-based text collection at depth 200, and `extract_with_options()` spawns a worker thread with an 8 MB stack for safety on Windows. (by [@devnen](https://github.com/devnen) in #14)
- **Noise filter swallowing content in malformed HTML**: `<form>` tags are no longer unconditionally treated as noise — ASP.NET page-wrapping forms (>500 chars) are preserved. A safety valve prevents unclosed noise containers (header/footer with >5000 chars) from absorbing the entire page content. (by [@devnen](https://github.com/devnen) in #14)
### Changed
- **Bold/italic block passthrough**: `<b>`/`<strong>`/`<em>`/`<i>` tags containing block-level children (e.g., Drudge wrapping columns in `<b>`) now act as transparent containers instead of collapsing everything into inline bold/italic. (by [@devnen](https://github.com/devnen) in #14)
---
## [0.3.8] — 2026-04-03 ## [0.3.8] — 2026-04-03
### Fixed ### Fixed

12
Cargo.lock generated
View file

@ -3102,7 +3102,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-cli" name = "webclaw-cli"
version = "0.3.8" version = "0.3.9"
dependencies = [ dependencies = [
"clap", "clap",
"dotenvy", "dotenvy",
@ -3122,7 +3122,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-core" name = "webclaw-core"
version = "0.3.8" version = "0.3.9"
dependencies = [ dependencies = [
"ego-tree", "ego-tree",
"once_cell", "once_cell",
@ -3140,7 +3140,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-fetch" name = "webclaw-fetch"
version = "0.3.8" version = "0.3.9"
dependencies = [ dependencies = [
"bytes", "bytes",
"calamine", "calamine",
@ -3162,7 +3162,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-llm" name = "webclaw-llm"
version = "0.3.8" version = "0.3.9"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"reqwest", "reqwest",
@ -3175,7 +3175,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-mcp" name = "webclaw-mcp"
version = "0.3.8" version = "0.3.9"
dependencies = [ dependencies = [
"dirs", "dirs",
"dotenvy", "dotenvy",
@ -3196,7 +3196,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-pdf" name = "webclaw-pdf"
version = "0.3.8" version = "0.3.9"
dependencies = [ dependencies = [
"pdf-extract", "pdf-extract",
"thiserror", "thiserror",

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"] members = ["crates/*"]
[workspace.package] [workspace.package]
version = "0.3.8" version = "0.3.9"
edition = "2024" edition = "2024"
license = "AGPL-3.0" license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw" repository = "https://github.com/0xMassi/webclaw"

View file

@ -1492,7 +1492,8 @@ mod form_integration_tests {
#[test] #[test]
fn aspnet_form_content_extraction() { fn aspnet_form_content_extraction() {
let content = "x".repeat(600); // Ensure >500 chars let content = "x".repeat(600); // Ensure >500 chars
let html = format!(r#"<html><body> let html = format!(
r#"<html><body>
<form method="post" action="./page.aspx" id="form1"> <form method="post" action="./page.aspx" id="form1">
<div class="wrapper"> <div class="wrapper">
<div class="header"><a href="/">Logo</a></div> <div class="header"><a href="/">Logo</a></div>
@ -1503,12 +1504,19 @@ mod form_integration_tests {
</div> </div>
</div> </div>
</form> </form>
</body></html>"#); </body></html>"#
);
let doc = Html::parse_document(&html); let doc = Html::parse_document(&html);
let opts = ExtractionOptions::default(); let opts = ExtractionOptions::default();
let result = extract_content(&doc, None, &opts); let result = extract_content(&doc, None, &opts);
assert!(result.markdown.contains("Section"), "h2 missing from markdown"); assert!(
assert!(result.markdown.contains("Question"), "h3 missing from markdown"); result.markdown.contains("Section"),
"h2 missing from markdown"
);
assert!(
result.markdown.contains("Question"),
"h3 missing from markdown"
);
} }
/// Simulate unclosed header div absorbing the content div. /// Simulate unclosed header div absorbing the content div.
@ -1520,7 +1528,8 @@ mod form_integration_tests {
// The header div is intentionally NOT closed — the HTML parser makes // The header div is intentionally NOT closed — the HTML parser makes
// div.content a child of div.header. The safety valve (>5000 chars) // div.content a child of div.header. The safety valve (>5000 chars)
// should prevent div.header from being treated as noise. // should prevent div.header from being treated as noise.
let html = format!(r#"<html><body> let html = format!(
r#"<html><body>
<div class="wrapper"> <div class="wrapper">
<div class="header"><a href="/">Logo</a> <div class="header"><a href="/">Logo</a>
<div class="content"> <div class="content">
@ -1529,11 +1538,18 @@ mod form_integration_tests {
<p>{faq}</p> <p>{faq}</p>
</div> </div>
</div> </div>
</body></html>"#); </body></html>"#
);
let doc = Html::parse_document(&html); let doc = Html::parse_document(&html);
let opts = ExtractionOptions::default(); let opts = ExtractionOptions::default();
let result = extract_content(&doc, None, &opts); let result = extract_content(&doc, None, &opts);
assert!(result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content"); assert!(
assert!(result.markdown.contains("First question"), "h3 missing: header swallowed content"); result.markdown.contains("FAQ Section"),
"h2 missing: header swallowed content"
);
assert!(
result.markdown.contains("First question"),
"h3 missing: header swallowed content"
);
} }
} }

View file

@ -562,9 +562,14 @@ mod tests {
let html = include_str!("../testdata/express_test.html"); let html = include_str!("../testdata/express_test.html");
let result = extract( let result = extract(
html, html,
Some("https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks"), Some(
"https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks",
),
);
assert!(
result.is_ok(),
"Should not stack overflow on Express.co.uk live blog"
); );
assert!(result.is_ok(), "Should not stack overflow on Express.co.uk live blog");
let result = result.unwrap(); let result = result.unwrap();
assert!( assert!(
result.metadata.word_count > 100, result.metadata.word_count > 100,
@ -588,7 +593,10 @@ mod tests {
html.push_str("</body></html>"); html.push_str("</body></html>");
let result = extract(&html, None); let result = extract(&html, None);
assert!(result.is_ok(), "Should not stack overflow on deeply nested HTML"); assert!(
result.is_ok(),
"Should not stack overflow on deeply nested HTML"
);
let result = result.unwrap(); let result = result.unwrap();
assert!( assert!(
result.content.markdown.contains("Deep content"), result.content.markdown.contains("Deep content"),

View file

@ -181,7 +181,10 @@ fn node_to_md(
if cell_has_block_content(element) { if cell_has_block_content(element) {
children_to_md(element, base_url, assets, list_depth, exclude, depth) children_to_md(element, base_url, assets, list_depth, exclude, depth)
} else { } else {
format!("**{}**", inline_text(element, base_url, assets, exclude, depth)) format!(
"**{}**",
inline_text(element, base_url, assets, exclude, depth)
)
} }
} }
@ -190,7 +193,10 @@ fn node_to_md(
if cell_has_block_content(element) { if cell_has_block_content(element) {
children_to_md(element, base_url, assets, list_depth, exclude, depth) children_to_md(element, base_url, assets, list_depth, exclude, depth)
} else { } else {
format!("*{}*", inline_text(element, base_url, assets, exclude, depth)) format!(
"*{}*",
inline_text(element, base_url, assets, exclude, depth)
)
} }
} }
@ -305,7 +311,8 @@ fn children_to_md(
match child.value() { match child.value() {
Node::Element(_) => { Node::Element(_) => {
if let Some(child_el) = ElementRef::wrap(child) { if let Some(child_el) = ElementRef::wrap(child) {
let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1); let chunk =
node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1);
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) { if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
out.push(' '); out.push(' ');
} }
@ -497,8 +504,26 @@ fn list_items(
/// table rather than a data table. /// table rather than a data table.
fn cell_has_block_content(cell: ElementRef<'_>) -> bool { fn cell_has_block_content(cell: ElementRef<'_>) -> bool {
const BLOCK_TAGS: &[&str] = &[ const BLOCK_TAGS: &[&str] = &[
"p", "div", "ul", "ol", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "pre", "p",
"table", "section", "article", "header", "footer", "nav", "aside", "div",
"ul",
"ol",
"blockquote",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"hr",
"pre",
"table",
"section",
"article",
"header",
"footer",
"nav",
"aside",
]; ];
for desc in cell.descendants() { for desc in cell.descendants() {
if let Some(el) = ElementRef::wrap(desc) { if let Some(el) = ElementRef::wrap(desc) {
@ -562,8 +587,7 @@ fn table_to_md(
let mut out = String::new(); let mut out = String::new();
for row in &raw_rows { for row in &raw_rows {
for cell in row { for cell in row {
let content = let content = children_to_md(*cell, base_url, assets, 0, exclude, depth);
children_to_md(*cell, base_url, assets, 0, exclude, depth);
let content = content.trim(); let content = content.trim();
if !content.is_empty() { if !content.is_empty() {
if !out.is_empty() { if !out.is_empty() {
@ -1098,11 +1122,20 @@ mod tests {
</table>"##; </table>"##;
let (md, _, _) = convert_html(html, None); let (md, _, _) = convert_html(html, None);
// Should NOT produce markdown table syntax // Should NOT produce markdown table syntax
assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}"); assert!(
!md.contains("| "),
"layout table should not use pipe syntax: {md}"
);
// Should contain the content as separate blocks // Should contain the content as separate blocks
assert!(md.contains("Column one first paragraph"), "missing content: {md}"); assert!(
md.contains("Column one first paragraph"),
"missing content: {md}"
);
assert!(md.contains("Column two content"), "missing content: {md}"); assert!(md.contains("Column two content"), "missing content: {md}");
assert!(md.contains("Column two after rule"), "missing content: {md}"); assert!(
md.contains("Column two after rule"),
"missing content: {md}"
);
} }
#[test] #[test]
@ -1121,10 +1154,22 @@ mod tests {
</tr> </tr>
</table>"##; </table>"##;
let (md, _, _) = convert_html(html, None); let (md, _, _) = convert_html(html, None);
assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}"); assert!(
assert!(md.contains("[Headline One](https://example.com/1)"), "missing link: {md}"); !md.contains("| "),
assert!(md.contains("[Headline Two](https://example.com/2)"), "missing link: {md}"); "layout table should not use pipe syntax: {md}"
assert!(md.contains("[Headline Three](https://example.com/3)"), "missing link: {md}"); );
assert!(
md.contains("[Headline One](https://example.com/1)"),
"missing link: {md}"
);
assert!(
md.contains("[Headline Two](https://example.com/2)"),
"missing link: {md}"
);
assert!(
md.contains("[Headline Three](https://example.com/3)"),
"missing link: {md}"
);
} }
#[test] #[test]

View file

@ -8,7 +8,8 @@ use scraper::ElementRef;
const NOISE_TAGS: &[&str] = &[ const NOISE_TAGS: &[&str] = &[
"script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video", "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video",
"audio", "canvas", "audio",
"canvas",
// NOTE: <form> removed from this list — ASP.NET and similar frameworks wrap the // NOTE: <form> removed from this list — ASP.NET and similar frameworks wrap the
// entire page body in a single <form> tag that contains all real content. // entire page body in a single <form> tag that contains all real content.
// Forms are now handled with a heuristic in is_noise() that distinguishes // Forms are now handled with a heuristic in is_noise() that distinguishes
@ -205,8 +206,12 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
// Also check noise classes/IDs — a big form with class="login-form" is still noise // Also check noise classes/IDs — a big form with class="login-form" is still noise
if let Some(class) = el.value().attr("class") { if let Some(class) = el.value().attr("class") {
let cl = class.to_lowercase(); let cl = class.to_lowercase();
if cl.contains("login") || cl.contains("search") || cl.contains("subscribe") if cl.contains("login")
|| cl.contains("signup") || cl.contains("newsletter") || cl.contains("contact") || cl.contains("search")
|| cl.contains("subscribe")
|| cl.contains("signup")
|| cl.contains("newsletter")
|| cl.contains("contact")
{ {
return true; return true;
} }
@ -809,11 +814,20 @@ mod form_tests {
fn aspnet_page_wrapping_form_is_not_noise() { fn aspnet_page_wrapping_form_is_not_noise() {
let html = r#"<html><body><form method="post" action="./page.aspx" id="form1"><div class="wrapper"><div class="content"><h1>Support</h1><h3>Question one?</h3><p>Long answer text that should definitely be captured by the extraction engine. This is real content with multiple sentences to ensure it passes any text length thresholds in the scoring algorithm. We need at least five hundred characters of actual text content here to exceed the threshold. Adding more sentences about various topics including data formats, historical prices, stock market analysis, technical indicators, and trading strategies. This paragraph discusses how intraday data can be used for backtesting quantitative models and developing automated trading systems.</p><h3>Question two?</h3><p>Another substantial answer paragraph with detailed information about the product features and capabilities.</p></div></div></form></body></html>"#; let html = r#"<html><body><form method="post" action="./page.aspx" id="form1"><div class="wrapper"><div class="content"><h1>Support</h1><h3>Question one?</h3><p>Long answer text that should definitely be captured by the extraction engine. This is real content with multiple sentences to ensure it passes any text length thresholds in the scoring algorithm. We need at least five hundred characters of actual text content here to exceed the threshold. Adding more sentences about various topics including data formats, historical prices, stock market analysis, technical indicators, and trading strategies. This paragraph discusses how intraday data can be used for backtesting quantitative models and developing automated trading systems.</p><h3>Question two?</h3><p>Another substantial answer paragraph with detailed information about the product features and capabilities.</p></div></div></form></body></html>"#;
let doc = Html::parse_document(html); let doc = Html::parse_document(html);
let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap(); let form = doc
.select(&scraper::Selector::parse("form").unwrap())
.next()
.unwrap();
let text = form.text().collect::<String>(); let text = form.text().collect::<String>();
let text_len = text.len(); let text_len = text.len();
assert!(text_len >= 500, "Form text should be >= 500 chars, got {text_len}"); assert!(
assert!(!is_noise(form), "ASP.NET page-wrapping form should NOT be noise"); text_len >= 500,
"Form text should be >= 500 chars, got {text_len}"
);
assert!(
!is_noise(form),
"ASP.NET page-wrapping form should NOT be noise"
);
} }
#[test] #[test]
@ -828,7 +842,10 @@ mod form_tests {
</body></html> </body></html>
"#; "#;
let doc = Html::parse_document(html); let doc = Html::parse_document(html);
let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap(); let form = doc
.select(&scraper::Selector::parse("form").unwrap())
.next()
.unwrap();
assert!(is_noise(form), "Small login form SHOULD be noise"); assert!(is_noise(form), "Small login form SHOULD be noise");
} }
} }