From 3cf9dbaf2a3dca4e811dfa0818262601219d04be Mon Sep 17 00:00:00 2001 From: Valerio Date: Sat, 4 Apr 2026 15:24:17 +0200 Subject: [PATCH] chore: bump to 0.3.9, fix formatting from #14 Version bump for layout table, stack overflow, and noise filter fixes contributed by @devnen. Also fixes cargo fmt issues that caused CI lint failure on the merge commit. Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 12 +++++ Cargo.lock | 12 ++--- Cargo.toml | 2 +- crates/webclaw-core/src/extractor.rs | 32 +++++++++--- crates/webclaw-core/src/lib.rs | 14 ++++-- crates/webclaw-core/src/markdown.rs | 73 ++++++++++++++++++++++------ crates/webclaw-core/src/noise.rs | 31 +++++++++--- 7 files changed, 137 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f202bb..ce4a703 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.3.9] — 2026-04-04 + +### Fixed +- **Layout tables rendered as sections**: tables used for page layout (containing block elements like `

<p>`, `<div>`, `<hr>
`) are now rendered as standalone sections instead of pipe-delimited markdown tables. Fixes Drudge Report and similar sites where all content was flattened into a single unreadable line. (by [@devnen](https://github.com/devnen) in #14) +- **Stack overflow on deeply nested HTML**: pages with 200+ DOM nesting levels (e.g., Express.co.uk live blogs) no longer overflow the stack. Two-layer fix: depth guard in markdown.rs falls back to iterator-based text collection at depth 200, and `extract_with_options()` spawns an 8 MB worker thread for safety on Windows. (by [@devnen](https://github.com/devnen) in #14) +- **Noise filter swallowing content in malformed HTML**: `
<form>` tags no longer unconditionally treated as noise — ASP.NET page-wrapping forms (>500 chars) are preserved. Safety valve prevents unclosed noise containers (header/footer with >5000 chars) from absorbing entire page content. (by [@devnen](https://github.com/devnen) in #14) + +### Changed +- **Bold/italic block passthrough**: `<b>`/`<strong>`/`<i>`/`<em>` tags containing block-level children (e.g., Drudge wrapping columns in `<b>`) now act as transparent containers instead of collapsing everything into inline bold/italic. (by [@devnen](https://github.com/devnen) in #14) + +--- + ## [0.3.8] — 2026-04-03 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 581f03e..3407fac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3102,7 +3102,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.3.8" +version = "0.3.9" dependencies = [ "clap", "dotenvy", @@ -3122,7 +3122,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.3.8" +version = "0.3.9" dependencies = [ "ego-tree", "once_cell", @@ -3140,7 +3140,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.3.8" +version = "0.3.9" dependencies = [ "bytes", "calamine", @@ -3162,7 +3162,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.3.8" +version = "0.3.9" dependencies = [ "async-trait", "reqwest", @@ -3175,7 +3175,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.3.8" +version = "0.3.9" dependencies = [ "dirs", "dotenvy", @@ -3196,7 +3196,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.3.8" +version = "0.3.9" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index afc82dc..6496c18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.3.8" +version = "0.3.9" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-core/src/extractor.rs b/crates/webclaw-core/src/extractor.rs index 
3efe9e0..7ba7fc8 100644 --- a/crates/webclaw-core/src/extractor.rs +++ b/crates/webclaw-core/src/extractor.rs @@ -1492,7 +1492,8 @@ mod form_integration_tests { #[test] fn aspnet_form_content_extraction() { let content = "x".repeat(600); // Ensure >500 chars - let html = format!(r#" + let html = format!( + r#"
@@ -1503,12 +1504,19 @@ mod form_integration_tests {
- "#); + "# + ); let doc = Html::parse_document(&html); let opts = ExtractionOptions::default(); let result = extract_content(&doc, None, &opts); - assert!(result.markdown.contains("Section"), "h2 missing from markdown"); - assert!(result.markdown.contains("Question"), "h3 missing from markdown"); + assert!( + result.markdown.contains("Section"), + "h2 missing from markdown" + ); + assert!( + result.markdown.contains("Question"), + "h3 missing from markdown" + ); } /// Simulate unclosed header div absorbing the content div. @@ -1520,7 +1528,8 @@ mod form_integration_tests { // The header div is intentionally NOT closed — the HTML parser makes // div.content a child of div.header. The safety valve (>5000 chars) // should prevent div.header from being treated as noise. - let html = format!(r#" + let html = format!( + r#"
Logo
@@ -1529,11 +1538,18 @@ mod form_integration_tests {

{faq}

- "#); + "# + ); let doc = Html::parse_document(&html); let opts = ExtractionOptions::default(); let result = extract_content(&doc, None, &opts); - assert!(result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content"); - assert!(result.markdown.contains("First question"), "h3 missing: header swallowed content"); + assert!( + result.markdown.contains("FAQ Section"), + "h2 missing: header swallowed content" + ); + assert!( + result.markdown.contains("First question"), + "h3 missing: header swallowed content" + ); } } diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index bbd6525..80dbb5c 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -562,9 +562,14 @@ mod tests { let html = include_str!("../testdata/express_test.html"); let result = extract( html, - Some("https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks"), + Some( + "https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks", + ), + ); + assert!( + result.is_ok(), + "Should not stack overflow on Express.co.uk live blog" ); - assert!(result.is_ok(), "Should not stack overflow on Express.co.uk live blog"); let result = result.unwrap(); assert!( result.metadata.word_count > 100, @@ -588,7 +593,10 @@ mod tests { html.push_str(""); let result = extract(&html, None); - assert!(result.is_ok(), "Should not stack overflow on deeply nested HTML"); + assert!( + result.is_ok(), + "Should not stack overflow on deeply nested HTML" + ); let result = result.unwrap(); assert!( result.content.markdown.contains("Deep content"), diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index ff33b75..7908185 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -181,7 +181,10 @@ fn node_to_md( if cell_has_block_content(element) { children_to_md(element, base_url, assets, list_depth, exclude, depth) } else { - 
format!("**{}**", inline_text(element, base_url, assets, exclude, depth)) + format!( + "**{}**", + inline_text(element, base_url, assets, exclude, depth) + ) } } @@ -190,7 +193,10 @@ fn node_to_md( if cell_has_block_content(element) { children_to_md(element, base_url, assets, list_depth, exclude, depth) } else { - format!("*{}*", inline_text(element, base_url, assets, exclude, depth)) + format!( + "*{}*", + inline_text(element, base_url, assets, exclude, depth) + ) } } @@ -305,7 +311,8 @@ fn children_to_md( match child.value() { Node::Element(_) => { if let Some(child_el) = ElementRef::wrap(child) { - let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1); + let chunk = + node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1); if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) { out.push(' '); } @@ -497,8 +504,26 @@ fn list_items( /// table rather than a data table. fn cell_has_block_content(cell: ElementRef<'_>) -> bool { const BLOCK_TAGS: &[&str] = &[ - "p", "div", "ul", "ol", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "pre", - "table", "section", "article", "header", "footer", "nav", "aside", + "p", + "div", + "ul", + "ol", + "blockquote", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "pre", + "table", + "section", + "article", + "header", + "footer", + "nav", + "aside", ]; for desc in cell.descendants() { if let Some(el) = ElementRef::wrap(desc) { @@ -562,8 +587,7 @@ fn table_to_md( let mut out = String::new(); for row in &raw_rows { for cell in row { - let content = - children_to_md(*cell, base_url, assets, 0, exclude, depth); + let content = children_to_md(*cell, base_url, assets, 0, exclude, depth); let content = content.trim(); if !content.is_empty() { if !out.is_empty() { @@ -1098,11 +1122,20 @@ mod tests { "##; let (md, _, _) = convert_html(html, None); // Should NOT produce markdown table syntax - assert!(!md.contains("| "), "layout table should not use pipe 
syntax: {md}"); + assert!( + !md.contains("| "), + "layout table should not use pipe syntax: {md}" + ); // Should contain the content as separate blocks - assert!(md.contains("Column one first paragraph"), "missing content: {md}"); + assert!( + md.contains("Column one first paragraph"), + "missing content: {md}" + ); assert!(md.contains("Column two content"), "missing content: {md}"); - assert!(md.contains("Column two after rule"), "missing content: {md}"); + assert!( + md.contains("Column two after rule"), + "missing content: {md}" + ); } #[test] @@ -1121,10 +1154,22 @@ mod tests { "##; let (md, _, _) = convert_html(html, None); - assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}"); - assert!(md.contains("[Headline One](https://example.com/1)"), "missing link: {md}"); - assert!(md.contains("[Headline Two](https://example.com/2)"), "missing link: {md}"); - assert!(md.contains("[Headline Three](https://example.com/3)"), "missing link: {md}"); + assert!( + !md.contains("| "), + "layout table should not use pipe syntax: {md}" + ); + assert!( + md.contains("[Headline One](https://example.com/1)"), + "missing link: {md}" + ); + assert!( + md.contains("[Headline Two](https://example.com/2)"), + "missing link: {md}" + ); + assert!( + md.contains("[Headline Three](https://example.com/3)"), + "missing link: {md}" + ); } #[test] diff --git a/crates/webclaw-core/src/noise.rs b/crates/webclaw-core/src/noise.rs index c9c9caf..53b32ed 100644 --- a/crates/webclaw-core/src/noise.rs +++ b/crates/webclaw-core/src/noise.rs @@ -8,7 +8,8 @@ use scraper::ElementRef; const NOISE_TAGS: &[&str] = &[ "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video", - "audio", "canvas", + "audio", + "canvas", // NOTE:
<form> removed from this list — ASP.NET and similar frameworks wrap the // entire page body in a single tag that contains all real content. // Forms are now handled with a heuristic in is_noise() that distinguishes @@ -205,8 +206,12 @@ pub fn is_noise(el: ElementRef<'_>) -> bool { // Also check noise classes/IDs — a big form with class="login-form" is still noise if let Some(class) = el.value().attr("class") { let cl = class.to_lowercase(); - if cl.contains("login") || cl.contains("search") || cl.contains("subscribe") - || cl.contains("signup") || cl.contains("newsletter") || cl.contains("contact") + if cl.contains("login") + || cl.contains("search") + || cl.contains("subscribe") + || cl.contains("signup") + || cl.contains("newsletter") + || cl.contains("contact") { return true; } @@ -809,11 +814,20 @@ mod form_tests { fn aspnet_page_wrapping_form_is_not_noise() { let html = r#"

Support

Question one?

Long answer text that should definitely be captured by the extraction engine. This is real content with multiple sentences to ensure it passes any text length thresholds in the scoring algorithm. We need at least five hundred characters of actual text content here to exceed the threshold. Adding more sentences about various topics including data formats, historical prices, stock market analysis, technical indicators, and trading strategies. This paragraph discusses how intraday data can be used for backtesting quantitative models and developing automated trading systems.

Question two?

Another substantial answer paragraph with detailed information about the product features and capabilities.

"#; let doc = Html::parse_document(html); - let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap(); + let form = doc + .select(&scraper::Selector::parse("form").unwrap()) + .next() + .unwrap(); let text = form.text().collect::(); let text_len = text.len(); - assert!(text_len >= 500, "Form text should be >= 500 chars, got {text_len}"); - assert!(!is_noise(form), "ASP.NET page-wrapping form should NOT be noise"); + assert!( + text_len >= 500, + "Form text should be >= 500 chars, got {text_len}" + ); + assert!( + !is_noise(form), + "ASP.NET page-wrapping form should NOT be noise" + ); } #[test] @@ -828,7 +842,10 @@ mod form_tests { "#; let doc = Html::parse_document(html); - let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap(); + let form = doc + .select(&scraper::Selector::parse("form").unwrap()) + .next() + .unwrap(); assert!(is_noise(form), "Small login form SHOULD be noise"); } }