mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
chore: bump to 0.3.9, fix formatting from #14
Version bump for layout table, stack overflow, and noise filter fixes contributed by @devnen. Also fixes cargo fmt issues that caused CI lint failure on the merge commit.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
87ecf4241f
commit
3cf9dbaf2a
7 changed files with 137 additions and 39 deletions
12
CHANGELOG.md
12
CHANGELOG.md
|
|
@ -3,6 +3,18 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.3.9] — 2026-04-04
|
||||
|
||||
### Fixed
|
||||
- **Layout tables rendered as sections**: tables used for page layout (containing block elements like `<p>`, `<div>`, `<hr>`) are now rendered as standalone sections instead of pipe-delimited markdown tables. Fixes Drudge Report and similar sites where all content was flattened into a single unreadable line. (by [@devnen](https://github.com/devnen) in #14)
|
||||
- **Stack overflow on deeply nested HTML**: pages with 200+ DOM nesting levels (e.g., Express.co.uk live blogs) no longer overflow the stack. Two-layer fix: depth guard in markdown.rs falls back to iterator-based text collection at depth 200, and `extract_with_options()` spawns an 8 MB worker thread for safety on Windows. (by [@devnen](https://github.com/devnen) in #14)
|
||||
- **Noise filter swallowing content in malformed HTML**: `<form>` tags are no longer unconditionally treated as noise — ASP.NET page-wrapping forms (>500 chars) are preserved. A safety valve prevents unclosed noise containers (header/footer with >5000 chars) from absorbing entire page content. (by [@devnen](https://github.com/devnen) in #14)
|
||||
|
||||
### Changed
|
||||
- **Bold/italic block passthrough**: `<b>`/`<strong>`/`<em>`/`<i>` tags containing block-level children (e.g., Drudge wrapping columns in `<b>`) now act as transparent containers instead of collapsing everything into inline bold/italic. (by [@devnen](https://github.com/devnen) in #14)
|
||||
|
||||
---
|
||||
|
||||
## [0.3.8] — 2026-04-03
|
||||
|
||||
### Fixed
|
||||
|
|
|
|||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3102,7 +3102,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.3.8"
|
||||
version = "0.3.9"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3122,7 +3122,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.3.8"
|
||||
version = "0.3.9"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3140,7 +3140,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.3.8"
|
||||
version = "0.3.9"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"calamine",
|
||||
|
|
@ -3162,7 +3162,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.3.8"
|
||||
version = "0.3.9"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3175,7 +3175,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.3.8"
|
||||
version = "0.3.9"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3196,7 +3196,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.3.8"
|
||||
version = "0.3.9"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.3.8"
|
||||
version = "0.3.9"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -1492,7 +1492,8 @@ mod form_integration_tests {
|
|||
#[test]
|
||||
fn aspnet_form_content_extraction() {
|
||||
let content = "x".repeat(600); // Ensure >500 chars
|
||||
let html = format!(r#"<html><body>
|
||||
let html = format!(
|
||||
r#"<html><body>
|
||||
<form method="post" action="./page.aspx" id="form1">
|
||||
<div class="wrapper">
|
||||
<div class="header"><a href="/">Logo</a></div>
|
||||
|
|
@ -1503,12 +1504,19 @@ mod form_integration_tests {
|
|||
</div>
|
||||
</div>
|
||||
</form>
|
||||
</body></html>"#);
|
||||
</body></html>"#
|
||||
);
|
||||
let doc = Html::parse_document(&html);
|
||||
let opts = ExtractionOptions::default();
|
||||
let result = extract_content(&doc, None, &opts);
|
||||
assert!(result.markdown.contains("Section"), "h2 missing from markdown");
|
||||
assert!(result.markdown.contains("Question"), "h3 missing from markdown");
|
||||
assert!(
|
||||
result.markdown.contains("Section"),
|
||||
"h2 missing from markdown"
|
||||
);
|
||||
assert!(
|
||||
result.markdown.contains("Question"),
|
||||
"h3 missing from markdown"
|
||||
);
|
||||
}
|
||||
|
||||
/// Simulate unclosed header div absorbing the content div.
|
||||
|
|
@ -1520,7 +1528,8 @@ mod form_integration_tests {
|
|||
// The header div is intentionally NOT closed — the HTML parser makes
|
||||
// div.content a child of div.header. The safety valve (>5000 chars)
|
||||
// should prevent div.header from being treated as noise.
|
||||
let html = format!(r#"<html><body>
|
||||
let html = format!(
|
||||
r#"<html><body>
|
||||
<div class="wrapper">
|
||||
<div class="header"><a href="/">Logo</a>
|
||||
<div class="content">
|
||||
|
|
@ -1529,11 +1538,18 @@ mod form_integration_tests {
|
|||
<p>{faq}</p>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>"#);
|
||||
</body></html>"#
|
||||
);
|
||||
let doc = Html::parse_document(&html);
|
||||
let opts = ExtractionOptions::default();
|
||||
let result = extract_content(&doc, None, &opts);
|
||||
assert!(result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content");
|
||||
assert!(result.markdown.contains("First question"), "h3 missing: header swallowed content");
|
||||
assert!(
|
||||
result.markdown.contains("FAQ Section"),
|
||||
"h2 missing: header swallowed content"
|
||||
);
|
||||
assert!(
|
||||
result.markdown.contains("First question"),
|
||||
"h3 missing: header swallowed content"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -562,9 +562,14 @@ mod tests {
|
|||
let html = include_str!("../testdata/express_test.html");
|
||||
let result = extract(
|
||||
html,
|
||||
Some("https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks"),
|
||||
Some(
|
||||
"https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks",
|
||||
),
|
||||
);
|
||||
assert!(
|
||||
result.is_ok(),
|
||||
"Should not stack overflow on Express.co.uk live blog"
|
||||
);
|
||||
assert!(result.is_ok(), "Should not stack overflow on Express.co.uk live blog");
|
||||
let result = result.unwrap();
|
||||
assert!(
|
||||
result.metadata.word_count > 100,
|
||||
|
|
@ -588,7 +593,10 @@ mod tests {
|
|||
html.push_str("</body></html>");
|
||||
|
||||
let result = extract(&html, None);
|
||||
assert!(result.is_ok(), "Should not stack overflow on deeply nested HTML");
|
||||
assert!(
|
||||
result.is_ok(),
|
||||
"Should not stack overflow on deeply nested HTML"
|
||||
);
|
||||
let result = result.unwrap();
|
||||
assert!(
|
||||
result.content.markdown.contains("Deep content"),
|
||||
|
|
|
|||
|
|
@ -181,7 +181,10 @@ fn node_to_md(
|
|||
if cell_has_block_content(element) {
|
||||
children_to_md(element, base_url, assets, list_depth, exclude, depth)
|
||||
} else {
|
||||
format!("**{}**", inline_text(element, base_url, assets, exclude, depth))
|
||||
format!(
|
||||
"**{}**",
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -190,7 +193,10 @@ fn node_to_md(
|
|||
if cell_has_block_content(element) {
|
||||
children_to_md(element, base_url, assets, list_depth, exclude, depth)
|
||||
} else {
|
||||
format!("*{}*", inline_text(element, base_url, assets, exclude, depth))
|
||||
format!(
|
||||
"*{}*",
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -305,7 +311,8 @@ fn children_to_md(
|
|||
match child.value() {
|
||||
Node::Element(_) => {
|
||||
if let Some(child_el) = ElementRef::wrap(child) {
|
||||
let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1);
|
||||
let chunk =
|
||||
node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1);
|
||||
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
|
||||
out.push(' ');
|
||||
}
|
||||
|
|
@ -497,8 +504,26 @@ fn list_items(
|
|||
/// table rather than a data table.
|
||||
fn cell_has_block_content(cell: ElementRef<'_>) -> bool {
|
||||
const BLOCK_TAGS: &[&str] = &[
|
||||
"p", "div", "ul", "ol", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "pre",
|
||||
"table", "section", "article", "header", "footer", "nav", "aside",
|
||||
"p",
|
||||
"div",
|
||||
"ul",
|
||||
"ol",
|
||||
"blockquote",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"hr",
|
||||
"pre",
|
||||
"table",
|
||||
"section",
|
||||
"article",
|
||||
"header",
|
||||
"footer",
|
||||
"nav",
|
||||
"aside",
|
||||
];
|
||||
for desc in cell.descendants() {
|
||||
if let Some(el) = ElementRef::wrap(desc) {
|
||||
|
|
@ -562,8 +587,7 @@ fn table_to_md(
|
|||
let mut out = String::new();
|
||||
for row in &raw_rows {
|
||||
for cell in row {
|
||||
let content =
|
||||
children_to_md(*cell, base_url, assets, 0, exclude, depth);
|
||||
let content = children_to_md(*cell, base_url, assets, 0, exclude, depth);
|
||||
let content = content.trim();
|
||||
if !content.is_empty() {
|
||||
if !out.is_empty() {
|
||||
|
|
@ -1098,11 +1122,20 @@ mod tests {
|
|||
</table>"##;
|
||||
let (md, _, _) = convert_html(html, None);
|
||||
// Should NOT produce markdown table syntax
|
||||
assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}");
|
||||
assert!(
|
||||
!md.contains("| "),
|
||||
"layout table should not use pipe syntax: {md}"
|
||||
);
|
||||
// Should contain the content as separate blocks
|
||||
assert!(md.contains("Column one first paragraph"), "missing content: {md}");
|
||||
assert!(
|
||||
md.contains("Column one first paragraph"),
|
||||
"missing content: {md}"
|
||||
);
|
||||
assert!(md.contains("Column two content"), "missing content: {md}");
|
||||
assert!(md.contains("Column two after rule"), "missing content: {md}");
|
||||
assert!(
|
||||
md.contains("Column two after rule"),
|
||||
"missing content: {md}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1121,10 +1154,22 @@ mod tests {
|
|||
</tr>
|
||||
</table>"##;
|
||||
let (md, _, _) = convert_html(html, None);
|
||||
assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}");
|
||||
assert!(md.contains("[Headline One](https://example.com/1)"), "missing link: {md}");
|
||||
assert!(md.contains("[Headline Two](https://example.com/2)"), "missing link: {md}");
|
||||
assert!(md.contains("[Headline Three](https://example.com/3)"), "missing link: {md}");
|
||||
assert!(
|
||||
!md.contains("| "),
|
||||
"layout table should not use pipe syntax: {md}"
|
||||
);
|
||||
assert!(
|
||||
md.contains("[Headline One](https://example.com/1)"),
|
||||
"missing link: {md}"
|
||||
);
|
||||
assert!(
|
||||
md.contains("[Headline Two](https://example.com/2)"),
|
||||
"missing link: {md}"
|
||||
);
|
||||
assert!(
|
||||
md.contains("[Headline Three](https://example.com/3)"),
|
||||
"missing link: {md}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -8,7 +8,8 @@ use scraper::ElementRef;
|
|||
|
||||
const NOISE_TAGS: &[&str] = &[
|
||||
"script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video",
|
||||
"audio", "canvas",
|
||||
"audio",
|
||||
"canvas",
|
||||
// NOTE: <form> removed from this list — ASP.NET and similar frameworks wrap the
|
||||
// entire page body in a single <form> tag that contains all real content.
|
||||
// Forms are now handled with a heuristic in is_noise() that distinguishes
|
||||
|
|
@ -205,8 +206,12 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
|
|||
// Also check noise classes/IDs — a big form with class="login-form" is still noise
|
||||
if let Some(class) = el.value().attr("class") {
|
||||
let cl = class.to_lowercase();
|
||||
if cl.contains("login") || cl.contains("search") || cl.contains("subscribe")
|
||||
|| cl.contains("signup") || cl.contains("newsletter") || cl.contains("contact")
|
||||
if cl.contains("login")
|
||||
|| cl.contains("search")
|
||||
|| cl.contains("subscribe")
|
||||
|| cl.contains("signup")
|
||||
|| cl.contains("newsletter")
|
||||
|| cl.contains("contact")
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
|
@ -809,11 +814,20 @@ mod form_tests {
|
|||
fn aspnet_page_wrapping_form_is_not_noise() {
|
||||
let html = r#"<html><body><form method="post" action="./page.aspx" id="form1"><div class="wrapper"><div class="content"><h1>Support</h1><h3>Question one?</h3><p>Long answer text that should definitely be captured by the extraction engine. This is real content with multiple sentences to ensure it passes any text length thresholds in the scoring algorithm. We need at least five hundred characters of actual text content here to exceed the threshold. Adding more sentences about various topics including data formats, historical prices, stock market analysis, technical indicators, and trading strategies. This paragraph discusses how intraday data can be used for backtesting quantitative models and developing automated trading systems.</p><h3>Question two?</h3><p>Another substantial answer paragraph with detailed information about the product features and capabilities.</p></div></div></form></body></html>"#;
|
||||
let doc = Html::parse_document(html);
|
||||
let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap();
|
||||
let form = doc
|
||||
.select(&scraper::Selector::parse("form").unwrap())
|
||||
.next()
|
||||
.unwrap();
|
||||
let text = form.text().collect::<String>();
|
||||
let text_len = text.len();
|
||||
assert!(text_len >= 500, "Form text should be >= 500 chars, got {text_len}");
|
||||
assert!(!is_noise(form), "ASP.NET page-wrapping form should NOT be noise");
|
||||
assert!(
|
||||
text_len >= 500,
|
||||
"Form text should be >= 500 chars, got {text_len}"
|
||||
);
|
||||
assert!(
|
||||
!is_noise(form),
|
||||
"ASP.NET page-wrapping form should NOT be noise"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -828,7 +842,10 @@ mod form_tests {
|
|||
</body></html>
|
||||
"#;
|
||||
let doc = Html::parse_document(html);
|
||||
let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap();
|
||||
let form = doc
|
||||
.select(&scraper::Selector::parse("form").unwrap())
|
||||
.next()
|
||||
.unwrap();
|
||||
assert!(is_noise(form), "Small login form SHOULD be noise");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue