mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix: layout tables, stack overflow, and noise filter (#14)
fix: layout tables rendered as sections instead of markdown tables
This commit is contained in:
commit
87ecf4241f
5 changed files with 382 additions and 39 deletions
|
|
@ -1484,3 +1484,56 @@ mod tests {
|
|||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod form_integration_tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn aspnet_form_content_extraction() {
|
||||
let content = "x".repeat(600); // Ensure >500 chars
|
||||
let html = format!(r#"<html><body>
|
||||
<form method="post" action="./page.aspx" id="form1">
|
||||
<div class="wrapper">
|
||||
<div class="header"><a href="/">Logo</a></div>
|
||||
<div class="content">
|
||||
<h2>Section</h2>
|
||||
<h3>Question?</h3>
|
||||
<p>{content}</p>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
</body></html>"#);
|
||||
let doc = Html::parse_document(&html);
|
||||
let opts = ExtractionOptions::default();
|
||||
let result = extract_content(&doc, None, &opts);
|
||||
assert!(result.markdown.contains("Section"), "h2 missing from markdown");
|
||||
assert!(result.markdown.contains("Question"), "h3 missing from markdown");
|
||||
}
|
||||
|
||||
/// Simulate unclosed header div absorbing the content div.
|
||||
/// The header's noise class should NOT propagate to the absorbed content
|
||||
/// because the safety valve detects the header has >5000 chars (broken wrapper).
|
||||
#[test]
|
||||
fn unclosed_header_div_does_not_swallow_content() {
|
||||
let faq = "Lorem ipsum dolor sit amet. ".repeat(300); // ~8400 chars
|
||||
// The header div is intentionally NOT closed — the HTML parser makes
|
||||
// div.content a child of div.header. The safety valve (>5000 chars)
|
||||
// should prevent div.header from being treated as noise.
|
||||
let html = format!(r#"<html><body>
|
||||
<div class="wrapper">
|
||||
<div class="header"><a href="/">Logo</a>
|
||||
<div class="content">
|
||||
<h2>FAQ Section</h2>
|
||||
<h3>First question?</h3>
|
||||
<p>{faq}</p>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>"#);
|
||||
let doc = Html::parse_document(&html);
|
||||
let opts = ExtractionOptions::default();
|
||||
let result = extract_content(&doc, None, &opts);
|
||||
assert!(result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content");
|
||||
assert!(result.markdown.contains("First question"), "h3 missing: header swallowed content");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,10 +45,35 @@ pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, Extrac
|
|||
/// `html` — raw HTML string to parse
|
||||
/// `url` — optional source URL, used for resolving relative links and domain detection
|
||||
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
|
||||
///
|
||||
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
|
||||
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
|
||||
/// main-thread stack on Windows.
|
||||
pub fn extract_with_options(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
options: &ExtractionOptions,
|
||||
) -> Result<ExtractionResult, ExtractError> {
|
||||
// The default main-thread stack on Windows is 1 MB, which can overflow
|
||||
// on deeply nested pages. Spawn a worker thread with 8 MB to be safe.
|
||||
const STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB
|
||||
|
||||
let html = html.to_string();
|
||||
let url = url.map(|u| u.to_string());
|
||||
let options = options.clone();
|
||||
|
||||
std::thread::Builder::new()
|
||||
.stack_size(STACK_SIZE)
|
||||
.spawn(move || extract_with_options_inner(&html, url.as_deref(), &options))
|
||||
.map_err(|_| ExtractError::NoContent)?
|
||||
.join()
|
||||
.unwrap_or(Err(ExtractError::NoContent))
|
||||
}
|
||||
|
||||
fn extract_with_options_inner(
|
||||
html: &str,
|
||||
url: Option<&str>,
|
||||
options: &ExtractionOptions,
|
||||
) -> Result<ExtractionResult, ExtractError> {
|
||||
if html.is_empty() {
|
||||
return Err(ExtractError::NoContent);
|
||||
|
|
@ -530,4 +555,44 @@ mod tests {
|
|||
"raw_html should be absent from JSON when None"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn express_live_blog_no_stack_overflow() {
    // Regression fixture: a real Express.co.uk live blog that previously
    // caused a stack overflow during extraction.
    let page_url = "https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks";
    let fixture = include_str!("../testdata/express_test.html");

    let outcome = extract(fixture, Some(page_url));
    assert!(outcome.is_ok(), "Should not stack overflow on Express.co.uk live blog");

    // Beyond "did not crash", make sure something substantial came out.
    let extracted = outcome.unwrap();
    let words = extracted.metadata.word_count;
    assert!(
        words > 100,
        "Should extract meaningful content, got {} words",
        words
    );
}
|
||||
|
||||
#[test]
fn deeply_nested_html_no_stack_overflow() {
    // Synthetic stand-in for Express.co.uk-style live blogs: 500 <div><span>
    // pairs (1000 nested elements) wrapped around a single paragraph.
    let depth = 500;
    let html = format!(
        "<html><body>{}<p>Deep content here</p>{}</body></html>",
        "<div><span>".repeat(depth),
        "</span></div>".repeat(depth),
    );

    let outcome = extract(&html, None);
    assert!(outcome.is_ok(), "Should not stack overflow on deeply nested HTML");

    // The innermost paragraph must survive the trip through the converter.
    let extracted = outcome.unwrap();
    assert!(
        extracted.content.markdown.contains("Deep content"),
        "Should extract content from deep nesting"
    );
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,12 @@ use crate::types::{CodeBlock, Image, Link};
|
|||
|
||||
static CODE_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("code").unwrap());
|
||||
|
||||
/// Maximum recursion depth for DOM traversal.
|
||||
/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep,
|
||||
/// overflowing the default ~1 MB stack on Windows. When we hit this limit
|
||||
/// we fall back to plain-text collection (which uses an iterator, not recursion).
|
||||
const MAX_DOM_DEPTH: usize = 200;
|
||||
|
||||
/// Collected assets found during conversion.
|
||||
pub struct ConvertedAssets {
|
||||
pub links: Vec<Link>,
|
||||
|
|
@ -34,7 +40,7 @@ pub fn convert(
|
|||
code_blocks: Vec::new(),
|
||||
};
|
||||
|
||||
let md = node_to_md(element, base_url, &mut assets, 0, exclude);
|
||||
let md = node_to_md(element, base_url, &mut assets, 0, exclude, 0);
|
||||
let plain = strip_markdown(&md);
|
||||
let md = collapse_whitespace(&md);
|
||||
let plain = collapse_whitespace(&plain);
|
||||
|
|
@ -49,11 +55,17 @@ fn node_to_md(
|
|||
assets: &mut ConvertedAssets,
|
||||
list_depth: usize,
|
||||
exclude: &HashSet<NodeId>,
|
||||
depth: usize,
|
||||
) -> String {
|
||||
if exclude.contains(&element.id()) {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
// Guard against deeply nested DOM trees (e.g., Express.co.uk live blogs).
|
||||
if depth > MAX_DOM_DEPTH {
|
||||
return collect_text(element);
|
||||
}
|
||||
|
||||
if noise::is_noise(element) || noise::is_noise_descendant(element) {
|
||||
// Still collect images and links from noise elements — they're useful
|
||||
// metadata even though we don't include the noise text in markdown.
|
||||
|
|
@ -67,38 +79,38 @@ fn node_to_md(
|
|||
// Headings
|
||||
"h1" => format!(
|
||||
"\n\n# {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h2" => format!(
|
||||
"\n\n## {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h3" => format!(
|
||||
"\n\n### {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h4" => format!(
|
||||
"\n\n#### {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h5" => format!(
|
||||
"\n\n##### {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
"h6" => format!(
|
||||
"\n\n###### {}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
|
||||
// Paragraph
|
||||
"p" => format!(
|
||||
"\n\n{}\n\n",
|
||||
inline_text(element, base_url, assets, exclude)
|
||||
inline_text(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
|
||||
// Links
|
||||
"a" => {
|
||||
let text = inline_text(element, base_url, assets, exclude);
|
||||
let text = inline_text(element, base_url, assets, exclude, depth);
|
||||
let href = element
|
||||
.value()
|
||||
.attr("href")
|
||||
|
|
@ -163,11 +175,24 @@ fn node_to_md(
|
|||
}
|
||||
}
|
||||
|
||||
// Bold
|
||||
"strong" | "b" => format!("**{}**", inline_text(element, base_url, assets, exclude)),
|
||||
// Bold — if it contains block elements (e.g., Drudge wraps entire columns
|
||||
// in <b>), treat as a container instead of inline bold.
|
||||
"strong" | "b" => {
|
||||
if cell_has_block_content(element) {
|
||||
children_to_md(element, base_url, assets, list_depth, exclude, depth)
|
||||
} else {
|
||||
format!("**{}**", inline_text(element, base_url, assets, exclude, depth))
|
||||
}
|
||||
}
|
||||
|
||||
// Italic
|
||||
"em" | "i" => format!("*{}*", inline_text(element, base_url, assets, exclude)),
|
||||
// Italic — same block-content check as bold.
|
||||
"em" | "i" => {
|
||||
if cell_has_block_content(element) {
|
||||
children_to_md(element, base_url, assets, list_depth, exclude, depth)
|
||||
} else {
|
||||
format!("*{}*", inline_text(element, base_url, assets, exclude, depth))
|
||||
}
|
||||
}
|
||||
|
||||
// Inline code
|
||||
"code" => {
|
||||
|
|
@ -200,13 +225,13 @@ fn node_to_md(
|
|||
.attr("class")
|
||||
.and_then(extract_language_from_class)
|
||||
});
|
||||
(collect_preformatted_text(code_el), lang)
|
||||
(collect_preformatted_text(code_el, depth), lang)
|
||||
} else {
|
||||
let lang = element
|
||||
.value()
|
||||
.attr("class")
|
||||
.and_then(extract_language_from_class);
|
||||
(collect_preformatted_text(element), lang)
|
||||
(collect_preformatted_text(element, depth), lang)
|
||||
};
|
||||
|
||||
let code = code.trim_matches('\n').to_string();
|
||||
|
|
@ -221,7 +246,7 @@ fn node_to_md(
|
|||
|
||||
// Blockquote
|
||||
"blockquote" => {
|
||||
let inner = children_to_md(element, base_url, assets, list_depth, exclude);
|
||||
let inner = children_to_md(element, base_url, assets, list_depth, exclude, depth);
|
||||
let quoted = inner
|
||||
.trim()
|
||||
.lines()
|
||||
|
|
@ -233,19 +258,19 @@ fn node_to_md(
|
|||
|
||||
// Unordered list
|
||||
"ul" => {
|
||||
let items = list_items(element, base_url, assets, list_depth, false, exclude);
|
||||
let items = list_items(element, base_url, assets, list_depth, false, exclude, depth);
|
||||
format!("\n\n{items}\n\n")
|
||||
}
|
||||
|
||||
// Ordered list
|
||||
"ol" => {
|
||||
let items = list_items(element, base_url, assets, list_depth, true, exclude);
|
||||
let items = list_items(element, base_url, assets, list_depth, true, exclude, depth);
|
||||
format!("\n\n{items}\n\n")
|
||||
}
|
||||
|
||||
// List item — handled by ul/ol parent, but if encountered standalone:
|
||||
"li" => {
|
||||
let text = inline_text(element, base_url, assets, exclude);
|
||||
let text = inline_text(element, base_url, assets, exclude, depth);
|
||||
format!("- {text}\n")
|
||||
}
|
||||
|
||||
|
|
@ -258,11 +283,11 @@ fn node_to_md(
|
|||
// Table
|
||||
"table" => format!(
|
||||
"\n\n{}\n\n",
|
||||
table_to_md(element, base_url, assets, exclude)
|
||||
table_to_md(element, base_url, assets, exclude, depth)
|
||||
),
|
||||
|
||||
// Divs and other containers — just recurse
|
||||
_ => children_to_md(element, base_url, assets, list_depth, exclude),
|
||||
_ => children_to_md(element, base_url, assets, list_depth, exclude, depth),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -273,13 +298,14 @@ fn children_to_md(
|
|||
assets: &mut ConvertedAssets,
|
||||
list_depth: usize,
|
||||
exclude: &HashSet<NodeId>,
|
||||
depth: usize,
|
||||
) -> String {
|
||||
let mut out = String::new();
|
||||
for child in element.children() {
|
||||
match child.value() {
|
||||
Node::Element(_) => {
|
||||
if let Some(child_el) = ElementRef::wrap(child) {
|
||||
let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude);
|
||||
let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1);
|
||||
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
|
||||
out.push(' ');
|
||||
}
|
||||
|
|
@ -302,13 +328,14 @@ fn inline_text(
|
|||
base_url: Option<&Url>,
|
||||
assets: &mut ConvertedAssets,
|
||||
exclude: &HashSet<NodeId>,
|
||||
depth: usize,
|
||||
) -> String {
|
||||
let mut out = String::new();
|
||||
for child in element.children() {
|
||||
match child.value() {
|
||||
Node::Element(_) => {
|
||||
if let Some(child_el) = ElementRef::wrap(child) {
|
||||
let chunk = node_to_md(child_el, base_url, assets, 0, exclude);
|
||||
let chunk = node_to_md(child_el, base_url, assets, 0, exclude, depth + 1);
|
||||
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
|
||||
out.push(' ');
|
||||
}
|
||||
|
|
@ -343,7 +370,10 @@ fn collect_text(element: ElementRef<'_>) -> String {
|
|||
/// Every text node is pushed verbatim -- no trimming, no collapsing.
|
||||
/// Handles `<br>` as newlines and inserts newlines between block-level children
|
||||
/// (e.g., `<div>` lines produced by some syntax highlighters).
|
||||
fn collect_preformatted_text(element: ElementRef<'_>) -> String {
|
||||
fn collect_preformatted_text(element: ElementRef<'_>, depth: usize) -> String {
|
||||
if depth > MAX_DOM_DEPTH {
|
||||
return element.text().collect::<String>();
|
||||
}
|
||||
let mut out = String::new();
|
||||
for child in element.children() {
|
||||
match child.value() {
|
||||
|
|
@ -357,12 +387,12 @@ fn collect_preformatted_text(element: ElementRef<'_>) -> String {
|
|||
if !out.is_empty() && !out.ends_with('\n') {
|
||||
out.push('\n');
|
||||
}
|
||||
out.push_str(&collect_preformatted_text(child_el));
|
||||
out.push_str(&collect_preformatted_text(child_el, depth + 1));
|
||||
if !out.ends_with('\n') {
|
||||
out.push('\n');
|
||||
}
|
||||
} else {
|
||||
out.push_str(&collect_preformatted_text(child_el));
|
||||
out.push_str(&collect_preformatted_text(child_el, depth + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -392,6 +422,7 @@ fn list_items(
|
|||
depth: usize,
|
||||
ordered: bool,
|
||||
exclude: &HashSet<NodeId>,
|
||||
dom_depth: usize,
|
||||
) -> String {
|
||||
let indent = " ".repeat(depth);
|
||||
let mut out = String::new();
|
||||
|
|
@ -430,6 +461,7 @@ fn list_items(
|
|||
depth + 1,
|
||||
child_tag == "ol",
|
||||
exclude,
|
||||
dom_depth + 1,
|
||||
));
|
||||
} else {
|
||||
inline_parts.push_str(&node_to_md(
|
||||
|
|
@ -438,6 +470,7 @@ fn list_items(
|
|||
assets,
|
||||
depth,
|
||||
exclude,
|
||||
dom_depth + 1,
|
||||
));
|
||||
}
|
||||
} else if let Some(text) = li_child.value().as_text() {
|
||||
|
|
@ -460,23 +493,42 @@ fn list_items(
|
|||
out.trim_end_matches('\n').to_string()
|
||||
}
|
||||
|
||||
/// Check whether a table cell contains block-level elements, indicating a layout
|
||||
/// table rather than a data table.
|
||||
fn cell_has_block_content(cell: ElementRef<'_>) -> bool {
|
||||
const BLOCK_TAGS: &[&str] = &[
|
||||
"p", "div", "ul", "ol", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "pre",
|
||||
"table", "section", "article", "header", "footer", "nav", "aside",
|
||||
];
|
||||
for desc in cell.descendants() {
|
||||
if let Some(el) = ElementRef::wrap(desc) {
|
||||
if BLOCK_TAGS.contains(&el.value().name()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn table_to_md(
|
||||
table_el: ElementRef<'_>,
|
||||
base_url: Option<&Url>,
|
||||
assets: &mut ConvertedAssets,
|
||||
exclude: &HashSet<NodeId>,
|
||||
depth: usize,
|
||||
) -> String {
|
||||
let mut rows: Vec<Vec<String>> = Vec::new();
|
||||
// Collect all <td>/<th> cells grouped by row, and detect layout tables
|
||||
let mut raw_rows: Vec<Vec<ElementRef<'_>>> = Vec::new();
|
||||
let mut has_header = false;
|
||||
let mut is_layout = false;
|
||||
|
||||
// Collect rows from thead and tbody
|
||||
for child in table_el.descendants() {
|
||||
if let Some(el) = ElementRef::wrap(child) {
|
||||
if exclude.contains(&el.id()) {
|
||||
continue;
|
||||
}
|
||||
if el.value().name() == "tr" {
|
||||
let cells: Vec<String> = el
|
||||
let cells: Vec<ElementRef<'_>> = el
|
||||
.children()
|
||||
.filter_map(ElementRef::wrap)
|
||||
.filter(|c| {
|
||||
|
|
@ -487,21 +539,53 @@ fn table_to_md(
|
|||
if c.value().name() == "th" {
|
||||
has_header = true;
|
||||
}
|
||||
inline_text(c, base_url, assets, exclude)
|
||||
if !is_layout && cell_has_block_content(c) {
|
||||
is_layout = true;
|
||||
}
|
||||
c
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !cells.is_empty() {
|
||||
rows.push(cells);
|
||||
raw_rows.push(cells);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if rows.is_empty() {
|
||||
if raw_rows.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
// Layout table: render each cell as a standalone block section
|
||||
if is_layout {
|
||||
let mut out = String::new();
|
||||
for row in &raw_rows {
|
||||
for cell in row {
|
||||
let content =
|
||||
children_to_md(*cell, base_url, assets, 0, exclude, depth);
|
||||
let content = content.trim();
|
||||
if !content.is_empty() {
|
||||
if !out.is_empty() {
|
||||
out.push_str("\n\n");
|
||||
}
|
||||
out.push_str(content);
|
||||
}
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Data table: render as markdown table
|
||||
let mut rows: Vec<Vec<String>> = raw_rows
|
||||
.iter()
|
||||
.map(|row| {
|
||||
row.iter()
|
||||
.map(|c| inline_text(*c, base_url, assets, exclude, depth))
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Find max column count
|
||||
let cols = rows.iter().map(|r| r.len()).max().unwrap_or(0);
|
||||
if cols == 0 {
|
||||
|
|
@ -995,6 +1079,54 @@ mod tests {
|
|||
assert!(md.contains("| Alice | 30 |"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn layout_table() {
|
||||
// Layout tables (cells with block elements) should render as sections, not markdown tables
|
||||
let html = r##"
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<p>Column one first paragraph</p>
|
||||
<p>Column one second paragraph</p>
|
||||
</td>
|
||||
<td>
|
||||
<p>Column two content</p>
|
||||
<hr>
|
||||
<p>Column two after rule</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>"##;
|
||||
let (md, _, _) = convert_html(html, None);
|
||||
// Should NOT produce markdown table syntax
|
||||
assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}");
|
||||
// Should contain the content as separate blocks
|
||||
assert!(md.contains("Column one first paragraph"), "missing content: {md}");
|
||||
assert!(md.contains("Column two content"), "missing content: {md}");
|
||||
assert!(md.contains("Column two after rule"), "missing content: {md}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn layout_table_with_links() {
|
||||
// Drudge-style layout: cells full of links and divs
|
||||
let html = r##"
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<div><a href="https://example.com/1">Headline One</a></div>
|
||||
<div><a href="https://example.com/2">Headline Two</a></div>
|
||||
</td>
|
||||
<td>
|
||||
<div><a href="https://example.com/3">Headline Three</a></div>
|
||||
</td>
|
||||
</tr>
|
||||
</table>"##;
|
||||
let (md, _, _) = convert_html(html, None);
|
||||
assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}");
|
||||
assert!(md.contains("[Headline One](https://example.com/1)"), "missing link: {md}");
|
||||
assert!(md.contains("[Headline Two](https://example.com/2)"), "missing link: {md}");
|
||||
assert!(md.contains("[Headline Three](https://example.com/3)"), "missing link: {md}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn horizontal_rule() {
|
||||
let (md, _, _) = convert_html("<p>Above</p><hr><p>Below</p>", None);
|
||||
|
|
|
|||
|
|
@ -7,9 +7,12 @@
|
|||
use scraper::ElementRef;
|
||||
|
||||
const NOISE_TAGS: &[&str] = &[
|
||||
"script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "form",
|
||||
"video", "audio",
|
||||
"canvas",
|
||||
"script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video",
|
||||
"audio", "canvas",
|
||||
// NOTE: <form> removed from this list — ASP.NET and similar frameworks wrap the
|
||||
// entire page body in a single <form> tag that contains all real content.
|
||||
// Forms are now handled with a heuristic in is_noise() that distinguishes
|
||||
// small input forms (noise) from page-wrapping forms (not noise).
|
||||
// NOTE: <picture> removed — it's a responsive image container, not noise.
|
||||
// <picture> wraps <source> and <img> for responsive images.
|
||||
];
|
||||
|
|
@ -189,6 +192,28 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
|
|||
return true;
|
||||
}
|
||||
|
||||
// <form> heuristic: ASP.NET wraps the entire page body in a single <form>.
|
||||
// These page-wrapping forms contain hundreds of words of real content.
|
||||
// Small forms (login, search, newsletter) are noise.
|
||||
if tag == "form" {
|
||||
let text_len = el.text().collect::<String>().len();
|
||||
// A form with substantial text (>= 500 bytes) is likely a page wrapper, not noise.
|
||||
// Small forms (login/search/subscribe) rarely exceed a few hundred chars.
|
||||
if text_len < 500 {
|
||||
return true;
|
||||
}
|
||||
// Also check noise classes/IDs — a big form with class="login-form" is still noise
|
||||
if let Some(class) = el.value().attr("class") {
|
||||
let cl = class.to_lowercase();
|
||||
if cl.contains("login") || cl.contains("search") || cl.contains("subscribe")
|
||||
|| cl.contains("signup") || cl.contains("newsletter") || cl.contains("contact")
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// ARIA role-based noise
|
||||
if let Some(role) = el.value().attr("role")
|
||||
&& NOISE_ROLES.contains(&role)
|
||||
|
|
@ -200,10 +225,12 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
|
|||
// check each against the noise list. "free-modal-container" splits into
|
||||
// ["free-modal-container"] which does NOT match "modal".
|
||||
if let Some(class) = el.value().attr("class") {
|
||||
let mut class_matched = false;
|
||||
for token in class.split_whitespace() {
|
||||
let lower = token.to_lowercase();
|
||||
if NOISE_CLASSES.contains(&lower.as_str()) {
|
||||
return true;
|
||||
class_matched = true;
|
||||
break;
|
||||
}
|
||||
// Structural elements use compound names (FooterLinks, Header-nav, etc.)
|
||||
// These are always noise regardless of compound form.
|
||||
|
|
@ -211,11 +238,24 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
|
|||
|| lower.starts_with("header-")
|
||||
|| lower.starts_with("nav-")
|
||||
{
|
||||
return true;
|
||||
class_matched = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Also check for ad-specific patterns (standalone "ad" class)
|
||||
if is_ad_class(class) {
|
||||
if !class_matched {
|
||||
class_matched = is_ad_class(class);
|
||||
}
|
||||
|
||||
if class_matched {
|
||||
// Safety valve: malformed HTML can leave noise containers unclosed,
|
||||
// causing them to absorb the entire page content. A real header/nav/
|
||||
// footer rarely exceeds a few thousand characters of text. If a
|
||||
// noise-class element has massive text content, it's almost certainly
|
||||
// a broken wrapper — treat it as content, not noise.
|
||||
let text_len = el.text().collect::<String>().len();
|
||||
if text_len > 5000 {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -224,6 +264,11 @@ pub fn is_noise(el: ElementRef<'_>) -> bool {
|
|||
if let Some(id) = el.value().attr("id") {
|
||||
let id_lower = id.to_lowercase();
|
||||
if NOISE_IDS.contains(&id_lower.as_str()) && !is_structural_id(&id_lower) {
|
||||
// Same safety valve for ID-matched noise elements
|
||||
let text_len = el.text().collect::<String>().len();
|
||||
if text_len > 5000 {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// Cookie consent platform IDs (prefix match — these generate huge overlays)
|
||||
|
|
@ -754,3 +799,36 @@ mod tests {
|
|||
));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod form_tests {
|
||||
use super::*;
|
||||
use scraper::Html;
|
||||
|
||||
#[test]
|
||||
fn aspnet_page_wrapping_form_is_not_noise() {
|
||||
let html = r#"<html><body><form method="post" action="./page.aspx" id="form1"><div class="wrapper"><div class="content"><h1>Support</h1><h3>Question one?</h3><p>Long answer text that should definitely be captured by the extraction engine. This is real content with multiple sentences to ensure it passes any text length thresholds in the scoring algorithm. We need at least five hundred characters of actual text content here to exceed the threshold. Adding more sentences about various topics including data formats, historical prices, stock market analysis, technical indicators, and trading strategies. This paragraph discusses how intraday data can be used for backtesting quantitative models and developing automated trading systems.</p><h3>Question two?</h3><p>Another substantial answer paragraph with detailed information about the product features and capabilities.</p></div></div></form></body></html>"#;
|
||||
let doc = Html::parse_document(html);
|
||||
let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap();
|
||||
let text = form.text().collect::<String>();
|
||||
let text_len = text.len();
|
||||
assert!(text_len >= 500, "Form text should be >= 500 chars, got {text_len}");
|
||||
assert!(!is_noise(form), "ASP.NET page-wrapping form should NOT be noise");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn small_login_form_is_noise() {
|
||||
let html = r#"
|
||||
<html><body>
|
||||
<form action="/login">
|
||||
<input type="text" name="user" />
|
||||
<input type="password" name="pass" />
|
||||
<button>Login</button>
|
||||
</form>
|
||||
</body></html>
|
||||
"#;
|
||||
let doc = Html::parse_document(html);
|
||||
let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap();
|
||||
assert!(is_noise(form), "Small login form SHOULD be noise");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
15
crates/webclaw-core/testdata/express_test.html
vendored
Normal file
15
crates/webclaw-core/testdata/express_test.html
vendored
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue