From 95a6681b026ba42327b81a06af2643618ba6fb49 Mon Sep 17 00:00:00 2001 From: devnen Date: Fri, 3 Apr 2026 22:24:35 +0200 Subject: [PATCH 1/3] fix: detect layout tables and render as sections instead of markdown tables Sites like Drudge Report use for page layout, not data. Each cell contains extensive block-level content (divs, hrs, paragraphs, links). Previously, table_to_md() called inline_text() on every cell, collapsing all whitespace and flattening block elements into a single unreadable line. Changes: - Add cell_has_block_content() heuristic: scans for block-level descendants (p, div, hr, ul, ol, h1-h6, etc.) to distinguish layout vs data tables - Layout tables render each cell as a standalone section separated by blank lines, using children_to_md() to preserve block structure - Data tables (no block elements in cells) keep existing markdown table format - Bold/italic tags containing block elements are treated as containers instead of wrapping in **/**/* (fixes Drudge's ... column wrappers that contain the entire column content) - Add tests for layout tables with paragraphs and with links --- crates/webclaw-core/src/markdown.rs | 131 +++++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 10 deletions(-) diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index 91f2c6e..d9ab56e 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -163,11 +163,24 @@ fn node_to_md( } } - // Bold - "strong" | "b" => format!("**{}**", inline_text(element, base_url, assets, exclude)), + // Bold — if it contains block elements (e.g., Drudge wraps entire columns + // in ), treat as a container instead of inline bold. 
+ "strong" | "b" => { + if cell_has_block_content(element) { + children_to_md(element, base_url, assets, list_depth, exclude) + } else { + format!("**{}**", inline_text(element, base_url, assets, exclude)) + } + } - // Italic - "em" | "i" => format!("*{}*", inline_text(element, base_url, assets, exclude)), + // Italic — same block-content check as bold. + "em" | "i" => { + if cell_has_block_content(element) { + children_to_md(element, base_url, assets, list_depth, exclude) + } else { + format!("*{}*", inline_text(element, base_url, assets, exclude)) + } + } // Inline code "code" => { @@ -460,23 +473,41 @@ fn list_items( out.trim_end_matches('\n').to_string() } +/// Check whether a table cell contains block-level elements, indicating a layout +/// table rather than a data table. +fn cell_has_block_content(cell: ElementRef<'_>) -> bool { + const BLOCK_TAGS: &[&str] = &[ + "p", "div", "ul", "ol", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "pre", + "table", "section", "article", "header", "footer", "nav", "aside", + ]; + for desc in cell.descendants() { + if let Some(el) = ElementRef::wrap(desc) { + if BLOCK_TAGS.contains(&el.value().name()) { + return true; + } + } + } + false +} + fn table_to_md( table_el: ElementRef<'_>, base_url: Option<&Url>, assets: &mut ConvertedAssets, exclude: &HashSet, ) -> String { - let mut rows: Vec> = Vec::new(); + // Collect all
/ cells grouped by row, and detect layout tables + let mut raw_rows: Vec>> = Vec::new(); let mut has_header = false; + let mut is_layout = false; - // Collect rows from thead and tbody for child in table_el.descendants() { if let Some(el) = ElementRef::wrap(child) { if exclude.contains(&el.id()) { continue; } if el.value().name() == "tr" { - let cells: Vec = el + let cells: Vec> = el .children() .filter_map(ElementRef::wrap) .filter(|c| { @@ -487,21 +518,53 @@ fn table_to_md( if c.value().name() == "th" { has_header = true; } - inline_text(c, base_url, assets, exclude) + if !is_layout && cell_has_block_content(c) { + is_layout = true; + } + c }) .collect(); if !cells.is_empty() { - rows.push(cells); + raw_rows.push(cells); } } } } - if rows.is_empty() { + if raw_rows.is_empty() { return String::new(); } + // Layout table: render each cell as a standalone block section + if is_layout { + let mut out = String::new(); + for row in &raw_rows { + for cell in row { + let content = + children_to_md(*cell, base_url, assets, 0, exclude); + let content = content.trim(); + if !content.is_empty() { + if !out.is_empty() { + out.push_str("\n\n"); + } + out.push_str(content); + } + } + } + return out; + } + + // Data table: render as markdown table + let mut rows: Vec> = raw_rows + .iter() + .map(|row| { + row.iter() + .map(|c| inline_text(*c, base_url, assets, exclude)) + .collect() + }) + .collect(); + // Find max column count let cols = rows.iter().map(|r| r.len()).max().unwrap_or(0); if cols == 0 { @@ -995,6 +1058,54 @@ mod tests { assert!(md.contains("| Alice | 30 |")); } + #[test] + fn layout_table() { + // Layout tables (cells with block elements) should render as sections, not markdown tables + let html = r##" + + + + + +
+

Column one first paragraph

+

Column one second paragraph

+
+

Column two content

+
+

Column two after rule

+
"##; + let (md, _, _) = convert_html(html, None); + // Should NOT produce markdown table syntax + assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}"); + // Should contain the content as separate blocks + assert!(md.contains("Column one first paragraph"), "missing content: {md}"); + assert!(md.contains("Column two content"), "missing content: {md}"); + assert!(md.contains("Column two after rule"), "missing content: {md}"); + } + + #[test] + fn layout_table_with_links() { + // Drudge-style layout: cells full of links and divs + let html = r##" + + + + + +
+ + + + +
"##; + let (md, _, _) = convert_html(html, None); + assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}"); + assert!(md.contains("[Headline One](https://example.com/1)"), "missing link: {md}"); + assert!(md.contains("[Headline Two](https://example.com/2)"), "missing link: {md}"); + assert!(md.contains("[Headline Three](https://example.com/3)"), "missing link: {md}"); + } + #[test] fn horizontal_rule() { let (md, _, _) = convert_html("

Above


Below

", None); From 74bac874359afe6896b8d8b14786a9aefba853c6 Mon Sep 17 00:00:00 2001 From: devnen Date: Fri, 3 Apr 2026 23:45:19 +0200 Subject: [PATCH 2/3] fix: prevent stack overflow on deeply nested HTML pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pages like Express.co.uk live blogs nest 200+ DOM levels deep, overflowing the default 1 MB main-thread stack on Windows during recursive markdown conversion. Two-layer fix: 1. markdown.rs: add depth parameter to node_to_md/children_to_md/inline_text with MAX_DOM_DEPTH=200 guard — falls back to plain text collection at limit 2. lib.rs: wrap extract_with_options in a worker thread with 8 MB stack so html5ever parsing and extraction both have room on deeply nested pages Tested with Express.co.uk live blog (previously crashed, now extracts 2000+ lines of clean markdown) and drudgereport.com (still works correctly). --- crates/webclaw-core/src/lib.rs | 65 ++++++++++++++++ crates/webclaw-core/src/markdown.rs | 77 ++++++++++++------- .../webclaw-core/testdata/express_test.html | 15 ++++ 3 files changed, 129 insertions(+), 28 deletions(-) create mode 100644 crates/webclaw-core/testdata/express_test.html diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index 1e8ea2a..bbd6525 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -45,10 +45,35 @@ pub fn extract(html: &str, url: Option<&str>) -> Result, options: &ExtractionOptions, +) -> Result { + // The default main-thread stack on Windows is 1 MB, which can overflow + // on deeply nested pages. Spawn a worker thread with 8 MB to be safe. + const STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB + + let html = html.to_string(); + let url = url.map(|u| u.to_string()); + let options = options.clone(); + + std::thread::Builder::new() + .stack_size(STACK_SIZE) + .spawn(move || extract_with_options_inner(&html, url.as_deref(), &options)) + .map_err(|_| ExtractError::NoContent)? 
+ .join() + .unwrap_or(Err(ExtractError::NoContent)) +} + +fn extract_with_options_inner( + html: &str, + url: Option<&str>, + options: &ExtractionOptions, ) -> Result { if html.is_empty() { return Err(ExtractError::NoContent); @@ -530,4 +555,44 @@ mod tests { "raw_html should be absent from JSON when None" ); } + + #[test] + fn express_live_blog_no_stack_overflow() { + // Real-world Express.co.uk live blog that previously caused stack overflow + let html = include_str!("../testdata/express_test.html"); + let result = extract( + html, + Some("https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks"), + ); + assert!(result.is_ok(), "Should not stack overflow on Express.co.uk live blog"); + let result = result.unwrap(); + assert!( + result.metadata.word_count > 100, + "Should extract meaningful content, got {} words", + result.metadata.word_count + ); + } + + #[test] + fn deeply_nested_html_no_stack_overflow() { + // Simulate deeply nested HTML like Express.co.uk live blogs + let depth = 500; + let mut html = String::from(""); + for _ in 0..depth { + html.push_str("
"); + } + html.push_str("

Deep content here

"); + for _ in 0..depth { + html.push_str("
"); + } + html.push_str(""); + + let result = extract(&html, None); + assert!(result.is_ok(), "Should not stack overflow on deeply nested HTML"); + let result = result.unwrap(); + assert!( + result.content.markdown.contains("Deep content"), + "Should extract content from deep nesting" + ); + } } diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index d9ab56e..ff33b75 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -14,6 +14,12 @@ use crate::types::{CodeBlock, Image, Link}; static CODE_SELECTOR: Lazy = Lazy::new(|| Selector::parse("code").unwrap()); +/// Maximum recursion depth for DOM traversal. +/// Express.co.uk live blogs and similar pages can nest 1000+ levels deep, +/// overflowing the default ~1 MB stack on Windows. When we hit this limit +/// we fall back to plain-text collection (which uses an iterator, not recursion). +const MAX_DOM_DEPTH: usize = 200; + /// Collected assets found during conversion. pub struct ConvertedAssets { pub links: Vec, @@ -34,7 +40,7 @@ pub fn convert( code_blocks: Vec::new(), }; - let md = node_to_md(element, base_url, &mut assets, 0, exclude); + let md = node_to_md(element, base_url, &mut assets, 0, exclude, 0); let plain = strip_markdown(&md); let md = collapse_whitespace(&md); let plain = collapse_whitespace(&plain); @@ -49,11 +55,17 @@ fn node_to_md( assets: &mut ConvertedAssets, list_depth: usize, exclude: &HashSet, + depth: usize, ) -> String { if exclude.contains(&element.id()) { return String::new(); } + // Guard against deeply nested DOM trees (e.g., Express.co.uk live blogs). + if depth > MAX_DOM_DEPTH { + return collect_text(element); + } + if noise::is_noise(element) || noise::is_noise_descendant(element) { // Still collect images and links from noise elements — they're useful // metadata even though we don't include the noise text in markdown. 
@@ -67,38 +79,38 @@ fn node_to_md( // Headings "h1" => format!( "\n\n# {}\n\n", - inline_text(element, base_url, assets, exclude) + inline_text(element, base_url, assets, exclude, depth) ), "h2" => format!( "\n\n## {}\n\n", - inline_text(element, base_url, assets, exclude) + inline_text(element, base_url, assets, exclude, depth) ), "h3" => format!( "\n\n### {}\n\n", - inline_text(element, base_url, assets, exclude) + inline_text(element, base_url, assets, exclude, depth) ), "h4" => format!( "\n\n#### {}\n\n", - inline_text(element, base_url, assets, exclude) + inline_text(element, base_url, assets, exclude, depth) ), "h5" => format!( "\n\n##### {}\n\n", - inline_text(element, base_url, assets, exclude) + inline_text(element, base_url, assets, exclude, depth) ), "h6" => format!( "\n\n###### {}\n\n", - inline_text(element, base_url, assets, exclude) + inline_text(element, base_url, assets, exclude, depth) ), // Paragraph "p" => format!( "\n\n{}\n\n", - inline_text(element, base_url, assets, exclude) + inline_text(element, base_url, assets, exclude, depth) ), // Links "a" => { - let text = inline_text(element, base_url, assets, exclude); + let text = inline_text(element, base_url, assets, exclude, depth); let href = element .value() .attr("href") @@ -167,18 +179,18 @@ fn node_to_md( // in ), treat as a container instead of inline bold. "strong" | "b" => { if cell_has_block_content(element) { - children_to_md(element, base_url, assets, list_depth, exclude) + children_to_md(element, base_url, assets, list_depth, exclude, depth) } else { - format!("**{}**", inline_text(element, base_url, assets, exclude)) + format!("**{}**", inline_text(element, base_url, assets, exclude, depth)) } } // Italic — same block-content check as bold. 
"em" | "i" => { if cell_has_block_content(element) { - children_to_md(element, base_url, assets, list_depth, exclude) + children_to_md(element, base_url, assets, list_depth, exclude, depth) } else { - format!("*{}*", inline_text(element, base_url, assets, exclude)) + format!("*{}*", inline_text(element, base_url, assets, exclude, depth)) } } @@ -213,13 +225,13 @@ fn node_to_md( .attr("class") .and_then(extract_language_from_class) }); - (collect_preformatted_text(code_el), lang) + (collect_preformatted_text(code_el, depth), lang) } else { let lang = element .value() .attr("class") .and_then(extract_language_from_class); - (collect_preformatted_text(element), lang) + (collect_preformatted_text(element, depth), lang) }; let code = code.trim_matches('\n').to_string(); @@ -234,7 +246,7 @@ fn node_to_md( // Blockquote "blockquote" => { - let inner = children_to_md(element, base_url, assets, list_depth, exclude); + let inner = children_to_md(element, base_url, assets, list_depth, exclude, depth); let quoted = inner .trim() .lines() @@ -246,19 +258,19 @@ fn node_to_md( // Unordered list "ul" => { - let items = list_items(element, base_url, assets, list_depth, false, exclude); + let items = list_items(element, base_url, assets, list_depth, false, exclude, depth); format!("\n\n{items}\n\n") } // Ordered list "ol" => { - let items = list_items(element, base_url, assets, list_depth, true, exclude); + let items = list_items(element, base_url, assets, list_depth, true, exclude, depth); format!("\n\n{items}\n\n") } // List item — handled by ul/ol parent, but if encountered standalone: "li" => { - let text = inline_text(element, base_url, assets, exclude); + let text = inline_text(element, base_url, assets, exclude, depth); format!("- {text}\n") } @@ -271,11 +283,11 @@ fn node_to_md( // Table "table" => format!( "\n\n{}\n\n", - table_to_md(element, base_url, assets, exclude) + table_to_md(element, base_url, assets, exclude, depth) ), // Divs and other containers — just recurse - 
_ => children_to_md(element, base_url, assets, list_depth, exclude), + _ => children_to_md(element, base_url, assets, list_depth, exclude, depth), } } @@ -286,13 +298,14 @@ fn children_to_md( assets: &mut ConvertedAssets, list_depth: usize, exclude: &HashSet, + depth: usize, ) -> String { let mut out = String::new(); for child in element.children() { match child.value() { Node::Element(_) => { if let Some(child_el) = ElementRef::wrap(child) { - let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude); + let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude, depth + 1); if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) { out.push(' '); } @@ -315,13 +328,14 @@ fn inline_text( base_url: Option<&Url>, assets: &mut ConvertedAssets, exclude: &HashSet, + depth: usize, ) -> String { let mut out = String::new(); for child in element.children() { match child.value() { Node::Element(_) => { if let Some(child_el) = ElementRef::wrap(child) { - let chunk = node_to_md(child_el, base_url, assets, 0, exclude); + let chunk = node_to_md(child_el, base_url, assets, 0, exclude, depth + 1); if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) { out.push(' '); } @@ -356,7 +370,10 @@ fn collect_text(element: ElementRef<'_>) -> String { /// Every text node is pushed verbatim -- no trimming, no collapsing. /// Handles `
` as newlines and inserts newlines between block-level children /// (e.g., `
` lines produced by some syntax highlighters). -fn collect_preformatted_text(element: ElementRef<'_>) -> String { +fn collect_preformatted_text(element: ElementRef<'_>, depth: usize) -> String { + if depth > MAX_DOM_DEPTH { + return element.text().collect::(); + } let mut out = String::new(); for child in element.children() { match child.value() { @@ -370,12 +387,12 @@ fn collect_preformatted_text(element: ElementRef<'_>) -> String { if !out.is_empty() && !out.ends_with('\n') { out.push('\n'); } - out.push_str(&collect_preformatted_text(child_el)); + out.push_str(&collect_preformatted_text(child_el, depth + 1)); if !out.ends_with('\n') { out.push('\n'); } } else { - out.push_str(&collect_preformatted_text(child_el)); + out.push_str(&collect_preformatted_text(child_el, depth + 1)); } } } @@ -405,6 +422,7 @@ fn list_items( depth: usize, ordered: bool, exclude: &HashSet, + dom_depth: usize, ) -> String { let indent = " ".repeat(depth); let mut out = String::new(); @@ -443,6 +461,7 @@ fn list_items( depth + 1, child_tag == "ol", exclude, + dom_depth + 1, )); } else { inline_parts.push_str(&node_to_md( @@ -451,6 +470,7 @@ fn list_items( assets, depth, exclude, + dom_depth + 1, )); } } else if let Some(text) = li_child.value().as_text() { @@ -495,6 +515,7 @@ fn table_to_md( base_url: Option<&Url>, assets: &mut ConvertedAssets, exclude: &HashSet, + depth: usize, ) -> String { // Collect all
/ cells grouped by row, and detect layout tables let mut raw_rows: Vec>> = Vec::new(); @@ -542,7 +563,7 @@ fn table_to_md( for row in &raw_rows { for cell in row { let content = - children_to_md(*cell, base_url, assets, 0, exclude); + children_to_md(*cell, base_url, assets, 0, exclude, depth); let content = content.trim(); if !content.is_empty() { if !out.is_empty() { @@ -560,7 +581,7 @@ fn table_to_md( .iter() .map(|row| { row.iter() - .map(|c| inline_text(*c, base_url, assets, exclude)) + .map(|c| inline_text(*c, base_url, assets, exclude, depth)) .collect() }) .collect(); diff --git a/crates/webclaw-core/testdata/express_test.html b/crates/webclaw-core/testdata/express_test.html new file mode 100644 index 0000000..bed02be --- /dev/null +++ b/crates/webclaw-core/testdata/express_test.html @@ -0,0 +1,15 @@ + Iran war LIVE: US special forces enter Iran to rescue jet pilot as choppers hit | World | News | Express.co.uk
Taboola above article placeholder

Iran war LIVE: US special forces enter Iran to rescue jet pilot as choppers hit

An F-15 and an A10 Warthog attack plane were both downed in the skies above Iran on Friday

Comments
live

Donald Trump

Donald Trump has warned Iran (Image: Getty)

US special forces have entered Iran to rescue a crew member of the downed F-15 fighter jet, which crashed over the south of the country. Tehran shot down the aircraft over its airspace, marking the first fighter jet destroyed by enemy fire since the war began just over a month ago.

Two crew members reportedly ejected, one of whom was rescued by two US military helicopters and low-flying refuelling aircraft, which were also targeted by Iran. Both of them were hit, and one was seen with smoke pouring out of it as it flew back to Iraqi territory, where it landed safely, officials confirmed.

One of the two US pilots in the fighter jet that went down has been rescued alive by US forces, two US officials told NBC News. The pilot is now undergoing medical treatment, two US officials confirmed to Newsmax. A rescue operation is still ongoing for the second pilot, according to reports.

THIS IS A LIVE BLOG. FOLLOW OUR COVERAGE BELOW.

Iran claims to have shot down a US F-35 jet

Iran claims to have shot down a US F-35 jet (Image: Press.TV/X)

White House to ask for $1.5tn defence budget

The White House said it will ask Congress to approve a record-breaking $1.5 trillion (£1.13 trillion) defence budget for the 2027 fiscal year.

If approved, the budget would be a 42% increase from the previous year and the highest in modern US history.

The Office of Management and Budget said the request would cover dozens of military ships, including a new series of heavily armed US Navy Trump-class battleships.

Trump briefed by national security team

Donald Trump's national security team is briefing him regarding search operations for the F-15 in Iran, a White House official told NBC News.

The US President has been working in the Oval Office and the adjacent Oval dining room since this morning, receiving regular updates on the situation.

President Trump Delivers Address To Nation

Trump has been briefed on the situation. (Image: Getty)

Two helicopters struck in search mission

Two US military helicopters have been struck by Iranian fire while participating in the search and rescue mission for the F-15, a US official told NBC News.

All service members are safe, they added.

'We're in war', says Trump

Donald Trump told NBC News that the US F-15 fighter jet being shot down doesn't affect negotiations, but refused to discuss the search and rescue mission.

Asked if today's events will affect any negotiations with Iran, he said: "No, not at all. No, it's war. We're in war."

President Trump Delivers Address To Nation

Trump said the incident didn't affect negotiations. (Image: Getty)

Expert issues dire warning for second pilot

The second pilot now faces a "race against the clock" as an urgent rescue mission is launched.

Read more here.

Second US plane 'crashes near Strait of Hormuz'

A second US Air Force plane crashed today, this time near the Strait of Hormuz, two US officials told the New York Times.

They said the A-10 Warthog pilot was safely rescued following the crash, which reportedly happened around the same time as the F-15 went down.

Iran rejects US 48-hour ceasefire

Iran has rejected a US proposal for a 48-hour ceasefire, Fars news agency has reported, citing an anonymous source.

The country officially told mediators it's not willing to meet US officials in Islamabad in the coming days, the Wall Street Journal reports.

Iran's foreign minister previously said it would only accept a permanent end to the war, not a temporary ceasefire.

Iran 'claims victory' over downed F-15

Iran has "claimed victory" over the downing of a US F-15 fighter jet, an expert has said, as the rescue mission continues.

Middle East commentator Tara Kangarlou said Iranian state media was quick to "jump on the story" and "claim victory" over their adversary.

She told Sky News: "What we know is that the Iranian regime thrives on propaganda and thrives on any chance it has to claim a victory, whether small or big, against the United States and Israel. For them, this is a big story. They're really all over it."

Israel postpones Iran strikes

Israel has postponed some of its planned strikes in Iran while the search for the missing pilot continues, an Israeli official told CNN.

Rescued pilot 'undergoing medical treatment'

The pilot that has been rescued is now undergoing medical treatment, two US officials confirmed to Newsmax.

They did not know the status of the second pilot.

Footage matches search and rescue, says expert

Footage showing US warplanes over Iran matches typical search and rescue mission manoeuvres, CBS national security analyst Aaron MacLean says.

The footage seems to show US aircraft flying at low altitudes in broad daylight, which he said was something the US would only do if it had a good reason.

One pilot rescued alive

One of the two US pilots in the F-15 fighter jet that went down in Iran has been rescued alive by US forces, two US officials told NBC News.

One crew member rescued, officials say

A crew member from the downed fighter jet has been rescued by US forces, two US officials told CBS News.

Iranian governor says capture US crew alive

The governor of the Kohgiluyeh and Boyer-Ahmad province said the priority is "capturing" any downed US crew "alive".

He said: "Those who succeed in capturing or killing hostile enemy forces will be specially commended by the Governor’s office."

US officials confirm F-15 downed

A US F-15 went down over Iran, two US officials told NBC News.

The two-seater plane went down over the country on Friday, but the officials didn't say how it went down.

The US military is now conducting a search and rescue operation.

Israel Faces Further Missile And Drone Attacks After Weeks Of War With Iran

Officials said an F-15 was downed. (Image: Getty)

Search and rescue team 'looking for jet'

The search is under way for the downed US F-15 fighter jet, which crashed in southern Iran, two sources told CBS News.

Images verified by the outlet show a refuelling plane and two helicopters flying over Khuzestan Province - believed to be the rescue mission.

It's not known how many crew members were on board.

US fighter jet downed over Iran, reports say

A US fighter jet has been shot down over Iran, a US official told Reuters, adding that a search is under way for the pilots.

This report would confirm previous Iranian media reports claiming an F-35 jet had been downed on Good Friday.

Aviation experts analysing pictures circulating online of an alleged wreckage suggest they belong to an F-15 rather than an F-35.

Security Council vote on Middle East moved to Saturday

A vote by the United Nations Security Council on a resolution concerning the Strait of Hormuz has been delayed until Saturday.

Speaking to the BBC, a UN spokesperson said the vote had originally been scheduled for Friday at 11am local time, or 3pm GMT, but was no longer listed on the UN website.

The spokesperson said the decision to postpone the vote was made by council members.

Netanyahu says 70% of Iran’s steel production capacity destroyed

Israeli strikes have wiped out around 70 percent of Iran’s steel production capacity, according to Prime Minister of Israel Benjamin Netanyahu.

In a video posted on social media, Netanyahu described the damage as a "tremendous achievement" and claimed it had deprived the Islamic Revolutionary Guard Corps of both financial and military resources.

He said: "In full co-ordination between me and President Trump, between the IDF and the United States military, we will continue to crush Iran.”

Netanyahu also said Israel was continuing to “hit Hezbollah in the shin".

Iran state TV tells public to hunt down US pilot after fighter jet shot down claim

Iranian state TV has reportedly urged people in a rural province southwest of Tehran to look for a US fighter pilot after Tehran claimed it shot down an American warplane on Friday.

According to the Associated Press, a local channel in Kohkilouyeh and Boyer-Ahmad province said a US pilot had ejected from their aircraft over southwestern Iran.

An anchor reportedly told viewers: “If you capture the enemy pilot or pilots alive and hand them over to the police, you will receive a precious prize.”

The same channel is said to have initially urged viewers to “shoot them as soon as you see them” if they spotted Americans, before later changing the message and instead asking for co-operation with police.

It has been claimed that American forces are also looking for the pilots, however this is yet to be verified.

Tasnim reports that US helicopters, planes and reconnaissance drones are involved in the search.

Washington has previously denied similar Iranian claims about downed US aircraft, but the Associated Press said this is the first time Iran has gone on television asking the public to hunt for a suspected American pilot.

The US hasn't yet commented on Iran's latest claim.

Trump says the US could reopen Strait of Hormuz with 'a little more time'

Donald Trump has suggested on his social media platform, Truth Social, that with more time the US could reopen the Strait of Hormuz, seize oil supplies and profit heavily from it, describing it as a potential "gusher for the world".

UAE intercepted 475 ballistic missiles, 23 cruise missiles and 2,085 drones since conflict began

The UAE said its air defences intercepted 18 ballistic missiles, four cruise missiles and 47 drones launched from Iran on April 3.

Since Iran’s attacks began, the UAE says it has intercepted a total of 475 ballistic missiles, 23 cruise missiles and 2,085 drones.

The attacks have killed two members of the armed forces, one Moroccan civilian contractor working with the military and nine civilians of various nationalities. A further 203 people have been injured.

The Ministry of Defense said it remains on high alert and ready to respond to any threats against the country’s security and stability.

Iranian missile slams into Israel's Petah Tikva leaving huge crater

An Iranian ballistic missile slammed into Petah Tikva this evening, causing major damage to industrial buildings and leaving a huge crater near a residential area.

Footage from the scene showed the blast struck between a housing estate and the city’s industrial zone, with the force of the explosion smashing nearby buildings.

Despite the scale of the damage, no injuries have been reported.

Israel Faces Iranian Missiles And Drones In War's Second Month

Emergency crews inspect a crater where an Iranian ballistic missile hit (Image: Getty Images)

UAE authorities confirm 12 injured after falling shrapnel in Ajman area

UAE authorities said falling shrapnel in the Ajman area, after an interception by air defences, injured 12 people.

Those hurt included six Nepalese nationals with minor to moderate injuries, five Indian nationals with minor to moderate injuries, and one Nepalese national who suffered serious injuries.

Ajman is one of the seven emirates of the UAE and is around 30 to 40-minute drive away from Dubai.

First Western European vessel to cross the Strait of Hormuz

According to Euronews, a French-linked cargo ship has become the first Western European vessel to cross the Strait of Hormuz since the Iran war began in late February.

The Maltese-flagged CMA CGM Kribi, owned by French shipping giant CMA CGM, sailed eastbound from waters near Dubai on Thursday and travelled through the approved shipping corridor between Qeshm and Larak islands.

Euronews reported that the vessel had remained in the Gulf since early March before making the crossing, which is believed to have been coordinated with Iranian maritime authorities.

The ship is thought to be heading towards Pointe Noire in the Republic of the Congo as part of a route linking India, the Gulf and Africa.

Starmer discusses deploying air defence system to Kuwait after refinery drone attack

Sir Keir Starmer has spoken with His Highness the Crown Prince of Kuwait following the overnight drone attack on a Kuwaiti oil refinery.

During the call, Starmer condemned the attack and said the UK stood with Kuwait and its Gulf allies. The pair discussed the deployment of the UK’s Rapid Sentry air defence system to Kuwait to help protect both Kuwaiti and British personnel and interests in the region, while aiming to avoid a wider conflict.

They also discussed disruption to shipping through the Strait of Hormuz and welcomed a meeting led by the Foreign Secretary on plans to reopen the key trade route.

Both leaders agreed to remain in close contact in the coming weeks.

Abu Dhabi intercepts more missiles

The Abu Dhabi Emergencies, Crises and Disasters Management Centre has confirmed fallen debris from successfully intercepted missiles in the Abjan area.

Iran shares photos of alleged downed F-35 jet

Iran has shared photos of the alleged downed F-35 jet.

This is the second time in two days that Iranian media has made a similar claim.

US Central Command denied claims yesterday Iran's Islamic Revolutionary Guard Corps (IRGC) downed an "enemy" fighter jet over Qeshm Island in the Strait of Hormuz.

The US military has yet to officially comment on the latest Iranian claim.

F-35 jet

Iran claims to have shot down an F-35 jet (Image: Press.TV/X)

F-35 jet

A tail piece from the wreckage (Image: Press.TV/X)

Shrapnel hit gas facilities in Abu Dhabi

Authorities in Abu Dhabi are responding after shrapnel fell at the Habshan gas facilities following what officials described as a successful air defence interception.

Operations at the site have been suspended while emergency teams deal with a fire caused by the incident. Officials said no injuries have been reported.

Authorities have urged the public not to spread rumours and to rely only on official sources for updates.

UAE reports further missile strikes

The National Emergency Crisis and Disasters Management Authority has just announced it is dealing with more missile threats.

It urged residents to remain in a safe place and follow the warnings and updates on official websites.

Kuwait says power and water station attacked in latest strike

Kuwait's Ministry of Electricity and Water said one of the country's power generation and water desalination stations was attacked, causing material damage to parts of the facility.

The ministry said technical and emergency teams were immediately deployed under emergency plans to deal with the incident and maintain services.

More than 6,500 casualties in Israel evacuated to hospital since Operation Roaring Lion began

The Israeli Ministry of Health said that since the start of Operation Roaring Lion, 6,594 casualties have been taken to hospitals in Israel.

As of 7am on Friday, 125 people were still in hospital, including two in critical condition, 14 seriously injured, 24 in moderate condition and 79 with minor injuries. One person was being treated for anxiety and five others were still being assessed.

Iran’s tallest bridge split in half as strike kills eight and injures 95

At least eight civilians were killed and 95 others injured after US and Israeli strikes hit Iran’s B1 Bridge in Karaj, near Tehran.

The bridge, regarded as the tallest in the Middle East, was split in half after being struck twice.

The bridge was a major route linking Tehran with Karaj and northern Iran, helping cut traffic and transport goods. US officials reportedly viewed it as an important supply route for missiles and drones, making it one of the most significant pieces of civilian infrastructure hit since the conflict began.

Donald Trump claimed responsibility for the strike and warned Iran there was “much more to follow” if Tehran did not agree to a deal.

Bridge damaged in repeated strikes near Tehran

Significant sections of the B1 Bridge are seen destroyed after an airstrike (Image: Anadolu via Getty Images)

Poland’s prime minister says NATO split part of Putin’s ‘dream plan’

Poland’s prime minister, Donald Tusk, said the risk of NATO breaking apart, sanctions on Russia being eased, Europe facing an energy crisis, aid to Ukraine being stopped and Hungary blocking a loan for Kyiv all appear to fit into Vladimir Putin’s "dream plan".

Human remains 'found on Thai cargo ship' attacked in Strait of Hormuz

Human remains have been discovered following last month’s attack on a cargo ship in the Strait of Hormuz, according to Thai shipping firm Precious Shipping.

The remains were reportedly found in the damaged area of the vessel M.V. Mayuree Naree, which Iranian news agency Tasnim News Agency previously reported had been struck by unknown projectiles.

The Royal Navy of Oman rescued 20 crew members after an explosion at the rear of the ship caused a fire in the engine room, but three people were reported missing.

Precious Shipping said it has not yet been confirmed how many people the remains belong to. 

Mayuree Naree

Thai bulk carrier 'Mayuree Naree' near the Strait of Hormuz after an attack (Image: ROYAL THAI NAVY/AFP via Getty Images)

Australia’s foreign minister accuses Iran of weaponising Strait of Hormuz

Penny Wong, Australia’s foreign minister and a Labor senator for South Australia, said Iran is using the Strait of Hormuz to put pressure on other countries, which is pushing up fuel and energy prices worldwide.

She said poorer communities are being hit hardest by the rising costs, and Australia has joined other countries in condemning Iran’s actions.

Finland President urges Iran ceasefire after talks with Pezeshkian

Finland's president, Alexander Stubb, said he had spoken with Iranian president Masoud Pezeshkian about the situation in Iran and the wider Middle East, stressing that an urgent ceasefire and diplomatic solution are needed.

He also called for an end to strikes on neighbouring countries and urged the restoration of freedom of navigation through the Strait of Hormuz, adding that dialogue must continue despite disagreements.

US Central Command dismisses IRGC's previous downed F-35 claim

US Central Command dismissed on Thursday a report that Iran’s Islamic Revolutionary Guard Corps claimed it had shot down an “enemy” fighter jet over Qeshm Island in the Strait of Hormuz.

It added all American fighter aircraft had been accounted for and accused the IRGC of making the same false claim multiple times before. 

Iran's IRGC claims to have shot down a second American F-35

A post on the IRGC-linked Fars News Agency Telegram channel said the aircraft was "completely destroyed and crashed", adding that there was no information on the pilot because of the extent of the damage.

Iran’s Mehr News Agency said it was unlikely the pilot had managed to eject due to the “violent explosion” caused by the crash.

This claim comes a day after a similar claim was made by Iranian media on Thursday. The US military denied yesterday's Iranian report. 

Saudi Arabia downs 6 drones

A spokesperson for Saudi Arabia's Ministry of Defence has confirmed it intercepted six drones in the past few hours.

Kuwait’s Mina Al-Ahmadi hit by drone strike

Kuwait’s Mina Al-Ahmadi refinery was hit by a drone strike in the early hours of this morning, sparking fires in several operating units.

According to the Kuwait News Agency (KUNA), emergency crews and firefighters quickly launched response plans to bring the fires under control and stop them spreading further.

The Mina Al-Ahmadi oil refinery in Kuwait

The Mina Al-Ahmadi oil refinery in Kuwait (Image: AP)

Donald Trump's most recent threat to Iran

Trump said the US military had "not even started" destroying what remains of Iran, warning that bridges and power plants could be targeted next unless Iran’s new leadership acts quickly.

UAE intercepting missiles

The National Emergency Crisis and Disasters Management Authority in the UAE has confirmed the UAE is currently intercepting missiles.

It urged residents to remain in a safe location and follow official channels for warnings and updates.

Welcome to our live blog

Welcome to our live blog. We'll bring you the latest on the conflict in Iran. 

Comments

Daily Express uses notifications to keep you updated

\ No newline at end of file From 70c67f2ed61e8c4a6d1219484fb73b6d865fb52b Mon Sep 17 00:00:00 2001 From: devnen Date: Sat, 4 Apr 2026 01:33:11 +0200 Subject: [PATCH 3/3] fix: prevent noise filter from swallowing content in malformed HTML MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes for content being stripped by the noise filter: 1. Remove
<form> from unconditional noise tags. ASP.NET and similar frameworks wrap entire pages in a <form> tag — these are not input forms. Forms with >500 chars of text are now treated as content wrappers, not noise. 2. Add safety valve for class/ID noise matching. When malformed HTML leaves a noise container unclosed (e.g.,
<div class="header"> missing its </div>
), the HTML5 parser makes all subsequent siblings into children of that container. A header/nav/footer with >5000 chars of text is almost certainly a broken wrapper absorbing real content — exempt it from noise filtering. --- crates/webclaw-core/src/extractor.rs | 53 ++++++++++++++++ crates/webclaw-core/src/noise.rs | 92 +++++++++++++++++++++++++--- 2 files changed, 138 insertions(+), 7 deletions(-) diff --git a/crates/webclaw-core/src/extractor.rs b/crates/webclaw-core/src/extractor.rs index a26055f..3efe9e0 100644 --- a/crates/webclaw-core/src/extractor.rs +++ b/crates/webclaw-core/src/extractor.rs @@ -1484,3 +1484,56 @@ mod tests { ); } } + +#[cfg(test)] +mod form_integration_tests { + use super::*; + + #[test] + fn aspnet_form_content_extraction() { + let content = "x".repeat(600); // Ensure >500 chars + let html = format!(r#" + +
+ +
+

Section

+

Question?

+

{content}

+
+
+
+ "#); + let doc = Html::parse_document(&html); + let opts = ExtractionOptions::default(); + let result = extract_content(&doc, None, &opts); + assert!(result.markdown.contains("Section"), "h2 missing from markdown"); + assert!(result.markdown.contains("Question"), "h3 missing from markdown"); + } + + /// Simulate unclosed header div absorbing the content div. + /// The header's noise class should NOT propagate to the absorbed content + /// because the safety valve detects the header has >5000 chars (broken wrapper). + #[test] + fn unclosed_header_div_does_not_swallow_content() { + let faq = "Lorem ipsum dolor sit amet. ".repeat(300); // ~8400 chars + // The header div is intentionally NOT closed — the HTML parser makes + // div.content a child of div.header. The safety valve (>5000 chars) + // should prevent div.header from being treated as noise. + let html = format!(r#" +
+
Logo +
+

FAQ Section

+

First question?

+

{faq}

+
+
+ "#); + let doc = Html::parse_document(&html); + let opts = ExtractionOptions::default(); + let result = extract_content(&doc, None, &opts); + assert!(result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content"); + assert!(result.markdown.contains("First question"), "h3 missing: header swallowed content"); + } +} diff --git a/crates/webclaw-core/src/noise.rs b/crates/webclaw-core/src/noise.rs index 46885f7..c9c9caf 100644 --- a/crates/webclaw-core/src/noise.rs +++ b/crates/webclaw-core/src/noise.rs @@ -7,9 +7,12 @@ use scraper::ElementRef; const NOISE_TAGS: &[&str] = &[ - "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "form", - "video", "audio", - "canvas", + "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video", + "audio", "canvas", + // NOTE:
removed from this list — ASP.NET and similar frameworks wrap the + // entire page body in a single tag that contains all real content. + // Forms are now handled with a heuristic in is_noise() that distinguishes + // small input forms (noise) from page-wrapping forms (not noise). // NOTE: removed — it's a responsive image container, not noise. // wraps and for responsive images. ]; @@ -189,6 +192,28 @@ pub fn is_noise(el: ElementRef<'_>) -> bool { return true; } + // heuristic: ASP.NET wraps the entire page body in a single . + // These page-wrapping forms contain hundreds of words of real content. + // Small forms (login, search, newsletter) are noise. + if tag == "form" { + let text_len = el.text().collect::().len(); + // A form with substantial text (>500 chars) is likely a page wrapper, not noise. + // Small forms (login/search/subscribe) rarely exceed a few hundred chars. + if text_len < 500 { + return true; + } + // Also check noise classes/IDs — a big form with class="login-form" is still noise + if let Some(class) = el.value().attr("class") { + let cl = class.to_lowercase(); + if cl.contains("login") || cl.contains("search") || cl.contains("subscribe") + || cl.contains("signup") || cl.contains("newsletter") || cl.contains("contact") + { + return true; + } + } + return false; + } + // ARIA role-based noise if let Some(role) = el.value().attr("role") && NOISE_ROLES.contains(&role) @@ -200,10 +225,12 @@ pub fn is_noise(el: ElementRef<'_>) -> bool { // check each against the noise list. "free-modal-container" splits into // ["free-modal-container"] which does NOT match "modal". if let Some(class) = el.value().attr("class") { + let mut class_matched = false; for token in class.split_whitespace() { let lower = token.to_lowercase(); if NOISE_CLASSES.contains(&lower.as_str()) { - return true; + class_matched = true; + break; } // Structural elements use compound names (FooterLinks, Header-nav, etc.) // These are always noise regardless of compound form. 
@@ -211,11 +238,24 @@ pub fn is_noise(el: ElementRef<'_>) -> bool { || lower.starts_with("header-") || lower.starts_with("nav-") { - return true; + class_matched = true; + break; } } - // Also check for ad-specific patterns (standalone "ad" class) - if is_ad_class(class) { + if !class_matched { + class_matched = is_ad_class(class); + } + + if class_matched { + // Safety valve: malformed HTML can leave noise containers unclosed, + // causing them to absorb the entire page content. A real header/nav/ + // footer rarely exceeds a few thousand characters of text. If a + // noise-class element has massive text content, it's almost certainly + // a broken wrapper — treat it as content, not noise. + let text_len = el.text().collect::().len(); + if text_len > 5000 { + return false; + } return true; } } @@ -224,6 +264,11 @@ pub fn is_noise(el: ElementRef<'_>) -> bool { if let Some(id) = el.value().attr("id") { let id_lower = id.to_lowercase(); if NOISE_IDS.contains(&id_lower.as_str()) && !is_structural_id(&id_lower) { + // Same safety valve for ID-matched noise elements + let text_len = el.text().collect::().len(); + if text_len > 5000 { + return false; + } return true; } // Cookie consent platform IDs (prefix match — these generate huge overlays) @@ -754,3 +799,36 @@ mod tests { )); } } + +#[cfg(test)] +mod form_tests { + use super::*; + use scraper::Html; + + #[test] + fn aspnet_page_wrapping_form_is_not_noise() { + let html = r#"

Support

Question one?

Long answer text that should definitely be captured by the extraction engine. This is real content with multiple sentences to ensure it passes any text length thresholds in the scoring algorithm. We need at least five hundred characters of actual text content here to exceed the threshold. Adding more sentences about various topics including data formats, historical prices, stock market analysis, technical indicators, and trading strategies. This paragraph discusses how intraday data can be used for backtesting quantitative models and developing automated trading systems.

Question two?

Another substantial answer paragraph with detailed information about the product features and capabilities.

"#; + let doc = Html::parse_document(html); + let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap(); + let text = form.text().collect::(); + let text_len = text.len(); + assert!(text_len >= 500, "Form text should be >= 500 chars, got {text_len}"); + assert!(!is_noise(form), "ASP.NET page-wrapping form should NOT be noise"); + } + + #[test] + fn small_login_form_is_noise() { + let html = r#" + +
+ + + +
+ + "#; + let doc = Html::parse_document(html); + let form = doc.select(&scraper::Selector::parse("form").unwrap()).next().unwrap(); + assert!(is_noise(form), "Small login form SHOULD be noise"); + } +}