From 95a6681b026ba42327b81a06af2643618ba6fb49 Mon Sep 17 00:00:00 2001 From: devnen Date: Fri, 3 Apr 2026 22:24:35 +0200 Subject: [PATCH] fix: detect layout tables and render as sections instead of markdown tables Sites like Drudge Report use for page layout, not data. Each cell contains extensive block-level content (divs, hrs, paragraphs, links). Previously, table_to_md() called inline_text() on every cell, collapsing all whitespace and flattening block elements into a single unreadable line. Changes: - Add cell_has_block_content() heuristic: scans for block-level descendants (p, div, hr, ul, ol, h1-h6, etc.) to distinguish layout vs data tables - Layout tables render each cell as a standalone section separated by blank lines, using children_to_md() to preserve block structure - Data tables (no block elements in cells) keep existing markdown table format - Bold/italic tags containing block elements are treated as containers instead of wrapping in **/**/* (fixes Drudge's ... column wrappers that contain the entire column content) - Add tests for layout tables with paragraphs and with links --- crates/webclaw-core/src/markdown.rs | 131 +++++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 10 deletions(-) diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index 91f2c6e..d9ab56e 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -163,11 +163,24 @@ fn node_to_md( } } - // Bold - "strong" | "b" => format!("**{}**", inline_text(element, base_url, assets, exclude)), + // Bold — if it contains block elements (e.g., Drudge wraps entire columns + // in ), treat as a container instead of inline bold. + "strong" | "b" => { + if cell_has_block_content(element) { + children_to_md(element, base_url, assets, list_depth, exclude) + } else { + format!("**{}**", inline_text(element, base_url, assets, exclude)) + } + } - // Italic - "em" | "i" => format!("*{}*", inline_text(element, base_url, assets, exclude)), + // Italic — same block-content check as bold. + "em" | "i" => { + if cell_has_block_content(element) { + children_to_md(element, base_url, assets, list_depth, exclude) + } else { + format!("*{}*", inline_text(element, base_url, assets, exclude)) + } + } // Inline code "code" => { @@ -460,23 +473,41 @@ fn list_items( out.trim_end_matches('\n').to_string() } +/// Check whether a table cell contains block-level elements, indicating a layout +/// table rather than a data table. +fn cell_has_block_content(cell: ElementRef<'_>) -> bool { + const BLOCK_TAGS: &[&str] = &[ + "p", "div", "ul", "ol", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "pre", + "table", "section", "article", "header", "footer", "nav", "aside", + ]; + for desc in cell.descendants() { + if let Some(el) = ElementRef::wrap(desc) { + if BLOCK_TAGS.contains(&el.value().name()) { + return true; + } + } + } + false +} + fn table_to_md( table_el: ElementRef<'_>, base_url: Option<&Url>, assets: &mut ConvertedAssets, exclude: &HashSet, ) -> String { - let mut rows: Vec> = Vec::new(); + // Collect all
/ cells grouped by row, and detect layout tables + let mut raw_rows: Vec>> = Vec::new(); let mut has_header = false; + let mut is_layout = false; - // Collect rows from thead and tbody for child in table_el.descendants() { if let Some(el) = ElementRef::wrap(child) { if exclude.contains(&el.id()) { continue; } if el.value().name() == "tr" { - let cells: Vec = el + let cells: Vec> = el .children() .filter_map(ElementRef::wrap) .filter(|c| { @@ -487,21 +518,53 @@ fn table_to_md( if c.value().name() == "th" { has_header = true; } - inline_text(c, base_url, assets, exclude) + if !is_layout && cell_has_block_content(c) { + is_layout = true; + } + c }) .collect(); if !cells.is_empty() { - rows.push(cells); + raw_rows.push(cells); } } } } - if rows.is_empty() { + if raw_rows.is_empty() { return String::new(); } + // Layout table: render each cell as a standalone block section + if is_layout { + let mut out = String::new(); + for row in &raw_rows { + for cell in row { + let content = + children_to_md(*cell, base_url, assets, 0, exclude); + let content = content.trim(); + if !content.is_empty() { + if !out.is_empty() { + out.push_str("\n\n"); + } + out.push_str(content); + } + } + } + return out; + } + + // Data table: render as markdown table + let mut rows: Vec> = raw_rows + .iter() + .map(|row| { + row.iter() + .map(|c| inline_text(*c, base_url, assets, exclude)) + .collect() + }) + .collect(); + // Find max column count let cols = rows.iter().map(|r| r.len()).max().unwrap_or(0); if cols == 0 { @@ -995,6 +1058,54 @@ mod tests { assert!(md.contains("| Alice | 30 |")); } + #[test] + fn layout_table() { + // Layout tables (cells with block elements) should render as sections, not markdown tables + let html = r##" + + + + + +
+

Column one first paragraph

+

Column one second paragraph

+
+

Column two content

+
+

Column two after rule

+
"##; + let (md, _, _) = convert_html(html, None); + // Should NOT produce markdown table syntax + assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}"); + // Should contain the content as separate blocks + assert!(md.contains("Column one first paragraph"), "missing content: {md}"); + assert!(md.contains("Column two content"), "missing content: {md}"); + assert!(md.contains("Column two after rule"), "missing content: {md}"); + } + + #[test] + fn layout_table_with_links() { + // Drudge-style layout: cells full of links and divs + let html = r##" + + + + + +
+ + + + +
"##; + let (md, _, _) = convert_html(html, None); + assert!(!md.contains("| "), "layout table should not use pipe syntax: {md}"); + assert!(md.contains("[Headline One](https://example.com/1)"), "missing link: {md}"); + assert!(md.contains("[Headline Two](https://example.com/2)"), "missing link: {md}"); + assert!(md.contains("[Headline Three](https://example.com/3)"), "missing link: {md}"); + } + #[test] fn horizontal_rule() { let (md, _, _) = convert_html("

Above


Below

", None);