Main Article Title
This is the actual content that readers care about.
, this is handled by the "pre" arm
if is_inside_pre(element) {
// Just return raw text — the pre handler wraps it
collect_text(element)
} else {
let text = collect_text(element);
if text.is_empty() {
String::new()
} else {
format!("`{text}`")
}
}
}
// Fenced code blocks
"pre" => {
let code_el = element.select(&CODE_SELECTOR).next();
let (code, lang) = if let Some(code_el) = code_el {
// Try class first, then fall back to class
let lang = code_el
.value()
.attr("class")
.and_then(extract_language_from_class)
.or_else(|| {
element
.value()
.attr("class")
.and_then(extract_language_from_class)
});
(collect_preformatted_text(code_el), lang)
} else {
let lang = element
.value()
.attr("class")
.and_then(extract_language_from_class);
(collect_preformatted_text(element), lang)
};
let code = code.trim_matches('\n').to_string();
assets.code_blocks.push(CodeBlock {
language: lang.clone(),
code: code.clone(),
});
let fence_lang = lang.as_deref().unwrap_or("");
format!("\n\n```{fence_lang}\n{code}\n```\n\n")
}
// Blockquote
"blockquote" => {
let inner = children_to_md(element, base_url, assets, list_depth, exclude);
let quoted = inner
.trim()
.lines()
.map(|line| format!("> {line}"))
.collect::>()
.join("\n");
format!("\n\n{quoted}\n\n")
}
// Unordered list
"ul" => {
let items = list_items(element, base_url, assets, list_depth, false, exclude);
format!("\n\n{items}\n\n")
}
// Ordered list
"ol" => {
let items = list_items(element, base_url, assets, list_depth, true, exclude);
format!("\n\n{items}\n\n")
}
// List item — handled by ul/ol parent, but if encountered standalone:
"li" => {
let text = inline_text(element, base_url, assets, exclude);
format!("- {text}\n")
}
// Horizontal rule
"hr" => "\n\n---\n\n".to_string(),
// Line break
"br" => "\n".to_string(),
// Table
"table" => format!(
"\n\n{}\n\n",
table_to_md(element, base_url, assets, exclude)
),
// Divs and other containers — just recurse
_ => children_to_md(element, base_url, assets, list_depth, exclude),
}
}
/// Render every child node of `element` and concatenate the results.
///
/// Text nodes are appended verbatim. Element nodes are converted with
/// `node_to_md` (passing `list_depth` through unchanged); a single space is
/// inserted in front of an element's output when `needs_separator` reports that
/// it would otherwise be glued onto the text accumulated so far. All other
/// node kinds contribute nothing.
fn children_to_md(
    element: ElementRef<'_>,
    base_url: Option<&Url>,
    assets: &mut ConvertedAssets,
    list_depth: usize,
    exclude: &HashSet,
) -> String {
    let mut markdown = String::new();
    for node in element.children() {
        let child_el = match node.value() {
            Node::Text(text) => {
                markdown.push_str(text);
                continue;
            }
            Node::Element(_) => match ElementRef::wrap(node) {
                Some(el) => el,
                None => continue,
            },
            _ => continue,
        };

        let rendered = node_to_md(child_el, base_url, assets, list_depth, exclude);
        let glued = !rendered.is_empty()
            && !markdown.is_empty()
            && needs_separator(&markdown, &rendered);
        if glued {
            markdown.push(' ');
        }
        markdown.push_str(&rendered);
    }
    markdown
}
/// Collect inline text — walks children, converting inline elements to markdown.
/// This is for contexts where we want inline content (headings, paragraphs, links).
fn inline_text(
element: ElementRef<'_>,
base_url: Option<&Url>,
assets: &mut ConvertedAssets,
exclude: &HashSet,
) -> String {
let mut out = String::new();
for child in element.children() {
match child.value() {
Node::Element(_) => {
if let Some(child_el) = ElementRef::wrap(child) {
let chunk = node_to_md(child_el, base_url, assets, 0, exclude);
if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
out.push(' ');
}
out.push_str(&chunk);
}
}
Node::Text(text) => {
out.push_str(text);
}
_ => {}
}
}
// Collapse internal whitespace for inline content
out.split_whitespace().collect::>().join(" ")
}
/// Decide whether a space must be inserted between two adjacent output chunks.
///
/// Returns `true` only when `left` ends with a non-whitespace byte and `right`
/// starts with a non-whitespace byte, i.e. when concatenating them would mash
/// two words together. An empty chunk on either side is treated as whitespace,
/// so the answer is `false`. Only ASCII whitespace is recognised.
fn needs_separator(left: &str, right: &str) -> bool {
    let left_is_solid = left
        .bytes()
        .next_back()
        .is_some_and(|byte| !byte.is_ascii_whitespace());
    let right_is_solid = right
        .bytes()
        .next()
        .is_some_and(|byte| !byte.is_ascii_whitespace());
    left_is_solid && right_is_solid
}
/// Collect raw text content (no markdown formatting).
fn collect_text(element: ElementRef<'_>) -> String {
element.text().collect::()
}
/// Collect text from a preformatted element, preserving all whitespace.
/// Every text node is pushed verbatim -- no trimming, no collapsing.
/// Handles `
` as newlines and inserts newlines between block-level children
/// (e.g., `` lines produced by some syntax highlighters).
fn collect_preformatted_text(element: ElementRef<'_>) -> String {
let mut out = String::new();
for child in element.children() {
match child.value() {
Node::Text(text) => out.push_str(text),
Node::Element(el) => {
let tag = el.name.local.as_ref();
if tag == "br" {
out.push('\n');
} else if let Some(child_el) = ElementRef::wrap(child) {
if tag == "div" || tag == "p" {
if !out.is_empty() && !out.ends_with('\n') {
out.push('\n');
}
out.push_str(&collect_preformatted_text(child_el));
if !out.ends_with('\n') {
out.push('\n');
}
} else {
out.push_str(&collect_preformatted_text(child_el));
}
}
}
_ => {}
}
}
out
}
/// Report whether any ancestor of `element` is a `pre` element.
///
/// The element itself is not inspected, only its parent chain.
fn is_inside_pre(element: ElementRef<'_>) -> bool {
    element
        .ancestors()
        .filter_map(ElementRef::wrap)
        .any(|ancestor| ancestor.value().name() == "pre")
}
/// Render the `li` children of a list element as markdown list lines.
///
/// * `depth` — how many times the indent string is repeated in front of each
///   line produced at this level; nested lists are rendered at `depth + 1`.
/// * `ordered` — when true, items are numbered `1.`, `2.`, …; otherwise each
///   item uses `-`.
/// * `exclude` — node ids to skip, checked on the list's direct children and on
///   each item's direct children.
///
/// Non-element children of the list and elements other than `li` are ignored.
/// Returns the lines joined by `\n`, with trailing newlines removed.
fn list_items(
list_el: ElementRef<'_>,
base_url: Option<&Url>,
assets: &mut ConvertedAssets,
depth: usize,
ordered: bool,
exclude: &HashSet,
) -> String {
let indent = " ".repeat(depth);
let mut out = String::new();
// Running number for ordered lists; only advanced when an `li` is emitted.
let mut index = 1;
for child in list_el.children() {
if let Some(child_el) = ElementRef::wrap(child) {
if exclude.contains(&child_el.id()) {
continue;
}
let tag = child_el.value().name();
if tag == "li" {
let bullet = if ordered {
let b = format!("{index}.");
index += 1;
b
} else {
"-".to_string()
};
// Separate nested lists from inline content
let mut inline_parts = String::new();
let mut nested_lists = String::new();
for li_child in child_el.children() {
if let Some(li_child_el) = ElementRef::wrap(li_child) {
if exclude.contains(&li_child_el.id()) {
continue;
}
let child_tag = li_child_el.value().name();
if child_tag == "ul" || child_tag == "ol" {
// Nested list: rendered one level deeper and emitted after this item's line.
nested_lists.push_str(&list_items(
li_child_el,
base_url,
assets,
depth + 1,
child_tag == "ol",
exclude,
));
} else {
// Any other element is converted at the current depth and
// appended directly; no separator is inserted between chunks here.
inline_parts.push_str(&node_to_md(
li_child_el,
base_url,
assets,
depth,
exclude,
));
}
} else if let Some(text) = li_child.value().as_text() {
inline_parts.push_str(text);
}
}
// Collapse all whitespace runs so the item fits on a single line.
let text = inline_parts
.split_whitespace()
.collect::>()
.join(" ");
out.push_str(&format!("{indent}{bullet} {text}\n"));
if !nested_lists.is_empty() {
out.push_str(&nested_lists);
}
}
}
}
out.trim_end_matches('\n').to_string()
}
fn table_to_md(
table_el: ElementRef<'_>,
base_url: Option<&Url>,
assets: &mut ConvertedAssets,
exclude: &HashSet,
) -> String {
let mut rows: Vec> = Vec::new();
let mut has_header = false;
// Collect rows from thead and tbody
for child in table_el.descendants() {
if let Some(el) = ElementRef::wrap(child) {
if exclude.contains(&el.id()) {
continue;
}
if el.value().name() == "tr" {
let cells: Vec = el
.children()
.filter_map(ElementRef::wrap)
.filter(|c| {
!exclude.contains(&c.id())
&& (c.value().name() == "th" || c.value().name() == "td")
})
.map(|c| {
if c.value().name() == "th" {
has_header = true;
}
inline_text(c, base_url, assets, exclude)
})
.collect();
if !cells.is_empty() {
rows.push(cells);
}
}
}
}
if rows.is_empty() {
return String::new();
}
// Find max column count
let cols = rows.iter().map(|r| r.len()).max().unwrap_or(0);
if cols == 0 {
return String::new();
}
// Normalize row lengths
for row in &mut rows {
while row.len() < cols {
row.push(String::new());
}
}
let mut out = String::new();
// Header row
let header = &rows[0];
out.push_str("| ");
out.push_str(&header.join(" | "));
out.push_str(" |\n");
// Separator
out.push_str("| ");
out.push_str(&(0..cols).map(|_| "---").collect::>().join(" | "));
out.push_str(" |\n");
// Data rows (skip first if it was a header)
let start = if has_header { 1 } else { 0 };
for row in &rows[start..] {
out.push_str("| ");
out.push_str(&row.join(" | "));
out.push_str(" |\n");
}
out.trim_end().to_string()
}
/// Known language names to match as bare class values (e.g., `class="javascript"`).
const KNOWN_LANGS: &[&str] = &[
    "javascript",
    "typescript",
    "python",
    "rust",
    "go",
    "java",
    "c",
    "cpp",
    "csharp",
    "ruby",
    "php",
    "swift",
    "kotlin",
    "scala",
    "shell",
    "bash",
    "zsh",
    "fish",
    "sql",
    "html",
    "css",
    "scss",
    "sass",
    "less",
    "json",
    "yaml",
    "yml",
    "toml",
    "xml",
    "markdown",
    "md",
    "jsx",
    "tsx",
    "vue",
    "svelte",
    "graphql",
    "protobuf",
    "dockerfile",
    "makefile",
    "lua",
    "perl",
    "r",
    "matlab",
    "haskell",
    "elixir",
    "erlang",
    "clojure",
    "dart",
    "zig",
    "nim",
    "wasm",
    "diff",
    "text",
    "plaintext",
    "console",
];

/// Extract language hint from code element class (e.g., "language-rust", "lang-js", "highlight-python").
///
/// Class tokens are examined left to right and the first token that yields a
/// language wins. For each token, in order:
///
/// 1. a `language-`, `lang-` or `highlight-` prefix followed by 1–19 bytes is
///    accepted as-is (case-sensitive prefix, no validation of the name);
/// 2. otherwise the token — minus a leading `sp-` (Sandpack) if present — is
///    lower-cased and accepted only if it appears in `KNOWN_LANGS`.
///
/// The result is passed through `normalize_lang`. Returns `None` when no token
/// matches.
fn extract_language_from_class(class: &str) -> Option<String> {
    const PREFIXES: [&str; 3] = ["language-", "lang-", "highlight-"];

    for cls in class.split_whitespace() {
        // Standard prefixes: language-js, lang-python, highlight-rust
        let prefixed = PREFIXES
            .iter()
            .filter_map(|prefix| cls.strip_prefix(*prefix))
            .find(|lang| !lang.is_empty() && lang.len() < 20);
        if let Some(lang) = prefixed {
            return Some(normalize_lang(lang));
        }

        // Sandpack prefix (sp-javascript, sp-python) and bare names
        // (class="javascript") are both validated against the known list.
        // No known language starts with "sp-", so stripping the prefix first
        // never hides a bare match.
        let lower = cls.strip_prefix("sp-").unwrap_or(cls).to_lowercase();
        if KNOWN_LANGS.contains(&lower.as_str()) {
            return Some(normalize_lang(&lower));
        }
    }
    None
}

/// Normalize language identifiers to common short forms.
///
/// Matching is case-insensitive; identifiers without a mapping are returned
/// lower-cased.
fn normalize_lang(lang: &str) -> String {
    let lowered = lang.to_lowercase();
    let canonical = match lowered.as_str() {
        "javascript" | "js" => "js",
        "typescript" | "ts" => "ts",
        "python" | "py" => "python",
        "csharp" | "cs" | "c#" => "csharp",
        "cpp" | "c++" => "cpp",
        "shell" | "bash" | "zsh" | "sh" => "bash",
        "yaml" | "yml" => "yaml",
        "markdown" | "md" => "markdown",
        "plaintext" | "text" => "text",
        other => other,
    };
    canonical.to_string()
}
/// Pick the best (largest) image from an HTML srcset attribute.
/// srcset format: "url1 300w, url2 600w, url3 1200w" or "url1 1x, url2 2x"
///
/// Candidates are tokenised the way browsers do it: a URL is a run of
/// non-whitespace characters, so commas *inside* a URL (data URIs, CDN
/// transformation paths) do not split it; only a comma at the very end of the
/// URL, or the first comma after the URL's descriptor, ends a candidate.
///
/// The numeric part of the first descriptor (`300w` → 300, `1.5x` → 1.5) ranks
/// the candidate; a missing or unparsable descriptor counts as 1. `data:` and
/// `blob:` URLs are skipped. On ties the earliest candidate wins. Returns
/// `None` when no candidate has a magnitude above zero.
fn pick_best_srcset(srcset: &str) -> Option<String> {
    /// Numeric prefix of a descriptor such as "300w" or "1.5x"; 1 when absent or invalid.
    fn descriptor_magnitude(descriptor: &str) -> f64 {
        descriptor
            .trim_end_matches(|c: char| !c.is_ascii_digit())
            .parse::<f64>()
            .ok()
            .filter(|value| value.is_finite() && *value >= 0.0)
            .unwrap_or(1.0)
    }

    let mut best_url: Option<&str> = None;
    let mut best_size = 0.0_f64;
    let mut rest = srcset;

    loop {
        rest = rest.trim_start_matches(|c: char| c.is_whitespace() || c == ',');
        if rest.is_empty() {
            break;
        }

        let url_end = rest.find(char::is_whitespace).unwrap_or(rest.len());
        let (raw_url, tail) = rest.split_at(url_end);
        let url = raw_url.trim_end_matches(',');

        let descriptor = if url.len() < raw_url.len() {
            // "url," — the trailing comma closes the candidate; no descriptor.
            rest = tail;
            ""
        } else {
            let descriptor_end = tail.find(',').unwrap_or(tail.len());
            let (descriptor, remaining) = tail.split_at(descriptor_end);
            rest = remaining;
            descriptor
        };

        // Skip data URIs
        if url.starts_with("data:") || url.starts_with("blob:") {
            continue;
        }

        let size = descriptor
            .split_whitespace()
            .next()
            .map_or(1.0, descriptor_magnitude);
        if size > best_size {
            best_size = size;
            best_url = Some(url);
        }
    }

    best_url.map(str::to_owned)
}
/// Collect images and links from a noise element without adding text to markdown.
/// This preserves valuable metadata (links, images) from nav/header/footer
/// that would otherwise be completely lost.
fn collect_assets_from_noise(
element: ElementRef<'_>,
base_url: Option<&Url>,
assets: &mut ConvertedAssets,
) {
// Collect images with alt text
for img in element.select(&Selector::parse("img[alt]").unwrap()) {
let alt = img.value().attr("alt").unwrap_or("").to_string();
let src = img
.value()
.attr("src")
.map(|s| resolve_url(s, base_url))
.unwrap_or_default();
if !src.is_empty() && !alt.is_empty() {
assets.images.push(Image { alt, src });
}
}
// Collect links
for link in element.select(&Selector::parse("a[href]").unwrap()) {
let href = link
.value()
.attr("href")
.map(|h| resolve_url(h, base_url))
.unwrap_or_default();
let text: String = link.text().collect::().trim().to_string();
if !href.is_empty() && !text.is_empty() && href.starts_with("http") {
assets.links.push(Link { text, href });
}
}
}
pub fn resolve_url(href: &str, base_url: Option<&Url>) -> String {
// Absolute URLs pass through
if href.starts_with("http://") || href.starts_with("https://") || href.starts_with("//") {
return href.to_string();
}
// Try resolving against base
if let Some(base) = base_url
&& let Ok(resolved) = base.join(href)
{
return resolved.to_string();
}
href.to_string()
}
/// Collapse excessive whitespace: max 2 consecutive newlines (i.e. at most one
/// blank line in a row), trim trailing whitespace from lines. Content inside
/// fenced code blocks (``` ... ```) is passed through with only trailing
/// whitespace removed, so indentation and blank lines in code are preserved.
///
/// A fence boundary is any line whose first non-whitespace characters are
/// "```". The final result has leading and trailing whitespace trimmed.
fn collapse_whitespace(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut blank_run = 0usize;
    let mut in_code_fence = false;

    for line in s.lines() {
        let trimmed = line.trim_end();

        // Detect code fence boundaries
        let is_fence = line.trim_start().starts_with("```");
        if is_fence {
            in_code_fence = !in_code_fence;
        }

        // Fence markers and everything between them are emitted as-is.
        if is_fence || in_code_fence {
            blank_run = 0;
            result.push_str(trimmed);
            result.push('\n');
            continue;
        }

        if trimmed.is_empty() {
            blank_run += 1;
            // Only the first blank line of a run survives.
            if blank_run > 1 {
                continue;
            }
        } else {
            blank_run = 0;
        }
        result.push_str(trimmed);
        result.push('\n');
    }

    result.trim().to_string()
}
/// Crude markdown stripping for plain_text output.
///
/// Outside fenced code blocks, images and links are reduced to their
/// alt/link text, `**bold**`, `*italic*` and `` `code` `` markers are removed,
/// and leading `#` heading markers are dropped. Lines that open or close a
/// fence (first non-whitespace characters are "```") are removed, and the lines
/// between them are copied through untouched so that code is not mangled by
/// the inline patterns.
fn strip_markdown(md: &str) -> String {
    use once_cell::sync::Lazy;
    use regex::Regex;

    static LINK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]*)\]\([^)]*\)").unwrap());
    static IMG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
    static BOLD_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\*\*([^*]+)\*\*").unwrap());
    static ITALIC_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\*([^*]+)\*").unwrap());
    static CODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"`([^`]+)`").unwrap());
    static HEADING_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?m)^#{1,6}\s+").unwrap());

    // Order matters: images before links (an image is a link with a leading
    // `!`), bold before italic (`**` would otherwise match as two `*`).
    let strip_inline = |segment: &str| -> String {
        let s = IMG_RE.replace_all(segment, "$1");
        let s = LINK_RE.replace_all(&s, "$1");
        let s = BOLD_RE.replace_all(&s, "$1");
        let s = ITALIC_RE.replace_all(&s, "$1");
        let s = CODE_RE.replace_all(&s, "$1");
        let s = HEADING_RE.replace_all(&s, "");
        s.into_owned()
    };

    let mut pieces: Vec<String> = Vec::new();
    // Consecutive non-code lines are stripped together so patterns can still
    // match across a line break, as they did when the whole text was processed.
    let mut prose: Vec<&str> = Vec::new();
    let mut in_fence = false;

    for line in md.lines() {
        if line.trim_start().starts_with("```") {
            // Remove fenced code block markers
            if !prose.is_empty() {
                pieces.push(strip_inline(&prose.join("\n")));
                prose.clear();
            }
            in_fence = !in_fence;
        } else if in_fence {
            pieces.push(line.to_string());
        } else {
            prose.push(line);
        }
    }
    if !prose.is_empty() {
        pieces.push(strip_inline(&prose.join("\n")));
    }

    pieces.join("\n")
}
// Unit tests for the HTML-to-markdown conversion: each test feeds an HTML
// fragment through `convert` and checks the markdown, plain text, or collected
// assets.
//
// NOTE(review): many fixtures below contain no markup even though the
// assertions expect markup-derived output (e.g. `convert_html("Title\n", None)`
// is expected to contain "# Title"). The HTML tags appear to have been stripped
// from the string literals in this copy — confirm against the original file
// before relying on these tests.
#[cfg(test)]
mod tests {
use super::*;
use scraper::Html;
/// Parse `html` as a fragment and run `convert` on its root element with an
/// empty exclusion set. `base`, when given and parseable, is used as the base
/// URL. Returns whatever `convert` returns; the tests bind the tuple as
/// (markdown, plain text, assets).
fn convert_html(html: &str, base: Option<&str>) -> (String, String, ConvertedAssets) {
let doc = Html::parse_fragment(html);
let root = doc.root_element();
let base_url = base.and_then(|u| Url::parse(u).ok());
convert(root, base_url.as_ref(), &HashSet::new())
}
#[test]
fn headings() {
let (md, _, _) = convert_html("Title
", None);
assert!(md.contains("# Title"));
let (md, _, _) = convert_html("Sub
", None);
assert!(md.contains("### Sub"));
}
#[test]
fn paragraphs_and_inline() {
let (md, _, _) = convert_html(
"Hello world and stuff
",
None,
);
assert!(md.contains("Hello **world** and *stuff*"));
}
#[test]
fn links_collected() {
let (md, _, assets) = convert_html(
r#""#,
None,
);
assert!(md.contains("[Click here](https://example.com)"));
assert_eq!(assets.links.len(), 1);
assert_eq!(assets.links[0].href, "https://example.com");
}
#[test]
fn relative_url_resolution() {
let (md, _, _) = convert_html(
r#"About"#,
Some("https://example.com/page"),
);
assert!(md.contains("[About](https://example.com/about)"));
}
#[test]
fn images_collected() {
let (md, _, assets) = convert_html(
r#"
"#,
None,
);
assert!(md.contains(""));
assert_eq!(assets.images.len(), 1);
}
#[test]
fn code_blocks() {
let (md, _, assets) = convert_html(
r#"fn main() {}
"#,
None,
);
assert!(md.contains("```rust"));
assert!(md.contains("fn main() {}"));
assert_eq!(assets.code_blocks.len(), 1);
assert_eq!(assets.code_blocks[0].language.as_deref(), Some("rust"));
}
#[test]
fn multiline_code_preserves_newlines() {
let html = "function App() {\n const [count, setCount] = useState(0);\n return count;\n}
";
let (md, _, assets) = convert_html(html, None);
assert!(md.contains("```js"), "missing language fence: {md}");
assert!(
md.contains("function App() {\n const [count, setCount] = useState(0);"),
"newlines collapsed in code block: {md}"
);
assert_eq!(assets.code_blocks.len(), 1);
assert_eq!(assets.code_blocks[0].language.as_deref(), Some("js"));
}
#[test]
fn multiline_code_with_br_tags() {
let html = "function App() {
const x = 1;
return x;
}
";
let (md, _, _) = convert_html(html, None);
assert!(md.contains("```js"), "missing language fence: {md}");
assert!(
md.contains("function App() {\n const x = 1;\n return x;\n}"),
"br tags not converted to newlines in code block: {md}"
);
}
#[test]
fn multiline_code_with_div_lines() {
let html = "def hello(): print(\"hi\")
";
let (md, _, _) = convert_html(html, None);
assert!(md.contains("```py"), "missing language fence: {md}");
assert!(
md.contains("def hello():\n"),
"div-separated lines not preserved in code block: {md}"
);
}
#[test]
fn multiline_code_with_span_children() {
let html = "function App() {\n const [count, setCount] = useState(0);\n return count;\n}
";
let (md, _, assets) = convert_html(html, None);
assert!(md.contains("```js"), "missing language fence: {md}");
assert!(
md.contains("function App() {\n const"),
"newlines collapsed in highlighted code block: {md}"
);
assert_eq!(assets.code_blocks.len(), 1);
}
#[test]
fn multiline_code_no_inline_markdown() {
let html = "let **x** = *y*;\nlet a = b;
";
let (md, _, _) = convert_html(html, None);
assert!(
md.contains("let **x** = *y*;"),
"code block content was processed for inline markdown: {md}"
);
}
#[test]
fn inline_code() {
let (md, _, _) = convert_html("Use cargo build to compile
", None);
assert!(md.contains("`cargo build`"));
}
#[test]
fn unordered_list() {
let (md, _, _) = convert_html("- Alpha
- Beta
", None);
assert!(md.contains("- Alpha"));
assert!(md.contains("- Beta"));
}
#[test]
fn ordered_list() {
let (md, _, _) = convert_html("- First
- Second
", None);
assert!(md.contains("1. First"));
assert!(md.contains("2. Second"));
}
#[test]
fn blockquote() {
let (md, _, _) = convert_html("A wise quote
", None);
assert!(md.contains("> A wise quote"));
}
#[test]
fn table() {
let html = r##"
Name Age
Alice 30
"##;
let (md, _, _) = convert_html(html, None);
assert!(md.contains("| Name | Age |"));
assert!(md.contains("| --- | --- |"));
assert!(md.contains("| Alice | 30 |"));
}
#[test]
fn horizontal_rule() {
let (md, _, _) = convert_html("Above
Below
", None);
assert!(md.contains("---"));
}
#[test]
fn strips_to_plain_text() {
let (_, plain, _) = convert_html(
"Hello bold link
",
None,
);
assert!(plain.contains("Hello bold link"));
assert!(!plain.contains("**"));
assert!(!plain.contains("["));
}
#[test]
fn nested_list() {
let html = r##"
- Top
- Nested
"##;
let (md, _, _) = convert_html(html, None);
assert!(md.contains("- Top"));
assert!(md.contains(" - Nested"));
}
// --- Noise stripping tests ---
#[test]
fn strips_nav_sidebar_from_content() {
let html = r##"
Main Article Title
This is the actual content that readers care about.
"##;
let (md, plain, _) = convert_html(html, None);
assert!(md.contains("Main Article Title"));
assert!(md.contains("actual content"));
assert!(!md.contains("Home"), "nav link 'Home' leaked into output");
assert!(!md.contains("About"), "nav link 'About' leaked into output");
assert!(
!md.contains("Related Articles"),
"sidebar heading leaked into output"
);
assert!(
!plain.contains("Other article"),
"sidebar link leaked into plain text"
);
}
#[test]
fn strips_script_content() {
let html = r##"
Real content here.
More real content.
"##;
let (md, _, _) = convert_html(html, None);
assert!(md.contains("Real content here"));
assert!(md.contains("More real content"));
assert!(!md.contains("React"), "script variable leaked into output");
assert!(
!md.contains("NEXT_DATA"),
"React hydration data leaked into output"
);
assert!(!md.contains("console.log"), "JS code leaked into output");
assert!(
!md.contains(r#""key""#),
"JSON script content leaked into output"
);
}
#[test]
fn strips_style_content() {
let html = r##"
Styled paragraph content.
"##;
let (md, _, _) = convert_html(html, None);
assert!(md.contains("Styled paragraph content"));
assert!(!md.contains("font-size"), "CSS leaked into output");
assert!(!md.contains("margin"), "CSS leaked into output");
}
#[test]
fn strips_footer_content() {
let html = r##"
Article body text with important information.
"##;
let (md, _, _) = convert_html(html, None);
assert!(md.contains("Article body text"));
assert!(!md.contains("Copyright"), "footer text leaked into output");
assert!(
!md.contains("Privacy Policy"),
"footer nav leaked into output"
);
}
#[test]
fn strips_by_role_attribute() {
let html = r##"
Site Banner
The main content lives here.
Sidebar widget
Footer info
"##;
let (md, _, _) = convert_html(html, None);
assert!(md.contains("main content lives here"));
assert!(!md.contains("Site Banner"), "banner role leaked");
assert!(!md.contains("Sidebar widget"), "complementary role leaked");
assert!(!md.contains("Footer info"), "contentinfo role leaked");
assert!(!md.contains("Docs"), "navigation role leaked");
}
#[test]
fn strips_by_class_patterns() {
// Uses exact class token matching.
// "cookie" matches class="cookie", not class="cookie-banner".
let html = r##"
Subscribe to newsletter
This is the real article content.
"##;
let (md, _, _) = convert_html(html, None);
assert!(md.contains("real article content"));
assert!(!md.contains("cookies"), "cookie class leaked");
assert!(!md.contains("Twitter"), "social class leaked");
assert!(!md.contains("Sidebar content"), "sidebar class leaked");
assert!(!md.contains("Subscribe"), "modal class leaked");
}
#[test]
fn compound_classes_not_noise() {
// Compound class names should NOT trigger noise filter.
// "free-modal-container" is Vice.com's content wrapper, not a modal.
let html = r##"
Vice article content here
Main content.
"##;
let (md, _, _) = convert_html(html, None);
assert!(
md.contains("Vice article content"),
"compound modal class should not be noise"
);
assert!(
md.contains("Share link"),
"social-share should not be noise"
);
assert!(
md.contains("Cookie notice"),
"cookie-banner should not be noise"
);
}
#[test]
fn strips_by_id_patterns() {
// Exact ID matching — "sidebar" matches, "sidebar-left" does NOT.
let html = r##"
Article text that matters.
"##;
let (md, _, _) = convert_html(html, None);
assert!(md.contains("Article text that matters"));
assert!(!md.contains("Sidebar content"), "sidebar id leaked");
assert!(!md.contains("Accept cookies"), "cookie id leaked");
}
#[test]
fn preserves_content_with_no_noise() {
let html = r##"
Clean Article
First paragraph with bold and italic.
Second paragraph with a link.
print("hello")
A great quote.
"##;
let (md, _, assets) = convert_html(html, None);
assert!(md.contains("# Clean Article"));
assert!(md.contains("**bold**"));
assert!(md.contains("*italic*"));
assert!(md.contains("[link](https://example.com)"));
assert!(md.contains("```python"));
assert!(md.contains("> A great quote."));
assert_eq!(assets.links.len(), 1);
assert_eq!(assets.code_blocks.len(), 1);
}
#[test]
fn ad_class_does_not_false_positive() {
// "ad" as substring in "read", "loading", "load" should NOT be stripped
let html = r##"
5 min read
Loading content
Main text.
"##;
let (md, _, _) = convert_html(html, None);
assert!(
md.contains("5 min read"),
"reading-time was incorrectly stripped"
);
assert!(
md.contains("Loading content"),
"loading-indicator was incorrectly stripped"
);
assert!(md.contains("Main text"));
}
// --- Adjacent inline element spacing tests ---
#[test]
fn adjacent_buttons_get_separated() {
let html =
r#""#;
let (md, _, _) = convert_html(html, None);
assert!(
!md.contains("searchextract"),
"adjacent buttons mashed: {md}"
);
assert!(
!md.contains("extractcrawl"),
"adjacent buttons mashed: {md}"
);
}
#[test]
fn adjacent_links_get_separated() {
let html = r#""#;
let (md, _, _) = convert_html(html, None);
assert!(
!md.contains("expert)["),
"adjacent links should have space: {md}"
);
}
#[test]
fn adjacent_spans_get_separated() {
let html = r#"HelloWorld"#;
let (md, _, _) = convert_html(html, None);
assert!(!md.contains("HelloWorld"), "adjacent spans mashed: {md}");
}
#[test]
fn inline_text_with_adjacent_elements() {
// Inside a paragraph, adjacent inline elements should also be separated
let html = r#"
"#;
let (md, _, _) = convert_html(html, None);
assert!(
!md.contains(")("),
"adjacent links in paragraph mashed: {md}"
);
}
#[test]
fn no_extra_space_when_whitespace_exists() {
// When HTML already has whitespace, don't double-space
let html = r#""#;
let (md, _, _) = convert_html(html, None);
assert!(!md.contains(" "), "double space introduced: {md}");
}
// --- Code block indentation tests ---
// Syntax highlighters (Prism.js, Shiki, highlight.js) wrap tokens in
// span elements. Leading whitespace (indentation) appears as text nodes between
// these spans. collect_preformatted_text must preserve all whitespace verbatim,
// and collapse_whitespace must not strip leading spaces inside fenced code blocks.
#[test]
fn syntax_highlighted_code_preserves_indentation() {
// Mimics React docs Prism.js output where each token is a span
// and indentation is a text node between closing/opening spans.
let html = r#"function MyComponent() {
const [age, setAge] = useState(28);
}
"#;
let (md, _, assets) = convert_html(html, None);
assert!(md.contains("```js"), "missing language fence: {md}");
assert!(
md.contains("function MyComponent() {"),
"first line wrong: {md}"
);
assert!(
md.contains(" const [age, setAge] = useState(28);"),
"indentation not preserved in syntax-highlighted code: {md}"
);
assert!(md.contains("\n}"), "closing brace missing: {md}");
assert_eq!(assets.code_blocks.len(), 1);
assert_eq!(assets.code_blocks[0].language.as_deref(), Some("js"));
}
#[test]
fn shiki_line_spans_preserve_indentation() {
// Shiki wraps each line in a line span; indentation is a text
// node inside the line span.
let html = concat!(
r#""#,
r#"function foo() {"#,
"\n",
r#" return 1;"#,
"\n",
r#"}"#,
r#"
"#,
);
let (md, _, _) = convert_html(html, None);
assert!(
md.contains(" return 1;"),
"Shiki-style indentation lost: {md}"
);
}
#[test]
fn deep_indentation_preserved_in_code() {
// Multiple nesting levels -- 4-space indentation
let html = concat!(
"",
"def outer():\n",
" def inner():\n",
" return 42\n",
" return inner",
"
"
);
let (md, _, _) = convert_html(html, None);
assert!(md.contains(" def inner():"), "4-space indent lost: {md}");
assert!(
md.contains(" return 42"),
"8-space indent lost: {md}"
);
}
#[test]
fn tab_indentation_preserved_in_code() {
let html = "if (x) {\n\treturn;\n}
";
let (md, _, _) = convert_html(html, None);
assert!(md.contains("\treturn;"), "tab indentation lost: {md}");
}
#[test]
fn collapse_whitespace_skips_code_fences() {
// Directly test that collapse_whitespace bypasses code block content
let input = "text\n\n```js\nfunction foo() {\n const x = 1;\n if (true) {\n return;\n }\n}\n```\n\nmore text";
let output = collapse_whitespace(input);
assert!(
output.contains(" const x = 1;"),
"collapse_whitespace stripped 2-space indent: {output}"
);
assert!(
output.contains(" if (true) {"),
"collapse_whitespace stripped 4-space indent: {output}"
);
assert!(
output.contains(" return;"),
"collapse_whitespace stripped 6-space indent: {output}"
);
}
}