/// HTML-to-markdown converter. /// Walks the DOM tree and emits clean markdown, resolving relative URLs /// against the provided base URL when available. use std::collections::HashSet; use ego_tree::NodeId; use once_cell::sync::Lazy; use scraper::node::Node; use scraper::{ElementRef, Selector}; use url::Url; use crate::noise; use crate::types::{CodeBlock, Image, Link}; static CODE_SELECTOR: Lazy = Lazy::new(|| Selector::parse("code").unwrap()); /// Collected assets found during conversion. pub struct ConvertedAssets { pub links: Vec, pub images: Vec, pub code_blocks: Vec, } /// Convert an element subtree to markdown + plain text. /// Elements whose NodeId is in `exclude` (and their descendants) are skipped. pub fn convert( element: ElementRef<'_>, base_url: Option<&Url>, exclude: &HashSet, ) -> (String, String, ConvertedAssets) { let mut assets = ConvertedAssets { links: Vec::new(), images: Vec::new(), code_blocks: Vec::new(), }; let md = node_to_md(element, base_url, &mut assets, 0, exclude); let plain = strip_markdown(&md); let md = collapse_whitespace(&md); let plain = collapse_whitespace(&plain); (md, plain, assets) } /// Recursive descent through the DOM, emitting markdown for each node. fn node_to_md( element: ElementRef<'_>, base_url: Option<&Url>, assets: &mut ConvertedAssets, list_depth: usize, exclude: &HashSet, ) -> String { if exclude.contains(&element.id()) { return String::new(); } if noise::is_noise(element) || noise::is_noise_descendant(element) { // Still collect images and links from noise elements — they're useful // metadata even though we don't include the noise text in markdown. // We strip noise text but preserve link/image references as metadata. 
collect_assets_from_noise(element, base_url, assets); return String::new(); } let tag = element.value().name(); match tag { // Headings "h1" => format!( "\n\n# {}\n\n", inline_text(element, base_url, assets, exclude) ), "h2" => format!( "\n\n## {}\n\n", inline_text(element, base_url, assets, exclude) ), "h3" => format!( "\n\n### {}\n\n", inline_text(element, base_url, assets, exclude) ), "h4" => format!( "\n\n#### {}\n\n", inline_text(element, base_url, assets, exclude) ), "h5" => format!( "\n\n##### {}\n\n", inline_text(element, base_url, assets, exclude) ), "h6" => format!( "\n\n###### {}\n\n", inline_text(element, base_url, assets, exclude) ), // Paragraph "p" => format!( "\n\n{}\n\n", inline_text(element, base_url, assets, exclude) ), // Links "a" => { let text = inline_text(element, base_url, assets, exclude); let href = element .value() .attr("href") .map(|h| resolve_url(h, base_url)) .unwrap_or_default(); if !text.is_empty() && !href.is_empty() { assets.links.push(Link { text: text.clone(), href: href.clone(), }); format!("[{text}]({href})") } else if !text.is_empty() { text } else { String::new() } } // Images — handle lazy loading (data-src), srcset, and skip base64/blob "img" => { let alt = element.value().attr("alt").unwrap_or("").to_string(); // Resolve src: prefer src, fall back to data-src (lazy loading), // then data-lazy-src, data-original (common lazy load patterns) let raw_src = element .value() .attr("src") .or_else(|| element.value().attr("data-src")) .or_else(|| element.value().attr("data-lazy-src")) .or_else(|| element.value().attr("data-original")) .unwrap_or(""); // Skip base64 data URIs and blob URLs (they bloat markdown) let src = if raw_src.starts_with("data:") || raw_src.starts_with("blob:") { String::new() } else { resolve_url(raw_src, base_url) }; // Try srcset for better resolution image let src = if src.is_empty() { // No src found, try srcset element .value() .attr("srcset") .and_then(pick_best_srcset) .map(|s| resolve_url(&s, 
base_url)) .unwrap_or_default() } else { src }; if !src.is_empty() { assets.images.push(Image { alt: alt.clone(), src: src.clone(), }); format!("![{alt}]({src})") } else { String::new() } } // Bold "strong" | "b" => format!("**{}**", inline_text(element, base_url, assets, exclude)), // Italic "em" | "i" => format!("*{}*", inline_text(element, base_url, assets, exclude)), // Inline code "code" => { // If parent is
, this is handled by the "pre" arm
            if is_inside_pre(element) {
                // Just return raw text — the pre handler wraps it
                collect_text(element)
            } else {
                let text = collect_text(element);
                if text.is_empty() {
                    String::new()
                } else {
                    format!("`{text}`")
                }
            }
        }

        // Fenced code blocks
        "pre" => {
            let code_el = element.select(&CODE_SELECTOR).next();
            let (code, lang) = if let Some(code_el) = code_el {
                // Try  class first, then fall back to  class
                let lang = code_el
                    .value()
                    .attr("class")
                    .and_then(extract_language_from_class)
                    .or_else(|| {
                        element
                            .value()
                            .attr("class")
                            .and_then(extract_language_from_class)
                    });
                (collect_preformatted_text(code_el), lang)
            } else {
                let lang = element
                    .value()
                    .attr("class")
                    .and_then(extract_language_from_class);
                (collect_preformatted_text(element), lang)
            };

            let code = code.trim_matches('\n').to_string();
            assets.code_blocks.push(CodeBlock {
                language: lang.clone(),
                code: code.clone(),
            });

            let fence_lang = lang.as_deref().unwrap_or("");
            format!("\n\n```{fence_lang}\n{code}\n```\n\n")
        }

        // Blockquote
        "blockquote" => {
            let inner = children_to_md(element, base_url, assets, list_depth, exclude);
            let quoted = inner
                .trim()
                .lines()
                .map(|line| format!("> {line}"))
                .collect::>()
                .join("\n");
            format!("\n\n{quoted}\n\n")
        }

        // Unordered list
        "ul" => {
            let items = list_items(element, base_url, assets, list_depth, false, exclude);
            format!("\n\n{items}\n\n")
        }

        // Ordered list
        "ol" => {
            let items = list_items(element, base_url, assets, list_depth, true, exclude);
            format!("\n\n{items}\n\n")
        }

        // List item — handled by ul/ol parent, but if encountered standalone:
        "li" => {
            let text = inline_text(element, base_url, assets, exclude);
            format!("- {text}\n")
        }

        // Horizontal rule
        "hr" => "\n\n---\n\n".to_string(),

        // Line break
        "br" => "\n".to_string(),

        // Table
        "table" => format!(
            "\n\n{}\n\n",
            table_to_md(element, base_url, assets, exclude)
        ),

        // Divs and other containers — just recurse
        _ => children_to_md(element, base_url, assets, list_depth, exclude),
    }
}

/// Collect markdown from all children of an element.
fn children_to_md(
    element: ElementRef<'_>,
    base_url: Option<&Url>,
    assets: &mut ConvertedAssets,
    list_depth: usize,
    exclude: &HashSet,
) -> String {
    let mut out = String::new();
    for child in element.children() {
        match child.value() {
            Node::Element(_) => {
                if let Some(child_el) = ElementRef::wrap(child) {
                    let chunk = node_to_md(child_el, base_url, assets, list_depth, exclude);
                    if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
                        out.push(' ');
                    }
                    out.push_str(&chunk);
                }
            }
            Node::Text(text) => {
                out.push_str(text);
            }
            _ => {}
        }
    }
    out
}

/// Collect inline text — walks children, converting inline elements to markdown.
/// This is for contexts where we want inline content (headings, paragraphs, links).
fn inline_text(
    element: ElementRef<'_>,
    base_url: Option<&Url>,
    assets: &mut ConvertedAssets,
    exclude: &HashSet,
) -> String {
    let mut out = String::new();
    for child in element.children() {
        match child.value() {
            Node::Element(_) => {
                if let Some(child_el) = ElementRef::wrap(child) {
                    let chunk = node_to_md(child_el, base_url, assets, 0, exclude);
                    if !chunk.is_empty() && !out.is_empty() && needs_separator(&out, &chunk) {
                        out.push(' ');
                    }
                    out.push_str(&chunk);
                }
            }
            Node::Text(text) => {
                out.push_str(text);
            }
            _ => {}
        }
    }
    // Collapse internal whitespace for inline content
    out.split_whitespace().collect::>().join(" ")
}

/// Decide whether a space must be inserted between two adjacent output chunks.
///
/// Returns `true` only when `left` ends with a non-whitespace byte and `right`
/// starts with a non-whitespace byte (ASCII whitespace only). An empty side
/// counts as whitespace, so the result is `false` if either chunk is empty.
fn needs_separator(left: &str, right: &str) -> bool {
    let left_is_solid = matches!(left.as_bytes().last(), Some(b) if !b.is_ascii_whitespace());
    let right_is_solid = matches!(right.as_bytes().first(), Some(b) if !b.is_ascii_whitespace());
    left_is_solid && right_is_solid
}

/// Collect raw text content (no markdown formatting).
fn collect_text(element: ElementRef<'_>) -> String {
    element.text().collect::()
}

/// Collect text from a preformatted element, preserving all whitespace.
/// Every text node is pushed verbatim -- no trimming, no collapsing.
/// Handles `
` as newlines and inserts newlines between block-level children
/// (e.g., `` lines produced by some syntax highlighters).
fn collect_preformatted_text(element: ElementRef<'_>) -> String {
    let mut out = String::new();
    for child in element.children() {
        match child.value() {
            Node::Text(text) => out.push_str(text),
            Node::Element(el) => {
                let tag = el.name.local.as_ref();
                if tag == "br" {
                    out.push('\n');
                } else if let Some(child_el) = ElementRef::wrap(child) {
                    if tag == "div" || tag == "p" {
                        if !out.is_empty() && !out.ends_with('\n') {
                            out.push('\n');
                        }
                        out.push_str(&collect_preformatted_text(child_el));
                        if !out.ends_with('\n') {
                            out.push('\n');
                        }
                    } else {
                        out.push_str(&collect_preformatted_text(child_el));
                    }
                }
            }
            _ => {}
        }
    }
    out
}

fn is_inside_pre(element: ElementRef<'_>) -> bool {
    let mut node = element.parent();
    while let Some(parent) = node {
        if let Some(el) = ElementRef::wrap(parent)
            && el.value().name() == "pre"
        {
            return true;
        }
        node = parent.parent();
    }
    false
}

fn list_items(
    list_el: ElementRef<'_>,
    base_url: Option<&Url>,
    assets: &mut ConvertedAssets,
    depth: usize,
    ordered: bool,
    exclude: &HashSet,
) -> String {
    let indent = "  ".repeat(depth);
    let mut out = String::new();
    let mut index = 1;

    for child in list_el.children() {
        if let Some(child_el) = ElementRef::wrap(child) {
            if exclude.contains(&child_el.id()) {
                continue;
            }
            let tag = child_el.value().name();
            if tag == "li" {
                let bullet = if ordered {
                    let b = format!("{index}.");
                    index += 1;
                    b
                } else {
                    "-".to_string()
                };

                // Separate nested lists from inline content
                let mut inline_parts = String::new();
                let mut nested_lists = String::new();

                for li_child in child_el.children() {
                    if let Some(li_child_el) = ElementRef::wrap(li_child) {
                        if exclude.contains(&li_child_el.id()) {
                            continue;
                        }
                        let child_tag = li_child_el.value().name();
                        if child_tag == "ul" || child_tag == "ol" {
                            nested_lists.push_str(&list_items(
                                li_child_el,
                                base_url,
                                assets,
                                depth + 1,
                                child_tag == "ol",
                                exclude,
                            ));
                        } else {
                            inline_parts.push_str(&node_to_md(
                                li_child_el,
                                base_url,
                                assets,
                                depth,
                                exclude,
                            ));
                        }
                    } else if let Some(text) = li_child.value().as_text() {
                        inline_parts.push_str(text);
                    }
                }

                let text = inline_parts
                    .split_whitespace()
                    .collect::>()
                    .join(" ");
                out.push_str(&format!("{indent}{bullet} {text}\n"));

                if !nested_lists.is_empty() {
                    out.push_str(&nested_lists);
                }
            }
        }
    }
    out.trim_end_matches('\n').to_string()
}

fn table_to_md(
    table_el: ElementRef<'_>,
    base_url: Option<&Url>,
    assets: &mut ConvertedAssets,
    exclude: &HashSet,
) -> String {
    let mut rows: Vec> = Vec::new();
    let mut has_header = false;

    // Collect rows from thead and tbody
    for child in table_el.descendants() {
        if let Some(el) = ElementRef::wrap(child) {
            if exclude.contains(&el.id()) {
                continue;
            }
            if el.value().name() == "tr" {
                let cells: Vec = el
                    .children()
                    .filter_map(ElementRef::wrap)
                    .filter(|c| {
                        !exclude.contains(&c.id())
                            && (c.value().name() == "th" || c.value().name() == "td")
                    })
                    .map(|c| {
                        if c.value().name() == "th" {
                            has_header = true;
                        }
                        inline_text(c, base_url, assets, exclude)
                    })
                    .collect();

                if !cells.is_empty() {
                    rows.push(cells);
                }
            }
        }
    }

    if rows.is_empty() {
        return String::new();
    }

    // Find max column count
    let cols = rows.iter().map(|r| r.len()).max().unwrap_or(0);
    if cols == 0 {
        return String::new();
    }

    // Normalize row lengths
    for row in &mut rows {
        while row.len() < cols {
            row.push(String::new());
        }
    }

    let mut out = String::new();

    // Header row
    let header = &rows[0];
    out.push_str("| ");
    out.push_str(&header.join(" | "));
    out.push_str(" |\n");

    // Separator
    out.push_str("| ");
    out.push_str(&(0..cols).map(|_| "---").collect::>().join(" | "));
    out.push_str(" |\n");

    // Data rows (skip first if it was a header)
    let start = if has_header { 1 } else { 0 };
    for row in &rows[start..] {
        out.push_str("| ");
        out.push_str(&row.join(" | "));
        out.push_str(" |\n");
    }

    out.trim_end().to_string()
}

/// Known language names to match as bare class values (e.g., `class="javascript"`).
/// `extract_language_from_class` also uses this list to validate the remainder
/// of `sp-`-prefixed class names. All entries are lowercase; lookups lowercase
/// the class token before comparing.
const KNOWN_LANGS: &[&str] = &[
    "javascript",
    "typescript",
    "python",
    "rust",
    "go",
    "java",
    "c",
    "cpp",
    "csharp",
    "ruby",
    "php",
    "swift",
    "kotlin",
    "scala",
    "shell",
    "bash",
    "zsh",
    "fish",
    "sql",
    "html",
    "css",
    "scss",
    "sass",
    "less",
    "json",
    "yaml",
    "yml",
    "toml",
    "xml",
    "markdown",
    "md",
    "jsx",
    "tsx",
    "vue",
    "svelte",
    "graphql",
    "protobuf",
    "dockerfile",
    "makefile",
    "lua",
    "perl",
    "r",
    "matlab",
    "haskell",
    "elixir",
    "erlang",
    "clojure",
    "dart",
    "zig",
    "nim",
    "wasm",
    "diff",
    "text",
    "plaintext",
    "console",
];

fn extract_language_from_class(class: &str) -> Option {
    for cls in class.split_whitespace() {
        // Standard prefixes: language-js, lang-python, highlight-rust
        for prefix in &["language-", "lang-", "highlight-"] {
            if let Some(lang) = cls.strip_prefix(prefix)
                && !lang.is_empty()
                && lang.len() < 20
            {
                return Some(normalize_lang(lang));
            }
        }
        // Sandpack prefix (sp-javascript, sp-python) — validate against known langs
        if let Some(lang) = cls.strip_prefix("sp-") {
            let lower = lang.to_lowercase();
            if KNOWN_LANGS.contains(&lower.as_str()) {
                return Some(normalize_lang(&lower));
            }
        }
        // Bare language name as class: class="javascript" or class="python"
        let lower = cls.to_lowercase();
        if KNOWN_LANGS.contains(&lower.as_str()) {
            return Some(normalize_lang(&lower));
        }
    }
    None
}

/// Map a language identifier to its canonical short form.
///
/// Matching is case-insensitive. Known aliases collapse to one spelling
/// (e.g. `javascript` → `js`, `py` → `python`, `zsh` → `bash`); any other
/// identifier is returned lowercased.
fn normalize_lang(lang: &str) -> String {
    let lowered = lang.to_lowercase();
    let canonical = match lowered.as_str() {
        "javascript" | "js" => "js",
        "typescript" | "ts" => "ts",
        "python" | "py" => "python",
        "csharp" | "cs" | "c#" => "csharp",
        "cpp" | "c++" => "cpp",
        "shell" | "bash" | "zsh" | "sh" => "bash",
        "yaml" | "yml" => "yaml",
        "markdown" | "md" => "markdown",
        "plaintext" | "text" => "text",
        // Not an alias we know: keep the lowercased input as-is.
        _ => return lowered,
    };
    canonical.to_string()
}

/// Pick the best (largest) image from an HTML srcset attribute.
/// srcset format: "url1 300w, url2 600w, url3 1200w" or "url1 1x, url2 2x"
///
/// Candidates are tokenized the way browsers do it: a URL runs up to the next
/// whitespace (so commas *inside* a URL are kept), and the descriptor runs up
/// to the next comma. A URL immediately followed by a comma has no descriptor.
///
/// The numeric part of the first descriptor is compared as a float, so
/// fractional densities such as `1.5x` rank correctly; a missing or
/// unparsable descriptor counts as 1. `data:` and `blob:` URLs are ignored.
/// On ties the earliest candidate wins. Returns `None` when no usable
/// candidate exists.
fn pick_best_srcset(srcset: &str) -> Option<String> {
    let mut best: Option<(f64, &str)> = None;
    let mut rest = srcset;

    loop {
        // Skip separators left over from the previous candidate.
        rest = rest.trim_start_matches(|c: char| c.is_whitespace() || c == ',');
        if rest.is_empty() {
            break;
        }

        // `rest` starts with a non-whitespace char here, so the URL is non-empty
        // and every iteration consumes input.
        let url_end = rest.find(char::is_whitespace).unwrap_or(rest.len());
        let (raw_url, after_url) = rest.split_at(url_end);

        let (url, descriptor) = if raw_url.ends_with(',') {
            // "a.jpg, b.jpg": the comma terminates a descriptor-less candidate.
            rest = after_url;
            (raw_url.trim_end_matches(','), None)
        } else {
            let desc_end = after_url.find(',').unwrap_or(after_url.len());
            let (desc, tail) = after_url.split_at(desc_end);
            rest = tail;
            (raw_url, desc.split_whitespace().next())
        };

        // Skip data URIs
        if url.starts_with("data:") || url.starts_with("blob:") {
            continue;
        }

        // Parse "300w", "2x" or "1.5x" by dropping the trailing unit.
        let size = descriptor
            .and_then(|d| {
                d.trim_end_matches(|c: char| !c.is_ascii_digit())
                    .parse::<f64>()
                    .ok()
            })
            .filter(|value| value.is_finite())
            .unwrap_or(1.0);

        let improves = match best {
            Some((top, _)) => size > top,
            None => true,
        };
        if improves {
            best = Some((size, url));
        }
    }

    best.map(|(_, url)| url.to_string())
}

/// Collect images and links from a noise element without adding text to markdown.
/// This preserves valuable metadata (links, images) from nav/header/footer
/// that would otherwise be completely lost.
fn collect_assets_from_noise(
    element: ElementRef<'_>,
    base_url: Option<&Url>,
    assets: &mut ConvertedAssets,
) {
    // Collect images with alt text
    for img in element.select(&Selector::parse("img[alt]").unwrap()) {
        let alt = img.value().attr("alt").unwrap_or("").to_string();
        let src = img
            .value()
            .attr("src")
            .map(|s| resolve_url(s, base_url))
            .unwrap_or_default();
        if !src.is_empty() && !alt.is_empty() {
            assets.images.push(Image { alt, src });
        }
    }

    // Collect links
    for link in element.select(&Selector::parse("a[href]").unwrap()) {
        let href = link
            .value()
            .attr("href")
            .map(|h| resolve_url(h, base_url))
            .unwrap_or_default();
        let text: String = link.text().collect::().trim().to_string();
        if !href.is_empty() && !text.is_empty() && href.starts_with("http") {
            assets.links.push(Link { text, href });
        }
    }
}

pub fn resolve_url(href: &str, base_url: Option<&Url>) -> String {
    // Absolute URLs pass through
    if href.starts_with("http://") || href.starts_with("https://") || href.starts_with("//") {
        return href.to_string();
    }

    // Try resolving against base
    if let Some(base) = base_url
        && let Ok(resolved) = base.join(href)
    {
        return resolved.to_string();
    }

    href.to_string()
}

/// Collapse excessive whitespace: max 2 consecutive newlines (i.e. at most one
/// blank line in a row), trim trailing whitespace from lines. Content inside
/// fenced code blocks (``` ... ```) keeps its blank lines and indentation;
/// only trailing whitespace is trimmed there. The final result is trimmed of
/// leading and trailing whitespace.
fn collapse_whitespace(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut blank_run = 0usize;
    let mut in_code_fence = false;

    for line in s.lines() {
        let trimmed = line.trim_end();

        // Detect code fence boundaries
        let is_fence = line.trim_start().starts_with("```");
        if is_fence {
            in_code_fence = !in_code_fence;
        }

        // Fence markers and fenced content are emitted line for line.
        if is_fence || in_code_fence {
            blank_run = 0;
            result.push_str(trimmed);
            result.push('\n');
            continue;
        }

        if trimmed.is_empty() {
            blank_run += 1;
            // The previous line already ended with '\n'; one more blank line
            // gives two consecutive newlines, which is the maximum allowed.
            if blank_run > 1 {
                continue;
            }
        } else {
            blank_run = 0;
        }

        result.push_str(trimmed);
        result.push('\n');
    }

    result.trim().to_string()
}

/// Crude markdown stripping for plain_text output.
///
/// Outside fenced code blocks, images and links are replaced by their
/// alt/link text, `**bold**`, `*italic*` and `` `code` `` markers are removed,
/// and leading heading markers (`#` … `######`) are dropped. Fence marker
/// lines (```` ``` ````) are removed, and the lines between them are copied
/// through untouched so that code such as `**kwargs` or `*ptr` is not
/// rewritten by the emphasis patterns.
fn strip_markdown(md: &str) -> String {
    use once_cell::sync::Lazy;
    use regex::Regex;

    static LINK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]*)\]\([^)]*\)").unwrap());
    static IMG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
    static BOLD_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\*\*([^*]+)\*\*").unwrap());
    static ITALIC_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\*([^*]+)\*").unwrap());
    static CODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"`([^`]+)`").unwrap());
    static HEADING_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?m)^#{1,6}\s+").unwrap());

    // Images must be handled before links: an image is a link with a `!` prefix.
    let strip_inline = |prose: &str| -> String {
        let s = IMG_RE.replace_all(prose, "$1");
        let s = LINK_RE.replace_all(&s, "$1");
        let s = BOLD_RE.replace_all(&s, "$1");
        let s = ITALIC_RE.replace_all(&s, "$1");
        let s = CODE_RE.replace_all(&s, "$1");
        HEADING_RE.replace_all(&s, "").into_owned()
    };

    let mut segments: Vec<String> = Vec::new();
    // Consecutive non-code lines are stripped together so that patterns
    // spanning several lines still match as they would on the whole input.
    let mut prose: Vec<&str> = Vec::new();
    let mut in_fence = false;

    for line in md.lines() {
        // Remove fenced code block markers
        if line.trim_start().starts_with("```") {
            if !in_fence && !prose.is_empty() {
                segments.push(strip_inline(&prose.join("\n")));
                prose.clear();
            }
            in_fence = !in_fence;
            continue;
        }

        if in_fence {
            segments.push(line.to_string());
        } else {
            prose.push(line);
        }
    }

    if !prose.is_empty() {
        segments.push(strip_inline(&prose.join("\n")));
    }

    segments.join("\n")
}

#[cfg(test)]
mod tests {
    use super::*;
    use scraper::Html;

    fn convert_html(html: &str, base: Option<&str>) -> (String, String, ConvertedAssets) {
        let doc = Html::parse_fragment(html);
        let root = doc.root_element();
        let base_url = base.and_then(|u| Url::parse(u).ok());
        convert(root, base_url.as_ref(), &HashSet::new())
    }

    #[test]
    fn headings() {
        let (md, _, _) = convert_html("Title", None);
        assert!(md.contains("# Title"));

        let (md, _, _) = convert_html("Sub", None);
        assert!(md.contains("### Sub"));
    }

    #[test]
    fn paragraphs_and_inline() {
        let (md, _, _) = convert_html(
            "Hello world and stuff",
            None,
        );
        assert!(md.contains("Hello **world** and *stuff*"));
    }

    #[test]
    fn links_collected() {
        let (md, _, assets) = convert_html(
            r#"Click here"#,
            None,
        );
        assert!(md.contains("[Click here](https://example.com)"));
        assert_eq!(assets.links.len(), 1);
        assert_eq!(assets.links[0].href, "https://example.com");
    }

    #[test]
    fn relative_url_resolution() {
        let (md, _, _) = convert_html(
            r#"About"#,
            Some("https://example.com/page"),
        );
        assert!(md.contains("[About](https://example.com/about)"));
    }

    #[test]
    fn images_collected() {
        let (md, _, assets) = convert_html(
            r#""#,
            None,
        );
        assert!(md.contains("![A photo](https://img.example.com/photo.jpg)"));
        assert_eq!(assets.images.len(), 1);
    }

    #[test]
    fn code_blocks() {
        let (md, _, assets) = convert_html(
            r#"fn main() {}"#,
            None,
        );
        assert!(md.contains("```rust"));
        assert!(md.contains("fn main() {}"));
        assert_eq!(assets.code_blocks.len(), 1);
        assert_eq!(assets.code_blocks[0].language.as_deref(), Some("rust"));
    }

    #[test]
    fn multiline_code_preserves_newlines() {
        let html = "function App() {\n  const [count, setCount] = useState(0);\n  return count;\n}";
        let (md, _, assets) = convert_html(html, None);
        assert!(md.contains("```js"), "missing language fence: {md}");
        assert!(
            md.contains("function App() {\n  const [count, setCount] = useState(0);"),
            "newlines collapsed in code block: {md}"
        );
        assert_eq!(assets.code_blocks.len(), 1);
        assert_eq!(assets.code_blocks[0].language.as_deref(), Some("js"));
    }

    #[test]
    fn multiline_code_with_br_tags() {
        let html = "function App() {
  const x = 1;
  return x;
}";
        let (md, _, _) = convert_html(html, None);
        assert!(md.contains("```js"), "missing language fence: {md}");
        assert!(
            md.contains("function App() {\n  const x = 1;\n  return x;\n}"),
            "br tags not converted to newlines in code block: {md}"
        );
    }

    #[test]
    fn multiline_code_with_div_lines() {
        let html = "def hello():
    print(\"hi\")";
        let (md, _, _) = convert_html(html, None);
        assert!(md.contains("```py"), "missing language fence: {md}");
        assert!(
            md.contains("def hello():\n"),
            "div-separated lines not preserved in code block: {md}"
        );
    }

    #[test]
    fn multiline_code_with_span_children() {
        let html = "function App() {\n  const [count, setCount] = useState(0);\n  return count;\n}";
        let (md, _, assets) = convert_html(html, None);
        assert!(md.contains("```js"), "missing language fence: {md}");
        assert!(
            md.contains("function App() {\n  const"),
            "newlines collapsed in highlighted code block: {md}"
        );
        assert_eq!(assets.code_blocks.len(), 1);
    }

    #[test]
    fn multiline_code_no_inline_markdown() {
        let html = "let **x** = *y*;\nlet a = b;";
        let (md, _, _) = convert_html(html, None);
        assert!(
            md.contains("let **x** = *y*;"),
            "code block content was processed for inline markdown: {md}"
        );
    }

    #[test]
    fn inline_code() {
        let (md, _, _) = convert_html("Use cargo build to compile", None);
        assert!(md.contains("`cargo build`"));
    }

    #[test]
    fn unordered_list() {
        let (md, _, _) = convert_html("Alpha
Beta", None);
        assert!(md.contains("- Alpha"));
        assert!(md.contains("- Beta"));
    }

    #[test]
    fn ordered_list() {
        let (md, _, _) = convert_html("First
Second", None);
        assert!(md.contains("1. First"));
        assert!(md.contains("2. Second"));
    }

    #[test]
    fn blockquote() {
        let (md, _, _) = convert_html("A wise quote", None);
        assert!(md.contains("> A wise quote"));
    }

    #[test]
    fn table() {
        let html = r##"
        
            Name Age
            Alice 30
        "##;
        let (md, _, _) = convert_html(html, None);
        assert!(md.contains("| Name | Age |"));
        assert!(md.contains("| --- | --- |"));
        assert!(md.contains("| Alice | 30 |"));
    }

    #[test]
    fn horizontal_rule() {
        let (md, _, _) = convert_html("Above
Below", None);
        assert!(md.contains("---"));
    }

    #[test]
    fn strips_to_plain_text() {
        let (_, plain, _) = convert_html(
            "Hello bold link",
            None,
        );
        assert!(plain.contains("Hello bold link"));
        assert!(!plain.contains("**"));
        assert!(!plain.contains("["));
    }

    #[test]
    fn nested_list() {
        let html = r##"
        
            Top
                
                    Nested
                
            
        "##;
        let (md, _, _) = convert_html(html, None);
        assert!(md.contains("- Top"));
        assert!(md.contains("  - Nested"));
    }

    // --- Noise stripping tests ---

    #[test]
    fn strips_nav_sidebar_from_content() {
        let html = r##"
        
            
                
                    Home
                    About
                    Contact
                
            
            
                Related Articles
                Other article
            
            
                Main Article Title
                This is the actual content that readers care about.
            
        "##;
        let (md, plain, _) = convert_html(html, None);

        assert!(md.contains("Main Article Title"));
        assert!(md.contains("actual content"));
        assert!(!md.contains("Home"), "nav link 'Home' leaked into output");
        assert!(!md.contains("About"), "nav link 'About' leaked into output");
        assert!(
            !md.contains("Related Articles"),
            "sidebar heading leaked into output"
        );
        assert!(
            !plain.contains("Other article"),
            "sidebar link leaked into plain text"
        );
    }

    #[test]
    fn strips_script_content() {
        let html = r##"
        
            Real content here.
            
            
            More real content.
        "##;
        let (md, _, _) = convert_html(html, None);

        assert!(md.contains("Real content here"));
        assert!(md.contains("More real content"));
        assert!(!md.contains("React"), "script variable leaked into output");
        assert!(
            !md.contains("NEXT_DATA"),
            "React hydration data leaked into output"
        );
        assert!(!md.contains("console.log"), "JS code leaked into output");
        assert!(
            !md.contains(r#""key""#),
            "JSON script content leaked into output"
        );
    }

    /// A `<style>` element precedes the paragraph; its CSS rules must not show
    /// up in the converted markdown.
    #[test]
    fn strips_style_content() {
        let html = r##"
        <div>
            <style>.article { font-size: 16px; margin: 0 auto; }</style>
            <p>Styled paragraph content.</p>
        </div>"##;
        let (md, _, _) = convert_html(html, None);

        assert!(md.contains("Styled paragraph content"));
        assert!(!md.contains("font-size"), "CSS leaked into output");
        assert!(!md.contains("margin"), "CSS leaked into output");
    }

    /// The `<footer>` (copyright line plus a nested link list) must be dropped
    /// while the body paragraph before it is kept.
    #[test]
    fn strips_footer_content() {
        let html = r##"
        <div>
            <p>Article body text with important information.</p>
            <footer>
                <p>Copyright 2025 Example Corp. All rights reserved.</p>
                <nav>
                    <a href="/privacy">Privacy Policy</a>
                    <a href="/terms">Terms of Service</a>
                </nav>
            </footer>
        </div>"##;
        let (md, _, _) = convert_html(html, None);

        assert!(md.contains("Article body text"));
        assert!(!md.contains("Copyright"), "footer text leaked into output");
        assert!(
            !md.contains("Privacy Policy"),
            "footer nav leaked into output"
        );
    }

    /// Plain `<div>`s carry ARIA landmark roles so that only the `role`
    /// attribute (not the tag name) can be what marks them as noise.
    #[test]
    fn strips_by_role_attribute() {
        let html = r##"
        <div>
            <div role="navigation"><a href="/">Home</a><a href="/docs">Docs</a></div>
            <div role="banner"><h1>Site Banner</h1></div>
            <div role="main">
                <p>The main content lives here.</p>
            </div>
            <div role="complementary"><p>Sidebar widget</p></div>
            <div role="contentinfo"><p>Footer info</p></div>
        </div>"##;
        let (md, _, _) = convert_html(html, None);

        assert!(md.contains("main content lives here"));
        assert!(!md.contains("Site Banner"), "banner role leaked");
        assert!(!md.contains("Sidebar widget"), "complementary role leaked");
        assert!(!md.contains("Footer info"), "contentinfo role leaked");
        assert!(!md.contains("Docs"), "navigation role leaked");
    }

    /// Containers whose class is exactly a noise token are dropped; the bare
    /// paragraph next to them is kept.
    #[test]
    fn strips_by_class_patterns() {
        // Uses exact class token matching.
        // "cookie" matches class="cookie", not class="cookie-banner".
        let html = r##"
        <div>
            <div class="cookie"><p>We use cookies</p></div>
            <div class="social"><a href="/share">Share on Twitter</a></div>
            <div class="sidebar"><p>Sidebar content here</p></div>
            <div class="modal"><p>Subscribe to newsletter</p></div>
            <p>This is the real article content.</p>
        </div>"##;
        let (md, _, _) = convert_html(html, None);

        assert!(md.contains("real article content"));
        assert!(!md.contains("cookies"), "cookie class leaked");
        assert!(!md.contains("Twitter"), "social class leaked");
        assert!(!md.contains("Sidebar content"), "sidebar class leaked");
        assert!(!md.contains("Subscribe"), "modal class leaked");
    }

    /// Counterpart to the exact-token class test: class names that merely
    /// contain a noise word as part of a longer token must be kept.
    #[test]
    fn compound_classes_not_noise() {
        // Compound class names should NOT trigger noise filter.
        // "free-modal-container" is Vice.com's content wrapper, not a modal.
        let html = r##"
        <div>
            <div class="free-modal-container"><p>Vice article content here</p></div>
            <div class="social-share"><a href="/share">Share link</a></div>
            <div class="cookie-banner"><p>Cookie notice</p></div>
            <p>Main content.</p>
        </div>"##;
        let (md, _, _) = convert_html(html, None);

        assert!(
            md.contains("Vice article content"),
            "compound modal class should not be noise"
        );
        assert!(
            md.contains("Share link"),
            "social-share should not be noise"
        );
        assert!(
            md.contains("Cookie notice"),
            "cookie-banner should not be noise"
        );
    }

    /// Containers identified only by their `id` are dropped when the id is a
    /// noise token; the plain paragraph is kept.
    #[test]
    fn strips_by_id_patterns() {
        // Exact ID matching — "sidebar" matches, "sidebar-left" does NOT.
        // NOTE(review): the ids below assume the noise filter's id list
        // contains "sidebar" and "cookie" as exact tokens — confirm against
        // the `noise` module.
        let html = r##"
        <div>
            <div id="sidebar"><p>Sidebar content</p></div>
            <div id="nav"><a href="/">Home</a></div>
            <div id="cookie"><p>Accept cookies?</p></div>
            <p>Article text that matters.</p>
        </div>"##;
        let (md, _, _) = convert_html(html, None);

        assert!(md.contains("Article text that matters"));
        assert!(!md.contains("Sidebar content"), "sidebar id leaked");
        assert!(!md.contains("Accept cookies"), "cookie id leaked");
    }

    /// A noise-free document exercising heading, emphasis, link, fenced code
    /// and blockquote conversion, plus the collected link/code-block assets.
    #[test]
    fn preserves_content_with_no_noise() {
        let html = r##"
        <div>
            <h1>Clean Article</h1>
            <p>First paragraph with <strong>bold</strong> and <em>italic</em>.</p>
            <p>Second paragraph with a <a href="https://example.com">link</a>.</p>
            <pre><code class="language-python">print("hello")</code></pre>
            <blockquote><p>A great quote.</p></blockquote>
        </div>"##;
        let (md, _, assets) = convert_html(html, None);

        assert!(md.contains("# Clean Article"));
        assert!(md.contains("**bold**"));
        assert!(md.contains("*italic*"));
        assert!(md.contains("[link](https://example.com)"));
        assert!(md.contains("```python"));
        assert!(md.contains("> A great quote."));
        // Exactly one <a> and one <pre><code> in the fixture.
        assert_eq!(assets.links.len(), 1);
        assert_eq!(assets.code_blocks.len(), 1);
    }

    /// Class names that only contain "ad" as a substring ("reading-time",
    /// "loading-indicator") must not be treated as advertising noise.
    #[test]
    fn ad_class_does_not_false_positive() {
        // "ad" as substring in "read", "loading", "load" should NOT be stripped
        let html = r##"
        <div>
            <div class="reading-time"><p>5 min read</p></div>
            <div class="loading-indicator"><p>Loading content</p></div>
            <p>Main text.</p>
        </div>"##;
        let (md, _, _) = convert_html(html, None);

        assert!(
            md.contains("5 min read"),
            "reading-time was incorrectly stripped"
        );
        assert!(
            md.contains("Loading content"),
            "loading-indicator was incorrectly stripped"
        );
        assert!(md.contains("Main text"));
    }

    // --- Adjacent inline element spacing tests ---

    /// Three `<button>`s with no whitespace between them must not have their
    /// labels fused into one word in the output.
    #[test]
    fn adjacent_buttons_get_separated() {
        let html =
            r#"<div><button>search</button><button>extract</button><button>crawl</button></div>"#;
        let (md, _, _) = convert_html(html, None);
        assert!(
            !md.contains("searchextract"),
            "adjacent buttons mashed: {md}"
        );
        assert!(
            !md.contains("extractcrawl"),
            "adjacent buttons mashed: {md}"
        );
    }

    /// Two back-to-back `<a>` elements must be rendered with something between
    /// the closing `)` of the first link and the opening `[` of the second.
    #[test]
    fn adjacent_links_get_separated() {
        // The first href deliberately ends in "expert" so that a mashed
        // rendering would contain the literal `expert)[` checked below.
        let html = r#"<div><a href="/expert">Talk to an expert</a><a href="/try">Try it out</a></div>"#;
        let (md, _, _) = convert_html(html, None);
        assert!(
            !md.contains("expert)["),
            "adjacent links should have space: {md}"
        );
    }

    /// Two `<span>`s with no whitespace between them must not have their text
    /// concatenated into a single word.
    #[test]
    fn adjacent_spans_get_separated() {
        let html = r#"<div><span>Hello</span><span>World</span></div>"#;
        let (md, _, _) = convert_html(html, None);
        assert!(!md.contains("HelloWorld"), "adjacent spans mashed: {md}");
    }

    /// Same separation requirement as the top-level adjacent-link test, but
    /// for links nested inside a paragraph.
    #[test]
    fn inline_text_with_adjacent_elements() {
        // Inside a <p>, adjacent inline elements should also be separated
        let html = r#"<p><a href="/one">One</a><a href="/two">Two</a><a href="/three">Three</a></p>"#;
        let (md, _, _) = convert_html(html, None);
        // Mashed markdown links look like `[One](/one)[Two](/two)`, i.e. the
        // telltale sequence is `)[` — `)(` can never occur and would make
        // this check vacuous.
        assert!(
            !md.contains(")["),
            "adjacent links in paragraph mashed: {md}"
        );
    }

    /// When the source already separates two inline elements with a space,
    /// the converter must not add a second one.
    #[test]
    fn no_extra_space_when_whitespace_exists() {
        // When HTML already has whitespace, don't double-space
        let html = r#"<div><a href="/one">One</a> <a href="/two">Two</a></div>"#;
        let (md, _, _) = convert_html(html, None);
        assert!(!md.contains("  "), "double space introduced: {md}");
    }

    // --- Code block indentation tests ---
    // Syntax highlighters (Prism.js, Shiki, highlight.js) wrap tokens in <span>
    // elements. Leading whitespace (indentation) appears as text nodes between
    // these spans. collect_preformatted_text must preserve all whitespace verbatim,
    // and collapse_whitespace must not strip leading spaces inside fenced code blocks.

    /// Prism-style highlighted code: every token is its own `<span>` and the
    /// line indentation lives in bare text nodes between those spans. The
    /// fenced output must keep that indentation and report the `js` language.
    #[test]
    fn syntax_highlighted_code_preserves_indentation() {
        // Mimics React docs Prism.js output where each token is a <span>
        // and indentation is a text node between closing/opening spans.
        let html = r#"<pre><code class="language-js"><span class="token keyword">function</span> <span class="token function">MyComponent</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token punctuation">{</span>
  <span class="token keyword">const</span> <span class="token punctuation">[</span>age<span class="token punctuation">,</span> setAge<span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token function">useState</span><span class="token punctuation">(</span><span class="token number">28</span><span class="token punctuation">)</span><span class="token punctuation">;</span>
<span class="token punctuation">}</span></code></pre>"#;

        let (md, _, assets) = convert_html(html, None);

        assert!(md.contains("```js"), "missing language fence: {md}");
        assert!(
            md.contains("function MyComponent() {"),
            "first line wrong: {md}"
        );
        assert!(
            md.contains("  const [age, setAge] = useState(28);"),
            "indentation not preserved in syntax-highlighted code: {md}"
        );
        assert!(md.contains("\n}"), "closing brace missing: {md}");
        assert_eq!(assets.code_blocks.len(), 1);
        assert_eq!(assets.code_blocks[0].language.as_deref(), Some("js"));
    }

    /// Shiki-style highlighted code: each source line is wrapped in its own
    /// line span and the indentation is a text node inside that span.
    #[test]
    fn shiki_line_spans_preserve_indentation() {
        // Shiki wraps each line in <span class="line">, indentation is a text
        // node inside the line span.
        let html = concat!(
            r#"<pre class="shiki"><code>"#,
            r#"<span class="line"><span>function</span><span> foo() {</span></span>"#,
            "\n",
            r#"<span class="line">  <span>return</span><span> 1;</span></span>"#,
            "\n",
            r#"<span class="line"><span>}</span></span>"#,
            r#"</code></pre>"#,
        );
        let (md, _, _) = convert_html(html, None);
        assert!(
            md.contains("  return 1;"),
            "Shiki-style indentation lost: {md}"
        );
    }

    /// Un-highlighted `<pre><code>` with two nesting levels: both the 4-space
    /// and the 8-space indents must survive conversion.
    #[test]
    fn deep_indentation_preserved_in_code() {
        // Multiple nesting levels -- 4-space indentation
        let html = concat!(
            "<pre><code>",
            "def outer():\n",
            "    def inner():\n",
            "        return 42\n",
            "    return inner",
            "</code></pre>"
        );
        let (md, _, _) = convert_html(html, None);
        assert!(md.contains("    def inner():"), "4-space indent lost: {md}");
        assert!(
            md.contains("        return 42"),
            "8-space indent lost: {md}"
        );
    }

    /// A tab-indented line inside `<pre><code>` must keep its leading tab.
    #[test]
    fn tab_indentation_preserved_in_code() {
        let html = "<pre><code>if (x) {\n\treturn;\n}</code></pre>";
        let (md, _, _) = convert_html(html, None);
        assert!(md.contains("\treturn;"), "tab indentation lost: {md}");
    }

    /// Runs `collapse_whitespace` directly on markdown containing a fenced
    /// `js` block and checks that each indented line inside the fence is
    /// still present verbatim in the result.
    #[test]
    fn collapse_whitespace_skips_code_fences() {
        // No HTML involved: the fence is handed to collapse_whitespace as-is.
        let fenced_markdown = "text\n\n```js\nfunction foo() {\n  const x = 1;\n    if (true) {\n      return;\n    }\n}\n```\n\nmore text";
        let output = collapse_whitespace(fenced_markdown);
        let survived = |line: &str| -> bool { output.contains(line) };

        assert!(
            survived("  const x = 1;"),
            "collapse_whitespace stripped 2-space indent: {output}"
        );
        assert!(
            survived("    if (true) {"),
            "collapse_whitespace stripped 4-space indent: {output}"
        );
        assert!(
            survived("      return;"),
            "collapse_whitespace stripped 6-space indent: {output}"
        );
    }
}