/// Readability-style content extraction. /// Strips noise (nav, ads, sidebars), scores remaining nodes by text density /// and structural signals, then converts the best candidate to markdown. use std::collections::HashSet; use ego_tree::NodeId; use once_cell::sync::Lazy; use scraper::{ElementRef, Html, Selector}; use tracing::{debug, warn}; use url::Url; use crate::markdown; use crate::noise; use crate::types::{Content, ExtractionOptions, Link}; static CANDIDATE_SELECTOR: Lazy = Lazy::new(|| Selector::parse("article, main, [role='main'], div, section, td").unwrap()); static BODY_SELECTOR: Lazy = Lazy::new(|| Selector::parse("body").unwrap()); static H1_SELECTOR: Lazy = Lazy::new(|| Selector::parse("h1").unwrap()); static H2_SELECTOR: Lazy = Lazy::new(|| Selector::parse("h2").unwrap()); static P_SELECTOR: Lazy = Lazy::new(|| Selector::parse("p").unwrap()); static A_SELECTOR: Lazy = Lazy::new(|| Selector::parse("a").unwrap()); static ANNOUNCEMENT_SELECTOR: Lazy = Lazy::new(|| Selector::parse("[role='region'][aria-label]").unwrap()); static FOOTER_SELECTOR: Lazy = Lazy::new(|| Selector::parse("footer").unwrap()); static FOOTER_HEADING_SELECTOR: Lazy = Lazy::new(|| Selector::parse("h2, h3, h4, h5, h6").unwrap()); /// Selector for only_main_content: article, main, [role="main"] static MAIN_CONTENT_SELECTOR: Lazy = Lazy::new(|| Selector::parse("article, main, [role='main']").unwrap()); const MAX_SELECTORS: usize = 100; /// Build a HashSet of NodeIds to exclude based on CSS selector strings. /// Invalid selectors are skipped with a warning. 
fn build_exclude_set(doc: &Html, selectors: &[String]) -> HashSet { if selectors.len() > MAX_SELECTORS { warn!( "too many CSS selectors ({}, max {}), truncating", selectors.len(), MAX_SELECTORS ); } let mut exclude = HashSet::new(); for selector_str in selectors.iter().take(MAX_SELECTORS) { let Ok(selector) = Selector::parse(selector_str) else { warn!( selector = selector_str.as_str(), "invalid CSS selector, skipping" ); continue; }; for el in doc.select(&selector) { // Add the element itself and all descendants exclude.insert(el.id()); for descendant in el.descendants() { if let Some(child_el) = ElementRef::wrap(descendant) { exclude.insert(child_el.id()); } } } } exclude } /// Parse CSS selector strings into Selectors, skipping invalid ones. fn parse_selectors(strings: &[String]) -> Vec { strings .iter() .filter_map(|s| { Selector::parse(s) .map_err(|_| warn!(selector = s.as_str(), "invalid CSS selector, skipping")) .ok() }) .collect() } /// Extract the main content from a parsed HTML document with options. 
pub fn extract_content(doc: &Html, base_url: Option<&Url>, options: &ExtractionOptions) -> Content { let exclude = build_exclude_set(doc, &options.exclude_selectors); // Path 1: Include selectors — skip scoring, extract only matching elements if !options.include_selectors.is_empty() { return extract_with_include(doc, base_url, &options.include_selectors, &exclude, options); } // Path 2: only_main_content — pick first article/main/[role="main"] if options.only_main_content { if let Some(main_el) = doc.select(&MAIN_CONTENT_SELECTOR).next() { debug!( tag = main_el.value().name(), "only_main_content: selected element" ); let (markdown, plain_text, assets) = markdown::convert(main_el, base_url, &exclude); let raw_html = if options.include_raw_html { Some(main_el.html()) } else { None }; return Content { markdown, plain_text, links: assets.links, images: assets.images, code_blocks: assets.code_blocks, raw_html, }; } debug!("only_main_content: no article/main found, falling back to scoring"); } // Path 3: Default scoring algorithm let best = find_best_node(doc); let (content_element, mut markdown, plain_text, mut assets) = if let Some(node) = best { debug!(tag = node.value().name(), "selected content node"); let (md, pt, a) = markdown::convert(node, base_url, &exclude); (Some(node), md, pt, a) } else { debug!("no strong candidate, falling back to body"); if let Some(body) = doc.select(&BODY_SELECTOR).next() { let (md, pt, a) = markdown::convert(body, base_url, &exclude); (Some(body), md, pt, a) } else { let root = doc.root_element(); let (md, pt, a) = markdown::convert(root, base_url, &exclude); (Some(root), md, pt, a) } }; // The best content node often excludes the page's primary H1 (e.g., in a // hero/banner section). If the document has an H1 and its text isn't already // in the markdown, prepend it so the output always starts with the title. 
if let Some(h1) = doc.select(&H1_SELECTOR).next() { let h1_text = h1 .text() .collect::() .trim() .trim_end_matches(|c: char| !c.is_alphanumeric()) .trim() .to_string(); if !h1_text.is_empty() && !markdown.contains(&h1_text) { markdown = format!("# {h1_text}\n\n{markdown}"); // Recover hero paragraph: H1 was outside the content node (noise-stripped), // so adjacent tagline/mission paragraphs are also lost. Recover them. recover_hero_paragraph(h1, &mut markdown); } } // Recover announcement banners (role="region" with announcement-like aria-label). // These are often stripped by class-based noise filters ("banner" class) but // contain genuinely important content like product announcements. recover_announcements(doc, base_url, &mut markdown, &mut assets.links); // Recover section headings that were stripped because their wrapper had a // noise class (e.g.,
). If an

is missing // from the markdown but nearby content from the same section IS present, // the heading was likely a false-positive noise strip. recover_section_headings(doc, &mut markdown); // Recover prominent CTA links from the footer (e.g., documentation links). // The footer tag is noise, but "call to action" sections inside it often // contain high-value links and headings worth capturing. recover_footer_cta(doc, base_url, &mut markdown, &mut assets.links); // Recover structured site navigation from footer (product/service listings). // Many homepages have organized footer sitemaps (Products, Solutions, etc.) // that are genuinely useful for LLM consumption. recover_footer_sitemap(doc, base_url, &mut markdown, &mut assets.links); let raw_html = if options.include_raw_html { content_element.map(|el| el.html()) } else { None }; Content { markdown, plain_text, links: assets.links, images: assets.images, code_blocks: assets.code_blocks, raw_html, } } /// Extract content using include selectors. Each matching element is converted /// to markdown and the results are concatenated. 
fn extract_with_include( doc: &Html, base_url: Option<&Url>, include_selectors: &[String], exclude: &HashSet, options: &ExtractionOptions, ) -> Content { let selectors = parse_selectors(include_selectors); let mut all_md = String::new(); let mut all_plain = String::new(); let mut all_links = Vec::new(); let mut all_images = Vec::new(); let mut all_code_blocks = Vec::new(); let mut all_raw_html = if options.include_raw_html { Some(String::new()) } else { None }; for selector in &selectors { for el in doc.select(selector) { if exclude.contains(&el.id()) { continue; } let (md, plain, assets) = markdown::convert(el, base_url, exclude); if !md.is_empty() { if !all_md.is_empty() { all_md.push_str("\n\n"); } all_md.push_str(&md); } if !plain.is_empty() { if !all_plain.is_empty() { all_plain.push('\n'); } all_plain.push_str(&plain); } all_links.extend(assets.links); all_images.extend(assets.images); all_code_blocks.extend(assets.code_blocks); if let Some(ref mut raw) = all_raw_html { raw.push_str(&el.html()); } } } Content { markdown: all_md, plain_text: all_plain, links: all_links, images: all_images, code_blocks: all_code_blocks, raw_html: all_raw_html, } } /// Recover announcement banners that were stripped as noise. /// Pattern: `
` with short, meaningful text. fn recover_announcements( doc: &Html, base_url: Option<&Url>, markdown: &mut String, links: &mut Vec, ) { for el in doc.select(&ANNOUNCEMENT_SELECTOR) { let label = el.value().attr("aria-label").unwrap_or(""); if !label.to_lowercase().contains("announcement") { continue; } let text = el.text().collect::(); let text = text.split_whitespace().collect::>().join(" "); if text.is_empty() || markdown.contains(&text) { continue; } // Build markdown for the announcement, including any links let mut announcement = format!("> **{text}**"); for a in el.select(&A_SELECTOR) { let link_text = a.text().collect::().trim().to_string(); let href = a .value() .attr("href") .map(|h| markdown::resolve_url(h, base_url)) .unwrap_or_default(); if !link_text.is_empty() && !href.is_empty() { links.push(Link { text: link_text, href, }); } } announcement.push_str("\n\n"); debug!("recovered announcement banner"); *markdown = format!("{announcement}{markdown}"); } } /// Recover the hero paragraph (mission/tagline) that's near the H1 but inside /// a noise-stripped container like `
`. Walk siblings/cousins of the H1 /// to find a substantial `

` that isn't in the markdown. fn recover_hero_paragraph(h1: ElementRef<'_>, markdown: &mut String) { // Walk up to find a container that holds both H1 and sibling content let mut node = h1.parent(); for _ in 0..4 { let Some(parent) = node else { break }; let Some(parent_el) = ElementRef::wrap(parent) else { node = parent.parent(); continue; }; // Search all

descendants of this container for descendant in parent_el.descendants() { let Some(el) = ElementRef::wrap(descendant) else { continue; }; if el.value().name() != "p" { continue; } let text = el .text() .collect::() .split_whitespace() .collect::>() .join(" "); // Only recover substantial paragraphs (taglines, mission statements) if text.len() < 40 || text.len() > 300 { continue; } if markdown.contains(&text) { continue; } // Insert right after the H1 heading line debug!(text = text.as_str(), "recovered hero paragraph"); let insert = format!("\n{text}\n"); if let Some(pos) = markdown.find('\n') { markdown.insert_str(pos + 1, &insert); } else { markdown.push_str(&insert); } return; } node = parent.parent(); } } /// Recover

headings that were stripped because their wrapper div had a /// noise class like "header". If adjacent content from the same parent section /// IS in the markdown, the heading should be there too. fn recover_section_headings(doc: &Html, markdown: &mut String) { for h2 in doc.select(&H2_SELECTOR) { let h2_text = h2.text().collect::().trim().to_string(); if h2_text.is_empty() || find_content_position(markdown, &h2_text).is_some() { continue; } // Don't recover headings inside structural noise tags (nav, aside, footer, // header). These are genuine noise — not false-positive class matches like //
inside a content section. if is_inside_structural_noise(h2) { continue; } // Walk up to the nearest section/div parent, then check if any sibling // content from that parent made it into the markdown. let anchor = find_sibling_anchor_text(h2, markdown); if let Some(anchor) = anchor { debug!( heading = h2_text.as_str(), "recovered stripped section heading" ); // Insert the heading before the anchor's content block. // Walk backwards past short orphan lines (stat numbers etc.) // that likely belong to the same section. if let Some(pos) = find_content_position(markdown, &anchor) { let line_start = markdown[..pos].rfind('\n').map_or(0, |p| p + 1); let insert_pos = walk_back_past_orphans(markdown, line_start); let heading_md = format!("## {h2_text}\n\n"); markdown.insert_str(insert_pos, &heading_md); } } } // Also recover

"eyebrow" text (short taglines above section headings). // These are typically inside the same noise-stripped wrapper as the

. // Eyebrows are short (e.g., "/the web access layer for agents") — skip full paragraphs. for h2 in doc.select(&H2_SELECTOR) { let h2_text = h2.text().collect::().trim().to_string(); if h2_text.is_empty() || find_content_position(markdown, &h2_text).is_none() { continue; } // Look for a preceding

sibling inside the same parent if let Some(parent) = h2.parent().and_then(ElementRef::wrap) { for child in parent.children() { if let Some(child_el) = ElementRef::wrap(child) { // Stop when we reach the h2 itself if child_el == h2 { break; } if child_el.value().name() == "p" { let p_text = child_el.text().collect::().trim().to_string(); // Only short text qualifies as an eyebrow — full paragraphs // are regular content, not taglines. if p_text.is_empty() || p_text.len() > 80 { continue; } // Skip decorative route-style labels (e.g., "/proof is in // the numbers", "/press room") — common design pattern, not content. if p_text.starts_with('/') { continue; } // Check against a stripped version of the markdown to handle // formatting like **bold** that breaks plain-text matching. let plain_md = strip_md_formatting(markdown); if plain_md.contains(&p_text) { continue; } { // Insert the eyebrow text at the start of the heading's line if let Some(pos) = find_content_position(markdown, &h2_text) { let line_start = markdown[..pos].rfind('\n').map_or(0, |p| p + 1); let eyebrow_md = format!("*{p_text}*\n\n"); markdown.insert_str(line_start, &eyebrow_md); debug!(eyebrow = p_text.as_str(), "recovered eyebrow text"); } } } } } } } } /// Find text from a sibling element (in the same section) that IS in the markdown. /// This confirms the heading belongs to content we already captured. fn find_sibling_anchor_text(heading: ElementRef<'_>, markdown: &str) -> Option { let heading_text = heading.text().collect::(); // Walk up to find the containing section or significant parent let mut node = heading.parent(); while let Some(parent) = node { if let Some(parent_el) = ElementRef::wrap(parent) { let tag = parent_el.value().name(); if tag == "section" || tag == "article" || tag == "main" || tag == "body" { // Search descendant

and

elements for text in the markdown. // Using specific elements avoids the multiline blob issue from // concatenating all text nodes of a large container. for descendant in parent_el.descendants() { if let Some(el) = ElementRef::wrap(descendant) { let dtag = el.value().name(); if dtag != "p" && dtag != "h3" && dtag != "h4" { continue; } // Normalize whitespace to match how the markdown converter collapses it let el_text: String = el .text() .collect::() .split_whitespace() .collect::>() .join(" "); // Skip if this text is part of the heading itself if el_text.is_empty() || heading_text.contains(&el_text) { continue; } if el_text.len() > 15 && find_content_position(markdown, &el_text).is_some() { return Some(el_text); } } } break; } } node = parent.parent(); } None } /// Recover CTA (call-to-action) links and headings from footer sections. /// Many sites have a "hero" CTA block in the footer with documentation links /// or signup prompts. These are valuable content, not navigational noise. 
fn recover_footer_cta( doc: &Html, base_url: Option<&Url>, markdown: &mut String, links: &mut Vec, ) { for footer in doc.select(&FOOTER_SELECTOR) { // Look for h2 headings in the footer (CTA headings like "Power your AI...") for h2 in footer.select(&H2_SELECTOR) { let h2_text = h2.text().collect::().trim().to_string(); if h2_text.is_empty() || markdown.contains(&h2_text) { continue; } // Skip meta headings (screen-reader-only "Footer", "Navigation") let h2_lower = h2_text.to_lowercase(); if h2_lower == "footer" || h2_lower == "navigation" || h2_lower == "site map" { continue; } // Skip screen-reader-only headings (sr-only, visually-hidden) if let Some(class) = h2.value().attr("class") { let cl = class.to_lowercase(); if cl.contains("sr-only") || cl.contains("visually-hidden") || cl.contains("screen-reader") { continue; } } debug!(heading = h2_text.as_str(), "recovered footer CTA heading"); markdown.push_str(&format!("\n\n## {h2_text}\n\n")); } // Recover links that point to documentation or app URLs for a in footer.select(&A_SELECTOR) { let href = match a.value().attr("href") { Some(h) => markdown::resolve_url(h, base_url), None => continue, }; let text = a.text().collect::().trim().to_string(); if text.is_empty() || href.is_empty() { continue; } // Only recover links to docs/app/API — not generic footer nav let href_lower = href.to_lowercase(); let is_valuable_cta = href_lower.contains("docs.") || href_lower.contains("/docs") || href_lower.contains("app.") || href_lower.contains("/app") || href_lower.contains("api."); if is_valuable_cta && !markdown.contains(&text) { debug!( text = text.as_str(), href = href.as_str(), "recovered footer CTA link" ); markdown.push_str(&format!("[{text}]({href})\n\n")); links.push(Link { text: text.clone(), href: href.clone(), }); } } } } /// Recover structured site navigation from footer when it has organized /// link categories (Products, Solutions, Resources, etc.). 
This captures /// the site's offering structure — useful for LLM queries like "what does /// this company offer?" Only fires when the footer has 3+ categories. fn recover_footer_sitemap( doc: &Html, base_url: Option<&Url>, markdown: &mut String, links: &mut Vec, ) { for footer in doc.select(&FOOTER_SELECTOR) { let mut categories: Vec<(String, Vec<(String, String)>)> = Vec::new(); for heading in footer.select(&FOOTER_HEADING_SELECTOR) { let heading_text = heading.text().collect::().trim().to_string(); if heading_text.is_empty() || heading_text.len() > 50 { continue; } // Skip meta headings like "Footer" and headings already in the markdown if heading_text.eq_ignore_ascii_case("footer") || markdown.contains(&heading_text) { continue; } // Find links in the nearest container that holds both heading + link list. // Try parent first, then grandparent (handles wrapper divs). let cat_links = collect_sibling_links(heading, base_url); // 2–20 links: too few = not a real category, too many = aggregate container if cat_links.len() >= 2 && cat_links.len() <= 20 { categories.push((heading_text, cat_links)); } } if categories.len() < 3 { continue; } // Build compact sitemap — category name + comma-separated link text let mut sitemap = String::from("\n\n---\n\n"); for (heading, cat_links) in &categories { let names: Vec<&str> = cat_links.iter().map(|(t, _)| t.as_str()).collect(); sitemap.push_str(&format!("**{heading}**: {}\n", names.join(", "))); for (text, href) in cat_links { links.push(Link { text: text.clone(), href: href.clone(), }); } } debug!(categories = categories.len(), "recovered footer sitemap"); markdown.push_str(&sitemap); } } /// Collect links from the same container as a heading element. /// Walks up the DOM to find the nearest ancestor that contains elements. 
fn collect_sibling_links(heading: ElementRef<'_>, base_url: Option<&Url>) -> Vec<(String, String)> { let mut node = heading.parent(); // Try up to 2 levels (parent, grandparent) to find a link container for _ in 0..2 { let Some(parent) = node else { break }; let Some(parent_el) = ElementRef::wrap(parent) else { node = parent.parent(); continue; }; let a_elements: Vec<_> = parent_el.select(&A_SELECTOR).collect(); if a_elements.len() >= 2 { return a_elements .into_iter() .filter_map(|a| { let text = a.text().collect::().trim().to_string(); let href = a .value() .attr("href") .map(|h| markdown::resolve_url(h, base_url)); match (text.is_empty(), href) { (false, Some(h)) if !h.is_empty() && text.len() > 1 && text.len() < 60 && !matches!( text.to_lowercase().as_str(), "here" | "link" | "click" | "more" ) => { Some((text, h)) } _ => None, } }) .collect(); } node = parent.parent(); } Vec::new() } /// Walk backwards from `pos` in markdown, skipping blank lines and short /// orphan lines (<=25 chars, likely stat numbers or labels) that belong to /// the same section. Stops at headings, long content lines, or start of string. fn walk_back_past_orphans(markdown: &str, mut pos: usize) -> usize { loop { if pos == 0 { break; } // Find the previous line let prev_end = pos.saturating_sub(1); // skip the \n let prev_start = markdown[..prev_end].rfind('\n').map_or(0, |p| p + 1); let prev_line = markdown[prev_start..prev_end].trim(); if prev_line.is_empty() { pos = prev_start; continue; } if prev_line.starts_with('#') || prev_line.starts_with('>') || prev_line.len() > 25 { break; } // Short non-structural line — likely a stat number, include it pos = prev_start; } pos } /// Quick strip of markdown bold/italic markers for plain-text comparison. fn strip_md_formatting(md: &str) -> String { md.replace("**", "").replace('*', "") } /// Find `needle` in `markdown` only at a position that isn't inside image/link /// alt text (`![...](...)`). Returns the byte offset or None. 
fn find_content_position(markdown: &str, needle: &str) -> Option { let mut search_from = 0; while let Some(pos) = markdown[search_from..].find(needle) { let abs_pos = search_from + pos; if !is_inside_image_syntax(markdown, abs_pos) { return Some(abs_pos); } // Advance past the rejected match. `abs_pos + needle.len()` is always a // valid UTF-8 char boundary (end of the matched substring); `abs_pos + 1` // is not, and panics on the next slice when the match starts on a // multi-byte char (Cyrillic, CJK, accented Latin, emoji). See issue #16. search_from = abs_pos + needle.len(); } None } /// Check if a position in markdown falls inside `![...](...)` image syntax. fn is_inside_image_syntax(markdown: &str, pos: usize) -> bool { // Walk backwards from pos to find the nearest unmatched `![` let before = &markdown[..pos]; // Find the last `![` that hasn't been closed by `](` let mut i = before.len(); while i > 0 { i -= 1; if i > 0 && before.as_bytes()[i - 1] == b'!' && before.as_bytes()[i] == b'[' { // Found `![` — check if there's a matching `](` after pos let after = &markdown[pos..]; if after.contains("](") { return true; } } // If we hit a `)` that closes a previous image, stop searching if before.as_bytes()[i] == b')' { break; } } false } /// Check if an element is inside a structural noise tag (nav, aside, footer, header). /// Unlike class-based noise (e.g.,
), these are strong signals /// that the content is genuinely non-content and should NOT be recovered. const STRUCTURAL_NOISE_TAGS: &[&str] = &["nav", "aside", "footer", "header"]; fn is_inside_structural_noise(el: ElementRef<'_>) -> bool { let mut node = el.parent(); while let Some(parent) = node { if let Some(parent_el) = ElementRef::wrap(parent) { let tag = parent_el.value().name(); if STRUCTURAL_NOISE_TAGS.contains(&tag) { return true; } // Also check role-based structural noise if let Some(role) = parent_el.value().attr("role") && (role == "navigation" || role == "contentinfo") { return true; } } node = parent.parent(); } false } /// Score each candidate node and return the best one. fn find_best_node(doc: &Html) -> Option> { let mut best: Option<(ElementRef<'_>, f64)> = None; for candidate in doc.select(&CANDIDATE_SELECTOR) { if noise::is_noise(candidate) || noise::is_noise_descendant(candidate) { continue; } let score = score_node(candidate); if score > 0.0 && best.as_ref().is_none_or(|(_, s)| score > *s) { best = Some((candidate, score)); } } best.map(|(el, score)| { debug!(score, tag = el.value().name(), "best content candidate"); el }) } fn score_node(el: ElementRef<'_>) -> f64 { let text = el.text().collect::(); let text_len = text.len() as f64; // Very short nodes aren't content if text_len < 50.0 { return 0.0; } let mut score = 0.0; // Base score: text length (log scale to avoid huge nodes dominating purely by size) score += text_len.ln(); // Bonus for
or
— these are strong semantic signals let tag = el.value().name(); match tag { "article" => score += 50.0, "main" => score += 50.0, _ => {} } // Bonus for role="main" if el.value().attr("role") == Some("main") { score += 50.0; } // Bonus for common content class/id patterns if let Some(class) = el.value().attr("class") { let cl = class.to_lowercase(); if cl.contains("content") || cl.contains("article") || cl.contains("post") || cl.contains("entry") { score += 25.0; } } if let Some(id) = el.value().attr("id") { let id = id.to_lowercase(); if id.contains("content") || id.contains("article") || id.contains("post") || id.contains("main") { score += 25.0; } } // Paragraph density: count

children — real content has paragraphs let p_count = el.select(&P_SELECTOR).count() as f64; score += p_count * 3.0; // Link density penalty: nodes that are mostly links (nav, footer) score low. // link_text_len / total_text_len — lower is better for content. let link_text_len: f64 = el .select(&A_SELECTOR) .map(|a| a.text().collect::().len() as f64) .sum(); // Semantic nodes (article, main, role=main) get milder link density penalties. // Documentation pages often have high link density from TOCs inside the main // content container — these are expected, not spam. let is_semantic = matches!(tag, "article" | "main") || el.value().attr("role") == Some("main"); if text_len > 0.0 { let link_density = link_text_len / text_len; if is_semantic { // Semantic nodes: only penalize extreme link density if link_density > 0.7 { score *= 0.3; } else if link_density > 0.5 { score *= 0.5; } } else { // Generic divs: heavy penalty for link-dense content if link_density > 0.5 { score *= 0.1; } else if link_density > 0.3 { score *= 0.5; } } } score } /// Count words in text (for word_count metadata). pub fn word_count(text: &str) -> usize { text.split_whitespace().count() } #[cfg(test)] mod tests { use super::*; fn parse(html: &str) -> Html { Html::parse_document(html) } /// Regression: issue #16 — `find_content_position` used to advance /// `search_from` by 1 byte after an image-syntax rejection, which landed /// mid-char on multi-byte UTF-8 input (Cyrillic, CJK, accented Latin, emoji) /// and panicked on the next `markdown[search_from..]` slice. #[test] fn find_content_position_does_not_panic_on_multibyte_rejected_match() { // `needle` appears first inside image syntax (must be rejected), then // again as plain content after a block of Cyrillic prose. The bump // from the rejected match used to land inside 'Ч'. let markdown = "![alt needle text](/img.png) Наша история Brûler d'Amour. 
needle text appears here."; let pos = find_content_position(markdown, "needle text"); assert!(pos.is_some(), "second occurrence should be found"); assert!( markdown.is_char_boundary(pos.unwrap()), "returned offset must be a char boundary" ); } #[test] fn find_content_position_survives_all_rejected_in_cyrillic() { // Every occurrence of `needle` sits inside image syntax, so the // function must walk the whole string rejecting each one. With the // `+1` bug this panicked the first time `search_from` crossed a // 2-byte char. With the fix it should return None cleanly. let markdown = "Наша история ![foo needle bar](a.png) Ещё текст ![needle](b.png) Конец 'Ч'"; assert_eq!(find_content_position(markdown, "needle"), None); } /// Helper: extract with default options (backward-compatible). fn extract_default(doc: &Html, base_url: Option<&Url>) -> Content { extract_content(doc, base_url, &ExtractionOptions::default()) } #[test] fn picks_article_over_nav() { let html = r##"

Real Article

This is the main content of the page. It contains several paragraphs of text that make it clearly the main content area.

Another paragraph with useful information for the reader.

And a third paragraph to make it really obvious this is content.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.markdown.contains("Real Article")); assert!(content.markdown.contains("main content")); } #[test] fn falls_back_to_body() { let html = r##"

Simple page with just a paragraph.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.plain_text.contains("Simple page")); } #[test] fn word_count_works() { assert_eq!(word_count("hello world foo bar"), 4); assert_eq!(word_count(""), 0); assert_eq!(word_count(" spaces everywhere "), 2); } #[test] fn prefers_content_class() { let html = r##"

Site header with some branding text content here

Main Content

This is the primary content of the page that readers want to see. It has multiple sentences and meaningful paragraphs.

Second paragraph with additional details and context for the article.

Third paragraph because real articles have substantial text.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.markdown.contains("Main Content")); } /// Simulates a Wikipedia-like page where the best content node (article/main) /// contains a nav sidebar as a child. The markdown converter must strip it. #[test] fn wikipedia_like_nav_sidebar_stripped() { let html = r##"

Rust (programming language)

Rust is a multi-paradigm programming language focused on performance and safety, especially safe concurrency. It accomplishes these goals without a garbage collector.

Rust was originally designed by Graydon Hoare at Mozilla Research, with contributions from several other developers.

The language grew out of a personal project begun in 2006 by Mozilla employee Graydon Hoare, who stated that it was possibly named after the rust family of fungi.

"##; let doc = parse(html); let content = extract_default(&doc, None); // Article content preserved assert!(content.markdown.contains("Rust (programming language)")); assert!( content .markdown .contains("multi-paradigm programming language") ); assert!(content.markdown.contains("Graydon Hoare")); // Nav sidebar stripped assert!( !content.markdown.contains("Contents"), "TOC nav heading leaked" ); assert!( !content.markdown.contains("#history"), "TOC nav link leaked" ); // Aside infobox stripped assert!( !content.markdown.contains("First appeared"), "infobox aside leaked" ); } /// When the best node is a large div that happens to contain script tags, /// the JS code must not appear in the markdown. #[test] fn script_inside_content_node_stripped() { let html = r##"

Interactive Article

This article has some embedded JavaScript for interactivity. The content itself is what we want to extract, not the code.

The article continues with more useful information for readers who want to learn about the topic.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.markdown.contains("Interactive Article")); assert!(content.markdown.contains("embedded JavaScript")); assert!(content.markdown.contains("continues with more")); assert!( !content.markdown.contains("NEXT_DATA"), "script content leaked" ); assert!( !content.markdown.contains("initializeApp"), "JS function call leaked" ); assert!( !content.markdown.contains("background: yellow"), "CSS leaked" ); } /// Full-page simulation: header, nav, main content, footer. /// Only the main content should survive. #[test] fn full_page_noise_stripped() { let html = r##"

How to Write Clean Code

Writing clean code is an essential skill for every developer. It makes your codebase easier to maintain and understand.

In this article, we will explore several principles that can help you write better, more readable code.

The first principle is to use meaningful variable names that clearly describe what the variable holds.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.markdown.contains("How to Write Clean Code")); assert!(content.markdown.contains("meaningful variable names")); assert!( !content.markdown.contains("MySite"), "header/footer branding leaked" ); assert!(!content.markdown.contains("Privacy"), "footer link leaked"); assert!(!content.markdown.contains("Blog"), "nav link leaked"); } /// H1 in a hero/banner section outside the main content node should be /// captured and prepended to the markdown output. #[test] fn h1_outside_content_node_captured() { let html = r##"

The Ultimate Guide to Async Rust

Everything you need to know

Asynchronous programming in Rust is powered by the async/await syntax and the Future trait. This guide covers all the fundamentals you need to get started with async Rust.

We will explore tokio, the most popular async runtime, and show you how to build concurrent applications efficiently.

By the end of this guide you will understand how to write performant async code that handles thousands of connections.

"##; let doc = parse(html); let content = extract_default(&doc, None); // H1 must appear in markdown even though it's outside
assert!( content .markdown .contains("The Ultimate Guide to Async Rust"), "H1 from hero banner missing from output" ); // Should be prepended as a heading assert!( content .markdown .starts_with("# The Ultimate Guide to Async Rust"), "H1 should be prepended as markdown heading" ); // Article content still present assert!(content.markdown.contains("async/await")); assert!(content.markdown.contains("tokio")); } /// Announcement banners with role="region" and aria-label="Announcement" /// should be recovered even though their class contains "banner" (noise). #[test] fn announcement_banner_recovered() { let html = r##"

Big news! We are joining forces with Acme Corp - read more in our blog

Our Product

We build amazing tools for developers that simplify complex workflows and boost productivity every day.

Our platform handles millions of requests per second with low latency and high reliability.

"##; let doc = parse(html); let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap())); assert!( content.markdown.contains("joining forces with Acme Corp"), "Announcement banner text missing from output" ); assert!( content.markdown.contains("Our Product"), "Main content missing" ); // The announcement link should be captured assert!( content .links .iter() .any(|l| l.href.contains("example.com/blog")), "Announcement link not captured" ); } /// Section headings inside header-class wrappers should be /// recovered when sibling content from the same section is in the output. #[test] fn section_heading_in_header_class_recovered() { let html = r##"

Built for scale

Handle thousands of concurrent requests with intelligent load balancing and automatic failover.

Deploy globally with edge locations in every major region for minimal latency.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!( content.markdown.contains("## Built for scale"), "Section heading should be recovered: {}", content.markdown ); assert!( content.markdown.contains("concurrent requests"), "Section content missing" ); } /// Eyebrow text (short tagline above a section heading) should be /// recovered when it's inside the same noise-stripped wrapper as the section heading. #[test] fn eyebrow_text_recovered() { let html = r##"

the platform for builders

Loved by developers worldwide

Thousands of teams rely on our platform daily for mission-critical applications and workflows.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!( content.markdown.contains("the platform for builders"), "Eyebrow text missing: {}", content.markdown ); assert!( content.markdown.contains("Loved by developers worldwide"), "Section heading missing" ); } /// Decorative route-style labels (starting with "/") should NOT be recovered /// as eyebrow text — they're design elements, not content. #[test] fn route_style_eyebrow_not_recovered() { let html = r##"

/proof is in the numbers

Trusted in production

Our platform handles millions of requests per second with low latency and high reliability.

"##; let doc = parse(html); let content = extract_default(&doc, None); // With exact class matching, "section-header" is NOT noise // (only exact "header" class would be). The eyebrow text is now // preserved, which is correct — it's content, not navigation. assert!( content.markdown.contains("Trusted in production"), "Section heading should be recovered" ); assert!( content.markdown.contains("Our platform"), "Grid content should be present" ); } /// Footer CTA links to documentation URLs should be recovered. #[test] fn footer_cta_link_recovered() { let html = r##"

Our Platform

Build powerful applications with our comprehensive API and developer tools that handle millions of requests.

Get started in minutes with our quickstart guide and extensive documentation for every feature.

"##; let doc = parse(html); let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap())); assert!( content.markdown.contains("Start building today"), "Footer CTA heading missing: {}", content.markdown ); assert!( content.markdown.contains("Explore API Docs"), "Footer CTA link missing" ); // Non-doc footer links should NOT be recovered assert!( !content.markdown.contains("Privacy"), "Generic footer nav leaked" ); assert!( !content.markdown.contains("Terms"), "Generic footer nav leaked" ); } /// Headings inside genuine noise (nav, aside) should NOT be recovered, /// even when sibling content exists in the output. #[test] fn heading_inside_nav_not_recovered() { let html = r##"

Programming Guide

This comprehensive guide covers everything you need to know about modern programming practices.

From basics to advanced topics, we will explore patterns and techniques used by professionals.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!( !content.markdown.contains("Table of Contents"), "TOC heading from nav should not be recovered: {}", content.markdown ); assert!(content.markdown.contains("comprehensive guide")); } /// Structured footer sitemaps (3+ categories with headings) should be /// recovered as a compact reference section. #[test] fn footer_sitemap_recovered() { let html = r##"

Our Company

We build tools that help developers create amazing applications faster and more efficiently than ever before.

Join thousands of teams who trust our platform for their mission-critical workloads every single day.

"##; let doc = parse(html); let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap())); // Categories should be captured assert!( content.markdown.contains("Products"), "Footer sitemap Products missing: {}", content.markdown ); assert!( content.markdown.contains("Product A"), "Footer sitemap link missing" ); assert!( content.markdown.contains("Solutions"), "Footer sitemap Solutions missing" ); assert!( content.markdown.contains("Resources"), "Footer sitemap Resources missing" ); // Main content still present assert!(content.markdown.contains("Our Company")); } /// Footer sitemaps with fewer than 3 categories should NOT be recovered /// (not enough structure to be confident it's a sitemap). #[test] fn small_footer_not_treated_as_sitemap() { let html = r##"

Simple Page

This is a simple page with minimal footer structure that should not trigger sitemap recovery at all.

The content here is what matters, not the footer links or navigation elements below the main content.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!( !content.markdown.contains("Legal"), "Small footer should not be treated as sitemap: {}", content.markdown ); } /// Screen-reader-only footer headings (like "Footer") should not leak. #[test] fn sr_only_footer_heading_not_recovered() { let html = r##"

Our Platform

Build powerful applications with our comprehensive API and developer tools that handle millions of requests.

Get started in minutes with our quickstart guide and extensive documentation for every feature.

"##; let doc = parse(html); let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap())); assert!( !content.markdown.contains("## Footer"), "SR-only 'Footer' heading should not be recovered: {}", content.markdown ); } } #[cfg(test)] mod form_integration_tests { use super::*; #[test] fn aspnet_form_content_extraction() { let content = "x".repeat(600); // Ensure >500 chars let html = format!( r#"

Section

Question?

{content}

"# ); let doc = Html::parse_document(&html); let opts = ExtractionOptions::default(); let result = extract_content(&doc, None, &opts); assert!( result.markdown.contains("Section"), "h2 missing from markdown" ); assert!( result.markdown.contains("Question"), "h3 missing from markdown" ); } /// Simulate unclosed header div absorbing the content div. /// The header's noise class should NOT propagate to the absorbed content /// because the safety valve detects the header has >5000 chars (broken wrapper). #[test] fn unclosed_header_div_does_not_swallow_content() { let faq = "Lorem ipsum dolor sit amet. ".repeat(300); // ~8400 chars // The header div is intentionally NOT closed — the HTML parser makes // div.content a child of div.header. The safety valve (>5000 chars) // should prevent div.header from being treated as noise. let html = format!( r#"
Logo

FAQ Section

First question?

{faq}

"# ); let doc = Html::parse_document(&html); let opts = ExtractionOptions::default(); let result = extract_content(&doc, None, &opts); assert!( result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content" ); assert!( result.markdown.contains("First question"), "h3 missing: header swallowed content" ); } }