— these are strong semantic signals let tag = el.value().name(); match tag { "article" => score += 50.0, "main" => score += 50.0, _ => {} } // Bonus for role="main" if el.value().attr("role") == Some("main") { score += 50.0; } // Bonus for common content class/id patterns if let Some(class) = el.value().attr("class") { let cl = class.to_lowercase(); if cl.contains("content") || cl.contains("article") || cl.contains("post") || cl.contains("entry") { score += 25.0; } } if let Some(id) = el.value().attr("id") { let id = id.to_lowercase(); if id.contains("content") || id.contains("article") || id.contains("post") || id.contains("main") { score += 25.0; } } // Paragraph density: count

children — real content has paragraphs let p_count = el.select(&P_SELECTOR).count() as f64; score += p_count * 3.0; // Link density penalty: nodes that are mostly links (nav, footer) score low. // link_text_len / total_text_len — lower is better for content. let link_text_len: f64 = el .select(&A_SELECTOR) .map(|a| a.text().collect::().len() as f64) .sum(); // Semantic nodes (article, main, role=main) get milder link density penalties. // Documentation pages often have high link density from TOCs inside the main // content container — these are expected, not spam. let is_semantic = matches!(tag, "article" | "main") || el.value().attr("role") == Some("main"); if text_len > 0.0 { let link_density = link_text_len / text_len; if is_semantic { // Semantic nodes: only penalize extreme link density if link_density > 0.7 { score *= 0.3; } else if link_density > 0.5 { score *= 0.5; } } else { // Generic divs: heavy penalty for link-dense content if link_density > 0.5 { score *= 0.1; } else if link_density > 0.3 { score *= 0.5; } } } score } /// Count words in text (for word_count metadata). pub fn word_count(text: &str) -> usize { text.split_whitespace().count() } #[cfg(test)] mod tests { use super::*; fn parse(html: &str) -> Html { Html::parse_document(html) } /// Regression: issue #16 — `find_content_position` used to advance /// `search_from` by 1 byte after an image-syntax rejection, which landed /// mid-char on multi-byte UTF-8 input (Cyrillic, CJK, accented Latin, emoji) /// and panicked on the next `markdown[search_from..]` slice. #[test] fn find_content_position_does_not_panic_on_multibyte_rejected_match() { // `needle` appears first inside image syntax (must be rejected), then // again as plain content after a block of Cyrillic prose. The bump // from the rejected match used to land inside 'Ч'. let markdown = "![alt needle text](/img.png) Наша история Brûler d'Amour. 
needle text appears here."; let pos = find_content_position(markdown, "needle text"); assert!(pos.is_some(), "second occurrence should be found"); assert!( markdown.is_char_boundary(pos.unwrap()), "returned offset must be a char boundary" ); } #[test] fn find_content_position_survives_all_rejected_in_cyrillic() { // Every occurrence of `needle` sits inside image syntax, so the // function must walk the whole string rejecting each one. With the // `+1` bug this panicked the first time `search_from` crossed a // 2-byte char. With the fix it should return None cleanly. let markdown = "Наша история ![foo needle bar](a.png) Ещё текст ![needle](b.png) Конец 'Ч'"; assert_eq!(find_content_position(markdown, "needle"), None); } /// Helper: extract with default options (backward-compatible). fn extract_default(doc: &Html, base_url: Option<&Url>) -> Content { extract_content(doc, base_url, &ExtractionOptions::default()) } #[test] fn picks_article_over_nav() { let html = r##"

Real Article

This is the main content of the page. It contains several paragraphs of text that make it clearly the main content area.

Another paragraph with useful information for the reader.

And a third paragraph to make it really obvious this is content.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.markdown.contains("Real Article")); assert!(content.markdown.contains("main content")); } #[test] fn falls_back_to_body() { let html = r##"

Simple page with just a paragraph.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.plain_text.contains("Simple page")); } #[test] fn word_count_works() { assert_eq!(word_count("hello world foo bar"), 4); assert_eq!(word_count(""), 0); assert_eq!(word_count(" spaces everywhere "), 2); } #[test] fn prefers_content_class() { let html = r##"

Main Content

This is the primary content of the page that readers want to see. It has multiple sentences and meaningful paragraphs.

Second paragraph with additional details and context for the article.

Third paragraph because real articles have substantial text.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.markdown.contains("Main Content")); } /// Simulates a Wikipedia-like page where the best content node (article/main) /// contains a nav sidebar as a child. The markdown converter must strip it. #[test] fn wikipedia_like_nav_sidebar_stripped() { let html = r##"

Rust (programming language)

Rust is a multi-paradigm programming language focused on performance and safety, especially safe concurrency. It accomplishes these goals without a garbage collector.

Rust was originally designed by Graydon Hoare at Mozilla Research, with contributions from several other developers.

The language grew out of a personal project begun in 2006 by Mozilla employee Graydon Hoare, who stated that it was possibly named after the rust family of fungi.

"##; let doc = parse(html); let content = extract_default(&doc, None); // Article content preserved assert!(content.markdown.contains("Rust (programming language)")); assert!( content .markdown .contains("multi-paradigm programming language") ); assert!(content.markdown.contains("Graydon Hoare")); // Nav sidebar stripped assert!( !content.markdown.contains("Contents"), "TOC nav heading leaked" ); assert!( !content.markdown.contains("#history"), "TOC nav link leaked" ); // Aside infobox stripped assert!( !content.markdown.contains("First appeared"), "infobox aside leaked" ); } /// When the best node is a large div that happens to contain script tags, /// the JS code must not appear in the markdown. #[test] fn script_inside_content_node_stripped() { let html = r##"

Interactive Article

This article has some embedded JavaScript for interactivity. The content itself is what we want to extract, not the code.

The article continues with more useful information for readers who want to learn about the topic.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.markdown.contains("Interactive Article")); assert!(content.markdown.contains("embedded JavaScript")); assert!(content.markdown.contains("continues with more")); assert!( !content.markdown.contains("NEXT_DATA"), "script content leaked" ); assert!( !content.markdown.contains("initializeApp"), "JS function call leaked" ); assert!( !content.markdown.contains("background: yellow"), "CSS leaked" ); } /// Full-page simulation: header, nav, main content, footer. /// Only the main content should survive. #[test] fn full_page_noise_stripped() { let html = r##"

MySite

How to Write Clean Code

Writing clean code is an essential skill for every developer. It makes your codebase easier to maintain and understand.

In this article, we will explore several principles that can help you write better, more readable code.

The first principle is to use meaningful variable names that clearly describe what the variable holds.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!(content.markdown.contains("How to Write Clean Code")); assert!(content.markdown.contains("meaningful variable names")); assert!( !content.markdown.contains("MySite"), "header/footer branding leaked" ); assert!(!content.markdown.contains("Privacy"), "footer link leaked"); assert!(!content.markdown.contains("Blog"), "nav link leaked"); } /// H1 in a hero/banner section outside the main content node should be /// captured and prepended to the markdown output. #[test] fn h1_outside_content_node_captured() { let html = r##"

Asynchronous programming in Rust is powered by the async/await syntax and the Future trait. This guide covers all the fundamentals you need to get started with async Rust.

We will explore tokio, the most popular async runtime, and show you how to build concurrent applications efficiently.

By the end of this guide you will understand how to write performant async code that handles thousands of connections.

"##; let doc = parse(html); let content = extract_default(&doc, None); // H1 must appear in markdown even though it's outside the main content node

assert!( content .markdown .contains("The Ultimate Guide to Async Rust"), "H1 from hero banner missing from output" ); // Should be prepended as a heading assert!( content .markdown .starts_with("# The Ultimate Guide to Async Rust"), "H1 should be prepended as markdown heading" ); // Article content still present assert!(content.markdown.contains("async/await")); assert!(content.markdown.contains("tokio")); } /// Announcement banners with role="region" and aria-label="Announcement" /// should be recovered even though their class contains "banner" (noise). #[test] fn announcement_banner_recovered() { let html = r##"

Our Product

We build amazing tools for developers that simplify complex workflows and boost productivity every day.

Our platform handles millions of requests per second with low latency and high reliability.

"##; let doc = parse(html); let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap())); assert!( content.markdown.contains("joining forces with Acme Corp"), "Announcement banner text missing from output" ); assert!( content.markdown.contains("Our Product"), "Main content missing" ); // The announcement link should be captured assert!( content .links .iter() .any(|l| l.href.contains("example.com/blog")), "Announcement link not captured" ); } /// Section headings inside header-class

wrappers should be /// recovered when sibling content from the same section is in the output. #[test] fn section_heading_in_header_class_recovered() { let html = r##"

Built for scale

Handle thousands of concurrent requests with intelligent load balancing and automatic failover.

Deploy globally with edge locations in every major region for minimal latency.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!( content.markdown.contains("## Built for scale"), "Section heading should be recovered: {}", content.markdown ); assert!( content.markdown.contains("concurrent requests"), "Section content missing" ); } /// Eyebrow text (short tagline above a section heading) should be /// recovered when it's inside the same noise-stripped wrapper as the section heading

. #[test] fn eyebrow_text_recovered() { let html = r##"

the platform for builders

Loved by developers worldwide

Thousands of teams rely on our platform daily for mission-critical applications and workflows.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!( content.markdown.contains("the platform for builders"), "Eyebrow text missing: {}", content.markdown ); assert!( content.markdown.contains("Loved by developers worldwide"), "Section heading missing" ); } /// Decorative route-style labels (starting with "/") were originally expected NOT to be /// recovered as eyebrow text, being design elements rather than content. The assertions in /// this test only verify that the section heading and the grid content survive extraction. #[test] fn route_style_eyebrow_not_recovered() { let html = r##"

/proof is in the numbers

Trusted in production

Our platform handles millions of requests per second with low latency and high reliability.

"##; let doc = parse(html); let content = extract_default(&doc, None); // With exact class matching, "section-header" is NOT noise // (only exact "header" class would be). The eyebrow text is now // preserved, which is correct — it's content, not navigation. assert!( content.markdown.contains("Trusted in production"), "Section heading should be recovered" ); assert!( content.markdown.contains("Our platform"), "Grid content should be present" ); } /// Footer CTA links to documentation URLs should be recovered. #[test] fn footer_cta_link_recovered() { let html = r##"

Our Platform

Build powerful applications with our comprehensive API and developer tools that handle millions of requests.

Get started in minutes with our quickstart guide and extensive documentation for every feature.

Start building today
Explore API Docs Try it free Privacy Terms
"##; let doc = parse(html); let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap())); assert!( content.markdown.contains("Start building today"), "Footer CTA heading missing: {}", content.markdown ); assert!( content.markdown.contains("Explore API Docs"), "Footer CTA link missing" ); // Non-doc footer links should NOT be recovered assert!( !content.markdown.contains("Privacy"), "Generic footer nav leaked" ); assert!( !content.markdown.contains("Terms"), "Generic footer nav leaked" ); } /// Headings inside genuine noise (nav, aside) should NOT be recovered, /// even when sibling content exists in the output. #[test] fn heading_inside_nav_not_recovered() { let html = r##"

Programming Guide

Table of Contents

Chapter 1

Chapter 2

This comprehensive guide covers everything you need to know about modern programming practices.

From basics to advanced topics, we will explore patterns and techniques used by professionals.

"##; let doc = parse(html); let content = extract_default(&doc, None); assert!( !content.markdown.contains("Table of Contents"), "TOC heading from nav should not be recovered: {}", content.markdown ); assert!(content.markdown.contains("comprehensive guide")); } /// Structured footer sitemaps (3+ categories with headings) should be /// recovered as a compact reference section. #[test] fn footer_sitemap_recovered() { let html = r##"

Our Company

We build tools that help developers create amazing applications faster and more efficiently than ever before.

Join thousands of teams who trust our platform for their mission-critical workloads every single day.

Products
Product A Product B Product C

Solutions
Enterprise Startup Education

Resources
Blog Documentation Community

"##; let doc = parse(html); let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap())); // Categories should be captured assert!( content.markdown.contains("Products"), "Footer sitemap Products missing: {}", content.markdown ); assert!( content.markdown.contains("Product A"), "Footer sitemap link missing" ); assert!( content.markdown.contains("Solutions"), "Footer sitemap Solutions missing" ); assert!( content.markdown.contains("Resources"), "Footer sitemap Resources missing" ); // Main content still present assert!(content.markdown.contains("Our Company")); } /// Footer sitemaps with fewer than 3 categories should NOT be recovered /// (not enough structure to be confident it's a sitemap). #[test] fn small_footer_not_treated_as_sitemap() { let html = r##"

Simple Page

This is a simple page with minimal footer structure that should not trigger sitemap recovery at all.

The content here is what matters, not the footer links or navigation elements below the main content.

Legal
Privacy Terms
"##; let doc = parse(html); let content = extract_default(&doc, None); assert!( !content.markdown.contains("Legal"), "Small footer should not be treated as sitemap: {}", content.markdown ); } /// Screen-reader-only footer headings (like "Footer") should not leak. #[test] fn sr_only_footer_heading_not_recovered() { let html = r##"

Our Platform

Build powerful applications with our comprehensive API and developer tools that handle millions of requests.

Get started in minutes with our quickstart guide and extensive documentation for every feature.

Footer
Explore API Docs
"##; let doc = parse(html); let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap())); assert!( !content.markdown.contains("## Footer"), "SR-only 'Footer' heading should not be recovered: {}", content.markdown ); } } #[cfg(test)] mod form_integration_tests { use super::*; #[test] fn aspnet_form_content_extraction() { let content = "x".repeat(600); // Ensure >500 chars let html = format!( r#"

Logo

Section

Question?

{content}

"# ); let doc = Html::parse_document(&html); let opts = ExtractionOptions::default(); let result = extract_content(&doc, None, &opts); assert!( result.markdown.contains("Section"), "h2 missing from markdown" ); assert!( result.markdown.contains("Question"), "h3 missing from markdown" ); } /// Simulate unclosed header div absorbing the content div. /// The header's noise class should NOT propagate to the absorbed content /// because the safety valve detects the header has >5000 chars (broken wrapper). #[test] fn unclosed_header_div_does_not_swallow_content() { let faq = "Lorem ipsum dolor sit amet. ".repeat(300); // ~8400 chars // The header div is intentionally NOT closed — the HTML parser makes // div.content a child of div.header. The safety valve (>5000 chars) // should prevent div.header from being treated as noise. let html = format!( r#"

Logo

FAQ Section

First question?

{faq}

"# ); let doc = Html::parse_document(&html); let opts = ExtractionOptions::default(); let result = extract_content(&doc, None, &opts); assert!( result.markdown.contains("FAQ Section"), "h2 missing: header swallowed content" ); assert!( result.markdown.contains("First question"), "h3 missing: header swallowed content" ); } }