mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
1555 lines
57 KiB
Rust
1555 lines
57 KiB
Rust
/// Readability-style content extraction.
|
||
/// Strips noise (nav, ads, sidebars), scores remaining nodes by text density
|
||
/// and structural signals, then converts the best candidate to markdown.
|
||
use std::collections::HashSet;
|
||
|
||
use ego_tree::NodeId;
|
||
use once_cell::sync::Lazy;
|
||
use scraper::{ElementRef, Html, Selector};
|
||
use tracing::{debug, warn};
|
||
use url::Url;
|
||
|
||
use crate::markdown;
|
||
use crate::noise;
|
||
use crate::types::{Content, ExtractionOptions, Link};
|
||
|
||
static CANDIDATE_SELECTOR: Lazy<Selector> =
|
||
Lazy::new(|| Selector::parse("article, main, [role='main'], div, section, td").unwrap());
|
||
static BODY_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("body").unwrap());
|
||
static H1_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("h1").unwrap());
|
||
static H2_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("h2").unwrap());
|
||
static P_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("p").unwrap());
|
||
static A_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("a").unwrap());
|
||
static ANNOUNCEMENT_SELECTOR: Lazy<Selector> =
|
||
Lazy::new(|| Selector::parse("[role='region'][aria-label]").unwrap());
|
||
static FOOTER_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("footer").unwrap());
|
||
static FOOTER_HEADING_SELECTOR: Lazy<Selector> =
|
||
Lazy::new(|| Selector::parse("h2, h3, h4, h5, h6").unwrap());
|
||
|
||
/// Selector for only_main_content: article, main, [role="main"]
|
||
static MAIN_CONTENT_SELECTOR: Lazy<Selector> =
|
||
Lazy::new(|| Selector::parse("article, main, [role='main']").unwrap());
|
||
|
||
const MAX_SELECTORS: usize = 100;
|
||
|
||
/// Build a HashSet of NodeIds to exclude based on CSS selector strings.
|
||
/// Invalid selectors are skipped with a warning.
|
||
fn build_exclude_set(doc: &Html, selectors: &[String]) -> HashSet<NodeId> {
|
||
if selectors.len() > MAX_SELECTORS {
|
||
warn!(
|
||
"too many CSS selectors ({}, max {}), truncating",
|
||
selectors.len(),
|
||
MAX_SELECTORS
|
||
);
|
||
}
|
||
|
||
let mut exclude = HashSet::new();
|
||
for selector_str in selectors.iter().take(MAX_SELECTORS) {
|
||
let Ok(selector) = Selector::parse(selector_str) else {
|
||
warn!(
|
||
selector = selector_str.as_str(),
|
||
"invalid CSS selector, skipping"
|
||
);
|
||
continue;
|
||
};
|
||
for el in doc.select(&selector) {
|
||
// Add the element itself and all descendants
|
||
exclude.insert(el.id());
|
||
for descendant in el.descendants() {
|
||
if let Some(child_el) = ElementRef::wrap(descendant) {
|
||
exclude.insert(child_el.id());
|
||
}
|
||
}
|
||
}
|
||
}
|
||
exclude
|
||
}
|
||
|
||
/// Parse CSS selector strings into Selectors, skipping invalid ones.
|
||
fn parse_selectors(strings: &[String]) -> Vec<Selector> {
|
||
strings
|
||
.iter()
|
||
.filter_map(|s| {
|
||
Selector::parse(s)
|
||
.map_err(|_| warn!(selector = s.as_str(), "invalid CSS selector, skipping"))
|
||
.ok()
|
||
})
|
||
.collect()
|
||
}
|
||
|
||
/// Extract the main content from a parsed HTML document with options.
|
||
pub fn extract_content(doc: &Html, base_url: Option<&Url>, options: &ExtractionOptions) -> Content {
|
||
let exclude = build_exclude_set(doc, &options.exclude_selectors);
|
||
|
||
// Path 1: Include selectors — skip scoring, extract only matching elements
|
||
if !options.include_selectors.is_empty() {
|
||
return extract_with_include(doc, base_url, &options.include_selectors, &exclude, options);
|
||
}
|
||
|
||
// Path 2: only_main_content — pick first article/main/[role="main"]
|
||
if options.only_main_content {
|
||
if let Some(main_el) = doc.select(&MAIN_CONTENT_SELECTOR).next() {
|
||
debug!(
|
||
tag = main_el.value().name(),
|
||
"only_main_content: selected element"
|
||
);
|
||
let (markdown, plain_text, assets) = markdown::convert(main_el, base_url, &exclude);
|
||
|
||
let raw_html = if options.include_raw_html {
|
||
Some(main_el.html())
|
||
} else {
|
||
None
|
||
};
|
||
|
||
return Content {
|
||
markdown,
|
||
plain_text,
|
||
links: assets.links,
|
||
images: assets.images,
|
||
code_blocks: assets.code_blocks,
|
||
raw_html,
|
||
};
|
||
}
|
||
debug!("only_main_content: no article/main found, falling back to scoring");
|
||
}
|
||
|
||
// Path 3: Default scoring algorithm
|
||
let best = find_best_node(doc);
|
||
|
||
let (content_element, mut markdown, plain_text, mut assets) = if let Some(node) = best {
|
||
debug!(tag = node.value().name(), "selected content node");
|
||
let (md, pt, a) = markdown::convert(node, base_url, &exclude);
|
||
(Some(node), md, pt, a)
|
||
} else {
|
||
debug!("no strong candidate, falling back to body");
|
||
if let Some(body) = doc.select(&BODY_SELECTOR).next() {
|
||
let (md, pt, a) = markdown::convert(body, base_url, &exclude);
|
||
(Some(body), md, pt, a)
|
||
} else {
|
||
let root = doc.root_element();
|
||
let (md, pt, a) = markdown::convert(root, base_url, &exclude);
|
||
(Some(root), md, pt, a)
|
||
}
|
||
};
|
||
|
||
// The best content node often excludes the page's primary H1 (e.g., in a
|
||
// hero/banner section). If the document has an H1 and its text isn't already
|
||
// in the markdown, prepend it so the output always starts with the title.
|
||
if let Some(h1) = doc.select(&H1_SELECTOR).next() {
|
||
let h1_text = h1
|
||
.text()
|
||
.collect::<String>()
|
||
.trim()
|
||
.trim_end_matches(|c: char| !c.is_alphanumeric())
|
||
.trim()
|
||
.to_string();
|
||
if !h1_text.is_empty() && !markdown.contains(&h1_text) {
|
||
markdown = format!("# {h1_text}\n\n{markdown}");
|
||
// Recover hero paragraph: H1 was outside the content node (noise-stripped),
|
||
// so adjacent tagline/mission paragraphs are also lost. Recover them.
|
||
recover_hero_paragraph(h1, &mut markdown);
|
||
}
|
||
}
|
||
|
||
// Recover announcement banners (role="region" with announcement-like aria-label).
|
||
// These are often stripped by class-based noise filters ("banner" class) but
|
||
// contain genuinely important content like product announcements.
|
||
recover_announcements(doc, base_url, &mut markdown, &mut assets.links);
|
||
|
||
// Recover section headings that were stripped because their wrapper had a
|
||
// noise class (e.g., <div class="section-header">). If an <h2> is missing
|
||
// from the markdown but nearby content from the same section IS present,
|
||
// the heading was likely a false-positive noise strip.
|
||
recover_section_headings(doc, &mut markdown);
|
||
|
||
// Recover prominent CTA links from the footer (e.g., documentation links).
|
||
// The footer tag is noise, but "call to action" sections inside it often
|
||
// contain high-value links and headings worth capturing.
|
||
recover_footer_cta(doc, base_url, &mut markdown, &mut assets.links);
|
||
|
||
// Recover structured site navigation from footer (product/service listings).
|
||
// Many homepages have organized footer sitemaps (Products, Solutions, etc.)
|
||
// that are genuinely useful for LLM consumption.
|
||
recover_footer_sitemap(doc, base_url, &mut markdown, &mut assets.links);
|
||
|
||
let raw_html = if options.include_raw_html {
|
||
content_element.map(|el| el.html())
|
||
} else {
|
||
None
|
||
};
|
||
|
||
Content {
|
||
markdown,
|
||
plain_text,
|
||
links: assets.links,
|
||
images: assets.images,
|
||
code_blocks: assets.code_blocks,
|
||
raw_html,
|
||
}
|
||
}
|
||
|
||
/// Extract content using include selectors. Each matching element is converted
|
||
/// to markdown and the results are concatenated.
|
||
fn extract_with_include(
|
||
doc: &Html,
|
||
base_url: Option<&Url>,
|
||
include_selectors: &[String],
|
||
exclude: &HashSet<NodeId>,
|
||
options: &ExtractionOptions,
|
||
) -> Content {
|
||
let selectors = parse_selectors(include_selectors);
|
||
|
||
let mut all_md = String::new();
|
||
let mut all_plain = String::new();
|
||
let mut all_links = Vec::new();
|
||
let mut all_images = Vec::new();
|
||
let mut all_code_blocks = Vec::new();
|
||
let mut all_raw_html = if options.include_raw_html {
|
||
Some(String::new())
|
||
} else {
|
||
None
|
||
};
|
||
|
||
for selector in &selectors {
|
||
for el in doc.select(selector) {
|
||
if exclude.contains(&el.id()) {
|
||
continue;
|
||
}
|
||
|
||
let (md, plain, assets) = markdown::convert(el, base_url, exclude);
|
||
|
||
if !md.is_empty() {
|
||
if !all_md.is_empty() {
|
||
all_md.push_str("\n\n");
|
||
}
|
||
all_md.push_str(&md);
|
||
}
|
||
if !plain.is_empty() {
|
||
if !all_plain.is_empty() {
|
||
all_plain.push('\n');
|
||
}
|
||
all_plain.push_str(&plain);
|
||
}
|
||
|
||
all_links.extend(assets.links);
|
||
all_images.extend(assets.images);
|
||
all_code_blocks.extend(assets.code_blocks);
|
||
|
||
if let Some(ref mut raw) = all_raw_html {
|
||
raw.push_str(&el.html());
|
||
}
|
||
}
|
||
}
|
||
|
||
Content {
|
||
markdown: all_md,
|
||
plain_text: all_plain,
|
||
links: all_links,
|
||
images: all_images,
|
||
code_blocks: all_code_blocks,
|
||
raw_html: all_raw_html,
|
||
}
|
||
}
|
||
|
||
/// Recover announcement banners that were stripped as noise.
|
||
/// Pattern: `<div role="region" aria-label="Announcement">` with short, meaningful text.
|
||
fn recover_announcements(
|
||
doc: &Html,
|
||
base_url: Option<&Url>,
|
||
markdown: &mut String,
|
||
links: &mut Vec<Link>,
|
||
) {
|
||
for el in doc.select(&ANNOUNCEMENT_SELECTOR) {
|
||
let label = el.value().attr("aria-label").unwrap_or("");
|
||
if !label.to_lowercase().contains("announcement") {
|
||
continue;
|
||
}
|
||
|
||
let text = el.text().collect::<String>();
|
||
let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
|
||
if text.is_empty() || markdown.contains(&text) {
|
||
continue;
|
||
}
|
||
|
||
// Build markdown for the announcement, including any links
|
||
let mut announcement = format!("> **{text}**");
|
||
for a in el.select(&A_SELECTOR) {
|
||
let link_text = a.text().collect::<String>().trim().to_string();
|
||
let href = a
|
||
.value()
|
||
.attr("href")
|
||
.map(|h| markdown::resolve_url(h, base_url))
|
||
.unwrap_or_default();
|
||
if !link_text.is_empty() && !href.is_empty() {
|
||
links.push(Link {
|
||
text: link_text,
|
||
href,
|
||
});
|
||
}
|
||
}
|
||
announcement.push_str("\n\n");
|
||
|
||
debug!("recovered announcement banner");
|
||
*markdown = format!("{announcement}{markdown}");
|
||
}
|
||
}
|
||
|
||
/// Recover the hero paragraph (mission/tagline) that's near the H1 but inside
|
||
/// a noise-stripped container like `<header>`. Walk siblings/cousins of the H1
|
||
/// to find a substantial `<p>` that isn't in the markdown.
|
||
fn recover_hero_paragraph(h1: ElementRef<'_>, markdown: &mut String) {
|
||
// Walk up to find a container that holds both H1 and sibling content
|
||
let mut node = h1.parent();
|
||
for _ in 0..4 {
|
||
let Some(parent) = node else { break };
|
||
let Some(parent_el) = ElementRef::wrap(parent) else {
|
||
node = parent.parent();
|
||
continue;
|
||
};
|
||
|
||
// Search all <p> descendants of this container
|
||
for descendant in parent_el.descendants() {
|
||
let Some(el) = ElementRef::wrap(descendant) else {
|
||
continue;
|
||
};
|
||
if el.value().name() != "p" {
|
||
continue;
|
||
}
|
||
let text = el
|
||
.text()
|
||
.collect::<String>()
|
||
.split_whitespace()
|
||
.collect::<Vec<_>>()
|
||
.join(" ");
|
||
// Only recover substantial paragraphs (taglines, mission statements)
|
||
if text.len() < 40 || text.len() > 300 {
|
||
continue;
|
||
}
|
||
if markdown.contains(&text) {
|
||
continue;
|
||
}
|
||
// Insert right after the H1 heading line
|
||
debug!(text = text.as_str(), "recovered hero paragraph");
|
||
let insert = format!("\n{text}\n");
|
||
if let Some(pos) = markdown.find('\n') {
|
||
markdown.insert_str(pos + 1, &insert);
|
||
} else {
|
||
markdown.push_str(&insert);
|
||
}
|
||
return;
|
||
}
|
||
node = parent.parent();
|
||
}
|
||
}
|
||
|
||
/// Recover <h2> headings that were stripped because their wrapper div had a
|
||
/// noise class like "header". If adjacent content from the same parent section
|
||
/// IS in the markdown, the heading should be there too.
|
||
fn recover_section_headings(doc: &Html, markdown: &mut String) {
|
||
for h2 in doc.select(&H2_SELECTOR) {
|
||
let h2_text = h2.text().collect::<String>().trim().to_string();
|
||
if h2_text.is_empty() || find_content_position(markdown, &h2_text).is_some() {
|
||
continue;
|
||
}
|
||
|
||
// Don't recover headings inside structural noise tags (nav, aside, footer,
|
||
// header). These are genuine noise — not false-positive class matches like
|
||
// <div class="section-header"> inside a content section.
|
||
if is_inside_structural_noise(h2) {
|
||
continue;
|
||
}
|
||
|
||
// Walk up to the nearest section/div parent, then check if any sibling
|
||
// content from that parent made it into the markdown.
|
||
let anchor = find_sibling_anchor_text(h2, markdown);
|
||
if let Some(anchor) = anchor {
|
||
debug!(
|
||
heading = h2_text.as_str(),
|
||
"recovered stripped section heading"
|
||
);
|
||
// Insert the heading before the anchor's content block.
|
||
// Walk backwards past short orphan lines (stat numbers etc.)
|
||
// that likely belong to the same section.
|
||
if let Some(pos) = find_content_position(markdown, &anchor) {
|
||
let line_start = markdown[..pos].rfind('\n').map_or(0, |p| p + 1);
|
||
let insert_pos = walk_back_past_orphans(markdown, line_start);
|
||
let heading_md = format!("## {h2_text}\n\n");
|
||
markdown.insert_str(insert_pos, &heading_md);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Also recover <p> "eyebrow" text (short taglines above section headings).
|
||
// These are typically inside the same noise-stripped wrapper as the <h2>.
|
||
// Eyebrows are short (e.g., "/the web access layer for agents") — skip full paragraphs.
|
||
for h2 in doc.select(&H2_SELECTOR) {
|
||
let h2_text = h2.text().collect::<String>().trim().to_string();
|
||
if h2_text.is_empty() || find_content_position(markdown, &h2_text).is_none() {
|
||
continue;
|
||
}
|
||
|
||
// Look for a preceding <p> sibling inside the same parent
|
||
if let Some(parent) = h2.parent().and_then(ElementRef::wrap) {
|
||
for child in parent.children() {
|
||
if let Some(child_el) = ElementRef::wrap(child) {
|
||
// Stop when we reach the h2 itself
|
||
if child_el == h2 {
|
||
break;
|
||
}
|
||
if child_el.value().name() == "p" {
|
||
let p_text = child_el.text().collect::<String>().trim().to_string();
|
||
// Only short text qualifies as an eyebrow — full paragraphs
|
||
// are regular content, not taglines.
|
||
if p_text.is_empty() || p_text.len() > 80 {
|
||
continue;
|
||
}
|
||
// Skip decorative route-style labels (e.g., "/proof is in
|
||
// the numbers", "/press room") — common design pattern, not content.
|
||
if p_text.starts_with('/') {
|
||
continue;
|
||
}
|
||
// Check against a stripped version of the markdown to handle
|
||
// formatting like **bold** that breaks plain-text matching.
|
||
let plain_md = strip_md_formatting(markdown);
|
||
if plain_md.contains(&p_text) {
|
||
continue;
|
||
}
|
||
{
|
||
// Insert the eyebrow text at the start of the heading's line
|
||
if let Some(pos) = find_content_position(markdown, &h2_text) {
|
||
let line_start = markdown[..pos].rfind('\n').map_or(0, |p| p + 1);
|
||
let eyebrow_md = format!("*{p_text}*\n\n");
|
||
markdown.insert_str(line_start, &eyebrow_md);
|
||
debug!(eyebrow = p_text.as_str(), "recovered eyebrow text");
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Find text from a sibling element (in the same section) that IS in the markdown.
|
||
/// This confirms the heading belongs to content we already captured.
|
||
fn find_sibling_anchor_text(heading: ElementRef<'_>, markdown: &str) -> Option<String> {
|
||
let heading_text = heading.text().collect::<String>();
|
||
|
||
// Walk up to find the containing section or significant parent
|
||
let mut node = heading.parent();
|
||
while let Some(parent) = node {
|
||
if let Some(parent_el) = ElementRef::wrap(parent) {
|
||
let tag = parent_el.value().name();
|
||
if tag == "section" || tag == "article" || tag == "main" || tag == "body" {
|
||
// Search descendant <p> and <h3> elements for text in the markdown.
|
||
// Using specific elements avoids the multiline blob issue from
|
||
// concatenating all text nodes of a large container.
|
||
for descendant in parent_el.descendants() {
|
||
if let Some(el) = ElementRef::wrap(descendant) {
|
||
let dtag = el.value().name();
|
||
if dtag != "p" && dtag != "h3" && dtag != "h4" {
|
||
continue;
|
||
}
|
||
// Normalize whitespace to match how the markdown converter collapses it
|
||
let el_text: String = el
|
||
.text()
|
||
.collect::<String>()
|
||
.split_whitespace()
|
||
.collect::<Vec<_>>()
|
||
.join(" ");
|
||
// Skip if this text is part of the heading itself
|
||
if el_text.is_empty() || heading_text.contains(&el_text) {
|
||
continue;
|
||
}
|
||
if el_text.len() > 15 && find_content_position(markdown, &el_text).is_some()
|
||
{
|
||
return Some(el_text);
|
||
}
|
||
}
|
||
}
|
||
break;
|
||
}
|
||
}
|
||
node = parent.parent();
|
||
}
|
||
None
|
||
}
|
||
|
||
/// Recover CTA (call-to-action) links and headings from footer sections.
|
||
/// Many sites have a "hero" CTA block in the footer with documentation links
|
||
/// or signup prompts. These are valuable content, not navigational noise.
|
||
fn recover_footer_cta(
|
||
doc: &Html,
|
||
base_url: Option<&Url>,
|
||
markdown: &mut String,
|
||
links: &mut Vec<Link>,
|
||
) {
|
||
for footer in doc.select(&FOOTER_SELECTOR) {
|
||
// Look for h2 headings in the footer (CTA headings like "Power your AI...")
|
||
for h2 in footer.select(&H2_SELECTOR) {
|
||
let h2_text = h2.text().collect::<String>().trim().to_string();
|
||
if h2_text.is_empty() || markdown.contains(&h2_text) {
|
||
continue;
|
||
}
|
||
// Skip meta headings (screen-reader-only "Footer", "Navigation")
|
||
let h2_lower = h2_text.to_lowercase();
|
||
if h2_lower == "footer" || h2_lower == "navigation" || h2_lower == "site map" {
|
||
continue;
|
||
}
|
||
// Skip screen-reader-only headings (sr-only, visually-hidden)
|
||
if let Some(class) = h2.value().attr("class") {
|
||
let cl = class.to_lowercase();
|
||
if cl.contains("sr-only")
|
||
|| cl.contains("visually-hidden")
|
||
|| cl.contains("screen-reader")
|
||
{
|
||
continue;
|
||
}
|
||
}
|
||
|
||
debug!(heading = h2_text.as_str(), "recovered footer CTA heading");
|
||
markdown.push_str(&format!("\n\n## {h2_text}\n\n"));
|
||
}
|
||
|
||
// Recover links that point to documentation or app URLs
|
||
for a in footer.select(&A_SELECTOR) {
|
||
let href = match a.value().attr("href") {
|
||
Some(h) => markdown::resolve_url(h, base_url),
|
||
None => continue,
|
||
};
|
||
let text = a.text().collect::<String>().trim().to_string();
|
||
if text.is_empty() || href.is_empty() {
|
||
continue;
|
||
}
|
||
|
||
// Only recover links to docs/app/API — not generic footer nav
|
||
let href_lower = href.to_lowercase();
|
||
let is_valuable_cta = href_lower.contains("docs.")
|
||
|| href_lower.contains("/docs")
|
||
|| href_lower.contains("app.")
|
||
|| href_lower.contains("/app")
|
||
|| href_lower.contains("api.");
|
||
|
||
if is_valuable_cta && !markdown.contains(&text) {
|
||
debug!(
|
||
text = text.as_str(),
|
||
href = href.as_str(),
|
||
"recovered footer CTA link"
|
||
);
|
||
markdown.push_str(&format!("[{text}]({href})\n\n"));
|
||
links.push(Link {
|
||
text: text.clone(),
|
||
href: href.clone(),
|
||
});
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Recover structured site navigation from footer when it has organized
|
||
/// link categories (Products, Solutions, Resources, etc.). This captures
|
||
/// the site's offering structure — useful for LLM queries like "what does
|
||
/// this company offer?" Only fires when the footer has 3+ categories.
|
||
fn recover_footer_sitemap(
|
||
doc: &Html,
|
||
base_url: Option<&Url>,
|
||
markdown: &mut String,
|
||
links: &mut Vec<Link>,
|
||
) {
|
||
for footer in doc.select(&FOOTER_SELECTOR) {
|
||
let mut categories: Vec<(String, Vec<(String, String)>)> = Vec::new();
|
||
|
||
for heading in footer.select(&FOOTER_HEADING_SELECTOR) {
|
||
let heading_text = heading.text().collect::<String>().trim().to_string();
|
||
if heading_text.is_empty() || heading_text.len() > 50 {
|
||
continue;
|
||
}
|
||
// Skip meta headings like "Footer" and headings already in the markdown
|
||
if heading_text.eq_ignore_ascii_case("footer") || markdown.contains(&heading_text) {
|
||
continue;
|
||
}
|
||
|
||
// Find links in the nearest container that holds both heading + link list.
|
||
// Try parent first, then grandparent (handles wrapper divs).
|
||
let cat_links = collect_sibling_links(heading, base_url);
|
||
// 2–20 links: too few = not a real category, too many = aggregate container
|
||
if cat_links.len() >= 2 && cat_links.len() <= 20 {
|
||
categories.push((heading_text, cat_links));
|
||
}
|
||
}
|
||
|
||
if categories.len() < 3 {
|
||
continue;
|
||
}
|
||
|
||
// Build compact sitemap — category name + comma-separated link text
|
||
let mut sitemap = String::from("\n\n---\n\n");
|
||
for (heading, cat_links) in &categories {
|
||
let names: Vec<&str> = cat_links.iter().map(|(t, _)| t.as_str()).collect();
|
||
sitemap.push_str(&format!("**{heading}**: {}\n", names.join(", ")));
|
||
|
||
for (text, href) in cat_links {
|
||
links.push(Link {
|
||
text: text.clone(),
|
||
href: href.clone(),
|
||
});
|
||
}
|
||
}
|
||
|
||
debug!(categories = categories.len(), "recovered footer sitemap");
|
||
markdown.push_str(&sitemap);
|
||
}
|
||
}
|
||
|
||
/// Collect links from the same container as a heading element.
|
||
/// Walks up the DOM to find the nearest ancestor that contains <a> elements.
|
||
fn collect_sibling_links(heading: ElementRef<'_>, base_url: Option<&Url>) -> Vec<(String, String)> {
|
||
let mut node = heading.parent();
|
||
// Try up to 2 levels (parent, grandparent) to find a link container
|
||
for _ in 0..2 {
|
||
let Some(parent) = node else { break };
|
||
let Some(parent_el) = ElementRef::wrap(parent) else {
|
||
node = parent.parent();
|
||
continue;
|
||
};
|
||
let a_elements: Vec<_> = parent_el.select(&A_SELECTOR).collect();
|
||
if a_elements.len() >= 2 {
|
||
return a_elements
|
||
.into_iter()
|
||
.filter_map(|a| {
|
||
let text = a.text().collect::<String>().trim().to_string();
|
||
let href = a
|
||
.value()
|
||
.attr("href")
|
||
.map(|h| markdown::resolve_url(h, base_url));
|
||
match (text.is_empty(), href) {
|
||
(false, Some(h))
|
||
if !h.is_empty()
|
||
&& text.len() > 1
|
||
&& text.len() < 60
|
||
&& !matches!(
|
||
text.to_lowercase().as_str(),
|
||
"here" | "link" | "click" | "more"
|
||
) =>
|
||
{
|
||
Some((text, h))
|
||
}
|
||
_ => None,
|
||
}
|
||
})
|
||
.collect();
|
||
}
|
||
node = parent.parent();
|
||
}
|
||
Vec::new()
|
||
}
|
||
|
||
/// Walk backwards from `pos` in markdown, skipping blank lines and short
|
||
/// orphan lines (<=25 chars, likely stat numbers or labels) that belong to
|
||
/// the same section. Stops at headings, long content lines, or start of string.
|
||
fn walk_back_past_orphans(markdown: &str, mut pos: usize) -> usize {
|
||
loop {
|
||
if pos == 0 {
|
||
break;
|
||
}
|
||
// Find the previous line
|
||
let prev_end = pos.saturating_sub(1); // skip the \n
|
||
let prev_start = markdown[..prev_end].rfind('\n').map_or(0, |p| p + 1);
|
||
let prev_line = markdown[prev_start..prev_end].trim();
|
||
|
||
if prev_line.is_empty() {
|
||
pos = prev_start;
|
||
continue;
|
||
}
|
||
if prev_line.starts_with('#') || prev_line.starts_with('>') || prev_line.len() > 25 {
|
||
break;
|
||
}
|
||
// Short non-structural line — likely a stat number, include it
|
||
pos = prev_start;
|
||
}
|
||
pos
|
||
}
|
||
|
||
/// Quick strip of markdown bold/italic markers for plain-text comparison.
|
||
fn strip_md_formatting(md: &str) -> String {
|
||
md.replace("**", "").replace('*', "")
|
||
}
|
||
|
||
/// Find `needle` in `markdown` only at a position that isn't inside image/link
|
||
/// alt text (``). Returns the byte offset or None.
|
||
fn find_content_position(markdown: &str, needle: &str) -> Option<usize> {
|
||
let mut search_from = 0;
|
||
while let Some(pos) = markdown[search_from..].find(needle) {
|
||
let abs_pos = search_from + pos;
|
||
if !is_inside_image_syntax(markdown, abs_pos) {
|
||
return Some(abs_pos);
|
||
}
|
||
search_from = abs_pos + 1;
|
||
}
|
||
None
|
||
}
|
||
|
||
/// Check if a position in markdown falls inside `` image syntax.
|
||
fn is_inside_image_syntax(markdown: &str, pos: usize) -> bool {
|
||
// Walk backwards from pos to find the nearest unmatched `![`
|
||
let before = &markdown[..pos];
|
||
// Find the last `;
|
||
while i > 0 {
|
||
i -= 1;
|
||
if i > 0 && before.as_bytes()[i - 1] == b'!' && before.as_bytes()[i] == b'[' {
|
||
// Found ` {
|
||
return true;
|
||
}
|
||
}
|
||
// If we hit a `)` that closes a previous image, stop searching
|
||
if before.as_bytes()[i] == b')' {
|
||
break;
|
||
}
|
||
}
|
||
false
|
||
}
|
||
|
||
/// Check if an element is inside a structural noise tag (nav, aside, footer, header).
|
||
/// Unlike class-based noise (e.g., <div class="header">), these are strong signals
|
||
/// that the content is genuinely non-content and should NOT be recovered.
|
||
const STRUCTURAL_NOISE_TAGS: &[&str] = &["nav", "aside", "footer", "header"];
|
||
|
||
fn is_inside_structural_noise(el: ElementRef<'_>) -> bool {
|
||
let mut node = el.parent();
|
||
while let Some(parent) = node {
|
||
if let Some(parent_el) = ElementRef::wrap(parent) {
|
||
let tag = parent_el.value().name();
|
||
if STRUCTURAL_NOISE_TAGS.contains(&tag) {
|
||
return true;
|
||
}
|
||
// Also check role-based structural noise
|
||
if let Some(role) = parent_el.value().attr("role")
|
||
&& (role == "navigation" || role == "contentinfo")
|
||
{
|
||
return true;
|
||
}
|
||
}
|
||
node = parent.parent();
|
||
}
|
||
false
|
||
}
|
||
|
||
/// Score each candidate node and return the best one.
|
||
fn find_best_node(doc: &Html) -> Option<ElementRef<'_>> {
|
||
let mut best: Option<(ElementRef<'_>, f64)> = None;
|
||
|
||
for candidate in doc.select(&CANDIDATE_SELECTOR) {
|
||
if noise::is_noise(candidate) || noise::is_noise_descendant(candidate) {
|
||
continue;
|
||
}
|
||
|
||
let score = score_node(candidate);
|
||
|
||
if score > 0.0 && best.as_ref().is_none_or(|(_, s)| score > *s) {
|
||
best = Some((candidate, score));
|
||
}
|
||
}
|
||
|
||
best.map(|(el, score)| {
|
||
debug!(score, tag = el.value().name(), "best content candidate");
|
||
el
|
||
})
|
||
}
|
||
|
||
fn score_node(el: ElementRef<'_>) -> f64 {
|
||
let text = el.text().collect::<String>();
|
||
let text_len = text.len() as f64;
|
||
|
||
// Very short nodes aren't content
|
||
if text_len < 50.0 {
|
||
return 0.0;
|
||
}
|
||
|
||
let mut score = 0.0;
|
||
|
||
// Base score: text length (log scale to avoid huge nodes dominating purely by size)
|
||
score += text_len.ln();
|
||
|
||
// Bonus for <article> or <main> — these are strong semantic signals
|
||
let tag = el.value().name();
|
||
match tag {
|
||
"article" => score += 50.0,
|
||
"main" => score += 50.0,
|
||
_ => {}
|
||
}
|
||
|
||
// Bonus for role="main"
|
||
if el.value().attr("role") == Some("main") {
|
||
score += 50.0;
|
||
}
|
||
|
||
// Bonus for common content class/id patterns
|
||
if let Some(class) = el.value().attr("class") {
|
||
let cl = class.to_lowercase();
|
||
if cl.contains("content")
|
||
|| cl.contains("article")
|
||
|| cl.contains("post")
|
||
|| cl.contains("entry")
|
||
{
|
||
score += 25.0;
|
||
}
|
||
}
|
||
if let Some(id) = el.value().attr("id") {
|
||
let id = id.to_lowercase();
|
||
if id.contains("content")
|
||
|| id.contains("article")
|
||
|| id.contains("post")
|
||
|| id.contains("main")
|
||
{
|
||
score += 25.0;
|
||
}
|
||
}
|
||
|
||
// Paragraph density: count <p> children — real content has paragraphs
|
||
let p_count = el.select(&P_SELECTOR).count() as f64;
|
||
score += p_count * 3.0;
|
||
|
||
// Link density penalty: nodes that are mostly links (nav, footer) score low.
|
||
// link_text_len / total_text_len — lower is better for content.
|
||
let link_text_len: f64 = el
|
||
.select(&A_SELECTOR)
|
||
.map(|a| a.text().collect::<String>().len() as f64)
|
||
.sum();
|
||
|
||
// Semantic nodes (article, main, role=main) get milder link density penalties.
|
||
// Documentation pages often have high link density from TOCs inside the main
|
||
// content container — these are expected, not spam.
|
||
let is_semantic = matches!(tag, "article" | "main") || el.value().attr("role") == Some("main");
|
||
|
||
if text_len > 0.0 {
|
||
let link_density = link_text_len / text_len;
|
||
if is_semantic {
|
||
// Semantic nodes: only penalize extreme link density
|
||
if link_density > 0.7 {
|
||
score *= 0.3;
|
||
} else if link_density > 0.5 {
|
||
score *= 0.5;
|
||
}
|
||
} else {
|
||
// Generic divs: heavy penalty for link-dense content
|
||
if link_density > 0.5 {
|
||
score *= 0.1;
|
||
} else if link_density > 0.3 {
|
||
score *= 0.5;
|
||
}
|
||
}
|
||
}
|
||
|
||
score
|
||
}
|
||
|
||
/// Count words in text (for word_count metadata).
|
||
pub fn word_count(text: &str) -> usize {
|
||
text.split_whitespace().count()
|
||
}
|
||
|
||
#[cfg(test)]
|
||
mod tests {
|
||
use super::*;
|
||
|
||
fn parse(html: &str) -> Html {
|
||
Html::parse_document(html)
|
||
}
|
||
|
||
/// Helper: extract with default options (backward-compatible).
|
||
fn extract_default(doc: &Html, base_url: Option<&Url>) -> Content {
|
||
extract_content(doc, base_url, &ExtractionOptions::default())
|
||
}
|
||
|
||
#[test]
|
||
fn picks_article_over_nav() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<nav><ul><li><a href="/">Home</a></li><li><a href="/about">About</a></li></ul></nav>
|
||
<article>
|
||
<h1>Real Article</h1>
|
||
<p>This is the main content of the page. It contains several paragraphs
|
||
of text that make it clearly the main content area.</p>
|
||
<p>Another paragraph with useful information for the reader.</p>
|
||
<p>And a third paragraph to make it really obvious this is content.</p>
|
||
</article>
|
||
<aside class="sidebar">
|
||
<h3>Related Links</h3>
|
||
<ul><li><a href="/1">Link 1</a></li></ul>
|
||
</aside>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
assert!(content.markdown.contains("Real Article"));
|
||
assert!(content.markdown.contains("main content"));
|
||
}
|
||
|
||
#[test]
|
||
fn falls_back_to_body() {
|
||
let html = r##"<html><body><p>Simple page with just a paragraph.</p></body></html>"##;
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
assert!(content.plain_text.contains("Simple page"));
|
||
}
|
||
|
||
#[test]
|
||
fn word_count_works() {
|
||
assert_eq!(word_count("hello world foo bar"), 4);
|
||
assert_eq!(word_count(""), 0);
|
||
assert_eq!(word_count(" spaces everywhere "), 2);
|
||
}
|
||
|
||
#[test]
|
||
fn prefers_content_class() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<div class="header"><p>Site header with some branding text content here</p></div>
|
||
<div class="content">
|
||
<h1>Main Content</h1>
|
||
<p>This is the primary content of the page that readers want to see.
|
||
It has multiple sentences and meaningful paragraphs.</p>
|
||
<p>Second paragraph with additional details and context for the article.</p>
|
||
<p>Third paragraph because real articles have substantial text.</p>
|
||
</div>
|
||
<div class="footer"><p>Footer stuff with copyright and legal text here</p></div>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
assert!(content.markdown.contains("Main Content"));
|
||
}
|
||
|
||
/// Simulates a Wikipedia-like page where the best content node (article/main)
|
||
/// contains a nav sidebar as a child. The markdown converter must strip it.
|
||
#[test]
|
||
fn wikipedia_like_nav_sidebar_stripped() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<article>
|
||
<h1>Rust (programming language)</h1>
|
||
<nav class="sidebar-toc">
|
||
<h2>Contents</h2>
|
||
<ul>
|
||
<li><a href="#history">History</a></li>
|
||
<li><a href="#syntax">Syntax</a></li>
|
||
<li><a href="#features">Features</a></li>
|
||
</ul>
|
||
</nav>
|
||
<aside class="infobox">
|
||
<p>Developer: Mozilla Research</p>
|
||
<p>First appeared: 2010</p>
|
||
</aside>
|
||
<p>Rust is a multi-paradigm programming language focused on performance
|
||
and safety, especially safe concurrency. It accomplishes these goals
|
||
without a garbage collector.</p>
|
||
<p>Rust was originally designed by Graydon Hoare at Mozilla Research,
|
||
with contributions from several other developers.</p>
|
||
<p>The language grew out of a personal project begun in 2006 by Mozilla
|
||
employee Graydon Hoare, who stated that it was possibly named after
|
||
the rust family of fungi.</p>
|
||
</article>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
// Article content preserved
|
||
assert!(content.markdown.contains("Rust (programming language)"));
|
||
assert!(
|
||
content
|
||
.markdown
|
||
.contains("multi-paradigm programming language")
|
||
);
|
||
assert!(content.markdown.contains("Graydon Hoare"));
|
||
|
||
// Nav sidebar stripped
|
||
assert!(
|
||
!content.markdown.contains("Contents"),
|
||
"TOC nav heading leaked"
|
||
);
|
||
assert!(
|
||
!content.markdown.contains("#history"),
|
||
"TOC nav link leaked"
|
||
);
|
||
|
||
// Aside infobox stripped
|
||
assert!(
|
||
!content.markdown.contains("First appeared"),
|
||
"infobox aside leaked"
|
||
);
|
||
}
|
||
|
||
/// When the best node is a large div that happens to contain script tags,
|
||
/// the JS code must not appear in the markdown.
|
||
#[test]
|
||
fn script_inside_content_node_stripped() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<main>
|
||
<h1>Interactive Article</h1>
|
||
<p>This article has some embedded JavaScript for interactivity.
|
||
The content itself is what we want to extract, not the code.</p>
|
||
<script>
|
||
window.__NEXT_DATA__ = {"props":{"pageProps":{"article":{"id":123}}}};
|
||
document.addEventListener('DOMContentLoaded', function() {
|
||
initializeApp();
|
||
});
|
||
</script>
|
||
<p>The article continues with more useful information for readers
|
||
who want to learn about the topic.</p>
|
||
<style>.highlight { background: yellow; }</style>
|
||
</main>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
assert!(content.markdown.contains("Interactive Article"));
|
||
assert!(content.markdown.contains("embedded JavaScript"));
|
||
assert!(content.markdown.contains("continues with more"));
|
||
assert!(
|
||
!content.markdown.contains("NEXT_DATA"),
|
||
"script content leaked"
|
||
);
|
||
assert!(
|
||
!content.markdown.contains("initializeApp"),
|
||
"JS function call leaked"
|
||
);
|
||
assert!(
|
||
!content.markdown.contains("background: yellow"),
|
||
"CSS leaked"
|
||
);
|
||
}
|
||
|
||
/// Full-page simulation: header, nav, main content, footer.
|
||
/// Only the main content should survive.
|
||
#[test]
|
||
fn full_page_noise_stripped() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<header>
|
||
<div class="logo">MySite</div>
|
||
<nav>
|
||
<a href="/">Home</a>
|
||
<a href="/blog">Blog</a>
|
||
<a href="/about">About</a>
|
||
</nav>
|
||
</header>
|
||
<main>
|
||
<article>
|
||
<h1>How to Write Clean Code</h1>
|
||
<p>Writing clean code is an essential skill for every developer.
|
||
It makes your codebase easier to maintain and understand.</p>
|
||
<p>In this article, we will explore several principles that can
|
||
help you write better, more readable code.</p>
|
||
<p>The first principle is to use meaningful variable names that
|
||
clearly describe what the variable holds.</p>
|
||
</article>
|
||
</main>
|
||
<footer>
|
||
<p>Copyright 2025 MySite</p>
|
||
<a href="/privacy">Privacy</a>
|
||
<a href="/terms">Terms</a>
|
||
</footer>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
assert!(content.markdown.contains("How to Write Clean Code"));
|
||
assert!(content.markdown.contains("meaningful variable names"));
|
||
assert!(
|
||
!content.markdown.contains("MySite"),
|
||
"header/footer branding leaked"
|
||
);
|
||
assert!(!content.markdown.contains("Privacy"), "footer link leaked");
|
||
assert!(!content.markdown.contains("Blog"), "nav link leaked");
|
||
}
|
||
|
||
/// H1 in a hero/banner section outside the main content node should be
|
||
/// captured and prepended to the markdown output.
|
||
#[test]
|
||
fn h1_outside_content_node_captured() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<div class="hero-banner">
|
||
<h1>The Ultimate Guide to Async Rust</h1>
|
||
<p class="subtitle">Everything you need to know</p>
|
||
</div>
|
||
<article>
|
||
<p>Asynchronous programming in Rust is powered by the async/await
|
||
syntax and the Future trait. This guide covers all the fundamentals
|
||
you need to get started with async Rust.</p>
|
||
<p>We will explore tokio, the most popular async runtime, and show
|
||
you how to build concurrent applications efficiently.</p>
|
||
<p>By the end of this guide you will understand how to write
|
||
performant async code that handles thousands of connections.</p>
|
||
</article>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
// H1 must appear in markdown even though it's outside <article>
|
||
assert!(
|
||
content
|
||
.markdown
|
||
.contains("The Ultimate Guide to Async Rust"),
|
||
"H1 from hero banner missing from output"
|
||
);
|
||
// Should be prepended as a heading
|
||
assert!(
|
||
content
|
||
.markdown
|
||
.starts_with("# The Ultimate Guide to Async Rust"),
|
||
"H1 should be prepended as markdown heading"
|
||
);
|
||
// Article content still present
|
||
assert!(content.markdown.contains("async/await"));
|
||
assert!(content.markdown.contains("tokio"));
|
||
}
|
||
|
||
/// Announcement banners with role="region" and aria-label="Announcement"
|
||
/// should be recovered even though their class contains "banner" (noise).
|
||
#[test]
|
||
fn announcement_banner_recovered() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<div class="announcement-banner" role="region" aria-label="Announcement">
|
||
<p>Big news! We are joining forces with Acme Corp -
|
||
read more in <a href="https://example.com/blog">our blog</a></p>
|
||
</div>
|
||
<header><nav><a href="/">Home</a></nav></header>
|
||
<article>
|
||
<h1>Our Product</h1>
|
||
<p>We build amazing tools for developers that simplify
|
||
complex workflows and boost productivity every day.</p>
|
||
<p>Our platform handles millions of requests per second
|
||
with low latency and high reliability.</p>
|
||
</article>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap()));
|
||
|
||
assert!(
|
||
content.markdown.contains("joining forces with Acme Corp"),
|
||
"Announcement banner text missing from output"
|
||
);
|
||
assert!(
|
||
content.markdown.contains("Our Product"),
|
||
"Main content missing"
|
||
);
|
||
// The announcement link should be captured
|
||
assert!(
|
||
content
|
||
.links
|
||
.iter()
|
||
.any(|l| l.href.contains("example.com/blog")),
|
||
"Announcement link not captured"
|
||
);
|
||
}
|
||
|
||
/// Section headings inside <div class="...header"> wrappers should be
|
||
/// recovered when sibling content from the same section is in the output.
|
||
#[test]
|
||
fn section_heading_in_header_class_recovered() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<div class="page-wrapper">
|
||
<section class="features">
|
||
<div class="section-header">
|
||
<h2>Built for scale</h2>
|
||
</div>
|
||
<div class="feature-grid">
|
||
<p>Handle thousands of concurrent requests with
|
||
intelligent load balancing and automatic failover.</p>
|
||
<p>Deploy globally with edge locations in every
|
||
major region for minimal latency.</p>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
assert!(
|
||
content.markdown.contains("## Built for scale"),
|
||
"Section heading should be recovered: {}",
|
||
content.markdown
|
||
);
|
||
assert!(
|
||
content.markdown.contains("concurrent requests"),
|
||
"Section content missing"
|
||
);
|
||
}
|
||
|
||
/// Eyebrow text (short tagline above a section heading) should be
|
||
/// recovered when it's inside the same noise-stripped wrapper as the <h2>.
|
||
#[test]
|
||
fn eyebrow_text_recovered() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<div class="page-wrapper">
|
||
<section class="users-section">
|
||
<div class="section-header">
|
||
<p class="eyebrow">the platform for builders</p>
|
||
<h2>Loved by developers worldwide</h2>
|
||
</div>
|
||
<div class="grid">
|
||
<p>Thousands of teams rely on our platform daily for
|
||
mission-critical applications and workflows.</p>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
assert!(
|
||
content.markdown.contains("the platform for builders"),
|
||
"Eyebrow text missing: {}",
|
||
content.markdown
|
||
);
|
||
assert!(
|
||
content.markdown.contains("Loved by developers worldwide"),
|
||
"Section heading missing"
|
||
);
|
||
}
|
||
|
||
/// Decorative route-style labels (starting with "/") should NOT be recovered
|
||
/// as eyebrow text — they're design elements, not content.
|
||
#[test]
|
||
fn route_style_eyebrow_not_recovered() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<div class="page-wrapper">
|
||
<section>
|
||
<div class="section-header">
|
||
<p class="eyebrow">/proof is in the numbers</p>
|
||
<h2>Trusted in production</h2>
|
||
</div>
|
||
<div class="grid">
|
||
<p>Our platform handles millions of requests per second
|
||
with low latency and high reliability.</p>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
// With exact class matching, "section-header" is NOT noise
|
||
// (only exact "header" class would be). The eyebrow text is now
|
||
// preserved, which is correct — it's content, not navigation.
|
||
assert!(
|
||
content.markdown.contains("Trusted in production"),
|
||
"Section heading should be recovered"
|
||
);
|
||
assert!(
|
||
content.markdown.contains("Our platform"),
|
||
"Grid content should be present"
|
||
);
|
||
}
|
||
|
||
/// Footer CTA links to documentation URLs should be recovered.
|
||
#[test]
|
||
fn footer_cta_link_recovered() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<article>
|
||
<h1>Our Platform</h1>
|
||
<p>Build powerful applications with our comprehensive API
|
||
and developer tools that handle millions of requests.</p>
|
||
<p>Get started in minutes with our quickstart guide and
|
||
extensive documentation for every feature.</p>
|
||
</article>
|
||
<footer>
|
||
<h2>Start building today</h2>
|
||
<a href="https://docs.example.com">Explore API Docs</a>
|
||
<a href="https://app.example.com">Try it free</a>
|
||
<a href="/privacy">Privacy</a>
|
||
<a href="/terms">Terms</a>
|
||
</footer>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap()));
|
||
|
||
assert!(
|
||
content.markdown.contains("Start building today"),
|
||
"Footer CTA heading missing: {}",
|
||
content.markdown
|
||
);
|
||
assert!(
|
||
content.markdown.contains("Explore API Docs"),
|
||
"Footer CTA link missing"
|
||
);
|
||
// Non-doc footer links should NOT be recovered
|
||
assert!(
|
||
!content.markdown.contains("Privacy"),
|
||
"Generic footer nav leaked"
|
||
);
|
||
assert!(
|
||
!content.markdown.contains("Terms"),
|
||
"Generic footer nav leaked"
|
||
);
|
||
}
|
||
|
||
/// Headings inside genuine noise (nav, aside) should NOT be recovered,
|
||
/// even when sibling content exists in the output.
|
||
#[test]
|
||
fn heading_inside_nav_not_recovered() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<article>
|
||
<h1>Programming Guide</h1>
|
||
<nav class="table-of-contents">
|
||
<h2>Table of Contents</h2>
|
||
<ul>
|
||
<li><a href="#ch1">Chapter 1</a></li>
|
||
<li><a href="#ch2">Chapter 2</a></li>
|
||
</ul>
|
||
</nav>
|
||
<p>This comprehensive guide covers everything you need
|
||
to know about modern programming practices.</p>
|
||
<p>From basics to advanced topics, we will explore
|
||
patterns and techniques used by professionals.</p>
|
||
</article>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
assert!(
|
||
!content.markdown.contains("Table of Contents"),
|
||
"TOC heading from nav should not be recovered: {}",
|
||
content.markdown
|
||
);
|
||
assert!(content.markdown.contains("comprehensive guide"));
|
||
}
|
||
|
||
/// Structured footer sitemaps (3+ categories with headings) should be
|
||
/// recovered as a compact reference section.
|
||
#[test]
|
||
fn footer_sitemap_recovered() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<article>
|
||
<h1>Our Company</h1>
|
||
<p>We build tools that help developers create amazing applications
|
||
faster and more efficiently than ever before.</p>
|
||
<p>Join thousands of teams who trust our platform for their
|
||
mission-critical workloads every single day.</p>
|
||
</article>
|
||
<footer>
|
||
<div class="col">
|
||
<h3>Products</h3>
|
||
<a href="/product-a">Product A</a>
|
||
<a href="/product-b">Product B</a>
|
||
<a href="/product-c">Product C</a>
|
||
</div>
|
||
<div class="col">
|
||
<h3>Solutions</h3>
|
||
<a href="/enterprise">Enterprise</a>
|
||
<a href="/startup">Startup</a>
|
||
<a href="/education">Education</a>
|
||
</div>
|
||
<div class="col">
|
||
<h3>Resources</h3>
|
||
<a href="/blog">Blog</a>
|
||
<a href="/docs">Documentation</a>
|
||
<a href="/community">Community</a>
|
||
</div>
|
||
</footer>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap()));
|
||
|
||
// Categories should be captured
|
||
assert!(
|
||
content.markdown.contains("Products"),
|
||
"Footer sitemap Products missing: {}",
|
||
content.markdown
|
||
);
|
||
assert!(
|
||
content.markdown.contains("Product A"),
|
||
"Footer sitemap link missing"
|
||
);
|
||
assert!(
|
||
content.markdown.contains("Solutions"),
|
||
"Footer sitemap Solutions missing"
|
||
);
|
||
assert!(
|
||
content.markdown.contains("Resources"),
|
||
"Footer sitemap Resources missing"
|
||
);
|
||
// Main content still present
|
||
assert!(content.markdown.contains("Our Company"));
|
||
}
|
||
|
||
/// Footer sitemaps with fewer than 3 categories should NOT be recovered
|
||
/// (not enough structure to be confident it's a sitemap).
|
||
#[test]
|
||
fn small_footer_not_treated_as_sitemap() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<article>
|
||
<h1>Simple Page</h1>
|
||
<p>This is a simple page with minimal footer structure that
|
||
should not trigger sitemap recovery at all.</p>
|
||
<p>The content here is what matters, not the footer links
|
||
or navigation elements below the main content.</p>
|
||
</article>
|
||
<footer>
|
||
<h3>Legal</h3>
|
||
<a href="/privacy">Privacy</a>
|
||
<a href="/terms">Terms</a>
|
||
</footer>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, None);
|
||
|
||
assert!(
|
||
!content.markdown.contains("Legal"),
|
||
"Small footer should not be treated as sitemap: {}",
|
||
content.markdown
|
||
);
|
||
}
|
||
|
||
/// Screen-reader-only footer headings (like "Footer") should not leak.
|
||
#[test]
|
||
fn sr_only_footer_heading_not_recovered() {
|
||
let html = r##"
|
||
<html>
|
||
<body>
|
||
<article>
|
||
<h1>Our Platform</h1>
|
||
<p>Build powerful applications with our comprehensive API
|
||
and developer tools that handle millions of requests.</p>
|
||
<p>Get started in minutes with our quickstart guide and
|
||
extensive documentation for every feature.</p>
|
||
</article>
|
||
<footer>
|
||
<h2 class="u-sr-only">Footer</h2>
|
||
<a href="https://docs.example.com">Explore API Docs</a>
|
||
</footer>
|
||
</body>
|
||
</html>"##;
|
||
|
||
let doc = parse(html);
|
||
let content = extract_default(&doc, Some(&Url::parse("https://example.com").unwrap()));
|
||
|
||
assert!(
|
||
!content.markdown.contains("## Footer"),
|
||
"SR-only 'Footer' heading should not be recovered: {}",
|
||
content.markdown
|
||
);
|
||
}
|
||
}
|
||
|
||
#[cfg(test)]
|
||
mod form_integration_tests {
|
||
use super::*;
|
||
|
||
#[test]
|
||
fn aspnet_form_content_extraction() {
|
||
let content = "x".repeat(600); // Ensure >500 chars
|
||
let html = format!(
|
||
r#"<html><body>
|
||
<form method="post" action="./page.aspx" id="form1">
|
||
<div class="wrapper">
|
||
<div class="header"><a href="/">Logo</a></div>
|
||
<div class="content">
|
||
<h2>Section</h2>
|
||
<h3>Question?</h3>
|
||
<p>{content}</p>
|
||
</div>
|
||
</div>
|
||
</form>
|
||
</body></html>"#
|
||
);
|
||
let doc = Html::parse_document(&html);
|
||
let opts = ExtractionOptions::default();
|
||
let result = extract_content(&doc, None, &opts);
|
||
assert!(
|
||
result.markdown.contains("Section"),
|
||
"h2 missing from markdown"
|
||
);
|
||
assert!(
|
||
result.markdown.contains("Question"),
|
||
"h3 missing from markdown"
|
||
);
|
||
}
|
||
|
||
/// Simulate unclosed header div absorbing the content div.
|
||
/// The header's noise class should NOT propagate to the absorbed content
|
||
/// because the safety valve detects the header has >5000 chars (broken wrapper).
|
||
#[test]
|
||
fn unclosed_header_div_does_not_swallow_content() {
|
||
let faq = "Lorem ipsum dolor sit amet. ".repeat(300); // ~8400 chars
|
||
// The header div is intentionally NOT closed — the HTML parser makes
|
||
// div.content a child of div.header. The safety valve (>5000 chars)
|
||
// should prevent div.header from being treated as noise.
|
||
let html = format!(
|
||
r#"<html><body>
|
||
<div class="wrapper">
|
||
<div class="header"><a href="/">Logo</a>
|
||
<div class="content">
|
||
<h2>FAQ Section</h2>
|
||
<h3>First question?</h3>
|
||
<p>{faq}</p>
|
||
</div>
|
||
</div>
|
||
</body></html>"#
|
||
);
|
||
let doc = Html::parse_document(&html);
|
||
let opts = ExtractionOptions::default();
|
||
let result = extract_content(&doc, None, &opts);
|
||
assert!(
|
||
result.markdown.contains("FAQ Section"),
|
||
"h2 missing: header swallowed content"
|
||
);
|
||
assert!(
|
||
result.markdown.contains("First question"),
|
||
"h3 missing: header swallowed content"
|
||
);
|
||
}
|
||
}
|