/// Metadata extraction from HTML . /// Prioritizes Open Graph and Twitter Card tags, falls back to standard meta tags. use scraper::{Html, Selector}; use crate::types::Metadata; /// Selectors are cheap to compile but we call them often — cache with once_cell. macro_rules! selector { ($s:expr) => {{ use once_cell::sync::Lazy; static SEL: Lazy = Lazy::new(|| Selector::parse($s).unwrap()); &*SEL }}; } pub fn extract(doc: &Html, url: Option<&str>) -> Metadata { let title = og_meta(doc, "og:title") .or_else(|| meta_name(doc, "twitter:title")) .or_else(|| title_tag(doc)); let description = og_meta(doc, "og:description") .or_else(|| meta_name(doc, "twitter:description")) .or_else(|| meta_name(doc, "description")); let author = meta_name(doc, "author").or_else(|| og_meta(doc, "article:author")); let published_date = og_meta(doc, "article:published_time") .or_else(|| meta_name(doc, "date")) .or_else(|| meta_name(doc, "publication_date")); // Search the whole document for — root_element() IS the // node in scraper, so selecting "html" from it finds nothing (no nested ). let language = doc .select(selector!("html")) .next() .and_then(|el| el.value().attr("lang")) .map(|s| s.to_string()); let site_name = og_meta(doc, "og:site_name"); let image = og_meta(doc, "og:image").or_else(|| meta_name(doc, "twitter:image")); let favicon = extract_favicon(doc); Metadata { title, description, author, published_date, language, url: url.map(String::from), site_name, image, favicon, word_count: 0, // filled later by the extractor } } /// fn og_meta(doc: &Html, property: &str) -> Option { // OG tags use property= not name= doc.select(selector!("meta[property]")) .find(|el| el.value().attr("property") == Some(property)) .and_then(|el| el.value().attr("content")) .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) } /// fn meta_name(doc: &Html, name: &str) -> Option { doc.select(selector!("meta[name]")) .find(|el| { el.value() .attr("name") .is_some_and(|n| n.eq_ignore_ascii_case(name)) }) .and_then(|el| el.value().attr("content")) .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) } fn title_tag(doc: &Html) -> Option { doc.select(selector!("title")) .next() .map(|el| el.text().collect::().trim().to_string()) .filter(|s| !s.is_empty()) } fn extract_favicon(doc: &Html) -> Option { // or doc.select(selector!("link[rel]")) .find(|el| el.value().attr("rel").is_some_and(|r| r.contains("icon"))) .and_then(|el| el.value().attr("href")) .map(|s| s.to_string()) } #[cfg(test)] mod tests { use super::*; fn parse(html: &str) -> Html { Html::parse_document(html) } #[test] fn extracts_basic_metadata() { let html = r#" Test Page "#; let doc = parse(html); let meta = extract(&doc, Some("https://example.com")); // OG title wins over assert_eq!(meta.title.as_deref(), Some("OG Title")); assert_eq!(meta.description.as_deref(), Some("A test page")); assert_eq!(meta.author.as_deref(), Some("Alice")); assert_eq!(meta.published_date.as_deref(), Some("2025-01-15")); assert_eq!(meta.language.as_deref(), Some("en")); assert_eq!(meta.site_name.as_deref(), Some("Example")); assert_eq!( meta.image.as_deref(), Some("https://img.example.com/og.png") ); assert_eq!(meta.favicon.as_deref(), Some("/favicon.ico")); assert_eq!(meta.url.as_deref(), Some("https://example.com")); } #[test] fn falls_back_to_title_tag() { let html = r#"<html><head><title>Fallback Title"#; let doc = parse(html); let meta = extract(&doc, None); assert_eq!(meta.title.as_deref(), Some("Fallback Title")); } #[test] fn handles_missing_metadata_gracefully() { let html = r#""#; let doc = parse(html); let meta = extract(&doc, None); assert!(meta.title.is_none()); assert!(meta.description.is_none()); assert!(meta.language.is_none()); } }