/// Metadata extraction from HTML documents.
/// Prioritizes Open Graph and Twitter Card tags, falls back to standard meta tags.
use scraper::{Html, Selector};
use crate::types::Metadata;
/// Yields a `&'static Selector` for the CSS selector string `$s`.
///
/// Every expansion site gets its own lazily-initialised `static`, so the
/// selector is parsed on first use and reused on every later call from that
/// site. Parsing is cheap, but these lookups run often enough that caching
/// with `once_cell` is worthwhile.
///
/// `$s` is evaluated inside a `static` initialiser, so it must not refer to
/// local variables; in practice it is a string literal.
///
/// # Panics
///
/// Panics on first use if `$s` is not a valid CSS selector.
macro_rules! selector {
    ($s:expr) => {{
        use once_cell::sync::Lazy;
        static SEL: Lazy<Selector> = Lazy::new(|| {
            // An invalid selector here is a programming error, not a runtime condition.
            Selector::parse($s).expect("selector! requires a valid CSS selector")
        });
        &*SEL
    }};
}
/// Builds a [`Metadata`] record from a parsed HTML document.
///
/// For each field the sources are tried in order and the first one that
/// yields a value wins:
///
/// - `title`: `og:title`, `twitter:title`, then the `<title>` element
/// - `description`: `og:description`, `twitter:description`, `description`
/// - `author`: `author`, then `article:author`
/// - `published_date`: `article:published_time`, `date`, `publication_date`
/// - `language`: the `lang` attribute of the `<html>` element
/// - `site_name`: `og:site_name`
/// - `image`: `og:image`, then `twitter:image`
/// - `favicon`: whatever `extract_favicon` returns
///
/// `url` is copied into the result as-is. `word_count` is set to `0` here.
pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
    let title = og_meta(doc, "og:title")
        .or_else(|| meta_name(doc, "twitter:title"))
        .or_else(|| title_tag(doc));

    let description = og_meta(doc, "og:description")
        .or_else(|| meta_name(doc, "twitter:description"))
        .or_else(|| meta_name(doc, "description"));

    let author = meta_name(doc, "author").or_else(|| og_meta(doc, "article:author"));

    let published_date = og_meta(doc, "article:published_time")
        .or_else(|| meta_name(doc, "date"))
        .or_else(|| meta_name(doc, "publication_date"));

    // Search the whole document for <html>: root_element() IS the <html>
    // node in scraper, so selecting "html" from it finds nothing (there is no
    // nested <html>).
    //
    // The value is trimmed and a blank `lang=""` is reported as `None`, the
    // same way the other text fields treat blank content.
    let language = doc
        .select(selector!("html"))
        .next()
        .and_then(|el| el.value().attr("lang"))
        .map(str::trim)
        .filter(|lang| !lang.is_empty())
        .map(str::to_owned);

    let site_name = og_meta(doc, "og:site_name");
    let image = og_meta(doc, "og:image").or_else(|| meta_name(doc, "twitter:image"));
    let favicon = extract_favicon(doc);

    Metadata {
        title,
        description,
        author,
        published_date,
        language,
        url: url.map(String::from),
        site_name,
        image,
        favicon,
        word_count: 0, // filled later by the extractor
    }
}
///
fn og_meta(doc: &Html, property: &str) -> Option {
// OG tags use property= not name=
doc.select(selector!("meta[property]"))
.find(|el| el.value().attr("property") == Some(property))
.and_then(|el| el.value().attr("content"))
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
}
///
fn meta_name(doc: &Html, name: &str) -> Option {
doc.select(selector!("meta[name]"))
.find(|el| {
el.value()
.attr("name")
.is_some_and(|n| n.eq_ignore_ascii_case(name))
})
.and_then(|el| el.value().attr("content"))
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
}
fn title_tag(doc: &Html) -> Option {
doc.select(selector!("title"))
.next()
.map(|el| el.text().collect::().trim().to_string())
.filter(|s| !s.is_empty())
}
fn extract_favicon(doc: &Html) -> Option {
// or
doc.select(selector!("link[rel]"))
.find(|el| el.value().attr("rel").is_some_and(|r| r.contains("icon")))
.and_then(|el| el.value().attr("href"))
.map(|s| s.to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` as a complete document.
    fn parse(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn extracts_basic_metadata() {
        // One source per asserted field, plus a <title> that must lose to og:title.
        let html = r#"
            <html lang="en">
            <head>
                <title>Test Page</title>
                <meta name="description" content="A test page">
                <meta name="author" content="Alice">
                <meta name="date" content="2025-01-15">
                <meta property="og:title" content="OG Title">
                <meta property="og:site_name" content="Example">
                <meta property="og:image" content="https://img.example.com/og.png">
                <link rel="icon" href="/favicon.ico">
            </head>
            <body></body>
            </html>
        "#;
        let doc = parse(html);
        let meta = extract(&doc, Some("https://example.com"));

        // OG title wins over <title>
        assert_eq!(meta.title.as_deref(), Some("OG Title"));
        assert_eq!(meta.description.as_deref(), Some("A test page"));
        assert_eq!(meta.author.as_deref(), Some("Alice"));
        assert_eq!(meta.published_date.as_deref(), Some("2025-01-15"));
        assert_eq!(meta.language.as_deref(), Some("en"));
        assert_eq!(meta.site_name.as_deref(), Some("Example"));
        assert_eq!(
            meta.image.as_deref(),
            Some("https://img.example.com/og.png")
        );
        assert_eq!(meta.favicon.as_deref(), Some("/favicon.ico"));
        assert_eq!(meta.url.as_deref(), Some("https://example.com"));
    }

    #[test]
    fn falls_back_to_title_tag() {
        // No og:title or twitter:title, so <title> is the only source.
        let html = r#"<html><head><title>Fallback Title</title></head><body></body></html>"#;
        let doc = parse(html);
        let meta = extract(&doc, None);
        assert_eq!(meta.title.as_deref(), Some("Fallback Title"));
    }

    #[test]
    fn handles_missing_metadata_gracefully() {
        // No <title>, no <meta>, and no lang attribute on <html>.
        let html = r#"<html><head></head><body></body></html>"#;
        let doc = parse(html);
        let meta = extract(&doc, None);
        assert!(meta.title.is_none());
        assert!(meta.description.is_none());
        assert!(meta.language.is_none());
    }
}