Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io
This commit is contained in:
Valerio 2026-03-23 18:31:11 +01:00
commit c99ec684fa
79 changed files with 24074 additions and 0 deletions

File diff suppressed because it is too large Load diff

View file

@@ -0,0 +1,551 @@
//! Extract content from JSON data islands embedded in `<script>` tags.
//!
//! Many modern SPAs (React, Next.js, Nuxt) ship server-rendered page data
//! as JSON inside script tags rather than in visible DOM elements. This module
//! walks those JSON blobs and recovers text content as a fallback when normal
//! DOM extraction yields sparse results.
use once_cell::sync::Lazy;
use scraper::{Html, Selector};
use tracing::debug;

/// Matches `<script type="application/json">` — the conventional carrier for
/// SSR data islands (e.g. `__NEXT_DATA__`, embedded Contentful payloads).
static SCRIPT_JSON_SELECTOR: Lazy<Selector> =
    Lazy::new(|| Selector::parse("script[type='application/json']").unwrap());
/// Below this word count, try data islands for supplemental content.
/// Set high enough to cover marketing homepages with partial SSR (e.g., Notion
/// SSR-renders ~300 words but has ~800 words in __NEXT_DATA__).
const SPARSE_THRESHOLD: usize = 500;

/// Cap total extracted chunks to bound memory and CPU on adversarial inputs.
const MAX_CHUNKS: usize = 1000;

/// A chunk of text extracted from a JSON data island, with optional heading.
#[derive(Debug)]
struct TextChunk {
    // Rendered as a markdown `##` header when present.
    heading: Option<String>,
    // Plain/markdown-ish body text; may be empty for heading-only chunks.
    body: String,
}
/// Try to extract content from JSON data islands when DOM extraction is sparse.
/// Deduplicates against existing markdown so we only add genuinely new content.
pub fn try_extract(doc: &Html, dom_word_count: usize, existing_markdown: &str) -> Option<String> {
if dom_word_count >= SPARSE_THRESHOLD {
return None;
}
let mut all_chunks: Vec<TextChunk> = Vec::new();
let existing_lower = existing_markdown.to_lowercase();
for script in doc.select(&SCRIPT_JSON_SELECTOR) {
if all_chunks.len() >= MAX_CHUNKS {
break;
}
let json_text = script.text().collect::<String>();
if json_text.len() < 50 {
continue;
}
let Ok(value) = serde_json::from_str::<serde_json::Value>(&json_text) else {
continue;
};
let mut chunks = Vec::new();
walk_json(&value, &mut chunks, 0);
if !chunks.is_empty() {
debug!(
script_id = script.value().attr("id").unwrap_or(""),
data_target = script.value().attr("data-target").unwrap_or(""),
chunks = chunks.len(),
"extracted text from data island"
);
all_chunks.extend(chunks);
}
}
if all_chunks.is_empty() {
return None;
}
// Enforce limit after collecting from all scripts
all_chunks.truncate(MAX_CHUNKS);
// Dedup: remove chunks whose text already appears in DOM markdown
let mut seen = std::collections::HashSet::new();
all_chunks.retain(|c| {
// Must have heading or body
let key = if !c.body.is_empty() {
c.body.clone()
} else if let Some(ref h) = c.heading {
h.clone()
} else {
return false;
};
if !seen.insert(key.clone()) {
return false;
}
// Skip if the text already exists in the DOM-extracted content
!existing_lower.contains(&key.to_lowercase())
});
if all_chunks.is_empty() {
return None;
}
let mut md = String::new();
for chunk in &all_chunks {
if let Some(ref h) = chunk.heading {
md.push_str(&format!("\n## {h}\n\n"));
}
md.push_str(&chunk.body);
md.push_str("\n\n");
}
let md = md.trim().to_string();
if md.is_empty() {
None
} else {
debug!(chars = md.len(), "data island content recovered");
Some(md)
}
}
/// Recursively walk a JSON value and extract text content.
fn walk_json(value: &serde_json::Value, chunks: &mut Vec<TextChunk>, depth: usize) {
if depth > 15 {
return;
}
match value {
serde_json::Value::Object(map) => {
// Contentful rich text node: { "nodeType": "...", "content": [...] }
if let Some(node_type) = map.get("nodeType").and_then(|v| v.as_str())
&& let Some(text) = extract_contentful_node(map, node_type)
{
chunks.push(text);
return;
}
// CMS-style entry with heading + subheading/description
if is_cms_entry(map)
&& let Some(chunk) = extract_cms_entry(map)
{
chunks.push(chunk);
return;
}
// Quote/testimonial pattern
if let Some(chunk) = extract_quote(map) {
chunks.push(chunk);
return;
}
// Extract orphaned content strings from known field names
// before recursing (they won't be caught by CMS/quote patterns)
extract_orphan_texts(map, chunks);
// Recurse into all values, skipping image/media/asset fields
for (key, v) in map {
if is_media_key(key) {
continue;
}
walk_json(v, chunks, depth + 1);
}
}
serde_json::Value::Array(arr) => {
// Check for stat-style string arrays (e.g., ["100M+ users", "#1 rated"])
let content_strings: Vec<&str> = arr
.iter()
.filter_map(|v| v.as_str())
.filter(|s| s.len() > 10 && s.contains(' '))
.collect();
if content_strings.len() >= 2 {
let body = content_strings.join(" | ");
chunks.push(TextChunk {
heading: None,
body,
});
return;
}
for v in arr {
walk_json(v, chunks, depth + 1);
}
}
_ => {}
}
}
/// Extract text from a Contentful rich text node.
///
/// Handles: document, paragraph, heading-1..6, blockquote. Other node types
/// return `None` so the caller keeps recursing.
fn extract_contentful_node(
    map: &serde_json::Map<String, serde_json::Value>,
    node_type: &str,
) -> Option<TextChunk> {
    match node_type {
        "document" => {
            // Top-level document — flatten children into one body, rendering
            // child headings as inline `##` markdown headers.
            let content = map.get("content")?.as_array()?;
            let mut parts = Vec::new();
            for child in content {
                // The original looked up `as_object()` twice and then
                // `unwrap()`ed the second lookup; `let else` does one lookup
                // and removes the panic path entirely.
                let Some(child_map) = child.as_object() else {
                    continue;
                };
                let Some(nt) = child_map.get("nodeType").and_then(|v| v.as_str()) else {
                    continue;
                };
                let Some(chunk) = extract_contentful_node(child_map, nt) else {
                    continue;
                };
                if let Some(h) = &chunk.heading {
                    parts.push(format!("## {h}"));
                }
                if !chunk.body.is_empty() {
                    parts.push(chunk.body);
                }
            }
            if parts.is_empty() {
                return None;
            }
            Some(TextChunk {
                heading: None,
                body: parts.join("\n\n"),
            })
        }
        "paragraph" | "text" => {
            // Only prose-like text survives; IDs/URLs are filtered out.
            let text = collect_text_content(map);
            is_content_text(&text).then(|| TextChunk {
                heading: None,
                body: text,
            })
        }
        nt if nt.starts_with("heading-") => {
            // Headings skip the prose filter: they are legitimately short.
            let text = collect_text_content(map);
            (!text.is_empty()).then(|| TextChunk {
                heading: Some(text),
                body: String::new(),
            })
        }
        "blockquote" => {
            let text = collect_text_content(map);
            is_content_text(&text).then(|| TextChunk {
                heading: None,
                body: format!("> {text}"),
            })
        }
        _ => None,
    }
}
/// Recursively collect plain text from a Contentful rich text node tree.
///
/// Concatenates this node's own `"value"` string with the text of every
/// object child under `"content"`, in document order, then trims the result.
fn collect_text_content(map: &serde_json::Map<String, serde_json::Value>) -> String {
    let own = map.get("value").and_then(|v| v.as_str()).unwrap_or("");
    let child_text: String = map
        .get("content")
        .and_then(|v| v.as_array())
        .into_iter()
        .flatten()
        .filter_map(|child| child.as_object())
        .map(collect_text_content)
        .collect();
    let combined = format!("{own}{child_text}");
    combined.trim().to_string()
}
/// Check if a JSON object looks like a CMS entry: it carries at least one
/// heading-like key AND at least one body-like key.
fn is_cms_entry(map: &serde_json::Map<String, serde_json::Value>) -> bool {
    const HEADING_KEYS: [&str; 3] = ["heading", "title", "headline"];
    const BODY_KEYS: [&str; 4] = ["description", "subheading", "body", "text"];
    HEADING_KEYS.iter().any(|k| map.contains_key(*k))
        && BODY_KEYS.iter().any(|k| map.contains_key(*k))
}
/// Extract heading + body from a CMS-style entry.
fn extract_cms_entry(map: &serde_json::Map<String, serde_json::Value>) -> Option<TextChunk> {
let heading = extract_text_field(map, "heading")
.or_else(|| extract_text_field(map, "title"))
.or_else(|| extract_text_field(map, "headline"))
.filter(|h| !is_cms_internal_title(h) && h.len() > 5)?;
let body = extract_text_field(map, "description")
.or_else(|| extract_text_field(map, "subheading"))
.or_else(|| extract_text_field(map, "body"))
.or_else(|| extract_text_field(map, "text"))
.unwrap_or_default();
if !is_content_text(&heading) && !is_content_text(&body) {
return None;
}
Some(TextChunk {
heading: Some(heading),
body,
})
}
/// Extract a quote/testimonial from a JSON object.
///
/// The quote comes from `quote`/`quoteText`; attribution (optional) from
/// `position`, `author`, or `name` — first present field wins.
fn extract_quote(map: &serde_json::Map<String, serde_json::Value>) -> Option<TextChunk> {
    let quote = ["quote", "quoteText"]
        .iter()
        .find_map(|key| extract_text_field(map, key))?;
    if !is_content_text(&quote) {
        return None;
    }
    let attribution = ["position", "author", "name"]
        .iter()
        .find_map(|key| extract_text_field(map, key));
    let body = match attribution {
        Some(who) => format!("> {quote}\n> — {who}"),
        None => format!("> {quote}"),
    };
    Some(TextChunk {
        heading: None,
        body,
    })
}
/// Extract standalone content strings from known field names that weren't
/// caught by the CMS entry or quote patterns: heading-like fields on objects
/// that lack a body, or body/description/subheading/eyebrow fields on
/// objects that lack a paired heading.
fn extract_orphan_texts(
    map: &serde_json::Map<String, serde_json::Value>,
    chunks: &mut Vec<TextChunk>,
) {
    const BODY_KEYS: &[&str] = &["body", "description", "subheading", "eyebrow", "children"];
    const HEADING_KEYS: &[&str] = &["heading", "title", "headline"];
    // Objects pairing a heading with a body are handled by extract_cms_entry.
    if is_cms_entry(map) {
        return;
    }
    // A standalone heading takes priority over a standalone body field.
    let orphan_heading = HEADING_KEYS
        .iter()
        .find_map(|key| extract_text_field(map, key).filter(|t| is_content_text(t)));
    if let Some(text) = orphan_heading {
        chunks.push(TextChunk {
            heading: Some(text),
            body: String::new(),
        });
        return;
    }
    let orphan_body = BODY_KEYS
        .iter()
        .find_map(|key| extract_text_field(map, key).filter(|t| is_content_text(t)));
    if let Some(text) = orphan_body {
        chunks.push(TextChunk {
            heading: None,
            body: text,
        });
    }
}
/// Extract a text value from a JSON field, handling both plain strings and
/// Contentful rich text objects. Returns `None` for missing fields, empty
/// strings, and any other value shape.
fn extract_text_field(
    map: &serde_json::Map<String, serde_json::Value>,
    key: &str,
) -> Option<String> {
    match map.get(key)? {
        // Plain string — trimmed; empty after trimming counts as absent.
        serde_json::Value::String(s) => {
            let trimmed = s.trim();
            (!trimmed.is_empty()).then(|| trimmed.to_string())
        }
        // Contentful rich text object:
        // { "content": [{ "content": [{ "value": "..." }] }] }
        serde_json::Value::Object(obj) => {
            let text = collect_text_content(obj);
            (!text.is_empty()).then_some(text)
        }
        _ => None,
    }
}
/// JSON keys that hold image/media/asset data — skip recursing into these
/// to avoid extracting CMS alt text as content. Comparison is
/// case-insensitive: "alt"/"src"/"url"/"href" must match exactly, the media
/// words may appear anywhere in the key (e.g. "heroImage").
fn is_media_key(key: &str) -> bool {
    const EXACT: [&str; 4] = ["alt", "src", "url", "href"];
    const FRAGMENTS: [&str; 6] = ["image", "poster", "video", "thumbnail", "icon", "logo"];
    let k = key.to_lowercase();
    EXACT.contains(&k.as_str()) || FRAGMENTS.iter().any(|fragment| k.contains(fragment))
}
/// CMS internal titles like "/home Customer Stories: Logo" or
/// "Copilot agent mode hero poster desktop" are editorial labels, not
/// user-facing text.
///
/// The label-keyword check is case-insensitive: real CMS titles capitalize
/// freely ("Customer Stories Logo Wall"), and the previous exact lowercase
/// comparison let those through.
fn is_cms_internal_title(s: &str) -> bool {
    // Contentful path-style titles
    if s.starts_with("/home ") || s.starts_with("/page ") {
        return true;
    }
    // Titles that look like asset/component labels: at least 3 words where
    // one word (case-insensitively) is an asset keyword.
    const LABEL_KEYWORDS: [&str; 6] = ["poster", "logo", "image", "icon", "asset", "thumbnail"];
    let words: Vec<String> = s.split_whitespace().map(str::to_lowercase).collect();
    words.len() >= 3 && words.iter().any(|w| LABEL_KEYWORDS.contains(&w.as_str()))
}
/// Heuristic: is this string actual content (not an ID, URL, class name, etc.)?
///
/// Requires: at least 15 bytes after trimming, not URL/path/JSON-shaped,
/// contains a space (prose, not a single token), and at least 60% of its
/// characters are alphanumeric.
fn is_content_text(s: &str) -> bool {
    let s = s.trim();
    if s.len() < 15 {
        return false;
    }
    // Skip URLs, IDs, technical strings
    if s.starts_with("http") || s.starts_with('/') || s.starts_with('{') || s.starts_with('[') {
        return false;
    }
    // Must contain spaces (prose), not just a single technical token
    if !s.contains(' ') {
        return false;
    }
    // Skip strings that are mostly hex/base64 (hashes, IDs). The ratio is
    // computed over *characters*, not bytes: dividing by `s.len()` (bytes)
    // wrongly rejected multi-byte prose (CJK, accented text), where each
    // alphanumeric char spans several bytes.
    let total_chars = s.chars().count();
    let alnum_chars = s.chars().filter(|c| c.is_alphanumeric()).count();
    alnum_chars as f64 / total_chars as f64 >= 0.6
}
#[cfg(test)]
mod tests {
    use super::*;

    // End-to-end: Contentful-style entries inside a GitHub-style embedded
    // data script are recovered as headings + bodies.
    #[test]
    fn extracts_contentful_rich_text() {
        let html = r#"<html><body>
<script type="application/json" data-target="react-app.embeddedData">
{"payload":{"contentfulRawJsonResponse":{"includes":{"Entry":[
{"fields":{
"heading":"Ship faster with secure CI/CD",
"subheading":{"content":[{"content":[{"value":"Automate builds, tests, and deployments."}]}]}
}},
{"fields":{
"heading":"Built-in application security",
"description":{"content":[{"content":[{"value":"Use AI to find and fix vulnerabilities so your team can ship more secure software faster."}]}]}
}}
]}}}}
</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();
        assert!(result.contains("Ship faster with secure CI/CD"));
        assert!(result.contains("Automate builds, tests, and deployments"));
        assert!(result.contains("Built-in application security"));
        assert!(result.contains("find and fix vulnerabilities"));
    }

    // dom_word_count at/above SPARSE_THRESHOLD short-circuits to None.
    #[test]
    fn skips_when_dom_has_enough_content() {
        let html = r#"<html><body>
<script type="application/json">{"heading":"Foo","description":"Some long description here."}</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        assert!(try_extract(&doc, 500, "").is_none());
    }

    // Unit checks on the prose heuristic itself.
    #[test]
    fn skips_non_content_strings() {
        assert!(!is_content_text("abc123"));
        assert!(!is_content_text("https://example.com/foo/bar"));
        assert!(!is_content_text("/home Customer Stories: Logo"));
        assert!(!is_content_text("a1b2c3d4e5f6a1b2c3d4e5f6"));
        assert!(is_content_text(
            "Automate builds, tests, and deployments with CI/CD."
        ));
    }

    // Quote + attribution render as a markdown blockquote with an em-dash line.
    #[test]
    fn extracts_quotes() {
        let html = r#"<html><body>
<script type="application/json">
{"fields":{"quote":{"content":[{"content":[{"value":"GitHub frees us from maintaining our own infrastructure."}]}]},"position":"CTO at Example Corp"}}
</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();
        assert!(result.contains("> GitHub frees us from maintaining our own infrastructure."));
        assert!(result.contains("CTO at Example Corp"));
    }

    // Chunks whose text already appears in the DOM markdown are dropped,
    // and an all-duplicate result collapses to None.
    #[test]
    fn skips_content_already_in_dom() {
        let html = r#"<html><body>
<script type="application/json">
{"fields":{"heading":"Already in DOM heading","description":"This text already appears in the DOM markdown output."}}
</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        let existing =
            "# Already in DOM heading\n\nThis text already appears in the DOM markdown output.";
        assert!(try_extract(&doc, 10, existing).is_none());
    }

    // Identical chunks from different JSON paths appear only once.
    #[test]
    fn deduplicates_chunks() {
        let html = r#"<html><body>
<script type="application/json">
{"a":{"heading":"Same heading here","description":"Same body content across multiple entries."},
"b":{"heading":"Same heading here","description":"Same body content across multiple entries."}}
</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();
        // Should appear only once
        assert_eq!(
            result
                .matches("Same body content across multiple entries")
                .count(),
            1
        );
    }
}

View file

@@ -0,0 +1,340 @@
//! Change tracking between two extraction snapshots.
//! Pure computation -- no I/O, WASM-safe.
use std::collections::HashSet;

use serde::Serialize;
use similar::TextDiff;

use crate::types::{ExtractionResult, Link};
/// High-level classification of how a snapshot compares to the previous one.
#[derive(Debug, Clone, Serialize, PartialEq)]
pub enum ChangeStatus {
    /// Nothing observable differs between the snapshots.
    Same,
    /// Something observable differs between the snapshots.
    Changed,
    /// NOTE(review): never constructed by `diff` in this module — presumably
    /// set by callers when no previous snapshot exists; confirm at call sites.
    New,
}
/// A single metadata field that differs between snapshots.
#[derive(Debug, Clone, Serialize)]
pub struct MetadataChange {
    /// Field name, e.g. "title" or "author".
    pub field: String,
    /// Previous value; `None` when the field was absent.
    pub old: Option<String>,
    /// Current value; `None` when the field is now absent.
    pub new: Option<String>,
}
/// Full diff between two extraction snapshots.
#[derive(Debug, Clone, Serialize)]
pub struct ContentDiff {
    /// Overall classification of the change.
    pub status: ChangeStatus,
    /// Unified text diff of the markdown; `None` when identical.
    pub text_diff: Option<String>,
    /// Metadata fields whose values changed.
    pub metadata_changes: Vec<MetadataChange>,
    /// Links present now but not before (compared by href).
    pub links_added: Vec<Link>,
    /// Links present before but not now (compared by href).
    pub links_removed: Vec<Link>,
    /// New word count minus old; may be negative.
    pub word_count_delta: i64,
}
/// Compare two extraction results and produce a diff.
///
/// `old` is the previous snapshot, `new_result` is the current extraction.
///
/// The reported status is `Changed` when the markdown text, any metadata
/// field, or the link set differs. Previously a link-only change was
/// reported as `Same` even while `links_added`/`links_removed` were
/// populated — an inconsistent result for consumers keying off `status`.
pub fn diff(old: &ExtractionResult, new_result: &ExtractionResult) -> ContentDiff {
    let text_diff = compute_text_diff(&old.content.markdown, &new_result.content.markdown);
    let metadata_changes = compute_metadata_changes(&old.metadata, &new_result.metadata);
    let (links_added, links_removed) =
        compute_link_changes(&old.content.links, &new_result.content.links);
    let word_count_delta = new_result.metadata.word_count as i64 - old.metadata.word_count as i64;
    let unchanged = text_diff.is_none()
        && metadata_changes.is_empty()
        && links_added.is_empty()
        && links_removed.is_empty();
    let status = if unchanged {
        ChangeStatus::Same
    } else {
        ChangeStatus::Changed
    };
    ContentDiff {
        status,
        text_diff,
        metadata_changes,
        links_added,
        links_removed,
        word_count_delta,
    }
}
/// Unified diff of two markdown bodies, with 3 lines of context and
/// "old"/"new" headers. `None` when the texts are identical (checked up
/// front to skip the diff entirely) or the rendered diff comes out empty.
fn compute_text_diff(old: &str, new: &str) -> Option<String> {
    if old == new {
        return None;
    }
    let unified = TextDiff::from_lines(old, new)
        .unified_diff()
        .context_radius(3)
        .header("old", "new")
        .to_string();
    if unified.is_empty() {
        None
    } else {
        Some(unified)
    }
}
/// Compare each metadata field, returning only those that changed.
fn compute_metadata_changes(
    old: &crate::types::Metadata,
    new: &crate::types::Metadata,
) -> Vec<MetadataChange> {
    // Field table: (name, old value, new value) for every tracked field.
    let fields = [
        ("title", &old.title, &new.title),
        ("description", &old.description, &new.description),
        ("author", &old.author, &new.author),
        ("published_date", &old.published_date, &new.published_date),
        ("language", &old.language, &new.language),
        ("url", &old.url, &new.url),
        ("site_name", &old.site_name, &new.site_name),
        ("image", &old.image, &new.image),
        ("favicon", &old.favicon, &new.favicon),
    ];
    fields
        .into_iter()
        .filter(|(_, old_val, new_val)| old_val != new_val)
        .map(|(name, old_val, new_val)| MetadataChange {
            field: name.to_string(),
            old: old_val.clone(),
            new: new_val.clone(),
        })
        .collect()
}
/// Links added/removed, compared by href (ignoring text differences).
/// Returns `(added, removed)` — clones of links whose href is absent from
/// the other snapshot.
fn compute_link_changes(old: &[Link], new: &[Link]) -> (Vec<Link>, Vec<Link>) {
    let old_hrefs: HashSet<&str> = old.iter().map(|l| l.href.as_str()).collect();
    let new_hrefs: HashSet<&str> = new.iter().map(|l| l.href.as_str()).collect();
    // Links in `from` whose href does not appear in `absent_from`.
    let missing_from = |from: &[Link], absent_from: &HashSet<&str>| -> Vec<Link> {
        from.iter()
            .filter(|l| !absent_from.contains(l.href.as_str()))
            .cloned()
            .collect()
    };
    let added = missing_from(new, &old_hrefs);
    let removed = missing_from(old, &new_hrefs);
    (added, removed)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::DomainType;
    use crate::types::{Content, DomainData, Metadata};

    /// Build a minimal ExtractionResult for test comparisons.
    fn make_result(markdown: &str, title: Option<&str>, links: Vec<Link>) -> ExtractionResult {
        let word_count = markdown.split_whitespace().count();
        ExtractionResult {
            metadata: Metadata {
                title: title.map(String::from),
                description: None,
                author: None,
                published_date: None,
                language: None,
                url: None,
                site_name: None,
                image: None,
                favicon: None,
                word_count,
            },
            content: Content {
                markdown: markdown.to_string(),
                plain_text: markdown.to_string(),
                links,
                images: vec![],
                code_blocks: vec![],
                raw_html: None,
            },
            domain_data: Some(DomainData {
                domain_type: DomainType::Generic,
            }),
            structured_data: vec![],
        }
    }

    // Shorthand Link constructor for test fixtures.
    fn link(href: &str, text: &str) -> Link {
        Link {
            href: href.to_string(),
            text: text.to_string(),
        }
    }

    // Identical snapshots produce Same with every change list empty.
    #[test]
    fn test_identical_content() {
        let a = make_result("# Hello\n\nSome content here.", Some("Hello"), vec![]);
        let b = make_result("# Hello\n\nSome content here.", Some("Hello"), vec![]);
        let result = diff(&a, &b);
        assert_eq!(result.status, ChangeStatus::Same);
        assert!(result.text_diff.is_none());
        assert!(result.metadata_changes.is_empty());
        assert!(result.links_added.is_empty());
        assert!(result.links_removed.is_empty());
        assert_eq!(result.word_count_delta, 0);
    }

    // A metadata-only difference flips status without producing a text diff.
    #[test]
    fn test_title_change() {
        let a = make_result("# Hello\n\nContent.", Some("Old Title"), vec![]);
        let b = make_result("# Hello\n\nContent.", Some("New Title"), vec![]);
        let result = diff(&a, &b);
        assert_eq!(result.status, ChangeStatus::Changed);
        assert!(result.text_diff.is_none(), "text is identical");
        assert_eq!(result.metadata_changes.len(), 1);
        assert_eq!(result.metadata_changes[0].field, "title");
        assert_eq!(result.metadata_changes[0].old.as_deref(), Some("Old Title"));
        assert_eq!(result.metadata_changes[0].new.as_deref(), Some("New Title"));
    }

    // A body change yields a unified diff with +/- markers.
    #[test]
    fn test_content_change() {
        let a = make_result("# Hello\n\nOld paragraph.", Some("Title"), vec![]);
        let b = make_result("# Hello\n\nNew paragraph.", Some("Title"), vec![]);
        let result = diff(&a, &b);
        assert_eq!(result.status, ChangeStatus::Changed);
        assert!(result.text_diff.is_some());
        let diff_text = result.text_diff.unwrap();
        assert!(diff_text.contains('-'), "should have removal markers");
        assert!(diff_text.contains('+'), "should have addition markers");
    }

    #[test]
    fn test_link_added() {
        let a = make_result("Content.", None, vec![]);
        let b = make_result(
            "Content.",
            None,
            vec![link("https://example.com", "Example")],
        );
        let result = diff(&a, &b);
        assert_eq!(result.links_added.len(), 1);
        assert_eq!(result.links_added[0].href, "https://example.com");
        assert!(result.links_removed.is_empty());
    }

    #[test]
    fn test_link_removed() {
        let a = make_result(
            "Content.",
            None,
            vec![link("https://example.com", "Example")],
        );
        let b = make_result("Content.", None, vec![]);
        let result = diff(&a, &b);
        assert!(result.links_added.is_empty());
        assert_eq!(result.links_removed.len(), 1);
        assert_eq!(result.links_removed[0].href, "https://example.com");
    }

    // Mixed case: one link appears, one disappears, one is stable.
    #[test]
    fn test_links_added_and_removed() {
        let a = make_result(
            "Content.",
            None,
            vec![
                link("https://old.com", "Old"),
                link("https://stable.com", "Stable"),
            ],
        );
        let b = make_result(
            "Content.",
            None,
            vec![
                link("https://stable.com", "Stable"),
                link("https://new.com", "New"),
            ],
        );
        let result = diff(&a, &b);
        assert_eq!(result.links_added.len(), 1);
        assert_eq!(result.links_added[0].href, "https://new.com");
        assert_eq!(result.links_removed.len(), 1);
        assert_eq!(result.links_removed[0].href, "https://old.com");
    }

    // Delta is signed: new minus old word count.
    #[test]
    fn test_word_count_delta() {
        let a = make_result("one two three", None, vec![]);
        let b = make_result("one two three four five", None, vec![]);
        let result = diff(&a, &b);
        assert_eq!(result.word_count_delta, 2);
        // Negative delta
        let result_rev = diff(&b, &a);
        assert_eq!(result_rev.word_count_delta, -2);
    }

    // The unified diff carries "old"/"new" headers and line-level markers.
    #[test]
    fn test_unified_diff_format() {
        let a = make_result("line one\nline two\nline three\n", None, vec![]);
        let b = make_result("line one\nline changed\nline three\n", None, vec![]);
        let result = diff(&a, &b);
        assert!(result.text_diff.is_some());
        let diff_text = result.text_diff.unwrap();
        assert!(diff_text.contains("--- old"), "should have old header");
        assert!(diff_text.contains("+++ new"), "should have new header");
        assert!(diff_text.contains("-line two"), "should show removed line");
        assert!(
            diff_text.contains("+line changed"),
            "should show added line"
        );
    }

    // Two empty snapshots compare as Same.
    #[test]
    fn test_empty_content() {
        let a = make_result("", None, vec![]);
        let b = make_result("", None, vec![]);
        let result = diff(&a, &b);
        assert_eq!(result.status, ChangeStatus::Same);
        assert!(result.text_diff.is_none());
        assert_eq!(result.word_count_delta, 0);
    }

    #[test]
    fn test_link_text_change_ignored() {
        // Same href, different text -- should not appear in added/removed
        let a = make_result(
            "Content.",
            None,
            vec![link("https://example.com", "Old Text")],
        );
        let b = make_result(
            "Content.",
            None,
            vec![link("https://example.com", "New Text")],
        );
        let result = diff(&a, &b);
        assert!(result.links_added.is_empty());
        assert!(result.links_removed.is_empty());
    }
}

View file

@@ -0,0 +1,187 @@
//! Domain detection via URL patterns and DOM heuristics.
//! Knowing the domain type lets downstream consumers apply
//! domain-specific prompts or post-processing.
use serde::{Deserialize, Serialize};
/// Coarse classification of a page's site/domain type.
/// Serialized in snake_case for JSON output.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum DomainType {
    /// News/blog article pages (detected via `<article>` / schema.org signals).
    Article,
    /// Developer/API documentation sites.
    Documentation,
    /// GitHub or GitLab.
    GitHub,
    /// Forums and Q&A (Reddit, Hacker News, Stack Overflow, Discourse, ...).
    Forum,
    /// Online shops and product pages.
    ECommerce,
    /// Social networks (Twitter/X, LinkedIn, Mastodon, Bluesky, ...).
    Social,
    /// Fallback when no stronger signal matched.
    Generic,
}
/// Detect domain type from URL patterns first, then fall back to DOM heuristics.
pub fn detect(url: Option<&str>, html: &str) -> DomainType {
if let Some(url) = url
&& let Some(dt) = detect_from_url(url)
{
return dt;
}
detect_from_dom(html)
}
/// Classify a URL by well-known host and path patterns.
/// Returns `None` when nothing matches so the caller can fall back to DOM heuristics.
fn detect_from_url(url: &str) -> Option<DomainType> {
    let lower = url.to_lowercase();
    // GitHub
    if lower.contains("github.com") || lower.contains("gitlab.com") {
        return Some(DomainType::GitHub);
    }
    // Documentation sites
    let doc_patterns = [
        "docs.",
        "readthedocs",
        "gitbook",
        "docusaurus",
        "/docs/",
        "/documentation/",
        "devdocs.io",
        "doc.rust-lang.org",
        "developer.mozilla.org",
        "developer.apple.com/documentation",
    ];
    if doc_patterns.iter().any(|p| lower.contains(p)) {
        return Some(DomainType::Documentation);
    }
    // Forums
    let forum_patterns = [
        "reddit.com",
        "news.ycombinator.com",
        "stackoverflow.com",
        "stackexchange.com",
        "discourse",
        "forum",
        "community.",
    ];
    if forum_patterns.iter().any(|p| lower.contains(p)) {
        return Some(DomainType::Forum);
    }
    // Social. "x.com" is too short for a substring test — it would match
    // unrelated hosts such as "linux.com" or "matrix.com" — so it gets an
    // exact host-boundary check instead.
    let social_patterns = [
        "twitter.com",
        "linkedin.com",
        "facebook.com",
        "instagram.com",
        "mastodon",
        "bsky.app",
    ];
    if host_matches(&lower, "x.com") || social_patterns.iter().any(|p| lower.contains(p)) {
        return Some(DomainType::Social);
    }
    // E-commerce
    let ecommerce_patterns = [
        "amazon.",
        "ebay.",
        "shopify.",
        "etsy.com",
        "/product/",
        "/shop/",
        "/cart",
    ];
    if ecommerce_patterns.iter().any(|p| lower.contains(p)) {
        return Some(DomainType::ECommerce);
    }
    None
}

/// True when `lower_url`'s host component equals `domain` or is a subdomain
/// of it. Strips the scheme, path/query/fragment, userinfo, and port; the
/// input must already be lowercase.
fn host_matches(lower_url: &str, domain: &str) -> bool {
    let after_scheme = lower_url
        .split_once("://")
        .map_or(lower_url, |(_, rest)| rest);
    let mut host = after_scheme
        .split(['/', '?', '#'])
        .next()
        .unwrap_or(after_scheme);
    if let Some((_, h)) = host.rsplit_once('@') {
        host = h;
    }
    if let Some((h, _)) = host.rsplit_once(':') {
        host = h;
    }
    host == domain || host.ends_with(&format!(".{domain}"))
}
/// Fallback: check HTML for structural hints when URL isn't enough.
/// Article signals (an `<article>` tag or a schema.org Article reference)
/// win over documentation signals; otherwise Generic.
fn detect_from_dom(html: &str) -> DomainType {
    let lower = html.to_lowercase();
    let looks_like_article = ["<article", "schema.org/article"]
        .iter()
        .any(|signal| lower.contains(signal));
    if looks_like_article {
        return DomainType::Article;
    }
    let looks_like_docs = ["docsearch", "sidebar-nav", "doc-content"]
        .iter()
        .any(|signal| lower.contains(signal));
    if looks_like_docs {
        DomainType::Documentation
    } else {
        DomainType::Generic
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Both GitHub and GitLab map to the GitHub variant.
    #[test]
    fn github_urls() {
        assert_eq!(
            detect(Some("https://github.com/tokio-rs/tokio"), ""),
            DomainType::GitHub
        );
        assert_eq!(
            detect(Some("https://gitlab.com/foo/bar"), ""),
            DomainType::GitHub
        );
    }

    #[test]
    fn documentation_urls() {
        assert_eq!(
            detect(Some("https://docs.rs/serde/latest"), ""),
            DomainType::Documentation
        );
        assert_eq!(
            detect(Some("https://readthedocs.org/projects/foo"), ""),
            DomainType::Documentation
        );
    }

    #[test]
    fn forum_urls() {
        assert_eq!(
            detect(Some("https://www.reddit.com/r/rust"), ""),
            DomainType::Forum
        );
        assert_eq!(
            detect(Some("https://stackoverflow.com/questions/123"), ""),
            DomainType::Forum
        );
    }

    #[test]
    fn social_urls() {
        assert_eq!(
            detect(Some("https://x.com/elonmusk"), ""),
            DomainType::Social
        );
        assert_eq!(
            detect(Some("https://linkedin.com/in/someone"), ""),
            DomainType::Social
        );
    }

    #[test]
    fn ecommerce_urls() {
        assert_eq!(
            detect(Some("https://amazon.com/dp/B001"), ""),
            DomainType::ECommerce
        );
    }

    // With no URL, an <article> tag triggers the Article fallback.
    #[test]
    fn dom_fallback_article() {
        let html = r#"<html><body><article><p>Hello world</p></article></body></html>"#;
        assert_eq!(detect(None, html), DomainType::Article);
    }

    // No URL and no structural hints → Generic.
    #[test]
    fn dom_fallback_generic() {
        let html = r#"<html><body><div>Just some div</div></body></html>"#;
        assert_eq!(detect(None, html), DomainType::Generic);
    }
}

View file

@@ -0,0 +1,15 @@
//! Extraction errors — kept minimal since this crate does no I/O.
//! Most failures come from malformed HTML or invalid URLs.
use thiserror::Error;
/// Errors produced by the extraction pipeline.
#[derive(Debug, Error)]
pub enum ExtractError {
    /// The input could not be parsed as an HTML document.
    #[error("failed to parse HTML")]
    ParseError,
    /// The supplied source URL failed to parse; carries the offending URL.
    #[error("invalid URL: {0}")]
    InvalidUrl(String),
    /// Nothing usable was found (also returned for empty input).
    #[error("no content found")]
    NoContent,
}

File diff suppressed because it is too large Load diff

View file

@@ -0,0 +1,513 @@
//! webclaw-core: Pure HTML content extraction engine for LLMs.
//!
//! Takes raw HTML + optional URL, returns structured content
//! (metadata, markdown, plain text, links, images, code blocks).
//! Zero network dependencies — WASM-compatible by design.
//
// Note: these crate docs were previously written as `///` after two `mod`
// declarations, which attached them to `pub mod diff` instead of the crate;
// inner `//!` docs at the top of the file attach to the crate root.
pub mod brand;
pub(crate) mod data_island;
pub mod diff;
pub mod domain;
pub mod error;
pub mod extractor;
pub mod llm;
pub mod markdown;
pub mod metadata;
#[allow(dead_code)]
pub(crate) mod noise;
pub mod structured_data;
pub mod types;
pub mod youtube;

pub use brand::BrandIdentity;
pub use diff::{ChangeStatus, ContentDiff, MetadataChange};
pub use domain::DomainType;
pub use error::ExtractError;
pub use llm::to_llm_text;
pub use types::{
    CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
};

use scraper::Html;
use url::Url;
/// Extract structured content from raw HTML.
///
/// `html` — raw HTML string to parse
/// `url` — optional source URL, used for resolving relative links and domain detection
///
/// Convenience wrapper over [`extract_with_options`] using default options.
///
/// # Errors
/// Returns [`ExtractError::NoContent`] for empty input and
/// [`ExtractError::InvalidUrl`] when `url` is present but unparseable.
pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, ExtractError> {
    extract_with_options(html, url, &ExtractionOptions::default())
}
/// Extract structured content from raw HTML with configurable options.
///
/// `html` — raw HTML string to parse
/// `url` — optional source URL, used for resolving relative links and domain detection
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
///
/// Pipeline: YouTube fast path → metadata → scored DOM extraction →
/// two retry strategies for sparse results → JSON data-island fallback →
/// domain detection + JSON-LD structured data.
///
/// # Errors
/// Returns [`ExtractError::NoContent`] when `html` is empty and
/// [`ExtractError::InvalidUrl`] when `url` fails to parse.
pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
    if html.is_empty() {
        return Err(ExtractError::NoContent);
    }
    // YouTube fast path: if the URL is a YouTube video page, try extracting
    // structured metadata from ytInitialPlayerResponse before DOM scoring.
    // This gives LLMs a clean, structured view of video metadata.
    if let Some(u) = url
        && youtube::is_youtube_url(u)
        && let Some(yt_md) = youtube::try_extract(html)
    {
        let doc = Html::parse_document(html);
        let mut meta = metadata::extract(&doc, url);
        meta.word_count = extractor::word_count(&yt_md);
        // Plain text = the markdown minus heading and bold-label lines.
        let plain_text = yt_md
            .lines()
            .filter(|l| !l.starts_with('#') && !l.starts_with("**"))
            .collect::<Vec<_>>()
            .join("\n")
            .trim()
            .to_string();
        let domain_data = Some(DomainData {
            domain_type: DomainType::Social,
        });
        let structured_data = structured_data::extract_json_ld(html);
        return Ok(ExtractionResult {
            metadata: meta,
            content: Content {
                markdown: yt_md,
                plain_text,
                links: Vec::new(),
                images: Vec::new(),
                code_blocks: Vec::new(),
                raw_html: None,
            },
            domain_data,
            structured_data,
        });
    }
    let doc = Html::parse_document(html);
    // An invalid URL is a hard error; an absent URL just disables link resolution.
    let base_url = url
        .map(|u| Url::parse(u).map_err(|_| ExtractError::InvalidUrl(u.to_string())))
        .transpose()?;
    // Metadata from <head>
    let mut meta = metadata::extract(&doc, url);
    // Main content extraction (Readability-style scoring + markdown conversion)
    let mut content = extractor::extract_content(&doc, base_url.as_ref(), options);
    // Use the higher of plain_text and markdown word counts.
    // Some pages (headings + links) have content in markdown but empty plain_text.
    let pt_wc = extractor::word_count(&content.plain_text);
    let md_wc = extractor::word_count(&content.markdown);
    meta.word_count = pt_wc.max(md_wc);
    // Retry fallback: if extraction captured too little of the page's visible content,
    // retry with wider strategies. The scorer sometimes picks a tiny node (e.g., an
    // <article> with 52 words when the body has 1300 words of real content).
    //
    // Strategy 1: retry without only_main_content restriction
    if options.only_main_content && meta.word_count < 30 {
        let relaxed = ExtractionOptions {
            only_main_content: false,
            ..options.clone()
        };
        let retry = extractor::extract_content(&doc, base_url.as_ref(), &relaxed);
        let retry_wc =
            extractor::word_count(&retry.plain_text).max(extractor::word_count(&retry.markdown));
        // Keep the retry only if it actually improved coverage.
        if retry_wc > meta.word_count {
            content = retry;
            meta.word_count = retry_wc;
        }
    }
    // Strategy 2: if scored extraction is sparse (<200 words) AND the page has
    // significantly more visible text, retry with include_selectors: ["body"].
    // This bypasses the readability scorer entirely — catches blogs, pricing
    // pages, and modern sites where no single element scores well.
    if meta.word_count < 200 && options.include_selectors.is_empty() {
        let body_opts = ExtractionOptions {
            include_selectors: vec!["body".to_string()],
            exclude_selectors: options.exclude_selectors.clone(),
            only_main_content: false,
            include_raw_html: false,
        };
        let body_content = extractor::extract_content(&doc, base_url.as_ref(), &body_opts);
        let body_wc = extractor::word_count(&body_content.plain_text)
            .max(extractor::word_count(&body_content.markdown));
        // Use body extraction if it captures significantly more content (>2x)
        if body_wc > meta.word_count * 2 && body_wc > 50 {
            content = body_content;
            meta.word_count = body_wc;
        }
    }
    // Fallback: if DOM extraction was sparse, try JSON data islands
    // (React SPAs, Next.js, Contentful CMS embed page data in <script> tags)
    if let Some(island_md) = data_island::try_extract(&doc, meta.word_count, &content.markdown) {
        content.markdown.push_str("\n\n");
        content.markdown.push_str(&island_md);
        meta.word_count = extractor::word_count(&content.markdown);
    }
    // Domain detection from URL patterns and DOM heuristics
    let domain_type = domain::detect(url, html);
    let domain_data = Some(DomainData { domain_type });
    // JSON-LD structured data (Schema.org Product, Article, etc.)
    let structured_data = structured_data::extract_json_ld(html);
    Ok(ExtractionResult {
        metadata: meta,
        content,
        domain_data,
        structured_data,
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    // --- End-to-end pipeline: metadata + content + links + domain detection ---
    #[test]
    fn full_extraction_pipeline() {
        let html = r#"
        <html lang="en">
        <head>
        <title>Rust is Great</title>
        <meta name="description" content="An article about Rust">
        <meta name="author" content="Bob">
        </head>
        <body>
        <nav><a href="/">Home</a> | <a href="/about">About</a></nav>
        <article>
        <h1>Why Rust is Great</h1>
        <p>Rust gives you <strong>memory safety</strong> without a garbage collector.
        This is achieved through its <em>ownership system</em>.</p>
        <p>Here is an example:</p>
        <pre><code class="language-rust">fn main() {
    println!("Hello, world!");
}</code></pre>
        <p>Learn more at <a href="https://rust-lang.org">rust-lang.org</a>.</p>
        </article>
        <footer>Copyright 2025</footer>
        </body>
        </html>"#;
        let result = extract(html, Some("https://blog.example.com/rust")).unwrap();
        // Metadata
        assert_eq!(result.metadata.title.as_deref(), Some("Rust is Great"));
        assert_eq!(
            result.metadata.description.as_deref(),
            Some("An article about Rust")
        );
        assert_eq!(result.metadata.author.as_deref(), Some("Bob"));
        assert_eq!(result.metadata.language.as_deref(), Some("en"));
        assert!(result.metadata.word_count > 0);
        // Content
        assert!(result.content.markdown.contains("# Why Rust is Great"));
        assert!(result.content.markdown.contains("**memory safety**"));
        assert!(result.content.markdown.contains("```rust"));
        assert!(
            result
                .content
                .links
                .iter()
                .any(|l| l.href == "https://rust-lang.org")
        );
        assert!(!result.content.code_blocks.is_empty());
        // raw_html not populated by default
        assert!(result.content.raw_html.is_none());
        // Domain — blog.example.com has <article> tag
        let dd = result.domain_data.unwrap();
        assert_eq!(dd.domain_type, DomainType::Article);
    }
    // --- Input validation and error paths ---
    #[test]
    fn invalid_url_returns_error() {
        let result = extract("<html></html>", Some("not a url"));
        assert!(matches!(result, Err(ExtractError::InvalidUrl(_))));
    }
    #[test]
    fn empty_html_returns_error() {
        let result = extract("", None);
        assert!(matches!(result, Err(ExtractError::NoContent)));
    }
    #[test]
    fn no_url_is_fine() {
        // The url argument is optional; extraction works without a base URL.
        let result = extract("<html><body><p>Hello</p></body></html>", None);
        assert!(result.is_ok());
    }
    #[test]
    fn serializes_to_json() {
        let result = extract("<html><body><p>Test</p></body></html>", None).unwrap();
        let json = serde_json::to_string_pretty(&result).unwrap();
        assert!(json.contains("metadata"));
        assert!(json.contains("content"));
        // raw_html should be absent (skip_serializing_if)
        assert!(!json.contains("raw_html"));
    }
    // --- YouTube special-case extraction (ytInitialPlayerResponse JSON) ---
    #[test]
    fn youtube_extraction_produces_structured_markdown() {
        let html = r#"
        <html><head><title>Rust in 100 Seconds - YouTube</title></head>
        <body>
        <script>
        var ytInitialPlayerResponse = {"videoDetails":{"title":"Rust in 100 Seconds","author":"Fireship","viewCount":"5432100","shortDescription":"Learn Rust in 100 seconds. A mass of web developers are mass adopting Rust.","lengthSeconds":"120"},"microformat":{"playerMicroformatRenderer":{"uploadDate":"2023-01-15"}}};
        </script>
        </body></html>
        "#;
        let result = extract(html, Some("https://www.youtube.com/watch?v=5C_HPTJg5ek")).unwrap();
        assert!(result.content.markdown.contains("# Rust in 100 Seconds"));
        assert!(result.content.markdown.contains("**Channel:** Fireship"));
        // lengthSeconds 120 formatted as mm:ss
        assert!(result.content.markdown.contains("2:00"));
        assert!(
            result
                .content
                .markdown
                .contains("Learn Rust in 100 seconds")
        );
        // Should be detected as Social domain
        let dd = result.domain_data.unwrap();
        assert_eq!(dd.domain_type, DomainType::Social);
    }
    #[test]
    fn youtube_url_without_player_response_falls_through() {
        // If ytInitialPlayerResponse is missing, fall through to normal extraction
        let html = r#"<html><body><article><h1>Some YouTube Page</h1><p>Content here for testing.</p></article></body></html>"#;
        let result = extract(html, Some("https://www.youtube.com/watch?v=abc123")).unwrap();
        // Should still extract something via normal pipeline
        assert!(result.content.markdown.contains("Some YouTube Page"));
    }
    // --- ExtractionOptions tests ---
    #[test]
    fn test_exclude_selectors() {
        let html = r#"<html><body>
        <nav>Navigation stuff</nav>
        <article><h1>Title</h1><p>Real content here.</p></article>
        <footer>Footer stuff</footer>
        </body></html>"#;
        let options = ExtractionOptions {
            exclude_selectors: vec!["nav".into(), "footer".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();
        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Navigation stuff"),
            "nav should be excluded"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should be excluded"
        );
    }
    #[test]
    fn test_include_selectors() {
        let html = r#"<html><body>
        <nav>Navigation stuff</nav>
        <article><h1>Title</h1><p>Real content here.</p></article>
        <div class="sidebar">Sidebar junk</div>
        <footer>Footer stuff</footer>
        </body></html>"#;
        let options = ExtractionOptions {
            include_selectors: vec!["article".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();
        assert!(result.content.markdown.contains("Title"));
        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Navigation stuff"),
            "nav should not be included"
        );
        assert!(
            !result.content.markdown.contains("Sidebar junk"),
            "sidebar should not be included"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should not be included"
        );
    }
    #[test]
    fn test_include_and_exclude() {
        // exclude_selectors apply inside the included subtree too.
        let html = r#"<html><body>
        <article>
        <h1>Title</h1>
        <p>Real content here.</p>
        <div class="sidebar">Sidebar inside article</div>
        </article>
        <footer>Footer stuff</footer>
        </body></html>"#;
        let options = ExtractionOptions {
            include_selectors: vec!["article".into()],
            exclude_selectors: vec![".sidebar".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();
        assert!(result.content.markdown.contains("Title"));
        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Sidebar inside article"),
            "sidebar inside article should be excluded"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should not be included"
        );
    }
    #[test]
    fn test_only_main_content() {
        let html = r#"<html><body>
        <nav>Navigation</nav>
        <div class="hero"><h1>Big Hero</h1></div>
        <article><h2>Article Title</h2><p>Article content that is long enough to be real.</p></article>
        <div class="sidebar">Sidebar</div>
        <footer>Footer</footer>
        </body></html>"#;
        let options = ExtractionOptions {
            only_main_content: true,
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();
        assert!(
            result.content.markdown.contains("Article Title"),
            "article content should be present"
        );
        assert!(
            result.content.markdown.contains("Article content"),
            "article body should be present"
        );
        // only_main_content picks the article/main element directly, so hero and sidebar
        // should not be in the output
        assert!(
            !result.content.markdown.contains("Sidebar"),
            "sidebar should not be in only_main_content output"
        );
    }
    #[test]
    fn test_include_raw_html() {
        let html = r#"<html><body>
        <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;
        let options = ExtractionOptions {
            include_raw_html: true,
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();
        assert!(
            result.content.raw_html.is_some(),
            "raw_html should be populated"
        );
        let raw = result.content.raw_html.unwrap();
        assert!(
            raw.contains("<article>"),
            "raw_html should contain article tag"
        );
        assert!(raw.contains("<h1>Title</h1>"), "raw_html should contain h1");
    }
    #[test]
    fn test_invalid_selectors() {
        let html = r#"<html><body>
        <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;
        // Invalid selectors should be gracefully skipped
        let options = ExtractionOptions {
            include_selectors: vec!["[invalid[[[".into(), "article".into()],
            exclude_selectors: vec![">>>bad".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();
        assert!(
            result.content.markdown.contains("Title"),
            "valid selectors should still work"
        );
        assert!(
            result.content.markdown.contains("Content here"),
            "extraction should proceed despite invalid selectors"
        );
    }
    // --- API equivalence: extract() vs extract_with_options(defaults) ---
    #[test]
    fn test_backward_compat() {
        let html = r#"<html><body>
        <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;
        let result_old = extract(html, None).unwrap();
        let result_new = extract_with_options(html, None, &ExtractionOptions::default()).unwrap();
        assert_eq!(result_old.content.markdown, result_new.content.markdown);
        assert_eq!(result_old.content.plain_text, result_new.content.plain_text);
        assert_eq!(
            result_old.content.links.len(),
            result_new.content.links.len()
        );
    }
    #[test]
    fn test_empty_options() {
        let html = r#"<html><body>
        <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;
        let result_extract = extract(html, None).unwrap();
        let result_options =
            extract_with_options(html, None, &ExtractionOptions::default()).unwrap();
        assert_eq!(
            result_extract.content.markdown, result_options.content.markdown,
            "default ExtractionOptions should produce identical results to extract()"
        );
    }
    #[test]
    fn test_raw_html_not_in_json_when_none() {
        let result = extract("<html><body><p>Test</p></body></html>", None).unwrap();
        let json = serde_json::to_string(&result).unwrap();
        assert!(
            !json.contains("raw_html"),
            "raw_html should be absent from JSON when None"
        );
    }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,237 @@
/// Image handling for LLM output: linked image conversion, logo detection,
/// standalone image stripping, and bare image reference removal.
use once_cell::sync::Lazy;
use regex::Regex;
use super::cleanup::is_asset_label;
// ---------------------------------------------------------------------------
// Linked image conversion: [![alt](img)](url) -> [alt](url)
// ---------------------------------------------------------------------------
/// Matches `[![alt](img-url)](link-url)` -- an image wrapped in a link.
/// Group 1 captures the image alt text, group 2 the outer link target;
/// the inner image URL is discarded.
static LINKED_IMAGE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\[!\[([^\]]*)\]\([^)]+\)\]\(([^)]+)\)").unwrap());
/// Matches empty markdown links `[](url)` left after image stripping.
/// `pub(crate)` so sibling passes can sweep up the same remnants.
pub(crate) static EMPTY_LINK_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\[\s*\]\([^)]+\)").unwrap());
/// Convert linked images to plain links, preserving the alt text and link target.
/// Adds a newline after each to prevent text mashing when multiple are adjacent.
pub(crate) fn convert_linked_images(input: &str) -> String {
    LINKED_IMAGE_RE
        .replace_all(input, |caps: &regex::Captures| {
            // Group 1 = alt text, group 2 = link href; default to "" if absent.
            let alt = caps.get(1).map_or("", |m| m.as_str());
            let href = caps.get(2).map_or("", |m| m.as_str());
            format!("[{alt}]({href})\n")
        })
        .into_owned()
}
// ---------------------------------------------------------------------------
// Logo image collapsing
// ---------------------------------------------------------------------------
/// Regex matching a line that is *only* a markdown image (with optional whitespace).
static IMAGE_LINE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^!\[([^\]]*)\]\([^)]+\)\s*$").unwrap());
/// Collapse consecutive image-only lines into a comma-separated summary
/// of their alt texts (for logo bars, partner grids, etc.).
///
/// Runs of >= 2 images with alts become one line of joined alt texts; a lone
/// image is kept only when its alt is long enough (> 30 bytes) to read as a
/// description rather than a logo label.
pub(crate) fn collapse_logo_images(input: &str) -> String {
    let lines: Vec<&str> = input.lines().collect();
    let mut out = String::with_capacity(input.len());
    let mut i = 0;
    while i < lines.len() {
        // Check if this starts a run of consecutive image-only lines
        if IMAGE_LINE_RE.is_match(lines[i].trim()) {
            let mut alts: Vec<String> = Vec::new();
            let start = i;
            while i < lines.len() {
                let trimmed = lines[i].trim();
                // Allow blank lines between images in the same run
                // (NOTE(review): this also consumes blank lines trailing the run)
                if trimmed.is_empty() {
                    i += 1;
                    continue;
                }
                if let Some(caps) = IMAGE_LINE_RE.captures(trimmed) {
                    let alt = caps.get(1).map_or("", |m| m.as_str()).trim().to_string();
                    if !alt.is_empty() {
                        alts.push(alt);
                    }
                    i += 1;
                } else {
                    break;
                }
            }
            // With no usable alts, count consumed lines instead so a run of
            // alt-less images still registers as "multiple images" below.
            let image_count = if alts.is_empty() {
                i - start
            } else {
                alts.len()
            };
            if image_count >= 2 && !alts.is_empty() {
                out.push_str(&alts.join(", "));
                out.push('\n');
            } else if image_count == 1 && !alts.is_empty() && alts[0].len() > 30 {
                // Single image with a descriptive (long) alt: keep the alt text.
                out.push_str(&alts[0]);
                out.push('\n');
            }
            // else: single image with short/empty alt -- drop entirely
        } else {
            // Non-image line: pass through unchanged.
            out.push_str(lines[i]);
            out.push('\n');
            i += 1;
        }
    }
    out
}
// ---------------------------------------------------------------------------
// Remaining inline image stripping
// ---------------------------------------------------------------------------
/// Matches `![alt](url)` anywhere in a line, including multiple on the same line.
static INLINE_IMAGE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]+\)").unwrap());
/// Strip inline images. For multi-image lines, separate short alts (logos)
/// from long alts (descriptive) so they don't get mixed together.
///
/// Short alts (<= 30 bytes) are treated as brand/logo labels and joined on
/// one line after the surrounding text; longer alts become standalone lines.
pub(crate) fn strip_remaining_images(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    for line in input.lines() {
        let image_matches: Vec<_> = INLINE_IMAGE_RE.find_iter(line).collect();
        if image_matches.len() >= 2 {
            // Separate short alts (brand names/logos) from long alts (descriptions)
            let mut short_alts: Vec<&str> = Vec::new();
            let mut long_alts: Vec<&str> = Vec::new();
            for caps in INLINE_IMAGE_RE.captures_iter(line) {
                let alt = caps.get(1).map_or("", |m| m.as_str()).trim();
                // Skip empty alts and quoted-empty alts like `""`
                if alt.is_empty() || alt == "\"\"" {
                    continue;
                }
                if alt.len() <= 30 {
                    short_alts.push(alt);
                } else {
                    long_alts.push(alt);
                }
            }
            // Filter out CMS asset labels from alt texts before output
            short_alts.retain(|alt| !is_asset_label(alt));
            long_alts.retain(|alt| !is_asset_label(alt));
            // Remove images, then strip empty link remnants [](url)
            let remaining = INLINE_IMAGE_RE.replace_all(line, "");
            let remaining = EMPTY_LINK_RE.replace_all(&remaining, "");
            let remaining = remaining.trim();
            if !short_alts.is_empty() {
                // Surrounding text first, then the joined logo labels.
                if !remaining.is_empty() {
                    out.push_str(remaining);
                    out.push('\n');
                }
                out.push_str(&short_alts.join(", "));
                out.push('\n');
            } else if !remaining.is_empty() {
                out.push_str(remaining);
                out.push('\n');
            }
            // Long alts on their own lines (descriptions, not logos)
            for alt in &long_alts {
                out.push_str(alt);
                out.push('\n');
            }
        } else {
            // 0 or 1 image -- keep long alt text, drop short/empty/CMS labels
            let replaced = INLINE_IMAGE_RE.replace_all(line, |caps: &regex::Captures| {
                let alt = caps.get(1).map_or("", |m| m.as_str()).trim();
                if alt.len() > 30 && !is_asset_label(alt) {
                    alt.to_string()
                } else {
                    String::new()
                }
            });
            out.push_str(&replaced);
            out.push('\n');
        }
    }
    out
}
// ---------------------------------------------------------------------------
// Bare image file reference stripping
// ---------------------------------------------------------------------------
/// File extensions treated as image assets (matched case-insensitively).
const IMAGE_EXTENSIONS: &[&str] = &[
    ".webp", ".svg", ".png", ".jpg", ".jpeg", ".gif", ".avif", ".ico", ".bmp",
];
/// Strip lines that are just bare image filenames or image URLs.
/// Keeps lines where an image filename appears within a larger sentence.
pub(crate) fn strip_bare_image_refs(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    for line in input.lines() {
        let trimmed = line.trim();
        // Drop the line entirely when it is nothing but an image reference.
        if !trimmed.is_empty() && is_bare_image_ref(trimmed) {
            continue;
        }
        out.push_str(line);
        out.push('\n');
    }
    out
}
/// A line is a bare image reference if it's a single token ending with an image extension.
/// Catches filenames ("hero.webp") and URLs ("https://cdn.example.com/logo.svg").
fn is_bare_image_ref(line: &str) -> bool {
    // Never strip structural markdown lines (headings, lists, code fences,
    // blockquotes), even if they happen to end with an image extension.
    if line.starts_with('#')
        || line.starts_with("- ")
        || line.starts_with("* ")
        || line.starts_with("```")
        || line.starts_with("> ")
    {
        return false;
    }
    // Multi-token lines are prose that merely mentions a filename. Check for
    // any whitespace (not just ' ') so tab- or NBSP-separated prose ending in
    // an image extension is not misclassified and dropped.
    if line.chars().any(char::is_whitespace) {
        return false;
    }
    let lower = line.to_lowercase();
    IMAGE_EXTENSIONS.iter().any(|ext| lower.ends_with(ext))
}
#[cfg(test)]
mod tests {
    use super::*;
    // A linked image should collapse to a plain link keeping alt + target.
    #[test]
    fn linked_image_conversion() {
        let md = "[![docs](https://img/d.png)](https://docs.example.com)";
        let converted = convert_linked_images(md);
        assert!(converted.contains("[docs](https://docs.example.com)"));
        assert!(!converted.contains("!["));
    }
    // Single-token image filenames/URLs are bare refs; prose and headings are not.
    #[test]
    fn bare_image_ref_detected() {
        for bare in ["hero.webp", "https://cdn.example.com/logo.svg"] {
            assert!(is_bare_image_ref(bare), "{bare} should be a bare ref");
        }
        for kept in ["The file output.png is saved to disk.", "# heading.png"] {
            assert!(!is_bare_image_ref(kept), "{kept} should be kept");
        }
    }
}

View file

@ -0,0 +1,184 @@
/// Link extraction, deduplication, noise filtering, and label formatting
/// for the LLM output's deduplicated links section.
use std::collections::HashSet;
use once_cell::sync::Lazy;
use regex::Regex;
// ---------------------------------------------------------------------------
// Link extraction
// ---------------------------------------------------------------------------
/// Matches `[text](url)`. Images are already stripped, so no `!` prefix concern.
static LINK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]*)\]\(([^)]+)\)").unwrap());
/// Extract all links from markdown, replacing inline `[text](url)` with just `text`.
/// Returns the cleaned text and a deduplicated list of (label, href) pairs.
/// Anchor (`#...`), `javascript:`, empty, and noise links are dropped from the
/// returned list, but their link text is still kept in the cleaned body.
pub(crate) fn extract_and_strip_links(input: &str) -> (String, Vec<(String, String)>) {
    let mut links: Vec<(String, String)> = Vec::new();
    let mut seen_hrefs: HashSet<String> = HashSet::new();
    let replaced = LINK_RE.replace_all(input, |caps: &regex::Captures| {
        let text = caps.get(1).map_or("", |m| m.as_str()).trim().to_string();
        let href = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
        let skip = href.starts_with('#')
            || href.starts_with("javascript:")
            || href.is_empty()
            || is_noise_link(&text, &href);
        // `insert` returns false for a duplicate href, deduplicating by URL.
        // The && chain short-circuits, so skipped links never touch the set.
        if !skip && !text.is_empty() && seen_hrefs.insert(href.clone()) {
            links.push((text.clone(), href));
        }
        // The replacement in the body is just the link text.
        text
    });
    (replaced.into_owned(), links)
}
/// Links that are noise for LLM consumption: internal actions, timestamps,
/// user profiles, generic short text.
fn is_noise_link(text: &str, href: &str) -> bool {
    // Generic UI action words (moderation, pagination, etc.).
    const ACTION_WORDS: &[&str] = &[
        "hide", "flag", "reply", "favorite", "unflag", "vouch", "next", "prev", "previous",
        "more",
    ];
    let lowered = text.to_lowercase();
    if ACTION_WORDS.contains(&lowered.as_str()) {
        return true;
    }
    // Relative timestamps ("1 hour ago", "5 minutes ago", "yesterday").
    if lowered.ends_with(" ago") || lowered == "yesterday" || lowered == "just now" {
        return true;
    }
    // A lone non-alphanumeric character carries no meaning; single letters
    // and digits ("X", "3") are kept.
    if text.len() == 1 && !text.chars().next().unwrap_or(' ').is_alphanumeric() {
        return true;
    }
    // Internal user profile / action URL fragments (HN-style).
    const NOISE_HREF_PARTS: &[&str] = &["/user?id=", "/hide?id=", "/from?site=", "/flag?id="];
    NOISE_HREF_PARTS.iter().any(|part| href.contains(part))
}
// ---------------------------------------------------------------------------
// Link label cleaning
// ---------------------------------------------------------------------------
// Strips heading markers, bold/italic markers, and backticks from label text.
static MD_MARKERS_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap());
/// Clean a link label: strip markdown, dedup repeated phrases, truncate.
pub(crate) fn clean_link_label(raw: &str) -> String {
    // Strip markdown markers
    let label = MD_MARKERS_RE.replace_all(raw, "").to_string();
    // Collapse all whitespace runs to single spaces.
    let label = label.split_whitespace().collect::<Vec<_>>().join(" ");
    // Dedup repeated phrases in label
    let label = dedup_label_phrase(&label);
    // Truncate to ~80 chars (UTF-8 safe)
    if label.len() > 80 {
        // Find last whitespace boundary at or before 80 bytes
        let mut end = None;
        for (i, _) in label.char_indices() {
            if i > 80 {
                break;
            }
            // i is a char boundary; byte i-1 being ASCII whitespace means the
            // previous char was a space, so index i starts a new word.
            if i > 0 && label.as_bytes()[i - 1].is_ascii_whitespace() {
                end = Some(i);
            }
        }
        let end = end.unwrap_or_else(|| {
            // No whitespace found -- find char boundary near 80
            label
                .char_indices()
                .map(|(i, _)| i)
                .find(|&i| i >= 80)
                .unwrap_or(label.len())
        });
        // `end` is always a char boundary, so slicing cannot panic.
        format!("{}...", label[..end].trim_end())
    } else {
        label
    }
}
/// If a label contains the same phrase twice (e.g., "X Y Z X Y Z"), return just one copy.
fn dedup_label_phrase(label: &str) -> String {
    // Too short to hold a meaningful duplicated phrase.
    if label.len() < 8 {
        return label.to_string();
    }
    // Try every space as the split point between the two copies; the first
    // split whose halves (>= 4 bytes) match case-insensitively wins.
    label
        .match_indices(' ')
        .find_map(|(i, _)| {
            let head = label[..i].trim();
            let tail = label[i + 1..].trim();
            (head.len() >= 4 && head.eq_ignore_ascii_case(tail)).then(|| head.to_string())
        })
        .unwrap_or_else(|| label.to_string())
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    // Long labels are truncated on a word boundary and suffixed with "...".
    #[test]
    fn link_label_truncated() {
        let input = "The quick brown fox jumps over the lazy dog and then runs across the field to find more interesting things to do on a sunny afternoon";
        let label = clean_link_label(input);
        assert!(label.len() <= 84, "got len {}: {label}", label.len());
        assert!(label.ends_with("..."), "got: {label}");
    }
    #[test]
    fn link_label_markdown_stripped() {
        assert_eq!(clean_link_label("## Hello **world**"), "Hello world");
    }
    #[test]
    fn link_label_duplicate_deduped() {
        let deduped = clean_link_label("Express Delivery Express Delivery");
        assert_eq!(deduped, "Express Delivery");
    }
    #[test]
    fn link_label_short_unchanged() {
        assert_eq!(clean_link_label("Click here"), "Click here");
    }
    // Action words, relative timestamps, and profile URLs are all noise.
    #[test]
    fn noise_link_detected() {
        let noisy = [
            ("hide", "https://example.com"),
            ("5 minutes ago", "https://example.com"),
            ("user", "https://hn.com/user?id=foo"),
        ];
        for (text, href) in noisy {
            assert!(is_noise_link(text, href), "{text} should be noise");
        }
        assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
    }
}

View file

@ -0,0 +1,47 @@
/// Metadata header building for LLM-optimized output.
///
/// Produces `> ` prefixed lines with URL, title, author, etc.
/// Omits empty/zero fields to minimize token waste.
use crate::types::ExtractionResult;
/// Append the `> ` prefixed metadata header lines to `out`.
///
/// Fields are emitted in a fixed order (URL, Title, Description, Author,
/// Published, Language, Word count); absent or empty fields are skipped so
/// the header never wastes tokens on blank values.
pub(crate) fn build_metadata_header(
    out: &mut String,
    result: &ExtractionResult,
    url: Option<&str>,
) {
    let meta = &result.metadata;
    // URL: the explicit argument wins; fall back to the URL stored in metadata.
    if let Some(u) = url.or(meta.url.as_deref()) {
        out.push_str(&format!("> URL: {u}\n"));
    }
    // Optional string fields, in display order.
    let fields: [(&str, Option<&str>); 5] = [
        ("Title", meta.title.as_deref()),
        ("Description", meta.description.as_deref()),
        ("Author", meta.author.as_deref()),
        ("Published", meta.published_date.as_deref()),
        ("Language", meta.language.as_deref()),
    ];
    for (label, value) in fields {
        if let Some(v) = value
            && !v.is_empty()
        {
            out.push_str(&format!("> {label}: {v}\n"));
        }
    }
    // A word count of zero means "unknown" -- omit rather than print 0.
    if meta.word_count > 0 {
        out.push_str(&format!("> Word count: {}\n", meta.word_count));
    }
}

View file

@ -0,0 +1,696 @@
/// LLM-optimized output format.
///
/// Takes an `ExtractionResult` and produces a compact text representation
/// that maximizes information density per token. Strips decorative images,
/// visual-only formatting (bold/italic), and inline link URLs -- moving links
/// to a deduplicated section at the end.
mod body;
mod cleanup;
mod images;
mod links;
mod metadata;
use crate::types::ExtractionResult;
/// Produce a token-optimized text representation of extracted content.
///
/// The output has three sections:
/// 1. Compact metadata header (`> ` prefixed lines)
/// 2. Cleaned body (no images, no bold/italic, links as plain text)
/// 3. Deduplicated links section at the end
pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
    let mut out = String::new();
    // -- 1. Metadata header --
    metadata::build_metadata_header(&mut out, result, url);
    // -- 2. Cleaned body --
    let processed = body::process_body(&result.content.markdown);
    if !processed.text.is_empty() {
        // Separate header and body with a blank line, but only if a header exists.
        if !out.is_empty() {
            out.push('\n');
        }
        out.push_str(&processed.text);
    }
    // -- 3. Deduplicated links section --
    if !processed.links.is_empty() {
        out.push_str("\n\n## Links\n");
        for (text, href) in &processed.links {
            let label = links::clean_link_label(text);
            // Links whose label cleans down to nothing are dropped entirely.
            if label.is_empty() {
                continue;
            }
            out.push_str(&format!("- {label}: {href}\n"));
        }
    }
    out.trim().to_string()
}
// ---------------------------------------------------------------------------
// Integration tests that exercise the full pipeline through to_llm_text
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
use crate::types::*;
fn make_result(markdown: &str) -> ExtractionResult {
ExtractionResult {
metadata: Metadata {
title: Some("Test Page".into()),
description: Some("A test page".into()),
author: None,
published_date: None,
language: Some("en".into()),
url: Some("https://example.com".into()),
site_name: None,
image: None,
favicon: None,
word_count: 42,
},
content: Content {
markdown: markdown.into(),
plain_text: String::new(),
links: vec![],
images: vec![],
code_blocks: vec![],
raw_html: None,
},
domain_data: None,
structured_data: vec![],
}
}
#[test]
fn metadata_header_includes_populated_fields() {
let result = make_result("# Hello");
let out = to_llm_text(&result, Some("https://example.com/page"));
assert!(out.contains("> URL: https://example.com/page"));
assert!(out.contains("> Title: Test Page"));
assert!(out.contains("> Description: A test page"));
assert!(out.contains("> Language: en"));
assert!(out.contains("> Word count: 42"));
assert!(!out.contains("> Author:"));
}
#[test]
fn strips_image_markdown() {
let md = "Some text\n\n![logo](https://cdn.example.com/img/logo.png)\n\nMore text";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(!out.contains("!["));
assert!(!out.contains("cdn.example.com"));
assert!(out.contains("Some text"));
assert!(out.contains("More text"));
}
#[test]
fn collapses_consecutive_logo_images_on_separate_lines() {
let md = "# Partners\n\n\
![WRITER](https://cdn.example.com/writer.png)\n\
![MongoDB](https://cdn.example.com/mongo.png)\n\
![GROQ](https://cdn.example.com/groq.png)\n\
![LangChain](https://cdn.example.com/langchain.png)\n\n\
Some other content";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(out.contains("WRITER, MongoDB, GROQ, LangChain"));
assert!(!out.contains("!["));
assert!(!out.contains("cdn.example.com"));
}
#[test]
fn collapses_consecutive_logo_images_on_same_line() {
let md = "![WRITER](https://cdn.example.com/w.png)![MongoDB](https://cdn.example.com/m.png)![GROQ](https://cdn.example.com/g.png)";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(out.contains("WRITER"));
assert!(out.contains("MongoDB"));
assert!(out.contains("GROQ"));
assert!(!out.contains("!["));
assert!(!out.contains("cdn.example.com"));
}
#[test]
fn keeps_meaningful_alt_text() {
let md = "![A detailed photograph showing the team collaborating on the project](https://img.example.com/photo.jpg)";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(
out.contains("A detailed photograph showing the team collaborating on the project")
);
assert!(!out.contains("!["));
}
#[test]
fn strips_bold_and_italic() {
let md = "This is **bold text** and *italic text* and __also bold__ and _also italic_.";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(out.contains("This is bold text and italic text and also bold and also italic."));
assert!(!out.contains("**"));
assert!(!out.contains("__"));
}
#[test]
fn moves_links_to_end() {
let md = "Check out [Rust](https://rust-lang.org) and [Go](https://go.dev) for details.";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(out.contains("Check out Rust and Go for details."));
assert!(out.contains("## Links"));
assert!(out.contains("- Rust: https://rust-lang.org"));
assert!(out.contains("- Go: https://go.dev"));
}
#[test]
fn skips_anchor_and_javascript_links() {
let md = "Go to [top](#top) and [click](javascript:void(0)) and [real](https://real.example.com).";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(out.contains("## Links"));
assert!(out.contains("- real: https://real.example.com"));
let links_section = out.split("## Links").nth(1).unwrap_or("");
assert!(!links_section.contains("#top"));
assert!(!links_section.contains("javascript:"));
}
#[test]
fn deduplicates_heading_and_paragraph() {
let md = "### Ground models\n\nGround models with fresh web context\n\nRetrieve live data.";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(out.contains("### Ground models with fresh web context"));
assert!(out.contains("Retrieve live data."));
}
#[test]
fn deduplicates_identical_heading_paragraph() {
let md = "## Features\n\nFeatures\n\nHere are the features.";
let result = make_result(md);
let out = to_llm_text(&result, None);
let feature_count = out.matches("Features").count();
assert_eq!(
feature_count, 1,
"Expected 'Features' exactly once, got: {out}"
);
}
#[test]
fn collapses_excessive_whitespace() {
let md = "Line one\n\n\n\n\nLine two\n\n\n\nLine three";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(
!out.contains("\n\n\n"),
"Found 3+ consecutive newlines in: {:?}",
out
);
}
#[test]
fn preserves_code_blocks() {
let md = "Example:\n\n```rust\nfn main() {\n println!(\"hello\");\n}\n```\n\nDone.";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(out.contains("```rust"));
assert!(out.contains("fn main()"));
assert!(out.contains("```"));
}
#[test]
fn preserves_list_structure() {
let md = "Features:\n\n- Fast\n- Safe\n- Concurrent";
let result = make_result(md);
let out = to_llm_text(&result, None);
assert!(out.contains("- Fast"));
assert!(out.contains("- Safe"));
assert!(out.contains("- Concurrent"));
}
// Two markdown links to the same URL collapse to a single footer entry.
#[test]
fn deduplicates_links() {
    let md = "Visit [Example](https://example.org/page) or [Example again](https://example.org/page).";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    let link_count = out.matches("https://example.org/page").count();
    assert_eq!(link_count, 1, "Expected link once, got: {out}");
}
// End-to-end sanity check on a marketing-style page: metadata header lines,
// image stripping (URLs gone, meaningful alt text kept), emphasis removal,
// heading preservation, and the "## Links" footer.
#[test]
fn realistic_page() {
    let html = r#"
<html lang="en">
<head>
<title>Tavily - AI Search API</title>
<meta name="description" content="Real-time search for AI agents">
</head>
<body>
<article>
<h1>Connect your AI agents to the web</h1>
<p>Real-time search, extraction, and web crawling through a <strong>single API</strong>.</p>
<p>Trusted by <em>1M+ developers</em>.</p>
<img src="https://cdn.example.com/writer.png" alt="WRITER">
<img src="https://cdn.example.com/mongo.png" alt="MongoDB">
<img src="https://cdn.example.com/groq.png" alt="GROQ">
<img src="https://cdn.example.com/langchain.png" alt="LangChain">
<h2>Ground models with fresh web context</h2>
<p>Retrieve live web data and return it structured for models.</p>
<p>Learn more at <a href="https://docs.tavily.com">the docs</a>.</p>
<p><a href="https://app.tavily.com">Try it out</a></p>
</article>
</body>
</html>"#;
    let result = crate::extract(html, Some("https://www.tavily.com/")).unwrap();
    let out = to_llm_text(&result, Some("https://www.tavily.com/"));
    // Metadata header in blockquote form.
    assert!(out.contains("> URL: https://www.tavily.com/"));
    assert!(out.contains("> Title:"));
    // Image markdown and CDN URLs stripped, but alt texts retained as text.
    assert!(!out.contains("!["), "Image markdown not stripped: {out}");
    assert!(
        !out.contains("cdn.example.com"),
        "CDN URL not stripped: {out}"
    );
    assert!(
        out.contains("WRITER") && out.contains("MongoDB"),
        "Logo alt texts missing: {out}"
    );
    assert!(!out.contains("**"), "Bold not stripped: {out}");
    assert!(out.contains("# Connect your AI agents to the web"));
    assert!(out.contains("## Ground models with fresh web context"));
    assert!(out.contains("Retrieve live web data"));
    // Hyperlinks move into a dedicated footer section.
    assert!(out.contains("## Links"));
    assert!(out.contains("https://docs.tavily.com"));
    assert!(out.contains("https://app.tavily.com"));
}
// A result with no metadata at all must not emit any "> " header lines —
// the output is just the bare content.
#[test]
fn empty_metadata_fields_excluded() {
    let result = ExtractionResult {
        metadata: Metadata {
            title: None,
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: None,
            site_name: None,
            image: None,
            favicon: None,
            word_count: 0,
        },
        content: Content {
            markdown: "Just content".into(),
            plain_text: String::new(),
            links: vec![],
            images: vec![],
            code_blocks: vec![],
            raw_html: None,
        },
        domain_data: None,
        structured_data: vec![],
    };
    let out = to_llm_text(&result, None);
    assert!(!out.contains("> "));
    assert!(out.contains("Just content"));
}
// Images with empty alt text (spacers, trackers) vanish entirely while the
// surrounding prose survives.
#[test]
fn strips_empty_alt_images() {
    let md = "Before\n\n![](https://cdn.example.com/spacer.gif)\n\nAfter";
    let out = to_llm_text(&make_result(md), None);
    for gone in ["cdn.example.com", "!["] {
        assert!(!out.contains(gone));
    }
    for kept in ["Before", "After"] {
        assert!(out.contains(kept));
    }
}
// Heading markers at every level pass through unchanged.
#[test]
fn preserves_headings_structure() {
    let md = "# H1\n\n## H2\n\n### H3\n\nContent under H3.";
    let out = to_llm_text(&make_result(md), None);
    for heading in ["# H1", "## H2", "### H3"] {
        assert!(out.contains(heading));
    }
}
// An image embedded mid-sentence is removed without mangling the prose
// around it.
#[test]
fn inline_image_in_paragraph_stripped() {
    let md = "Check this ![icon](https://x.com/icon.png) out and read more.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(!out.contains("!["));
    assert!(!out.contains("x.com/icon.png"));
    assert!(out.contains("Check this"));
    assert!(out.contains("out and read more."));
}
// Emphasis stripping must not touch code blocks, where ** and _..._ are
// syntax (kwargs, identifiers), not formatting.
#[test]
fn does_not_strip_emphasis_inside_code_blocks() {
    let md = "Normal **bold** text\n\n```python\ndef foo(**kwargs):\n return _internal_var_\n```\n\nMore text";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(out.contains("Normal bold text"));
    assert!(out.contains("**kwargs"));
    assert!(out.contains("_internal_var_"));
}
// A linked image [![alt](img)](url) becomes a plain link: alt text kept,
// image markdown dropped, URL routed to the links footer.
#[test]
fn converts_linked_images_to_links() {
    let md = "[![Read the docs](https://img.example.com/docs.png)](https://docs.example.com)";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(!out.contains("!["), "Image not converted: {out}");
    assert!(
        out.contains("https://docs.example.com"),
        "Link URL missing from footer: {out}"
    );
    assert!(out.contains("Read the docs"), "Link text missing: {out}");
}
// Adjacent linked images must not concatenate their alt texts into one blob.
#[test]
fn linked_images_split_on_separate_lines() {
    let md = "[![Article A](https://img/a.png)](https://a.example.com)[![Article B](https://img/b.png)](https://b.example.com)";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(out.contains("Article A"), "Article A missing: {out}");
    assert!(out.contains("Article B"), "Article B missing: {out}");
    assert!(
        !out.contains("Article AArticle B"),
        "Text mashed together: {out}"
    );
}
// Short alts (logo walls) collapse into a comma list; a long descriptive alt
// on the same line stays separate from that list.
#[test]
fn separates_short_and_long_alts_on_same_line() {
    let md = "![AWS](https://cdn/aws.png)![IBM](https://cdn/ibm.png)![Ground models with fresh web context](https://cdn/icon.png)";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(out.contains("AWS, IBM"), "Logo collapse failed: {out}");
    assert!(
        !out.contains("IBM, Ground"),
        "Long alt mixed with logos: {out}"
    );
}
// When an image alt duplicates a nearby heading, only the heading survives.
#[test]
fn dedup_text_line_matching_heading() {
    let md = "![Handle thousands of web queries in seconds](https://cdn/icon.png)\n\n### Handle thousands of web queries in seconds\n\nA production-grade stack.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    let count = out
        .matches("Handle thousands of web queries in seconds")
        .count();
    assert_eq!(count, 1, "Expected once, got {count}: {out}");
    assert!(out.contains("### Handle thousands"));
    assert!(out.contains("A production-grade stack."));
}
// Converting linked images must not leave a stray ". " before the alt text.
#[test]
fn no_leading_dot_from_linked_images() {
    let md = "[![News A](https://img/a.png)](https://a.com)[![News B](https://img/b.png)](https://b.com)";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(
        !out.contains(". News"),
        "Leading dot from empty remaining: {out}"
    );
}
// Standalone stat fragments ("100M+") merge with the description line that
// follows them, for every stat block on the page.
#[test]
fn merges_stat_lines_with_descriptions() {
    let md = "100M+\n\nmonthly requests handled\n\n99.99% uptime\n\nSLA powering mission-critical systems\n\n180 ms\n\np50 on Tavily /search making us fastest on the market\n\n1M+\n\ndevelopers using Tavily\n\nBillions\n\nof pages crawled and extracted without downtime\n\nDrop-in integration\n\nwith leading LLM providers (OpenAI, Anthropic, Groq)";
    let out = to_llm_text(&make_result(md), None);
    let merged_lines = [
        "100M+ monthly requests handled",
        "99.99% uptime SLA powering mission-critical systems",
        "180 ms p50 on Tavily /search making us fastest on the market",
        "1M+ developers using Tavily",
        "Billions of pages crawled and extracted without downtime",
        "Drop-in integration with leading LLM providers (OpenAI, Anthropic, Groq)",
    ];
    for expected in merged_lines {
        assert!(out.contains(expected), "Stat not merged: {out}");
    }
}
// Stat merging must not swallow headings or list items adjacent to the stat.
#[test]
fn merge_stat_preserves_headings_and_lists() {
    let md = "## Features\n\n100M+\n\nmonthly requests\n\n- Fast\n- Safe";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(out.contains("## Features"), "Heading lost: {out}");
    assert!(
        out.contains("100M+ monthly requests"),
        "Stat not merged: {out}"
    );
    assert!(out.contains("- Fast"), "List item lost: {out}");
    assert!(out.contains("- Safe"), "List item lost: {out}");
}
// Only short stat-like fragments merge; ordinary full-length sentences stay
// on their own lines.
#[test]
fn merge_stat_does_not_merge_long_lines() {
    let md = "This is a longer line of text!\n\nAnd this follows after a blank";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(
        !out.contains("text! And"),
        "Long line incorrectly merged: {out}"
    );
}
// Lines that are mostly CSS utility classes (leaked from bad extraction) are
// dropped; real prose and headings are kept.
#[test]
fn strips_css_class_text_lines() {
    let md = "# Typography\n\n\
text-4xl font-bold tracking-tight text-gray-900\n\n\
Build beautiful websites with Tailwind CSS.\n\n\
text-5xl text-6xl text-8xl text-gray-950 text-white tracking-tighter text-balance";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(
        !out.contains("text-4xl font-bold"),
        "CSS class line was not stripped: {out}"
    );
    assert!(
        !out.contains("text-5xl text-6xl"),
        "CSS class line was not stripped: {out}"
    );
    assert!(
        out.contains("Build beautiful websites"),
        "Normal prose was stripped: {out}"
    );
    assert!(out.contains("Typography"), "Heading was stripped: {out}");
}
// Prose containing CSS-like hyphenated words ("text-based", "grid-like")
// must not be mistaken for a CSS class dump.
#[test]
fn keeps_prose_with_css_like_word() {
    let md = "The text-based approach works well for this use case.\n\n\
We use a grid-like layout for the dashboard.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(
        out.contains("text-based approach"),
        "Normal prose incorrectly stripped: {out}"
    );
    assert!(
        out.contains("grid-like layout"),
        "Normal prose incorrectly stripped: {out}"
    );
}
// CSS class stripping must not reach inside fenced code blocks, where class
// attributes are legitimate example content.
#[test]
fn preserves_css_classes_inside_code_blocks() {
    let md = "Example usage:\n\n\
```html\n\
<div class=\"text-4xl font-bold tracking-tight text-gray-900\">\n\
```\n\n\
That applies bold typography.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(
        out.contains("text-4xl font-bold tracking-tight"),
        "CSS classes inside code block were stripped: {out}"
    );
}
// An exact paragraph repeated three times survives exactly once; unrelated
// paragraphs are untouched.
#[test]
fn dedup_removes_exact_duplicate_paragraphs() {
    let md = "Supabase is an amazing platform that makes building apps incredibly fast.\n\nSupabase is an amazing platform that makes building apps incredibly fast.\n\nSupabase is an amazing platform that makes building apps incredibly fast.\n\nEach project gets its own dedicated Postgres database.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    let count = out.matches("Supabase is an amazing platform").count();
    assert_eq!(
        count, 1,
        "Duplicate paragraph should appear only once, got {count}: {out}"
    );
    assert!(
        out.contains("Each project gets its own dedicated Postgres database"),
        "Unique paragraph missing: {out}"
    );
}
// Dedup must never drop paragraphs that are genuinely distinct.
#[test]
fn dedup_preserves_unique_paragraphs() {
    let md = "First unique paragraph with enough content to be checked.\n\nSecond unique paragraph that is completely different.\n\nThird unique paragraph covering another topic entirely.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(out.contains("First unique paragraph"), "Lost first: {out}");
    assert!(
        out.contains("Second unique paragraph"),
        "Lost second: {out}"
    );
    assert!(out.contains("Third unique paragraph"), "Lost third: {out}");
}
// Short repeated snippets ("Learn more" buttons) are below the dedup length
// threshold and must be kept every time they appear.
#[test]
fn dedup_keeps_short_repeated_text() {
    let md = "Learn more\n\nA detailed explanation of the first feature.\n\nLearn more\n\nA detailed explanation of the second feature.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    let count = out.matches("Learn more").count();
    assert!(
        count >= 2,
        "Short repeated text should be kept, got {count}: {out}"
    );
}
// Paragraphs sharing a long common prefix count as near-duplicates even when
// their tails differ.
#[test]
fn dedup_catches_near_duplicates_via_prefix() {
    let md = "The platform provides real-time sync collaboration tools for modern developers building web applications with React and Next.js.\n\nThe platform provides real-time sync collaboration tools for modern developers building mobile apps with Flutter.\n\nA completely different paragraph about database design.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    let count = out.matches("The platform provides real-time sync").count();
    assert_eq!(
        count, 1,
        "Near-duplicate should be removed, got {count}: {out}"
    );
    assert!(
        out.contains("A completely different paragraph"),
        "Unique paragraph missing: {out}"
    );
}
// Testimonial carousels render the same quotes multiple times; each quote
// must appear once, with the headings around the carousel preserved.
#[test]
fn dedup_carousel_realistic() {
    let md = "## What our users say\n\n\"Supabase has transformed how we build products. The developer experience is unmatched.\" - Sarah Chen, CTO at TechCorp\n\n\"Moving from Firebase to Supabase was the best decision we made this year.\" - James Liu, Lead Engineer\n\n\"The real-time features and Postgres foundation give us confidence at scale.\" - Maria Garcia, VP Engineering\n\n\"Supabase has transformed how we build products. The developer experience is unmatched.\" - Sarah Chen, CTO at TechCorp\n\n\"Moving from Firebase to Supabase was the best decision we made this year.\" - James Liu, Lead Engineer\n\n\"The real-time features and Postgres foundation give us confidence at scale.\" - Maria Garcia, VP Engineering\n\n\"Supabase has transformed how we build products. The developer experience is unmatched.\" - Sarah Chen, CTO at TechCorp\n\n\"Moving from Firebase to Supabase was the best decision we made this year.\" - James Liu, Lead Engineer\n\n\"The real-time features and Postgres foundation give us confidence at scale.\" - Maria Garcia, VP Engineering\n\n## Get started\n\nSign up for free today.";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    let sarah_count = out.matches("Sarah Chen").count();
    let james_count = out.matches("James Liu").count();
    let maria_count = out.matches("Maria Garcia").count();
    assert_eq!(sarah_count, 1, "Sarah duplicated {sarah_count}x: {out}");
    assert_eq!(james_count, 1, "James duplicated {james_count}x: {out}");
    assert_eq!(maria_count, 1, "Maria duplicated {maria_count}x: {out}");
    assert!(out.contains("## What our users say"), "Heading lost: {out}");
    assert!(out.contains("## Get started"), "Heading lost: {out}");
}
// Bare image filenames/URLs and generic-alt images are stripped, while
// sentences that merely mention a filename, and images with meaningful alt
// text, are kept (alt text only — the URL itself still goes).
#[test]
fn strips_bare_image_references() {
    let md = "Some content\n\nhero.webp\n\nhttps://example.com/logo.svg\n\n![](image.png)\n\n![icon](logo.svg)\n\nThe file output.png is saved to disk.\n\n![Detailed architecture diagram showing the data flow](arch.png)\n\nMore content";
    let result = make_result(md);
    let out = to_llm_text(&result, None);
    assert!(
        !out.contains("hero.webp"),
        "Bare filename not stripped: {out}"
    );
    assert!(
        !out.contains("https://example.com/logo.svg"),
        "Bare image URL not stripped: {out}"
    );
    assert!(
        !out.contains("image.png"),
        "Empty-alt image not stripped: {out}"
    );
    assert!(
        !out.contains("logo.svg"),
        "Generic-alt image not stripped: {out}"
    );
    assert!(
        out.contains("output.png is saved to disk"),
        "Sentence with .png filename was incorrectly stripped: {out}"
    );
    assert!(
        out.contains("Detailed architecture diagram showing the data flow"),
        "Meaningful alt text was stripped: {out}"
    );
    assert!(
        !out.contains("arch.png"),
        "Image URL not stripped from meaningful alt: {out}"
    );
    assert!(out.contains("Some content"), "Content before lost: {out}");
    assert!(out.contains("More content"), "Content after lost: {out}");
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,156 @@
/// Metadata extraction from HTML <head>.
/// Prioritizes Open Graph and Twitter Card tags, falls back to standard meta tags.
use scraper::{Html, Selector};
use crate::types::Metadata;
/// Cache a parsed CSS selector behind a lazily-initialized static.
/// Selectors are cheap to compile but these run on every extracted page,
/// so each call site pays the parse cost exactly once.
macro_rules! selector {
    ($s:expr) => {{
        use once_cell::sync::Lazy;
        // Panics on first use if `$s` is invalid CSS — acceptable because all
        // call sites pass compile-time string literals.
        static SEL: Lazy<Selector> = Lazy::new(|| Selector::parse($s).unwrap());
        &*SEL
    }};
}
/// Extract page metadata from a parsed document.
///
/// Field priority is Open Graph, then Twitter Card, then the standard HTML
/// fallback (`<title>`, `<meta name=...>`). Missing fields stay `None`.
pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
    // <html lang="..."> is searched document-wide on purpose: root_element()
    // IS the <html> node in scraper, so selecting "html" from it would find
    // nothing (there is no nested <html>).
    let language = doc
        .select(selector!("html"))
        .next()
        .and_then(|el| el.value().attr("lang"))
        .map(ToOwned::to_owned);

    Metadata {
        title: og_meta(doc, "og:title")
            .or_else(|| meta_name(doc, "twitter:title"))
            .or_else(|| title_tag(doc)),
        description: og_meta(doc, "og:description")
            .or_else(|| meta_name(doc, "twitter:description"))
            .or_else(|| meta_name(doc, "description")),
        author: meta_name(doc, "author").or_else(|| og_meta(doc, "article:author")),
        published_date: og_meta(doc, "article:published_time")
            .or_else(|| meta_name(doc, "date"))
            .or_else(|| meta_name(doc, "publication_date")),
        language,
        url: url.map(String::from),
        site_name: og_meta(doc, "og:site_name"),
        image: og_meta(doc, "og:image").or_else(|| meta_name(doc, "twitter:image")),
        favicon: extract_favicon(doc),
        word_count: 0, // filled in later by the extractor
    }
}
/// <meta property="og:..." content="...">
fn og_meta(doc: &Html, property: &str) -> Option<String> {
// OG tags use property= not name=
doc.select(selector!("meta[property]"))
.find(|el| el.value().attr("property") == Some(property))
.and_then(|el| el.value().attr("content"))
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
}
/// <meta name="..." content="...">
fn meta_name(doc: &Html, name: &str) -> Option<String> {
doc.select(selector!("meta[name]"))
.find(|el| {
el.value()
.attr("name")
.is_some_and(|n| n.eq_ignore_ascii_case(name))
})
.and_then(|el| el.value().attr("content"))
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
}
/// Text of the first `<title>` element, trimmed; `None` if absent or blank.
fn title_tag(doc: &Html) -> Option<String> {
    let title = doc.select(selector!("title")).next()?;
    let text: String = title.text().collect();
    let trimmed = text.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_owned())
}
/// `href` of the first `<link>` whose `rel` mentions "icon"
/// ("icon", "shortcut icon", "apple-touch-icon", ...).
fn extract_favicon(doc: &Html) -> Option<String> {
    for link in doc.select(selector!("link[rel]")) {
        let rel_is_icon = link.value().attr("rel").is_some_and(|r| r.contains("icon"));
        if rel_is_icon {
            // First matching <link> decides, even if it carries no href.
            return link.value().attr("href").map(str::to_owned);
        }
    }
    None
}
#[cfg(test)]
mod tests {
    use super::*;
    // Shorthand: parse an HTML fixture into a document.
    fn parse(html: &str) -> Html {
        Html::parse_document(html)
    }
    // Full fixture exercising every metadata field, including the OG-beats-
    // <title> priority rule.
    #[test]
    fn extracts_basic_metadata() {
        let html = r#"
<html lang="en">
<head>
<title>Test Page</title>
<meta name="description" content="A test page">
<meta name="author" content="Alice">
<meta property="og:title" content="OG Title">
<meta property="og:image" content="https://img.example.com/og.png">
<meta property="og:site_name" content="Example">
<meta property="article:published_time" content="2025-01-15">
<link rel="icon" href="/favicon.ico">
</head>
<body></body>
</html>"#;
        let doc = parse(html);
        let meta = extract(&doc, Some("https://example.com"));
        // OG title wins over <title>
        assert_eq!(meta.title.as_deref(), Some("OG Title"));
        assert_eq!(meta.description.as_deref(), Some("A test page"));
        assert_eq!(meta.author.as_deref(), Some("Alice"));
        assert_eq!(meta.published_date.as_deref(), Some("2025-01-15"));
        assert_eq!(meta.language.as_deref(), Some("en"));
        assert_eq!(meta.site_name.as_deref(), Some("Example"));
        assert_eq!(
            meta.image.as_deref(),
            Some("https://img.example.com/og.png")
        );
        assert_eq!(meta.favicon.as_deref(), Some("/favicon.ico"));
        assert_eq!(meta.url.as_deref(), Some("https://example.com"));
    }
    // With no OG/Twitter tags, the plain <title> element is used.
    #[test]
    fn falls_back_to_title_tag() {
        let html = r#"<html><head><title>Fallback Title</title></head><body></body></html>"#;
        let doc = parse(html);
        let meta = extract(&doc, None);
        assert_eq!(meta.title.as_deref(), Some("Fallback Title"));
    }
    // An empty <head> yields all-None fields rather than panicking.
    #[test]
    fn handles_missing_metadata_gracefully() {
        let html = r#"<html><head></head><body></body></html>"#;
        let doc = parse(html);
        let meta = extract(&doc, None);
        assert!(meta.title.is_none());
        assert!(meta.description.is_none());
        assert!(meta.language.is_none());
    }
}

View file

@ -0,0 +1,756 @@
/// Shared noise detection for web content extraction.
///
/// Identifies elements that don't contribute to main content:
/// navigation, sidebars, footers, ads, cookie banners, modals, etc.
/// Used by both the extractor (candidate filtering) and the markdown
/// converter (output-time stripping).
use scraper::ElementRef;
/// Tags whose subtrees never contribute readable content.
const NOISE_TAGS: &[&str] = &[
    "script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header",
    "form", "video", "audio", "canvas",
    // NOTE: <picture> removed — it's a responsive image container, not noise.
    // <picture> wraps <source> and <img> for responsive images.
];
/// ARIA landmark roles that mark chrome rather than content.
const NOISE_ROLES: &[&str] = &["navigation", "banner", "complementary", "contentinfo"];
/// Substring/word-boundary patterns matched against class tokens by
/// `is_noise_token` (short patterns get word-boundary matching; "banner"
/// and "overlay" are prefix-only there).
const NOISE_CLASS_PATTERNS: &[&str] = &[
    "sidebar", "side", "nav", "navbar", "navigation", "menu", "footer", "header", "top",
    "bottom", "advertisement", "advert", "social", "social-media", "social-links",
    "share", "comment", "cookie", "popup", "modal", "overlay", "banner", "breadcrumb",
    "breadcrumbs", "widget", "lang-selector", "language", "newsletter", "subscribe",
    "related-posts", "recommended", "pagination", "pager", "signup", "login-form",
    "search-form", "notification", "alert", "toast", "skip-link", "sr-only",
    "visually-hidden",
];
/// Patterns matched against element IDs.
const NOISE_ID_PATTERNS: &[&str] = &[
    "sidebar", "nav", "menu", "footer", "header", "cookie", "popup", "modal",
    "breadcrumbs", "widget", "language-selector", "ad", "social", "share", "newsletter",
    "subscribe", "comments", "related", "recommended",
];
/// Exact class tokens that indicate noise.
/// Unlike substring matching, these only match when the EXACT class token
/// is present — ".modal" matches `class="modal"` but NOT `class="free-modal-container"`.
const NOISE_CLASSES: &[&str] = &[
    "header", "top", "navbar", "footer", "bottom", "sidebar", "modal", "popup",
    "overlay", "ad", "ads", "advert", "lang-selector", "language", "social",
    "social-media", "social-links", "menu", "navigation", "breadcrumbs", "breadcrumb",
    "share", "widget", "cookie", "newsletter", "subscribe", "skip-link", "sr-only",
    "visually-hidden", "notification", "alert", "toast", "pagination", "pager",
    "signup", "login-form", "search-form", "related-posts", "recommended",
];
/// Exact IDs that indicate noise (subject to the structural-ID exemption).
const NOISE_IDS: &[&str] = &[
    "header", "footer", "nav", "sidebar", "menu", "modal", "popup", "cookie",
    "breadcrumbs", "widget", "ad", "social", "share", "newsletter", "subscribe",
    "comments", "related", "recommended",
];
/// ID prefixes for cookie consent platforms that should be stripped entirely.
/// These generate massive DOM overlays that dominate content extraction.
const COOKIE_CONSENT_ID_PREFIXES: &[&str] = &[
    "onetrust",        // OneTrust (Foot Locker, many EU sites)
    "optanon",         // OneTrust legacy
    "ot-sdk",          // OneTrust SDK
    "cookiebot",       // Cookiebot
    "CybotCookiebot",  // Cookiebot
    "cc-",             // Cookie Consent (Osano)
    "cookie-law",      // Cookie Law Info
    "gdpr",            // Generic GDPR banners
    "consent-",        // Generic consent banners
    "cmp-",            // Consent Management Platforms
    "sp_message",      // SourcePoint
    "qc-cmp",          // Quantcast CMP
    "trustarc",        // TrustArc
    "evidon",          // Evidon/Crownpeak
];
/// Check if an element is noise by tag, role, class, or id.
///
/// Uses EXACT class token matching instead of substring matching, which
/// prevents false positives like:
/// - "free-modal-container" ≠ noise (Vice.com's content wrapper)
/// - "a-bw_aui_cxc_alert_measurement" ≠ noise (Amazon's body class)
/// - "desktop" ≠ noise (not matching "top")
pub fn is_noise(el: ElementRef<'_>) -> bool {
    let element = el.value();
    let tag = element.name();

    // <body> and <html> are never noise, whatever attributes they carry.
    if matches!(tag, "body" | "html") {
        return false;
    }

    // Tag-based noise (script, style, nav, etc.)
    if NOISE_TAGS.contains(&tag) {
        return true;
    }

    // ARIA role-based noise
    if element
        .attr("role")
        .is_some_and(|role| NOISE_ROLES.contains(&role))
    {
        return true;
    }

    if let Some(class) = element.attr("class") {
        // Exact class token matching — split the class attribute into tokens
        // and check each against the noise list. "free-modal-container" is one
        // token and does NOT match "modal".
        for token in class.split_whitespace() {
            let lower = token.to_lowercase();
            if NOISE_CLASSES.contains(&lower.as_str()) {
                return true;
            }
            // Structural elements use compound names (FooterLinks, Header-nav,
            // etc.) — always noise regardless of compound form.
            if lower.starts_with("footer")
                || lower.starts_with("header-")
                || lower.starts_with("nav-")
            {
                return true;
            }
        }
        // Ad-specific patterns (standalone "ad" class and variants)
        if is_ad_class(class) {
            return true;
        }
        // Cookie consent platform markers anywhere in the class attribute.
        let class_lower = class.to_lowercase();
        if COOKIE_CONSENT_ID_PREFIXES
            .iter()
            .any(|prefix| class_lower.contains(prefix))
        {
            return true;
        }
    }

    if let Some(id) = element.attr("id") {
        let id_lower = id.to_lowercase();
        // Exact ID matching, with an exemption for structural wrappers
        // ("modal-portal", "nav-root", ...).
        if NOISE_IDS.contains(&id_lower.as_str()) && !is_structural_id(&id_lower) {
            return true;
        }
        // Cookie consent platform IDs (prefix match — these generate huge overlays)
        if COOKIE_CONSENT_ID_PREFIXES
            .iter()
            .any(|prefix| id_lower.starts_with(prefix))
        {
            return true;
        }
    }

    false
}
/// Check if an element sits anywhere inside a noise container by walking
/// its ancestor chain toward the document root.
pub fn is_noise_descendant(el: ElementRef<'_>) -> bool {
    let mut current = el.parent();
    while let Some(node) = current {
        // Non-element ancestors (text, document) are skipped.
        if let Some(ancestor) = ElementRef::wrap(node) {
            if is_noise(ancestor) {
                return true;
            }
        }
        current = node.parent();
    }
    false
}
/// Whether a full `class` attribute contains any noise indicator.
/// Ad classes are checked first; otherwise each whitespace-delimited token is
/// tested against the noise patterns (with Tailwind-utility safeguards, e.g.
/// "pt-header-h" is padding, not a header class).
fn has_noise_class(class: &str) -> bool {
    if is_ad_class(class) {
        return true;
    }
    class.split_whitespace().any(is_noise_token)
}
/// Check if a single class token is a noise indicator.
///
/// Requires the noise pattern to be the *semantic core* of the token,
/// not embedded inside a Tailwind utility prefix or CSS variable.
fn is_noise_token(token: &str) -> bool {
    let t = token.to_lowercase();
    // Skip Tailwind arbitrary values and CSS variable references entirely.
    if t.contains("[--") || t.contains("var(") {
        return false;
    }
    // Strip common Tailwind responsive/state prefixes (e.g., "lg:", "hover:", "md:").
    let core = t.rsplit_once(':').map_or(t.as_str(), |(_, c)| c);
    // The noise pattern should match the semantic name, not be buried inside
    // a utility like "pt-header-h" (padding) or "mt-nav-offset" (margin).
    // Tailwind utilities start with known prefixes; if the token starts with
    // one, it's a utility class, not a semantic class.
    const UTILITY_PREFIXES: &[&str] = &[
        "p-", "pt-", "pb-", "pl-", "pr-", "px-", "py-", "m-", "mt-", "mb-", "ml-",
        "mr-", "mx-", "my-", "w-", "h-", "min-", "max-", "top-", "left-", "right-",
        "bottom-", "z-", "gap-", "text-", "bg-", "border-", "rounded-", "flex-",
        "grid-", "col-", "row-", "opacity-", "transition-", "duration-", "delay-",
        "ease-", "translate-", "scale-", "rotate-", "origin-", "overflow-", "inset-",
        "space-", "divide-", "ring-", "shadow-", "outline-", "font-", "leading-",
        "tracking-", "decoration-",
    ];
    if UTILITY_PREFIXES.iter().any(|pfx| core.starts_with(pfx)) {
        return false;
    }
    // "banner" and "overlay" only match as prefix — they false-positive as
    // suffixes in BEM/Webflow component names (e.g., "package_banner" is a
    // product card, not an ad banner; "planet-overlay" is a visual effect).
    const PREFIX_ONLY: &[&str] = &["banner", "overlay"];
    // Short patterns (≤6 chars like "nav", "top", "header", "widget") require
    // word-boundary matching to avoid false positives on compound CSS class
    // names (e.g., "desktop" ≠ "top", "celwidget" ≠ "widget",
    // "_categoriesheader_active" ≠ semantic "header").
    // A word boundary is `-`, `_`, or start/end of string.
    // Longer patterns (7+ chars like "sidebar", "breadcrumb") are specific
    // enough that substring matching is safe.
    NOISE_CLASS_PATTERNS.iter().any(|p| {
        if PREFIX_ONLY.contains(p) {
            // Equivalent to `core == p || core.starts_with("{p}-") ||
            // core.starts_with("{p}_")` — but strip_prefix avoids the two
            // `format!` heap allocations the old code paid per pattern × token
            // in this hot path.
            core == *p
                || core
                    .strip_prefix(*p)
                    .is_some_and(|rest| rest.starts_with(['-', '_']))
        } else if p.len() <= 6 {
            is_word_boundary_match(core, p)
        } else {
            core.contains(p)
        }
    })
}
/// Check if `pattern` appears in `text` at a word boundary.
/// Word boundaries are `-`, `_`, or start/end of string.
/// e.g., "nav" matches "main-nav", "nav-bar", "nav" but NOT "canvas", "navbar".
fn is_word_boundary_match(text: &str, pattern: &str) -> bool {
    let bytes = text.as_bytes();
    let mut from = 0;
    // Scan every occurrence of `pattern`, accepting the first one that is
    // delimited on both sides.
    while let Some(rel) = text[from..].find(pattern) {
        let hit = from + rel;
        let tail = hit + pattern.len();
        let starts_clean = hit == 0 || matches!(bytes[hit - 1], b'-' | b'_');
        let ends_clean = tail == bytes.len() || matches!(bytes[tail], b'-' | b'_');
        if starts_clean && ends_clean {
            return true;
        }
        from = hit + 1;
    }
    false
}
/// IDs like "modal-portal", "nav-root", "header-container" are structural
/// wrappers (React portals, app roots), not actual noise elements.
fn is_structural_id(id: &str) -> bool {
    ["portal", "root", "container", "wrapper", "mount", "app"]
        .into_iter()
        .any(|marker| id.contains(marker))
}
// ---------------------------------------------------------------------------
// CSS class text detection (visible content that looks like class names)
// ---------------------------------------------------------------------------
/// CSS utility prefixes that indicate a word is a class name, not prose.
/// Covers Tailwind, Bootstrap-ish, and common utility-first patterns.
const CSS_CLASS_PREFIXES: &[&str] = &[
    "text-", "bg-", "px-", "py-", "pt-", "pb-", "pl-", "pr-", "p-", "mx-", "my-",
    "mt-", "mb-", "ml-", "mr-", "m-", "w-", "h-", "min-", "max-", "flex-", "grid-",
    "col-", "row-", "gap-", "space-", "rounded-", "shadow-", "border-", "ring-",
    "outline-", "font-", "tracking-", "leading-", "decoration-", "opacity-",
    "transition-", "duration-", "delay-", "ease-", "translate-", "scale-", "rotate-",
    "origin-", "overflow-", "inset-", "divide-", "z-", "top-", "left-", "right-",
    "bottom-", "sr-", "not-", "group-", "peer-", "placeholder-", "focus-", "hover-",
    "active-", "disabled-", "dark-", "sm-", "md-", "lg-", "xl-", "2xl-",
];
/// Exact single-word CSS utility class names (no prefix needed).
const CSS_CLASS_EXACT: &[&str] = &[
    "flex", "grid", "block", "inline", "hidden", "static", "fixed", "absolute",
    "relative", "sticky", "isolate", "container", "prose", "antialiased", "truncate",
    "uppercase", "lowercase", "capitalize", "italic", "underline", "overline",
    "invisible", "visible", "sr-only", "not-sr-only",
];
/// Drop Tailwind responsive/state variant prefixes from a class word
/// (e.g., "sm:text-lg" → "text-lg", "hover:bg-blue-500" → "bg-blue-500").
/// Splitting on the LAST colon handles chained variants: "dark:sm:text-lg" → "text-lg".
fn strip_tw_variant_prefix(word: &str) -> &str {
    match word.rsplit_once(':') {
        Some((_variants, core)) => core,
        None => word,
    }
}
/// Check if a single whitespace-delimited word looks like a CSS utility class.
fn is_css_class_word(word: &str) -> bool {
    let lower = strip_tw_variant_prefix(word).to_lowercase();

    // Arbitrary value syntax: "[--foo:bar]", "w-[200px]"
    if lower.contains('[') && lower.contains(']') {
        return true;
    }
    // Exact utility names ("flex", "hidden", ...)
    if CSS_CLASS_EXACT.contains(&lower.as_str()) {
        return true;
    }
    let has_utility_prefix = |s: &str| CSS_CLASS_PREFIXES.iter().any(|pfx| s.starts_with(pfx));
    if has_utility_prefix(&lower) {
        return true;
    }
    // Negative utilities: "-mt-4", "-translate-x-1/2"
    match lower.strip_prefix('-') {
        Some(rest) if !rest.is_empty() => has_utility_prefix(rest),
        _ => false,
    }
}
/// Public wrapper for single-word CSS class detection (used by the LLM
/// pipeline for stripping trailing CSS classes from mixed-content lines).
/// Kept as a wrapper so the underlying heuristic stays crate-private.
pub fn is_css_class_word_pub(word: &str) -> bool {
    is_css_class_word(word)
}
/// Check if a text block is predominantly CSS class names.
///
/// Returns true if >50% of the whitespace-delimited words look like CSS
/// utility classes. Requires at least 3 words to avoid false positives on
/// short fragments.
pub fn is_css_class_text(text: &str) -> bool {
    // Single pass with counters — the previous version collected all words
    // into a Vec just to count them (clippy: needless_collect).
    let mut total = 0usize;
    let mut css_count = 0usize;
    for word in text.split_whitespace() {
        total += 1;
        if is_css_class_word(word) {
            css_count += 1;
        }
    }
    if total < 3 {
        return false;
    }
    // >50% of words are CSS classes
    css_count * 2 > total
}
/// Detect "ad" as a standalone class token, not a substring of "read" or "loading".
/// Matches "ad", "ad-..."/"ad_...", and "...-ad"/"..._ad".
fn is_ad_class(class: &str) -> bool {
    for token in class.split_whitespace() {
        let standalone = token == "ad";
        let prefixed = token
            .strip_prefix("ad")
            .is_some_and(|rest| rest.starts_with('-') || rest.starts_with('_'));
        let suffixed = token
            .strip_suffix("ad")
            .is_some_and(|rest| rest.ends_with('-') || rest.ends_with('_'));
        if standalone || prefixed || suffixed {
            return true;
        }
    }
    false
}
#[cfg(test)]
mod tests {
    //! Unit tests for the class-name noise heuristics defined above:
    //! `is_ad_class`, `has_noise_class`, `is_word_boundary_match`,
    //! `is_structural_id`, and `is_css_class_text`.
    use super::*;
    // "ad" must match only as a standalone token / prefixed-suffixed segment.
    #[test]
    fn ad_class_standalone_detected() {
        assert!(is_ad_class("ad"));
        assert!(is_ad_class("some ad-banner"));
        assert!(is_ad_class("top-ad widget"));
        assert!(is_ad_class("ad_unit"));
        assert!(is_ad_class("sidebar_ad"));
    }
    // "ad" appearing inside ordinary words must not trigger.
    #[test]
    fn ad_class_no_false_positive() {
        assert!(!is_ad_class("reading-time"));
        assert!(!is_ad_class("loading-indicator"));
        assert!(!is_ad_class("download-button"));
        assert!(!is_ad_class("breadcrumb"));
    }
    #[test]
    fn noise_class_patterns() {
        assert!(has_noise_class("main-sidebar"));
        assert!(has_noise_class("cookie-banner")); // "cookie" substring match
        assert!(has_noise_class("modal-overlay")); // "modal" substring match
        assert!(has_noise_class("banner-top")); // "banner" as prefix
        assert!(has_noise_class("overlay-popup")); // "overlay" as prefix
        assert!(!has_noise_class("article-content"));
        assert!(!has_noise_class("post-body"));
    }
    #[test]
    fn short_patterns_require_word_boundary() {
        // "nav" (3 chars) — must be a standalone word segment
        assert!(has_noise_class("main-nav"));
        assert!(has_noise_class("nav-bar"));
        assert!(has_noise_class("nav"));
        assert!(!has_noise_class("canvas")); // "nav" is substring, not word
        assert!(has_noise_class("icp-nav-flag")); // "nav" IS between word boundaries
        // "top" (3 chars) — note: "top-bar" starts with Tailwind prefix "top-" → filtered out
        assert!(has_noise_class("page-top")); // "top" at word boundary
        assert!(!has_noise_class("desktop")); // "top" is substring inside word
        assert!(!has_noise_class("stop-motion")); // "top" inside word
        // "side" (4 chars) — "left-side" starts with Tailwind prefix "left-" → filtered
        assert!(has_noise_class("page-side"));
        assert!(!has_noise_class("inside-content"));
        assert!(!has_noise_class("consider"));
    }
    // Regression cases from real Amazon markup (CSS-module class names).
    #[test]
    fn amazon_classes_not_noise() {
        // Amazon CSS module class names that were false-positiving
        assert!(!has_noise_class("desktop")); // contains "top"
        assert!(!has_noise_class("celwidget")); // contains "widget"
        // a-alert-container: "alert" IS a proper word segment → still matches (correct for UI alerts)
        assert!(has_noise_class("a-alert-container"));
        assert!(!has_noise_class(
            "_haul-cx-images-carousel_style_desktop-card__fid8k"
        ));
        assert!(!has_noise_class(
            "_haul-cx-infinite-scroll-body_categoriesheader_active__2j-4u"
        ));
        // But actual noise classes still work
        assert!(has_noise_class("site-header"));
        assert!(has_noise_class("main-nav"));
        assert!(has_noise_class("footer-links"));
        assert!(has_noise_class("cookie-consent"));
    }
    #[test]
    fn word_boundary_match_works() {
        assert!(is_word_boundary_match("main-nav", "nav"));
        assert!(is_word_boundary_match("nav-bar", "nav"));
        assert!(is_word_boundary_match("nav", "nav"));
        assert!(is_word_boundary_match("top-nav_bar", "nav"));
        assert!(!is_word_boundary_match("canvas", "nav"));
        assert!(!is_word_boundary_match("navbar", "nav"));
        assert!(!is_word_boundary_match("navigate", "nav"));
        assert!(is_word_boundary_match("top-bar", "top"));
        assert!(!is_word_boundary_match("desktop", "top"));
        assert!(!is_word_boundary_match("stopper", "top"));
    }
    #[test]
    fn bem_component_names_not_noise() {
        // BEM/Webflow component names where noise keyword is a suffix
        assert!(!has_noise_class("package_banner"));
        assert!(!has_noise_class("mars-cta_planet-overlay"));
        assert!(!has_noise_class("hero_banner_wrap"));
        // But actual noise classes still work
        assert!(has_noise_class("banner-dismiss"));
        assert!(has_noise_class("overlay-backdrop"));
    }
    // React/SPA mount-point IDs look noisy but hold the whole app — keep them.
    #[test]
    fn structural_ids_not_noise() {
        assert!(is_structural_id("modal-portal"));
        assert!(is_structural_id("nav-root"));
        assert!(is_structural_id("header-container"));
        assert!(is_structural_id("sidebar-wrapper"));
        assert!(is_structural_id("menu-mount"));
        assert!(is_structural_id("app"));
        // Actual noise IDs should NOT be structural
        assert!(!is_structural_id("main-sidebar"));
        assert!(!is_structural_id("cookie-consent"));
        assert!(!is_structural_id("popup-overlay"));
    }
    #[test]
    fn tailwind_animation_utilities_not_noise() {
        // Tailwind transition/animation utilities with noise keywords as values
        assert!(!has_noise_class("ease-curve-sidebar"));
        assert!(!has_noise_class("duration-sidebar"));
        assert!(!has_noise_class("delay-modal-open"));
        // But actual sidebar/modal classes still work
        assert!(has_noise_class("sidebar-panel"));
        assert!(has_noise_class("modal-dialog"));
    }
    #[test]
    fn tailwind_css_vars_not_noise() {
        // Tailwind arbitrary values and CSS variables should NOT trigger noise
        assert!(!has_noise_class("[--content-top-offset:var(--header-h)]"));
        assert!(!has_noise_class(
            "pt-[var(--content-top-offset)] [--content-top-offset:var(--header-h)]"
        ));
        assert!(!has_noise_class("[--nav-width:200px]"));
        // But actual noise classes still work
        assert!(has_noise_class("[--offset:10px] header-bar"));
        assert!(has_noise_class("sidebar [--x:1]"));
    }
    // -----------------------------------------------------------------------
    // CSS class text detection (decorative text that looks like class names)
    // -----------------------------------------------------------------------
    #[test]
    fn css_class_text_detected() {
        // Pure Tailwind utility class blocks — the real-world problem
        assert!(is_css_class_text(
            "text-4xl font-bold tracking-tight text-gray-900"
        ));
        assert!(is_css_class_text(
            "text-4xl text-5xl text-6xl text-8xl text-gray-950 text-white tracking-tighter text-balance"
        ));
        assert!(is_css_class_text(
            "flex grid rounded-lg shadow-md bg-white px-4 py-2"
        ));
        assert!(is_css_class_text(
            "sm:text-lg dark:bg-gray-800 hover:bg-blue-500"
        ));
        // Negative utilities
        assert!(is_css_class_text("-mt-4 -translate-x-1/2 flex"));
    }
    #[test]
    fn css_class_text_normal_prose_kept() {
        // Normal English text — must NOT be detected as CSS
        assert!(!is_css_class_text(
            "the text-based approach works well for this use case"
        ));
        assert!(!is_css_class_text(
            "Build beautiful websites with modern tools"
        ));
        assert!(!is_css_class_text(
            "Tailwind CSS is a utility-first CSS framework"
        ));
        // Too short to be confident
        assert!(!is_css_class_text("flex grid"));
        assert!(!is_css_class_text("text-lg"));
    }
    // >50% CSS-looking words tips the verdict (see is_css_class_text).
    #[test]
    fn css_class_text_mixed_content() {
        // Majority CSS → detected
        assert!(is_css_class_text(
            "text-4xl font-bold tracking-tight text-gray-900 hero"
        ));
        // Majority prose → not detected
        assert!(!is_css_class_text(
            "The quick brown fox jumps over the lazy text-lg dog"
        ));
    }
}

View file

@ -0,0 +1,165 @@
/// Extract JSON-LD structured data from HTML.
///
/// Parses `<script type="application/ld+json">` blocks commonly found in
/// e-commerce, news, and recipe sites. Returns machine-readable product info,
/// prices, availability, reviews, etc. without needing JS rendering or LLM.
use serde_json::Value;
/// Extract all JSON-LD blocks from raw HTML.
///
/// Returns parsed JSON values, skipping any blocks that fail to parse.
/// Most e-commerce sites include Schema.org Product markup with prices,
/// sizes, availability, and images.
pub fn extract_json_ld(html: &str) -> Vec<Value> {
    let mut results = Vec::new();
    let needle = "application/ld+json";
    // Walk through the HTML finding <script type="application/ld+json"> blocks.
    // Using simple string scanning instead of a full HTML parser — these blocks
    // are self-contained and reliably structured.
    let mut search_from = 0;
    while let Some(tag_start) = html[search_from..].find("<script") {
        let abs_start = search_from + tag_start;
        let tag_region = &html[abs_start..];
        // Find the end of the opening tag
        let Some(tag_end_offset) = tag_region.find('>') else {
            search_from = abs_start + 7;
            continue;
        };
        let opening_tag = &tag_region[..tag_end_offset];
        // Check if this is a JSON-LD script (tag attributes are ASCII, so an
        // ASCII-case-insensitive scan covers "Application/LD+JSON" etc.).
        if find_ascii_ci(opening_tag, needle).is_none() {
            search_from = abs_start + tag_end_offset + 1;
            continue;
        }
        // Find the closing </script>.
        //
        // BUGFIX: the previous code lowercased `remaining` and indexed the
        // ORIGINAL string with an offset from the lowercased COPY. Unicode
        // lowercasing can change byte length ('İ' U+0130 → "i\u{307}",
        // 2 → 3 bytes), so any such character inside the JSON shifts the
        // offset — mis-slicing the payload or panicking on a non-char
        // boundary. An ASCII-case-insensitive byte scan over the original
        // string keeps offsets exact.
        let content_start = abs_start + tag_end_offset + 1;
        let remaining = &html[content_start..];
        let Some(close_offset) = find_ascii_ci(remaining, "</script>") else {
            search_from = content_start;
            continue;
        };
        let json_str = remaining[..close_offset].trim();
        search_from = content_start + close_offset + 9;
        if json_str.is_empty() {
            continue;
        }
        // Parse — some sites have arrays at top level
        match serde_json::from_str::<Value>(json_str) {
            Ok(Value::Array(arr)) => results.extend(arr),
            Ok(val) => results.push(val),
            Err(_) => {}
        }
    }
    results
}

/// Find `needle` in `haystack` ignoring ASCII case, returning the byte offset
/// into `haystack`. Unlike offsets taken from a `to_lowercase()` copy, the
/// result is always a valid index into the original string: the comparison
/// never re-encodes anything. `needle` must be ASCII (true for the HTML tag
/// fragments searched here); a match starting with an ASCII byte is always a
/// char boundary in UTF-8.
fn find_ascii_ci(haystack: &str, needle: &str) -> Option<usize> {
    let h = haystack.as_bytes();
    let n = needle.as_bytes();
    if n.is_empty() {
        return Some(0);
    }
    if h.len() < n.len() {
        return None;
    }
    h.windows(n.len()).position(|w| w.eq_ignore_ascii_case(n))
}
#[cfg(test)]
mod tests {
    //! Unit tests for `extract_json_ld`: single/multiple blocks, top-level
    //! arrays, invalid JSON, non-JSON-LD scripts, and case/whitespace handling.
    use super::*;
    #[test]
    fn extracts_single_json_ld() {
        let html = r#"
        <html><head>
        <script type="application/ld+json">{"@type":"Product","name":"Test"}</script>
        </head><body></body></html>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["@type"], "Product");
        assert_eq!(results[0]["name"], "Test");
    }
    #[test]
    fn extracts_multiple_json_ld_blocks() {
        let html = r#"
        <script type="application/ld+json">{"@type":"WebSite","url":"https://example.com"}</script>
        <script type="application/ld+json">{"@type":"Product","name":"Shoe","offers":{"price":99.99}}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0]["@type"], "WebSite");
        assert_eq!(results[1]["@type"], "Product");
    }
    // A top-level JSON array is flattened into individual values.
    #[test]
    fn handles_array_json_ld() {
        let html = r#"
        <script type="application/ld+json">[{"@type":"BreadcrumbList"},{"@type":"Product"}]</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 2);
    }
    // Malformed blocks are silently skipped; valid siblings still parse.
    #[test]
    fn skips_invalid_json() {
        let html = r#"
        <script type="application/ld+json">{invalid json here}</script>
        <script type="application/ld+json">{"@type":"Product","name":"Valid"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["name"], "Valid");
    }
    #[test]
    fn ignores_regular_script_tags() {
        let html = r#"
        <script>console.log("not json-ld")</script>
        <script type="text/javascript">var x = 1;</script>
        <script type="application/ld+json">{"@type":"Product"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
    }
    #[test]
    fn handles_no_json_ld() {
        let html = "<html><body><p>No structured data here</p></body></html>";
        let results = extract_json_ld(html);
        assert!(results.is_empty());
    }
    // The type attribute is matched case-insensitively.
    #[test]
    fn case_insensitive_type() {
        let html = r#"
        <script type="Application/LD+JSON">{"@type":"Product"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
    }
    #[test]
    fn handles_whitespace_in_json() {
        let html = r#"
        <script type="application/ld+json">
        {
            "@type": "Product",
            "name": "Test"
        }
        </script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["name"], "Test");
    }
    // Whitespace-only payloads are skipped entirely.
    #[test]
    fn empty_script_tag_skipped() {
        let html = r#"
        <script type="application/ld+json"> </script>
        <script type="application/ld+json">{"@type":"Product"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
    }
}

View file

@ -0,0 +1,80 @@
/// Core types for extraction output.
/// All types are serializable for JSON output to LLM consumers.
use serde::{Deserialize, Serialize};
use crate::domain::DomainType;
/// Complete result of extracting one page: metadata, content, and any
/// domain-specific or structured data that was found.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionResult {
    /// Page-level metadata (title, author, URL, word count, ...).
    pub metadata: Metadata,
    /// Extracted content in its parallel representations.
    pub content: Content,
    /// Domain classification, when one was detected.
    pub domain_data: Option<DomainData>,
    /// JSON-LD structured data extracted from `<script type="application/ld+json">` blocks.
    /// Contains Schema.org markup (Product, Article, BreadcrumbList, etc.) when present.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub structured_data: Vec<serde_json::Value>,
}
/// Page-level metadata gathered during extraction.
/// Every field except `word_count` is optional — `None` when the page
/// does not provide the information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Metadata {
    /// Page title.
    pub title: Option<String>,
    /// Page description.
    pub description: Option<String>,
    /// Author name, when declared.
    pub author: Option<String>,
    /// Publication date string as found on the page.
    pub published_date: Option<String>,
    /// Declared content language.
    pub language: Option<String>,
    /// Page URL.
    pub url: Option<String>,
    /// Site name, when declared.
    pub site_name: Option<String>,
    /// Primary page image URL.
    pub image: Option<String>,
    /// Favicon URL.
    pub favicon: Option<String>,
    /// Word count of the extracted content.
    pub word_count: usize,
}
/// Extracted content in parallel representations, plus collected assets.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Content {
    /// Content rendered as markdown.
    pub markdown: String,
    /// Content as plain text.
    pub plain_text: String,
    /// Hyperlinks found in the content.
    pub links: Vec<Link>,
    /// Images found in the content.
    pub images: Vec<Image>,
    /// Code snippets found in the content.
    pub code_blocks: Vec<CodeBlock>,
    /// Raw HTML of the extracted content; populated when
    /// `ExtractionOptions::include_raw_html` is set, and omitted from
    /// serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub raw_html: Option<String>,
}
/// A hyperlink extracted from the content.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Link {
    /// Visible link text.
    pub text: String,
    /// Link target (the `href` value).
    pub href: String,
}
/// An image reference extracted from the content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Image {
    /// Alternative text (empty string when the page provides none).
    pub alt: String,
    /// Image source URL.
    pub src: String,
}
/// A code block extracted from the content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Declared language tag, when one was present.
    pub language: Option<String>,
    /// The code itself.
    pub code: String,
}
/// Domain-specific extracted data. For MVP, only the detected type is stored.
/// Future: each variant carries structured fields (e.g., Article { author, date, ... }).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DomainData {
    /// Which domain category the page was classified as.
    pub domain_type: DomainType,
}
/// Options for controlling content extraction behavior.
///
/// `Default` yields empty selector lists and all flags `false` — i.e. normal
/// scoring-based extraction with no raw HTML attached.
#[derive(Debug, Clone, Default)]
pub struct ExtractionOptions {
    /// CSS selectors for elements to include. If non-empty, only these elements
    /// are extracted (skipping the scoring algorithm entirely).
    pub include_selectors: Vec<String>,
    /// CSS selectors for elements to exclude from the output.
    pub exclude_selectors: Vec<String>,
    /// If true, skip scoring and pick the first `article`, `main`, or `[role="main"]` element.
    pub only_main_content: bool,
    /// If true, populate `Content::raw_html` with the extracted content's HTML.
    pub include_raw_html: bool,
}

View file

@ -0,0 +1,220 @@
use once_cell::sync::Lazy;
/// YouTube video metadata extraction from `ytInitialPlayerResponse` embedded JSON.
///
/// YouTube embeds the full player config (title, author, view count, description,
/// duration, upload date) in a `<script>` tag as a JS variable assignment. This
/// module parses that blob and formats it as structured markdown, giving LLMs a
/// clean representation without needing the YouTube API.
use regex::Regex;
use tracing::debug;
/// Regex to find the ytInitialPlayerResponse assignment in a <script> block.
/// YouTube uses: `var ytInitialPlayerResponse = {...};`
///
/// NOTE(review): the lazy `\{.+?\}` stops at the first `}` followed by `;`,
/// so the captured group is truncated if that sequence appears inside a JSON
/// string (e.g. a description containing code). Also, `.` does not match
/// newlines by default, so the whole assignment must sit on one line — TODO:
/// confirm both assumptions hold for current YouTube payloads.
static YT_PLAYER_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap());
/// Check if a URL is a YouTube video page.
///
/// Matches `youtube.com/watch` pages and `youtu.be/` short links,
/// case-insensitively.
pub fn is_youtube_url(url: &str) -> bool {
    let normalized = url.to_lowercase();
    ["youtube.com/watch", "youtu.be/"]
        .iter()
        .any(|marker| normalized.contains(marker))
}
/// Extracted YouTube video metadata.
///
/// All fields hold display-ready strings produced in `try_extract`, which
/// substitutes placeholders ("Untitled", "Unknown", "N/A") for missing data.
#[derive(Debug)]
struct VideoMeta {
    // Video title ("Untitled" when absent).
    title: String,
    // Channel name ("Unknown" when absent).
    author: String,
    // Abbreviated view count, e.g. "5.4M" ("N/A" when absent).
    view_count: String,
    // Upload/publish date string as given by the page ("Unknown" when absent).
    upload_date: String,
    // Raw short description; may be empty.
    description: String,
    // Human-readable duration, e.g. "12:34" or "1:23:45".
    duration: String,
}
/// Try to extract YouTube video metadata from the page HTML.
/// Returns structured markdown if successful, None if the page doesn't contain
/// a `ytInitialPlayerResponse` assignment or parsing fails.
pub fn try_extract(html: &str) -> Option<String> {
    let json_str = find_player_response(html)?;
    let value: serde_json::Value = serde_json::from_str(json_str).ok()?;
    let video_details = value.get("videoDetails")?;
    // uploadDate/publishDate live under microformat, which some pages omit.
    let microformat = value
        .get("microformat")
        .and_then(|m| m.get("playerMicroformatRenderer"));
    let title = video_details
        .get("title")
        .and_then(|v| v.as_str())
        .unwrap_or("Untitled")
        .to_string();
    let author = video_details
        .get("author")
        .and_then(|v| v.as_str())
        .unwrap_or("Unknown")
        .to_string();
    let view_count = video_details
        .get("viewCount")
        .and_then(|v| v.as_str())
        .map(format_view_count)
        .unwrap_or_else(|| "N/A".to_string());
    let upload_date = microformat
        .and_then(|m| m.get("uploadDate"))
        .or_else(|| microformat.and_then(|m| m.get("publishDate")))
        .and_then(|v| v.as_str())
        .unwrap_or("Unknown")
        .to_string();
    let description = video_details
        .get("shortDescription")
        .and_then(|v| v.as_str())
        .unwrap_or("")
        .to_string();
    // lengthSeconds is a JSON *string* of digits, hence as_str + parse.
    let duration_secs = video_details
        .get("lengthSeconds")
        .and_then(|v| v.as_str())
        .and_then(|s| s.parse::<u64>().ok())
        .unwrap_or(0);
    let duration = format_duration(duration_secs);
    let meta = VideoMeta {
        title,
        author,
        view_count,
        upload_date,
        description,
        duration,
    };
    debug!(
        title = %meta.title,
        author = %meta.author,
        "extracted YouTube video metadata"
    );
    Some(format_markdown(&meta))
}

/// Locate the JSON object assigned to `ytInitialPlayerResponse` and return it
/// as a slice borrowed from `html`.
///
/// FIX: the previous regex (`(\{.+?\})\s*;`) stopped at the first `}` + `;`
/// sequence, truncating the blob whenever that byte pair appears inside a
/// JSON string — e.g. a video description containing code. Finding the
/// assignment textually and taking the full balanced `{...}` object (while
/// skipping braces inside string literals) is robust to such content, and
/// also tolerates assignments not introduced by `var` or not terminated
/// with `;`.
fn find_player_response(html: &str) -> Option<&str> {
    const KEY: &str = "ytInitialPlayerResponse";
    let mut from = 0;
    while let Some(pos) = html[from..].find(KEY) {
        let abs = from + pos;
        let after_key = html[abs + KEY.len()..].trim_start();
        if let Some(rhs) = after_key.strip_prefix('=') {
            let obj = rhs.trim_start();
            if obj.starts_with('{') {
                if let Some(len) = balanced_object_len(obj) {
                    return Some(&obj[..len]);
                }
            }
        }
        // Not an `= {` assignment (e.g. the name inside a string); keep scanning.
        from = abs + KEY.len();
    }
    None
}

/// Length in bytes of the balanced JSON object starting at `s[0] == '{'`,
/// ignoring braces inside double-quoted strings (backslash escapes honored).
/// Returns None when the object never closes. Byte-wise scanning is safe in
/// UTF-8: multi-byte characters never contain the ASCII bytes `" \ { }`.
fn balanced_object_len(s: &str) -> Option<usize> {
    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;
    for (i, byte) in s.bytes().enumerate() {
        if in_string {
            if escaped {
                escaped = false;
            } else if byte == b'\\' {
                escaped = true;
            } else if byte == b'"' {
                in_string = false;
            }
        } else {
            match byte {
                b'"' => in_string = true,
                b'{' => depth += 1,
                b'}' => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(i + 1);
                    }
                }
                _ => {}
            }
        }
    }
    None
}
/// Format seconds into human-readable duration (e.g., "1:23:45" or "12:34").
/// The hours segment is omitted entirely for durations under one hour.
fn format_duration(total_secs: u64) -> String {
    let hours = total_secs / 3600;
    let remainder = total_secs % 3600;
    let (minutes, seconds) = (remainder / 60, remainder % 60);
    match hours {
        0 => format!("{minutes}:{seconds:02}"),
        h => format!("{h}:{minutes:02}:{seconds:02}"),
    }
}
/// Abbreviate a raw view-count string for display: millions become "X.XM"
/// (e.g., "1234567" -> "1.2M"), thousands "X.XK" (e.g., "1500" -> "1.5K"),
/// and smaller counts pass through as-is. Input that doesn't parse as an
/// unsigned integer is returned unchanged.
///
/// (Doc fix: the previous comment promised comma-grouping — "1,234,567" —
/// which the code never produced; it abbreviates with K/M suffixes.)
fn format_view_count(raw: &str) -> String {
    match raw.parse::<u64>() {
        // Not a plain integer — pass through untouched.
        Err(_) => raw.to_string(),
        Ok(n) if n >= 1_000_000 => format!("{:.1}M", n as f64 / 1_000_000.0),
        Ok(n) if n >= 1_000 => format!("{:.1}K", n as f64 / 1_000.0),
        Ok(n) => n.to_string(),
    }
}
/// Format extracted metadata into structured markdown: an H1 title, a bold
/// stats line, and (when non-empty) a "## Description" section.
fn format_markdown(meta: &VideoMeta) -> String {
    let header = format!("# {}\n\n", meta.title);
    let stats = format!(
        "**Channel:** {} | **Views:** {} | **Published:** {} | **Duration:** {}\n\n",
        meta.author, meta.view_count, meta.upload_date, meta.duration
    );
    let description = if meta.description.is_empty() {
        String::new()
    } else {
        format!("## Description\n\n{}\n", meta.description)
    };
    [header, stats, description].concat()
}
#[cfg(test)]
mod tests {
    //! Unit tests for YouTube URL detection, duration/view-count formatting,
    //! and end-to-end extraction from mock ytInitialPlayerResponse HTML.
    use super::*;
    #[test]
    fn detects_youtube_urls() {
        assert!(is_youtube_url(
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
        ));
        assert!(is_youtube_url("https://youtube.com/watch?v=abc123"));
        assert!(is_youtube_url("https://youtu.be/dQw4w9WgXcQ"));
        assert!(!is_youtube_url("https://example.com"));
        assert!(!is_youtube_url("https://vimeo.com/123456"));
    }
    #[test]
    fn format_duration_short() {
        assert_eq!(format_duration(0), "0:00");
        assert_eq!(format_duration(65), "1:05");
        assert_eq!(format_duration(3661), "1:01:01");
        assert_eq!(format_duration(754), "12:34");
    }
    #[test]
    fn format_view_count_values() {
        assert_eq!(format_view_count("500"), "500");
        assert_eq!(format_view_count("1500"), "1.5K");
        assert_eq!(format_view_count("1234567"), "1.2M");
    }
    // Full pipeline: embedded player JSON -> markdown with all fields present.
    #[test]
    fn extracts_from_mock_html() {
        let html = r#"
        <html><head><title>Test Video</title></head>
        <body>
        <script>
        var ytInitialPlayerResponse = {"videoDetails":{"title":"Rust in 100 Seconds","author":"Fireship","viewCount":"5432100","shortDescription":"Learn Rust in 100 seconds.","lengthSeconds":"120"},"microformat":{"playerMicroformatRenderer":{"uploadDate":"2023-01-15"}}};
        </script>
        </body></html>
        "#;
        let result = try_extract(html).unwrap();
        assert!(result.contains("# Rust in 100 Seconds"));
        assert!(result.contains("**Channel:** Fireship"));
        assert!(result.contains("5.4M"));
        assert!(result.contains("2023-01-15"));
        assert!(result.contains("2:00"));
        assert!(result.contains("Learn Rust in 100 seconds."));
    }
    #[test]
    fn returns_none_for_non_youtube_html() {
        let html = "<html><body><p>Hello world</p></body></html>";
        assert!(try_extract(html).is_none());
    }
    #[test]
    fn handles_missing_optional_fields() {
        let html = r#"
        <html><body>
        <script>
        var ytInitialPlayerResponse = {"videoDetails":{"title":"Minimal Video","author":"Someone","viewCount":"100","shortDescription":"","lengthSeconds":"60"}};
        </script>
        </body></html>
        "#;
        let result = try_extract(html).unwrap();
        assert!(result.contains("# Minimal Video"));
        assert!(result.contains("**Channel:** Someone"));
        // Upload date should be "Unknown" when microformat is missing
        assert!(result.contains("Unknown"));
    }
}