mirror of https://github.com/0xMassi/webclaw.git (synced 2026-05-13 17:02:36 +02:00)

chore: rebrand webclaw to noxa

parent a4c351d5ae
commit 8674b60b4e

86 changed files with 781 additions and 2121 deletions
26  crates/noxa-core/Cargo.toml  Normal file

@@ -0,0 +1,26 @@
[package]
name = "noxa-core"
description = "Pure HTML content extraction engine for LLMs"
version.workspace = true
edition.workspace = true
license.workspace = true

[features]
default = ["quickjs"]
quickjs = ["rquickjs"]

[dependencies]
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
scraper = "0.22"
ego-tree = "0.10"
url = { version = "2", features = ["serde"] }
regex = "1"
once_cell = "1"
similar = "2"
rquickjs = { version = "0.9", features = ["classes", "properties"], optional = true }

[dev-dependencies]
tokio = { workspace = true }
1340  crates/noxa-core/src/brand.rs  Normal file

File diff suppressed because it is too large
557  crates/noxa-core/src/data_island.rs  Normal file

@@ -0,0 +1,557 @@
/// Extract content from JSON data islands embedded in `<script>` tags.
///
/// Many modern SPAs (React, Next.js, Nuxt) ship server-rendered page data
/// as JSON inside script tags rather than in visible DOM elements. This module
/// walks those JSON blobs and recovers text content as a fallback when normal
/// DOM extraction yields sparse results.
use once_cell::sync::Lazy;
use scraper::{Html, Selector};
use tracing::debug;

static SCRIPT_JSON_SELECTOR: Lazy<Selector> =
    Lazy::new(|| Selector::parse("script[type='application/json']").unwrap());

/// Below this word count, try data islands for supplemental content.
/// Set high enough to cover marketing homepages with partial SSR (e.g., Notion
/// SSR-renders ~300 words but has ~800 words in __NEXT_DATA__).
const SPARSE_THRESHOLD: usize = 500;

/// Cap total extracted chunks to bound memory and CPU on adversarial inputs.
const MAX_CHUNKS: usize = 1000;

/// A chunk of text extracted from a JSON data island, with optional heading.
#[derive(Debug)]
struct TextChunk {
    heading: Option<String>,
    body: String,
}

/// Try to extract content from JSON data islands when DOM extraction is sparse.
/// Deduplicates against existing markdown so we only add genuinely new content.
/// Handles: application/json script tags, SvelteKit kit.start() data, and
/// other inline JS data patterns.
pub fn try_extract(doc: &Html, dom_word_count: usize, existing_markdown: &str) -> Option<String> {
    if dom_word_count >= SPARSE_THRESHOLD {
        return None;
    }

    let mut all_chunks: Vec<TextChunk> = Vec::new();
    let existing_lower = existing_markdown.to_lowercase();

    // 1. Standard JSON data islands (application/json script tags)
    for script in doc.select(&SCRIPT_JSON_SELECTOR) {
        if all_chunks.len() >= MAX_CHUNKS {
            break;
        }

        let json_text = script.text().collect::<String>();
        if json_text.len() < 50 {
            continue;
        }

        let Ok(value) = serde_json::from_str::<serde_json::Value>(&json_text) else {
            continue;
        };

        let mut chunks = Vec::new();
        walk_json(&value, &mut chunks, 0);

        if !chunks.is_empty() {
            debug!(
                script_id = script.value().attr("id").unwrap_or(""),
                data_target = script.value().attr("data-target").unwrap_or(""),
                chunks = chunks.len(),
                "extracted text from data island"
            );
            all_chunks.extend(chunks);
        }
    }

    // Note: SvelteKit data islands are handled in structured_data.rs
    // (extracted as structured JSON, not markdown chunks)

    if all_chunks.is_empty() {
        return None;
    }

    // Enforce limit after collecting from all scripts
    all_chunks.truncate(MAX_CHUNKS);

    // Dedup: remove chunks whose text already appears in DOM markdown
    let mut seen = std::collections::HashSet::new();
    all_chunks.retain(|c| {
        // Must have heading or body
        let key = if !c.body.is_empty() {
            c.body.clone()
        } else if let Some(ref h) = c.heading {
            h.clone()
        } else {
            return false;
        };
        if !seen.insert(key.clone()) {
            return false;
        }
        // Skip if the text already exists in the DOM-extracted content
        !existing_lower.contains(&key.to_lowercase())
    });

    if all_chunks.is_empty() {
        return None;
    }

    let mut md = String::new();
    for chunk in &all_chunks {
        if let Some(ref h) = chunk.heading {
            md.push_str(&format!("\n## {h}\n\n"));
        }
        md.push_str(&chunk.body);
        md.push_str("\n\n");
    }

    let md = md.trim().to_string();
    if md.is_empty() {
        None
    } else {
        debug!(chars = md.len(), "data island content recovered");
        Some(md)
    }
}
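
// Illustrative usage sketch (not part of this file's diff): fall back to
// data islands only when the normal DOM pass produced sparse output. The
// caller name and its inputs are hypothetical; the real wiring presumably
// lives in extractor.rs.
fn augment_sparse_page(doc: &Html, dom_markdown: &str, dom_word_count: usize) -> String {
    match try_extract(doc, dom_word_count, dom_markdown) {
        Some(extra) => format!("{dom_markdown}\n\n{extra}"),
        None => dom_markdown.to_string(),
    }
}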

/// Recursively walk a JSON value and extract text content.
fn walk_json(value: &serde_json::Value, chunks: &mut Vec<TextChunk>, depth: usize) {
    if depth > 15 {
        return;
    }

    match value {
        serde_json::Value::Object(map) => {
            // Contentful rich text node: { "nodeType": "...", "content": [...] }
            if let Some(node_type) = map.get("nodeType").and_then(|v| v.as_str())
                && let Some(text) = extract_contentful_node(map, node_type)
            {
                chunks.push(text);
                return;
            }

            // CMS-style entry with heading + subheading/description
            if is_cms_entry(map)
                && let Some(chunk) = extract_cms_entry(map)
            {
                chunks.push(chunk);
                return;
            }

            // Quote/testimonial pattern
            if let Some(chunk) = extract_quote(map) {
                chunks.push(chunk);
                return;
            }

            // Extract orphaned content strings from known field names
            // before recursing (they won't be caught by CMS/quote patterns)
            extract_orphan_texts(map, chunks);

            // Recurse into all values, skipping image/media/asset fields
            for (key, v) in map {
                if is_media_key(key) {
                    continue;
                }
                walk_json(v, chunks, depth + 1);
            }
        }
        serde_json::Value::Array(arr) => {
            // Check for stat-style string arrays (e.g., ["100M+ users", "#1 rated"])
            let content_strings: Vec<&str> = arr
                .iter()
                .filter_map(|v| v.as_str())
                .filter(|s| s.len() > 10 && s.contains(' '))
                .collect();
            if content_strings.len() >= 2 {
                let body = content_strings.join(" | ");
                chunks.push(TextChunk {
                    heading: None,
                    body,
                });
                return;
            }

            for v in arr {
                walk_json(v, chunks, depth + 1);
            }
        }
        _ => {}
    }
}

/// Extract text from a Contentful rich text node.
/// Handles: document, paragraph, heading-1..6, blockquote, etc.
fn extract_contentful_node(
    map: &serde_json::Map<String, serde_json::Value>,
    node_type: &str,
) -> Option<TextChunk> {
    match node_type {
        "document" => {
            // Top-level document — collect children
            let content = map.get("content")?.as_array()?;
            let mut parts = Vec::new();
            for child in content {
                if let Some(chunk) = child
                    .as_object()
                    .and_then(|m| m.get("nodeType").and_then(|v| v.as_str()))
                    .and_then(|nt| extract_contentful_node(child.as_object().unwrap(), nt))
                {
                    if let Some(h) = &chunk.heading {
                        parts.push(format!("## {h}"));
                    }
                    if !chunk.body.is_empty() {
                        parts.push(chunk.body);
                    }
                }
            }
            if parts.is_empty() {
                return None;
            }
            Some(TextChunk {
                heading: None,
                body: parts.join("\n\n"),
            })
        }
        "paragraph" | "text" => {
            let text = collect_text_content(map);
            if is_content_text(&text) {
                Some(TextChunk {
                    heading: None,
                    body: text,
                })
            } else {
                None
            }
        }
        nt if nt.starts_with("heading-") => {
            let text = collect_text_content(map);
            if text.is_empty() {
                None
            } else {
                Some(TextChunk {
                    heading: Some(text),
                    body: String::new(),
                })
            }
        }
        "blockquote" => {
            let text = collect_text_content(map);
            if is_content_text(&text) {
                Some(TextChunk {
                    heading: None,
                    body: format!("> {text}"),
                })
            } else {
                None
            }
        }
        _ => None,
    }
}

/// Recursively collect plain text from a Contentful rich text node tree.
fn collect_text_content(map: &serde_json::Map<String, serde_json::Value>) -> String {
    let mut text = String::new();

    if let Some(v) = map.get("value").and_then(|v| v.as_str()) {
        text.push_str(v);
    }

    if let Some(content) = map.get("content").and_then(|v| v.as_array()) {
        for child in content {
            if let Some(child_map) = child.as_object() {
                let child_text = collect_text_content(child_map);
                text.push_str(&child_text);
            }
        }
    }

    text.trim().to_string()
}

/// Check if a JSON object looks like a CMS entry with heading + description.
fn is_cms_entry(map: &serde_json::Map<String, serde_json::Value>) -> bool {
    let has_heading =
        map.contains_key("heading") || map.contains_key("title") || map.contains_key("headline");
    let has_body = map.contains_key("description")
        || map.contains_key("subheading")
        || map.contains_key("body")
        || map.contains_key("text");
    has_heading && has_body
}

/// Extract heading + body from a CMS-style entry.
fn extract_cms_entry(map: &serde_json::Map<String, serde_json::Value>) -> Option<TextChunk> {
    let heading = extract_text_field(map, "heading")
        .or_else(|| extract_text_field(map, "title"))
        .or_else(|| extract_text_field(map, "headline"))
        .filter(|h| !is_cms_internal_title(h) && h.len() > 5)?;

    let body = extract_text_field(map, "description")
        .or_else(|| extract_text_field(map, "subheading"))
        .or_else(|| extract_text_field(map, "body"))
        .or_else(|| extract_text_field(map, "text"))
        .unwrap_or_default();

    if !is_content_text(&heading) && !is_content_text(&body) {
        return None;
    }

    Some(TextChunk {
        heading: Some(heading),
        body,
    })
}

/// Extract a quote/testimonial from a JSON object.
fn extract_quote(map: &serde_json::Map<String, serde_json::Value>) -> Option<TextChunk> {
    let quote =
        extract_text_field(map, "quote").or_else(|| extract_text_field(map, "quoteText"))?;
    if !is_content_text(&quote) {
        return None;
    }

    let attribution = extract_text_field(map, "position")
        .or_else(|| extract_text_field(map, "author"))
        .or_else(|| extract_text_field(map, "name"))
        .unwrap_or_default();

    let body = if attribution.is_empty() {
        format!("> {quote}")
    } else {
        format!("> {quote}\n> — {attribution}")
    };

    Some(TextChunk {
        heading: None,
        body,
    })
}

/// Extract standalone content strings from known field names that weren't
/// caught by the CMS entry or quote patterns. These are body/description/
/// subheading/eyebrow fields on objects that lack a paired heading, or
/// headline fields on objects that lack a body.
fn extract_orphan_texts(
    map: &serde_json::Map<String, serde_json::Value>,
    chunks: &mut Vec<TextChunk>,
) {
    const BODY_KEYS: &[&str] = &["body", "description", "subheading", "eyebrow", "children"];
    const HEADING_KEYS: &[&str] = &["heading", "title", "headline"];

    // Don't extract if this object was already handled as a CMS entry
    if is_cms_entry(map) {
        return;
    }

    // Try extracting a standalone heading (without body)
    for key in HEADING_KEYS {
        if let Some(text) = extract_text_field(map, key)
            && is_content_text(&text)
        {
            chunks.push(TextChunk {
                heading: Some(text),
                body: String::new(),
            });
            return;
        }
    }

    // Try extracting a standalone body field
    for key in BODY_KEYS {
        if let Some(text) = extract_text_field(map, key)
            && is_content_text(&text)
        {
            chunks.push(TextChunk {
                heading: None,
                body: text,
            });
            return;
        }
    }
}

/// Extract a text value from a JSON field, handling both plain strings and
/// Contentful rich text objects.
fn extract_text_field(
    map: &serde_json::Map<String, serde_json::Value>,
    key: &str,
) -> Option<String> {
    let value = map.get(key)?;

    // Plain string
    if let Some(s) = value.as_str() {
        let s = s.trim().to_string();
        return if s.is_empty() { None } else { Some(s) };
    }

    // Contentful rich text object: { "content": [{ "content": [{ "value": "..." }] }] }
    if let Some(obj) = value.as_object() {
        let text = collect_text_content(obj);
        return if text.is_empty() { None } else { Some(text) };
    }

    None
}

/// JSON keys that hold image/media/asset data — skip recursing into these
/// to avoid extracting CMS alt text as content.
fn is_media_key(key: &str) -> bool {
    let k = key.to_lowercase();
    k == "alt"
        || k.contains("image")
        || k.contains("poster")
        || k.contains("video")
        || k.contains("thumbnail")
        || k.contains("icon")
        || k.contains("logo")
        || k == "src"
        || k == "url"
        || k == "href"
}

/// CMS internal titles like "/home Customer Stories: Logo" or
/// "Copilot agent mode hero poster desktop" are editorial labels, not user-facing text.
fn is_cms_internal_title(s: &str) -> bool {
    // Contentful path-style titles
    if s.starts_with("/home ") || s.starts_with("/page ") {
        return true;
    }
    // Titles that look like asset/component labels (short words, no sentence structure)
    let words: Vec<&str> = s.split_whitespace().collect();
    if words.len() >= 3 {
        let has_label_keyword = words
            .iter()
            .any(|w| ["poster", "logo", "image", "icon", "asset", "thumbnail"].contains(w));
        if has_label_keyword {
            return true;
        }
    }
    false
}

/// Heuristic: is this string actual content (not an ID, URL, class name, etc.)?
fn is_content_text(s: &str) -> bool {
    let s = s.trim();
    if s.len() < 15 {
        return false;
    }
    // Skip URLs, IDs, technical strings
    if s.starts_with("http") || s.starts_with('/') || s.starts_with('{') || s.starts_with('[') {
        return false;
    }
    // Must contain spaces (prose), not just a single technical token
    if !s.contains(' ') {
        return false;
    }
    // Skip strings that are mostly hex/base64 (hashes, IDs)
    let alnum_ratio = s.chars().filter(|c| c.is_alphanumeric()).count() as f64 / s.len() as f64;
    if alnum_ratio < 0.6 {
        return false;
    }
    true
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn extracts_contentful_rich_text() {
        let html = r#"<html><body>
        <script type="application/json" data-target="react-app.embeddedData">
        {"payload":{"contentfulRawJsonResponse":{"includes":{"Entry":[
            {"fields":{
                "heading":"Ship faster with secure CI/CD",
                "subheading":{"content":[{"content":[{"value":"Automate builds, tests, and deployments."}]}]}
            }},
            {"fields":{
                "heading":"Built-in application security",
                "description":{"content":[{"content":[{"value":"Use AI to find and fix vulnerabilities so your team can ship more secure software faster."}]}]}
            }}
        ]}}}}
        </script>
        </body></html>"#;

        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();

        assert!(result.contains("Ship faster with secure CI/CD"));
        assert!(result.contains("Automate builds, tests, and deployments"));
        assert!(result.contains("Built-in application security"));
        assert!(result.contains("find and fix vulnerabilities"));
    }

    #[test]
    fn skips_when_dom_has_enough_content() {
        let html = r#"<html><body>
        <script type="application/json">{"heading":"Foo","description":"Some long description here."}</script>
        </body></html>"#;

        let doc = Html::parse_document(html);
        assert!(try_extract(&doc, 500, "").is_none());
    }

    #[test]
    fn skips_non_content_strings() {
        assert!(!is_content_text("abc123"));
        assert!(!is_content_text("https://example.com/foo/bar"));
        assert!(!is_content_text("/home Customer Stories: Logo"));
        assert!(!is_content_text("a1b2c3d4e5f6a1b2c3d4e5f6"));
        assert!(is_content_text(
            "Automate builds, tests, and deployments with CI/CD."
        ));
    }

    #[test]
    fn extracts_quotes() {
        let html = r#"<html><body>
        <script type="application/json">
        {"fields":{"quote":{"content":[{"content":[{"value":"GitHub frees us from maintaining our own infrastructure."}]}]},"position":"CTO at Example Corp"}}
        </script>
        </body></html>"#;

        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();
        assert!(result.contains("> GitHub frees us from maintaining our own infrastructure."));
        assert!(result.contains("CTO at Example Corp"));
    }

    #[test]
    fn skips_content_already_in_dom() {
        let html = r#"<html><body>
        <script type="application/json">
        {"fields":{"heading":"Already in DOM heading","description":"This text already appears in the DOM markdown output."}}
        </script>
        </body></html>"#;

        let doc = Html::parse_document(html);
        let existing =
            "# Already in DOM heading\n\nThis text already appears in the DOM markdown output.";
        assert!(try_extract(&doc, 10, existing).is_none());
    }

    #[test]
    fn deduplicates_chunks() {
        let html = r#"<html><body>
        <script type="application/json">
        {"a":{"heading":"Same heading here","description":"Same body content across multiple entries."},
         "b":{"heading":"Same heading here","description":"Same body content across multiple entries."}}
        </script>
        </body></html>"#;

        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();
        // Should appear only once
        assert_eq!(
            result
                .matches("Same body content across multiple entries")
                .count(),
            1
        );
    }
}
340  crates/noxa-core/src/diff.rs  Normal file

@@ -0,0 +1,340 @@
/// Change tracking between two extraction snapshots.
/// Pure computation -- no I/O, WASM-safe.
use std::collections::HashSet;

use serde::Serialize;
use similar::TextDiff;

use crate::types::{ExtractionResult, Link};

#[derive(Debug, Clone, Serialize, PartialEq)]
pub enum ChangeStatus {
    Same,
    Changed,
    New,
}

#[derive(Debug, Clone, Serialize)]
pub struct MetadataChange {
    pub field: String,
    pub old: Option<String>,
    pub new: Option<String>,
}

#[derive(Debug, Clone, Serialize)]
pub struct ContentDiff {
    pub status: ChangeStatus,
    pub text_diff: Option<String>,
    pub metadata_changes: Vec<MetadataChange>,
    pub links_added: Vec<Link>,
    pub links_removed: Vec<Link>,
    pub word_count_delta: i64,
}

/// Compare two extraction results and produce a diff.
/// `old` is the previous snapshot, `new_result` is the current extraction.
pub fn diff(old: &ExtractionResult, new_result: &ExtractionResult) -> ContentDiff {
    let text_diff = compute_text_diff(&old.content.markdown, &new_result.content.markdown);
    let metadata_changes = compute_metadata_changes(&old.metadata, &new_result.metadata);
    let (links_added, links_removed) =
        compute_link_changes(&old.content.links, &new_result.content.links);
    let word_count_delta = new_result.metadata.word_count as i64 - old.metadata.word_count as i64;

    let status = if text_diff.is_none() && metadata_changes.is_empty() {
        ChangeStatus::Same
    } else {
        ChangeStatus::Changed
    };

    ContentDiff {
        status,
        text_diff,
        metadata_changes,
        links_added,
        links_removed,
        word_count_delta,
    }
}
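
// Illustrative usage sketch (not part of this file's diff). Note that diff()
// itself only returns Same or Changed; ChangeStatus::New is presumably
// assigned upstream for first-time snapshots. The caller and the output
// formatting below are hypothetical.
fn report_changes(old_snapshot: &ExtractionResult, new_snapshot: &ExtractionResult) {
    let d = diff(old_snapshot, new_snapshot);
    match d.status {
        ChangeStatus::Same => println!("no changes"),
        ChangeStatus::Changed => {
            println!("word count delta: {:+}", d.word_count_delta);
            for m in &d.metadata_changes {
                println!("{}: {:?} -> {:?}", m.field, m.old, m.new);
            }
            if let Some(text) = &d.text_diff {
                // Unified diff with "old"/"new" headers and 3 lines of context.
                println!("{text}");
            }
        }
        ChangeStatus::New => println!("first snapshot, nothing to compare"),
    }
}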

fn compute_text_diff(old: &str, new: &str) -> Option<String> {
    if old == new {
        return None;
    }

    let diff = TextDiff::from_lines(old, new);
    let unified = diff
        .unified_diff()
        .context_radius(3)
        .header("old", "new")
        .to_string();

    if unified.is_empty() {
        None
    } else {
        Some(unified)
    }
}

/// Compare each metadata field, returning only those that changed.
fn compute_metadata_changes(
    old: &crate::types::Metadata,
    new: &crate::types::Metadata,
) -> Vec<MetadataChange> {
    let mut changes = Vec::new();

    let fields: Vec<(&str, &Option<String>, &Option<String>)> = vec![
        ("title", &old.title, &new.title),
        ("description", &old.description, &new.description),
        ("author", &old.author, &new.author),
        ("published_date", &old.published_date, &new.published_date),
        ("language", &old.language, &new.language),
        ("url", &old.url, &new.url),
        ("site_name", &old.site_name, &new.site_name),
        ("image", &old.image, &new.image),
        ("favicon", &old.favicon, &new.favicon),
    ];

    for (name, old_val, new_val) in fields {
        if old_val != new_val {
            changes.push(MetadataChange {
                field: name.to_string(),
                old: old_val.clone(),
                new: new_val.clone(),
            });
        }
    }

    changes
}

/// Links added/removed, compared by href (ignoring text differences).
fn compute_link_changes(old: &[Link], new: &[Link]) -> (Vec<Link>, Vec<Link>) {
    let old_hrefs: HashSet<&str> = old.iter().map(|l| l.href.as_str()).collect();
    let new_hrefs: HashSet<&str> = new.iter().map(|l| l.href.as_str()).collect();

    let added: Vec<Link> = new
        .iter()
        .filter(|l| !old_hrefs.contains(l.href.as_str()))
        .cloned()
        .collect();

    let removed: Vec<Link> = old
        .iter()
        .filter(|l| !new_hrefs.contains(l.href.as_str()))
        .cloned()
        .collect();

    (added, removed)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::DomainType;
    use crate::types::{Content, DomainData, Metadata};

    /// Build a minimal ExtractionResult for test comparisons.
    fn make_result(markdown: &str, title: Option<&str>, links: Vec<Link>) -> ExtractionResult {
        let word_count = markdown.split_whitespace().count();
        ExtractionResult {
            metadata: Metadata {
                title: title.map(String::from),
                description: None,
                author: None,
                published_date: None,
                language: None,
                url: None,
                site_name: None,
                image: None,
                favicon: None,
                word_count,
            },
            content: Content {
                markdown: markdown.to_string(),
                plain_text: markdown.to_string(),
                links,
                images: vec![],
                code_blocks: vec![],
                raw_html: None,
            },
            domain_data: Some(DomainData {
                domain_type: DomainType::Generic,
            }),
            structured_data: vec![],
        }
    }

    fn link(href: &str, text: &str) -> Link {
        Link {
            href: href.to_string(),
            text: text.to_string(),
        }
    }

    #[test]
    fn test_identical_content() {
        let a = make_result("# Hello\n\nSome content here.", Some("Hello"), vec![]);
        let b = make_result("# Hello\n\nSome content here.", Some("Hello"), vec![]);

        let result = diff(&a, &b);

        assert_eq!(result.status, ChangeStatus::Same);
        assert!(result.text_diff.is_none());
        assert!(result.metadata_changes.is_empty());
        assert!(result.links_added.is_empty());
        assert!(result.links_removed.is_empty());
        assert_eq!(result.word_count_delta, 0);
    }

    #[test]
    fn test_title_change() {
        let a = make_result("# Hello\n\nContent.", Some("Old Title"), vec![]);
        let b = make_result("# Hello\n\nContent.", Some("New Title"), vec![]);

        let result = diff(&a, &b);

        assert_eq!(result.status, ChangeStatus::Changed);
        assert!(result.text_diff.is_none(), "text is identical");
        assert_eq!(result.metadata_changes.len(), 1);
        assert_eq!(result.metadata_changes[0].field, "title");
        assert_eq!(result.metadata_changes[0].old.as_deref(), Some("Old Title"));
        assert_eq!(result.metadata_changes[0].new.as_deref(), Some("New Title"));
    }

    #[test]
    fn test_content_change() {
        let a = make_result("# Hello\n\nOld paragraph.", Some("Title"), vec![]);
        let b = make_result("# Hello\n\nNew paragraph.", Some("Title"), vec![]);

        let result = diff(&a, &b);

        assert_eq!(result.status, ChangeStatus::Changed);
        assert!(result.text_diff.is_some());
        let diff_text = result.text_diff.unwrap();
        assert!(diff_text.contains('-'), "should have removal markers");
        assert!(diff_text.contains('+'), "should have addition markers");
    }

    #[test]
    fn test_link_added() {
        let a = make_result("Content.", None, vec![]);
        let b = make_result(
            "Content.",
            None,
            vec![link("https://example.com", "Example")],
        );

        let result = diff(&a, &b);

        assert_eq!(result.links_added.len(), 1);
        assert_eq!(result.links_added[0].href, "https://example.com");
        assert!(result.links_removed.is_empty());
    }

    #[test]
    fn test_link_removed() {
        let a = make_result(
            "Content.",
            None,
            vec![link("https://example.com", "Example")],
        );
        let b = make_result("Content.", None, vec![]);

        let result = diff(&a, &b);

        assert!(result.links_added.is_empty());
        assert_eq!(result.links_removed.len(), 1);
        assert_eq!(result.links_removed[0].href, "https://example.com");
    }

    #[test]
    fn test_links_added_and_removed() {
        let a = make_result(
            "Content.",
            None,
            vec![
                link("https://old.com", "Old"),
                link("https://stable.com", "Stable"),
            ],
        );
        let b = make_result(
            "Content.",
            None,
            vec![
                link("https://stable.com", "Stable"),
                link("https://new.com", "New"),
            ],
        );

        let result = diff(&a, &b);

        assert_eq!(result.links_added.len(), 1);
        assert_eq!(result.links_added[0].href, "https://new.com");
        assert_eq!(result.links_removed.len(), 1);
        assert_eq!(result.links_removed[0].href, "https://old.com");
    }

    #[test]
    fn test_word_count_delta() {
        let a = make_result("one two three", None, vec![]);
        let b = make_result("one two three four five", None, vec![]);

        let result = diff(&a, &b);

        assert_eq!(result.word_count_delta, 2);

        // Negative delta
        let result_rev = diff(&b, &a);
        assert_eq!(result_rev.word_count_delta, -2);
    }

    #[test]
    fn test_unified_diff_format() {
        let a = make_result("line one\nline two\nline three\n", None, vec![]);
        let b = make_result("line one\nline changed\nline three\n", None, vec![]);

        let result = diff(&a, &b);

        assert!(result.text_diff.is_some());
        let diff_text = result.text_diff.unwrap();
        assert!(diff_text.contains("--- old"), "should have old header");
        assert!(diff_text.contains("+++ new"), "should have new header");
        assert!(diff_text.contains("-line two"), "should show removed line");
        assert!(
            diff_text.contains("+line changed"),
            "should show added line"
        );
    }

    #[test]
    fn test_empty_content() {
        let a = make_result("", None, vec![]);
        let b = make_result("", None, vec![]);

        let result = diff(&a, &b);

        assert_eq!(result.status, ChangeStatus::Same);
        assert!(result.text_diff.is_none());
        assert_eq!(result.word_count_delta, 0);
    }

    #[test]
    fn test_link_text_change_ignored() {
        // Same href, different text -- should not appear in added/removed
        let a = make_result(
            "Content.",
            None,
            vec![link("https://example.com", "Old Text")],
        );
        let b = make_result(
            "Content.",
            None,
            vec![link("https://example.com", "New Text")],
        );

        let result = diff(&a, &b);

        assert!(result.links_added.is_empty());
        assert!(result.links_removed.is_empty());
    }
}
187  crates/noxa-core/src/domain.rs  Normal file

@@ -0,0 +1,187 @@
/// Domain detection via URL patterns and DOM heuristics.
/// Knowing the domain type lets downstream consumers apply
/// domain-specific prompts or post-processing.
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum DomainType {
    Article,
    Documentation,
    GitHub,
    Forum,
    ECommerce,
    Social,
    Generic,
}

/// Detect domain type from URL patterns first, then fall back to DOM heuristics.
pub fn detect(url: Option<&str>, html: &str) -> DomainType {
    if let Some(url) = url
        && let Some(dt) = detect_from_url(url)
    {
        return dt;
    }
    detect_from_dom(html)
}
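
// Illustrative test (not part of this file's diff) pinning down the
// precedence: a URL pattern match short-circuits the DOM heuristics.
#[test]
fn url_pattern_wins_over_dom_heuristics() {
    // The body carries an <article> hint, but the docs.* URL matches first.
    let html = "<html><body><article>Post body</article></body></html>";
    assert_eq!(
        detect(Some("https://docs.rs/serde/latest"), html),
        DomainType::Documentation
    );
    // Without a URL, the DOM heuristic decides.
    assert_eq!(detect(None, html), DomainType::Article);
}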

fn detect_from_url(url: &str) -> Option<DomainType> {
    let lower = url.to_lowercase();

    // GitHub
    if lower.contains("github.com") || lower.contains("gitlab.com") {
        return Some(DomainType::GitHub);
    }

    // Documentation sites
    let doc_patterns = [
        "docs.",
        "readthedocs",
        "gitbook",
        "docusaurus",
        "/docs/",
        "/documentation/",
        "devdocs.io",
        "doc.rust-lang.org",
        "developer.mozilla.org",
        "developer.apple.com/documentation",
    ];
    if doc_patterns.iter().any(|p| lower.contains(p)) {
        return Some(DomainType::Documentation);
    }

    // Forums
    let forum_patterns = [
        "reddit.com",
        "news.ycombinator.com",
        "stackoverflow.com",
        "stackexchange.com",
        "discourse",
        "forum",
        "community.",
    ];
    if forum_patterns.iter().any(|p| lower.contains(p)) {
        return Some(DomainType::Forum);
    }

    // Social
    let social_patterns = [
        "twitter.com",
        "x.com",
        "linkedin.com",
        "facebook.com",
        "instagram.com",
        "mastodon",
        "bsky.app",
    ];
    if social_patterns.iter().any(|p| lower.contains(p)) {
        return Some(DomainType::Social);
    }

    // E-commerce
    let ecommerce_patterns = [
        "amazon.",
        "ebay.",
        "shopify.",
        "etsy.com",
        "/product/",
        "/shop/",
        "/cart",
    ];
    if ecommerce_patterns.iter().any(|p| lower.contains(p)) {
        return Some(DomainType::ECommerce);
    }

    None
}

/// Fallback: check HTML for structural hints when URL isn't enough.
fn detect_from_dom(html: &str) -> DomainType {
    let lower = html.to_lowercase();

    // Article signals: <article> tag, schema.org Article type
    if lower.contains("<article") || lower.contains("schema.org/article") {
        return DomainType::Article;
    }

    // Documentation signals
    if lower.contains("docsearch") || lower.contains("sidebar-nav") || lower.contains("doc-content")
    {
        return DomainType::Documentation;
    }

    DomainType::Generic
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn github_urls() {
        assert_eq!(
            detect(Some("https://github.com/tokio-rs/tokio"), ""),
            DomainType::GitHub
        );
        assert_eq!(
            detect(Some("https://gitlab.com/foo/bar"), ""),
            DomainType::GitHub
        );
    }

    #[test]
    fn documentation_urls() {
        assert_eq!(
            detect(Some("https://docs.rs/serde/latest"), ""),
            DomainType::Documentation
        );
        assert_eq!(
            detect(Some("https://readthedocs.org/projects/foo"), ""),
            DomainType::Documentation
        );
    }

    #[test]
    fn forum_urls() {
        assert_eq!(
            detect(Some("https://www.reddit.com/r/rust"), ""),
            DomainType::Forum
        );
        assert_eq!(
            detect(Some("https://stackoverflow.com/questions/123"), ""),
            DomainType::Forum
        );
    }

    #[test]
    fn social_urls() {
        assert_eq!(
            detect(Some("https://x.com/elonmusk"), ""),
            DomainType::Social
        );
        assert_eq!(
            detect(Some("https://linkedin.com/in/someone"), ""),
            DomainType::Social
        );
    }

    #[test]
    fn ecommerce_urls() {
        assert_eq!(
            detect(Some("https://amazon.com/dp/B001"), ""),
            DomainType::ECommerce
        );
    }

    #[test]
    fn dom_fallback_article() {
        let html = r#"<html><body><article><p>Hello world</p></article></body></html>"#;
        assert_eq!(detect(None, html), DomainType::Article);
    }

    #[test]
    fn dom_fallback_generic() {
        let html = r#"<html><body><div>Just some div</div></body></html>"#;
        assert_eq!(detect(None, html), DomainType::Generic);
    }
}
15  crates/noxa-core/src/error.rs  Normal file

@@ -0,0 +1,15 @@
/// Extraction errors — kept minimal since this crate does no I/O.
/// Most failures come from malformed HTML or invalid URLs.
use thiserror::Error;

#[derive(Debug, Error)]
pub enum ExtractError {
    #[error("failed to parse HTML")]
    ParseError,

    #[error("invalid URL: {0}")]
    InvalidUrl(String),

    #[error("no content found")]
    NoContent,
}
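
// Illustrative sketch (not part of this file's diff): since thiserror
// derives Display, callers can format variants directly. The handler
// below is hypothetical.
fn log_extract_error(err: &ExtractError) {
    match err {
        ExtractError::InvalidUrl(u) => eprintln!("skipping page, bad URL: {u}"),
        // `{err}` uses the #[error("...")] display strings above.
        ExtractError::ParseError | ExtractError::NoContent => eprintln!("extraction failed: {err}"),
    }
}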
1555  crates/noxa-core/src/extractor.rs  Normal file

File diff suppressed because it is too large
596  crates/noxa-core/src/js_eval.rs  Normal file

@@ -0,0 +1,596 @@
/// QuickJS-based extraction of data from inline JavaScript in HTML pages.
///
/// Many modern websites embed page data as JavaScript variable assignments
/// (e.g., `window.__PRELOADED_STATE__`, Next.js `self.__next_f`). The static
/// JSON data island approach (`data_island.rs`) only handles `<script type="application/json">`.
/// This module executes inline `<script>` tags in a sandboxed QuickJS runtime
/// to capture those JS-assigned data blobs.
use once_cell::sync::Lazy;
use regex::Regex;
use rquickjs::{Context, Runtime};
use scraper::{Html, Selector};
use tracing::debug;

static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());

/// A blob of data extracted from JS execution.
pub struct JsDataBlob {
    pub name: String,
    pub data: String,
    pub size: usize,
}

/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
    let doc = Html::parse_document(html);

    let scripts: Vec<String> = doc
        .select(&SCRIPT_SELECTOR)
        .filter(|el| {
            let v = el.value();
            // Skip external scripts and ES modules
            if v.attr("src").is_some() {
                return false;
            }
            if v.attr("type").is_some_and(|t| t == "module") {
                return false;
            }
            true
        })
        .map(|el| el.text().collect::<String>())
        .filter(|s| !s.trim().is_empty())
        .collect();

    if scripts.is_empty() {
        return Vec::new();
    }

    let rt = Runtime::new().expect("QuickJS runtime creation failed");
    rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
    rt.set_max_stack_size(1024 * 1024); // 1 MB

    let ctx = Context::full(&rt).expect("QuickJS context creation failed");

    ctx.with(|ctx| {
        // Set up minimal browser stubs so scripts don't crash on missing globals.
        // We don't need real implementations — just enough to avoid ReferenceErrors.
        let setup = r#"
            globalThis.window = globalThis;
            globalThis.self = globalThis;
            globalThis.document = {
                createElement: function() { return { style: {}, setAttribute: function(){}, appendChild: function(){} }; },
                getElementById: function() { return null; },
                querySelector: function() { return null; },
                querySelectorAll: function() { return []; },
                addEventListener: function() {},
                createEvent: function() { return { initEvent: function(){} }; },
                createTextNode: function() { return {}; },
                head: { appendChild: function(){}, removeChild: function(){} },
                body: { appendChild: function(){}, removeChild: function(){} },
                documentElement: { style: {} },
                cookie: "",
                readyState: "complete",
                location: { href: "", hostname: "", pathname: "/" }
            };
            globalThis.navigator = {
                userAgent: "Mozilla/5.0",
                language: "en-US",
                languages: ["en-US"],
                platform: "Linux x86_64",
                cookieEnabled: true
            };
            globalThis.location = { href: "", hostname: "", pathname: "/", search: "", hash: "" };
            globalThis.history = { pushState: function(){}, replaceState: function(){} };
            globalThis.setTimeout = function(fn) { if (typeof fn === "function") { try { fn(); } catch(e) {} } return 0; };
            globalThis.clearTimeout = function() {};
            globalThis.setInterval = function() { return 0; };
            globalThis.clearInterval = function() {};
            globalThis.requestAnimationFrame = function() { return 0; };
            globalThis.cancelAnimationFrame = function() {};
            globalThis.console = { log: function(){}, warn: function(){}, error: function(){}, info: function(){}, debug: function(){} };
            globalThis.fetch = function() { return Promise.resolve({ json: function(){ return Promise.resolve({}); }, text: function(){ return Promise.resolve(""); } }); };
            globalThis.XMLHttpRequest = function() { this.open = function(){}; this.send = function(){}; this.setRequestHeader = function(){}; };
            globalThis.localStorage = { getItem: function(){ return null; }, setItem: function(){}, removeItem: function(){}, clear: function(){} };
            globalThis.sessionStorage = { getItem: function(){ return null; }, setItem: function(){}, removeItem: function(){}, clear: function(){} };
            globalThis.addEventListener = function() {};
            globalThis.removeEventListener = function() {};
            globalThis.dispatchEvent = function() {};
            globalThis.getComputedStyle = function() { return {}; };
            globalThis.matchMedia = function() { return { matches: false, addListener: function(){}, removeListener: function(){} }; };
            globalThis.Image = function() {};
            globalThis.Event = function() {};
            globalThis.CustomEvent = function() {};
            globalThis.MutationObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
            globalThis.IntersectionObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
            globalThis.ResizeObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
            globalThis.performance = { now: function(){ return 0; }, mark: function(){}, measure: function(){} };
            globalThis.crypto = { getRandomValues: function(arr) { return arr; } };
            globalThis.URL = function(u) { this.href = u || ""; this.searchParams = { get: function(){ return null; } }; };
            globalThis.Promise = Promise;
            self.__next_f = self.__next_f || [];
        "#;
        let _ = ctx.eval::<(), _>(setup);

        // Execute each inline script, silently ignoring errors
        for script in &scripts {
            let _ = ctx.eval::<(), _>(script.as_str());
        }

        // Scan window.__* properties for data blobs
        let scan = r#"
            (function() {
                var results = [];
                var keys = Object.keys(globalThis);
                for (var i = 0; i < keys.length; i++) {
                    var key = keys[i];
                    if (key.indexOf("__") !== 0) continue;
                    var val = globalThis[key];
                    if (val === null || val === undefined) continue;

                    // __next_f is an array of RSC flight data chunks
                    if (key === "__next_f") {
                        if (Array.isArray(val) && val.length > 0) {
                            var json = JSON.stringify(val);
                            if (json.length > 100) {
                                results.push({ name: key, data: json, size: json.length });
                            }
                        }
                        continue;
                    }

                    if (typeof val === "object") {
                        try {
                            var json = JSON.stringify(val);
                            if (json && json.length > 100) {
                                results.push({ name: key, data: json, size: json.length });
                            }
                        } catch(e) {}
                    }
                }
                return JSON.stringify(results);
            })()
        "#;

        let Ok(raw): Result<String, _> = ctx.eval(scan) else {
            return Vec::new();
        };

        let Ok(entries) = serde_json::from_str::<Vec<RawBlob>>(&raw) else {
            return Vec::new();
        };

        let blobs: Vec<JsDataBlob> = entries
            .into_iter()
            .map(|e| JsDataBlob {
                name: e.name,
                size: e.size,
                data: e.data,
            })
            .collect();

        if !blobs.is_empty() {
            debug!(
                count = blobs.len(),
                names = blobs
                    .iter()
                    .map(|b| b.name.as_str())
                    .collect::<Vec<_>>()
                    .join(", "),
                "extracted JS data blobs"
            );
        }

        blobs
    })
}

/// Intermediate deserialization target for the scan script output.
#[derive(serde::Deserialize)]
struct RawBlob {
    name: String,
    data: String,
    size: usize,
}

/// Extract readable text from JS data blobs and format as markdown.
///
/// Walks each blob's JSON looking for human-readable strings, filters out
/// URLs/paths/CSS/base64, deduplicates, and joins into a single section.
pub fn extract_readable_text(blobs: &[JsDataBlob]) -> String {
    let mut texts: Vec<String> = Vec::new();
    let mut seen = std::collections::HashSet::new();

    for blob in blobs {
        if blob.name == "__next_f" {
            let rsc_texts = extract_next_f_text(&blob.data);
            for t in rsc_texts {
                if seen.insert(t.clone()) {
                    texts.push(t);
                }
            }
            continue;
        }

        let Ok(value) = serde_json::from_str::<serde_json::Value>(&blob.data) else {
            continue;
        };

        let mut found = Vec::new();
        walk_json_for_text(&value, &mut found, 0);

        for t in found {
            if seen.insert(t.clone()) {
                texts.push(t);
            }
        }
    }

    if texts.is_empty() {
        return String::new();
    }

    let mut md = String::from("## Additional Content\n\n");
    md.push_str(&texts.join("\n\n"));
    md
}
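
// Illustrative sketch (not part of this file's diff) putting the two halves
// together behind the crate's quickjs feature; the caller is hypothetical.
#[cfg(feature = "quickjs")]
fn recover_js_content(html: &str) -> Option<String> {
    // Run inline scripts in the sandbox, then flatten whatever
    // window.__* blobs survived into a markdown section.
    let blobs = extract_js_data(html);
    if blobs.is_empty() {
        return None;
    }
    let md = extract_readable_text(&blobs);
    if md.is_empty() { None } else { Some(md) }
}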
|
||||
|
||||
/// Recursively walk JSON and collect readable text strings.
|
||||
fn walk_json_for_text(value: &serde_json::Value, out: &mut Vec<String>, depth: usize) {
|
||||
if depth > 15 {
|
||||
return;
|
||||
}
|
||||
|
||||
match value {
|
||||
serde_json::Value::String(s) => {
|
||||
if let Some(clean) = filter_readable(s) {
|
||||
out.push(clean);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Object(map) => {
|
||||
for (_, v) in map {
|
||||
walk_json_for_text(v, out, depth + 1);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(arr) => {
|
||||
for v in arr {
|
||||
walk_json_for_text(v, out, depth + 1);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Filter a string for readability: must be >15 chars, mostly alphabetic,
|
||||
/// not a URL, file path, CSS rule, or base64 blob. Strips inline HTML tags.
|
||||
fn filter_readable(s: &str) -> Option<String> {
|
||||
let s = s.trim();
|
||||
if s.len() <= 15 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip URLs
|
||||
if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("//") {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip file paths
|
||||
if s.starts_with('/') || s.starts_with("./") || s.starts_with("../") {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip CSS-like strings
|
||||
if s.contains('{') && s.contains('}') && (s.contains(':') || s.contains(';')) {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip CSS grid templates, layout strings, and dimension patterns
|
||||
if s.contains("1fr")
|
||||
|| s.contains("grid-")
|
||||
|| s.contains("max-content")
|
||||
|| s.contains("divider-v-")
|
||||
|| s.contains("divider-h-")
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip CSS layout area definitions (e.g. "card1 card2 card3")
|
||||
// These have repeated dash-separated tokens with digits
|
||||
let dash_digit_tokens = s
|
||||
.split_whitespace()
|
||||
.filter(|w| w.contains('-') && w.chars().any(|c| c.is_ascii_digit()))
|
||||
.count();
|
||||
if dash_digit_tokens >= 2 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip strings containing literal quote characters (CSS grid areas, code snippets)
|
||||
if s.contains('"') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip CSS grid area names and layout tokens.
|
||||
// These are strings of short lowercase words/dots with no sentence structure.
|
||||
if !s.chars().any(|c| c.is_uppercase()) {
|
||||
let is_css_layout = s.split_whitespace().all(|w| {
|
||||
w == "."
|
||||
|| (w.len() <= 20
|
||||
&& w.chars()
|
||||
.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-'))
|
||||
});
|
||||
if is_css_layout {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// Skip CSS dimension strings (e.g. "16px 0px 0px 0px")
|
||||
if s.split_whitespace().all(|w| {
|
||||
w.ends_with("px") || w.ends_with("em") || w.ends_with("rem") || w.ends_with("%") || w == "0"
|
||||
}) {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip base64
|
||||
if s.len() > 50 && !s.contains(' ') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip strings that are mostly HTML tags
|
||||
if s.matches('<').count() > 3 && s.matches('>').count() > 3 {
|
||||
let stripped = HTML_TAG_RE.replace_all(s, "");
|
||||
if stripped.trim().len() < 15 {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// Skip strings ending with file extensions
|
||||
if s.ends_with(".js")
|
||||
|| s.ends_with(".css")
|
||||
|| s.ends_with(".png")
|
||||
|| s.ends_with(".jpg")
|
||||
|| s.ends_with(".svg")
|
||||
|| s.ends_with(".woff2")
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
// Must be mostly alphabetic (spaces + letters should dominate)
|
||||
let alpha_space = s
|
||||
.chars()
|
||||
.filter(|c| c.is_alphabetic() || c.is_whitespace())
|
||||
.count();
|
||||
let ratio = alpha_space as f64 / s.len() as f64;
|
||||
if ratio < 0.6 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Must contain spaces (prose, not a single token)
|
||||
if !s.contains(' ') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Strip any inline HTML tags
|
||||
let clean = HTML_TAG_RE.replace_all(s, "").trim().to_string();
|
||||
|
||||
if clean.len() <= 15 {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(clean)
|
||||
}
|
||||
|
||||
/// Parse Next.js RSC flight data (`self.__next_f`) and extract readable text.
|
||||
///
|
||||
/// Wire format: array of `[type, payload]` tuples. Type 1 contains the actual
|
||||
/// RSC data as newline-delimited entries like `id:TYPE|payload`.
|
||||
fn extract_next_f_text(raw_json: &str) -> Vec<String> {
|
||||
let Ok(entries) = serde_json::from_str::<Vec<serde_json::Value>>(raw_json) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
// Concatenate all type=1 payloads
|
||||
let mut wire = String::new();
|
||||
for entry in &entries {
|
||||
let arr = match entry.as_array() {
|
||||
Some(a) if a.len() >= 2 => a,
|
||||
_ => continue,
|
||||
};
|
||||
let entry_type = arr[0].as_u64().unwrap_or(0);
|
||||
if entry_type != 1 {
|
||||
continue;
|
||||
}
|
||||
if let Some(payload) = arr[1].as_str() {
|
||||
wire.push_str(payload);
|
||||
}
|
||||
}
|
||||
|
||||
if wire.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut texts = Vec::new();
|
||||
|
||||
// Each line is `id:TYPE|payload` — parse the JSON payloads
|
||||
for line in wire.lines() {
|
||||
// Find the payload after the first `|` or `:` marker
|
||||
let payload = if let Some(pos) = line.find('|') {
|
||||
&line[pos + 1..]
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
// Try to parse as JSON array (RSC element representation)
|
||||
if let Ok(value) = serde_json::from_str::<serde_json::Value>(payload) {
|
||||
walk_rsc_tree(&value, &mut texts, 0);
|
||||
}
|
||||
}
|
||||
|
||||
texts
|
||||
}
|
||||
|
||||
/// Walk an RSC tree element extracting children text content.
|
||||
fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize) {
|
||||
if depth > 20 {
|
||||
return;
|
||||
}
|
||||
|
||||
match value {
|
||||
serde_json::Value::String(s) => {
|
||||
if let Some(clean) = filter_readable(s) {
|
||||
out.push(clean);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(arr) => {
|
||||
for item in arr {
|
||||
walk_rsc_tree(item, out, depth + 1);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Object(map) => {
|
||||
// RSC elements have "children" that contain text
|
||||
if let Some(children) = map.get("children") {
|
||||
walk_rsc_tree(children, out, depth + 1);
|
||||
}
|
||||
// Also check other fields
|
||||
for (key, v) in map {
|
||||
if key == "children" {
|
||||
continue;
|
||||
}
|
||||
walk_rsc_tree(v, out, depth + 1);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn extracts_window_preloaded_data() {
|
||||
let html = r#"<html><body>
|
||||
<script>
|
||||
window.__preloadedData = {
|
||||
"page": {
|
||||
"title": "Hello World Article Title",
|
||||
"body": "This is a longer paragraph of text that should be extracted from the preloaded data blob successfully."
|
||||
}
|
||||
};
|
||||
</script>
|
||||
</body></html>"#;
|
||||
|
||||
let blobs = extract_js_data(html);
|
||||
assert!(!blobs.is_empty(), "should extract at least one blob");
|
||||
assert!(
|
||||
blobs.iter().any(|b| b.name == "__preloadedData"),
|
||||
"should find __preloadedData"
|
||||
);
|
||||
|
||||
let text = extract_readable_text(&blobs);
|
||||
assert!(
|
||||
text.contains("This is a longer paragraph"),
            "should extract readable text from blob"
        );
    }

    #[test]
    fn skips_external_and_module_scripts() {
        let html = r#"<html><body>
            <script src="https://cdn.example.com/app.js"></script>
            <script type="module">export default {};</script>
            <script>window.__testData = {"content": "This is a test sentence that is long enough to be extracted from the page and it needs over one hundred characters of JSON to pass the threshold."};</script>
        </body></html>"#;

        let blobs = extract_js_data(html);
        assert_eq!(
            blobs.len(),
            1,
            "should only process inline non-module script"
        );
        assert_eq!(blobs[0].name, "__testData");
    }

    #[test]
    fn empty_html_returns_no_blobs() {
        let blobs = extract_js_data("<html><body></body></html>");
        assert!(blobs.is_empty());
    }

    #[test]
    fn filter_readable_rejects_junk() {
        assert!(filter_readable("short").is_none());
        assert!(filter_readable("https://example.com/some/long/path").is_none());
        assert!(filter_readable("/static/js/bundle.min.js").is_none());
        assert!(filter_readable("aGVsbG8gd29ybGQgdGhpcyBpcyBhIGJhc2U2NCBzdHJpbmc=").is_none());
        assert!(filter_readable(".container { display: flex; padding: 10px; }").is_none());
    }

    #[test]
    fn filter_readable_accepts_prose() {
        let result = filter_readable("This is a normal sentence with enough words.");
        assert!(result.is_some());
        assert_eq!(
            result.unwrap(),
            "This is a normal sentence with enough words."
        );
    }

    #[test]
    fn strips_html_tags_from_text() {
        let result = filter_readable(
            "This has <strong>bold</strong> and <em>italic</em> formatting inside it.",
        );
        assert!(result.is_some());
        let clean = result.unwrap();
        assert!(!clean.contains('<'));
        assert!(clean.contains("bold"));
        assert!(clean.contains("italic"));
    }

    #[test]
    fn extract_readable_text_produces_markdown() {
        let blobs = vec![JsDataBlob {
            name: "__data".to_string(),
            data: r#"{"article":"This is the main article content that should appear in the extracted text."}"#
                .to_string(),
            size: 100,
        }];

        let text = extract_readable_text(&blobs);
        assert!(text.starts_with("## Additional Content"));
        assert!(text.contains("main article content"));
    }

    #[test]
    fn extract_next_f_rsc_data() {
        let blobs = vec![JsDataBlob {
            name: "__next_f".to_string(),
            data: r#"[[0,""],
[1,"0:T1234|{\"children\":\"This is some Next.js RSC flight data content that we want to extract.\"}\n"]]"#
                .to_string(),
            size: 200,
        }];

        let text = extract_readable_text(&blobs);
        assert!(
            text.contains("Next.js RSC flight data content"),
            "should extract text from RSC flight data. Got: {text}"
        );
    }

    #[test]
    fn handles_script_errors_gracefully() {
        // Scripts that throw errors should be silently ignored
        let html = r#"<html><body>
            <script>throw new Error("intentional crash");</script>
            <script>undefined_function();</script>
            <script>window.__survived = {"message": "This script ran after the errors and the data should still be found in the extracted blobs because it exceeds the minimum threshold."};</script>
        </body></html>"#;

        let blobs = extract_js_data(html);
        assert!(
            blobs.iter().any(|b| b.name == "__survived"),
            "should extract data from scripts that succeed after failures"
        );
    }
}
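Editor's note, not part of the commit: the tests above pin down the js_eval contract (inline, non-module scripts are evaluated; globals assigned by scripts that survive earlier errors are harvested; blobs under the size threshold are dropped). A minimal round-trip sketch in the same style, written as if it were one more test in this module; the size threshold and heading text are taken from the assertions above, everything else (the global name, the page) is hypothetical:

    #[test]
    fn sketch_js_data_round_trip() {
        // Hypothetical page: one global assigned from an inline script.
        // The JSON is deliberately longer than one hundred characters so it
        // clears the minimum-size threshold exercised by the tests above.
        let html = r#"<html><body>
            <script>window.__state = {"article": "This sentence is comfortably long enough to clear the one-hundred-character minimum that the evaluator applies to captured JSON blobs."};</script>
        </body></html>"#;

        let blobs = extract_js_data(html);
        assert_eq!(blobs[0].name, "__state");

        // Readable prose is emitted under the "## Additional Content" heading.
        let md = extract_readable_text(&blobs);
        assert!(md.starts_with("## Additional Content"));
    }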
606
crates/noxa-core/src/lib.rs
Normal file
@@ -0,0 +1,606 @@

pub mod brand;
pub(crate) mod data_island;
/// noxa-core: Pure HTML content extraction engine for LLMs.
///
/// Takes raw HTML + optional URL, returns structured content
/// (metadata, markdown, plain text, links, images, code blocks).
/// Zero network dependencies — WASM-compatible by design.
pub mod diff;
pub mod domain;
pub mod error;
pub mod extractor;
#[cfg(feature = "quickjs")]
pub mod js_eval;
pub mod llm;
pub mod markdown;
pub mod metadata;
#[allow(dead_code)]
pub(crate) mod noise;
pub mod structured_data;
pub mod types;
pub mod youtube;

pub use brand::BrandIdentity;
pub use diff::{ChangeStatus, ContentDiff, MetadataChange};
pub use domain::DomainType;
pub use error::ExtractError;
pub use llm::to_llm_text;
pub use types::{
    CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
};

use scraper::Html;
use url::Url;

/// Extract structured content from raw HTML.
///
/// `html` — raw HTML string to parse
/// `url` — optional source URL, used for resolving relative links and domain detection
pub fn extract(html: &str, url: Option<&str>) -> Result<ExtractionResult, ExtractError> {
    extract_with_options(html, url, &ExtractionOptions::default())
}

/// Extract structured content from raw HTML with configurable options.
///
/// `html` — raw HTML string to parse
/// `url` — optional source URL, used for resolving relative links and domain detection
/// `options` — controls include/exclude selectors, main content mode, and raw HTML output
///
/// Spawns extraction on a thread with an 8 MB stack to handle deeply nested
/// HTML (e.g., Express.co.uk live blogs) without overflowing the default 1-2 MB
/// main-thread stack on Windows.
pub fn extract_with_options(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
    // The default main-thread stack on Windows is 1 MB, which can overflow
    // on deeply nested pages. Spawn a worker thread with 8 MB to be safe.
    const STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB

    let html = html.to_string();
    let url = url.map(|u| u.to_string());
    let options = options.clone();

    std::thread::Builder::new()
        .stack_size(STACK_SIZE)
        .spawn(move || extract_with_options_inner(&html, url.as_deref(), &options))
        .map_err(|_| ExtractError::NoContent)?
        .join()
        .unwrap_or(Err(ExtractError::NoContent))
}

fn extract_with_options_inner(
    html: &str,
    url: Option<&str>,
    options: &ExtractionOptions,
) -> Result<ExtractionResult, ExtractError> {
    if html.is_empty() {
        return Err(ExtractError::NoContent);
    }

    // YouTube fast path: if the URL is a YouTube video page, try extracting
    // structured metadata from ytInitialPlayerResponse before DOM scoring.
    // This gives LLMs a clean, structured view of video metadata.
    if let Some(u) = url
        && youtube::is_youtube_url(u)
        && let Some(yt_md) = youtube::try_extract(html)
    {
        let doc = Html::parse_document(html);
        let mut meta = metadata::extract(&doc, url);
        meta.word_count = extractor::word_count(&yt_md);

        let plain_text = yt_md
            .lines()
            .filter(|l| !l.starts_with('#') && !l.starts_with("**"))
            .collect::<Vec<_>>()
            .join("\n")
            .trim()
            .to_string();

        let domain_data = Some(DomainData {
            domain_type: DomainType::Social,
        });

        let structured_data = structured_data::extract_json_ld(html);

        return Ok(ExtractionResult {
            metadata: meta,
            content: Content {
                markdown: yt_md,
                plain_text,
                links: Vec::new(),
                images: Vec::new(),
                code_blocks: Vec::new(),
                raw_html: None,
            },
            domain_data,
            structured_data,
        });
    }

    let doc = Html::parse_document(html);

    let base_url = url
        .map(|u| Url::parse(u).map_err(|_| ExtractError::InvalidUrl(u.to_string())))
        .transpose()?;

    // Metadata from <head>
    let mut meta = metadata::extract(&doc, url);

    // Main content extraction (Readability-style scoring + markdown conversion)
    let mut content = extractor::extract_content(&doc, base_url.as_ref(), options);
    // Use the higher of plain_text and markdown word counts.
    // Some pages (headings + links) have content in markdown but empty plain_text.
    let pt_wc = extractor::word_count(&content.plain_text);
    let md_wc = extractor::word_count(&content.markdown);
    meta.word_count = pt_wc.max(md_wc);

    // Retry fallback: if extraction captured too little of the page's visible content,
    // retry with wider strategies. The scorer sometimes picks a tiny node (e.g., an
    // <article> with 52 words when the body has 1300 words of real content).
    //
    // Strategy 1: retry without only_main_content restriction
    if options.only_main_content && meta.word_count < 30 {
        let relaxed = ExtractionOptions {
            only_main_content: false,
            ..options.clone()
        };
        let retry = extractor::extract_content(&doc, base_url.as_ref(), &relaxed);
        let retry_wc =
            extractor::word_count(&retry.plain_text).max(extractor::word_count(&retry.markdown));
        if retry_wc > meta.word_count {
            content = retry;
            meta.word_count = retry_wc;
        }
    }

    // Strategy 2: if scored extraction is sparse (<200 words) AND the page has
    // significantly more visible text, retry with include_selectors: ["body"].
    // This bypasses the readability scorer entirely — catches blogs, pricing
    // pages, and modern sites where no single element scores well.
    if meta.word_count < 200 && options.include_selectors.is_empty() {
        let body_opts = ExtractionOptions {
            include_selectors: vec!["body".to_string()],
            exclude_selectors: options.exclude_selectors.clone(),
            only_main_content: false,
            include_raw_html: false,
        };
        let body_content = extractor::extract_content(&doc, base_url.as_ref(), &body_opts);
        let body_wc = extractor::word_count(&body_content.plain_text)
            .max(extractor::word_count(&body_content.markdown));
        // Use body extraction if it captures significantly more content (>2x)
        if body_wc > meta.word_count * 2 && body_wc > 50 {
            content = body_content;
            meta.word_count = body_wc;
        }
    }

    // Fallback: if DOM extraction was sparse, try JSON data islands
    // (React SPAs, Next.js, Contentful CMS embed page data in <script> tags)
    if let Some(island_md) = data_island::try_extract(&doc, meta.word_count, &content.markdown) {
        content.markdown.push_str("\n\n");
        content.markdown.push_str(&island_md);
        meta.word_count = extractor::word_count(&content.markdown);
    }

    // QuickJS: execute inline <script> tags to capture JS-assigned data blobs
    // (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
    // static JSON data island extraction above with runtime-evaluated data.
    #[cfg(feature = "quickjs")]
    {
        let blobs = js_eval::extract_js_data(html);
        if !blobs.is_empty() {
            let js_text = js_eval::extract_readable_text(&blobs);
            if !js_text.is_empty() {
                content.markdown.push_str("\n\n");
                content.markdown.push_str(&js_text);
                meta.word_count = extractor::word_count(&content.markdown);
            }
        }
    }

    // Domain detection from URL patterns and DOM heuristics
    let domain_type = domain::detect(url, html);
    let domain_data = Some(DomainData { domain_type });

    // Structured data: JSON-LD + __NEXT_DATA__ + SvelteKit data islands
    let mut structured_data = structured_data::extract_json_ld(html);
    structured_data.extend(structured_data::extract_next_data(html));
    structured_data.extend(structured_data::extract_sveltekit(html));

    Ok(ExtractionResult {
        metadata: meta,
        content,
        domain_data,
        structured_data,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn full_extraction_pipeline() {
        let html = r#"
            <html lang="en">
            <head>
                <title>Rust is Great</title>
                <meta name="description" content="An article about Rust">
                <meta name="author" content="Bob">
            </head>
            <body>
                <nav><a href="/">Home</a> | <a href="/about">About</a></nav>
                <article>
                    <h1>Why Rust is Great</h1>
                    <p>Rust gives you <strong>memory safety</strong> without a garbage collector.
                    This is achieved through its <em>ownership system</em>.</p>
                    <p>Here is an example:</p>
                    <pre><code class="language-rust">fn main() {
    println!("Hello, world!");
}</code></pre>
                    <p>Learn more at <a href="https://rust-lang.org">rust-lang.org</a>.</p>
                </article>
                <footer>Copyright 2025</footer>
            </body>
            </html>"#;

        let result = extract(html, Some("https://blog.example.com/rust")).unwrap();

        // Metadata
        assert_eq!(result.metadata.title.as_deref(), Some("Rust is Great"));
        assert_eq!(
            result.metadata.description.as_deref(),
            Some("An article about Rust")
        );
        assert_eq!(result.metadata.author.as_deref(), Some("Bob"));
        assert_eq!(result.metadata.language.as_deref(), Some("en"));
        assert!(result.metadata.word_count > 0);

        // Content
        assert!(result.content.markdown.contains("# Why Rust is Great"));
        assert!(result.content.markdown.contains("**memory safety**"));
        assert!(result.content.markdown.contains("```rust"));
        assert!(
            result
                .content
                .links
                .iter()
                .any(|l| l.href == "https://rust-lang.org")
        );
        assert!(!result.content.code_blocks.is_empty());

        // raw_html not populated by default
        assert!(result.content.raw_html.is_none());

        // Domain — blog.example.com has <article> tag
        let dd = result.domain_data.unwrap();
        assert_eq!(dd.domain_type, DomainType::Article);
    }

    #[test]
    fn invalid_url_returns_error() {
        let result = extract("<html></html>", Some("not a url"));
        assert!(matches!(result, Err(ExtractError::InvalidUrl(_))));
    }

    #[test]
    fn empty_html_returns_error() {
        let result = extract("", None);
        assert!(matches!(result, Err(ExtractError::NoContent)));
    }

    #[test]
    fn no_url_is_fine() {
        let result = extract("<html><body><p>Hello</p></body></html>", None);
        assert!(result.is_ok());
    }

    #[test]
    fn serializes_to_json() {
        let result = extract("<html><body><p>Test</p></body></html>", None).unwrap();
        let json = serde_json::to_string_pretty(&result).unwrap();
        assert!(json.contains("metadata"));
        assert!(json.contains("content"));
        // raw_html should be absent (skip_serializing_if)
        assert!(!json.contains("raw_html"));
    }

    #[test]
    fn youtube_extraction_produces_structured_markdown() {
        let html = r#"
            <html><head><title>Rust in 100 Seconds - YouTube</title></head>
            <body>
            <script>
            var ytInitialPlayerResponse = {"videoDetails":{"title":"Rust in 100 Seconds","author":"Fireship","viewCount":"5432100","shortDescription":"Learn Rust in 100 seconds. A mass of web developers are mass adopting Rust.","lengthSeconds":"120"},"microformat":{"playerMicroformatRenderer":{"uploadDate":"2023-01-15"}}};
            </script>
            </body></html>
        "#;

        let result = extract(html, Some("https://www.youtube.com/watch?v=5C_HPTJg5ek")).unwrap();

        assert!(result.content.markdown.contains("# Rust in 100 Seconds"));
        assert!(result.content.markdown.contains("**Channel:** Fireship"));
        assert!(result.content.markdown.contains("2:00"));
        assert!(
            result
                .content
                .markdown
                .contains("Learn Rust in 100 seconds")
        );

        // Should be detected as Social domain
        let dd = result.domain_data.unwrap();
        assert_eq!(dd.domain_type, DomainType::Social);
    }

    #[test]
    fn youtube_url_without_player_response_falls_through() {
        // If ytInitialPlayerResponse is missing, fall through to normal extraction
        let html = r#"<html><body><article><h1>Some YouTube Page</h1><p>Content here for testing.</p></article></body></html>"#;
        let result = extract(html, Some("https://www.youtube.com/watch?v=abc123")).unwrap();

        // Should still extract something via normal pipeline
        assert!(result.content.markdown.contains("Some YouTube Page"));
    }

    // --- ExtractionOptions tests ---

    #[test]
    fn test_exclude_selectors() {
        let html = r#"<html><body>
            <nav>Navigation stuff</nav>
            <article><h1>Title</h1><p>Real content here.</p></article>
            <footer>Footer stuff</footer>
        </body></html>"#;

        let options = ExtractionOptions {
            exclude_selectors: vec!["nav".into(), "footer".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Navigation stuff"),
            "nav should be excluded"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should be excluded"
        );
    }

    #[test]
    fn test_include_selectors() {
        let html = r#"<html><body>
            <nav>Navigation stuff</nav>
            <article><h1>Title</h1><p>Real content here.</p></article>
            <div class="sidebar">Sidebar junk</div>
            <footer>Footer stuff</footer>
        </body></html>"#;

        let options = ExtractionOptions {
            include_selectors: vec!["article".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(result.content.markdown.contains("Title"));
        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Navigation stuff"),
            "nav should not be included"
        );
        assert!(
            !result.content.markdown.contains("Sidebar junk"),
            "sidebar should not be included"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should not be included"
        );
    }

    #[test]
    fn test_include_and_exclude() {
        let html = r#"<html><body>
            <article>
                <h1>Title</h1>
                <p>Real content here.</p>
                <div class="sidebar">Sidebar inside article</div>
            </article>
            <footer>Footer stuff</footer>
        </body></html>"#;

        let options = ExtractionOptions {
            include_selectors: vec!["article".into()],
            exclude_selectors: vec![".sidebar".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(result.content.markdown.contains("Title"));
        assert!(result.content.markdown.contains("Real content"));
        assert!(
            !result.content.markdown.contains("Sidebar inside article"),
            "sidebar inside article should be excluded"
        );
        assert!(
            !result.content.markdown.contains("Footer stuff"),
            "footer should not be included"
        );
    }

    #[test]
    fn test_only_main_content() {
        let html = r#"<html><body>
            <nav>Navigation</nav>
            <div class="hero"><h1>Big Hero</h1></div>
            <article><h2>Article Title</h2><p>Article content that is long enough to be real.</p></article>
            <div class="sidebar">Sidebar</div>
            <footer>Footer</footer>
        </body></html>"#;

        let options = ExtractionOptions {
            only_main_content: true,
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(
            result.content.markdown.contains("Article Title"),
            "article content should be present"
        );
        assert!(
            result.content.markdown.contains("Article content"),
            "article body should be present"
        );
        // only_main_content picks the article/main element directly, so hero and sidebar
        // should not be in the output
        assert!(
            !result.content.markdown.contains("Sidebar"),
            "sidebar should not be in only_main_content output"
        );
    }

    #[test]
    fn test_include_raw_html() {
        let html = r#"<html><body>
            <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;

        let options = ExtractionOptions {
            include_raw_html: true,
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(
            result.content.raw_html.is_some(),
            "raw_html should be populated"
        );
        let raw = result.content.raw_html.unwrap();
        assert!(
            raw.contains("<article>"),
            "raw_html should contain article tag"
        );
        assert!(raw.contains("<h1>Title</h1>"), "raw_html should contain h1");
    }

    #[test]
    fn test_invalid_selectors() {
        let html = r#"<html><body>
            <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;

        // Invalid selectors should be gracefully skipped
        let options = ExtractionOptions {
            include_selectors: vec!["[invalid[[[".into(), "article".into()],
            exclude_selectors: vec![">>>bad".into()],
            ..Default::default()
        };
        let result = extract_with_options(html, None, &options).unwrap();

        assert!(
            result.content.markdown.contains("Title"),
            "valid selectors should still work"
        );
        assert!(
            result.content.markdown.contains("Content here"),
            "extraction should proceed despite invalid selectors"
        );
    }

    #[test]
    fn test_backward_compat() {
        let html = r#"<html><body>
            <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;

        let result_old = extract(html, None).unwrap();
        let result_new = extract_with_options(html, None, &ExtractionOptions::default()).unwrap();

        assert_eq!(result_old.content.markdown, result_new.content.markdown);
        assert_eq!(result_old.content.plain_text, result_new.content.plain_text);
        assert_eq!(
            result_old.content.links.len(),
            result_new.content.links.len()
        );
    }

    #[test]
    fn test_empty_options() {
        let html = r#"<html><body>
            <article><h1>Title</h1><p>Content here.</p></article>
        </body></html>"#;

        let result_extract = extract(html, None).unwrap();
        let result_options =
            extract_with_options(html, None, &ExtractionOptions::default()).unwrap();

        assert_eq!(
            result_extract.content.markdown, result_options.content.markdown,
            "default ExtractionOptions should produce identical results to extract()"
        );
    }

    #[test]
    fn test_raw_html_not_in_json_when_none() {
        let result = extract("<html><body><p>Test</p></body></html>", None).unwrap();
        let json = serde_json::to_string(&result).unwrap();
        assert!(
            !json.contains("raw_html"),
            "raw_html should be absent from JSON when None"
        );
    }

    #[test]
    fn express_live_blog_no_stack_overflow() {
        // Real-world Express.co.uk live blog that previously caused stack overflow
        let html = include_str!("../testdata/express_test.html");
        let result = extract(
            html,
            Some(
                "https://www.express.co.uk/news/world/2189934/iran-live-donald-trump-uae-dubai-kuwait-attacks",
            ),
        );
        assert!(
            result.is_ok(),
            "Should not stack overflow on Express.co.uk live blog"
        );
        let result = result.unwrap();
        assert!(
            result.metadata.word_count > 100,
            "Should extract meaningful content, got {} words",
            result.metadata.word_count
        );
    }

    #[test]
    fn deeply_nested_html_no_stack_overflow() {
        // Simulate deeply nested HTML like Express.co.uk live blogs
        let depth = 500;
        let mut html = String::from("<html><body>");
        for _ in 0..depth {
            html.push_str("<div><span>");
        }
        html.push_str("<p>Deep content here</p>");
        for _ in 0..depth {
            html.push_str("</span></div>");
        }
        html.push_str("</body></html>");

        let result = extract(&html, None);
        assert!(
            result.is_ok(),
            "Should not stack overflow on deeply nested HTML"
        );
        let result = result.unwrap();
        assert!(
            result.content.markdown.contains("Deep content"),
            "Should extract content from deep nesting"
        );
    }
}
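Editor's note, not part of the commit: lib.rs is the whole public surface of the rebranded crate, so a consumer-side sketch may help reviewers sanity-check the new name. It assumes the package name noxa-core from this commit's Cargo.toml (crate path noxa_core) and uses only items re-exported above; the URL and HTML are hypothetical:

    use noxa_core::{extract, extract_with_options, ExtractError, ExtractionOptions};

    fn main() -> Result<(), ExtractError> {
        let html = r#"<html><body><article><h1>Hi</h1><p>Body text for the demo.</p></article></body></html>"#;

        // Default pipeline: scoring, sparse-content retries, data-island fallback.
        let result = extract(html, Some("https://example.com/post"))?;
        println!("{} words extracted", result.metadata.word_count);

        // Scoped extraction: keep only <article>, drop footers.
        let opts = ExtractionOptions {
            include_selectors: vec!["article".into()],
            exclude_selectors: vec!["footer".into()],
            ..Default::default()
        };
        let scoped = extract_with_options(html, None, &opts)?;
        println!("{}", scoped.content.markdown);
        Ok(())
    }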
1053
crates/noxa-core/src/llm/body.rs
Normal file
File diff suppressed because it is too large

1359
crates/noxa-core/src/llm/cleanup.rs
Normal file
File diff suppressed because it is too large

237
crates/noxa-core/src/llm/images.rs
Normal file
@@ -0,0 +1,237 @@

/// Image handling for LLM output: linked image conversion, logo detection,
/// standalone image stripping, and bare image reference removal.
use once_cell::sync::Lazy;
use regex::Regex;

use super::cleanup::is_asset_label;

// ---------------------------------------------------------------------------
// Linked image conversion: [![alt](img-url)](link-url) -> [alt](link-url)
// ---------------------------------------------------------------------------

/// Matches `[![alt](img-url)](link-url)` -- an image wrapped in a link.
static LINKED_IMAGE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\[!\[([^\]]*)\]\([^)]+\)\]\(([^)]+)\)").unwrap());

/// Matches empty markdown links `[](url)` left after image stripping.
pub(crate) static EMPTY_LINK_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\[\s*\]\([^)]+\)").unwrap());

/// Convert linked images to plain links, preserving the alt text and link target.
/// Adds a newline after each to prevent text mashing when multiple are adjacent.
pub(crate) fn convert_linked_images(input: &str) -> String {
    LINKED_IMAGE_RE
        .replace_all(input, |caps: &regex::Captures| {
            let alt = caps.get(1).map_or("", |m| m.as_str());
            let href = caps.get(2).map_or("", |m| m.as_str());
            format!("[{alt}]({href})\n")
        })
        .into_owned()
}

// ---------------------------------------------------------------------------
// Logo image collapsing
// ---------------------------------------------------------------------------

/// Regex matching a line that is *only* a markdown image (with optional whitespace).
static IMAGE_LINE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^!\[([^\]]*)\]\([^)]+\)\s*$").unwrap());

/// Collapse consecutive image-only lines into a comma-separated summary
/// of their alt texts (for logo bars, partner grids, etc.).
pub(crate) fn collapse_logo_images(input: &str) -> String {
    let lines: Vec<&str> = input.lines().collect();
    let mut out = String::with_capacity(input.len());
    let mut i = 0;

    while i < lines.len() {
        // Check if this starts a run of consecutive image-only lines
        if IMAGE_LINE_RE.is_match(lines[i].trim()) {
            let mut alts: Vec<String> = Vec::new();
            let start = i;
            while i < lines.len() {
                let trimmed = lines[i].trim();
                // Allow blank lines between images in the same run
                if trimmed.is_empty() {
                    i += 1;
                    continue;
                }
                if let Some(caps) = IMAGE_LINE_RE.captures(trimmed) {
                    let alt = caps.get(1).map_or("", |m| m.as_str()).trim().to_string();
                    if !alt.is_empty() {
                        alts.push(alt);
                    }
                    i += 1;
                } else {
                    break;
                }
            }

            let image_count = if alts.is_empty() {
                i - start
            } else {
                alts.len()
            };

            if image_count >= 2 && !alts.is_empty() {
                out.push_str(&alts.join(", "));
                out.push('\n');
            } else if image_count == 1 && !alts.is_empty() && alts[0].len() > 30 {
                out.push_str(&alts[0]);
                out.push('\n');
            }
            // else: single image with short/empty alt -- drop entirely
        } else {
            out.push_str(lines[i]);
            out.push('\n');
            i += 1;
        }
    }

    out
}

// ---------------------------------------------------------------------------
// Remaining inline image stripping
// ---------------------------------------------------------------------------

/// Matches `![alt](url)` anywhere in a line, including multiple on the same line.
static INLINE_IMAGE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]+\)").unwrap());

/// Strip inline images. For multi-image lines, separate short alts (logos)
/// from long alts (descriptive) so they don't get mixed together.
pub(crate) fn strip_remaining_images(input: &str) -> String {
    let mut out = String::with_capacity(input.len());

    for line in input.lines() {
        let image_matches: Vec<_> = INLINE_IMAGE_RE.find_iter(line).collect();

        if image_matches.len() >= 2 {
            // Separate short alts (brand names/logos) from long alts (descriptions)
            let mut short_alts: Vec<&str> = Vec::new();
            let mut long_alts: Vec<&str> = Vec::new();

            for caps in INLINE_IMAGE_RE.captures_iter(line) {
                let alt = caps.get(1).map_or("", |m| m.as_str()).trim();
                // Skip empty alts and quoted-empty alts like `""`
                if alt.is_empty() || alt == "\"\"" {
                    continue;
                }
                if alt.len() <= 30 {
                    short_alts.push(alt);
                } else {
                    long_alts.push(alt);
                }
            }

            // Filter out CMS asset labels from alt texts before output
            short_alts.retain(|alt| !is_asset_label(alt));
            long_alts.retain(|alt| !is_asset_label(alt));

            // Remove images, then strip empty link remnants [](url)
            let remaining = INLINE_IMAGE_RE.replace_all(line, "");
            let remaining = EMPTY_LINK_RE.replace_all(&remaining, "");
            let remaining = remaining.trim();

            if !short_alts.is_empty() {
                if !remaining.is_empty() {
                    out.push_str(remaining);
                    out.push('\n');
                }
                out.push_str(&short_alts.join(", "));
                out.push('\n');
            } else if !remaining.is_empty() {
                out.push_str(remaining);
                out.push('\n');
            }

            // Long alts on their own lines (descriptions, not logos)
            for alt in &long_alts {
                out.push_str(alt);
                out.push('\n');
            }
        } else {
            // 0 or 1 image -- keep long alt text, drop short/empty/CMS labels
            let replaced = INLINE_IMAGE_RE.replace_all(line, |caps: &regex::Captures| {
                let alt = caps.get(1).map_or("", |m| m.as_str()).trim();
                if alt.len() > 30 && !is_asset_label(alt) {
                    alt.to_string()
                } else {
                    String::new()
                }
            });
            out.push_str(&replaced);
            out.push('\n');
        }
    }

    out
}

// ---------------------------------------------------------------------------
// Bare image file reference stripping
// ---------------------------------------------------------------------------

const IMAGE_EXTENSIONS: &[&str] = &[
    ".webp", ".svg", ".png", ".jpg", ".jpeg", ".gif", ".avif", ".ico", ".bmp",
];

/// Strip lines that are just bare image filenames or image URLs.
/// Keeps lines where an image filename appears within a larger sentence.
pub(crate) fn strip_bare_image_refs(input: &str) -> String {
    let mut out = String::with_capacity(input.len());

    for line in input.lines() {
        let trimmed = line.trim();

        if !trimmed.is_empty() && is_bare_image_ref(trimmed) {
            continue;
        }

        out.push_str(line);
        out.push('\n');
    }

    out
}

/// A line is a bare image reference if it's a single token ending with an image extension.
/// Catches filenames ("hero.webp") and URLs ("https://cdn.example.com/logo.svg").
fn is_bare_image_ref(line: &str) -> bool {
    if line.starts_with('#')
        || line.starts_with("- ")
        || line.starts_with("* ")
        || line.starts_with("```")
        || line.starts_with("> ")
    {
        return false;
    }

    if line.contains(' ') {
        return false;
    }

    let lower = line.to_lowercase();
    IMAGE_EXTENSIONS.iter().any(|ext| lower.ends_with(ext))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn linked_image_conversion() {
        let input = "[![docs](https://cdn.example.com/docs.png)](https://docs.example.com)";
        let result = convert_linked_images(input);
        assert!(result.contains("[docs](https://docs.example.com)"));
        assert!(!result.contains("!["));
    }

    #[test]
    fn bare_image_ref_detected() {
        assert!(is_bare_image_ref("hero.webp"));
        assert!(is_bare_image_ref("https://cdn.example.com/logo.svg"));
        assert!(!is_bare_image_ref("The file output.png is saved to disk."));
        assert!(!is_bare_image_ref("# heading.png"));
    }
}
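Editor's note, not part of the commit: the four helpers in images.rs read like stages of a pipeline, but the actual call site lives in body.rs, whose diff is suppressed above. A sketch of the presumed composition order, inferred from the doc comments (linked images first, so alt text survives as link labels; bare refs last, since stripping can leave orphan filenames). Treat the ordering as an assumption until body.rs is visible:

    // Crate-internal sketch; the real sequencing belongs to llm::body.
    fn clean_images(md: &str) -> String {
        let s = convert_linked_images(md);  // [![alt](img)](url) -> [alt](url)
        let s = collapse_logo_images(&s);   // runs of image-only lines -> "A, B, C"
        let s = strip_remaining_images(&s); // inline images -> long alt text or nothing
        strip_bare_image_refs(&s)           // drop lines that are just "hero.webp"
    }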
184
crates/noxa-core/src/llm/links.rs
Normal file
@@ -0,0 +1,184 @@

/// Link extraction, deduplication, noise filtering, and label formatting
/// for the LLM output's deduplicated links section.
use std::collections::HashSet;

use once_cell::sync::Lazy;
use regex::Regex;

// ---------------------------------------------------------------------------
// Link extraction
// ---------------------------------------------------------------------------

/// Matches `[text](url)`. Images are already stripped, so no `!` prefix concern.
static LINK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\[([^\]]*)\]\(([^)]+)\)").unwrap());

/// Extract all links from markdown, replacing inline `[text](url)` with just `text`.
/// Returns the cleaned text and a deduplicated list of (label, href) pairs.
pub(crate) fn extract_and_strip_links(input: &str) -> (String, Vec<(String, String)>) {
    let mut links: Vec<(String, String)> = Vec::new();
    let mut seen_hrefs: HashSet<String> = HashSet::new();

    let replaced = LINK_RE.replace_all(input, |caps: &regex::Captures| {
        let text = caps.get(1).map_or("", |m| m.as_str()).trim().to_string();
        let href = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();

        let skip = href.starts_with('#')
            || href.starts_with("javascript:")
            || href.is_empty()
            || is_noise_link(&text, &href);

        if !skip && !text.is_empty() && seen_hrefs.insert(href.clone()) {
            links.push((text.clone(), href));
        }

        text
    });

    (replaced.into_owned(), links)
}

/// Links that are noise for LLM consumption: internal actions, timestamps,
/// user profiles, generic short text.
fn is_noise_link(text: &str, href: &str) -> bool {
    let t = text.to_lowercase();

    // Generic action links
    if matches!(
        t.as_str(),
        "hide"
            | "flag"
            | "reply"
            | "favorite"
            | "unflag"
            | "vouch"
            | "next"
            | "prev"
            | "previous"
            | "more"
    ) {
        return true;
    }

    // Timestamp text ("1 hour ago", "5 minutes ago", "yesterday")
    if t.ends_with(" ago") || t == "yesterday" || t == "just now" {
        return true;
    }

    // Single-char text that's not meaningful (but keep letters -- "X", "Go", etc.)
    if text.len() == 1 && !text.chars().next().unwrap_or(' ').is_alphanumeric() {
        return true;
    }

    // Internal user profile / action URLs (HN-style)
    if href.contains("/user?id=")
        || href.contains("/hide?id=")
        || href.contains("/from?site=")
        || href.contains("/flag?id=")
    {
        return true;
    }

    false
}

// ---------------------------------------------------------------------------
// Link label cleaning
// ---------------------------------------------------------------------------

static MD_MARKERS_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap());

/// Clean a link label: strip markdown, dedup repeated phrases, truncate.
pub(crate) fn clean_link_label(raw: &str) -> String {
    // Strip markdown markers
    let label = MD_MARKERS_RE.replace_all(raw, "").to_string();
    let label = label.split_whitespace().collect::<Vec<_>>().join(" ");

    // Dedup repeated phrases in label
    let label = dedup_label_phrase(&label);

    // Truncate to ~80 chars (UTF-8 safe)
    if label.len() > 80 {
        // Find last whitespace boundary at or before 80 bytes
        let mut end = None;
        for (i, _) in label.char_indices() {
            if i > 80 {
                break;
            }
            if i > 0 && label.as_bytes()[i - 1].is_ascii_whitespace() {
                end = Some(i);
            }
        }
        let end = end.unwrap_or_else(|| {
            // No whitespace found -- find char boundary near 80
            label
                .char_indices()
                .map(|(i, _)| i)
                .find(|&i| i >= 80)
                .unwrap_or(label.len())
        });
        format!("{}...", label[..end].trim_end())
    } else {
        label
    }
}

/// If a label contains the same phrase twice (e.g., "X Y Z X Y Z"), return just one copy.
fn dedup_label_phrase(label: &str) -> String {
    let len = label.len();
    if len < 8 {
        return label.to_string();
    }
    // Try split at each whitespace boundary
    for (i, _) in label.match_indices(' ') {
        let left = label[..i].trim();
        let right = label[i + 1..].trim();
        if left.len() >= 4 && left.eq_ignore_ascii_case(right) {
            return left.to_string();
        }
    }
    label.to_string()
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn link_label_truncated() {
        let long = "The quick brown fox jumps over the lazy dog and then runs across the field to find more interesting things to do on a sunny afternoon";
        let result = clean_link_label(long);
        assert!(result.len() <= 84, "got len {}: {result}", result.len());
        assert!(result.ends_with("..."), "got: {result}");
    }

    #[test]
    fn link_label_markdown_stripped() {
        assert_eq!(clean_link_label("## Hello **world**"), "Hello world");
    }

    #[test]
    fn link_label_duplicate_deduped() {
        assert_eq!(
            clean_link_label("Express Delivery Express Delivery"),
            "Express Delivery"
        );
    }

    #[test]
    fn link_label_short_unchanged() {
        assert_eq!(clean_link_label("Click here"), "Click here");
    }

    #[test]
    fn noise_link_detected() {
        assert!(is_noise_link("hide", "https://example.com"));
        assert!(is_noise_link("5 minutes ago", "https://example.com"));
        assert!(is_noise_link("user", "https://hn.com/user?id=foo"));
        assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
    }
}
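Editor's note, not part of the commit: a worked example of extract_and_strip_links on a mixed paragraph, following directly from the noise rules above (generic action words are skipped, real links are collected once). The function is crate-internal, so this is a test-style sketch with hypothetical URLs:

    let (text, links) = extract_and_strip_links(
        "See [Rust docs](https://rust-lang.org) or [hide](https://example.com/hide?id=1).",
    );
    // Inline links are replaced by their text in both cases...
    assert_eq!(text, "See Rust docs or hide.");
    // ...but "hide" is noise (action word plus /hide?id= URL), so only one link survives.
    assert_eq!(
        links,
        vec![("Rust docs".to_string(), "https://rust-lang.org".to_string())]
    );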
47
crates/noxa-core/src/llm/metadata.rs
Normal file
@@ -0,0 +1,47 @@

/// Metadata header building for LLM-optimized output.
///
/// Produces `> ` prefixed lines with URL, title, author, etc.
/// Omits empty/zero fields to minimize token waste.
use crate::types::ExtractionResult;

pub(crate) fn build_metadata_header(
    out: &mut String,
    result: &ExtractionResult,
    url: Option<&str>,
) {
    let meta = &result.metadata;

    // URL: prefer explicit arg, fall back to metadata
    let effective_url = url.or(meta.url.as_deref());
    if let Some(u) = effective_url {
        out.push_str(&format!("> URL: {u}\n"));
    }
    if let Some(t) = &meta.title
        && !t.is_empty()
    {
        out.push_str(&format!("> Title: {t}\n"));
    }
    if let Some(d) = &meta.description
        && !d.is_empty()
    {
        out.push_str(&format!("> Description: {d}\n"));
    }
    if let Some(a) = &meta.author
        && !a.is_empty()
    {
        out.push_str(&format!("> Author: {a}\n"));
    }
    if let Some(d) = &meta.published_date
        && !d.is_empty()
    {
        out.push_str(&format!("> Published: {d}\n"));
    }
    if let Some(l) = &meta.language
        && !l.is_empty()
    {
        out.push_str(&format!("> Language: {l}\n"));
    }
    if meta.word_count > 0 {
        out.push_str(&format!("> Word count: {}\n", meta.word_count));
    }
}
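Editor's note, not part of the commit: the header builder is crate-internal, but its output is observable through the public to_llm_text. A small end-to-end sketch of the "> " header it produces, assuming the noxa_core crate path as above and a hypothetical page:

    use noxa_core::{extract, to_llm_text};

    let html = r#"<html lang="en"><head><title>Hi</title>
        <meta name="description" content="Short demo"></head>
        <body><article><p>Some body text for the demo page.</p></article></body></html>"#;

    let result = extract(html, Some("https://example.com/")).unwrap();
    let text = to_llm_text(&result, Some("https://example.com/"));

    // URL comes first; None/empty fields are omitted entirely.
    assert!(text.starts_with("> URL: https://example.com/"));
    assert!(text.contains("> Title: Hi"));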
703
crates/noxa-core/src/llm/mod.rs
Normal file
@@ -0,0 +1,703 @@

/// LLM-optimized output format.
///
/// Takes an `ExtractionResult` and produces a compact text representation
/// that maximizes information density per token. Strips decorative images,
/// visual-only formatting (bold/italic), and inline link URLs -- moving links
/// to a deduplicated section at the end.
mod body;
mod cleanup;
mod images;
mod links;
mod metadata;

use crate::types::ExtractionResult;

/// Produce a token-optimized text representation of extracted content.
///
/// The output has three sections:
/// 1. Compact metadata header (`> ` prefixed lines)
/// 2. Cleaned body (no images, no bold/italic, links as plain text)
/// 3. Deduplicated links section at the end
pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
    let mut out = String::new();

    // -- 1. Metadata header --
    metadata::build_metadata_header(&mut out, result, url);

    // -- 2. Process body --
    let processed = body::process_body(&result.content.markdown);

    if !processed.text.is_empty() {
        if !out.is_empty() {
            out.push('\n');
        }
        out.push_str(&processed.text);
    }

    // -- 3. Links section --
    if !processed.links.is_empty() {
        out.push_str("\n\n## Links\n");
        for (text, href) in &processed.links {
            let label = links::clean_link_label(text);
            if !label.is_empty() {
                out.push_str(&format!("- {label}: {href}\n"));
            }
        }
    }

    // -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) --
    if !result.structured_data.is_empty() {
        out.push_str("\n\n## Structured Data\n\n```json\n");
        out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default());
        out.push_str("\n```");
    }

    out.trim().to_string()
}

// ---------------------------------------------------------------------------
// Integration tests that exercise the full pipeline through to_llm_text
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::*;

    fn make_result(markdown: &str) -> ExtractionResult {
        ExtractionResult {
            metadata: Metadata {
                title: Some("Test Page".into()),
                description: Some("A test page".into()),
                author: None,
                published_date: None,
                language: Some("en".into()),
                url: Some("https://example.com".into()),
                site_name: None,
                image: None,
                favicon: None,
                word_count: 42,
            },
            content: Content {
                markdown: markdown.into(),
                plain_text: String::new(),
                links: vec![],
                images: vec![],
                code_blocks: vec![],
                raw_html: None,
            },
            domain_data: None,
            structured_data: vec![],
        }
    }

    #[test]
    fn metadata_header_includes_populated_fields() {
        let result = make_result("# Hello");
        let out = to_llm_text(&result, Some("https://example.com/page"));

        assert!(out.contains("> URL: https://example.com/page"));
        assert!(out.contains("> Title: Test Page"));
        assert!(out.contains("> Description: A test page"));
        assert!(out.contains("> Language: en"));
        assert!(out.contains("> Word count: 42"));
        assert!(!out.contains("> Author:"));
    }

    #[test]
    fn strips_image_markdown() {
        let md = "Some text\n\n![Photo](https://cdn.example.com/photo.png)\n\nMore text";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(!out.contains("!["));
        assert!(!out.contains("cdn.example.com"));
        assert!(out.contains("Some text"));
        assert!(out.contains("More text"));
    }

    #[test]
    fn collapses_consecutive_logo_images_on_separate_lines() {
        let md = "# Partners\n\n\
            ![WRITER](https://cdn.example.com/writer.png)\n\
            ![MongoDB](https://cdn.example.com/mongo.png)\n\
            ![GROQ](https://cdn.example.com/groq.png)\n\
            ![LangChain](https://cdn.example.com/langchain.png)\n\n\
            Some other content";

        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("WRITER, MongoDB, GROQ, LangChain"));
        assert!(!out.contains("!["));
        assert!(!out.contains("cdn.example.com"));
    }

    #[test]
    fn collapses_consecutive_logo_images_on_same_line() {
        let md = "![WRITER](https://cdn.example.com/writer.png)![MongoDB](https://cdn.example.com/mongo.png)![GROQ](https://cdn.example.com/groq.png)";

        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("WRITER"));
        assert!(out.contains("MongoDB"));
        assert!(out.contains("GROQ"));
        assert!(!out.contains("!["));
        assert!(!out.contains("cdn.example.com"));
    }

    #[test]
    fn keeps_meaningful_alt_text() {
        let md = "![A detailed photograph showing the team collaborating on the project](https://cdn.example.com/team.jpg)";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(
            out.contains("A detailed photograph showing the team collaborating on the project")
        );
        assert!(!out.contains("!["));
    }

    #[test]
    fn strips_bold_and_italic() {
        let md = "This is **bold text** and *italic text* and __also bold__ and _also italic_.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("This is bold text and italic text and also bold and also italic."));
        assert!(!out.contains("**"));
        assert!(!out.contains("__"));
    }

    #[test]
    fn moves_links_to_end() {
        let md = "Check out [Rust](https://rust-lang.org) and [Go](https://go.dev) for details.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("Check out Rust and Go for details."));
        assert!(out.contains("## Links"));
        assert!(out.contains("- Rust: https://rust-lang.org"));
        assert!(out.contains("- Go: https://go.dev"));
    }

    #[test]
    fn skips_anchor_and_javascript_links() {
        let md = "Go to [top](#top) and [click](javascript:void(0)) and [real](https://real.example.com).";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("## Links"));
        assert!(out.contains("- real: https://real.example.com"));
        let links_section = out.split("## Links").nth(1).unwrap_or("");
        assert!(!links_section.contains("#top"));
        assert!(!links_section.contains("javascript:"));
    }

    #[test]
    fn deduplicates_heading_and_paragraph() {
        let md = "### Ground models\n\nGround models with fresh web context\n\nRetrieve live data.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("### Ground models with fresh web context"));
        assert!(out.contains("Retrieve live data."));
    }

    #[test]
    fn deduplicates_identical_heading_paragraph() {
        let md = "## Features\n\nFeatures\n\nHere are the features.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        let feature_count = out.matches("Features").count();
        assert_eq!(
            feature_count, 1,
            "Expected 'Features' exactly once, got: {out}"
        );
    }

    #[test]
    fn collapses_excessive_whitespace() {
        let md = "Line one\n\n\n\n\nLine two\n\n\n\nLine three";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(
            !out.contains("\n\n\n"),
            "Found 3+ consecutive newlines in: {:?}",
            out
        );
    }

    #[test]
    fn preserves_code_blocks() {
        let md = "Example:\n\n```rust\nfn main() {\n    println!(\"hello\");\n}\n```\n\nDone.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("```rust"));
        assert!(out.contains("fn main()"));
        assert!(out.contains("```"));
    }

    #[test]
    fn preserves_list_structure() {
        let md = "Features:\n\n- Fast\n- Safe\n- Concurrent";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("- Fast"));
        assert!(out.contains("- Safe"));
        assert!(out.contains("- Concurrent"));
    }

    #[test]
    fn deduplicates_links() {
        let md = "Visit [Example](https://example.org/page) or [Example again](https://example.org/page).";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        let link_count = out.matches("https://example.org/page").count();
        assert_eq!(link_count, 1, "Expected link once, got: {out}");
    }

    #[test]
    fn realistic_page() {
        let html = r#"
            <html lang="en">
            <head>
                <title>Tavily - AI Search API</title>
                <meta name="description" content="Real-time search for AI agents">
            </head>
            <body>
            <article>
                <h1>Connect your AI agents to the web</h1>
                <p>Real-time search, extraction, and web crawling through a <strong>single API</strong>.</p>
                <p>Trusted by <em>1M+ developers</em>.</p>
                <img src="https://cdn.example.com/writer.png" alt="WRITER">
                <img src="https://cdn.example.com/mongo.png" alt="MongoDB">
                <img src="https://cdn.example.com/groq.png" alt="GROQ">
                <img src="https://cdn.example.com/langchain.png" alt="LangChain">
                <h2>Ground models with fresh web context</h2>
                <p>Retrieve live web data and return it structured for models.</p>
                <p>Learn more at <a href="https://docs.tavily.com">the docs</a>.</p>
                <p><a href="https://app.tavily.com">Try it out</a></p>
            </article>
            </body>
            </html>"#;

        let result = crate::extract(html, Some("https://www.tavily.com/")).unwrap();
        let out = to_llm_text(&result, Some("https://www.tavily.com/"));

        assert!(out.contains("> URL: https://www.tavily.com/"));
        assert!(out.contains("> Title:"));

        assert!(!out.contains("!["), "Image markdown not stripped: {out}");
        assert!(
            !out.contains("cdn.example.com"),
            "CDN URL not stripped: {out}"
        );

        assert!(
            out.contains("WRITER") && out.contains("MongoDB"),
            "Logo alt texts missing: {out}"
        );

        assert!(!out.contains("**"), "Bold not stripped: {out}");

        assert!(out.contains("# Connect your AI agents to the web"));
        assert!(out.contains("## Ground models with fresh web context"));
        assert!(out.contains("Retrieve live web data"));

        assert!(out.contains("## Links"));
        assert!(out.contains("https://docs.tavily.com"));
        assert!(out.contains("https://app.tavily.com"));
    }

    #[test]
    fn empty_metadata_fields_excluded() {
        let result = ExtractionResult {
            metadata: Metadata {
                title: None,
                description: None,
                author: None,
                published_date: None,
                language: None,
                url: None,
                site_name: None,
                image: None,
                favicon: None,
                word_count: 0,
            },
            content: Content {
                markdown: "Just content".into(),
                plain_text: String::new(),
                links: vec![],
                images: vec![],
                code_blocks: vec![],
                raw_html: None,
            },
            domain_data: None,
            structured_data: vec![],
        };

        let out = to_llm_text(&result, None);
        assert!(!out.contains("> "));
        assert!(out.contains("Just content"));
    }

    #[test]
    fn strips_empty_alt_images() {
        let md = "Before\n\n![](https://cdn.example.com/banner.png)\n\nAfter";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(!out.contains("cdn.example.com"));
        assert!(!out.contains("!["));
        assert!(out.contains("Before"));
        assert!(out.contains("After"));
    }

    #[test]
    fn preserves_headings_structure() {
        let md = "# H1\n\n## H2\n\n### H3\n\nContent under H3.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("# H1"));
        assert!(out.contains("## H2"));
        assert!(out.contains("### H3"));
    }

    #[test]
    fn inline_image_in_paragraph_stripped() {
        let md = "Check this ![icon](https://x.com/icon.png) out and read more.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(!out.contains("!["));
        assert!(!out.contains("x.com/icon.png"));
        assert!(out.contains("Check this"));
        assert!(out.contains("out and read more."));
    }

    #[test]
    fn does_not_strip_emphasis_inside_code_blocks() {
        let md = "Normal **bold** text\n\n```python\ndef foo(**kwargs):\n    return _internal_var_\n```\n\nMore text";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("Normal bold text"));
        assert!(out.contains("**kwargs"));
        assert!(out.contains("_internal_var_"));
    }

    #[test]
    fn converts_linked_images_to_links() {
        let md = "[![Read the docs](https://cdn.example.com/docs.png)](https://docs.example.com)";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(!out.contains("!["), "Image not converted: {out}");
        assert!(
            out.contains("https://docs.example.com"),
            "Link URL missing from footer: {out}"
        );
        assert!(out.contains("Read the docs"), "Link text missing: {out}");
    }

    #[test]
    fn linked_images_split_on_separate_lines() {
        let md = "[![Article A](https://cdn.example.com/a.png)](https://a.example.com)[![Article B](https://cdn.example.com/b.png)](https://b.example.com)";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("Article A"), "Article A missing: {out}");
        assert!(out.contains("Article B"), "Article B missing: {out}");
        assert!(
            !out.contains("Article AArticle B"),
            "Text mashed together: {out}"
        );
    }

    #[test]
    fn separates_short_and_long_alts_on_same_line() {
        let md = "![AWS](https://cdn.example.com/aws.png)![IBM](https://cdn.example.com/ibm.png)![Ground models with fresh web context from the live internet](https://cdn.example.com/hero.png)";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("AWS, IBM"), "Logo collapse failed: {out}");
        assert!(
            !out.contains("IBM, Ground"),
            "Long alt mixed with logos: {out}"
        );
    }

    #[test]
    fn dedup_text_line_matching_heading() {
        let md = "![Handle thousands of web queries in seconds](https://cdn.example.com/hero.png)\n\n### Handle thousands of web queries in seconds\n\nA production-grade stack.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        let count = out
            .matches("Handle thousands of web queries in seconds")
            .count();
        assert_eq!(count, 1, "Expected once, got {count}: {out}");
        assert!(out.contains("### Handle thousands"));
        assert!(out.contains("A production-grade stack."));
    }

    #[test]
    fn no_leading_dot_from_linked_images() {
        let md = "[![.](https://cdn.example.com/a.png)](https://a.com)[![News from the front page](https://cdn.example.com/b.png)](https://b.com)";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(
            !out.contains(". News"),
            "Leading dot from empty remaining: {out}"
        );
    }

    #[test]
    fn merges_stat_lines_with_descriptions() {
        let md = "100M+\n\nmonthly requests handled\n\n99.99% uptime\n\nSLA powering mission-critical systems\n\n180 ms\n\np50 on Tavily /search making us fastest on the market\n\n1M+\n\ndevelopers using Tavily\n\nBillions\n\nof pages crawled and extracted without downtime\n\nDrop-in integration\n\nwith leading LLM providers (OpenAI, Anthropic, Groq)";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(
            out.contains("100M+ monthly requests handled"),
            "Stat not merged: {out}"
        );
        assert!(
            out.contains("99.99% uptime SLA powering mission-critical systems"),
            "Stat not merged: {out}"
        );
        assert!(
            out.contains("180 ms p50 on Tavily /search making us fastest on the market"),
            "Stat not merged: {out}"
        );
        assert!(
            out.contains("1M+ developers using Tavily"),
            "Stat not merged: {out}"
        );
        assert!(
            out.contains("Billions of pages crawled and extracted without downtime"),
            "Stat not merged: {out}"
        );
        assert!(
            out.contains(
                "Drop-in integration with leading LLM providers (OpenAI, Anthropic, Groq)"
            ),
            "Stat not merged: {out}"
        );
    }

    #[test]
    fn merge_stat_preserves_headings_and_lists() {
        let md = "## Features\n\n100M+\n\nmonthly requests\n\n- Fast\n- Safe";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("## Features"), "Heading lost: {out}");
        assert!(
            out.contains("100M+ monthly requests"),
            "Stat not merged: {out}"
        );
        assert!(out.contains("- Fast"), "List item lost: {out}");
        assert!(out.contains("- Safe"), "List item lost: {out}");
    }

    #[test]
    fn merge_stat_does_not_merge_long_lines() {
        let md = "This is a longer line of text!\n\nAnd this follows after a blank";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(
            !out.contains("text! And"),
            "Long line incorrectly merged: {out}"
        );
    }

    #[test]
    fn strips_css_class_text_lines() {
        let md = "# Typography\n\n\
            text-4xl font-bold tracking-tight text-gray-900\n\n\
            Build beautiful websites with Tailwind CSS.\n\n\
            text-5xl text-6xl text-8xl text-gray-950 text-white tracking-tighter text-balance";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(
            !out.contains("text-4xl font-bold"),
            "CSS class line was not stripped: {out}"
        );
        assert!(
            !out.contains("text-5xl text-6xl"),
            "CSS class line was not stripped: {out}"
        );
        assert!(
            out.contains("Build beautiful websites"),
            "Normal prose was stripped: {out}"
        );
        assert!(out.contains("Typography"), "Heading was stripped: {out}");
    }

    #[test]
    fn keeps_prose_with_css_like_word() {
        let md = "The text-based approach works well for this use case.\n\n\
            We use a grid-like layout for the dashboard.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(
            out.contains("text-based approach"),
            "Normal prose incorrectly stripped: {out}"
        );
        assert!(
            out.contains("grid-like layout"),
            "Normal prose incorrectly stripped: {out}"
        );
    }

    #[test]
    fn preserves_css_classes_inside_code_blocks() {
        let md = "Example usage:\n\n\
            ```html\n\
            <div class=\"text-4xl font-bold tracking-tight text-gray-900\">\n\
            ```\n\n\
            That applies bold typography.";
        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(
            out.contains("text-4xl font-bold tracking-tight"),
            "CSS classes inside code block were stripped: {out}"
        );
    }

    #[test]
    fn dedup_removes_exact_duplicate_paragraphs() {
        let md = "Supabase is an amazing platform that makes building apps incredibly fast.\n\nSupabase is an amazing platform that makes building apps incredibly fast.\n\nSupabase is an amazing platform that makes building apps incredibly fast.\n\nEach project gets its own dedicated Postgres database.";

        let result = make_result(md);
        let out = to_llm_text(&result, None);

        let count = out.matches("Supabase is an amazing platform").count();
        assert_eq!(
            count, 1,
            "Duplicate paragraph should appear only once, got {count}: {out}"
        );
        assert!(
            out.contains("Each project gets its own dedicated Postgres database"),
            "Unique paragraph missing: {out}"
        );
    }

    #[test]
    fn dedup_preserves_unique_paragraphs() {
        let md = "First unique paragraph with enough content to be checked.\n\nSecond unique paragraph that is completely different.\n\nThird unique paragraph covering another topic entirely.";

        let result = make_result(md);
        let out = to_llm_text(&result, None);

        assert!(out.contains("First unique paragraph"), "Lost first: {out}");
        assert!(
            out.contains("Second unique paragraph"),
            "Lost second: {out}"
        );
        assert!(out.contains("Third unique paragraph"), "Lost third: {out}");
    }

    #[test]
    fn dedup_keeps_short_repeated_text() {
|
||||
let md = "Learn more\n\nA detailed explanation of the first feature.\n\nLearn more\n\nA detailed explanation of the second feature.";
|
||||
|
||||
let result = make_result(md);
|
||||
let out = to_llm_text(&result, None);
|
||||
|
||||
let count = out.matches("Learn more").count();
|
||||
assert!(
|
||||
count >= 2,
|
||||
"Short repeated text should be kept, got {count}: {out}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dedup_catches_near_duplicates_via_prefix() {
|
||||
let md = "The platform provides real-time sync collaboration tools for modern developers building web applications with React and Next.js.\n\nThe platform provides real-time sync collaboration tools for modern developers building mobile apps with Flutter.\n\nA completely different paragraph about database design.";
|
||||
|
||||
let result = make_result(md);
|
||||
let out = to_llm_text(&result, None);
|
||||
|
||||
let count = out.matches("The platform provides real-time sync").count();
|
||||
assert_eq!(
|
||||
count, 1,
|
||||
"Near-duplicate should be removed, got {count}: {out}"
|
||||
);
|
||||
assert!(
|
||||
out.contains("A completely different paragraph"),
|
||||
"Unique paragraph missing: {out}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dedup_carousel_realistic() {
|
||||
let md = "## What our users say\n\n\"Supabase has transformed how we build products. The developer experience is unmatched.\" - Sarah Chen, CTO at TechCorp\n\n\"Moving from Firebase to Supabase was the best decision we made this year.\" - James Liu, Lead Engineer\n\n\"The real-time features and Postgres foundation give us confidence at scale.\" - Maria Garcia, VP Engineering\n\n\"Supabase has transformed how we build products. The developer experience is unmatched.\" - Sarah Chen, CTO at TechCorp\n\n\"Moving from Firebase to Supabase was the best decision we made this year.\" - James Liu, Lead Engineer\n\n\"The real-time features and Postgres foundation give us confidence at scale.\" - Maria Garcia, VP Engineering\n\n\"Supabase has transformed how we build products. The developer experience is unmatched.\" - Sarah Chen, CTO at TechCorp\n\n\"Moving from Firebase to Supabase was the best decision we made this year.\" - James Liu, Lead Engineer\n\n\"The real-time features and Postgres foundation give us confidence at scale.\" - Maria Garcia, VP Engineering\n\n## Get started\n\nSign up for free today.";
|
||||
|
||||
let result = make_result(md);
|
||||
let out = to_llm_text(&result, None);
|
||||
|
||||
let sarah_count = out.matches("Sarah Chen").count();
|
||||
let james_count = out.matches("James Liu").count();
|
||||
let maria_count = out.matches("Maria Garcia").count();
|
||||
|
||||
assert_eq!(sarah_count, 1, "Sarah duplicated {sarah_count}x: {out}");
|
||||
assert_eq!(james_count, 1, "James duplicated {james_count}x: {out}");
|
||||
assert_eq!(maria_count, 1, "Maria duplicated {maria_count}x: {out}");
|
||||
|
||||
assert!(out.contains("## What our users say"), "Heading lost: {out}");
|
||||
assert!(out.contains("## Get started"), "Heading lost: {out}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn strips_bare_image_references() {
|
||||
let md = "Some content\n\nhero.webp\n\nhttps://example.com/logo.svg\n\n\n\n\n\nThe file output.png is saved to disk.\n\n\n\nMore content";
|
||||
let result = make_result(md);
|
||||
let out = to_llm_text(&result, None);
|
||||
|
||||
assert!(
|
||||
!out.contains("hero.webp"),
|
||||
"Bare filename not stripped: {out}"
|
||||
);
|
||||
assert!(
|
||||
!out.contains("https://example.com/logo.svg"),
|
||||
"Bare image URL not stripped: {out}"
|
||||
);
|
||||
assert!(
|
||||
!out.contains("image.png"),
|
||||
"Empty-alt image not stripped: {out}"
|
||||
);
|
||||
assert!(
|
||||
!out.contains("logo.svg"),
|
||||
"Generic-alt image not stripped: {out}"
|
||||
);
|
||||
assert!(
|
||||
out.contains("output.png is saved to disk"),
|
||||
"Sentence with .png filename was incorrectly stripped: {out}"
|
||||
);
|
||||
assert!(
|
||||
out.contains("Detailed architecture diagram showing the data flow"),
|
||||
"Meaningful alt text was stripped: {out}"
|
||||
);
|
||||
assert!(
|
||||
!out.contains("arch.png"),
|
||||
"Image URL not stripped from meaningful alt: {out}"
|
||||
);
|
||||
assert!(out.contains("Some content"), "Content before lost: {out}");
|
||||
assert!(out.contains("More content"), "Content after lost: {out}");
|
||||
}
|
||||
}
|
||||
1607 crates/noxa-core/src/markdown.rs Normal file
File diff suppressed because it is too large
156 crates/noxa-core/src/metadata.rs Normal file
@@ -0,0 +1,156 @@
/// Metadata extraction from HTML <head>.
|
||||
/// Prioritizes Open Graph and Twitter Card tags, falls back to standard meta tags.
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
use crate::types::Metadata;
|
||||
|
||||
/// Selector parsing is not free, and these selectors run on every page we
/// extract, so parse each one once and cache it with once_cell.
|
||||
macro_rules! selector {
|
||||
($s:expr) => {{
|
||||
use once_cell::sync::Lazy;
|
||||
static SEL: Lazy<Selector> = Lazy::new(|| Selector::parse($s).unwrap());
|
||||
&*SEL
|
||||
}};
|
||||
}
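// Illustrative sketch of how the macro is used (it mirrors the call sites
// below); each distinct literal gets its own static, so every call site
// parses its selector exactly once and reuses it thereafter:
//
//     for el in doc.select(selector!("meta[name]")) { /* ... */ }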
|
||||
|
||||
pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
|
||||
let title = og_meta(doc, "og:title")
|
||||
.or_else(|| meta_name(doc, "twitter:title"))
|
||||
.or_else(|| title_tag(doc));
|
||||
|
||||
let description = og_meta(doc, "og:description")
|
||||
.or_else(|| meta_name(doc, "twitter:description"))
|
||||
.or_else(|| meta_name(doc, "description"));
|
||||
|
||||
let author = meta_name(doc, "author").or_else(|| og_meta(doc, "article:author"));
|
||||
|
||||
let published_date = og_meta(doc, "article:published_time")
|
||||
.or_else(|| meta_name(doc, "date"))
|
||||
.or_else(|| meta_name(doc, "publication_date"));
|
||||
|
||||
// Search the whole document for <html lang="..."> — root_element() IS the <html>
|
||||
// node in scraper, so selecting "html" from it finds nothing (no nested <html>).
|
||||
let language = doc
|
||||
.select(selector!("html"))
|
||||
.next()
|
||||
.and_then(|el| el.value().attr("lang"))
|
||||
.map(|s| s.to_string());
|
||||
|
||||
let site_name = og_meta(doc, "og:site_name");
|
||||
let image = og_meta(doc, "og:image").or_else(|| meta_name(doc, "twitter:image"));
|
||||
|
||||
let favicon = extract_favicon(doc);
|
||||
|
||||
Metadata {
|
||||
title,
|
||||
description,
|
||||
author,
|
||||
published_date,
|
||||
language,
|
||||
url: url.map(String::from),
|
||||
site_name,
|
||||
image,
|
||||
favicon,
|
||||
word_count: 0, // filled later by the extractor
|
||||
}
|
||||
}
|
||||
|
||||
/// <meta property="og:..." content="...">
|
||||
fn og_meta(doc: &Html, property: &str) -> Option<String> {
|
||||
// OG tags use property= not name=
|
||||
doc.select(selector!("meta[property]"))
|
||||
.find(|el| el.value().attr("property") == Some(property))
|
||||
.and_then(|el| el.value().attr("content"))
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
/// <meta name="..." content="...">
|
||||
fn meta_name(doc: &Html, name: &str) -> Option<String> {
|
||||
doc.select(selector!("meta[name]"))
|
||||
.find(|el| {
|
||||
el.value()
|
||||
.attr("name")
|
||||
.is_some_and(|n| n.eq_ignore_ascii_case(name))
|
||||
})
|
||||
.and_then(|el| el.value().attr("content"))
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
fn title_tag(doc: &Html) -> Option<String> {
|
||||
doc.select(selector!("title"))
|
||||
.next()
|
||||
.map(|el| el.text().collect::<String>().trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
fn extract_favicon(doc: &Html) -> Option<String> {
|
||||
// <link rel="icon" href="..."> or <link rel="shortcut icon" href="...">
|
||||
doc.select(selector!("link[rel]"))
|
||||
.find(|el| el.value().attr("rel").is_some_and(|r| r.contains("icon")))
|
||||
.and_then(|el| el.value().attr("href"))
|
||||
.map(|s| s.to_string())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn parse(html: &str) -> Html {
|
||||
Html::parse_document(html)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extracts_basic_metadata() {
|
||||
let html = r#"
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>Test Page</title>
|
||||
<meta name="description" content="A test page">
|
||||
<meta name="author" content="Alice">
|
||||
<meta property="og:title" content="OG Title">
|
||||
<meta property="og:image" content="https://img.example.com/og.png">
|
||||
<meta property="og:site_name" content="Example">
|
||||
<meta property="article:published_time" content="2025-01-15">
|
||||
<link rel="icon" href="/favicon.ico">
|
||||
</head>
|
||||
<body></body>
|
||||
</html>"#;
|
||||
|
||||
let doc = parse(html);
|
||||
let meta = extract(&doc, Some("https://example.com"));
|
||||
|
||||
// OG title wins over <title>
|
||||
assert_eq!(meta.title.as_deref(), Some("OG Title"));
|
||||
assert_eq!(meta.description.as_deref(), Some("A test page"));
|
||||
assert_eq!(meta.author.as_deref(), Some("Alice"));
|
||||
assert_eq!(meta.published_date.as_deref(), Some("2025-01-15"));
|
||||
assert_eq!(meta.language.as_deref(), Some("en"));
|
||||
assert_eq!(meta.site_name.as_deref(), Some("Example"));
|
||||
assert_eq!(
|
||||
meta.image.as_deref(),
|
||||
Some("https://img.example.com/og.png")
|
||||
);
|
||||
assert_eq!(meta.favicon.as_deref(), Some("/favicon.ico"));
|
||||
assert_eq!(meta.url.as_deref(), Some("https://example.com"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn falls_back_to_title_tag() {
|
||||
let html = r#"<html><head><title>Fallback Title</title></head><body></body></html>"#;
|
||||
let doc = parse(html);
|
||||
let meta = extract(&doc, None);
|
||||
assert_eq!(meta.title.as_deref(), Some("Fallback Title"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn handles_missing_metadata_gracefully() {
|
||||
let html = r#"<html><head></head><body></body></html>"#;
|
||||
let doc = parse(html);
|
||||
let meta = extract(&doc, None);
|
||||
assert!(meta.title.is_none());
|
||||
assert!(meta.description.is_none());
|
||||
assert!(meta.language.is_none());
|
||||
}
|
||||
}
|
||||
851 crates/noxa-core/src/noise.rs Normal file
@@ -0,0 +1,851 @@
/// Shared noise detection for web content extraction.
|
||||
///
|
||||
/// Identifies elements that don't contribute to main content:
|
||||
/// navigation, sidebars, footers, ads, cookie banners, modals, etc.
|
||||
/// Used by both the extractor (candidate filtering) and the markdown
|
||||
/// converter (output-time stripping).
|
||||
use scraper::ElementRef;
|
||||
|
||||
const NOISE_TAGS: &[&str] = &[
|
||||
"script", "style", "noscript", "iframe", "svg", "nav", "aside", "footer", "header", "video",
|
||||
"audio",
|
||||
"canvas",
|
||||
// NOTE: <form> removed from this list — ASP.NET and similar frameworks wrap the
|
||||
// entire page body in a single <form> tag that contains all real content.
|
||||
// Forms are now handled with a heuristic in is_noise() that distinguishes
|
||||
// small input forms (noise) from page-wrapping forms (not noise).
|
||||
// NOTE: <picture> removed — it's a responsive image container, not noise.
|
||||
// <picture> wraps <source> and <img> for responsive images.
|
||||
];
|
||||
|
||||
const NOISE_ROLES: &[&str] = &["navigation", "banner", "complementary", "contentinfo"];
|
||||
|
||||
const NOISE_CLASS_PATTERNS: &[&str] = &[
|
||||
"sidebar",
|
||||
"side",
|
||||
"nav",
|
||||
"navbar",
|
||||
"navigation",
|
||||
"menu",
|
||||
"footer",
|
||||
"header",
|
||||
"top",
|
||||
"bottom",
|
||||
"advertisement",
|
||||
"advert",
|
||||
"social",
|
||||
"social-media",
|
||||
"social-links",
|
||||
"share",
|
||||
"comment",
|
||||
"cookie",
|
||||
"popup",
|
||||
"modal",
|
||||
"overlay",
|
||||
"banner",
|
||||
"breadcrumb",
|
||||
"breadcrumbs",
|
||||
"widget",
|
||||
"lang-selector",
|
||||
"language",
|
||||
"newsletter",
|
||||
"subscribe",
|
||||
"related-posts",
|
||||
"recommended",
|
||||
"pagination",
|
||||
"pager",
|
||||
"signup",
|
||||
"login-form",
|
||||
"search-form",
|
||||
"notification",
|
||||
"alert",
|
||||
"toast",
|
||||
"skip-link",
|
||||
"sr-only",
|
||||
"visually-hidden",
|
||||
];
|
||||
|
||||
const NOISE_ID_PATTERNS: &[&str] = &[
|
||||
"sidebar",
|
||||
"nav",
|
||||
"menu",
|
||||
"footer",
|
||||
"header",
|
||||
"cookie",
|
||||
"popup",
|
||||
"modal",
|
||||
"breadcrumbs",
|
||||
"widget",
|
||||
"language-selector",
|
||||
"ad",
|
||||
"social",
|
||||
"share",
|
||||
"newsletter",
|
||||
"subscribe",
|
||||
"comments",
|
||||
"related",
|
||||
"recommended",
|
||||
];
|
||||
|
||||
/// Exact class tokens that indicate noise.
|
||||
/// Unlike substring matching, these only match when the EXACT class token
|
||||
/// is present — ".modal" matches `class="modal"` but NOT `class="free-modal-container"`.
|
||||
const NOISE_CLASSES: &[&str] = &[
|
||||
"header",
|
||||
"top",
|
||||
"navbar",
|
||||
"footer",
|
||||
"bottom",
|
||||
"sidebar",
|
||||
"modal",
|
||||
"popup",
|
||||
"overlay",
|
||||
"ad",
|
||||
"ads",
|
||||
"advert",
|
||||
"lang-selector",
|
||||
"language",
|
||||
"social",
|
||||
"social-media",
|
||||
"social-links",
|
||||
"menu",
|
||||
"navigation",
|
||||
"breadcrumbs",
|
||||
"breadcrumb",
|
||||
"share",
|
||||
"widget",
|
||||
"cookie",
|
||||
"newsletter",
|
||||
"subscribe",
|
||||
"skip-link",
|
||||
"sr-only",
|
||||
"visually-hidden",
|
||||
"notification",
|
||||
"alert",
|
||||
"toast",
|
||||
"pagination",
|
||||
"pager",
|
||||
"signup",
|
||||
"login-form",
|
||||
"search-form",
|
||||
"related-posts",
|
||||
"recommended",
|
||||
];
|
||||
|
||||
/// Exact IDs that indicate noise.
|
||||
const NOISE_IDS: &[&str] = &[
|
||||
"header",
|
||||
"footer",
|
||||
"nav",
|
||||
"sidebar",
|
||||
"menu",
|
||||
"modal",
|
||||
"popup",
|
||||
"cookie",
|
||||
"breadcrumbs",
|
||||
"widget",
|
||||
"ad",
|
||||
"social",
|
||||
"share",
|
||||
"newsletter",
|
||||
"subscribe",
|
||||
"comments",
|
||||
"related",
|
||||
"recommended",
|
||||
];
|
||||
|
||||
/// ID prefixes for cookie consent platforms that should be stripped entirely.
|
||||
/// These generate massive DOM overlays that dominate content extraction.
|
||||
const COOKIE_CONSENT_ID_PREFIXES: &[&str] = &[
|
||||
"onetrust", // OneTrust (Foot Locker, many EU sites)
|
||||
"optanon", // OneTrust legacy
|
||||
"ot-sdk", // OneTrust SDK
|
||||
"cookiebot", // Cookiebot
|
||||
"CybotCookiebot", // Cookiebot
|
||||
"cc-", // Cookie Consent (Osano)
|
||||
"cookie-law", // Cookie Law Info
|
||||
"gdpr", // Generic GDPR banners
|
||||
"consent-", // Generic consent banners
|
||||
"cmp-", // Consent Management Platforms
|
||||
"sp_message", // SourcePoint
|
||||
"qc-cmp", // Quantcast CMP
|
||||
"trustarc", // TrustArc
|
||||
"evidon", // Evidon/Crownpeak
|
||||
];
|
||||
|
||||
/// Check if an element is noise by tag, role, class, or id.
|
||||
///
|
||||
/// Uses EXACT class token matching instead
|
||||
/// of substring matching. This prevents false positives like:
|
||||
/// - "free-modal-container" ≠ noise (Vice.com's content wrapper)
|
||||
/// - "a-bw_aui_cxc_alert_measurement" ≠ noise (Amazon's body class)
|
||||
/// - "desktop" ≠ noise (not matching "top")
|
||||
pub fn is_noise(el: ElementRef<'_>) -> bool {
|
||||
let tag = el.value().name();
|
||||
|
||||
// Never treat <body> or <html> as noise.
|
||||
if tag == "body" || tag == "html" {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Tag-based noise (script, style, nav, etc.)
|
||||
if NOISE_TAGS.contains(&tag) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// <form> heuristic: ASP.NET wraps the entire page body in a single <form>.
|
||||
// These page-wrapping forms contain hundreds of words of real content.
|
||||
// Small forms (login, search, newsletter) are noise.
|
||||
if tag == "form" {
|
||||
let text_len = el.text().collect::<String>().len();
|
||||
// A form with substantial text (>500 chars) is likely a page wrapper, not noise.
|
||||
// Small forms (login/search/subscribe) rarely exceed a few hundred chars.
|
||||
if text_len < 500 {
|
||||
return true;
|
||||
}
|
||||
// Also check noise classes/IDs — a big form with class="login-form" is still noise
|
||||
if let Some(class) = el.value().attr("class") {
|
||||
let cl = class.to_lowercase();
|
||||
if cl.contains("login")
|
||||
|| cl.contains("search")
|
||||
|| cl.contains("subscribe")
|
||||
|| cl.contains("signup")
|
||||
|| cl.contains("newsletter")
|
||||
|| cl.contains("contact")
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// ARIA role-based noise
|
||||
if let Some(role) = el.value().attr("role")
|
||||
&& NOISE_ROLES.contains(&role)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Exact class token matching — split class attribute into tokens,
|
||||
// check each against the noise list. "free-modal-container" splits into
|
||||
// ["free-modal-container"] which does NOT match "modal".
|
||||
if let Some(class) = el.value().attr("class") {
|
||||
let mut class_matched = false;
|
||||
for token in class.split_whitespace() {
|
||||
let lower = token.to_lowercase();
|
||||
if NOISE_CLASSES.contains(&lower.as_str()) {
|
||||
class_matched = true;
|
||||
break;
|
||||
}
|
||||
// Structural elements use compound names (FooterLinks, Header-nav, etc.)
|
||||
// These are always noise regardless of compound form.
|
||||
if lower.starts_with("footer")
|
||||
|| lower.starts_with("header-")
|
||||
|| lower.starts_with("nav-")
|
||||
{
|
||||
class_matched = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !class_matched {
|
||||
class_matched = is_ad_class(class);
|
||||
}
|
||||
|
||||
if class_matched {
|
||||
// Safety valve: malformed HTML can leave noise containers unclosed,
|
||||
// causing them to absorb the entire page content. A real header/nav/
|
||||
// footer rarely exceeds a few thousand characters of text. If a
|
||||
// noise-class element has massive text content, it's almost certainly
|
||||
// a broken wrapper — treat it as content, not noise.
|
||||
let text_len = el.text().collect::<String>().len();
|
||||
if text_len > 5000 {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Exact ID matching
|
||||
if let Some(id) = el.value().attr("id") {
|
||||
let id_lower = id.to_lowercase();
|
||||
if NOISE_IDS.contains(&id_lower.as_str()) && !is_structural_id(&id_lower) {
|
||||
// Same safety valve for ID-matched noise elements
|
||||
let text_len = el.text().collect::<String>().len();
|
||||
if text_len > 5000 {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// Cookie consent platform IDs (prefix match — these generate huge overlays)
|
||||
for prefix in COOKIE_CONSENT_ID_PREFIXES {
|
||||
if id_lower.starts_with(prefix) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Class-based cookie consent detection (prefix match for platform classes)
|
||||
if let Some(class) = el.value().attr("class") {
|
||||
let class_lower = class.to_lowercase();
|
||||
for prefix in COOKIE_CONSENT_ID_PREFIXES {
|
||||
if class_lower.contains(prefix) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Check if an element is inside a noise container.
|
||||
pub fn is_noise_descendant(el: ElementRef<'_>) -> bool {
|
||||
let mut node = el.parent();
|
||||
while let Some(parent) = node {
|
||||
if let Some(parent_el) = ElementRef::wrap(parent)
|
||||
&& is_noise(parent_el)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
node = parent.parent();
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn has_noise_class(class: &str) -> bool {
|
||||
// Match noise patterns against individual class tokens, with safeguards
|
||||
// against Tailwind CSS utility classes that contain noise keywords as
|
||||
// substrings (e.g., "pt-header-h" is padding, not a header class).
|
||||
class.split_whitespace().any(is_noise_token) || is_ad_class(class)
|
||||
}
|
||||
|
||||
/// Check if a single class token is a noise indicator.
|
||||
/// Requires the noise pattern to be the *semantic core* of the token,
|
||||
/// not embedded inside a Tailwind utility prefix or CSS variable.
|
||||
fn is_noise_token(token: &str) -> bool {
|
||||
let t = token.to_lowercase();
|
||||
|
||||
// Skip Tailwind arbitrary values and CSS variable references entirely
|
||||
if t.contains("[--") || t.contains("var(") {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Strip common Tailwind responsive/state prefixes (e.g., "lg:", "hover:", "md:")
|
||||
let core = t.rsplit_once(':').map_or(t.as_str(), |(_, c)| c);
|
||||
|
||||
// The noise pattern should match the semantic name, not be buried inside
|
||||
// a utility like "pt-header-h" (padding) or "mt-nav-offset" (margin).
|
||||
// Tailwind utilities start with known prefixes; if the token starts with one,
|
||||
// it's a utility class, not a semantic class.
|
||||
const UTILITY_PREFIXES: &[&str] = &[
|
||||
"p-",
|
||||
"pt-",
|
||||
"pb-",
|
||||
"pl-",
|
||||
"pr-",
|
||||
"px-",
|
||||
"py-",
|
||||
"m-",
|
||||
"mt-",
|
||||
"mb-",
|
||||
"ml-",
|
||||
"mr-",
|
||||
"mx-",
|
||||
"my-",
|
||||
"w-",
|
||||
"h-",
|
||||
"min-",
|
||||
"max-",
|
||||
"top-",
|
||||
"left-",
|
||||
"right-",
|
||||
"bottom-",
|
||||
"z-",
|
||||
"gap-",
|
||||
"text-",
|
||||
"bg-",
|
||||
"border-",
|
||||
"rounded-",
|
||||
"flex-",
|
||||
"grid-",
|
||||
"col-",
|
||||
"row-",
|
||||
"opacity-",
|
||||
"transition-",
|
||||
"duration-",
|
||||
"delay-",
|
||||
"ease-",
|
||||
"translate-",
|
||||
"scale-",
|
||||
"rotate-",
|
||||
"origin-",
|
||||
"overflow-",
|
||||
"inset-",
|
||||
"space-",
|
||||
"divide-",
|
||||
"ring-",
|
||||
"shadow-",
|
||||
"outline-",
|
||||
"font-",
|
||||
"leading-",
|
||||
"tracking-",
|
||||
"decoration-",
|
||||
];
|
||||
if UTILITY_PREFIXES.iter().any(|pfx| core.starts_with(pfx)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// "banner" and "overlay" only match as prefix — they false-positive as
|
||||
// suffixes in BEM/Webflow component names (e.g., "package_banner" is a
|
||||
// product card, not an ad banner; "planet-overlay" is a visual effect).
|
||||
const PREFIX_ONLY: &[&str] = &["banner", "overlay"];
|
||||
|
||||
// Short patterns (≤6 chars like "nav", "top", "header", "widget") require
|
||||
// word-boundary matching to avoid false positives on compound CSS class
|
||||
// names (e.g., "desktop" ≠ "top", "celwidget" ≠ "widget",
|
||||
// "_categoriesheader_active" ≠ semantic "header").
|
||||
// A word boundary is `-`, `_`, or start/end of string.
|
||||
// Longer patterns (7+ chars like "sidebar", "breadcrumb") are specific
|
||||
// enough that substring matching is safe.
|
||||
NOISE_CLASS_PATTERNS.iter().any(|p| {
|
||||
if PREFIX_ONLY.contains(p) {
|
||||
core == *p || core.starts_with(&format!("{p}-")) || core.starts_with(&format!("{p}_"))
|
||||
} else if p.len() <= 6 {
|
||||
is_word_boundary_match(core, p)
|
||||
} else {
|
||||
core.contains(p)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Check if `pattern` appears in `text` at a word boundary.
|
||||
/// Word boundaries are `-`, `_`, or start/end of string.
|
||||
/// e.g., "nav" matches "main-nav", "nav-bar", "nav" but NOT "canvas", "navbar".
|
||||
fn is_word_boundary_match(text: &str, pattern: &str) -> bool {
|
||||
let mut start = 0;
|
||||
while let Some(pos) = text[start..].find(pattern) {
|
||||
let abs = start + pos;
|
||||
let before_ok = abs == 0 || matches!(text.as_bytes()[abs - 1], b'-' | b'_');
|
||||
let end = abs + pattern.len();
|
||||
let after_ok = end == text.len() || matches!(text.as_bytes()[end], b'-' | b'_');
|
||||
if before_ok && after_ok {
|
||||
return true;
|
||||
}
|
||||
start = abs + 1;
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// IDs like "modal-portal", "nav-root", "header-container" are structural
|
||||
/// wrappers (React portals, app roots), not actual noise elements.
|
||||
fn is_structural_id(id: &str) -> bool {
    const STRUCTURAL_SUFFIXES: &[&str] =
        &["portal", "root", "container", "wrapper", "mount", "app"];
    // Match as a suffix, as the name says: a substring check would
    // false-positive on ids that merely contain "app" (e.g., "approved-ad").
    STRUCTURAL_SUFFIXES.iter().any(|s| id.ends_with(s))
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// CSS class text detection (visible content that looks like class names)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// CSS utility prefixes that indicate a word is a class name, not prose.
|
||||
/// Covers Tailwind, Bootstrap-ish, and common utility-first patterns.
|
||||
const CSS_CLASS_PREFIXES: &[&str] = &[
|
||||
"text-",
|
||||
"bg-",
|
||||
"px-",
|
||||
"py-",
|
||||
"pt-",
|
||||
"pb-",
|
||||
"pl-",
|
||||
"pr-",
|
||||
"p-",
|
||||
"mx-",
|
||||
"my-",
|
||||
"mt-",
|
||||
"mb-",
|
||||
"ml-",
|
||||
"mr-",
|
||||
"m-",
|
||||
"w-",
|
||||
"h-",
|
||||
"min-",
|
||||
"max-",
|
||||
"flex-",
|
||||
"grid-",
|
||||
"col-",
|
||||
"row-",
|
||||
"gap-",
|
||||
"space-",
|
||||
"rounded-",
|
||||
"shadow-",
|
||||
"border-",
|
||||
"ring-",
|
||||
"outline-",
|
||||
"font-",
|
||||
"tracking-",
|
||||
"leading-",
|
||||
"decoration-",
|
||||
"opacity-",
|
||||
"transition-",
|
||||
"duration-",
|
||||
"delay-",
|
||||
"ease-",
|
||||
"translate-",
|
||||
"scale-",
|
||||
"rotate-",
|
||||
"origin-",
|
||||
"overflow-",
|
||||
"inset-",
|
||||
"divide-",
|
||||
"z-",
|
||||
"top-",
|
||||
"left-",
|
||||
"right-",
|
||||
"bottom-",
|
||||
"sr-",
|
||||
"not-",
|
||||
"group-",
|
||||
"peer-",
|
||||
"placeholder-",
|
||||
"focus-",
|
||||
"hover-",
|
||||
"active-",
|
||||
"disabled-",
|
||||
"dark-",
|
||||
"sm-",
|
||||
"md-",
|
||||
"lg-",
|
||||
"xl-",
|
||||
"2xl-",
|
||||
];
|
||||
|
||||
/// Exact single-word CSS utility class names (no prefix needed).
|
||||
const CSS_CLASS_EXACT: &[&str] = &[
|
||||
"flex",
|
||||
"grid",
|
||||
"block",
|
||||
"inline",
|
||||
"hidden",
|
||||
"static",
|
||||
"fixed",
|
||||
"absolute",
|
||||
"relative",
|
||||
"sticky",
|
||||
"isolate",
|
||||
"container",
|
||||
"prose",
|
||||
"antialiased",
|
||||
"truncate",
|
||||
"uppercase",
|
||||
"lowercase",
|
||||
"capitalize",
|
||||
"italic",
|
||||
"underline",
|
||||
"overline",
|
||||
"invisible",
|
||||
"visible",
|
||||
"sr-only",
|
||||
"not-sr-only",
|
||||
];
|
||||
|
||||
/// Tailwind responsive/state prefixes that can appear before a utility class
|
||||
/// (e.g., "sm:text-lg", "hover:bg-blue-500", "dark:text-white").
|
||||
fn strip_tw_variant_prefix(word: &str) -> &str {
|
||||
// Handle chained variants: "dark:sm:text-lg" → "text-lg"
|
||||
word.rsplit_once(':').map_or(word, |(_, core)| core)
|
||||
}
|
||||
|
||||
/// Check if a single whitespace-delimited word looks like a CSS utility class.
|
||||
fn is_css_class_word(word: &str) -> bool {
|
||||
let core = strip_tw_variant_prefix(word);
|
||||
let lower = core.to_lowercase();
|
||||
|
||||
// Arbitrary value syntax: "[--foo:bar]", "w-[200px]"
|
||||
if lower.contains('[') && lower.contains(']') {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Exact matches
|
||||
if CSS_CLASS_EXACT.iter().any(|&e| lower == e) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Prefix matches
|
||||
if CSS_CLASS_PREFIXES.iter().any(|pfx| lower.starts_with(pfx)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Negative utilities: "-mt-4", "-translate-x-1/2"
|
||||
if lower.starts_with('-') && lower.len() > 1 {
|
||||
let rest = &lower[1..];
|
||||
if CSS_CLASS_PREFIXES.iter().any(|pfx| rest.starts_with(pfx)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Public wrapper for single-word CSS class detection (used by LLM pipeline
|
||||
/// for stripping trailing CSS classes from mixed-content lines).
|
||||
pub fn is_css_class_word_pub(word: &str) -> bool {
|
||||
is_css_class_word(word)
|
||||
}
|
||||
|
||||
/// Check if a text block is predominantly CSS class names.
|
||||
///
|
||||
/// Returns true if >50% of the whitespace-delimited words look like CSS
|
||||
/// utility classes. Requires at least 3 words to avoid false positives on
|
||||
/// short fragments.
|
||||
pub fn is_css_class_text(text: &str) -> bool {
|
||||
let words: Vec<&str> = text.split_whitespace().collect();
|
||||
if words.len() < 3 {
|
||||
return false;
|
||||
}
|
||||
|
||||
let css_count = words.iter().filter(|w| is_css_class_word(w)).count();
|
||||
// >50% of words are CSS classes
|
||||
css_count * 2 > words.len()
|
||||
}
|
||||
|
||||
/// Detect "ad" as a standalone class token, not a substring of "read" or "loading".
|
||||
fn is_ad_class(class: &str) -> bool {
|
||||
class.split_whitespace().any(|token| {
|
||||
token == "ad"
|
||||
|| token.starts_with("ad-")
|
||||
|| token.starts_with("ad_")
|
||||
|| token.ends_with("-ad")
|
||||
|| token.ends_with("_ad")
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn ad_class_standalone_detected() {
|
||||
assert!(is_ad_class("ad"));
|
||||
assert!(is_ad_class("some ad-banner"));
|
||||
assert!(is_ad_class("top-ad widget"));
|
||||
assert!(is_ad_class("ad_unit"));
|
||||
assert!(is_ad_class("sidebar_ad"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ad_class_no_false_positive() {
|
||||
assert!(!is_ad_class("reading-time"));
|
||||
assert!(!is_ad_class("loading-indicator"));
|
||||
assert!(!is_ad_class("download-button"));
|
||||
assert!(!is_ad_class("breadcrumb"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn noise_class_patterns() {
|
||||
assert!(has_noise_class("main-sidebar"));
|
||||
assert!(has_noise_class("cookie-banner")); // "cookie" substring match
|
||||
assert!(has_noise_class("modal-overlay")); // "modal" substring match
|
||||
assert!(has_noise_class("banner-top")); // "banner" as prefix
|
||||
assert!(has_noise_class("overlay-popup")); // "overlay" as prefix
|
||||
assert!(!has_noise_class("article-content"));
|
||||
assert!(!has_noise_class("post-body"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn short_patterns_require_word_boundary() {
|
||||
// "nav" (3 chars) — must be a standalone word segment
|
||||
assert!(has_noise_class("main-nav"));
|
||||
assert!(has_noise_class("nav-bar"));
|
||||
assert!(has_noise_class("nav"));
|
||||
assert!(!has_noise_class("canvas")); // "nav" is substring, not word
|
||||
assert!(has_noise_class("icp-nav-flag")); // "nav" IS between word boundaries
|
||||
// "top" (3 chars) — note: "top-bar" starts with Tailwind prefix "top-" → filtered out
|
||||
assert!(has_noise_class("page-top")); // "top" at word boundary
|
||||
assert!(!has_noise_class("desktop")); // "top" is substring inside word
|
||||
assert!(!has_noise_class("stop-motion")); // "top" inside word
|
||||
// "side" (4 chars) — "left-side" starts with Tailwind prefix "left-" → filtered
|
||||
assert!(has_noise_class("page-side"));
|
||||
assert!(!has_noise_class("inside-content"));
|
||||
assert!(!has_noise_class("consider"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn amazon_classes_not_noise() {
|
||||
// Amazon CSS module class names that were false-positiving
|
||||
assert!(!has_noise_class("desktop")); // contains "top"
|
||||
assert!(!has_noise_class("celwidget")); // contains "widget"
|
||||
// a-alert-container: "alert" IS a proper word segment → still matches (correct for UI alerts)
|
||||
assert!(has_noise_class("a-alert-container"));
|
||||
assert!(!has_noise_class(
|
||||
"_haul-cx-images-carousel_style_desktop-card__fid8k"
|
||||
));
|
||||
assert!(!has_noise_class(
|
||||
"_haul-cx-infinite-scroll-body_categoriesheader_active__2j-4u"
|
||||
));
|
||||
// But actual noise classes still work
|
||||
assert!(has_noise_class("site-header"));
|
||||
assert!(has_noise_class("main-nav"));
|
||||
assert!(has_noise_class("footer-links"));
|
||||
assert!(has_noise_class("cookie-consent"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn word_boundary_match_works() {
|
||||
assert!(is_word_boundary_match("main-nav", "nav"));
|
||||
assert!(is_word_boundary_match("nav-bar", "nav"));
|
||||
assert!(is_word_boundary_match("nav", "nav"));
|
||||
assert!(is_word_boundary_match("top-nav_bar", "nav"));
|
||||
assert!(!is_word_boundary_match("canvas", "nav"));
|
||||
assert!(!is_word_boundary_match("navbar", "nav"));
|
||||
assert!(!is_word_boundary_match("navigate", "nav"));
|
||||
assert!(is_word_boundary_match("top-bar", "top"));
|
||||
assert!(!is_word_boundary_match("desktop", "top"));
|
||||
assert!(!is_word_boundary_match("stopper", "top"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bem_component_names_not_noise() {
|
||||
// BEM/Webflow component names where noise keyword is a suffix
|
||||
assert!(!has_noise_class("package_banner"));
|
||||
assert!(!has_noise_class("mars-cta_planet-overlay"));
|
||||
assert!(!has_noise_class("hero_banner_wrap"));
|
||||
// But actual noise classes still work
|
||||
assert!(has_noise_class("banner-dismiss"));
|
||||
assert!(has_noise_class("overlay-backdrop"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn structural_ids_not_noise() {
|
||||
assert!(is_structural_id("modal-portal"));
|
||||
assert!(is_structural_id("nav-root"));
|
||||
assert!(is_structural_id("header-container"));
|
||||
assert!(is_structural_id("sidebar-wrapper"));
|
||||
assert!(is_structural_id("menu-mount"));
|
||||
assert!(is_structural_id("app"));
|
||||
// Actual noise IDs should NOT be structural
|
||||
assert!(!is_structural_id("main-sidebar"));
|
||||
assert!(!is_structural_id("cookie-consent"));
|
||||
assert!(!is_structural_id("popup-overlay"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tailwind_animation_utilities_not_noise() {
|
||||
// Tailwind transition/animation utilities with noise keywords as values
|
||||
assert!(!has_noise_class("ease-curve-sidebar"));
|
||||
assert!(!has_noise_class("duration-sidebar"));
|
||||
assert!(!has_noise_class("delay-modal-open"));
|
||||
// But actual sidebar/modal classes still work
|
||||
assert!(has_noise_class("sidebar-panel"));
|
||||
assert!(has_noise_class("modal-dialog"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tailwind_css_vars_not_noise() {
|
||||
// Tailwind arbitrary values and CSS variables should NOT trigger noise
|
||||
assert!(!has_noise_class("[--content-top-offset:var(--header-h)]"));
|
||||
assert!(!has_noise_class(
|
||||
"pt-[var(--content-top-offset)] [--content-top-offset:var(--header-h)]"
|
||||
));
|
||||
assert!(!has_noise_class("[--nav-width:200px]"));
|
||||
// But actual noise classes still work
|
||||
assert!(has_noise_class("[--offset:10px] header-bar"));
|
||||
assert!(has_noise_class("sidebar [--x:1]"));
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// CSS class text detection (decorative text that looks like class names)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn css_class_text_detected() {
|
||||
// Pure Tailwind utility class blocks — the real-world problem
|
||||
assert!(is_css_class_text(
|
||||
"text-4xl font-bold tracking-tight text-gray-900"
|
||||
));
|
||||
assert!(is_css_class_text(
|
||||
"text-4xl text-5xl text-6xl text-8xl text-gray-950 text-white tracking-tighter text-balance"
|
||||
));
|
||||
assert!(is_css_class_text(
|
||||
"flex grid rounded-lg shadow-md bg-white px-4 py-2"
|
||||
));
|
||||
assert!(is_css_class_text(
|
||||
"sm:text-lg dark:bg-gray-800 hover:bg-blue-500"
|
||||
));
|
||||
// Negative utilities
|
||||
assert!(is_css_class_text("-mt-4 -translate-x-1/2 flex"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn css_class_text_normal_prose_kept() {
|
||||
// Normal English text — must NOT be detected as CSS
|
||||
assert!(!is_css_class_text(
|
||||
"the text-based approach works well for this use case"
|
||||
));
|
||||
assert!(!is_css_class_text(
|
||||
"Build beautiful websites with modern tools"
|
||||
));
|
||||
assert!(!is_css_class_text(
|
||||
"Tailwind CSS is a utility-first CSS framework"
|
||||
));
|
||||
// Too short to be confident
|
||||
assert!(!is_css_class_text("flex grid"));
|
||||
assert!(!is_css_class_text("text-lg"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn css_class_text_mixed_content() {
|
||||
// Majority CSS → detected
|
||||
assert!(is_css_class_text(
|
||||
"text-4xl font-bold tracking-tight text-gray-900 hero"
|
||||
));
|
||||
// Majority prose → not detected
|
||||
assert!(!is_css_class_text(
|
||||
"The quick brown fox jumps over the lazy text-lg dog"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod form_tests {
|
||||
use super::*;
|
||||
use scraper::Html;
|
||||
|
||||
#[test]
|
||||
fn aspnet_page_wrapping_form_is_not_noise() {
|
||||
let html = r#"<html><body><form method="post" action="./page.aspx" id="form1"><div class="wrapper"><div class="content"><h1>Support</h1><h3>Question one?</h3><p>Long answer text that should definitely be captured by the extraction engine. This is real content with multiple sentences to ensure it passes any text length thresholds in the scoring algorithm. We need at least five hundred characters of actual text content here to exceed the threshold. Adding more sentences about various topics including data formats, historical prices, stock market analysis, technical indicators, and trading strategies. This paragraph discusses how intraday data can be used for backtesting quantitative models and developing automated trading systems.</p><h3>Question two?</h3><p>Another substantial answer paragraph with detailed information about the product features and capabilities.</p></div></div></form></body></html>"#;
|
||||
let doc = Html::parse_document(html);
|
||||
let form = doc
|
||||
.select(&scraper::Selector::parse("form").unwrap())
|
||||
.next()
|
||||
.unwrap();
|
||||
let text = form.text().collect::<String>();
|
||||
let text_len = text.len();
|
||||
assert!(
|
||||
text_len >= 500,
|
||||
"Form text should be >= 500 chars, got {text_len}"
|
||||
);
|
||||
assert!(
|
||||
!is_noise(form),
|
||||
"ASP.NET page-wrapping form should NOT be noise"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn small_login_form_is_noise() {
|
||||
let html = r#"
|
||||
<html><body>
|
||||
<form action="/login">
|
||||
<input type="text" name="user" />
|
||||
<input type="password" name="pass" />
|
||||
<button>Login</button>
|
||||
</form>
|
||||
</body></html>
|
||||
"#;
|
||||
let doc = Html::parse_document(html);
|
||||
let form = doc
|
||||
.select(&scraper::Selector::parse("form").unwrap())
|
||||
.next()
|
||||
.unwrap();
|
||||
assert!(is_noise(form), "Small login form SHOULD be noise");
|
||||
}
|
||||
}
|
||||
377 crates/noxa-core/src/structured_data.rs Normal file
@@ -0,0 +1,377 @@
/// Extract structured data from HTML.
|
||||
///
|
||||
/// Handles three sources:
|
||||
/// 1. JSON-LD (`<script type="application/ld+json">`) — e-commerce, news, recipes
|
||||
/// 2. `__NEXT_DATA__` (`<script id="__NEXT_DATA__" type="application/json">`) — Next.js pages
|
||||
/// 3. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
|
||||
use serde_json::Value;
|
||||
|
||||
/// Extract all JSON-LD blocks from raw HTML.
|
||||
///
|
||||
/// Returns parsed JSON values, skipping any blocks that fail to parse.
|
||||
/// Most e-commerce sites include Schema.org Product markup with prices,
|
||||
/// sizes, availability, and images.
|
||||
pub fn extract_json_ld(html: &str) -> Vec<Value> {
|
||||
let mut results = Vec::new();
|
||||
let needle = "application/ld+json";
|
||||
|
||||
// Walk through the HTML finding <script type="application/ld+json"> blocks.
|
||||
// Using simple string scanning instead of a full HTML parser — these blocks
|
||||
// are self-contained and reliably structured.
|
||||
let mut search_from = 0;
|
||||
while let Some(tag_start) = html[search_from..].find("<script") {
|
||||
let abs_start = search_from + tag_start;
|
||||
let tag_region = &html[abs_start..];
|
||||
|
||||
// Find the end of the opening tag
|
||||
let Some(tag_end_offset) = tag_region.find('>') else {
|
||||
search_from = abs_start + 7;
|
||||
continue;
|
||||
};
|
||||
|
||||
let opening_tag = &tag_region[..tag_end_offset];
|
||||
|
||||
// Check if this is a JSON-LD script
|
||||
if !opening_tag.to_lowercase().contains(needle) {
|
||||
search_from = abs_start + tag_end_offset + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find the closing </script>
|
||||
let content_start = abs_start + tag_end_offset + 1;
|
||||
let remaining = &html[content_start..];
|
||||
let Some(close_offset) = remaining.to_lowercase().find("</script>") else {
|
||||
search_from = content_start;
|
||||
continue;
|
||||
};
|
||||
|
||||
let json_str = remaining[..close_offset].trim();
|
||||
search_from = content_start + close_offset + 9;
|
||||
|
||||
if json_str.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Parse — some sites have arrays at top level
|
||||
match serde_json::from_str::<Value>(json_str) {
|
||||
Ok(Value::Array(arr)) => results.extend(arr),
|
||||
Ok(val) => results.push(val),
|
||||
Err(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Extract `__NEXT_DATA__` from Next.js pages.
|
||||
///
|
||||
/// Next.js embeds server-rendered page data in:
|
||||
/// `<script id="__NEXT_DATA__" type="application/json">{...}</script>`
|
||||
///
|
||||
/// Returns the `pageProps` object (the actual page data), skipping Next.js
|
||||
/// internals like `buildId`, `isFallback`, etc.
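///
/// A hedged doc sketch (`ignore`d; a minimal shape of the embedded blob):
///
/// ```ignore
/// let html = r#"<script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"title":"Hi"}},"buildId":"x"}</script>"#;
/// let values = extract_next_data(html);
/// assert_eq!(values[0]["title"], "Hi");
/// ```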
|
||||
pub fn extract_next_data(html: &str) -> Vec<Value> {
|
||||
let Some(id_pos) = html.find("__NEXT_DATA__") else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
// Find the enclosing <script> tag
|
||||
let Some(tag_start) = html[..id_pos].rfind("<script") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let tag_region = &html[tag_start..];
|
||||
|
||||
let Some(tag_end) = tag_region.find('>') else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let content_start = tag_start + tag_end + 1;
|
||||
let remaining = &html[content_start..];
|
||||
let Some(close) = remaining.find("</script>") else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let json_str = remaining[..close].trim();
|
||||
if json_str.len() < 20 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let Ok(data) = serde_json::from_str::<Value>(json_str) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
// Extract pageProps — the actual page data
|
||||
if let Some(page_props) = data.get("props").and_then(|p| p.get("pageProps"))
|
||||
&& page_props.is_object()
|
||||
&& page_props.as_object().is_some_and(|m| !m.is_empty())
|
||||
{
|
||||
return vec![page_props.clone()];
|
||||
}
|
||||
|
||||
// Fallback: return the whole thing if pageProps is missing/empty
|
||||
if data.is_object() {
|
||||
vec![data]
|
||||
} else {
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract data from SvelteKit's `kit.start()` pattern.
|
||||
///
|
||||
/// SvelteKit embeds page data inside:
|
||||
/// `kit.start(app, element, { data: [null, null, {"type":"data","data":{...}}] })`
|
||||
///
|
||||
/// Returns parsed JSON objects from the data array (skipping nulls).
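///
/// A hedged doc sketch (`ignore`d; note the `data:` array must be reasonably
/// large before it is considered, per the length guard below):
///
/// ```ignore
/// let html = r#"<script>kit.start(app, el, { data: [null, {type:"data",data:{title:"Hello from a SvelteKit island"}}] });</script>"#;
/// let values = extract_sveltekit(html);
/// assert_eq!(values[0]["title"], "Hello from a SvelteKit island");
/// ```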
|
||||
pub fn extract_sveltekit(html: &str) -> Vec<Value> {
|
||||
let Some(kit_pos) = html.find("kit.start(") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let region = &html[kit_pos..];
|
||||
|
||||
let Some(data_offset) = region.find("data: [") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let bracket_start = kit_pos + data_offset + "data: ".len();
|
||||
let bracket_region = &html[bracket_start..];
|
||||
|
||||
let Some(balanced) = extract_balanced(bracket_region, b'[', b']') else {
|
||||
return Vec::new();
|
||||
};
|
||||
if balanced.len() < 50 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
// SvelteKit uses JS object literals (unquoted keys). Convert to valid JSON.
|
||||
let json_str = js_literal_to_json(&balanced);
|
||||
let Ok(arr) = serde_json::from_str::<Vec<Value>>(&json_str) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let mut results = Vec::new();
|
||||
for item in arr {
|
||||
if item.is_null() {
|
||||
continue;
|
||||
}
|
||||
// SvelteKit wraps as {"type":"data","data":{...}} — unwrap if present
|
||||
if let Some(inner) = item.get("data")
|
||||
&& (inner.is_object() || inner.is_array())
|
||||
{
|
||||
results.push(inner.clone());
|
||||
continue;
|
||||
}
|
||||
if item.is_object() || item.is_array() {
|
||||
results.push(item);
|
||||
}
|
||||
}
|
||||
results
|
||||
}
|
||||
|
||||
/// Convert a JS object literal to valid JSON by quoting unquoted keys.
|
||||
///
|
||||
/// Handles: `{foo:"bar", baz:123}` → `{"foo":"bar", "baz":123}`
|
||||
/// Preserves already-quoted keys and string values.
|
||||
fn js_literal_to_json(input: &str) -> String {
    let bytes = input.as_bytes();
    let mut out = String::with_capacity(input.len() + input.len() / 10);
    let mut i = 0;
    let len = bytes.len();

    while i < len {
        let b = bytes[i];

        // Copy string literals through verbatim as a single slice. Both slice
        // boundaries sit on ASCII quotes, so any multi-byte UTF-8 content
        // inside the string is preserved intact (pushing raw bytes as `char`
        // would reinterpret them as Latin-1 and mangle the text).
        if b == b'"' {
            let start = i;
            i += 1;
            while i < len {
                let c = bytes[i];
                i += 1;
                if c == b'\\' && i < len {
                    i += 1;
                } else if c == b'"' {
                    break;
                }
            }
            out.push_str(&input[start..i]);
            continue;
        }

        // After { or , — look for unquoted key followed by :
        if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
            out.push(b as char);
            i += 1;
            // Skip whitespace (ASCII, so pushing byte-as-char is safe here)
            while i < len && bytes[i].is_ascii_whitespace() {
                out.push(bytes[i] as char);
                i += 1;
            }
            // Check if next is an unquoted identifier (key)
            if i < len && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_') {
                let key_start = i;
                while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                    i += 1;
                }
                let key = &input[key_start..i];
                // Skip whitespace after key
                while i < len && bytes[i].is_ascii_whitespace() {
                    i += 1;
                }
                // If followed by :, it's an unquoted key — quote it
                if i < len && bytes[i] == b':' {
                    out.push('"');
                    out.push_str(key);
                    out.push('"');
                } else {
                    // Not a key — might be a bare value like true/false/null
                    out.push_str(key);
                }
            }
            continue;
        }

        // Any other input: copy the whole UTF-8 scalar, not a single byte, so
        // non-ASCII text outside string literals also survives unmangled.
        let ch = input[i..].chars().next().expect("i is on a char boundary");
        out.push(ch);
        i += ch.len_utf8();
    }

    out
}
|
||||
|
||||
/// Extract content between balanced brackets, handling string escaping.
|
||||
fn extract_balanced(text: &str, open: u8, close: u8) -> Option<String> {
|
||||
if text.as_bytes().first()? != &open {
|
||||
return None;
|
||||
}
|
||||
let mut depth: i32 = 0;
|
||||
let mut in_string = false;
|
||||
let mut escape_next = false;
|
||||
|
||||
for (i, &b) in text.as_bytes().iter().enumerate() {
|
||||
if escape_next {
|
||||
escape_next = false;
|
||||
continue;
|
||||
}
|
||||
if b == b'\\' && in_string {
|
||||
escape_next = true;
|
||||
continue;
|
||||
}
|
||||
if b == b'"' {
|
||||
in_string = !in_string;
|
||||
continue;
|
||||
}
|
||||
if in_string {
|
||||
continue;
|
||||
}
|
||||
if b == open {
|
||||
depth += 1;
|
||||
} else if b == close {
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
return Some(text[..=i].to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn extracts_single_json_ld() {
|
||||
let html = r#"
|
||||
<html><head>
|
||||
<script type="application/ld+json">{"@type":"Product","name":"Test"}</script>
|
||||
</head><body></body></html>
|
||||
"#;
|
||||
let results = extract_json_ld(html);
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0]["@type"], "Product");
|
||||
assert_eq!(results[0]["name"], "Test");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extracts_multiple_json_ld_blocks() {
|
||||
let html = r#"
|
||||
<script type="application/ld+json">{"@type":"WebSite","url":"https://example.com"}</script>
|
||||
<script type="application/ld+json">{"@type":"Product","name":"Shoe","offers":{"price":99.99}}</script>
|
||||
"#;
|
||||
let results = extract_json_ld(html);
|
||||
assert_eq!(results.len(), 2);
|
||||
assert_eq!(results[0]["@type"], "WebSite");
|
||||
assert_eq!(results[1]["@type"], "Product");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn handles_array_json_ld() {
|
||||
let html = r#"
|
||||
<script type="application/ld+json">[{"@type":"BreadcrumbList"},{"@type":"Product"}]</script>
|
||||
"#;
|
||||
let results = extract_json_ld(html);
|
||||
assert_eq!(results.len(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn skips_invalid_json() {
|
||||
let html = r#"
|
||||
<script type="application/ld+json">{invalid json here}</script>
|
||||
<script type="application/ld+json">{"@type":"Product","name":"Valid"}</script>
|
||||
"#;
|
||||
let results = extract_json_ld(html);
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0]["name"], "Valid");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ignores_regular_script_tags() {
|
||||
let html = r#"
|
||||
<script>console.log("not json-ld")</script>
|
||||
<script type="text/javascript">var x = 1;</script>
|
||||
<script type="application/ld+json">{"@type":"Product"}</script>
|
||||
"#;
|
||||
let results = extract_json_ld(html);
|
||||
assert_eq!(results.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn handles_no_json_ld() {
|
||||
let html = "<html><body><p>No structured data here</p></body></html>";
|
||||
let results = extract_json_ld(html);
|
||||
assert!(results.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn case_insensitive_type() {
|
||||
let html = r#"
|
||||
<script type="Application/LD+JSON">{"@type":"Product"}</script>
|
||||
"#;
|
||||
let results = extract_json_ld(html);
|
||||
assert_eq!(results.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn handles_whitespace_in_json() {
|
||||
let html = r#"
|
||||
<script type="application/ld+json">
|
||||
{
|
||||
"@type": "Product",
|
||||
"name": "Test"
|
||||
}
|
||||
</script>
|
||||
"#;
|
||||
let results = extract_json_ld(html);
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0]["name"], "Test");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_script_tag_skipped() {
|
||||
let html = r#"
|
||||
<script type="application/ld+json"> </script>
|
||||
<script type="application/ld+json">{"@type":"Product"}</script>
|
||||
"#;
|
||||
let results = extract_json_ld(html);
|
||||
assert_eq!(results.len(), 1);
|
||||
}
|
||||
}
|
||||
80 crates/noxa-core/src/types.rs Normal file
@@ -0,0 +1,80 @@
/// Core types for extraction output.
/// All types are serializable for JSON output to LLM consumers.
use serde::{Deserialize, Serialize};

use crate::domain::DomainType;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionResult {
    pub metadata: Metadata,
    pub content: Content,
    pub domain_data: Option<DomainData>,
    /// JSON-LD structured data extracted from `<script type="application/ld+json">` blocks.
    /// Contains Schema.org markup (Product, Article, BreadcrumbList, etc.) when present.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub structured_data: Vec<serde_json::Value>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Metadata {
    pub title: Option<String>,
    pub description: Option<String>,
    pub author: Option<String>,
    pub published_date: Option<String>,
    pub language: Option<String>,
    pub url: Option<String>,
    pub site_name: Option<String>,
    pub image: Option<String>,
    pub favicon: Option<String>,
    pub word_count: usize,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Content {
    pub markdown: String,
    pub plain_text: String,
    pub links: Vec<Link>,
    pub images: Vec<Image>,
    pub code_blocks: Vec<CodeBlock>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub raw_html: Option<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Link {
    pub text: String,
    pub href: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Image {
    pub alt: String,
    pub src: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    pub language: Option<String>,
    pub code: String,
}

/// Domain-specific extracted data. For MVP, only the detected type is stored.
/// Future: each variant carries structured fields (e.g., Article { author, date, ... }).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DomainData {
    pub domain_type: DomainType,
}

/// Options for controlling content extraction behavior.
#[derive(Debug, Clone, Default)]
pub struct ExtractionOptions {
    /// CSS selectors for elements to include. If non-empty, only these elements
    /// are extracted (skipping the scoring algorithm entirely).
    pub include_selectors: Vec<String>,
    /// CSS selectors for elements to exclude from the output.
    pub exclude_selectors: Vec<String>,
    /// If true, skip scoring and pick the first `article`, `main`, or `[role="main"]` element.
    pub only_main_content: bool,
    /// If true, populate `Content::raw_html` with the extracted content's HTML.
    pub include_raw_html: bool,
}
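// Usage sketch (not part of this diff): `#[derive(Default)]` on
// `ExtractionOptions` lets callers override only the fields they care about.
// The `noxa_core::extract` entry point below is an assumed API name, used
// purely for illustration.
//
//     let opts = ExtractionOptions {
//         exclude_selectors: vec!["nav".into(), ".sidebar".into()],
//         only_main_content: true,
//         ..Default::default()
//     };
//     let result = noxa_core::extract(html, &opts)?;
//     println!("{}", result.content.markdown);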
296
crates/noxa-core/src/youtube.rs
Normal file

@@ -0,0 +1,296 @@
/// YouTube video metadata extraction from `ytInitialPlayerResponse` embedded JSON.
///
/// YouTube embeds the full player config (title, author, view count, description,
/// duration, upload date) in a `<script>` tag as a JS variable assignment. This
/// module parses that blob and formats it as structured markdown, giving LLMs a
/// clean representation without needing the YouTube API.
use once_cell::sync::Lazy;
use regex::Regex;
use tracing::debug;

/// Regex to find the ytInitialPlayerResponse assignment in a <script> block.
/// YouTube uses: `var ytInitialPlayerResponse = {...};`
static YT_PLAYER_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").unwrap());
/// Check if a URL is a YouTube video page.
pub fn is_youtube_url(url: &str) -> bool {
    let lower = url.to_lowercase();
    lower.contains("youtube.com/watch") || lower.contains("youtu.be/")
}

/// Extracted YouTube video metadata.
#[derive(Debug)]
struct VideoMeta {
    title: String,
    author: String,
    view_count: String,
    upload_date: String,
    description: String,
    duration: String,
}
/// Try to extract YouTube video metadata from the page HTML.
/// Returns structured markdown if successful, None if the page doesn't contain
/// ytInitialPlayerResponse or parsing fails.
pub fn try_extract(html: &str) -> Option<String> {
    let json_str = YT_PLAYER_RE.captures(html)?.get(1)?.as_str();

    let value: serde_json::Value = serde_json::from_str(json_str).ok()?;

    let video_details = value.get("videoDetails")?;
    let microformat = value
        .get("microformat")
        .and_then(|m| m.get("playerMicroformatRenderer"));

    let title = video_details
        .get("title")
        .and_then(|v| v.as_str())
        .unwrap_or("Untitled")
        .to_string();

    let author = video_details
        .get("author")
        .and_then(|v| v.as_str())
        .unwrap_or("Unknown")
        .to_string();

    let view_count = video_details
        .get("viewCount")
        .and_then(|v| v.as_str())
        .map(format_view_count)
        .unwrap_or_else(|| "N/A".to_string());

    let upload_date = microformat
        .and_then(|m| m.get("uploadDate"))
        .or_else(|| microformat.and_then(|m| m.get("publishDate")))
        .and_then(|v| v.as_str())
        .unwrap_or("Unknown")
        .to_string();

    let description = video_details
        .get("shortDescription")
        .and_then(|v| v.as_str())
        .unwrap_or("")
        .to_string();

    let duration_secs = video_details
        .get("lengthSeconds")
        .and_then(|v| v.as_str())
        .and_then(|s| s.parse::<u64>().ok())
        .unwrap_or(0);
    let duration = format_duration(duration_secs);

    let meta = VideoMeta {
        title,
        author,
        view_count,
        upload_date,
        description,
        duration,
    };

    debug!(
        title = %meta.title,
        author = %meta.author,
        "extracted YouTube video metadata"
    );

    Some(format_markdown(&meta))
}
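// Usage sketch (hypothetical caller): gate on the URL first, then fall back
// to normal DOM extraction when the player JSON is absent. `fetch_html` is
// an assumed helper, named here only for illustration.
//
//     if is_youtube_url(url) {
//         if let Some(markdown) = try_extract(&fetch_html(url)?) {
//             return Ok(markdown);
//         }
//     }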
/// Format seconds into human-readable duration (e.g., "1:23:45" or "12:34").
fn format_duration(total_secs: u64) -> String {
    let hours = total_secs / 3600;
    let minutes = (total_secs % 3600) / 60;
    let seconds = total_secs % 60;

    if hours > 0 {
        format!("{hours}:{minutes:02}:{seconds:02}")
    } else {
        format!("{minutes}:{seconds:02}")
    }
}

/// Format a raw view count string into a compact human-readable form
/// (e.g., "1234567" -> "1.2M", "1500" -> "1.5K").
fn format_view_count(raw: &str) -> String {
    let Ok(n) = raw.parse::<u64>() else {
        return raw.to_string();
    };

    if n >= 1_000_000 {
        format!("{:.1}M", n as f64 / 1_000_000.0)
    } else if n >= 1_000 {
        format!("{:.1}K", n as f64 / 1_000.0)
    } else {
        n.to_string()
    }
}
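// Worked example: 3661 s -> hours = 3661 / 3600 = 1, minutes = (3661 % 3600)
// / 60 = 1, seconds = 3661 % 60 = 1, so "1:01:01"; 754 s has no hour
// component, so it renders as "12:34".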
/// A caption track URL extracted from ytInitialPlayerResponse.
#[derive(Debug, Clone)]
pub struct CaptionTrack {
    pub url: String,
    pub lang: String,
    pub name: String,
}

/// Extract caption track URLs from ytInitialPlayerResponse JSON.
/// Returns empty vec if no captions are available.
pub fn extract_caption_tracks(html: &str) -> Vec<CaptionTrack> {
    let Some(json_str) = YT_PLAYER_RE.captures(html).and_then(|c| c.get(1)) else {
        return vec![];
    };

    let Ok(value) = serde_json::from_str::<serde_json::Value>(json_str.as_str()) else {
        return vec![];
    };

    let Some(tracks) = value
        .get("captions")
        .and_then(|c| c.get("playerCaptionsTracklistRenderer"))
        .and_then(|r| r.get("captionTracks"))
        .and_then(|t| t.as_array())
    else {
        return vec![];
    };

    tracks
        .iter()
        .filter_map(|t| {
            let url = t.get("baseUrl")?.as_str()?.to_string();
            let lang = t
                .get("languageCode")
                .and_then(|v| v.as_str())
                .unwrap_or("en")
                .to_string();
            let name = t
                .get("name")
                .and_then(|v| v.get("simpleText"))
                .and_then(|v| v.as_str())
                .unwrap_or(&lang)
                .to_string();
            Some(CaptionTrack { url, lang, name })
        })
        .collect()
}
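// Usage sketch (hypothetical caller): prefer an English track, otherwise take
// the first available one. Fetching `track.url` over HTTP is out of scope for
// this module.
//
//     let tracks = extract_caption_tracks(&html);
//     let track = tracks
//         .iter()
//         .find(|t| t.lang.starts_with("en"))
//         .or_else(|| tracks.first());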
/// Parse YouTube timed text XML into plain transcript text.
/// The XML format is: `<transcript><text start="0" dur="1.5">Hello</text>...</transcript>`
pub fn parse_timed_text(xml: &str) -> String {
    // Simple regex-based parsing to avoid adding an XML crate dependency.
    // Extract text content between <text ...>...</text> tags.
    static TEXT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<text[^>]*>([^<]*)</text>").unwrap());

    let mut lines: Vec<String> = Vec::new();
    for cap in TEXT_RE.captures_iter(xml) {
        let text = cap[1].trim();
        if text.is_empty() {
            continue;
        }
        // Decode XML entities
        let decoded = text
            .replace("&amp;", "&")
            .replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&quot;", "\"")
            .replace("&#39;", "'")
            .replace("&apos;", "'")
            .replace("\n", " ");
        lines.push(decoded);
    }

    lines.join(" ")
}
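// End-to-end sketch, assuming an async HTTP client such as `reqwest` (not a
// dependency of this crate):
//
//     let xml = reqwest::get(track.url.as_str()).await?.text().await?;
//     let transcript = parse_timed_text(&xml);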
/// Format extracted metadata into structured markdown.
fn format_markdown(meta: &VideoMeta) -> String {
    let mut md = format!("# {}\n\n", meta.title);

    md.push_str(&format!(
        "**Channel:** {} | **Views:** {} | **Published:** {} | **Duration:** {}\n\n",
        meta.author, meta.view_count, meta.upload_date, meta.duration
    ));

    if !meta.description.is_empty() {
        md.push_str("## Description\n\n");
        md.push_str(&meta.description);
        md.push('\n');
    }

    md
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detects_youtube_urls() {
        assert!(is_youtube_url(
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
        ));
        assert!(is_youtube_url("https://youtube.com/watch?v=abc123"));
        assert!(is_youtube_url("https://youtu.be/dQw4w9WgXcQ"));
        assert!(!is_youtube_url("https://example.com"));
        assert!(!is_youtube_url("https://vimeo.com/123456"));
    }

    #[test]
    fn format_duration_short() {
        assert_eq!(format_duration(0), "0:00");
        assert_eq!(format_duration(65), "1:05");
        assert_eq!(format_duration(3661), "1:01:01");
        assert_eq!(format_duration(754), "12:34");
    }

    #[test]
    fn format_view_count_values() {
        assert_eq!(format_view_count("500"), "500");
        assert_eq!(format_view_count("1500"), "1.5K");
        assert_eq!(format_view_count("1234567"), "1.2M");
    }

    #[test]
    fn extracts_from_mock_html() {
        let html = r#"
            <html><head><title>Test Video</title></head>
            <body>
            <script>
            var ytInitialPlayerResponse = {"videoDetails":{"title":"Rust in 100 Seconds","author":"Fireship","viewCount":"5432100","shortDescription":"Learn Rust in 100 seconds.","lengthSeconds":"120"},"microformat":{"playerMicroformatRenderer":{"uploadDate":"2023-01-15"}}};
            </script>
            </body></html>
        "#;

        let result = try_extract(html).unwrap();
        assert!(result.contains("# Rust in 100 Seconds"));
        assert!(result.contains("**Channel:** Fireship"));
        assert!(result.contains("5.4M"));
        assert!(result.contains("2023-01-15"));
        assert!(result.contains("2:00"));
        assert!(result.contains("Learn Rust in 100 seconds."));
    }

    #[test]
    fn returns_none_for_non_youtube_html() {
        let html = "<html><body><p>Hello world</p></body></html>";
        assert!(try_extract(html).is_none());
    }

    #[test]
    fn handles_missing_optional_fields() {
        let html = r#"
            <html><body>
            <script>
            var ytInitialPlayerResponse = {"videoDetails":{"title":"Minimal Video","author":"Someone","viewCount":"100","shortDescription":"","lengthSeconds":"60"}};
            </script>
            </body></html>
        "#;

        let result = try_extract(html).unwrap();
        assert!(result.contains("# Minimal Video"));
        assert!(result.contains("**Channel:** Someone"));
        // Upload date should be "Unknown" when microformat is missing
        assert!(result.contains("Unknown"));
    }
}
15
crates/noxa-core/testdata/express_test.html
vendored
Normal file

File diff suppressed because one or more lines are too long