webclaw/crates/webclaw-core/src/data_island.rs
Valerio c99ec684fa Initial release: webclaw v0.1.0 — web content extraction for LLMs
CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io
2026-03-23 18:31:11 +01:00

551 lines
18 KiB
Rust

//! Extract content from JSON data islands embedded in `<script>` tags.
//!
//! Many modern SPAs (React, Next.js, Nuxt) ship server-rendered page data
//! as JSON inside script tags rather than in visible DOM elements. This module
//! walks those JSON blobs and recovers text content as a fallback when normal
//! DOM extraction yields sparse results.
use once_cell::sync::Lazy;
use scraper::{Html, Selector};
use tracing::debug;
/// Selector for `<script type="application/json">` elements, built lazily on
/// first use. The `unwrap` can only panic if this fixed selector literal fails
/// to parse.
static SCRIPT_JSON_SELECTOR: Lazy<Selector> =
    Lazy::new(|| Selector::parse("script[type='application/json']").unwrap());
/// Below this word count, try data islands for supplemental content.
/// Set high enough to cover marketing homepages with partial SSR (e.g., Notion
/// SSR-renders ~300 words but has ~800 words in __NEXT_DATA__).
const SPARSE_THRESHOLD: usize = 500;
/// Cap total extracted chunks to bound memory and CPU on adversarial inputs.
/// `try_extract` stops scanning further scripts once this many chunks have been
/// collected.
const MAX_CHUNKS: usize = 1000;
/// A chunk of text extracted from a JSON data island, with optional heading.
#[derive(Debug)]
struct TextChunk {
    /// Heading text, if any; `try_extract` renders it as a `## ` markdown heading.
    heading: Option<String>,
    /// Body text. Heading-only chunks carry an empty string here.
    body: String,
}
/// Try to extract content from JSON data islands when DOM extraction is sparse.
/// Deduplicates against existing markdown so we only add genuinely new content.
pub fn try_extract(doc: &Html, dom_word_count: usize, existing_markdown: &str) -> Option<String> {
if dom_word_count >= SPARSE_THRESHOLD {
return None;
}
let mut all_chunks: Vec<TextChunk> = Vec::new();
let existing_lower = existing_markdown.to_lowercase();
for script in doc.select(&SCRIPT_JSON_SELECTOR) {
if all_chunks.len() >= MAX_CHUNKS {
break;
}
let json_text = script.text().collect::<String>();
if json_text.len() < 50 {
continue;
}
let Ok(value) = serde_json::from_str::<serde_json::Value>(&json_text) else {
continue;
};
let mut chunks = Vec::new();
walk_json(&value, &mut chunks, 0);
if !chunks.is_empty() {
debug!(
script_id = script.value().attr("id").unwrap_or(""),
data_target = script.value().attr("data-target").unwrap_or(""),
chunks = chunks.len(),
"extracted text from data island"
);
all_chunks.extend(chunks);
}
}
if all_chunks.is_empty() {
return None;
}
// Enforce limit after collecting from all scripts
all_chunks.truncate(MAX_CHUNKS);
// Dedup: remove chunks whose text already appears in DOM markdown
let mut seen = std::collections::HashSet::new();
all_chunks.retain(|c| {
// Must have heading or body
let key = if !c.body.is_empty() {
c.body.clone()
} else if let Some(ref h) = c.heading {
h.clone()
} else {
return false;
};
if !seen.insert(key.clone()) {
return false;
}
// Skip if the text already exists in the DOM-extracted content
!existing_lower.contains(&key.to_lowercase())
});
if all_chunks.is_empty() {
return None;
}
let mut md = String::new();
for chunk in &all_chunks {
if let Some(ref h) = chunk.heading {
md.push_str(&format!("\n## {h}\n\n"));
}
md.push_str(&chunk.body);
md.push_str("\n\n");
}
let md = md.trim().to_string();
if md.is_empty() {
None
} else {
debug!(chars = md.len(), "data island content recovered");
Some(md)
}
}
/// Recursively walk a JSON value and extract text content.
fn walk_json(value: &serde_json::Value, chunks: &mut Vec<TextChunk>, depth: usize) {
if depth > 15 {
return;
}
match value {
serde_json::Value::Object(map) => {
// Contentful rich text node: { "nodeType": "...", "content": [...] }
if let Some(node_type) = map.get("nodeType").and_then(|v| v.as_str())
&& let Some(text) = extract_contentful_node(map, node_type)
{
chunks.push(text);
return;
}
// CMS-style entry with heading + subheading/description
if is_cms_entry(map)
&& let Some(chunk) = extract_cms_entry(map)
{
chunks.push(chunk);
return;
}
// Quote/testimonial pattern
if let Some(chunk) = extract_quote(map) {
chunks.push(chunk);
return;
}
// Extract orphaned content strings from known field names
// before recursing (they won't be caught by CMS/quote patterns)
extract_orphan_texts(map, chunks);
// Recurse into all values, skipping image/media/asset fields
for (key, v) in map {
if is_media_key(key) {
continue;
}
walk_json(v, chunks, depth + 1);
}
}
serde_json::Value::Array(arr) => {
// Check for stat-style string arrays (e.g., ["100M+ users", "#1 rated"])
let content_strings: Vec<&str> = arr
.iter()
.filter_map(|v| v.as_str())
.filter(|s| s.len() > 10 && s.contains(' '))
.collect();
if content_strings.len() >= 2 {
let body = content_strings.join(" | ");
chunks.push(TextChunk {
heading: None,
body,
});
return;
}
for v in arr {
walk_json(v, chunks, depth + 1);
}
}
_ => {}
}
}
/// Turn one Contentful rich text node into a [`TextChunk`].
///
/// Recognised `node_type` values:
/// - `document`: each child is converted in turn and the results are joined
///   with blank lines into one body; child headings become `## ` lines.
/// - `paragraph` / `text`: body text, kept only if it passes `is_content_text`.
/// - `heading-*`: a heading-only chunk, kept if the text is non-empty.
/// - `blockquote`: body text prefixed with `> `, kept only if it passes
///   `is_content_text`.
///
/// Any other node type, or a node that yields no usable text, gives `None`.
fn extract_contentful_node(
    map: &serde_json::Map<String, serde_json::Value>,
    node_type: &str,
) -> Option<TextChunk> {
    match node_type {
        "document" => {
            let children = map.get("content")?.as_array()?;
            let mut sections: Vec<String> = Vec::new();
            for child in children {
                // Children that are not objects, or carry no string `nodeType`,
                // contribute nothing.
                let Some(child_map) = child.as_object() else {
                    continue;
                };
                let Some(child_type) = child_map.get("nodeType").and_then(|v| v.as_str()) else {
                    continue;
                };
                let Some(chunk) = extract_contentful_node(child_map, child_type) else {
                    continue;
                };
                if let Some(h) = chunk.heading {
                    sections.push(format!("## {h}"));
                }
                if !chunk.body.is_empty() {
                    sections.push(chunk.body);
                }
            }
            if sections.is_empty() {
                return None;
            }
            Some(TextChunk {
                heading: None,
                body: sections.join("\n\n"),
            })
        }
        "paragraph" | "text" => {
            let text = collect_text_content(map);
            is_content_text(&text).then(|| TextChunk {
                heading: None,
                body: text,
            })
        }
        "blockquote" => {
            let text = collect_text_content(map);
            is_content_text(&text).then(|| TextChunk {
                heading: None,
                body: format!("> {text}"),
            })
        }
        other if other.starts_with("heading-") => {
            let text = collect_text_content(map);
            (!text.is_empty()).then(|| TextChunk {
                heading: Some(text),
                body: String::new(),
            })
        }
        _ => None,
    }
}
/// Recursively collect plain text from a Contentful rich text node tree.
///
/// Appends the node's own string `value` (if any) followed by the text of every
/// object in its `content` array, depth-first. Only the final result is
/// trimmed: trimming each nested node separately would strip the whitespace at
/// the edges of sibling text runs, so a sentence split across nodes such as
/// `"This is "`, `"bold"`, `" text"` would collapse into `"This isboldtext"`.
///
/// Returns an empty string when the tree holds no text.
fn collect_text_content(map: &serde_json::Map<String, serde_json::Value>) -> String {
    /// Append the raw, untrimmed text of `node` and its descendants to `out`.
    fn append_text(node: &serde_json::Map<String, serde_json::Value>, out: &mut String) {
        if let Some(v) = node.get("value").and_then(|v| v.as_str()) {
            out.push_str(v);
        }
        if let Some(children) = node.get("content").and_then(|v| v.as_array()) {
            for child in children.iter().filter_map(|c| c.as_object()) {
                append_text(child, out);
            }
        }
    }

    let mut text = String::new();
    append_text(map, &mut text);
    text.trim().to_string()
}
/// Report whether a JSON object has the shape of a CMS entry: at least one
/// heading-like key (`heading`, `title`, `headline`) together with at least one
/// body-like key (`description`, `subheading`, `body`, `text`).
///
/// Only key presence is checked; the values are not inspected.
fn is_cms_entry(map: &serde_json::Map<String, serde_json::Value>) -> bool {
    const HEADING_KEYS: [&str; 3] = ["heading", "title", "headline"];
    const BODY_KEYS: [&str; 4] = ["description", "subheading", "body", "text"];

    let has_any = |keys: &[&str]| keys.iter().any(|k| map.contains_key(*k));
    has_any(&HEADING_KEYS) && has_any(&BODY_KEYS)
}
/// Build a heading + body chunk from a CMS-style entry.
///
/// The heading is the first non-empty text among `heading`, `title` and
/// `headline`; the body is the first non-empty text among `description`,
/// `subheading`, `body` and `text`, or an empty string if there is none.
///
/// Returns `None` when no heading text exists, when the heading is an internal
/// CMS label or is 5 bytes or shorter, or when neither the heading nor the body
/// passes `is_content_text`.
fn extract_cms_entry(map: &serde_json::Map<String, serde_json::Value>) -> Option<TextChunk> {
    let first_text = |keys: &[&str]| keys.iter().find_map(|k| extract_text_field(map, k));

    let heading = first_text(&["heading", "title", "headline"])?;
    if is_cms_internal_title(&heading) || heading.len() <= 5 {
        return None;
    }

    let body = first_text(&["description", "subheading", "body", "text"]).unwrap_or_default();
    if !(is_content_text(&heading) || is_content_text(&body)) {
        return None;
    }

    Some(TextChunk {
        heading: Some(heading),
        body,
    })
}
/// Extract a quote/testimonial from a JSON object.
fn extract_quote(map: &serde_json::Map<String, serde_json::Value>) -> Option<TextChunk> {
let quote =
extract_text_field(map, "quote").or_else(|| extract_text_field(map, "quoteText"))?;
if !is_content_text(&quote) {
return None;
}
let attribution = extract_text_field(map, "position")
.or_else(|| extract_text_field(map, "author"))
.or_else(|| extract_text_field(map, "name"))
.unwrap_or_default();
let body = if attribution.is_empty() {
format!("> {quote}")
} else {
format!("> {quote}\n> — {attribution}")
};
Some(TextChunk {
heading: None,
body,
})
}
/// Collect a standalone content string from an object that is not a CMS entry.
///
/// At most one chunk is pushed per object. Heading keys (`heading`, `title`,
/// `headline`) are tried first and produce a heading-only chunk; if none of
/// them holds content text, body keys (`body`, `description`, `subheading`,
/// `eyebrow`, `children`) are tried and produce a body-only chunk. Within each
/// group the first key whose text passes `is_content_text` wins.
///
/// Objects that satisfy `is_cms_entry` are skipped entirely.
fn extract_orphan_texts(
    map: &serde_json::Map<String, serde_json::Value>,
    chunks: &mut Vec<TextChunk>,
) {
    const HEADING_KEYS: &[&str] = &["heading", "title", "headline"];
    const BODY_KEYS: &[&str] = &["body", "description", "subheading", "eyebrow", "children"];

    if is_cms_entry(map) {
        return;
    }

    // First key in `keys` whose extracted text qualifies as content.
    let first_content = |keys: &[&str]| {
        keys.iter()
            .filter_map(|k| extract_text_field(map, k))
            .find(|text| is_content_text(text))
    };

    if let Some(heading) = first_content(HEADING_KEYS) {
        chunks.push(TextChunk {
            heading: Some(heading),
            body: String::new(),
        });
    } else if let Some(body) = first_content(BODY_KEYS) {
        chunks.push(TextChunk {
            heading: None,
            body,
        });
    }
}
/// Read the text stored under `key`.
///
/// A string value is returned trimmed. An object value is treated as a
/// Contentful rich text tree
/// (`{ "content": [{ "content": [{ "value": "..." }] }] }`) and flattened with
/// `collect_text_content`. A missing key, any other JSON type, or empty text
/// gives `None`.
fn extract_text_field(
    map: &serde_json::Map<String, serde_json::Value>,
    key: &str,
) -> Option<String> {
    let text = match map.get(key)? {
        serde_json::Value::String(s) => s.trim().to_string(),
        serde_json::Value::Object(obj) => collect_text_content(obj),
        _ => return None,
    };
    (!text.is_empty()).then_some(text)
}
/// Report whether a JSON key names image/media/asset data.
///
/// The JSON walk does not recurse into such keys, so CMS alt text and asset
/// metadata are not extracted as content. The comparison is case-insensitive:
/// `alt`, `src`, `url` and `href` must match the whole key, while `image`,
/// `poster`, `video`, `thumbnail`, `icon` and `logo` match anywhere inside it.
fn is_media_key(key: &str) -> bool {
    const EXACT: [&str; 4] = ["alt", "src", "url", "href"];
    const FRAGMENTS: [&str; 6] = ["image", "poster", "video", "thumbnail", "icon", "logo"];

    let lowered = key.to_lowercase();
    EXACT.contains(&lowered.as_str()) || FRAGMENTS.iter().any(|frag| lowered.contains(*frag))
}
/// Report whether a title is a CMS-internal editorial label rather than
/// user-facing text, e.g. "/home Customer Stories: Logo" or
/// "Copilot agent mode hero poster desktop".
///
/// A title is treated as internal when it either
/// - starts with a Contentful path-style prefix (`/home ` or `/page `), or
/// - has at least three whitespace-separated words, one of which is exactly an
///   asset keyword (`poster`, `logo`, `image`, `icon`, `asset`, `thumbnail`).
///
/// Keywords are compared ASCII-case-insensitively, so Title-Case labels such as
/// "Customer Stories: Logo" are caught as well; this matches `is_media_key`,
/// which also ignores case.
fn is_cms_internal_title(s: &str) -> bool {
    const LABEL_KEYWORDS: [&str; 6] = ["poster", "logo", "image", "icon", "asset", "thumbnail"];

    // Contentful path-style titles
    if s.starts_with("/home ") || s.starts_with("/page ") {
        return true;
    }

    // One- and two-word titles are never classified as labels by keyword.
    if s.split_whitespace().nth(2).is_none() {
        return false;
    }

    s.split_whitespace().any(|word| {
        LABEL_KEYWORDS
            .iter()
            .any(|keyword| word.eq_ignore_ascii_case(keyword))
    })
}
/// Heuristic: is this string actual content (not an ID, URL, class name, etc.)?
///
/// After trimming, the string must
/// - be at least 15 bytes long,
/// - not start with `http`, `/`, `{` or `[` (URLs, paths, serialized JSON),
/// - contain a space (single tokens such as hashes and IDs are rejected), and
/// - consist of at least 60% alphanumeric characters.
///
/// The ratio is measured in characters, not bytes, so prose in multi-byte
/// scripts (Cyrillic, Greek, accented Latin, ...) is judged on the same scale
/// as ASCII text.
fn is_content_text(s: &str) -> bool {
    let s = s.trim();
    if s.len() < 15 {
        return false;
    }
    // Skip URLs, paths and serialized structures
    if s.starts_with("http") || s.starts_with('/') || s.starts_with('{') || s.starts_with('[') {
        return false;
    }
    // Must contain spaces (prose), not just a single technical token
    if !s.contains(' ') {
        return false;
    }
    // Skip strings dominated by punctuation or symbols. Count characters in the
    // same pass so the denominator is a character count, not a byte length.
    let (total, alnum) = s.chars().fold((0usize, 0usize), |(total, alnum), c| {
        (total + 1, alnum + usize::from(c.is_alphanumeric()))
    });
    alnum as f64 / total as f64 >= 0.6
}
// Unit tests: each one parses a small HTML fixture containing an
// `application/json` script and checks what `try_extract` recovers from it.
#[cfg(test)]
mod tests {
    use super::*;
    // Entries with a plain-string heading and a nested rich text
    // subheading/description yield both the heading and the body text.
    #[test]
    fn extracts_contentful_rich_text() {
        let html = r#"<html><body>
<script type="application/json" data-target="react-app.embeddedData">
{"payload":{"contentfulRawJsonResponse":{"includes":{"Entry":[
{"fields":{
"heading":"Ship faster with secure CI/CD",
"subheading":{"content":[{"content":[{"value":"Automate builds, tests, and deployments."}]}]}
}},
{"fields":{
"heading":"Built-in application security",
"description":{"content":[{"content":[{"value":"Use AI to find and fix vulnerabilities so your team can ship more secure software faster."}]}]}
}}
]}}}}
</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();
        assert!(result.contains("Ship faster with secure CI/CD"));
        assert!(result.contains("Automate builds, tests, and deployments"));
        assert!(result.contains("Built-in application security"));
        assert!(result.contains("find and fix vulnerabilities"));
    }
    // A DOM word count equal to SPARSE_THRESHOLD (500) disables the fallback.
    #[test]
    fn skips_when_dom_has_enough_content() {
        let html = r#"<html><body>
<script type="application/json">{"heading":"Foo","description":"Some long description here."}</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        assert!(try_extract(&doc, 500, "").is_none());
    }
    // Short tokens, URLs, path-style labels and space-less IDs are rejected;
    // an ordinary sentence is accepted.
    #[test]
    fn skips_non_content_strings() {
        assert!(!is_content_text("abc123"));
        assert!(!is_content_text("https://example.com/foo/bar"));
        assert!(!is_content_text("/home Customer Stories: Logo"));
        assert!(!is_content_text("a1b2c3d4e5f6a1b2c3d4e5f6"));
        assert!(is_content_text(
            "Automate builds, tests, and deployments with CI/CD."
        ));
    }
    // A rich text `quote` with a `position` renders as a blockquote that
    // includes the attribution.
    #[test]
    fn extracts_quotes() {
        let html = r#"<html><body>
<script type="application/json">
{"fields":{"quote":{"content":[{"content":[{"value":"GitHub frees us from maintaining our own infrastructure."}]}]},"position":"CTO at Example Corp"}}
</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();
        assert!(result.contains("> GitHub frees us from maintaining our own infrastructure."));
        assert!(result.contains("CTO at Example Corp"));
    }
    // Text that already appears in the DOM markdown is not returned again.
    #[test]
    fn skips_content_already_in_dom() {
        let html = r#"<html><body>
<script type="application/json">
{"fields":{"heading":"Already in DOM heading","description":"This text already appears in the DOM markdown output."}}
</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        let existing =
            "# Already in DOM heading\n\nThis text already appears in the DOM markdown output.";
        assert!(try_extract(&doc, 10, existing).is_none());
    }
    // Two entries with identical text produce a single chunk in the output.
    #[test]
    fn deduplicates_chunks() {
        let html = r#"<html><body>
<script type="application/json">
{"a":{"heading":"Same heading here","description":"Same body content across multiple entries."},
"b":{"heading":"Same heading here","description":"Same body content across multiple entries."}}
</script>
</body></html>"#;
        let doc = Html::parse_document(html);
        let result = try_extract(&doc, 0, "").unwrap();
        // Should appear only once
        assert_eq!(
            result
                .matches("Same body content across multiple entries")
                .count(),
            1
        );
    }
}