webclaw/crates/webclaw-core/src/structured_data.rs
Valerio c99ec684fa Initial release: webclaw v0.1.0 — web content extraction for LLMs
CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io
2026-03-23 18:31:11 +01:00

165 lines
5.3 KiB
Rust

//! Extract JSON-LD structured data from HTML.
//!
//! Parses `<script type="application/ld+json">` blocks commonly found on
//! e-commerce, news, and recipe sites. Returns machine-readable product info,
//! prices, availability, reviews, etc. without needing JS rendering or an LLM.
use serde_json::Value;
/// Extract all JSON-LD blocks from raw HTML.
///
/// Scans `html` for `<script>` elements whose opening tag mentions
/// `application/ld+json` and parses the text between that tag and the next
/// `</script>` as JSON. The tag name, the type value, and the closing tag are
/// all matched ASCII case-insensitively.
///
/// This is a plain string scan rather than a full HTML parse: JSON-LD blocks
/// are self-contained, so locating the tag boundaries is sufficient.
///
/// # Returns
///
/// The parsed values in document order. A top-level JSON array is flattened
/// into its elements. Blocks that are empty or fail to parse are skipped
/// (extraction is best-effort), so the result may be empty.
pub fn extract_json_ld(html: &str) -> Vec<Value> {
    const OPEN_TAG: &str = "<script";
    const CLOSE_TAG: &str = "</script>";
    const JSON_LD_TYPE: &str = "application/ld+json";

    // Lowercase once, ASCII-only. Unlike `str::to_lowercase`, this never
    // changes the byte length of the text, so every offset found in `lower`
    // is also a valid offset (and char boundary) in `html`.
    let lower = html.to_ascii_lowercase();

    let mut results = Vec::new();
    let mut cursor = 0;

    while let Some(rel_open) = lower[cursor..].find(OPEN_TAG) {
        let tag_start = cursor + rel_open;

        // Find the end of the opening tag. If there is no `>` after this
        // point, no later tag can be terminated either, so stop scanning.
        let Some(rel_gt) = lower[tag_start..].find('>') else {
            break;
        };
        let content_start = tag_start + rel_gt + 1;

        // Skip scripts that are not JSON-LD.
        if !lower[tag_start..content_start].contains(JSON_LD_TYPE) {
            cursor = content_start;
            continue;
        }

        // Find the closing tag. If none exists in the rest of the document,
        // no later block can be closed either, so stop scanning.
        let Some(rel_close) = lower[content_start..].find(CLOSE_TAG) else {
            break;
        };
        let content_end = content_start + rel_close;
        cursor = content_end + CLOSE_TAG.len();

        // Slice the ORIGINAL text so the JSON keeps its original casing.
        let json_str = html[content_start..content_end].trim();
        if json_str.is_empty() {
            continue;
        }

        // Some sites put an array at the top level; flatten it.
        match serde_json::from_str::<Value>(json_str) {
            Ok(Value::Array(items)) => results.extend(items),
            Ok(value) => results.push(value),
            // Deliberately ignored: malformed blocks are skipped, not fatal.
            Err(_) => {}
        }
    }

    results
}
#[cfg(test)]
mod tests {
    //! Unit tests for `extract_json_ld`, one scenario per test.

    use super::*;

    /// A lone Product block inside a full document is returned with its fields.
    #[test]
    fn extracts_single_json_ld() {
        let page = r#"
<html><head>
<script type="application/ld+json">{"@type":"Product","name":"Test"}</script>
</head><body></body></html>
"#;
        let blocks = extract_json_ld(page);
        assert_eq!(blocks.len(), 1);
        let product = &blocks[0];
        assert_eq!(product["@type"], "Product");
        assert_eq!(product["name"], "Test");
    }

    /// Several blocks are returned in document order.
    #[test]
    fn extracts_multiple_json_ld_blocks() {
        let page = r#"
<script type="application/ld+json">{"@type":"WebSite","url":"https://example.com"}</script>
<script type="application/ld+json">{"@type":"Product","name":"Shoe","offers":{"price":99.99}}</script>
"#;
        let blocks = extract_json_ld(page);
        assert_eq!(blocks.len(), 2);
        let expected_types = ["WebSite", "Product"];
        for (block, expected) in blocks.iter().zip(expected_types) {
            assert_eq!(block["@type"], expected);
        }
    }

    /// A top-level JSON array contributes one result per element.
    #[test]
    fn handles_array_json_ld() {
        let page = r#"
<script type="application/ld+json">[{"@type":"BreadcrumbList"},{"@type":"Product"}]</script>
"#;
        assert_eq!(extract_json_ld(page).len(), 2);
    }

    /// A malformed block is dropped without affecting later valid blocks.
    #[test]
    fn skips_invalid_json() {
        let page = r#"
<script type="application/ld+json">{invalid json here}</script>
<script type="application/ld+json">{"@type":"Product","name":"Valid"}</script>
"#;
        let blocks = extract_json_ld(page);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0]["name"], "Valid");
    }

    /// Scripts without the JSON-LD type are not extracted.
    #[test]
    fn ignores_regular_script_tags() {
        let page = r#"
<script>console.log("not json-ld")</script>
<script type="text/javascript">var x = 1;</script>
<script type="application/ld+json">{"@type":"Product"}</script>
"#;
        assert_eq!(extract_json_ld(page).len(), 1);
    }

    /// A document with no script tags yields nothing.
    #[test]
    fn handles_no_json_ld() {
        let page = "<html><body><p>No structured data here</p></body></html>";
        assert!(extract_json_ld(page).is_empty());
    }

    /// The type attribute value is matched regardless of letter case.
    #[test]
    fn case_insensitive_type() {
        let page = r#"
<script type="Application/LD+JSON">{"@type":"Product"}</script>
"#;
        assert_eq!(extract_json_ld(page).len(), 1);
    }

    /// Pretty-printed JSON with surrounding whitespace still parses.
    #[test]
    fn handles_whitespace_in_json() {
        let page = r#"
<script type="application/ld+json">
{
"@type": "Product",
"name": "Test"
}
</script>
"#;
        let blocks = extract_json_ld(page);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0]["name"], "Test");
    }

    /// A whitespace-only block is skipped; the following block is kept.
    #[test]
    fn empty_script_tag_skipped() {
        let page = r#"
<script type="application/ld+json"> </script>
<script type="application/ld+json">{"@type":"Product"}</script>
"#;
        assert_eq!(extract_json_ld(page).len(), 1);
    }
}