/// Extract JSON-LD structured data from HTML.
///
/// Parses `
let content_start = abs_start + tag_end_offset + 1;
let remaining = &html[content_start..];
let Some(close_offset) = remaining.to_lowercase().find("") else {
search_from = content_start;
continue;
};
let json_str = remaining[..close_offset].trim();
search_from = content_start + close_offset + 9;
if json_str.is_empty() {
continue;
}
// Parse — some sites have arrays at top level
match serde_json::from_str::(json_str) {
Ok(Value::Array(arr)) => results.extend(arr),
Ok(val) => results.push(val),
Err(_) => {}
}
}
results
}
/// Unit tests for `extract_json_ld`.
///
/// Each fixture is a small, self-contained HTML snippet. JSON-LD payloads are
/// carried by `<script type="application/ld+json">` elements; the fixtures are
/// raw strings so the embedded double quotes need no escaping.
#[cfg(test)]
mod tests {
    use super::*;

    /// A single JSON-LD object yields exactly one value with its fields intact.
    #[test]
    fn extracts_single_json_ld() {
        let html = r#"
            <html><head>
            <script type="application/ld+json">{"@type":"Product","name":"Test"}</script>
            </head><body></body></html>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["@type"], "Product");
        assert_eq!(results[0]["name"], "Test");
    }

    /// Several JSON-LD blocks are returned in document order.
    #[test]
    fn extracts_multiple_json_ld_blocks() {
        let html = r#"
            <script type="application/ld+json">{"@type":"WebSite","name":"Example"}</script>
            <p>Content between the two blocks</p>
            <script type="application/ld+json">{"@type":"Product","name":"Widget"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0]["@type"], "WebSite");
        assert_eq!(results[1]["@type"], "Product");
    }

    /// A top-level JSON array is flattened into individual results.
    #[test]
    fn handles_array_json_ld() {
        let html = r#"
            <script type="application/ld+json">
            [{"@type":"WebSite","name":"Example"},{"@type":"Product","name":"Widget"}]
            </script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0]["@type"], "WebSite");
        assert_eq!(results[1]["@type"], "Product");
    }

    /// A block that fails to parse is dropped without affecting later blocks.
    #[test]
    fn skips_invalid_json() {
        let html = r#"
            <script type="application/ld+json">{this is not valid json}</script>
            <script type="application/ld+json">{"@type":"Product","name":"Valid"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["name"], "Valid");
    }

    /// Script elements that are not JSON-LD contribute nothing to the output.
    #[test]
    fn ignores_regular_script_tags() {
        let html = r#"
            <script>console.log("hello");</script>
            <script type="text/javascript">var x = 1;</script>
            <script type="application/ld+json">{"@type":"Product","name":"Only"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["name"], "Only");
    }

    /// HTML without any script element produces an empty result.
    #[test]
    fn handles_no_json_ld() {
        let html = "<html><body><p>No structured data here</p></body></html>";
        let results = extract_json_ld(html);
        assert!(results.is_empty());
    }

    /// The `type` attribute value is matched regardless of letter case.
    #[test]
    fn case_insensitive_type() {
        let html = r#"
            <script type="Application/LD+JSON">{"@type":"Product","name":"Test"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
    }

    /// Leading/trailing whitespace and pretty-printed JSON are accepted.
    #[test]
    fn handles_whitespace_in_json() {
        let html = r#"
            <script type="application/ld+json">

                {
                    "@type": "Product",
                    "name": "Test"
                }

            </script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["name"], "Test");
    }

    /// A JSON-LD script with only whitespace is skipped; the next block still parses.
    #[test]
    fn empty_script_tag_skipped() {
        let html = r#"
            <script type="application/ld+json">   </script>
            <script type="application/ld+json">{"@type":"Product","name":"Test"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["name"], "Test");
    }
}