webclaw/crates/webclaw-core/src/structured_data.rs

/// Extract structured data from HTML.
///
/// Handles two sources:
/// 1. JSON-LD (`<script type="application/ld+json">`) — e-commerce, news, recipes
/// 2. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
use serde_json::Value;

/// Extract all JSON-LD blocks from raw HTML.
///
/// Returns parsed JSON values, skipping any blocks that fail to parse.
/// Most e-commerce sites include Schema.org Product markup with prices,
/// sizes, availability, and images.
pub fn extract_json_ld(html: &str) -> Vec<Value> {
    let mut results = Vec::new();
    let needle = "application/ld+json";

    // Walk through the HTML finding <script type="application/ld+json"> blocks.
    // Using simple string scanning instead of a full HTML parser — these blocks
    // are self-contained and reliably structured.
    let mut search_from = 0;
    while let Some(tag_start) = html[search_from..].find("<script") {
        let abs_start = search_from + tag_start;
        let tag_region = &html[abs_start..];

        // Find the end of the opening tag
        let Some(tag_end_offset) = tag_region.find('>') else {
            search_from = abs_start + 7;
            continue;
        };

        let opening_tag = &tag_region[..tag_end_offset];

        // Check if this is a JSON-LD script
        if !opening_tag.to_lowercase().contains(needle) {
            search_from = abs_start + tag_end_offset + 1;
            continue;
        }

        // Find the closing </script>
        let content_start = abs_start + tag_end_offset + 1;
        let remaining = &html[content_start..];
        let Some(close_offset) = remaining.to_lowercase().find("</script>") else {
            search_from = content_start;
            continue;
        };

        let json_str = remaining[..close_offset].trim();
        search_from = content_start + close_offset + 9;

        if json_str.is_empty() {
            continue;
        }

        // Parse — some sites have arrays at top level
        match serde_json::from_str::<Value>(json_str) {
            Ok(Value::Array(arr)) => results.extend(arr),
            Ok(val) => results.push(val),
            Err(_) => {}
        }
    }

    results
}

/// Extract data from SvelteKit's `kit.start()` pattern.
///
/// SvelteKit embeds page data inside:
/// `kit.start(app, element, { data: [null, null, {"type":"data","data":{...}}] })`
///
/// Returns parsed JSON objects from the data array (skipping nulls).
pub fn extract_sveltekit(html: &str) -> Vec<Value> {
    let Some(kit_pos) = html.find("kit.start(") else {
        return Vec::new();
    };
    let region = &html[kit_pos..];

    let Some(data_offset) = region.find("data: [") else {
        return Vec::new();
    };
    let bracket_start = kit_pos + data_offset + "data: ".len();
    let bracket_region = &html[bracket_start..];

    let Some(balanced) = extract_balanced(bracket_region, b'[', b']') else {
        return Vec::new();
    };
    if balanced.len() < 50 {
        return Vec::new();
    }

    // SvelteKit uses JS object literals (unquoted keys). Convert to valid JSON.
    let json_str = js_literal_to_json(&balanced);
    let Ok(arr) = serde_json::from_str::<Vec<Value>>(&json_str) else {
        return Vec::new();
    };

    let mut results = Vec::new();
    for item in arr {
        if item.is_null() {
            continue;
        }
        // SvelteKit wraps as {"type":"data","data":{...}} — unwrap if present
        if let Some(inner) = item.get("data")
            && (inner.is_object() || inner.is_array())
        {
            results.push(inner.clone());
            continue;
        }
        if item.is_object() || item.is_array() {
            results.push(item);
        }
    }
    results
}

/// Convert a JS object literal to valid JSON by quoting unquoted keys.
///
/// Handles: `{foo:"bar", baz:123}` → `{"foo":"bar", "baz":123}`
/// Preserves already-quoted keys and string values.
fn js_literal_to_json(input: &str) -> String {
    let bytes = input.as_bytes();
    let mut out = String::with_capacity(input.len() + input.len() / 10);
    let mut i = 0;
    let len = bytes.len();

    while i < len {
        let b = bytes[i];

        // Skip through strings
        if b == b'"' {
            out.push('"');
            i += 1;
            while i < len {
                let c = bytes[i];
                out.push(c as char);
                i += 1;
                if c == b'\\' && i < len {
                    out.push(bytes[i] as char);
                    i += 1;
                } else if c == b'"' {
                    break;
                }
            }
            continue;
        }

        // After { or , — look for unquoted key followed by :
        if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
            out.push(b as char);
            i += 1;
            // Skip whitespace
            while i < len && bytes[i].is_ascii_whitespace() {
                out.push(bytes[i] as char);
                i += 1;
            }
            // Check if next is an unquoted identifier (key)
            if i < len && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_') {
                let key_start = i;
                while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                    i += 1;
                }
                let key = &input[key_start..i];
                // Skip whitespace after key
                while i < len && bytes[i].is_ascii_whitespace() {
                    i += 1;
                }
                // If followed by :, it's an unquoted key — quote it
                if i < len && bytes[i] == b':' {
                    out.push('"');
                    out.push_str(key);
                    out.push('"');
                } else {
                    // Not a key — might be a bare value like true/false/null
                    out.push_str(key);
                }
            }
            continue;
        }

        out.push(b as char);
        i += 1;
    }

    out
}

/// Extract content between balanced brackets, handling string escaping.
fn extract_balanced(text: &str, open: u8, close: u8) -> Option<String> {
    if text.as_bytes().first()? != &open {
        return None;
    }
    let mut depth: i32 = 0;
    let mut in_string = false;
    let mut escape_next = false;

    for (i, &b) in text.as_bytes().iter().enumerate() {
        if escape_next {
            escape_next = false;
            continue;
        }
        if b == b'\\' && in_string {
            escape_next = true;
            continue;
        }
        if b == b'"' {
            in_string = !in_string;
            continue;
        }
        if in_string {
            continue;
        }
        if b == open {
            depth += 1;
        } else if b == close {
            depth -= 1;
            if depth == 0 {
                return Some(text[..=i].to_string());
            }
        }
    }
    None
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn extracts_single_json_ld() {
        let html = r#"
            <html><head>
            <script type="application/ld+json">{"@type":"Product","name":"Test"}</script>
            </head><body></body></html>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["@type"], "Product");
        assert_eq!(results[0]["name"], "Test");
    }

    #[test]
    fn extracts_multiple_json_ld_blocks() {
        let html = r#"
            <script type="application/ld+json">{"@type":"WebSite","url":"https://example.com"}</script>
            <script type="application/ld+json">{"@type":"Product","name":"Shoe","offers":{"price":99.99}}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0]["@type"], "WebSite");
        assert_eq!(results[1]["@type"], "Product");
    }

    #[test]
    fn handles_array_json_ld() {
        let html = r#"
            <script type="application/ld+json">[{"@type":"BreadcrumbList"},{"@type":"Product"}]</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn skips_invalid_json() {
        let html = r#"
            <script type="application/ld+json">{invalid json here}</script>
            <script type="application/ld+json">{"@type":"Product","name":"Valid"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["name"], "Valid");
    }

    #[test]
    fn ignores_regular_script_tags() {
        let html = r#"
            <script>console.log("not json-ld")</script>
            <script type="text/javascript">var x = 1;</script>
            <script type="application/ld+json">{"@type":"Product"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn handles_no_json_ld() {
        let html = "<html><body><p>No structured data here</p></body></html>";
        let results = extract_json_ld(html);
        assert!(results.is_empty());
    }

    #[test]
    fn case_insensitive_type() {
        let html = r#"
            <script type="Application/LD+JSON">{"@type":"Product"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn handles_whitespace_in_json() {
        let html = r#"
            <script type="application/ld+json">
                {
                    "@type": "Product",
                    "name": "Test"
                }
            </script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["name"], "Test");
    }

    #[test]
    fn empty_script_tag_skipped() {
        let html = r#"
            <script type="application/ld+json">   </script>
            <script type="application/ld+json">{"@type":"Product"}</script>
        "#;
        let results = extract_json_ld(html);
        assert_eq!(results.len(), 1);
    }
}
feat: SvelteKit data extraction + license change to AGPL-3.0 - Extract structured JSON from SvelteKit kit.start() data arrays - Convert JS object literals (unquoted keys) to valid JSON - Data appears in structured_data field (machine-readable) - License changed from MIT to AGPL-3.0 - Bump to v0.3.4 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-01 20:37:56 +02:00			`/// Extract structured data from HTML.`
Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`///`
feat: SvelteKit data extraction + license change to AGPL-3.0 - Extract structured JSON from SvelteKit kit.start() data arrays - Convert JS object literals (unquoted keys) to valid JSON - Data appears in structured_data field (machine-readable) - License changed from MIT to AGPL-3.0 - Bump to v0.3.4 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-01 20:37:56 +02:00			`/// Handles two sources:`
			/// 1. JSON-LD (`<script type="application/ld+json">`) — e-commerce, news, recipes
			/// 2. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`use serde_json::Value;`

			`/// Extract all JSON-LD blocks from raw HTML.`
			`///`
			`/// Returns parsed JSON values, skipping any blocks that fail to parse.`
			`/// Most e-commerce sites include Schema.org Product markup with prices,`
			`/// sizes, availability, and images.`
			`pub fn extract_json_ld(html: &str) -> Vec<Value> {`
			`let mut results = Vec::new();`
			`let needle = "application/ld+json";`

			`// Walk through the HTML finding <script type="application/ld+json"> blocks.`
			`// Using simple string scanning instead of a full HTML parser — these blocks`
			`// are self-contained and reliably structured.`
			`let mut search_from = 0;`
			`while let Some(tag_start) = html[search_from..].find("<script") {`
			`let abs_start = search_from + tag_start;`
			`let tag_region = &html[abs_start..];`

			`// Find the end of the opening tag`
			`let Some(tag_end_offset) = tag_region.find('>') else {`
			`search_from = abs_start + 7;`
			`continue;`
			`};`

			`let opening_tag = &tag_region[..tag_end_offset];`

			`// Check if this is a JSON-LD script`
			`if !opening_tag.to_lowercase().contains(needle) {`
			`search_from = abs_start + tag_end_offset + 1;`
			`continue;`
			`}`

			`// Find the closing </script>`
			`let content_start = abs_start + tag_end_offset + 1;`
			`let remaining = &html[content_start..];`
			`let Some(close_offset) = remaining.to_lowercase().find("</script>") else {`
			`search_from = content_start;`
			`continue;`
			`};`

			`let json_str = remaining[..close_offset].trim();`
			`search_from = content_start + close_offset + 9;`

			`if json_str.is_empty() {`
			`continue;`
			`}`

			`// Parse — some sites have arrays at top level`
			`match serde_json::from_str::<Value>(json_str) {`
			`Ok(Value::Array(arr)) => results.extend(arr),`
			`Ok(val) => results.push(val),`
			`Err(_) => {}`
			`}`
			`}`

			`results`
			`}`

feat: SvelteKit data extraction + license change to AGPL-3.0 - Extract structured JSON from SvelteKit kit.start() data arrays - Convert JS object literals (unquoted keys) to valid JSON - Data appears in structured_data field (machine-readable) - License changed from MIT to AGPL-3.0 - Bump to v0.3.4 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-01 20:37:56 +02:00			/// Extract data from SvelteKit's `kit.start()` pattern.
			`///`
			`/// SvelteKit embeds page data inside:`
			/// `kit.start(app, element, { data: [null, null, {"type":"data","data":{...}}] })`
			`///`
			`/// Returns parsed JSON objects from the data array (skipping nulls).`
			`pub fn extract_sveltekit(html: &str) -> Vec<Value> {`
			`let Some(kit_pos) = html.find("kit.start(") else {`
			`return Vec::new();`
			`};`
			`let region = &html[kit_pos..];`

			`let Some(data_offset) = region.find("data: [") else {`
			`return Vec::new();`
			`};`
			`let bracket_start = kit_pos + data_offset + "data: ".len();`
			`let bracket_region = &html[bracket_start..];`

			`let Some(balanced) = extract_balanced(bracket_region, b'[', b']') else {`
			`return Vec::new();`
			`};`
			`if balanced.len() < 50 {`
			`return Vec::new();`
			`}`

			`// SvelteKit uses JS object literals (unquoted keys). Convert to valid JSON.`
			`let json_str = js_literal_to_json(&balanced);`
			`let Ok(arr) = serde_json::from_str::<Vec<Value>>(&json_str) else {`
			`return Vec::new();`
			`};`

			`let mut results = Vec::new();`
			`for item in arr {`
			`if item.is_null() {`
			`continue;`
			`}`
			`// SvelteKit wraps as {"type":"data","data":{...}} — unwrap if present`
			`if let Some(inner) = item.get("data")`
			`&& (inner.is_object() \|\| inner.is_array())`
			`{`
			`results.push(inner.clone());`
			`continue;`
			`}`
			`if item.is_object() \|\| item.is_array() {`
			`results.push(item);`
			`}`
			`}`
			`results`
			`}`

			`/// Convert a JS object literal to valid JSON by quoting unquoted keys.`
			`///`
			/// Handles: `{foo:"bar", baz:123}` → `{"foo":"bar", "baz":123}`
			`/// Preserves already-quoted keys and string values.`
			`fn js_literal_to_json(input: &str) -> String {`
			`let bytes = input.as_bytes();`
			`let mut out = String::with_capacity(input.len() + input.len() / 10);`
			`let mut i = 0;`
			`let len = bytes.len();`

			`while i < len {`
			`let b = bytes[i];`

			`// Skip through strings`
			`if b == b'"' {`
			`out.push('"');`
			`i += 1;`
			`while i < len {`
			`let c = bytes[i];`
			`out.push(c as char);`
			`i += 1;`
			`if c == b'\\' && i < len {`
			`out.push(bytes[i] as char);`
			`i += 1;`
			`} else if c == b'"' {`
			`break;`
			`}`
			`}`
			`continue;`
			`}`

			`// After { or , — look for unquoted key followed by :`
			`if (b == b'{' \|\| b == b',' \|\| b == b'[') && i + 1 < len {`
			`out.push(b as char);`
			`i += 1;`
			`// Skip whitespace`
			`while i < len && bytes[i].is_ascii_whitespace() {`
			`out.push(bytes[i] as char);`
			`i += 1;`
			`}`
			`// Check if next is an unquoted identifier (key)`
			`if i < len && (bytes[i].is_ascii_alphabetic() \|\| bytes[i] == b'_') {`
			`let key_start = i;`
			`while i < len && (bytes[i].is_ascii_alphanumeric() \|\| bytes[i] == b'_') {`
			`i += 1;`
			`}`
			`let key = &input[key_start..i];`
			`// Skip whitespace after key`
			`while i < len && bytes[i].is_ascii_whitespace() {`
			`i += 1;`
			`}`
			`// If followed by :, it's an unquoted key — quote it`
			`if i < len && bytes[i] == b':' {`
			`out.push('"');`
			`out.push_str(key);`
			`out.push('"');`
			`} else {`
			`// Not a key — might be a bare value like true/false/null`
			`out.push_str(key);`
			`}`
			`}`
			`continue;`
			`}`

			`out.push(b as char);`
			`i += 1;`
			`}`

			`out`
			`}`

			`/// Extract content between balanced brackets, handling string escaping.`
			`fn extract_balanced(text: &str, open: u8, close: u8) -> Option<String> {`
			`if text.as_bytes().first()? != &open {`
			`return None;`
			`}`
			`let mut depth: i32 = 0;`
			`let mut in_string = false;`
			`let mut escape_next = false;`

			`for (i, &b) in text.as_bytes().iter().enumerate() {`
			`if escape_next {`
			`escape_next = false;`
			`continue;`
			`}`
			`if b == b'\\' && in_string {`
			`escape_next = true;`
			`continue;`
			`}`
			`if b == b'"' {`
			`in_string = !in_string;`
			`continue;`
			`}`
			`if in_string {`
			`continue;`
			`}`
			`if b == open {`
			`depth += 1;`
			`} else if b == close {`
			`depth -= 1;`
			`if depth == 0 {`
			`return Some(text[..=i].to_string());`
			`}`
			`}`
			`}`
			`None`
			`}`

Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn extracts_single_json_ld() {`
			`let html = r#"`
			`<html><head>`
			`<script type="application/ld+json">{"@type":"Product","name":"Test"}</script>`
			`</head><body></body></html>`
			`"#;`
			`let results = extract_json_ld(html);`
			`assert_eq!(results.len(), 1);`
			`assert_eq!(results[0]["@type"], "Product");`
			`assert_eq!(results[0]["name"], "Test");`
			`}`

			`#[test]`
			`fn extracts_multiple_json_ld_blocks() {`
			`let html = r#"`
			`<script type="application/ld+json">{"@type":"WebSite","url":"https://example.com"}</script>`
			`<script type="application/ld+json">{"@type":"Product","name":"Shoe","offers":{"price":99.99}}</script>`
			`"#;`
			`let results = extract_json_ld(html);`
			`assert_eq!(results.len(), 2);`
			`assert_eq!(results[0]["@type"], "WebSite");`
			`assert_eq!(results[1]["@type"], "Product");`
			`}`

			`#[test]`
			`fn handles_array_json_ld() {`
			`let html = r#"`
			`<script type="application/ld+json">[{"@type":"BreadcrumbList"},{"@type":"Product"}]</script>`
			`"#;`
			`let results = extract_json_ld(html);`
			`assert_eq!(results.len(), 2);`
			`}`

			`#[test]`
			`fn skips_invalid_json() {`
			`let html = r#"`
			`<script type="application/ld+json">{invalid json here}</script>`
			`<script type="application/ld+json">{"@type":"Product","name":"Valid"}</script>`
			`"#;`
			`let results = extract_json_ld(html);`
			`assert_eq!(results.len(), 1);`
			`assert_eq!(results[0]["name"], "Valid");`
			`}`

			`#[test]`
			`fn ignores_regular_script_tags() {`
			`let html = r#"`
			`<script>console.log("not json-ld")</script>`
			`<script type="text/javascript">var x = 1;</script>`
			`<script type="application/ld+json">{"@type":"Product"}</script>`
			`"#;`
			`let results = extract_json_ld(html);`
			`assert_eq!(results.len(), 1);`
			`}`

			`#[test]`
			`fn handles_no_json_ld() {`
			`let html = "<html><body><p>No structured data here</p></body></html>";`
			`let results = extract_json_ld(html);`
			`assert!(results.is_empty());`
			`}`

			`#[test]`
			`fn case_insensitive_type() {`
			`let html = r#"`
			`<script type="Application/LD+JSON">{"@type":"Product"}</script>`
			`"#;`
			`let results = extract_json_ld(html);`
			`assert_eq!(results.len(), 1);`
			`}`

			`#[test]`
			`fn handles_whitespace_in_json() {`
			`let html = r#"`
			`<script type="application/ld+json">`
			`{`
			`"@type": "Product",`
			`"name": "Test"`
			`}`
			`</script>`
			`"#;`
			`let results = extract_json_ld(html);`
			`assert_eq!(results.len(), 1);`
			`assert_eq!(results[0]["name"], "Test");`
			`}`

			`#[test]`
			`fn empty_script_tag_skipped() {`
			`let html = r#"`
			`<script type="application/ld+json"> </script>`
			`<script type="application/ld+json">{"@type":"Product"}</script>`
			`"#;`
			`let results = extract_json_ld(html);`
			`assert_eq!(results.len(), 1);`
			`}`
			`}`