Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed | https://webclaw.io
2026-06-22 02:38:06 +02:00 · 2026-03-23 18:31:11 +01:00 · 2026-03-23 18:31:11 +01:00 · c99ec684fa
commit c99ec684fa
79 changed files with 24074 additions and 0 deletions
--- a/crates/webclaw-core/src/structured_data.rs
+++ b/crates/webclaw-core/src/structured_data.rs
@ -0,0 +1,165 @@
+//! Extract JSON-LD structured data from HTML.
+//!
+//! Parses `<script type="application/ld+json">` blocks commonly found in
+//! e-commerce, news, and recipe sites. Returns machine-readable product info,
+//! prices, availability, reviews, etc. without needing JS rendering or an LLM.
+use serde_json::Value;
+
+/// Extract all JSON-LD blocks from raw HTML.
+///
+/// Scans `html` for `<script ...>` opening tags that mention
+/// `application/ld+json` and parses the text up to the next `</script>` as
+/// JSON. The tag name, the type string, and the closing tag are all matched
+/// ASCII case-insensitively.
+///
+/// Returns the parsed values in document order. A top-level JSON array is
+/// flattened into its elements. Empty blocks and blocks that fail to parse
+/// are skipped silently, since extraction is best-effort.
+pub fn extract_json_ld(html: &str) -> Vec<Value> {
+    const OPEN_TAG: &str = "<script";
+    const CLOSE_TAG: &str = "</script>";
+    const JSON_LD_TYPE: &str = "application/ld+json";
+
+    let mut results = Vec::new();
+
+    // Walk through the HTML with plain string scanning instead of a full HTML
+    // parser. All offsets are computed against `html` itself (never against a
+    // lowercased copy), so they are always valid slice boundaries even when
+    // the page contains non-ASCII text.
+    let mut search_from = 0;
+    while let Some(tag_start) = find_ascii_ci(&html[search_from..], OPEN_TAG) {
+        let abs_start = search_from + tag_start;
+        let tag_region = &html[abs_start..];
+
+        // Find the end of the opening tag. If there is no '>' from here to the
+        // end of the document, no later tag can be complete either.
+        let Some(tag_end_offset) = tag_region.find('>') else {
+            break;
+        };
+        let content_start = abs_start + tag_end_offset + 1;
+
+        // Skip script tags that are not JSON-LD.
+        if find_ascii_ci(&tag_region[..tag_end_offset], JSON_LD_TYPE).is_none() {
+            search_from = content_start;
+            continue;
+        }
+
+        // Find the closing </script>. Without one after this point, no later
+        // block can be terminated, so scanning is done.
+        let remaining = &html[content_start..];
+        let Some(close_offset) = find_ascii_ci(remaining, CLOSE_TAG) else {
+            break;
+        };
+
+        let json_str = remaining[..close_offset].trim();
+        search_from = content_start + close_offset + CLOSE_TAG.len();
+
+        if json_str.is_empty() {
+            continue;
+        }
+
+        // Parse — some sites have arrays at top level, which are flattened.
+        match serde_json::from_str::<Value>(json_str) {
+            Ok(Value::Array(items)) => results.extend(items),
+            Ok(value) => results.push(value),
+            // Deliberately ignored: malformed blocks must not abort extraction.
+            Err(_) => {}
+        }
+    }
+
+    results
+}
+
+/// Byte offset of the first ASCII case-insensitive occurrence of `needle` in
+/// `haystack`, or `None` if there is none.
+///
+/// `needle` is expected to be ASCII. Every matched byte is then ASCII too, so
+/// both the returned offset and `offset + needle.len()` are UTF-8 character
+/// boundaries in `haystack`. No allocation is performed.
+fn find_ascii_ci(haystack: &str, needle: &str) -> Option<usize> {
+    let needle = needle.as_bytes();
+    if needle.is_empty() {
+        // `windows(0)` would panic; an empty needle matches at the start.
+        return Some(0);
+    }
+    haystack
+        .as_bytes()
+        .windows(needle.len())
+        .position(|window| window.eq_ignore_ascii_case(needle))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A lone Product block is returned with its fields intact.
+    #[test]
+    fn extracts_single_json_ld() {
+        let page = r#"
+            <html><head>
+            <script type="application/ld+json">{"@type":"Product","name":"Test"}</script>
+            </head><body></body></html>
+        "#;
+
+        let found = extract_json_ld(page);
+
+        assert_eq!(found.len(), 1);
+        let product = &found[0];
+        assert_eq!(product["@type"], "Product");
+        assert_eq!(product["name"], "Test");
+    }
+
+    /// Separate script blocks come back in document order.
+    #[test]
+    fn extracts_multiple_json_ld_blocks() {
+        let page = r#"
+            <script type="application/ld+json">{"@type":"WebSite","url":"https://example.com"}</script>
+            <script type="application/ld+json">{"@type":"Product","name":"Shoe","offers":{"price":99.99}}</script>
+        "#;
+
+        let found = extract_json_ld(page);
+
+        assert_eq!(found.len(), 2);
+        assert_eq!(found[0]["@type"], "WebSite");
+        assert_eq!(found[1]["@type"], "Product");
+    }
+
+    /// A top-level JSON array is flattened into one result per element.
+    #[test]
+    fn handles_array_json_ld() {
+        let page = r#"
+            <script type="application/ld+json">[{"@type":"BreadcrumbList"},{"@type":"Product"}]</script>
+        "#;
+
+        assert_eq!(extract_json_ld(page).len(), 2);
+    }
+
+    /// A malformed block is dropped without affecting the valid one after it.
+    #[test]
+    fn skips_invalid_json() {
+        let page = r#"
+            <script type="application/ld+json">{invalid json here}</script>
+            <script type="application/ld+json">{"@type":"Product","name":"Valid"}</script>
+        "#;
+
+        let found = extract_json_ld(page);
+
+        assert_eq!(found.len(), 1);
+        assert_eq!(found[0]["name"], "Valid");
+    }
+
+    /// Untyped and `text/javascript` scripts are not treated as JSON-LD.
+    #[test]
+    fn ignores_regular_script_tags() {
+        let page = r#"
+            <script>console.log("not json-ld")</script>
+            <script type="text/javascript">var x = 1;</script>
+            <script type="application/ld+json">{"@type":"Product"}</script>
+        "#;
+
+        assert_eq!(extract_json_ld(page).len(), 1);
+    }
+
+    /// A page without any script tags yields no results.
+    #[test]
+    fn handles_no_json_ld() {
+        let page = "<html><body><p>No structured data here</p></body></html>";
+
+        assert!(extract_json_ld(page).is_empty());
+    }
+
+    /// The type attribute value is matched regardless of letter case.
+    #[test]
+    fn case_insensitive_type() {
+        let page = r#"
+            <script type="Application/LD+JSON">{"@type":"Product"}</script>
+        "#;
+
+        assert_eq!(extract_json_ld(page).len(), 1);
+    }
+
+    /// Pretty-printed JSON with surrounding whitespace still parses.
+    #[test]
+    fn handles_whitespace_in_json() {
+        let page = r#"
+            <script type="application/ld+json">
+                {
+                    "@type": "Product",
+                    "name": "Test"
+                }
+            </script>
+        "#;
+
+        let found = extract_json_ld(page);
+
+        assert_eq!(found.len(), 1);
+        assert_eq!(found[0]["name"], "Test");
+    }
+
+    /// A whitespace-only block is skipped; the following block is still found.
+    #[test]
+    fn empty_script_tag_skipped() {
+        let page = r#"
+            <script type="application/ld+json">   </script>
+            <script type="application/ld+json">{"@type":"Product"}</script>
+        "#;
+
+        assert_eq!(extract_json_ld(page).len(), 1);
+    }
+}