mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: SvelteKit data extraction + license change to AGPL-3.0
- Extract structured JSON from SvelteKit kit.start() data arrays - Convert JS object literals (unquoted keys) to valid JSON - Data appears in structured_data field (machine-readable) - License changed from MIT to AGPL-3.0 - Bump to v0.3.4 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b4800e681c
commit
84b2e6092e
7 changed files with 846 additions and 31 deletions
|
|
@ -1,8 +1,8 @@
|
|||
/// Extract JSON-LD structured data from HTML.
|
||||
/// Extract structured data from HTML.
|
||||
///
|
||||
/// Parses `<script type="application/ld+json">` blocks commonly found in
|
||||
/// e-commerce, news, and recipe sites. Returns machine-readable product info,
|
||||
/// prices, availability, reviews, etc. without needing JS rendering or LLM.
|
||||
/// Handles two sources:
|
||||
/// 1. JSON-LD (`<script type="application/ld+json">`) — e-commerce, news, recipes
|
||||
/// 2. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
|
||||
use serde_json::Value;
|
||||
|
||||
/// Extract all JSON-LD blocks from raw HTML.
|
||||
|
|
@ -62,6 +62,164 @@ pub fn extract_json_ld(html: &str) -> Vec<Value> {
|
|||
results
|
||||
}
|
||||
|
||||
/// Extract data from SvelteKit's `kit.start()` pattern.
|
||||
///
|
||||
/// SvelteKit embeds page data inside:
|
||||
/// `kit.start(app, element, { data: [null, null, {"type":"data","data":{...}}] })`
|
||||
///
|
||||
/// Returns parsed JSON objects from the data array (skipping nulls).
|
||||
pub fn extract_sveltekit(html: &str) -> Vec<Value> {
|
||||
let Some(kit_pos) = html.find("kit.start(") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let region = &html[kit_pos..];
|
||||
|
||||
let Some(data_offset) = region.find("data: [") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let bracket_start = kit_pos + data_offset + "data: ".len();
|
||||
let bracket_region = &html[bracket_start..];
|
||||
|
||||
let Some(balanced) = extract_balanced(bracket_region, b'[', b']') else {
|
||||
return Vec::new();
|
||||
};
|
||||
if balanced.len() < 50 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
// SvelteKit uses JS object literals (unquoted keys). Convert to valid JSON.
|
||||
let json_str = js_literal_to_json(&balanced);
|
||||
let Ok(arr) = serde_json::from_str::<Vec<Value>>(&json_str) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let mut results = Vec::new();
|
||||
for item in arr {
|
||||
if item.is_null() {
|
||||
continue;
|
||||
}
|
||||
// SvelteKit wraps as {"type":"data","data":{...}} — unwrap if present
|
||||
if let Some(inner) = item.get("data")
|
||||
&& (inner.is_object() || inner.is_array())
|
||||
{
|
||||
results.push(inner.clone());
|
||||
continue;
|
||||
}
|
||||
if item.is_object() || item.is_array() {
|
||||
results.push(item);
|
||||
}
|
||||
}
|
||||
results
|
||||
}
|
||||
|
||||
/// Convert a JS object literal to valid JSON by quoting unquoted keys.
|
||||
///
|
||||
/// Handles: `{foo:"bar", baz:123}` → `{"foo":"bar", "baz":123}`
|
||||
/// Preserves already-quoted keys and string values.
|
||||
fn js_literal_to_json(input: &str) -> String {
|
||||
let bytes = input.as_bytes();
|
||||
let mut out = String::with_capacity(input.len() + input.len() / 10);
|
||||
let mut i = 0;
|
||||
let len = bytes.len();
|
||||
|
||||
while i < len {
|
||||
let b = bytes[i];
|
||||
|
||||
// Skip through strings
|
||||
if b == b'"' {
|
||||
out.push('"');
|
||||
i += 1;
|
||||
while i < len {
|
||||
let c = bytes[i];
|
||||
out.push(c as char);
|
||||
i += 1;
|
||||
if c == b'\\' && i < len {
|
||||
out.push(bytes[i] as char);
|
||||
i += 1;
|
||||
} else if c == b'"' {
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// After { or , — look for unquoted key followed by :
|
||||
if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
|
||||
out.push(b as char);
|
||||
i += 1;
|
||||
// Skip whitespace
|
||||
while i < len && bytes[i].is_ascii_whitespace() {
|
||||
out.push(bytes[i] as char);
|
||||
i += 1;
|
||||
}
|
||||
// Check if next is an unquoted identifier (key)
|
||||
if i < len && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_') {
|
||||
let key_start = i;
|
||||
while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
|
||||
i += 1;
|
||||
}
|
||||
let key = &input[key_start..i];
|
||||
// Skip whitespace after key
|
||||
while i < len && bytes[i].is_ascii_whitespace() {
|
||||
i += 1;
|
||||
}
|
||||
// If followed by :, it's an unquoted key — quote it
|
||||
if i < len && bytes[i] == b':' {
|
||||
out.push('"');
|
||||
out.push_str(key);
|
||||
out.push('"');
|
||||
} else {
|
||||
// Not a key — might be a bare value like true/false/null
|
||||
out.push_str(key);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
out.push(b as char);
|
||||
i += 1;
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
/// Extract content between balanced brackets, handling string escaping.
|
||||
fn extract_balanced(text: &str, open: u8, close: u8) -> Option<String> {
|
||||
if text.as_bytes().first()? != &open {
|
||||
return None;
|
||||
}
|
||||
let mut depth: i32 = 0;
|
||||
let mut in_string = false;
|
||||
let mut escape_next = false;
|
||||
|
||||
for (i, &b) in text.as_bytes().iter().enumerate() {
|
||||
if escape_next {
|
||||
escape_next = false;
|
||||
continue;
|
||||
}
|
||||
if b == b'\\' && in_string {
|
||||
escape_next = true;
|
||||
continue;
|
||||
}
|
||||
if b == b'"' {
|
||||
in_string = !in_string;
|
||||
continue;
|
||||
}
|
||||
if in_string {
|
||||
continue;
|
||||
}
|
||||
if b == open {
|
||||
depth += 1;
|
||||
} else if b == close {
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
return Some(text[..=i].to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue