mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat: extract __NEXT_DATA__ into structured_data
Next.js pages embed server-rendered data in <script id="__NEXT_DATA__">. Now extracted as structured JSON (pageProps) in the structured_data field. Tested on 45 sites — 13 return rich structured data including prices, product info, and page state not visible in the DOM. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4e81c3430d
commit
8d29382b25
5 changed files with 72 additions and 10 deletions
|
|
@ -3,6 +3,13 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.3.5] — 2026-04-02
|
||||
|
||||
### Added
|
||||
- **`__NEXT_DATA__` extraction**: Next.js pages now have their `pageProps` JSON extracted into `structured_data`. Contains prices, product info, page state, and other data that isn't in the visible HTML. Tested on 45 sites — 13 now return rich structured data (BBC, Forbes, Nike, Stripe, TripAdvisor, Glassdoor, NASA, etc.).
|
||||
|
||||
---
|
||||
|
||||
## [0.3.4] — 2026-04-01
|
||||
|
||||
### Added
|
||||
|
|
|
|||
12
Cargo.lock
generated
12
Cargo.lock
generated
|
|
@ -3055,7 +3055,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3075,7 +3075,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3093,7 +3093,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"calamine",
|
||||
|
|
@ -3115,7 +3115,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3128,7 +3128,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
dependencies = [
|
||||
"dotenvy",
|
||||
"reqwest",
|
||||
|
|
@ -3148,7 +3148,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -179,8 +179,9 @@ pub fn extract_with_options(
|
|||
let domain_type = domain::detect(url, html);
|
||||
let domain_data = Some(DomainData { domain_type });
|
||||
|
||||
// Structured data: JSON-LD + SvelteKit data islands
|
||||
// Structured data: JSON-LD + __NEXT_DATA__ + SvelteKit data islands
|
||||
let mut structured_data = structured_data::extract_json_ld(html);
|
||||
structured_data.extend(structured_data::extract_next_data(html));
|
||||
structured_data.extend(structured_data::extract_sveltekit(html));
|
||||
|
||||
Ok(ExtractionResult {
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
/// Extract structured data from HTML.
|
||||
///
|
||||
/// Handles two sources:
|
||||
/// Handles three sources:
|
||||
/// 1. JSON-LD (`<script type="application/ld+json">`) — e-commerce, news, recipes
|
||||
/// 2. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
|
||||
/// 2. `__NEXT_DATA__` (`<script id="__NEXT_DATA__" type="application/json">`) — Next.js pages
|
||||
/// 3. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
|
||||
use serde_json::Value;
|
||||
|
||||
/// Extract all JSON-LD blocks from raw HTML.
|
||||
|
|
@ -62,6 +63,59 @@ pub fn extract_json_ld(html: &str) -> Vec<Value> {
|
|||
results
|
||||
}
|
||||
|
||||
/// Extract `__NEXT_DATA__` from Next.js pages.
|
||||
///
|
||||
/// Next.js embeds server-rendered page data in:
|
||||
/// `<script id="__NEXT_DATA__" type="application/json">{...}</script>`
|
||||
///
|
||||
/// Returns the `pageProps` object (the actual page data), skipping Next.js
|
||||
/// internals like `buildId`, `isFallback`, etc.
|
||||
pub fn extract_next_data(html: &str) -> Vec<Value> {
|
||||
let Some(id_pos) = html.find("__NEXT_DATA__") else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
// Find the enclosing <script> tag
|
||||
let Some(tag_start) = html[..id_pos].rfind("<script") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let tag_region = &html[tag_start..];
|
||||
|
||||
let Some(tag_end) = tag_region.find('>') else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let content_start = tag_start + tag_end + 1;
|
||||
let remaining = &html[content_start..];
|
||||
let Some(close) = remaining.find("</script>") else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let json_str = remaining[..close].trim();
|
||||
if json_str.len() < 20 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let Ok(data) = serde_json::from_str::<Value>(json_str) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
// Extract pageProps — the actual page data
|
||||
if let Some(page_props) = data.get("props").and_then(|p| p.get("pageProps"))
|
||||
&& page_props.is_object()
|
||||
&& page_props.as_object().is_some_and(|m| !m.is_empty())
|
||||
{
|
||||
return vec![page_props.clone()];
|
||||
}
|
||||
|
||||
// Fallback: return the whole thing if pageProps is missing/empty
|
||||
if data.is_object() {
|
||||
vec![data]
|
||||
} else {
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract data from SvelteKit's `kit.start()` pattern.
|
||||
///
|
||||
/// SvelteKit embeds page data inside:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue