feat: extract __NEXT_DATA__ into structured_data

Next.js pages embed server-rendered data in <script id="__NEXT_DATA__">.
Its pageProps object is now extracted as structured JSON into the
structured_data field.

Tested on 45 sites — 13 return rich structured data including prices,
product info, and page state not visible in the DOM.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-04-02 16:04:51 +02:00
parent 4e81c3430d
commit 8d29382b25
5 changed files with 72 additions and 10 deletions

View file

@ -3,6 +3,13 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.3.5] — 2026-04-02
### Added
- **`__NEXT_DATA__` extraction**: Next.js pages now have their `pageProps` JSON extracted into `structured_data`. Contains prices, product info, page state, and other data that isn't in the visible HTML. Tested on 45 sites — 13 now return rich structured data (BBC, Forbes, Nike, Stripe, TripAdvisor, Glassdoor, NASA, etc.).
---
## [0.3.4] — 2026-04-01
### Added

12
Cargo.lock generated
View file

@ -3055,7 +3055,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"clap",
"dotenvy",
@ -3075,7 +3075,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"ego-tree",
"once_cell",
@ -3093,7 +3093,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"bytes",
"calamine",
@ -3115,7 +3115,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"async-trait",
"reqwest",
@ -3128,7 +3128,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"dotenvy",
"reqwest",
@ -3148,7 +3148,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"pdf-extract",
"thiserror",

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
version = "0.3.4"
version = "0.3.5"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"

View file

@ -179,8 +179,9 @@ pub fn extract_with_options(
let domain_type = domain::detect(url, html);
let domain_data = Some(DomainData { domain_type });
// Structured data: JSON-LD + SvelteKit data islands
// Structured data: JSON-LD + __NEXT_DATA__ + SvelteKit data islands
let mut structured_data = structured_data::extract_json_ld(html);
structured_data.extend(structured_data::extract_next_data(html));
structured_data.extend(structured_data::extract_sveltekit(html));
Ok(ExtractionResult {

View file

@ -1,8 +1,9 @@
/// Extract structured data from HTML.
///
/// Handles two sources:
/// Handles three sources:
/// 1. JSON-LD (`<script type="application/ld+json">`) — e-commerce, news, recipes
/// 2. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
/// 2. `__NEXT_DATA__` (`<script id="__NEXT_DATA__" type="application/json">`) — Next.js pages
/// 3. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
use serde_json::Value;
/// Extract all JSON-LD blocks from raw HTML.
@ -62,6 +63,59 @@ pub fn extract_json_ld(html: &str) -> Vec<Value> {
results
}
/// Extract `__NEXT_DATA__` from Next.js pages.
///
/// Next.js embeds server-rendered page data in:
/// `<script id="__NEXT_DATA__" type="application/json">{...}</script>`
///
/// Returns a single-element vector holding the `props.pageProps` object (the
/// actual page data, without Next.js internals like `buildId`, `isFallback`,
/// etc.) when it is a non-empty object. If `pageProps` is missing, empty, or
/// not an object, the whole top-level JSON object is returned instead.
///
/// Returns an empty vector when no `<script>` tag carrying `__NEXT_DATA__` is
/// found, when its body is shorter than 20 bytes, when the body is not valid
/// JSON, or when the JSON root is not an object.
pub fn extract_next_data(html: &str) -> Vec<Value> {
    // Bodies shorter than this are rejected before attempting to parse.
    const MIN_JSON_LEN: usize = 20;

    let Some(json_str) = next_data_script_body(html) else {
        return Vec::new();
    };
    if json_str.len() < MIN_JSON_LEN {
        return Vec::new();
    }
    let Ok(data) = serde_json::from_str::<Value>(json_str) else {
        return Vec::new();
    };

    // Prefer pageProps — the actual page data — when it is a non-empty object.
    let page_props = data
        .get("props")
        .and_then(|props| props.get("pageProps"))
        .filter(|props| props.as_object().is_some_and(|map| !map.is_empty()));
    if let Some(page_props) = page_props {
        return vec![page_props.clone()];
    }

    // Fallback: return the whole thing if pageProps is missing/empty.
    if data.is_object() {
        vec![data]
    } else {
        Vec::new()
    }
}

/// Locate the trimmed body of the `<script>` element whose *opening tag*
/// mentions `__NEXT_DATA__`.
///
/// Every occurrence of the marker is examined in document order. Occurrences
/// that are not inside an opening `<script ...>` tag (for example
/// `window.__NEXT_DATA__` in an inline script, or the string in visible text)
/// are skipped so they cannot hide the real data island further down.
fn next_data_script_body(html: &str) -> Option<&str> {
    const MARKER: &str = "__NEXT_DATA__";

    let mut search_from = 0;
    while let Some(offset) = html[search_from..].find(MARKER) {
        let marker_pos = search_from + offset;
        // Resume after this occurrence if it turns out not to be usable.
        search_from = marker_pos + MARKER.len();

        // Nearest `<script` before the marker; none means the marker is in
        // plain markup/text, so try the next occurrence.
        let Some(tag_start) = html[..marker_pos].rfind("<script") else {
            continue;
        };
        // No `>` after the tag start means no later tag can be complete either.
        let tag_end = tag_start + html[tag_start..].find('>')?;
        if marker_pos > tag_end {
            // The tag closed before the marker: the marker lives in a script
            // body or later text, not in the tag's attributes.
            continue;
        }

        let body = &html[tag_end + 1..];
        let close = body.find("</script>")?;
        return Some(body[..close].trim());
    }
    None
}
/// Extract data from SvelteKit's `kit.start()` pattern.
///
/// SvelteKit embeds page data inside: