feat: extract __NEXT_DATA__ into structured_data

Next.js pages embed server-rendered data in <script id="__NEXT_DATA__">. Its pageProps payload is now extracted as structured JSON into the structured_data field. Tested on 45 sites — 13 return rich structured data, including prices, product info, and page state not visible in the DOM.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 00:06:21 +02:00 · 2026-04-02 16:04:51 +02:00 · 2026-04-02 16:04:51 +02:00 · 8d29382b25
commit 8d29382b25
parent 4e81c3430d
5 changed files with 72 additions and 10 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,13 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).

+## [0.3.5] — 2026-04-02
+
+### Added
+- **`__NEXT_DATA__` extraction**: Next.js pages now have their `pageProps` JSON extracted into `structured_data`. Contains prices, product info, page state, and other data that isn't in the visible HTML. Tested on 45 sites — 13 now return rich structured data (BBC, Forbes, Nike, Stripe, TripAdvisor, Glassdoor, NASA, etc.).
+
+---
+
 ## [0.3.4] — 2026-04-01

 ### Added
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3055,7 +3055,7 @@ dependencies = [

 [[package]]
 name = "webclaw-cli"
-version = "0.3.4"
+version = "0.3.5"
 dependencies = [
 "clap",
 "dotenvy",
@ -3075,7 +3075,7 @@ dependencies = [

 [[package]]
 name = "webclaw-core"
-version = "0.3.4"
+version = "0.3.5"
 dependencies = [
 "ego-tree",
 "once_cell",
@ -3093,7 +3093,7 @@ dependencies = [

 [[package]]
 name = "webclaw-fetch"
-version = "0.3.4"
+version = "0.3.5"
 dependencies = [
 "bytes",
 "calamine",
@ -3115,7 +3115,7 @@ dependencies = [

 [[package]]
 name = "webclaw-llm"
-version = "0.3.4"
+version = "0.3.5"
 dependencies = [
 "async-trait",
 "reqwest",
@ -3128,7 +3128,7 @@ dependencies = [

 [[package]]
 name = "webclaw-mcp"
-version = "0.3.4"
+version = "0.3.5"
 dependencies = [
 "dotenvy",
 "reqwest",
@ -3148,7 +3148,7 @@ dependencies = [

 [[package]]
 name = "webclaw-pdf"
-version = "0.3.4"
+version = "0.3.5"
 dependencies = [
 "pdf-extract",
 "thiserror",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]

 [workspace.package]
-version = "0.3.4"
+version = "0.3.5"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -179,8 +179,9 @@ pub fn extract_with_options(
    let domain_type = domain::detect(url, html);
    let domain_data = Some(DomainData { domain_type });

-    // Structured data: JSON-LD + SvelteKit data islands
+    // Structured data: JSON-LD + __NEXT_DATA__ + SvelteKit data islands
    let mut structured_data = structured_data::extract_json_ld(html);
+    structured_data.extend(structured_data::extract_next_data(html));
    structured_data.extend(structured_data::extract_sveltekit(html));

    Ok(ExtractionResult {
--- a/crates/webclaw-core/src/structured_data.rs
+++ b/crates/webclaw-core/src/structured_data.rs
@ -1,8 +1,9 @@
 /// Extract structured data from HTML.
 ///
-/// Handles two sources:
+/// Handles three sources:
 /// 1. JSON-LD (`<script type="application/ld+json">`) — e-commerce, news, recipes
-/// 2. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
+/// 2. `__NEXT_DATA__` (`<script id="__NEXT_DATA__" type="application/json">`) — Next.js pages
+/// 3. SvelteKit data islands (`kit.start(app, element, { data: [...] })`) — SPAs
 use serde_json::Value;

 /// Extract all JSON-LD blocks from raw HTML.
@ -62,6 +63,59 @@ pub fn extract_json_ld(html: &str) -> Vec<Value> {
    results
 }

+/// Extract `__NEXT_DATA__` from Next.js pages.
+///
+/// Next.js embeds server-rendered page data in:
+/// `<script id="__NEXT_DATA__" type="application/json">{...}</script>`
+///
+/// Returns a one-element vec holding `props.pageProps` when it is a non-empty object,
+/// otherwise the whole parsed object; the vec is empty when no JSON payload is found.
+pub fn extract_next_data(html: &str) -> Vec<Value> {
+    let Some(id_pos) = html.find("__NEXT_DATA__") else {
+        return Vec::new();
+    };
+
+    // Nearest `<script` before the FIRST `__NEXT_DATA__` match; later matches are never tried
+    let Some(tag_start) = html[..id_pos].rfind("<script") else {
+        return Vec::new();
+    };
+    let tag_region = &html[tag_start..];
+    // The first '>' after `<script` is taken as the end of the opening tag
+    let Some(tag_end) = tag_region.find('>') else {
+        return Vec::new();
+    };
+
+    let content_start = tag_start + tag_end + 1;
+    let remaining = &html[content_start..];
+    let Some(close) = remaining.find("</script>") else {
+        return Vec::new();
+    };
+    // Bodies shorter than 20 bytes after trimming are skipped without attempting to parse
+    let json_str = remaining[..close].trim();
+    if json_str.len() < 20 {
+        return Vec::new();
+    }
+
+    let Ok(data) = serde_json::from_str::<Value>(json_str) else {
+        return Vec::new();
+    };
+
+    // Preferred result: `props.pageProps`, but only when it is a non-empty JSON object
+    if let Some(page_props) = data.get("props").and_then(|p| p.get("pageProps"))
+        && page_props.is_object()
+        && page_props.as_object().is_some_and(|m| !m.is_empty())
+    {
+        return vec![page_props.clone()];
+    }
+
+    // Fallback: return the whole payload if pageProps is missing, empty, or not an object
+    if data.is_object() {
+        vec![data]
+    } else {
+        Vec::new()
+    }
+}
+
 /// Extract data from SvelteKit's `kit.start()` pattern.
 ///
 /// SvelteKit embeds page data inside: