feat: extract __NEXT_DATA__ into structured_data

Next.js pages embed server-rendered data in <script id="__NEXT_DATA__">.
This data is now extracted as structured JSON (pageProps) and exposed in the structured_data field.

Tested on 45 sites — 13 return rich structured data including prices,
product info, and page state not visible in the DOM.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-04-02 16:04:51 +02:00
parent 4e81c3430d
commit 8d29382b25
5 changed files with 72 additions and 10 deletions

12
Cargo.lock generated
View file

@@ -3055,7 +3055,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"clap",
"dotenvy",
@@ -3075,7 +3075,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"ego-tree",
"once_cell",
@@ -3093,7 +3093,7 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"bytes",
"calamine",
@@ -3115,7 +3115,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"async-trait",
"reqwest",
@@ -3128,7 +3128,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"dotenvy",
"reqwest",
@@ -3148,7 +3148,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
version = "0.3.4"
version = "0.3.5"
dependencies = [
"pdf-extract",
"thiserror",