mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-07 22:15:12 +02:00
feat: extract __NEXT_DATA__ into structured_data
Next.js pages embed server-rendered data in <script id="__NEXT_DATA__">. This data is now extracted as structured JSON (pageProps) into the structured_data field. Tested on 45 sites; 13 of them return rich structured data, including prices, product info, and page state not visible in the DOM. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4e81c3430d
commit
8d29382b25
5 changed files with 72 additions and 10 deletions
|
|
@@ -179,8 +179,9 @@ pub fn extract_with_options(
|
|||
let domain_type = domain::detect(url, html);
|
||||
let domain_data = Some(DomainData { domain_type });
|
||||
|
||||
// Structured data: JSON-LD + SvelteKit data islands
|
||||
// Structured data: JSON-LD + __NEXT_DATA__ + SvelteKit data islands
|
||||
let mut structured_data = structured_data::extract_json_ld(html);
|
||||
structured_data.extend(structured_data::extract_next_data(html));
|
||||
structured_data.extend(structured_data::extract_sveltekit(html));
|
||||
|
||||
Ok(ExtractionResult {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue