feat: v0.1.4 — QuickJS integration for inline JavaScript data extraction

Embeds QuickJS (rquickjs) to execute inline <script> tags and extract
data hidden in JavaScript variable assignments. Captures window.__*
objects like __preloadedData (NYTimes), __PRELOADED_STATE__ (Wired),
and self.__next_f (Next.js RSC flight data).

Results:
- NYTimes: 1,552 → 4,162 words (+168%)
- Wired: 1,459 → 9,937 words (+580%)
- Negligible performance overhead (<15 ms per page)
- Feature-gated behind the `quickjs` feature: build with --no-default-features to disable it (e.g. for WASM targets)

Smart text filtering rejects CSS, base64, file paths, and code strings.
Only readable prose is appended under "## Additional Content".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-03-26 10:28:16 +01:00
parent 0c91c6d5a9
commit 32c035c543
6 changed files with 665 additions and 7 deletions

View file

@ -9,6 +9,8 @@ pub mod diff;
pub mod domain;
pub mod error;
pub mod extractor;
#[cfg(feature = "quickjs")]
pub mod js_eval;
pub mod llm;
pub mod markdown;
pub mod metadata;
@ -157,6 +159,22 @@ pub fn extract_with_options(
meta.word_count = extractor::word_count(&content.markdown);
}
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
// static JSON data island extraction above with runtime-evaluated data.
#[cfg(feature = "quickjs")]
{
let blobs = js_eval::extract_js_data(html);
if !blobs.is_empty() {
let js_text = js_eval::extract_readable_text(&blobs);
if !js_text.is_empty() {
content.markdown.push_str("\n\n");
content.markdown.push_str(&js_text);
meta.word_count = extractor::word_count(&content.markdown);
}
}
}
// Domain detection from URL patterns and DOM heuristics
let domain_type = domain::detect(url, html);
let domain_data = Some(DomainData { domain_type });