feat: v0.1.4 — QuickJS integration for inline JavaScript data extraction

Embeds QuickJS (rquickjs) to execute inline <script> tags and extract
data hidden in JavaScript variable assignments. Captures window.__*
objects like __preloadedData (NYTimes), __PRELOADED_STATE__ (Wired),
and self.__next_f (Next.js RSC flight data).

Results:
- NYTimes: 1,552 → 4,162 words (+168%)
- Wired: 1,459 → 9,937 words (+580%)
- Zero measurable performance overhead (<15ms per page)
- Feature-gated: disable with --no-default-features for WASM

Smart text filtering rejects CSS, base64, file paths, code strings.
Only readable prose is appended under "## Additional Content".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Valerio 2026-03-26 10:28:16 +01:00
parent 0c91c6d5a9
commit 32c035c543
6 changed files with 665 additions and 7 deletions

View file

@ -0,0 +1,596 @@
/// QuickJS-based extraction of data from inline JavaScript in HTML pages.
///
/// Many modern websites embed page data as JavaScript variable assignments
/// (e.g., `window.__PRELOADED_STATE__`, Next.js `self.__next_f`). The static
/// JSON data island approach (`data_island.rs`) only handles `<script type="application/json">`.
/// This module executes inline `<script>` tags in a sandboxed QuickJS runtime
/// to capture those JS-assigned data blobs.
use once_cell::sync::Lazy;
use regex::Regex;
use rquickjs::{Context, Runtime};
use scraper::{Html, Selector};
use tracing::debug;
static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
/// A blob of data extracted from JS execution.
pub struct JsDataBlob {
pub name: String,
pub data: String,
pub size: usize,
}
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
let doc = Html::parse_document(html);
let scripts: Vec<String> = doc
.select(&SCRIPT_SELECTOR)
.filter(|el| {
let v = el.value();
// Skip external scripts and ES modules
if v.attr("src").is_some() {
return false;
}
if v.attr("type").is_some_and(|t| t == "module") {
return false;
}
true
})
.map(|el| el.text().collect::<String>())
.filter(|s| !s.trim().is_empty())
.collect();
if scripts.is_empty() {
return Vec::new();
}
let rt = Runtime::new().expect("QuickJS runtime creation failed");
rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
rt.set_max_stack_size(1024 * 1024); // 1 MB
let ctx = Context::full(&rt).expect("QuickJS context creation failed");
ctx.with(|ctx| {
// Set up minimal browser stubs so scripts don't crash on missing globals.
// We don't need real implementations — just enough to avoid ReferenceErrors.
let setup = r#"
globalThis.window = globalThis;
globalThis.self = globalThis;
globalThis.document = {
createElement: function() { return { style: {}, setAttribute: function(){}, appendChild: function(){} }; },
getElementById: function() { return null; },
querySelector: function() { return null; },
querySelectorAll: function() { return []; },
addEventListener: function() {},
createEvent: function() { return { initEvent: function(){} }; },
createTextNode: function() { return {}; },
head: { appendChild: function(){}, removeChild: function(){} },
body: { appendChild: function(){}, removeChild: function(){} },
documentElement: { style: {} },
cookie: "",
readyState: "complete",
location: { href: "", hostname: "", pathname: "/" }
};
globalThis.navigator = {
userAgent: "Mozilla/5.0",
language: "en-US",
languages: ["en-US"],
platform: "Linux x86_64",
cookieEnabled: true
};
globalThis.location = { href: "", hostname: "", pathname: "/", search: "", hash: "" };
globalThis.history = { pushState: function(){}, replaceState: function(){} };
globalThis.setTimeout = function(fn) { if (typeof fn === "function") { try { fn(); } catch(e) {} } return 0; };
globalThis.clearTimeout = function() {};
globalThis.setInterval = function() { return 0; };
globalThis.clearInterval = function() {};
globalThis.requestAnimationFrame = function() { return 0; };
globalThis.cancelAnimationFrame = function() {};
globalThis.console = { log: function(){}, warn: function(){}, error: function(){}, info: function(){}, debug: function(){} };
globalThis.fetch = function() { return Promise.resolve({ json: function(){ return Promise.resolve({}); }, text: function(){ return Promise.resolve(""); } }); };
globalThis.XMLHttpRequest = function() { this.open = function(){}; this.send = function(){}; this.setRequestHeader = function(){}; };
globalThis.localStorage = { getItem: function(){ return null; }, setItem: function(){}, removeItem: function(){}, clear: function(){} };
globalThis.sessionStorage = { getItem: function(){ return null; }, setItem: function(){}, removeItem: function(){}, clear: function(){} };
globalThis.addEventListener = function() {};
globalThis.removeEventListener = function() {};
globalThis.dispatchEvent = function() {};
globalThis.getComputedStyle = function() { return {}; };
globalThis.matchMedia = function() { return { matches: false, addListener: function(){}, removeListener: function(){} }; };
globalThis.Image = function() {};
globalThis.Event = function() {};
globalThis.CustomEvent = function() {};
globalThis.MutationObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
globalThis.IntersectionObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
globalThis.ResizeObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
globalThis.performance = { now: function(){ return 0; }, mark: function(){}, measure: function(){} };
globalThis.crypto = { getRandomValues: function(arr) { return arr; } };
globalThis.URL = function(u) { this.href = u || ""; this.searchParams = { get: function(){ return null; } }; };
globalThis.Promise = Promise;
self.__next_f = self.__next_f || [];
"#;
let _ = ctx.eval::<(), _>(setup);
// Execute each inline script, silently ignoring errors
for script in &scripts {
let _ = ctx.eval::<(), _>(script.as_str());
}
// Scan window.__* properties for data blobs
let scan = r#"
(function() {
var results = [];
var keys = Object.keys(globalThis);
for (var i = 0; i < keys.length; i++) {
var key = keys[i];
if (key.indexOf("__") !== 0) continue;
var val = globalThis[key];
if (val === null || val === undefined) continue;
// __next_f is an array of RSC flight data chunks
if (key === "__next_f") {
if (Array.isArray(val) && val.length > 0) {
var json = JSON.stringify(val);
if (json.length > 100) {
results.push({ name: key, data: json, size: json.length });
}
}
continue;
}
if (typeof val === "object") {
try {
var json = JSON.stringify(val);
if (json && json.length > 100) {
results.push({ name: key, data: json, size: json.length });
}
} catch(e) {}
}
}
return JSON.stringify(results);
})()
"#;
let Ok(raw): Result<String, _> = ctx.eval(scan) else {
return Vec::new();
};
let Ok(entries) = serde_json::from_str::<Vec<RawBlob>>(&raw) else {
return Vec::new();
};
let blobs: Vec<JsDataBlob> = entries
.into_iter()
.map(|e| JsDataBlob {
name: e.name,
size: e.size,
data: e.data,
})
.collect();
if !blobs.is_empty() {
debug!(
count = blobs.len(),
names = blobs
.iter()
.map(|b| b.name.as_str())
.collect::<Vec<_>>()
.join(", "),
"extracted JS data blobs"
);
}
blobs
})
}
/// Intermediate deserialization target for the scan script output.
#[derive(serde::Deserialize)]
struct RawBlob {
name: String,
data: String,
size: usize,
}
/// Extract readable text from JS data blobs and format as markdown.
///
/// Walks each blob's JSON looking for human-readable strings, filters out
/// URLs/paths/CSS/base64, deduplicates, and joins into a single section.
pub fn extract_readable_text(blobs: &[JsDataBlob]) -> String {
let mut texts: Vec<String> = Vec::new();
let mut seen = std::collections::HashSet::new();
for blob in blobs {
if blob.name == "__next_f" {
let rsc_texts = extract_next_f_text(&blob.data);
for t in rsc_texts {
if seen.insert(t.clone()) {
texts.push(t);
}
}
continue;
}
let Ok(value) = serde_json::from_str::<serde_json::Value>(&blob.data) else {
continue;
};
let mut found = Vec::new();
walk_json_for_text(&value, &mut found, 0);
for t in found {
if seen.insert(t.clone()) {
texts.push(t);
}
}
}
if texts.is_empty() {
return String::new();
}
let mut md = String::from("## Additional Content\n\n");
md.push_str(&texts.join("\n\n"));
md
}
/// Recursively walk JSON and collect readable text strings.
fn walk_json_for_text(value: &serde_json::Value, out: &mut Vec<String>, depth: usize) {
if depth > 15 {
return;
}
match value {
serde_json::Value::String(s) => {
if let Some(clean) = filter_readable(s) {
out.push(clean);
}
}
serde_json::Value::Object(map) => {
for (_, v) in map {
walk_json_for_text(v, out, depth + 1);
}
}
serde_json::Value::Array(arr) => {
for v in arr {
walk_json_for_text(v, out, depth + 1);
}
}
_ => {}
}
}
/// Filter a string for readability: must be >15 chars, mostly alphabetic,
/// not a URL, file path, CSS rule, or base64 blob. Strips inline HTML tags.
fn filter_readable(s: &str) -> Option<String> {
let s = s.trim();
if s.len() <= 15 {
return None;
}
// Skip URLs
if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("//") {
return None;
}
// Skip file paths
if s.starts_with('/') || s.starts_with("./") || s.starts_with("../") {
return None;
}
// Skip CSS-like strings
if s.contains('{') && s.contains('}') && (s.contains(':') || s.contains(';')) {
return None;
}
// Skip CSS grid templates, layout strings, and dimension patterns
if s.contains("1fr")
|| s.contains("grid-")
|| s.contains("max-content")
|| s.contains("divider-v-")
|| s.contains("divider-h-")
{
return None;
}
// Skip CSS layout area definitions (e.g. "card1 card2 card3")
// These have repeated dash-separated tokens with digits
let dash_digit_tokens = s
.split_whitespace()
.filter(|w| w.contains('-') && w.chars().any(|c| c.is_ascii_digit()))
.count();
if dash_digit_tokens >= 2 {
return None;
}
// Skip strings containing literal quote characters (CSS grid areas, code snippets)
if s.contains('"') {
return None;
}
// Skip CSS grid area names and layout tokens.
// These are strings of short lowercase words/dots with no sentence structure.
if !s.chars().any(|c| c.is_uppercase()) {
let is_css_layout = s.split_whitespace().all(|w| {
w == "."
|| (w.len() <= 20
&& w.chars()
.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-'))
});
if is_css_layout {
return None;
}
}
// Skip CSS dimension strings (e.g. "16px 0px 0px 0px")
if s.split_whitespace().all(|w| {
w.ends_with("px") || w.ends_with("em") || w.ends_with("rem") || w.ends_with("%") || w == "0"
}) {
return None;
}
// Skip base64
if s.len() > 50 && !s.contains(' ') {
return None;
}
// Skip strings that are mostly HTML tags
if s.matches('<').count() > 3 && s.matches('>').count() > 3 {
let stripped = HTML_TAG_RE.replace_all(s, "");
if stripped.trim().len() < 15 {
return None;
}
}
// Skip strings ending with file extensions
if s.ends_with(".js")
|| s.ends_with(".css")
|| s.ends_with(".png")
|| s.ends_with(".jpg")
|| s.ends_with(".svg")
|| s.ends_with(".woff2")
{
return None;
}
// Must be mostly alphabetic (spaces + letters should dominate)
let alpha_space = s
.chars()
.filter(|c| c.is_alphabetic() || c.is_whitespace())
.count();
let ratio = alpha_space as f64 / s.len() as f64;
if ratio < 0.6 {
return None;
}
// Must contain spaces (prose, not a single token)
if !s.contains(' ') {
return None;
}
// Strip any inline HTML tags
let clean = HTML_TAG_RE.replace_all(s, "").trim().to_string();
if clean.len() <= 15 {
return None;
}
Some(clean)
}
/// Parse Next.js RSC flight data (`self.__next_f`) and extract readable text.
///
/// Wire format: array of `[type, payload]` tuples. Type 1 contains the actual
/// RSC data as newline-delimited entries like `id:TYPE|payload`.
fn extract_next_f_text(raw_json: &str) -> Vec<String> {
let Ok(entries) = serde_json::from_str::<Vec<serde_json::Value>>(raw_json) else {
return Vec::new();
};
// Concatenate all type=1 payloads
let mut wire = String::new();
for entry in &entries {
let arr = match entry.as_array() {
Some(a) if a.len() >= 2 => a,
_ => continue,
};
let entry_type = arr[0].as_u64().unwrap_or(0);
if entry_type != 1 {
continue;
}
if let Some(payload) = arr[1].as_str() {
wire.push_str(payload);
}
}
if wire.is_empty() {
return Vec::new();
}
let mut texts = Vec::new();
// Each line is `id:TYPE|payload` — parse the JSON payloads
for line in wire.lines() {
// Find the payload after the first `|` or `:` marker
let payload = if let Some(pos) = line.find('|') {
&line[pos + 1..]
} else {
continue;
};
// Try to parse as JSON array (RSC element representation)
if let Ok(value) = serde_json::from_str::<serde_json::Value>(payload) {
walk_rsc_tree(&value, &mut texts, 0);
}
}
texts
}
/// Walk an RSC tree element extracting children text content.
fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize) {
if depth > 20 {
return;
}
match value {
serde_json::Value::String(s) => {
if let Some(clean) = filter_readable(s) {
out.push(clean);
}
}
serde_json::Value::Array(arr) => {
for item in arr {
walk_rsc_tree(item, out, depth + 1);
}
}
serde_json::Value::Object(map) => {
// RSC elements have "children" that contain text
if let Some(children) = map.get("children") {
walk_rsc_tree(children, out, depth + 1);
}
// Also check other fields
for (key, v) in map {
if key == "children" {
continue;
}
walk_rsc_tree(v, out, depth + 1);
}
}
_ => {}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn extracts_window_preloaded_data() {
let html = r#"<html><body>
<script>
window.__preloadedData = {
"page": {
"title": "Hello World Article Title",
"body": "This is a longer paragraph of text that should be extracted from the preloaded data blob successfully."
}
};
</script>
</body></html>"#;
let blobs = extract_js_data(html);
assert!(!blobs.is_empty(), "should extract at least one blob");
assert!(
blobs.iter().any(|b| b.name == "__preloadedData"),
"should find __preloadedData"
);
let text = extract_readable_text(&blobs);
assert!(
text.contains("This is a longer paragraph"),
"should extract readable text from blob"
);
}
#[test]
fn skips_external_and_module_scripts() {
let html = r#"<html><body>
<script src="https://cdn.example.com/app.js"></script>
<script type="module">export default {};</script>
<script>window.__testData = {"content": "This is a test sentence that is long enough to be extracted from the page and it needs over one hundred characters of JSON to pass the threshold."};</script>
</body></html>"#;
let blobs = extract_js_data(html);
assert_eq!(
blobs.len(),
1,
"should only process inline non-module script"
);
assert_eq!(blobs[0].name, "__testData");
}
#[test]
fn empty_html_returns_no_blobs() {
let blobs = extract_js_data("<html><body></body></html>");
assert!(blobs.is_empty());
}
#[test]
fn filter_readable_rejects_junk() {
assert!(filter_readable("short").is_none());
assert!(filter_readable("https://example.com/some/long/path").is_none());
assert!(filter_readable("/static/js/bundle.min.js").is_none());
assert!(filter_readable("aGVsbG8gd29ybGQgdGhpcyBpcyBhIGJhc2U2NCBzdHJpbmc=").is_none());
assert!(filter_readable(".container { display: flex; padding: 10px; }").is_none());
}
#[test]
fn filter_readable_accepts_prose() {
let result = filter_readable("This is a normal sentence with enough words.");
assert!(result.is_some());
assert_eq!(
result.unwrap(),
"This is a normal sentence with enough words."
);
}
#[test]
fn strips_html_tags_from_text() {
let result = filter_readable(
"This has <strong>bold</strong> and <em>italic</em> formatting inside it.",
);
assert!(result.is_some());
let clean = result.unwrap();
assert!(!clean.contains('<'));
assert!(clean.contains("bold"));
assert!(clean.contains("italic"));
}
#[test]
fn extract_readable_text_produces_markdown() {
let blobs = vec![JsDataBlob {
name: "__data".to_string(),
data: r#"{"article":"This is the main article content that should appear in the extracted text."}"#
.to_string(),
size: 100,
}];
let text = extract_readable_text(&blobs);
assert!(text.starts_with("## Additional Content"));
assert!(text.contains("main article content"));
}
#[test]
fn extract_next_f_rsc_data() {
let blobs = vec![JsDataBlob {
name: "__next_f".to_string(),
data: r#"[[0,""],
[1,"0:T1234|{\"children\":\"This is some Next.js RSC flight data content that we want to extract.\"}\n"]]"#
.to_string(),
size: 200,
}];
let text = extract_readable_text(&blobs);
assert!(
text.contains("Next.js RSC flight data content"),
"should extract text from RSC flight data. Got: {text}"
);
}
#[test]
fn handles_script_errors_gracefully() {
// Scripts that throw errors should be silently ignored
let html = r#"<html><body>
<script>throw new Error("intentional crash");</script>
<script>undefined_function();</script>
<script>window.__survived = {"message": "This script ran after the errors and the data should still be found in the extracted blobs because it exceeds the minimum threshold."};</script>
</body></html>"#;
let blobs = extract_js_data(html);
assert!(
blobs.iter().any(|b| b.name == "__survived"),
"should extract data from scripts that succeed after failures"
);
}
}

View file

@ -9,6 +9,8 @@ pub mod diff;
pub mod domain;
pub mod error;
pub mod extractor;
#[cfg(feature = "quickjs")]
pub mod js_eval;
pub mod llm;
pub mod markdown;
pub mod metadata;
@ -157,6 +159,22 @@ pub fn extract_with_options(
meta.word_count = extractor::word_count(&content.markdown);
}
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
// static JSON data island extraction above with runtime-evaluated data.
#[cfg(feature = "quickjs")]
{
let blobs = js_eval::extract_js_data(html);
if !blobs.is_empty() {
let js_text = js_eval::extract_readable_text(&blobs);
if !js_text.is_empty() {
content.markdown.push_str("\n\n");
content.markdown.push_str(&js_text);
meta.word_count = extractor::word_count(&content.markdown);
}
}
}
// Domain detection from URL patterns and DOM heuristics
let domain_type = domain::detect(url, html);
let domain_data = Some(DomainData { domain_type });