mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-08 22:25:12 +02:00
Embeds QuickJS (rquickjs) to execute inline <script> tags and extract data hidden in JavaScript variable assignments. Captures window.__* objects like __preloadedData (NYTimes), __PRELOADED_STATE__ (Wired), and self.__next_f (Next.js RSC flight data). Results: - NYTimes: 1,552 → 4,162 words (+168%) - Wired: 1,459 → 9,937 words (+580%) - Zero measurable performance overhead (<15ms per page) - Feature-gated: disable with --no-default-features for WASM Smart text filtering rejects CSS, base64, file paths, code strings. Only readable prose is appended under "## Additional Content". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
596 lines
20 KiB
Rust
596 lines
20 KiB
Rust
/// QuickJS-based extraction of data from inline JavaScript in HTML pages.
|
|
///
|
|
/// Many modern websites embed page data as JavaScript variable assignments
|
|
/// (e.g., `window.__PRELOADED_STATE__`, Next.js `self.__next_f`). The static
|
|
/// JSON data island approach (`data_island.rs`) only handles `<script type="application/json">`.
|
|
/// This module executes inline `<script>` tags in a sandboxed QuickJS runtime
|
|
/// to capture those JS-assigned data blobs.
|
|
use once_cell::sync::Lazy;
|
|
use regex::Regex;
|
|
use rquickjs::{Context, Runtime};
|
|
use scraper::{Html, Selector};
|
|
use tracing::debug;
|
|
|
|
static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
|
|
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
|
|
|
|
/// A blob of data extracted from JS execution.
|
|
pub struct JsDataBlob {
|
|
pub name: String,
|
|
pub data: String,
|
|
pub size: usize,
|
|
}
|
|
|
|
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
|
|
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
|
|
let doc = Html::parse_document(html);
|
|
|
|
let scripts: Vec<String> = doc
|
|
.select(&SCRIPT_SELECTOR)
|
|
.filter(|el| {
|
|
let v = el.value();
|
|
// Skip external scripts and ES modules
|
|
if v.attr("src").is_some() {
|
|
return false;
|
|
}
|
|
if v.attr("type").is_some_and(|t| t == "module") {
|
|
return false;
|
|
}
|
|
true
|
|
})
|
|
.map(|el| el.text().collect::<String>())
|
|
.filter(|s| !s.trim().is_empty())
|
|
.collect();
|
|
|
|
if scripts.is_empty() {
|
|
return Vec::new();
|
|
}
|
|
|
|
let rt = Runtime::new().expect("QuickJS runtime creation failed");
|
|
rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
|
|
rt.set_max_stack_size(1024 * 1024); // 1 MB
|
|
|
|
let ctx = Context::full(&rt).expect("QuickJS context creation failed");
|
|
|
|
ctx.with(|ctx| {
|
|
// Set up minimal browser stubs so scripts don't crash on missing globals.
|
|
// We don't need real implementations — just enough to avoid ReferenceErrors.
|
|
let setup = r#"
|
|
globalThis.window = globalThis;
|
|
globalThis.self = globalThis;
|
|
globalThis.document = {
|
|
createElement: function() { return { style: {}, setAttribute: function(){}, appendChild: function(){} }; },
|
|
getElementById: function() { return null; },
|
|
querySelector: function() { return null; },
|
|
querySelectorAll: function() { return []; },
|
|
addEventListener: function() {},
|
|
createEvent: function() { return { initEvent: function(){} }; },
|
|
createTextNode: function() { return {}; },
|
|
head: { appendChild: function(){}, removeChild: function(){} },
|
|
body: { appendChild: function(){}, removeChild: function(){} },
|
|
documentElement: { style: {} },
|
|
cookie: "",
|
|
readyState: "complete",
|
|
location: { href: "", hostname: "", pathname: "/" }
|
|
};
|
|
globalThis.navigator = {
|
|
userAgent: "Mozilla/5.0",
|
|
language: "en-US",
|
|
languages: ["en-US"],
|
|
platform: "Linux x86_64",
|
|
cookieEnabled: true
|
|
};
|
|
globalThis.location = { href: "", hostname: "", pathname: "/", search: "", hash: "" };
|
|
globalThis.history = { pushState: function(){}, replaceState: function(){} };
|
|
globalThis.setTimeout = function(fn) { if (typeof fn === "function") { try { fn(); } catch(e) {} } return 0; };
|
|
globalThis.clearTimeout = function() {};
|
|
globalThis.setInterval = function() { return 0; };
|
|
globalThis.clearInterval = function() {};
|
|
globalThis.requestAnimationFrame = function() { return 0; };
|
|
globalThis.cancelAnimationFrame = function() {};
|
|
globalThis.console = { log: function(){}, warn: function(){}, error: function(){}, info: function(){}, debug: function(){} };
|
|
globalThis.fetch = function() { return Promise.resolve({ json: function(){ return Promise.resolve({}); }, text: function(){ return Promise.resolve(""); } }); };
|
|
globalThis.XMLHttpRequest = function() { this.open = function(){}; this.send = function(){}; this.setRequestHeader = function(){}; };
|
|
globalThis.localStorage = { getItem: function(){ return null; }, setItem: function(){}, removeItem: function(){}, clear: function(){} };
|
|
globalThis.sessionStorage = { getItem: function(){ return null; }, setItem: function(){}, removeItem: function(){}, clear: function(){} };
|
|
globalThis.addEventListener = function() {};
|
|
globalThis.removeEventListener = function() {};
|
|
globalThis.dispatchEvent = function() {};
|
|
globalThis.getComputedStyle = function() { return {}; };
|
|
globalThis.matchMedia = function() { return { matches: false, addListener: function(){}, removeListener: function(){} }; };
|
|
globalThis.Image = function() {};
|
|
globalThis.Event = function() {};
|
|
globalThis.CustomEvent = function() {};
|
|
globalThis.MutationObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
|
|
globalThis.IntersectionObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
|
|
globalThis.ResizeObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
|
|
globalThis.performance = { now: function(){ return 0; }, mark: function(){}, measure: function(){} };
|
|
globalThis.crypto = { getRandomValues: function(arr) { return arr; } };
|
|
globalThis.URL = function(u) { this.href = u || ""; this.searchParams = { get: function(){ return null; } }; };
|
|
globalThis.Promise = Promise;
|
|
self.__next_f = self.__next_f || [];
|
|
"#;
|
|
let _ = ctx.eval::<(), _>(setup);
|
|
|
|
// Execute each inline script, silently ignoring errors
|
|
for script in &scripts {
|
|
let _ = ctx.eval::<(), _>(script.as_str());
|
|
}
|
|
|
|
// Scan window.__* properties for data blobs
|
|
let scan = r#"
|
|
(function() {
|
|
var results = [];
|
|
var keys = Object.keys(globalThis);
|
|
for (var i = 0; i < keys.length; i++) {
|
|
var key = keys[i];
|
|
if (key.indexOf("__") !== 0) continue;
|
|
var val = globalThis[key];
|
|
if (val === null || val === undefined) continue;
|
|
|
|
// __next_f is an array of RSC flight data chunks
|
|
if (key === "__next_f") {
|
|
if (Array.isArray(val) && val.length > 0) {
|
|
var json = JSON.stringify(val);
|
|
if (json.length > 100) {
|
|
results.push({ name: key, data: json, size: json.length });
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if (typeof val === "object") {
|
|
try {
|
|
var json = JSON.stringify(val);
|
|
if (json && json.length > 100) {
|
|
results.push({ name: key, data: json, size: json.length });
|
|
}
|
|
} catch(e) {}
|
|
}
|
|
}
|
|
return JSON.stringify(results);
|
|
})()
|
|
"#;
|
|
|
|
let Ok(raw): Result<String, _> = ctx.eval(scan) else {
|
|
return Vec::new();
|
|
};
|
|
|
|
let Ok(entries) = serde_json::from_str::<Vec<RawBlob>>(&raw) else {
|
|
return Vec::new();
|
|
};
|
|
|
|
let blobs: Vec<JsDataBlob> = entries
|
|
.into_iter()
|
|
.map(|e| JsDataBlob {
|
|
name: e.name,
|
|
size: e.size,
|
|
data: e.data,
|
|
})
|
|
.collect();
|
|
|
|
if !blobs.is_empty() {
|
|
debug!(
|
|
count = blobs.len(),
|
|
names = blobs
|
|
.iter()
|
|
.map(|b| b.name.as_str())
|
|
.collect::<Vec<_>>()
|
|
.join(", "),
|
|
"extracted JS data blobs"
|
|
);
|
|
}
|
|
|
|
blobs
|
|
})
|
|
}
|
|
|
|
/// Intermediate deserialization target for the scan script output.
|
|
#[derive(serde::Deserialize)]
|
|
struct RawBlob {
|
|
name: String,
|
|
data: String,
|
|
size: usize,
|
|
}
|
|
|
|
/// Extract readable text from JS data blobs and format as markdown.
|
|
///
|
|
/// Walks each blob's JSON looking for human-readable strings, filters out
|
|
/// URLs/paths/CSS/base64, deduplicates, and joins into a single section.
|
|
pub fn extract_readable_text(blobs: &[JsDataBlob]) -> String {
|
|
let mut texts: Vec<String> = Vec::new();
|
|
let mut seen = std::collections::HashSet::new();
|
|
|
|
for blob in blobs {
|
|
if blob.name == "__next_f" {
|
|
let rsc_texts = extract_next_f_text(&blob.data);
|
|
for t in rsc_texts {
|
|
if seen.insert(t.clone()) {
|
|
texts.push(t);
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
|
|
let Ok(value) = serde_json::from_str::<serde_json::Value>(&blob.data) else {
|
|
continue;
|
|
};
|
|
|
|
let mut found = Vec::new();
|
|
walk_json_for_text(&value, &mut found, 0);
|
|
|
|
for t in found {
|
|
if seen.insert(t.clone()) {
|
|
texts.push(t);
|
|
}
|
|
}
|
|
}
|
|
|
|
if texts.is_empty() {
|
|
return String::new();
|
|
}
|
|
|
|
let mut md = String::from("## Additional Content\n\n");
|
|
md.push_str(&texts.join("\n\n"));
|
|
md
|
|
}
|
|
|
|
/// Recursively walk JSON and collect readable text strings.
|
|
fn walk_json_for_text(value: &serde_json::Value, out: &mut Vec<String>, depth: usize) {
|
|
if depth > 15 {
|
|
return;
|
|
}
|
|
|
|
match value {
|
|
serde_json::Value::String(s) => {
|
|
if let Some(clean) = filter_readable(s) {
|
|
out.push(clean);
|
|
}
|
|
}
|
|
serde_json::Value::Object(map) => {
|
|
for (_, v) in map {
|
|
walk_json_for_text(v, out, depth + 1);
|
|
}
|
|
}
|
|
serde_json::Value::Array(arr) => {
|
|
for v in arr {
|
|
walk_json_for_text(v, out, depth + 1);
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
/// Filter a string for readability: must be >15 chars, mostly alphabetic,
|
|
/// not a URL, file path, CSS rule, or base64 blob. Strips inline HTML tags.
|
|
fn filter_readable(s: &str) -> Option<String> {
|
|
let s = s.trim();
|
|
if s.len() <= 15 {
|
|
return None;
|
|
}
|
|
|
|
// Skip URLs
|
|
if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("//") {
|
|
return None;
|
|
}
|
|
|
|
// Skip file paths
|
|
if s.starts_with('/') || s.starts_with("./") || s.starts_with("../") {
|
|
return None;
|
|
}
|
|
|
|
// Skip CSS-like strings
|
|
if s.contains('{') && s.contains('}') && (s.contains(':') || s.contains(';')) {
|
|
return None;
|
|
}
|
|
|
|
// Skip CSS grid templates, layout strings, and dimension patterns
|
|
if s.contains("1fr")
|
|
|| s.contains("grid-")
|
|
|| s.contains("max-content")
|
|
|| s.contains("divider-v-")
|
|
|| s.contains("divider-h-")
|
|
{
|
|
return None;
|
|
}
|
|
|
|
// Skip CSS layout area definitions (e.g. "card1 card2 card3")
|
|
// These have repeated dash-separated tokens with digits
|
|
let dash_digit_tokens = s
|
|
.split_whitespace()
|
|
.filter(|w| w.contains('-') && w.chars().any(|c| c.is_ascii_digit()))
|
|
.count();
|
|
if dash_digit_tokens >= 2 {
|
|
return None;
|
|
}
|
|
|
|
// Skip strings containing literal quote characters (CSS grid areas, code snippets)
|
|
if s.contains('"') {
|
|
return None;
|
|
}
|
|
|
|
// Skip CSS grid area names and layout tokens.
|
|
// These are strings of short lowercase words/dots with no sentence structure.
|
|
if !s.chars().any(|c| c.is_uppercase()) {
|
|
let is_css_layout = s.split_whitespace().all(|w| {
|
|
w == "."
|
|
|| (w.len() <= 20
|
|
&& w.chars()
|
|
.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-'))
|
|
});
|
|
if is_css_layout {
|
|
return None;
|
|
}
|
|
}
|
|
|
|
// Skip CSS dimension strings (e.g. "16px 0px 0px 0px")
|
|
if s.split_whitespace().all(|w| {
|
|
w.ends_with("px") || w.ends_with("em") || w.ends_with("rem") || w.ends_with("%") || w == "0"
|
|
}) {
|
|
return None;
|
|
}
|
|
|
|
// Skip base64
|
|
if s.len() > 50 && !s.contains(' ') {
|
|
return None;
|
|
}
|
|
|
|
// Skip strings that are mostly HTML tags
|
|
if s.matches('<').count() > 3 && s.matches('>').count() > 3 {
|
|
let stripped = HTML_TAG_RE.replace_all(s, "");
|
|
if stripped.trim().len() < 15 {
|
|
return None;
|
|
}
|
|
}
|
|
|
|
// Skip strings ending with file extensions
|
|
if s.ends_with(".js")
|
|
|| s.ends_with(".css")
|
|
|| s.ends_with(".png")
|
|
|| s.ends_with(".jpg")
|
|
|| s.ends_with(".svg")
|
|
|| s.ends_with(".woff2")
|
|
{
|
|
return None;
|
|
}
|
|
|
|
// Must be mostly alphabetic (spaces + letters should dominate)
|
|
let alpha_space = s
|
|
.chars()
|
|
.filter(|c| c.is_alphabetic() || c.is_whitespace())
|
|
.count();
|
|
let ratio = alpha_space as f64 / s.len() as f64;
|
|
if ratio < 0.6 {
|
|
return None;
|
|
}
|
|
|
|
// Must contain spaces (prose, not a single token)
|
|
if !s.contains(' ') {
|
|
return None;
|
|
}
|
|
|
|
// Strip any inline HTML tags
|
|
let clean = HTML_TAG_RE.replace_all(s, "").trim().to_string();
|
|
|
|
if clean.len() <= 15 {
|
|
return None;
|
|
}
|
|
|
|
Some(clean)
|
|
}
|
|
|
|
/// Parse Next.js RSC flight data (`self.__next_f`) and extract readable text.
|
|
///
|
|
/// Wire format: array of `[type, payload]` tuples. Type 1 contains the actual
|
|
/// RSC data as newline-delimited entries like `id:TYPE|payload`.
|
|
fn extract_next_f_text(raw_json: &str) -> Vec<String> {
|
|
let Ok(entries) = serde_json::from_str::<Vec<serde_json::Value>>(raw_json) else {
|
|
return Vec::new();
|
|
};
|
|
|
|
// Concatenate all type=1 payloads
|
|
let mut wire = String::new();
|
|
for entry in &entries {
|
|
let arr = match entry.as_array() {
|
|
Some(a) if a.len() >= 2 => a,
|
|
_ => continue,
|
|
};
|
|
let entry_type = arr[0].as_u64().unwrap_or(0);
|
|
if entry_type != 1 {
|
|
continue;
|
|
}
|
|
if let Some(payload) = arr[1].as_str() {
|
|
wire.push_str(payload);
|
|
}
|
|
}
|
|
|
|
if wire.is_empty() {
|
|
return Vec::new();
|
|
}
|
|
|
|
let mut texts = Vec::new();
|
|
|
|
// Each line is `id:TYPE|payload` — parse the JSON payloads
|
|
for line in wire.lines() {
|
|
// Find the payload after the first `|` or `:` marker
|
|
let payload = if let Some(pos) = line.find('|') {
|
|
&line[pos + 1..]
|
|
} else {
|
|
continue;
|
|
};
|
|
|
|
// Try to parse as JSON array (RSC element representation)
|
|
if let Ok(value) = serde_json::from_str::<serde_json::Value>(payload) {
|
|
walk_rsc_tree(&value, &mut texts, 0);
|
|
}
|
|
}
|
|
|
|
texts
|
|
}
|
|
|
|
/// Walk an RSC tree element extracting children text content.
|
|
fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize) {
|
|
if depth > 20 {
|
|
return;
|
|
}
|
|
|
|
match value {
|
|
serde_json::Value::String(s) => {
|
|
if let Some(clean) = filter_readable(s) {
|
|
out.push(clean);
|
|
}
|
|
}
|
|
serde_json::Value::Array(arr) => {
|
|
for item in arr {
|
|
walk_rsc_tree(item, out, depth + 1);
|
|
}
|
|
}
|
|
serde_json::Value::Object(map) => {
|
|
// RSC elements have "children" that contain text
|
|
if let Some(children) = map.get("children") {
|
|
walk_rsc_tree(children, out, depth + 1);
|
|
}
|
|
// Also check other fields
|
|
for (key, v) in map {
|
|
if key == "children" {
|
|
continue;
|
|
}
|
|
walk_rsc_tree(v, out, depth + 1);
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn extracts_window_preloaded_data() {
|
|
let html = r#"<html><body>
|
|
<script>
|
|
window.__preloadedData = {
|
|
"page": {
|
|
"title": "Hello World Article Title",
|
|
"body": "This is a longer paragraph of text that should be extracted from the preloaded data blob successfully."
|
|
}
|
|
};
|
|
</script>
|
|
</body></html>"#;
|
|
|
|
let blobs = extract_js_data(html);
|
|
assert!(!blobs.is_empty(), "should extract at least one blob");
|
|
assert!(
|
|
blobs.iter().any(|b| b.name == "__preloadedData"),
|
|
"should find __preloadedData"
|
|
);
|
|
|
|
let text = extract_readable_text(&blobs);
|
|
assert!(
|
|
text.contains("This is a longer paragraph"),
|
|
"should extract readable text from blob"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn skips_external_and_module_scripts() {
|
|
let html = r#"<html><body>
|
|
<script src="https://cdn.example.com/app.js"></script>
|
|
<script type="module">export default {};</script>
|
|
<script>window.__testData = {"content": "This is a test sentence that is long enough to be extracted from the page and it needs over one hundred characters of JSON to pass the threshold."};</script>
|
|
</body></html>"#;
|
|
|
|
let blobs = extract_js_data(html);
|
|
assert_eq!(
|
|
blobs.len(),
|
|
1,
|
|
"should only process inline non-module script"
|
|
);
|
|
assert_eq!(blobs[0].name, "__testData");
|
|
}
|
|
|
|
#[test]
|
|
fn empty_html_returns_no_blobs() {
|
|
let blobs = extract_js_data("<html><body></body></html>");
|
|
assert!(blobs.is_empty());
|
|
}
|
|
|
|
#[test]
|
|
fn filter_readable_rejects_junk() {
|
|
assert!(filter_readable("short").is_none());
|
|
assert!(filter_readable("https://example.com/some/long/path").is_none());
|
|
assert!(filter_readable("/static/js/bundle.min.js").is_none());
|
|
assert!(filter_readable("aGVsbG8gd29ybGQgdGhpcyBpcyBhIGJhc2U2NCBzdHJpbmc=").is_none());
|
|
assert!(filter_readable(".container { display: flex; padding: 10px; }").is_none());
|
|
}
|
|
|
|
#[test]
|
|
fn filter_readable_accepts_prose() {
|
|
let result = filter_readable("This is a normal sentence with enough words.");
|
|
assert!(result.is_some());
|
|
assert_eq!(
|
|
result.unwrap(),
|
|
"This is a normal sentence with enough words."
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn strips_html_tags_from_text() {
|
|
let result = filter_readable(
|
|
"This has <strong>bold</strong> and <em>italic</em> formatting inside it.",
|
|
);
|
|
assert!(result.is_some());
|
|
let clean = result.unwrap();
|
|
assert!(!clean.contains('<'));
|
|
assert!(clean.contains("bold"));
|
|
assert!(clean.contains("italic"));
|
|
}
|
|
|
|
#[test]
|
|
fn extract_readable_text_produces_markdown() {
|
|
let blobs = vec![JsDataBlob {
|
|
name: "__data".to_string(),
|
|
data: r#"{"article":"This is the main article content that should appear in the extracted text."}"#
|
|
.to_string(),
|
|
size: 100,
|
|
}];
|
|
|
|
let text = extract_readable_text(&blobs);
|
|
assert!(text.starts_with("## Additional Content"));
|
|
assert!(text.contains("main article content"));
|
|
}
|
|
|
|
#[test]
|
|
fn extract_next_f_rsc_data() {
|
|
let blobs = vec![JsDataBlob {
|
|
name: "__next_f".to_string(),
|
|
data: r#"[[0,""],
|
|
[1,"0:T1234|{\"children\":\"This is some Next.js RSC flight data content that we want to extract.\"}\n"]]"#
|
|
.to_string(),
|
|
size: 200,
|
|
}];
|
|
|
|
let text = extract_readable_text(&blobs);
|
|
assert!(
|
|
text.contains("Next.js RSC flight data content"),
|
|
"should extract text from RSC flight data. Got: {text}"
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn handles_script_errors_gracefully() {
|
|
// Scripts that throw errors should be silently ignored
|
|
let html = r#"<html><body>
|
|
<script>throw new Error("intentional crash");</script>
|
|
<script>undefined_function();</script>
|
|
<script>window.__survived = {"message": "This script ran after the errors and the data should still be found in the extracted blobs because it exceeds the minimum threshold."};</script>
|
|
</body></html>"#;
|
|
|
|
let blobs = extract_js_data(html);
|
|
assert!(
|
|
blobs.iter().any(|b| b.name == "__survived"),
|
|
"should extract data from scripts that succeed after failures"
|
|
);
|
|
}
|
|
}
|