mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-05-15 18:25:24 +02:00
feat: v0.1.4 — QuickJS integration for inline JavaScript data extraction
Embeds QuickJS (rquickjs) to execute inline <script> tags and extract data hidden in JavaScript variable assignments. Captures window.__* objects like __preloadedData (NYTimes), __PRELOADED_STATE__ (Wired), and self.__next_f (Next.js RSC flight data). Results: - NYTimes: 1,552 → 4,162 words (+168%) - Wired: 1,459 → 9,937 words (+580%) - Zero measurable performance overhead (<15ms per page) - Feature-gated: disable with --no-default-features for WASM Smart text filtering rejects CSS, base64, file paths, code strings. Only readable prose is appended under "## Additional Content". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0c91c6d5a9
commit
32c035c543
6 changed files with 665 additions and 7 deletions
596
crates/webclaw-core/src/js_eval.rs
Normal file
596
crates/webclaw-core/src/js_eval.rs
Normal file
|
|
@ -0,0 +1,596 @@
|
|||
/// QuickJS-based extraction of data from inline JavaScript in HTML pages.
|
||||
///
|
||||
/// Many modern websites embed page data as JavaScript variable assignments
|
||||
/// (e.g., `window.__PRELOADED_STATE__`, Next.js `self.__next_f`). The static
|
||||
/// JSON data island approach (`data_island.rs`) only handles `<script type="application/json">`.
|
||||
/// This module executes inline `<script>` tags in a sandboxed QuickJS runtime
|
||||
/// to capture those JS-assigned data blobs.
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use rquickjs::{Context, Runtime};
|
||||
use scraper::{Html, Selector};
|
||||
use tracing::debug;
|
||||
|
||||
static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
|
||||
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
|
||||
|
||||
/// A blob of data extracted from JS execution.
|
||||
pub struct JsDataBlob {
|
||||
pub name: String,
|
||||
pub data: String,
|
||||
pub size: usize,
|
||||
}
|
||||
|
||||
/// Execute inline `<script>` tags in a QuickJS sandbox and extract `window.__*` data.
|
||||
pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
|
||||
let doc = Html::parse_document(html);
|
||||
|
||||
let scripts: Vec<String> = doc
|
||||
.select(&SCRIPT_SELECTOR)
|
||||
.filter(|el| {
|
||||
let v = el.value();
|
||||
// Skip external scripts and ES modules
|
||||
if v.attr("src").is_some() {
|
||||
return false;
|
||||
}
|
||||
if v.attr("type").is_some_and(|t| t == "module") {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})
|
||||
.map(|el| el.text().collect::<String>())
|
||||
.filter(|s| !s.trim().is_empty())
|
||||
.collect();
|
||||
|
||||
if scripts.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let rt = Runtime::new().expect("QuickJS runtime creation failed");
|
||||
rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
|
||||
rt.set_max_stack_size(1024 * 1024); // 1 MB
|
||||
|
||||
let ctx = Context::full(&rt).expect("QuickJS context creation failed");
|
||||
|
||||
ctx.with(|ctx| {
|
||||
// Set up minimal browser stubs so scripts don't crash on missing globals.
|
||||
// We don't need real implementations — just enough to avoid ReferenceErrors.
|
||||
let setup = r#"
|
||||
globalThis.window = globalThis;
|
||||
globalThis.self = globalThis;
|
||||
globalThis.document = {
|
||||
createElement: function() { return { style: {}, setAttribute: function(){}, appendChild: function(){} }; },
|
||||
getElementById: function() { return null; },
|
||||
querySelector: function() { return null; },
|
||||
querySelectorAll: function() { return []; },
|
||||
addEventListener: function() {},
|
||||
createEvent: function() { return { initEvent: function(){} }; },
|
||||
createTextNode: function() { return {}; },
|
||||
head: { appendChild: function(){}, removeChild: function(){} },
|
||||
body: { appendChild: function(){}, removeChild: function(){} },
|
||||
documentElement: { style: {} },
|
||||
cookie: "",
|
||||
readyState: "complete",
|
||||
location: { href: "", hostname: "", pathname: "/" }
|
||||
};
|
||||
globalThis.navigator = {
|
||||
userAgent: "Mozilla/5.0",
|
||||
language: "en-US",
|
||||
languages: ["en-US"],
|
||||
platform: "Linux x86_64",
|
||||
cookieEnabled: true
|
||||
};
|
||||
globalThis.location = { href: "", hostname: "", pathname: "/", search: "", hash: "" };
|
||||
globalThis.history = { pushState: function(){}, replaceState: function(){} };
|
||||
globalThis.setTimeout = function(fn) { if (typeof fn === "function") { try { fn(); } catch(e) {} } return 0; };
|
||||
globalThis.clearTimeout = function() {};
|
||||
globalThis.setInterval = function() { return 0; };
|
||||
globalThis.clearInterval = function() {};
|
||||
globalThis.requestAnimationFrame = function() { return 0; };
|
||||
globalThis.cancelAnimationFrame = function() {};
|
||||
globalThis.console = { log: function(){}, warn: function(){}, error: function(){}, info: function(){}, debug: function(){} };
|
||||
globalThis.fetch = function() { return Promise.resolve({ json: function(){ return Promise.resolve({}); }, text: function(){ return Promise.resolve(""); } }); };
|
||||
globalThis.XMLHttpRequest = function() { this.open = function(){}; this.send = function(){}; this.setRequestHeader = function(){}; };
|
||||
globalThis.localStorage = { getItem: function(){ return null; }, setItem: function(){}, removeItem: function(){}, clear: function(){} };
|
||||
globalThis.sessionStorage = { getItem: function(){ return null; }, setItem: function(){}, removeItem: function(){}, clear: function(){} };
|
||||
globalThis.addEventListener = function() {};
|
||||
globalThis.removeEventListener = function() {};
|
||||
globalThis.dispatchEvent = function() {};
|
||||
globalThis.getComputedStyle = function() { return {}; };
|
||||
globalThis.matchMedia = function() { return { matches: false, addListener: function(){}, removeListener: function(){} }; };
|
||||
globalThis.Image = function() {};
|
||||
globalThis.Event = function() {};
|
||||
globalThis.CustomEvent = function() {};
|
||||
globalThis.MutationObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
|
||||
globalThis.IntersectionObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
|
||||
globalThis.ResizeObserver = function() { this.observe = function(){}; this.disconnect = function(){}; };
|
||||
globalThis.performance = { now: function(){ return 0; }, mark: function(){}, measure: function(){} };
|
||||
globalThis.crypto = { getRandomValues: function(arr) { return arr; } };
|
||||
globalThis.URL = function(u) { this.href = u || ""; this.searchParams = { get: function(){ return null; } }; };
|
||||
globalThis.Promise = Promise;
|
||||
self.__next_f = self.__next_f || [];
|
||||
"#;
|
||||
let _ = ctx.eval::<(), _>(setup);
|
||||
|
||||
// Execute each inline script, silently ignoring errors
|
||||
for script in &scripts {
|
||||
let _ = ctx.eval::<(), _>(script.as_str());
|
||||
}
|
||||
|
||||
// Scan window.__* properties for data blobs
|
||||
let scan = r#"
|
||||
(function() {
|
||||
var results = [];
|
||||
var keys = Object.keys(globalThis);
|
||||
for (var i = 0; i < keys.length; i++) {
|
||||
var key = keys[i];
|
||||
if (key.indexOf("__") !== 0) continue;
|
||||
var val = globalThis[key];
|
||||
if (val === null || val === undefined) continue;
|
||||
|
||||
// __next_f is an array of RSC flight data chunks
|
||||
if (key === "__next_f") {
|
||||
if (Array.isArray(val) && val.length > 0) {
|
||||
var json = JSON.stringify(val);
|
||||
if (json.length > 100) {
|
||||
results.push({ name: key, data: json, size: json.length });
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (typeof val === "object") {
|
||||
try {
|
||||
var json = JSON.stringify(val);
|
||||
if (json && json.length > 100) {
|
||||
results.push({ name: key, data: json, size: json.length });
|
||||
}
|
||||
} catch(e) {}
|
||||
}
|
||||
}
|
||||
return JSON.stringify(results);
|
||||
})()
|
||||
"#;
|
||||
|
||||
let Ok(raw): Result<String, _> = ctx.eval(scan) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let Ok(entries) = serde_json::from_str::<Vec<RawBlob>>(&raw) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let blobs: Vec<JsDataBlob> = entries
|
||||
.into_iter()
|
||||
.map(|e| JsDataBlob {
|
||||
name: e.name,
|
||||
size: e.size,
|
||||
data: e.data,
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !blobs.is_empty() {
|
||||
debug!(
|
||||
count = blobs.len(),
|
||||
names = blobs
|
||||
.iter()
|
||||
.map(|b| b.name.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", "),
|
||||
"extracted JS data blobs"
|
||||
);
|
||||
}
|
||||
|
||||
blobs
|
||||
})
|
||||
}
|
||||
|
||||
/// Intermediate deserialization target for the scan script output.
|
||||
#[derive(serde::Deserialize)]
|
||||
struct RawBlob {
|
||||
name: String,
|
||||
data: String,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
/// Extract readable text from JS data blobs and format as markdown.
|
||||
///
|
||||
/// Walks each blob's JSON looking for human-readable strings, filters out
|
||||
/// URLs/paths/CSS/base64, deduplicates, and joins into a single section.
|
||||
pub fn extract_readable_text(blobs: &[JsDataBlob]) -> String {
|
||||
let mut texts: Vec<String> = Vec::new();
|
||||
let mut seen = std::collections::HashSet::new();
|
||||
|
||||
for blob in blobs {
|
||||
if blob.name == "__next_f" {
|
||||
let rsc_texts = extract_next_f_text(&blob.data);
|
||||
for t in rsc_texts {
|
||||
if seen.insert(t.clone()) {
|
||||
texts.push(t);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let Ok(value) = serde_json::from_str::<serde_json::Value>(&blob.data) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let mut found = Vec::new();
|
||||
walk_json_for_text(&value, &mut found, 0);
|
||||
|
||||
for t in found {
|
||||
if seen.insert(t.clone()) {
|
||||
texts.push(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if texts.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
let mut md = String::from("## Additional Content\n\n");
|
||||
md.push_str(&texts.join("\n\n"));
|
||||
md
|
||||
}
|
||||
|
||||
/// Recursively walk JSON and collect readable text strings.
|
||||
fn walk_json_for_text(value: &serde_json::Value, out: &mut Vec<String>, depth: usize) {
|
||||
if depth > 15 {
|
||||
return;
|
||||
}
|
||||
|
||||
match value {
|
||||
serde_json::Value::String(s) => {
|
||||
if let Some(clean) = filter_readable(s) {
|
||||
out.push(clean);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Object(map) => {
|
||||
for (_, v) in map {
|
||||
walk_json_for_text(v, out, depth + 1);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(arr) => {
|
||||
for v in arr {
|
||||
walk_json_for_text(v, out, depth + 1);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Filter a string for readability: must be >15 chars, mostly alphabetic,
|
||||
/// not a URL, file path, CSS rule, or base64 blob. Strips inline HTML tags.
|
||||
fn filter_readable(s: &str) -> Option<String> {
|
||||
let s = s.trim();
|
||||
if s.len() <= 15 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip URLs
|
||||
if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("//") {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip file paths
|
||||
if s.starts_with('/') || s.starts_with("./") || s.starts_with("../") {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip CSS-like strings
|
||||
if s.contains('{') && s.contains('}') && (s.contains(':') || s.contains(';')) {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip CSS grid templates, layout strings, and dimension patterns
|
||||
if s.contains("1fr")
|
||||
|| s.contains("grid-")
|
||||
|| s.contains("max-content")
|
||||
|| s.contains("divider-v-")
|
||||
|| s.contains("divider-h-")
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip CSS layout area definitions (e.g. "card1 card2 card3")
|
||||
// These have repeated dash-separated tokens with digits
|
||||
let dash_digit_tokens = s
|
||||
.split_whitespace()
|
||||
.filter(|w| w.contains('-') && w.chars().any(|c| c.is_ascii_digit()))
|
||||
.count();
|
||||
if dash_digit_tokens >= 2 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip strings containing literal quote characters (CSS grid areas, code snippets)
|
||||
if s.contains('"') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip CSS grid area names and layout tokens.
|
||||
// These are strings of short lowercase words/dots with no sentence structure.
|
||||
if !s.chars().any(|c| c.is_uppercase()) {
|
||||
let is_css_layout = s.split_whitespace().all(|w| {
|
||||
w == "."
|
||||
|| (w.len() <= 20
|
||||
&& w.chars()
|
||||
.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-'))
|
||||
});
|
||||
if is_css_layout {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// Skip CSS dimension strings (e.g. "16px 0px 0px 0px")
|
||||
if s.split_whitespace().all(|w| {
|
||||
w.ends_with("px") || w.ends_with("em") || w.ends_with("rem") || w.ends_with("%") || w == "0"
|
||||
}) {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip base64
|
||||
if s.len() > 50 && !s.contains(' ') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Skip strings that are mostly HTML tags
|
||||
if s.matches('<').count() > 3 && s.matches('>').count() > 3 {
|
||||
let stripped = HTML_TAG_RE.replace_all(s, "");
|
||||
if stripped.trim().len() < 15 {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// Skip strings ending with file extensions
|
||||
if s.ends_with(".js")
|
||||
|| s.ends_with(".css")
|
||||
|| s.ends_with(".png")
|
||||
|| s.ends_with(".jpg")
|
||||
|| s.ends_with(".svg")
|
||||
|| s.ends_with(".woff2")
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
// Must be mostly alphabetic (spaces + letters should dominate)
|
||||
let alpha_space = s
|
||||
.chars()
|
||||
.filter(|c| c.is_alphabetic() || c.is_whitespace())
|
||||
.count();
|
||||
let ratio = alpha_space as f64 / s.len() as f64;
|
||||
if ratio < 0.6 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Must contain spaces (prose, not a single token)
|
||||
if !s.contains(' ') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Strip any inline HTML tags
|
||||
let clean = HTML_TAG_RE.replace_all(s, "").trim().to_string();
|
||||
|
||||
if clean.len() <= 15 {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(clean)
|
||||
}
|
||||
|
||||
/// Parse Next.js RSC flight data (`self.__next_f`) and extract readable text.
|
||||
///
|
||||
/// Wire format: array of `[type, payload]` tuples. Type 1 contains the actual
|
||||
/// RSC data as newline-delimited entries like `id:TYPE|payload`.
|
||||
fn extract_next_f_text(raw_json: &str) -> Vec<String> {
|
||||
let Ok(entries) = serde_json::from_str::<Vec<serde_json::Value>>(raw_json) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
// Concatenate all type=1 payloads
|
||||
let mut wire = String::new();
|
||||
for entry in &entries {
|
||||
let arr = match entry.as_array() {
|
||||
Some(a) if a.len() >= 2 => a,
|
||||
_ => continue,
|
||||
};
|
||||
let entry_type = arr[0].as_u64().unwrap_or(0);
|
||||
if entry_type != 1 {
|
||||
continue;
|
||||
}
|
||||
if let Some(payload) = arr[1].as_str() {
|
||||
wire.push_str(payload);
|
||||
}
|
||||
}
|
||||
|
||||
if wire.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut texts = Vec::new();
|
||||
|
||||
// Each line is `id:TYPE|payload` — parse the JSON payloads
|
||||
for line in wire.lines() {
|
||||
// Find the payload after the first `|` or `:` marker
|
||||
let payload = if let Some(pos) = line.find('|') {
|
||||
&line[pos + 1..]
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
|
||||
// Try to parse as JSON array (RSC element representation)
|
||||
if let Ok(value) = serde_json::from_str::<serde_json::Value>(payload) {
|
||||
walk_rsc_tree(&value, &mut texts, 0);
|
||||
}
|
||||
}
|
||||
|
||||
texts
|
||||
}
|
||||
|
||||
/// Walk an RSC tree element extracting children text content.
|
||||
fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize) {
|
||||
if depth > 20 {
|
||||
return;
|
||||
}
|
||||
|
||||
match value {
|
||||
serde_json::Value::String(s) => {
|
||||
if let Some(clean) = filter_readable(s) {
|
||||
out.push(clean);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Array(arr) => {
|
||||
for item in arr {
|
||||
walk_rsc_tree(item, out, depth + 1);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Object(map) => {
|
||||
// RSC elements have "children" that contain text
|
||||
if let Some(children) = map.get("children") {
|
||||
walk_rsc_tree(children, out, depth + 1);
|
||||
}
|
||||
// Also check other fields
|
||||
for (key, v) in map {
|
||||
if key == "children" {
|
||||
continue;
|
||||
}
|
||||
walk_rsc_tree(v, out, depth + 1);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn extracts_window_preloaded_data() {
|
||||
let html = r#"<html><body>
|
||||
<script>
|
||||
window.__preloadedData = {
|
||||
"page": {
|
||||
"title": "Hello World Article Title",
|
||||
"body": "This is a longer paragraph of text that should be extracted from the preloaded data blob successfully."
|
||||
}
|
||||
};
|
||||
</script>
|
||||
</body></html>"#;
|
||||
|
||||
let blobs = extract_js_data(html);
|
||||
assert!(!blobs.is_empty(), "should extract at least one blob");
|
||||
assert!(
|
||||
blobs.iter().any(|b| b.name == "__preloadedData"),
|
||||
"should find __preloadedData"
|
||||
);
|
||||
|
||||
let text = extract_readable_text(&blobs);
|
||||
assert!(
|
||||
text.contains("This is a longer paragraph"),
|
||||
"should extract readable text from blob"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn skips_external_and_module_scripts() {
|
||||
let html = r#"<html><body>
|
||||
<script src="https://cdn.example.com/app.js"></script>
|
||||
<script type="module">export default {};</script>
|
||||
<script>window.__testData = {"content": "This is a test sentence that is long enough to be extracted from the page and it needs over one hundred characters of JSON to pass the threshold."};</script>
|
||||
</body></html>"#;
|
||||
|
||||
let blobs = extract_js_data(html);
|
||||
assert_eq!(
|
||||
blobs.len(),
|
||||
1,
|
||||
"should only process inline non-module script"
|
||||
);
|
||||
assert_eq!(blobs[0].name, "__testData");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_html_returns_no_blobs() {
|
||||
let blobs = extract_js_data("<html><body></body></html>");
|
||||
assert!(blobs.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_readable_rejects_junk() {
|
||||
assert!(filter_readable("short").is_none());
|
||||
assert!(filter_readable("https://example.com/some/long/path").is_none());
|
||||
assert!(filter_readable("/static/js/bundle.min.js").is_none());
|
||||
assert!(filter_readable("aGVsbG8gd29ybGQgdGhpcyBpcyBhIGJhc2U2NCBzdHJpbmc=").is_none());
|
||||
assert!(filter_readable(".container { display: flex; padding: 10px; }").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_readable_accepts_prose() {
|
||||
let result = filter_readable("This is a normal sentence with enough words.");
|
||||
assert!(result.is_some());
|
||||
assert_eq!(
|
||||
result.unwrap(),
|
||||
"This is a normal sentence with enough words."
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn strips_html_tags_from_text() {
|
||||
let result = filter_readable(
|
||||
"This has <strong>bold</strong> and <em>italic</em> formatting inside it.",
|
||||
);
|
||||
assert!(result.is_some());
|
||||
let clean = result.unwrap();
|
||||
assert!(!clean.contains('<'));
|
||||
assert!(clean.contains("bold"));
|
||||
assert!(clean.contains("italic"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extract_readable_text_produces_markdown() {
|
||||
let blobs = vec![JsDataBlob {
|
||||
name: "__data".to_string(),
|
||||
data: r#"{"article":"This is the main article content that should appear in the extracted text."}"#
|
||||
.to_string(),
|
||||
size: 100,
|
||||
}];
|
||||
|
||||
let text = extract_readable_text(&blobs);
|
||||
assert!(text.starts_with("## Additional Content"));
|
||||
assert!(text.contains("main article content"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extract_next_f_rsc_data() {
|
||||
let blobs = vec![JsDataBlob {
|
||||
name: "__next_f".to_string(),
|
||||
data: r#"[[0,""],
|
||||
[1,"0:T1234|{\"children\":\"This is some Next.js RSC flight data content that we want to extract.\"}\n"]]"#
|
||||
.to_string(),
|
||||
size: 200,
|
||||
}];
|
||||
|
||||
let text = extract_readable_text(&blobs);
|
||||
assert!(
|
||||
text.contains("Next.js RSC flight data content"),
|
||||
"should extract text from RSC flight data. Got: {text}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn handles_script_errors_gracefully() {
|
||||
// Scripts that throw errors should be silently ignored
|
||||
let html = r#"<html><body>
|
||||
<script>throw new Error("intentional crash");</script>
|
||||
<script>undefined_function();</script>
|
||||
<script>window.__survived = {"message": "This script ran after the errors and the data should still be found in the extracted blobs because it exceeds the minimum threshold."};</script>
|
||||
</body></html>"#;
|
||||
|
||||
let blobs = extract_js_data(html);
|
||||
assert!(
|
||||
blobs.iter().any(|b| b.name == "__survived"),
|
||||
"should extract data from scripts that succeed after failures"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -9,6 +9,8 @@ pub mod diff;
|
|||
pub mod domain;
|
||||
pub mod error;
|
||||
pub mod extractor;
|
||||
#[cfg(feature = "quickjs")]
|
||||
pub mod js_eval;
|
||||
pub mod llm;
|
||||
pub mod markdown;
|
||||
pub mod metadata;
|
||||
|
|
@ -157,6 +159,22 @@ pub fn extract_with_options(
|
|||
meta.word_count = extractor::word_count(&content.markdown);
|
||||
}
|
||||
|
||||
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
||||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||
// static JSON data island extraction above with runtime-evaluated data.
|
||||
#[cfg(feature = "quickjs")]
|
||||
{
|
||||
let blobs = js_eval::extract_js_data(html);
|
||||
if !blobs.is_empty() {
|
||||
let js_text = js_eval::extract_readable_text(&blobs);
|
||||
if !js_text.is_empty() {
|
||||
content.markdown.push_str("\n\n");
|
||||
content.markdown.push_str(&js_text);
|
||||
meta.word_count = extractor::word_count(&content.markdown);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Domain detection from URL patterns and DOM heuristics
|
||||
let domain_type = domain::detect(url, html);
|
||||
let domain_data = Some(DomainData { domain_type });
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue