mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-07 22:15:12 +02:00
feat: v0.1.4 — QuickJS integration for inline JavaScript data extraction
Embeds QuickJS (rquickjs) to execute inline <script> tags and extract data hidden in JavaScript variable assignments. Captures window.__* objects like __preloadedData (NYTimes), __PRELOADED_STATE__ (Wired), and self.__next_f (Next.js RSC flight data). Results: - NYTimes: 1,552 → 4,162 words (+168%) - Wired: 1,459 → 9,937 words (+580%) - Zero measurable performance overhead (<15ms per page) - Feature-gated: disable with --no-default-features for WASM Smart text filtering rejects CSS, base64, file paths, code strings. Only readable prose is appended under "## Additional Content". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0c91c6d5a9
commit
32c035c543
6 changed files with 665 additions and 7 deletions
|
|
@ -9,6 +9,8 @@ pub mod diff;
|
|||
pub mod domain;
|
||||
pub mod error;
|
||||
pub mod extractor;
|
||||
#[cfg(feature = "quickjs")]
|
||||
pub mod js_eval;
|
||||
pub mod llm;
|
||||
pub mod markdown;
|
||||
pub mod metadata;
|
||||
|
|
@ -157,6 +159,22 @@ pub fn extract_with_options(
|
|||
meta.word_count = extractor::word_count(&content.markdown);
|
||||
}
|
||||
|
||||
// QuickJS: execute inline <script> tags to capture JS-assigned data blobs
|
||||
// (e.g., window.__PRELOADED_STATE__, self.__next_f). This supplements the
|
||||
// static JSON data island extraction above with runtime-evaluated data.
|
||||
#[cfg(feature = "quickjs")]
|
||||
{
|
||||
let blobs = js_eval::extract_js_data(html);
|
||||
if !blobs.is_empty() {
|
||||
let js_text = js_eval::extract_readable_text(&blobs);
|
||||
if !js_text.is_empty() {
|
||||
content.markdown.push_str("\n\n");
|
||||
content.markdown.push_str(&js_text);
|
||||
meta.word_count = extractor::word_count(&content.markdown);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Domain detection from URL patterns and DOM heuristics
|
||||
let domain_type = domain::detect(url, html);
|
||||
let domain_data = Some(DomainData { domain_type });
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue