mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-08 22:25:12 +02:00
feat: v0.1.4 — QuickJS integration for inline JavaScript data extraction
Embeds QuickJS (rquickjs) to execute inline <script> tags and extract data hidden in JavaScript variable assignments. Captures window.__* objects like __preloadedData (NYTimes), __PRELOADED_STATE__ (Wired), and self.__next_f (Next.js RSC flight data). Results: - NYTimes: 1,552 → 4,162 words (+168%) - Wired: 1,459 → 9,937 words (+580%) - Zero measurable performance overhead (<15ms per page) - Feature-gated: disable with --no-default-features for WASM Smart text filtering rejects CSS, base64, file paths, code strings. Only readable prose is appended under "## Additional Content". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0c91c6d5a9
commit
32c035c543
6 changed files with 665 additions and 7 deletions
40
Cargo.lock
generated
40
Cargo.lock
generated
|
|
@ -1932,6 +1932,33 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rquickjs"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c5227859c4dfc83f428e58f9569bf439e628c8d139020e7faff437e6f5abaa0"
|
||||
dependencies = [
|
||||
"rquickjs-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rquickjs-core"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e82e0ca83028ad5b533b53b96c395bbaab905a5774de4aaf1004eeacafa3d85d"
|
||||
dependencies = [
|
||||
"rquickjs-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rquickjs-sys"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7fed0097b0b4fbb2a87f6dd3b995a7c64ca56de30007eb7e867dfdfc78324ba5"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
|
|
@ -2854,7 +2881,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.1.3"
|
||||
version = "0.1.4"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -2874,11 +2901,12 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.1.3"
|
||||
version = "0.1.4"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"rquickjs",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
|
@ -2891,7 +2919,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.1.3"
|
||||
version = "0.1.4"
|
||||
dependencies = [
|
||||
"primp",
|
||||
"quick-xml",
|
||||
|
|
@ -2909,7 +2937,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.1.3"
|
||||
version = "0.1.4"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -2922,7 +2950,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.1.3"
|
||||
version = "0.1.4"
|
||||
dependencies = [
|
||||
"dotenvy",
|
||||
"reqwest",
|
||||
|
|
@ -2942,7 +2970,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.1.3"
|
||||
version = "0.1.4"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue