diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e10de8..ed4d200 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,17 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.1.4] — 2026-03-26 + +### Added +- QuickJS integration for extracting data from inline JavaScript (NYTimes +168%, Wired +580% more content) +- Executes inline ` + "#; + + let blobs = extract_js_data(html); + assert!(!blobs.is_empty(), "should extract at least one blob"); + assert!( + blobs.iter().any(|b| b.name == "__preloadedData"), + "should find __preloadedData" + ); + + let text = extract_readable_text(&blobs); + assert!( + text.contains("This is a longer paragraph"), + "should extract readable text from blob" + ); + } + + #[test] + fn skips_external_and_module_scripts() { + let html = r#" + + + + "#; + + let blobs = extract_js_data(html); + assert_eq!( + blobs.len(), + 1, + "should only process inline non-module script" + ); + assert_eq!(blobs[0].name, "__testData"); + } + + #[test] + fn empty_html_returns_no_blobs() { + let blobs = extract_js_data(""); + assert!(blobs.is_empty()); + } + + #[test] + fn filter_readable_rejects_junk() { + assert!(filter_readable("short").is_none()); + assert!(filter_readable("https://example.com/some/long/path").is_none()); + assert!(filter_readable("/static/js/bundle.min.js").is_none()); + assert!(filter_readable("aGVsbG8gd29ybGQgdGhpcyBpcyBhIGJhc2U2NCBzdHJpbmc=").is_none()); + assert!(filter_readable(".container { display: flex; padding: 10px; }").is_none()); + } + + #[test] + fn filter_readable_accepts_prose() { + let result = filter_readable("This is a normal sentence with enough words."); + assert!(result.is_some()); + assert_eq!( + result.unwrap(), + "This is a normal sentence with enough words." + ); + } + + #[test] + fn strips_html_tags_from_text() { + let result = filter_readable( + "This has bold and italic formatting inside it.", + ); + assert!(result.is_some()); + let clean = result.unwrap(); + assert!(!clean.contains('<')); + assert!(clean.contains("bold")); + assert!(clean.contains("italic")); + } + + #[test] + fn extract_readable_text_produces_markdown() { + let blobs = vec![JsDataBlob { + name: "__data".to_string(), + data: r#"{"article":"This is the main article content that should appear in the extracted text."}"# + .to_string(), + size: 100, + }]; + + let text = extract_readable_text(&blobs); + assert!(text.starts_with("## Additional Content")); + assert!(text.contains("main article content")); + } + + #[test] + fn extract_next_f_rsc_data() { + let blobs = vec![JsDataBlob { + name: "__next_f".to_string(), + data: r#"[[0,""], + [1,"0:T1234|{\"children\":\"This is some Next.js RSC flight data content that we want to extract.\"}\n"]]"# + .to_string(), + size: 200, + }]; + + let text = extract_readable_text(&blobs); + assert!( + text.contains("Next.js RSC flight data content"), + "should extract text from RSC flight data. Got: {text}" + ); + } + + #[test] + fn handles_script_errors_gracefully() { + // Scripts that throw errors should be silently ignored + let html = r#" + + + + "#; + + let blobs = extract_js_data(html); + assert!( + blobs.iter().any(|b| b.name == "__survived"), + "should extract data from scripts that succeed after failures" + ); + } +} diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index 3e740d5..9fc7945 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -9,6 +9,8 @@ pub mod diff; pub mod domain; pub mod error; pub mod extractor; +#[cfg(feature = "quickjs")] +pub mod js_eval; pub mod llm; pub mod markdown; pub mod metadata; @@ -157,6 +159,22 @@ pub fn extract_with_options( meta.word_count = extractor::word_count(&content.markdown); } + // QuickJS: execute inline