/// QuickJS-based extraction of data from inline JavaScript in HTML pages. /// /// Many modern websites embed page data as JavaScript variable assignments /// (e.g., `window.__PRELOADED_STATE__`, Next.js `self.__next_f`). The static /// JSON data island approach (`data_island.rs`) only handles ` "#; let blobs = extract_js_data(html); assert!(!blobs.is_empty(), "should extract at least one blob"); assert!( blobs.iter().any(|b| b.name == "__preloadedData"), "should find __preloadedData" ); let text = extract_readable_text(&blobs); assert!( text.contains("This is a longer paragraph"), "should extract readable text from blob" ); } #[test] fn skips_external_and_module_scripts() { let html = r#" "#; let blobs = extract_js_data(html); assert_eq!( blobs.len(), 1, "should only process inline non-module script" ); assert_eq!(blobs[0].name, "__testData"); } #[test] fn empty_html_returns_no_blobs() { let blobs = extract_js_data(""); assert!(blobs.is_empty()); } #[test] fn filter_readable_rejects_junk() { assert!(filter_readable("short").is_none()); assert!(filter_readable("https://example.com/some/long/path").is_none()); assert!(filter_readable("/static/js/bundle.min.js").is_none()); assert!(filter_readable("aGVsbG8gd29ybGQgdGhpcyBpcyBhIGJhc2U2NCBzdHJpbmc=").is_none()); assert!(filter_readable(".container { display: flex; padding: 10px; }").is_none()); } #[test] fn filter_readable_accepts_prose() { let result = filter_readable("This is a normal sentence with enough words."); assert!(result.is_some()); assert_eq!( result.unwrap(), "This is a normal sentence with enough words." 
// NOTE(review): the line above starts with `///`, so in this collapsed layout
// everything on it (module docs and several tests) is lexed as one doc comment.
// The original line breaks need to be restored before this file can compile.
); }

    /// Feeds one sentence to `filter_readable` and checks that it is accepted,
    /// that the returned text contains no `<` character, and that the words
    /// "bold" and "italic" are still present.
    ///
    /// NOTE(review): the input literal as it appears here contains no markup at
    /// all, so the `<` check is trivially satisfied; presumably the HTML tags
    /// were lost when this file was extracted. Confirm against the original.
    #[test]
    fn strips_html_tags_from_text() {
        let result = filter_readable(
            "This has bold and italic formatting inside it.",
        );
        assert!(result.is_some());
        let clean = result.unwrap();
        assert!(!clean.contains('<'));
        assert!(clean.contains("bold"));
        assert!(clean.contains("italic"));
    }

    /// Builds a single `JsDataBlob` named `__data` whose payload is a JSON
    /// object with one `article` string, then checks that
    /// `extract_readable_text` output begins with the `## Additional Content`
    /// heading and includes part of the article sentence.
    #[test]
    fn extract_readable_text_produces_markdown() {
        let blobs = vec![JsDataBlob {
            name: "__data".to_string(),
            data: r#"{"article":"This is the main article content that should appear in the extracted text."}"#
                .to_string(),
            // NOTE(review): 100 is not the byte length of `data`; looks like a
            // placeholder. Confirm whether `extract_readable_text` reads `size`.
            size: 100,
        }];
        let text = extract_readable_text(&blobs);
        assert!(text.starts_with("## Additional Content"));
        assert!(text.contains("main article content"));
    }

    /// Builds a single `JsDataBlob` named `__next_f` whose payload is an array
    /// of `[index, string]` pairs; the second string carries an escaped JSON
    /// object with a `children` sentence. Checks that `extract_readable_text`
    /// surfaces that sentence, and prints the produced text on failure.
    #[test]
    fn extract_next_f_rsc_data() {
        let blobs = vec![JsDataBlob {
            name: "__next_f".to_string(),
            data: r#"[[0,""], [1,"0:T1234|{\"children\":\"This is some Next.js RSC flight data content that we want to extract.\"}\n"]]"#
                .to_string(),
            // NOTE(review): as above, 200 appears to be an arbitrary placeholder.
            size: 200,
        }];
        let text = extract_readable_text(&blobs);
        assert!(
            text.contains("Next.js RSC flight data content"),
            "should extract text from RSC flight data. Got: {text}"
        );
    }

    /// Expects `extract_js_data` to still return a blob named `__survived`,
    /// i.e. data from a script that succeeds should be extracted even when
    /// other scripts fail (per the assertion message in the body).
    ///
    /// NOTE(review): the `html` fixture shown here is a whitespace-only raw
    /// string, so the scripts this test depends on are missing; presumably
    /// stripped during extraction. Confirm against the original.
    ///
    /// NOTE(review): in this collapsed layout the `//` comment in the body runs
    /// to end of line and swallows the remaining statements and both closing
    /// braces. The rest of the line is kept verbatim; restore the line breaks.
    #[test]
    fn handles_script_errors_gracefully() {
        // Scripts that throw errors should be silently ignored let html = r#" "#; let blobs = extract_js_data(html); assert!( blobs.iter().any(|b| b.name == "__survived"), "should extract data from scripts that succeed after failures" ); } }