//! API/endpoint surface discovery from HTML + JS bundle text.
//!
//! Pure and zero-network: callers fetch the page and its `
// NOTE(review): the text is cut off at the backtick above. Whatever originally sat between
// that point and the end of the first test (presumably the rest of the module docs, the
// implementation, and the opening of the test module) is absent from this file. The only
// surviving piece is the tail of that first test, kept verbatim below as a comment because
// it cannot compile without its head. Restore the missing span from version control
// rather than reconstructing it by hand.
//
//     "#;
//     let srcs = script_srcs(html, "https://www.ticketmaster.co.uk/");
//     assert!(srcs.contains(
//         &"https://www.ticketmaster.co.uk/_next/static/chunks/main-abc.js".to_string()
//     ));
//     assert!(srcs.contains(&"https://cdn.example.net/lib.js".to_string()));
//     assert_eq!(srcs.len(), 2, "inline + data: ignored");
// }

    /// Runs `extract_endpoints` over an HTML string plus one bundle and checks which
    /// literals are reported as endpoints, which are only recorded as hosts, and how the
    /// GraphQL and relative-path entries are classified.
    #[test]
    fn extracts_inline_and_bundle_endpoints_with_classification() {
        // NOTE(review): this literal looks like it lost its markup; nothing visible here
        // contains the `/api/search/...` and `/api/venue/info` paths asserted below.
        // Confirm against the original fixture before relying on this test.
        let html = r#" "#;
        let bundles = vec![(
            "https://www.ticketmaster.co.uk/app.js".to_string(),
            r#" const GQL = "https://pubapi.ticketmaster.co.uk/graphql"; axios.post("https://services.ticketmaster.co.uk/discovery/v2/events"); new WebSocket("wss://live.ticketmaster.co.uk/socket"); const ga = "https://www.googletagservices.com/tag/js/gpt.js"; const img = "https://cdn.tmol.co/hero.png"; "#
            .to_string(),
        )];
        let r = extract_endpoints(html, "https://www.ticketmaster.co.uk/", &bundles);
        let vals: Vec<&str> = r.endpoints.iter().map(|e| e.value.as_str()).collect();
        assert!(vals.contains(&"/api/search/events"));
        assert!(vals.contains(&"/api/search/search-suggest"));
        assert!(vals.contains(&"/api/venue/info"));
        assert!(vals.contains(&"https://pubapi.ticketmaster.co.uk/graphql"));
        assert!(vals.contains(&"https://services.ticketmaster.co.uk/discovery/v2/events"));
        assert!(vals.contains(&"wss://live.ticketmaster.co.uk/socket"));
        // static .js asset is not an endpoint, but its host is recorded
        assert!(!vals.contains(&"https://www.googletagservices.com/tag/js/gpt.js"));
        assert!(r.hosts.iter().any(|h| h == "www.googletagservices.com"));
        let gql = r
            .endpoints
            .iter()
            .find(|e| e.value.contains("graphql"))
            .unwrap();
        assert_eq!(gql.kind, EndpointKind::GraphQl);
        assert!(
            gql.first_party,
            "pubapi.ticketmaster.co.uk is first-party to .co.uk"
        );
        let third = r
            .endpoints
            .iter()
            .find(|e| e.value.starts_with("/api/venue"));
        assert!(third.unwrap().first_party, "relative path is same-origin");
        assert_eq!(r.bundles_scanned, 1);
    }

    /// An absolute URL on a host unrelated to the base URL must be reported with
    /// `first_party == false`.
    #[test]
    fn third_party_absolute_is_flagged_not_first_party() {
        let bundles = vec![(
            "b".to_string(),
            r#"x="https://api.stripe.com/v1/charges""#.to_string(),
        )];
        let r = extract_endpoints("", "https://www.ticketmaster.co.uk/", &bundles);
        let e = r
            .endpoints
            .iter()
            .find(|e| e.value.contains("stripe"))
            .unwrap();
        assert!(!e.first_party);
    }

    /// Feeds 50,000 distinct quoted `/api/v1/item/{i}` literals in one bundle and checks
    /// that the result stays within `MAX_ENDPOINTS` and is flagged as truncated.
    #[test]
    fn caps_bound_pathological_input() {
        // A huge blob of fake endpoints must not exceed MAX_ENDPOINTS and
        // must return promptly (regex crate is linear-time).
        let mut big = String::new();
        for i in 0..50_000 {
            big.push_str(&format!("\"/api/v1/item/{i}\" "));
        }
        let bundles = vec![("big".to_string(), big)];
        let r = extract_endpoints("", "https://x.com/", &bundles);
        assert!(r.endpoints.len() <= MAX_ENDPOINTS);
        assert!(r.truncated);
    }

    /// Empty HTML, a base string that is not a URL, and no bundles must produce an empty,
    /// non-truncated result with zero bundles scanned.
    #[test]
    fn empty_inputs_are_safe() {
        let r = extract_endpoints("", "not a url", &[]);
        assert!(r.endpoints.is_empty());
        assert_eq!(r.bundles_scanned, 0);
        assert!(!r.truncated);
    }

    /// Checks that bare `/api` prefixes, single-label hosts, schema-vocabulary URLs and the
    /// site root itself are kept out of both `endpoints` and `hosts`, while real endpoints
    /// from the same bundle are still reported.
    #[test]
    fn v1_1_noise_is_filtered() {
        let bundles = vec![(
            "b.js".to_string(),
            r#" "/api/search/events"; "/api"; "/api/"; "http://f"; "http://n/x"; "https://schema.org/Thing"; "http://json-schema.org/draft-07/schema"; "https://www.ticketmaster.co.uk/"; "https://pubapi.ticketmaster.co.uk/discovery/v2/events"; "wss://live.ticketmaster.co.uk/socket"; "#
            .to_string(),
        )];
        let r = extract_endpoints("", "https://www.ticketmaster.co.uk/", &bundles);
        let vals: std::collections::BTreeSet<&str> =
            r.endpoints.iter().map(|e| e.value.as_str()).collect();
        assert!(vals.contains("/api/search/events"));
        assert!(vals.contains("https://pubapi.ticketmaster.co.uk/discovery/v2/events"));
        assert!(vals.contains("wss://live.ticketmaster.co.uk/socket"));
        for junk in [
            "/api",
            "/api/",
            "http://f",
            "http://n/x",
            "https://schema.org/Thing",
            "http://json-schema.org/draft-07/schema",
            "https://www.ticketmaster.co.uk/",
        ] {
            assert!(!vals.contains(junk), "noise leaked: {junk}");
        }
        assert!(
            !r.hosts
                .iter()
                .any(|h| h == "f" || h == "n" || h == "schema.org")
        );
        assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
    }

    /// Regression test: truncating an oversized bundle at the scan budget must not panic
    /// when the budget offset falls inside a multibyte character.
    #[test]
    fn scan_truncation_at_non_ascii_boundary_does_not_panic() {
        // A bundle just over the scan budget, padded with a multibyte char
        // ('é' is 2 bytes) so the cut lands mid-codepoint. The old
        // `&text[..budget]` slice panicked here; the boundary snap must not.
        //
        // A pad made only of 'é' puts char boundaries on every even offset, so with an even
        // budget the cut would land cleanly and the test would pass vacuously. One leading
        // ASCII byte shifts the boundaries onto odd offsets in that case.
        // NOTE(review): assumes the cut is taken at byte offset MAX_SCAN_BYTES of the
        // bundle text; confirm against the scanner.
        let lead = if MAX_SCAN_BYTES % 2 == 0 { "a" } else { "" };
        let pad = "é".repeat(MAX_SCAN_BYTES); // ~2× budget in bytes
        let bundle = format!("{lead}{pad} fetch(\"/api/x\")");
        // Pin the premise so the test cannot silently stop exercising the panic path.
        assert!(
            !bundle.is_char_boundary(MAX_SCAN_BYTES),
            "test setup: scan budget must fall inside a multibyte char"
        );
        let bundles = vec![("big.js".to_string(), bundle)];
        let r = extract_endpoints("", "https://example.com/", &bundles);
        assert!(r.truncated, "oversized bundle should mark truncated");
    }
// Closes the enclosing test module; its opening line is part of the missing text noted at
// the top of the file.
}