From 3781c2874fc1446d03cb373580fca363ed8a7a0b Mon Sep 17 00:00:00 2001 From: Valerio Date: Tue, 19 May 2026 18:00:56 +0200 Subject: [PATCH] =?UTF-8?q?feat(core):=20endpoints=20module=20=E2=80=94=20?= =?UTF-8?q?extract=20API=20surface=20from=20HTML=20+=20JS=20bundles?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/webclaw-core/src/endpoints.rs | 426 +++++++++++++++++++++++++++ crates/webclaw-core/src/lib.rs | 1 + 2 files changed, 427 insertions(+) create mode 100644 crates/webclaw-core/src/endpoints.rs diff --git a/crates/webclaw-core/src/endpoints.rs b/crates/webclaw-core/src/endpoints.rs new file mode 100644 index 0000000..854d79e --- /dev/null +++ b/crates/webclaw-core/src/endpoints.rs @@ -0,0 +1,426 @@ +//! API/endpoint surface discovery from HTML + JS bundle text. +//! +//! Pure and zero-network: callers fetch the page and its ` + + + + "#; + let srcs = script_srcs(html, "https://www.ticketmaster.co.uk/"); + assert!(srcs.contains( + &"https://www.ticketmaster.co.uk/_next/static/chunks/main-abc.js".to_string() + )); + assert!(srcs.contains(&"https://cdn.example.net/lib.js".to_string())); + assert_eq!(srcs.len(), 2, "inline + data: ignored"); + } + + #[test] + fn extracts_inline_and_bundle_endpoints_with_classification() { + let html = r#" + + + "#; + let bundles = vec![( + "https://www.ticketmaster.co.uk/app.js".to_string(), + r#" + const GQL = "https://pubapi.ticketmaster.co.uk/graphql"; + axios.post("https://services.ticketmaster.co.uk/discovery/v2/events"); + new WebSocket("wss://live.ticketmaster.co.uk/socket"); + const ga = "https://www.googletagservices.com/tag/js/gpt.js"; + const img = "https://cdn.tmol.co/hero.png"; + "# + .to_string(), + )]; + let r = extract_endpoints(html, "https://www.ticketmaster.co.uk/", &bundles); + let vals: Vec<&str> = r.endpoints.iter().map(|e| e.value.as_str()).collect(); + + assert!(vals.contains(&"/api/search/events")); + assert!(vals.contains(&"/api/search/search-suggest")); + assert!(vals.contains(&"/api/venue/info")); + assert!(vals.contains(&"https://pubapi.ticketmaster.co.uk/graphql")); + assert!(vals.contains(&"https://services.ticketmaster.co.uk/discovery/v2/events")); + assert!(vals.contains(&"wss://live.ticketmaster.co.uk/socket")); + // static .js asset is not an endpoint, but its host is recorded + assert!(!vals.contains(&"https://www.googletagservices.com/tag/js/gpt.js")); + assert!(r.hosts.iter().any(|h| h == "www.googletagservices.com")); + + let gql = r + .endpoints + .iter() + .find(|e| e.value.contains("graphql")) + .unwrap(); + assert_eq!(gql.kind, EndpointKind::GraphQl); + assert!( + gql.first_party, + "pubapi.ticketmaster.co.uk is first-party to .co.uk" + ); + + let third = r + .endpoints + .iter() + .find(|e| e.value.starts_with("/api/venue")); + assert!(third.unwrap().first_party, "relative path is same-origin"); + assert_eq!(r.bundles_scanned, 1); + } + + #[test] + fn third_party_absolute_is_flagged_not_first_party() { + let bundles = vec![( + "b".to_string(), + r#"x="https://api.stripe.com/v1/charges""#.to_string(), + )]; + let r = extract_endpoints("", "https://www.ticketmaster.co.uk/", &bundles); + let e = r + .endpoints + .iter() + .find(|e| e.value.contains("stripe")) + .unwrap(); + assert!(!e.first_party); + } + + #[test] + fn caps_bound_pathological_input() { + // A huge blob of fake endpoints must not exceed MAX_ENDPOINTS and + // must return promptly (regex crate is linear-time). + let mut big = String::new(); + for i in 0..50_000 { + big.push_str(&format!("\"/api/v1/item/{i}\" ")); + } + let bundles = vec![("big".to_string(), big)]; + let r = extract_endpoints("", "https://x.com/", &bundles); + assert!(r.endpoints.len() <= MAX_ENDPOINTS); + assert!(r.truncated); + } + + #[test] + fn empty_inputs_are_safe() { + let r = extract_endpoints("", "not a url", &[]); + assert!(r.endpoints.is_empty()); + assert_eq!(r.bundles_scanned, 0); + assert!(!r.truncated); + } +} diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index a3e0725..1ddd1f0 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -7,6 +7,7 @@ pub(crate) mod data_island; /// Zero network dependencies — WASM-compatible by design. pub mod diff; pub mod domain; +pub mod endpoints; pub mod error; pub mod extractor; #[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))]