diff --git a/Dockerfile b/Dockerfile index 552aea7..fefb39b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -59,9 +59,9 @@ RUN touch crates/*/src/*.rs \ # --------------------------------------------------------------------------- FROM ubuntu:24.04 -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates \ - && rm -rf /var/lib/apt/lists/* +# CA bundle from distroless (ships it, multi-arch, gcr.io) instead of +# apt-installing from ports.ubuntu.com (unreachable for arm64 on CI runners). +COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt # Copy all three binaries COPY --from=builder /build/target/release/webclaw /usr/local/bin/webclaw diff --git a/Dockerfile.ci b/Dockerfile.ci index ccd8a33..7b62718 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -5,9 +5,10 @@ ARG BINARY_DIR=binaries FROM ubuntu:24.04 -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates \ - && rm -rf /var/lib/apt/lists/* +# CA bundle copied from a reliable multi-arch image instead of apt-installing +# from ports.ubuntu.com — Canonical's arm64 ports mirror is unreachable from +# CI runners and breaks the multi-arch release build. No build-time network. +COPY --from=gcr.io/distroless/static-debian12 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt ARG BINARY_DIR COPY ${BINARY_DIR}/webclaw /usr/local/bin/webclaw diff --git a/crates/webclaw-core/src/endpoints.rs b/crates/webclaw-core/src/endpoints.rs new file mode 100644 index 0000000..21c5280 --- /dev/null +++ b/crates/webclaw-core/src/endpoints.rs @@ -0,0 +1,515 @@ +//! API/endpoint surface discovery from HTML + JS bundle text. +//! +//! Pure and zero-network: callers fetch the page and its ` + + + + "#; + let srcs = script_srcs(html, "https://www.ticketmaster.co.uk/"); + assert!(srcs.contains( + &"https://www.ticketmaster.co.uk/_next/static/chunks/main-abc.js".to_string() + )); + assert!(srcs.contains(&"https://cdn.example.net/lib.js".to_string())); + assert_eq!(srcs.len(), 2, "inline + data: ignored"); + } + + #[test] + fn extracts_inline_and_bundle_endpoints_with_classification() { + let html = r#" + + + "#; + let bundles = vec![( + "https://www.ticketmaster.co.uk/app.js".to_string(), + r#" + const GQL = "https://pubapi.ticketmaster.co.uk/graphql"; + axios.post("https://services.ticketmaster.co.uk/discovery/v2/events"); + new WebSocket("wss://live.ticketmaster.co.uk/socket"); + const ga = "https://www.googletagservices.com/tag/js/gpt.js"; + const img = "https://cdn.tmol.co/hero.png"; + "# + .to_string(), + )]; + let r = extract_endpoints(html, "https://www.ticketmaster.co.uk/", &bundles); + let vals: Vec<&str> = r.endpoints.iter().map(|e| e.value.as_str()).collect(); + + assert!(vals.contains(&"/api/search/events")); + assert!(vals.contains(&"/api/search/search-suggest")); + assert!(vals.contains(&"/api/venue/info")); + assert!(vals.contains(&"https://pubapi.ticketmaster.co.uk/graphql")); + assert!(vals.contains(&"https://services.ticketmaster.co.uk/discovery/v2/events")); + assert!(vals.contains(&"wss://live.ticketmaster.co.uk/socket")); + // static .js asset is not an endpoint, but its host is recorded + assert!(!vals.contains(&"https://www.googletagservices.com/tag/js/gpt.js")); + assert!(r.hosts.iter().any(|h| h == "www.googletagservices.com")); + + let gql = r + .endpoints + .iter() + .find(|e| e.value.contains("graphql")) + .unwrap(); + assert_eq!(gql.kind, EndpointKind::GraphQl); + assert!( + gql.first_party, + "pubapi.ticketmaster.co.uk is first-party to .co.uk" + ); + + let third = r + .endpoints + .iter() + .find(|e| e.value.starts_with("/api/venue")); + assert!(third.unwrap().first_party, "relative path is same-origin"); + assert_eq!(r.bundles_scanned, 1); + } + + #[test] + fn third_party_absolute_is_flagged_not_first_party() { + let bundles = vec![( + "b".to_string(), + r#"x="https://api.stripe.com/v1/charges""#.to_string(), + )]; + let r = extract_endpoints("", "https://www.ticketmaster.co.uk/", &bundles); + let e = r + .endpoints + .iter() + .find(|e| e.value.contains("stripe")) + .unwrap(); + assert!(!e.first_party); + } + + #[test] + fn caps_bound_pathological_input() { + // A huge blob of fake endpoints must not exceed MAX_ENDPOINTS and + // must return promptly (regex crate is linear-time). + let mut big = String::new(); + for i in 0..50_000 { + big.push_str(&format!("\"/api/v1/item/{i}\" ")); + } + let bundles = vec![("big".to_string(), big)]; + let r = extract_endpoints("", "https://x.com/", &bundles); + assert!(r.endpoints.len() <= MAX_ENDPOINTS); + assert!(r.truncated); + } + + #[test] + fn empty_inputs_are_safe() { + let r = extract_endpoints("", "not a url", &[]); + assert!(r.endpoints.is_empty()); + assert_eq!(r.bundles_scanned, 0); + assert!(!r.truncated); + } + + #[test] + fn v1_1_noise_is_filtered() { + let bundles = vec![( + "b.js".to_string(), + r#" + "/api/search/events"; + "/api"; "/api/"; + "http://f"; "http://n/x"; + "https://schema.org/Thing"; + "http://json-schema.org/draft-07/schema"; + "https://www.ticketmaster.co.uk/"; + "https://pubapi.ticketmaster.co.uk/discovery/v2/events"; + "wss://live.ticketmaster.co.uk/socket"; + "# + .to_string(), + )]; + let r = extract_endpoints("", "https://www.ticketmaster.co.uk/", &bundles); + let vals: std::collections::BTreeSet<&str> = + r.endpoints.iter().map(|e| e.value.as_str()).collect(); + assert!(vals.contains("/api/search/events")); + assert!(vals.contains("https://pubapi.ticketmaster.co.uk/discovery/v2/events")); + assert!(vals.contains("wss://live.ticketmaster.co.uk/socket")); + for junk in [ + "/api", + "/api/", + "http://f", + "http://n/x", + "https://schema.org/Thing", + "http://json-schema.org/draft-07/schema", + "https://www.ticketmaster.co.uk/", + ] { + assert!(!vals.contains(junk), "noise leaked: {junk}"); + } + assert!( + !r.hosts + .iter() + .any(|h| h == "f" || h == "n" || h == "schema.org") + ); + assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk")); + } +} diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index a3e0725..1ddd1f0 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -7,6 +7,7 @@ pub(crate) mod data_island; /// Zero network dependencies — WASM-compatible by design. pub mod diff; pub mod domain; +pub mod endpoints; pub mod error; pub mod extractor; #[cfg(all(feature = "quickjs", not(target_arch = "wasm32")))] diff --git a/crates/webclaw-fetch/src/cloud.rs b/crates/webclaw-fetch/src/cloud.rs index 3bad383..7d4978e 100644 --- a/crates/webclaw-fetch/src/cloud.rs +++ b/crates/webclaw-fetch/src/cloud.rs @@ -810,13 +810,18 @@ mod tests { // --- CloudClient construction ------------------------------------------ + // `WEBCLAW_API_KEY` is process-global; cargo runs tests in parallel + // threads. Without serialization, a test that sets the var can race a + // test asserting it is absent. This lock makes the env-mutating + // CloudClient tests mutually exclusive (poison-tolerant: a panicking + // test must not wedge the others). + static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + #[test] fn cloud_client_explicit_key_wins_over_env() { - // SAFETY: this test mutates process env. Serial tests only. - // Set env to something, pass an explicit key, explicit should win. - // (We don't actually *call* the API, just check the struct stored - // the right key.) - // rustc std::env::set_var is unsafe in newer toolchains. + let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); + // SAFETY: env mutation is serialized by ENV_LOCK; set_var/remove_var + // are unsafe on the 2024 toolchain. Explicit key must beat the env. unsafe { std::env::set_var("WEBCLAW_API_KEY", "from-env"); } @@ -829,6 +834,9 @@ mod tests { #[test] fn cloud_client_none_when_empty() { + let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); + // SAFETY: env mutation serialized by ENV_LOCK. Clearing the var + // (incl. any ambient runner value) is what makes this deterministic. unsafe { std::env::remove_var("WEBCLAW_API_KEY"); }