From 6bca7a7c97949300fa64b5a49d394b8d73964ab1 Mon Sep 17 00:00:00 2001 From: elipeter Date: Mon, 1 Jun 2026 10:49:32 -0500 Subject: [PATCH] feat(dynamic): replace fixed canary with per-spec cryptographically-random canary for enhanced security --- src/dynamic/corpus/json_parse/javascript.rs | 4 +- src/dynamic/corpus/json_parse/python.rs | 4 +- src/dynamic/corpus/json_parse/ruby.rs | 4 +- .../corpus/prototype_pollution/javascript.rs | 4 +- .../corpus/prototype_pollution/typescript.rs | 4 +- src/dynamic/lang/js_shared.rs | 8 +- src/dynamic/oracle.rs | 273 +++++++++++++++++- src/dynamic/probe.rs | 9 +- src/dynamic/runner.rs | 73 ++++- tests/oracle_canary_audit.rs | 214 ++++++++++++++ 10 files changed, 569 insertions(+), 28 deletions(-) create mode 100644 tests/oracle_canary_audit.rs diff --git a/src/dynamic/corpus/json_parse/javascript.rs b/src/dynamic/corpus/json_parse/javascript.rs index 19a81677..92a33dcc 100644 --- a/src/dynamic/corpus/json_parse/javascript.rs +++ b/src/dynamic/corpus/json_parse/javascript.rs @@ -10,9 +10,9 @@ //! (`NYX_JSON_DEEP` vs `NYX_JSON_SHALLOW`) picks the branch. use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; -use crate::dynamic::oracle::ProbePredicate; +use crate::dynamic::oracle::{Canary, ProbePredicate}; -const CANARY: &str = "__nyx_canary"; +const CANARY: &str = Canary::PLACEHOLDER; const MAX_DEPTH: u32 = 64; pub const PAYLOADS: &[CuratedPayload] = &[ diff --git a/src/dynamic/corpus/json_parse/python.rs b/src/dynamic/corpus/json_parse/python.rs index 6ab091d6..55e36ed1 100644 --- a/src/dynamic/corpus/json_parse/python.rs +++ b/src/dynamic/corpus/json_parse/python.rs @@ -5,9 +5,9 @@ //! vulnerable and benign attempts. 
use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; -use crate::dynamic::oracle::ProbePredicate; +use crate::dynamic::oracle::{Canary, ProbePredicate}; -const CANARY: &str = "__nyx_canary"; +const CANARY: &str = Canary::PLACEHOLDER; const MAX_DEPTH: u32 = 64; pub const PAYLOADS: &[CuratedPayload] = &[ diff --git a/src/dynamic/corpus/json_parse/ruby.rs b/src/dynamic/corpus/json_parse/ruby.rs index ada2d017..346f96e5 100644 --- a/src/dynamic/corpus/json_parse/ruby.rs +++ b/src/dynamic/corpus/json_parse/ruby.rs @@ -9,9 +9,9 @@ //! (`NYX_JSON_DEEP` vs `NYX_JSON_SHALLOW`) picks the branch. use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; -use crate::dynamic::oracle::ProbePredicate; +use crate::dynamic::oracle::{Canary, ProbePredicate}; -const CANARY: &str = "__nyx_canary"; +const CANARY: &str = Canary::PLACEHOLDER; const MAX_DEPTH: u32 = 64; pub const PAYLOADS: &[CuratedPayload] = &[ diff --git a/src/dynamic/corpus/prototype_pollution/javascript.rs b/src/dynamic/corpus/prototype_pollution/javascript.rs index 1654275e..0ce273e5 100644 --- a/src/dynamic/corpus/prototype_pollution/javascript.rs +++ b/src/dynamic/corpus/prototype_pollution/javascript.rs @@ -22,9 +22,9 @@ //! in the payload bytes. use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; -use crate::dynamic::oracle::ProbePredicate; +use crate::dynamic::oracle::{Canary, ProbePredicate}; -const CANARY: &str = "__nyx_canary"; +const CANARY: &str = Canary::PLACEHOLDER; pub const PAYLOADS: &[CuratedPayload] = &[ CuratedPayload { diff --git a/src/dynamic/corpus/prototype_pollution/typescript.rs b/src/dynamic/corpus/prototype_pollution/typescript.rs index 599345e1..0166beca 100644 --- a/src/dynamic/corpus/prototype_pollution/typescript.rs +++ b/src/dynamic/corpus/prototype_pollution/typescript.rs @@ -8,9 +8,9 @@ //! fixtures the static-analysis side consumes). 
use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; -use crate::dynamic::oracle::ProbePredicate; +use crate::dynamic::oracle::{Canary, ProbePredicate}; -const CANARY: &str = "__nyx_canary"; +const CANARY: &str = Canary::PLACEHOLDER; pub const PAYLOADS: &[CuratedPayload] = &[ CuratedPayload { diff --git a/src/dynamic/lang/js_shared.rs b/src/dynamic/lang/js_shared.rs index 013a5187..e7b46287 100644 --- a/src/dynamic/lang/js_shared.rs +++ b/src/dynamic/lang/js_shared.rs @@ -2635,7 +2635,13 @@ pub fn emit_prototype_pollution_harness(_spec: &HarnessSpec) -> HarnessSource { r#"// Nyx dynamic harness — PROTOTYPE_POLLUTION canary trap (Phase 10 / Track J.8). {shim} -const NYX_PP_CANARY = '__nyx_canary'; +// Per-spec canary (Phase 30 / Track N.0): the runner derives a +// cryptographically-random, per-`spec_hash` canary and passes it in via +// NYX_CANARY, substituting the same value into the payload bytes and the +// oracle match. The '__nyx_canary' fallback keeps this source a +// deterministic function of the spec (cache-safe) and preserves the +// legacy behaviour for any path that does not set the env var. +const NYX_PP_CANARY = process.env.NYX_CANARY || '__nyx_canary'; function nyxPrototypePollutionProbe(value) {{ const p = process.env.NYX_PROBE_PATH; diff --git a/src/dynamic/oracle.rs b/src/dynamic/oracle.rs index e348c78b..3c104bfb 100644 --- a/src/dynamic/oracle.rs +++ b/src/dynamic/oracle.rs @@ -342,10 +342,15 @@ pub enum ProbePredicate { /// [`Self::RedirectHostNotIn`] — evaluated across every drained /// probe rather than against a single record. PrototypeCanaryTouched { - /// Canary property name the harness installed on - /// `Object.prototype` (typically `"__nyx_canary"`). Compared - /// case-sensitively against + /// Canary property name, compared case-sensitively against /// [`ProbeKind::PrototypePollution::property`]. 
+ /// + /// The const corpus stores only [`Canary::PLACEHOLDER`] here; at + /// run time [`oracle_fired_full`] is handed the per-spec + /// [`Canary`] the runner substituted into the payload bytes and + /// the harness's `NYX_CANARY` environment, and matches against + /// that instead — so this field is the low-entropy placeholder, + /// never the value actually compared in production. canary: &'static str, }, /// Phase 11 (Track J.9): CRYPTO weak-key entropy predicate. @@ -521,12 +526,43 @@ pub fn oracle_fired(oracle: &Oracle, outcome: &SandboxOutcome, probes: &[SinkPro /// scope. See [`Oracle::StubEvent`] for the semantics of the new /// branch and [`ProbePredicate::StubEventMatches`] for the new /// `Oracle::SinkProbe` cross-cutting predicate. -#[allow(deprecated)] +/// +/// Thin wrapper over [`oracle_fired_full`] with no per-spec canary — +/// every [`ProbePredicate::PrototypeCanaryTouched`] matches against the +/// const corpus's stored [`Canary::PLACEHOLDER`] token. Production +/// callers in the runner use [`oracle_fired_full`] with the per-spec +/// canary; this entry point is preserved for tests and pre-Phase-30 +/// callers. pub fn oracle_fired_with_stubs( oracle: &Oracle, outcome: &SandboxOutcome, probes: &[SinkProbe], stub_events: &[StubEvent], +) -> bool { + oracle_fired_full(oracle, outcome, probes, stub_events, None) +} + +/// Phase 30 (Track N.0): evaluate an oracle with the per-spec +/// verification [`Canary`] threaded in. +/// +/// When `canary` is `Some`, every +/// [`ProbePredicate::PrototypeCanaryTouched`] matches the drained probe's +/// `property` against the runtime canary the runner derived from the +/// finding's `spec_hash` and substituted into the payload bytes + the +/// harness's `NYX_CANARY` environment — rather than the const corpus's +/// low-entropy [`Canary::PLACEHOLDER`] token. 
Keying the match on a +/// per-spec value means a probe record left over from one finding's run +/// (or ambient harness output that happens to mention the historical +/// `__nyx_canary` sentinel) can never satisfy a different finding's +/// oracle. `None` keeps the placeholder-match path for unit tests and +/// any caller that has not derived a per-spec canary. +#[allow(deprecated)] +pub fn oracle_fired_full( + oracle: &Oracle, + outcome: &SandboxOutcome, + probes: &[SinkProbe], + stub_events: &[StubEvent], + canary: Option<&str>, ) -> bool { match oracle { Oracle::SinkProbe { predicates } => { @@ -635,9 +671,9 @@ pub fn oracle_fired_with_stubs( // [`ProbeKind::PrototypePollution`] record whose // `property` matches the canary name. let canary_ok = cross.iter().all(|p| match p { - ProbePredicate::PrototypeCanaryTouched { canary } => { - probes_satisfy_prototype_canary(probes, canary) - } + ProbePredicate::PrototypeCanaryTouched { + canary: placeholder, + } => probes_satisfy_prototype_canary(probes, canary.unwrap_or(placeholder)), _ => true, }); if !canary_ok { @@ -1212,6 +1248,140 @@ pub fn probe_crash_signal(probe: &SinkProbe) -> Option { } } +/// Per-spec verification canary (Phase 30 — Track N.0). +/// +/// Tracks J.1–J.9 (phases 03–11) seeded their probe-based oracles with a +/// single fixed sentinel string, `__nyx_canary`: the *same* low-entropy +/// token appeared in every spec's payload bytes, every prototype-pollution +/// harness's setter trap, and every +/// [`ProbePredicate::PrototypeCanaryTouched`] in the const corpus. A fixed +/// token is wrong on three counts the plan calls out: it is (a) not +/// cryptographically random, (b) not collision-resistant against ambient +/// harness output (anything that prints `__nyx_canary` matches), and (c) not +/// per-spec — a probe record left in a reused workdir from one finding's run +/// could satisfy a different finding's oracle. 
+/// +/// `Canary` replaces it with a value derived per finding from the finding's +/// [`spec_hash`](crate::dynamic::spec::HarnessSpec::spec_hash) and a +/// process-global run nonce. The const corpus carries only the +/// [`PLACEHOLDER`](Canary::PLACEHOLDER) token; the runner computes the real +/// canary once per spec via [`generate`](Canary::generate) + +/// [`render`](Canary::render) and substitutes it into (1) the payload bytes, +/// (2) the harness's `NYX_CANARY` environment variable, and (3) the oracle +/// match (threaded through [`oracle_fired_full`]). All three agree on the +/// same per-spec value at run time while the corpus source stays +/// `const`-declarable. +/// +/// The verdict never depends on the canary's *value* — only on whether the +/// pollution reached it — so deriving it from a fresh run nonce does not +/// break the engine's rerun-determinism contract (identical inputs still +/// produce identical verdicts). +pub struct Canary; + +impl Canary { + /// Placeholder token embedded in the const corpus: payload byte + /// literals, the `canary` field of + /// [`ProbePredicate::PrototypeCanaryTouched`], and the per-language + /// harness's `NYX_CANARY` fallback. Substituted with a per-spec + /// [`render`](Canary::render)ed value at run time. + /// + /// Kept byte-for-byte equal to the historical `__nyx_canary` sentinel so + /// legacy fixtures, the harness env fallback, and the colocated unit + /// tests that exercise the placeholder-match path keep resolving. The + /// Phase 30 audit (`tests/oracle_canary_audit.rs`) asserts every + /// canary-bearing predicate in the corpus uses exactly this constant, so + /// a new ad-hoc literal fails the build. + pub const PLACEHOLDER: &'static str = "__nyx_canary"; + + /// Bits of entropy a [`render`](Canary::render)ed canary carries. 
+ /// + /// [`generate`](Canary::generate) returns 32 bytes and `render` encodes + /// every byte, so a rendered canary is 256 bits — comfortably above the + /// 128-bit floor the Phase 30 audit enforces. + pub const ENTROPY_BITS: u32 = 256; + + /// Derive a 32-byte canary for the finding identified by `spec_hash`. + /// + /// `BLAKE3("nyx.dynamic.canary.v1" ‖ run_nonce ‖ spec_hash)`. The + /// [`run_nonce`] is a process-global value seeded once from the OS + /// CSPRNG (mixed with time + pid as a fallback), so two runs of the same + /// spec draw different canaries and a stale probe record cannot satisfy a + /// later run. Keying on `spec_hash` gives every finding in a single run + /// a distinct canary, so one finding's canary can never collide with + /// another's. Deterministic within a process — the audit relies on this. + pub fn generate(spec_hash: &str) -> [u8; 32] { + let mut h = blake3::Hasher::new(); + h.update(b"nyx.dynamic.canary.v1\0"); + h.update(&run_nonce()); + h.update(b"\0"); + h.update(spec_hash.as_bytes()); + *h.finalize().as_bytes() + } + + /// Render a generated canary as a 64-character lowercase-hex token. + /// + /// Hex keeps the canary safe to embed verbatim as a JSON object key, a + /// JavaScript property name, and a header / filter token without + /// escaping. Every byte is encoded, so the token carries the full + /// [`ENTROPY_BITS`](Canary::ENTROPY_BITS). + pub fn render(bytes: &[u8; 32]) -> String { + let mut s = String::with_capacity(bytes.len() * 2); + for b in bytes { + s.push(char::from_digit((b >> 4) as u32, 16).unwrap()); + s.push(char::from_digit((b & 0x0f) as u32, 16).unwrap()); + } + s + } + + /// Convenience: the per-spec canary already rendered to its run-time + /// string form. Equivalent to `render(&generate(spec_hash))`. + pub fn for_spec(spec_hash: &str) -> String { + Self::render(&Self::generate(spec_hash)) + } +} + +/// Process-global run nonce backing [`Canary::generate`]. 
+/// +/// Seeded once, lazily, from the OS CSPRNG (`/dev/urandom` on Unix) mixed +/// with the wall clock, pid, and a counter so the value is fresh per process +/// but stable within it. If no CSPRNG is reachable the seed rests on the +/// clock and pid alone: distinct per process in practice, but guessable. +fn run_nonce() -> [u8; 32] { + use std::sync::OnceLock; + static RUN_NONCE: OnceLock<[u8; 32]> = OnceLock::new(); + *RUN_NONCE.get_or_init(|| { + let mut h = blake3::Hasher::new(); + h.update(b"nyx.dynamic.run_nonce.v1\0"); + let mut os = [0u8; 32]; + if read_os_entropy(&mut os) { + h.update(&os); + } + // Always mix time + pid + a counter so a missing or blocked CSPRNG + // still yields a fresh, non-repeating seed. + if let Ok(d) = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) { + h.update(&d.as_nanos().to_le_bytes()); + } + h.update(&(std::process::id() as u64).to_le_bytes()); + static CTR: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0); + let c = CTR.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + h.update(&c.to_le_bytes()); + *h.finalize().as_bytes() + }) +} + +/// Fill `buf` from the OS CSPRNG. Returns `false` (caller falls back to the +/// time + pid mixing) when no source is available on the platform.
+fn read_os_entropy(buf: &mut [u8]) -> bool { + #[cfg(unix)] + { + use std::io::Read; + if let Ok(mut f) = std::fs::File::open("/dev/urandom") { + return f.read_exact(buf).is_ok(); + } + } + false +} + #[cfg(test)] mod tests { use super::*; @@ -1829,4 +1999,93 @@ mod tests { let probes = vec![header_emit_probe("Set-Cookie", "noise")]; assert!(!oracle_fired(&oracle, &outcome(), &probes)); } + + // ── Phase 30 (Track N.0): per-spec canary ─────────────────────────── + + #[test] + fn canary_generate_is_deterministic_within_process() { + let a = Canary::generate("deadbeefcafe0001"); + let b = Canary::generate("deadbeefcafe0001"); + assert_eq!(a, b, "same spec_hash must yield the same canary in-process"); + assert_eq!(Canary::for_spec("h"), Canary::for_spec("h")); + } + + #[test] + fn canary_render_is_64_lowercase_hex() { + let bytes = Canary::generate("spec-hash-xyz"); + assert_eq!(bytes.len(), 32, "canary is 32 bytes / 256 bits"); + let r = Canary::render(&bytes); + assert_eq!(r.len(), 64, "render encodes every byte as two hex digits"); + assert!( + r.bytes() + .all(|b| b.is_ascii_hexdigit() && !b.is_ascii_uppercase()), + "render must be lowercase hex: {r}", + ); + assert!(Canary::ENTROPY_BITS >= 128); + assert!(r.len() * 4 >= 128, "rendered canary clears the 128-bit floor"); + } + + #[test] + fn canary_distinct_spec_hashes_yield_distinct_canaries() { + assert_ne!(Canary::for_spec("aaaa"), Canary::for_spec("bbbb")); + // No collisions across a large sweep of distinct spec hashes: + // distinct findings always get distinct canaries. + let mut seen = std::collections::HashSet::new(); + for i in 0..4096u32 { + let sh = format!("{i:016x}"); + assert!( + seen.insert(Canary::for_spec(&sh)), + "canary collision at spec_hash {sh}", + ); + } + } + + #[test] + fn oracle_full_canary_override_matches_runtime_property_not_placeholder() { + // The corpus predicate stores only the placeholder; the runner + // supplies the per-spec canary. 
A probe whose `property` is the + // runtime canary must fire under the override and NOT under the + // stale placeholder. + let runtime = Canary::for_spec("phase30-spec"); + let oracle = Oracle::SinkProbe { + predicates: &[ProbePredicate::PrototypeCanaryTouched { + canary: Canary::PLACEHOLDER, + }], + }; + let probes = vec![prototype_pollution_probe(&runtime, "pwned")]; + // With the per-spec override: fires. + assert!(oracle_fired_full( + &oracle, + &outcome(), + &probes, + &[], + Some(&runtime), + )); + // Without an override (None): the predicate's placeholder does not + // match the runtime property, so it does NOT fire — proving a + // probe carrying the per-spec canary cannot satisfy a placeholder + // match, and vice-versa. + assert!(!oracle_fired_full(&oracle, &outcome(), &probes, &[], None)); + } + + #[test] + fn oracle_full_canary_override_rejects_stale_placeholder_probe() { + // A probe carrying the historical `__nyx_canary` sentinel (e.g. + // left over from a pre-Phase-30 run or ambient output) must NOT + // satisfy a run whose per-spec canary differs. + let runtime = Canary::for_spec("phase30-spec-2"); + let oracle = Oracle::SinkProbe { + predicates: &[ProbePredicate::PrototypeCanaryTouched { + canary: Canary::PLACEHOLDER, + }], + }; + let probes = vec![prototype_pollution_probe(Canary::PLACEHOLDER, "pwned")]; + assert!(!oracle_fired_full( + &oracle, + &outcome(), + &probes, + &[], + Some(&runtime), + )); + } } diff --git a/src/dynamic/probe.rs b/src/dynamic/probe.rs index 727af82f..880e69cd 100644 --- a/src/dynamic/probe.rs +++ b/src/dynamic/probe.rs @@ -326,9 +326,12 @@ pub enum ProbeKind { /// `PrototypePollution` probe. PrototypePollution { /// Property name the host attempted to set on - /// `Object.prototype` — always `"__nyx_canary"` for Phase 10 - /// but parametrised so future per-sink canaries reuse the - /// kind without proliferating variants. + /// `Object.prototype`. 
Pre-Phase-30 this was always the fixed + /// `"__nyx_canary"` sentinel; Phase 30 (Track N.0) feeds the + /// harness a per-spec [`crate::dynamic::oracle::Canary`] via the + /// `NYX_CANARY` environment variable, so this carries the + /// cryptographically-random per-finding token the trap was + /// installed under. property: String, /// Stringified value the host attempted to bind. Echoed /// verbatim so repro tooling can pin the exact payload bytes diff --git a/src/dynamic/runner.rs b/src/dynamic/runner.rs index b646b28b..9628f9ce 100644 --- a/src/dynamic/runner.rs +++ b/src/dynamic/runner.rs @@ -13,7 +13,7 @@ use crate::dynamic::corpus::{ use crate::dynamic::differential; use crate::dynamic::harness::{self, HarnessError}; use crate::dynamic::middleware_demotion; -use crate::dynamic::oracle::{Oracle, oracle_fired_with_stubs, probe_crash_signal}; +use crate::dynamic::oracle::{Canary, Oracle, oracle_fired_full, probe_crash_signal}; use crate::dynamic::probe::{ProbeChannel, SinkProbe}; use crate::dynamic::sandbox::{self, SandboxBackend, SandboxError, SandboxOptions, SandboxOutcome}; use crate::dynamic::spec::HarnessSpec; @@ -463,6 +463,21 @@ pub fn run_spec(spec: &HarnessSpec, opts: &SandboxOptions) -> Result> = effective_opts.probe_channel.clone(); + // ── Phase 30 (Track N.0): per-spec verification canary ────────────── + // Derive a cryptographically-random, per-`spec_hash` canary, hand it to + // the harness via `NYX_CANARY` (the prototype-pollution setter trap and + // any future per-spec sentinel read it from the environment), and thread + // it into the oracle match below. Each payload's bytes have the const + // corpus's `Canary::PLACEHOLDER` token rewritten to this value, so the + // harness trap, the polluted property name, and the oracle all agree on + // a token unique to this finding — a stale probe from another run (or + // ambient output mentioning the historical `__nyx_canary` sentinel) can + // never satisfy this run's oracle. 
+ let run_canary = Canary::for_spec(&spec.spec_hash); + effective_opts + .extra_env + .push(("NYX_CANARY".to_string(), run_canary.clone())); + // Run only vuln (non-benign) payloads in the main loop. let vuln_payloads: Vec<&Payload> = payloads.iter().filter(|p| !p.is_benign).collect(); @@ -510,6 +525,9 @@ pub fn run_spec(spec: &HarnessSpec, opts: &SandboxOptions) -> Result Result Result { - let benign_bytes = materialise_bytes(benign, None) - .map(|b| b.into_owned()) - .unwrap_or_default(); + let benign_bytes = substitute_canary_bytes( + materialise_bytes(benign, None) + .map(|b| b.into_owned()) + .unwrap_or_default(), + &run_canary, + ); if let Some(ch) = &probe_channel { let _ = ch.clear(); } @@ -725,11 +751,12 @@ pub fn run_spec(spec: &HarnessSpec, opts: &SandboxOptions) -> Result bool { } } +/// Rewrite every occurrence of [`Canary::PLACEHOLDER`] in `bytes` to the +/// per-spec `canary` (Phase 30 — Track N.0). +/// +/// Const corpus payloads embed the placeholder token; the runner swaps in +/// the finding's per-spec canary before the harness runs so the polluted +/// property name matches the trap the harness installed from `NYX_CANARY` +/// and the oracle's per-spec match. A cheap no-op for the vast majority of +/// payloads — those that never mention the placeholder return their input +/// buffer unchanged without reallocating. +fn substitute_canary_bytes(bytes: Vec, canary: &str) -> Vec { + let needle = Canary::PLACEHOLDER.as_bytes(); + if needle.is_empty() + || needle.len() > bytes.len() + || !bytes.windows(needle.len()).any(|w| w == needle) + { + return bytes; + } + let repl = canary.as_bytes(); + let mut out = Vec::with_capacity(bytes.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i..].starts_with(needle) { + out.extend_from_slice(repl); + i += needle.len(); + } else { + out.push(bytes[i]); + i += 1; + } + } + out +} + /// Generate a random 16-character hex nonce for OOB callback tracking. 
fn generate_nonce() -> String { use std::time::{SystemTime, UNIX_EPOCH}; diff --git a/tests/oracle_canary_audit.rs b/tests/oracle_canary_audit.rs new file mode 100644 index 00000000..94d3cbee --- /dev/null +++ b/tests/oracle_canary_audit.rs @@ -0,0 +1,214 @@ +//! Phase 30 (Track N.0) — oracle library consolidation + canary uniqueness +//! audit. +//! +//! Tracks J.1–J.9 seeded their probe-based oracles with a single fixed +//! sentinel string (`__nyx_canary`). Phase 30 replaces it with a per-spec +//! [`Canary`] derived from the finding's `spec_hash`, substituted at run time +//! into the payload bytes, the harness's `NYX_CANARY` environment, and the +//! oracle match. This test is the build-time guard the plan calls for: it +//! +//! 1. enumerates every `ProbePredicate` carried by the const corpus and +//! asserts each canary-bearing predicate uses exactly +//! [`Canary::PLACEHOLDER`] (a new ad-hoc literal fails the build); +//! 2. asserts the runtime [`Canary`] clears the 128-bit entropy floor, is +//! deterministic within a process, and is collision-free across a large +//! spec-hash sweep (so distinct findings — and therefore the eval corpora +//! — never share a canary); and +//! 3. classifies *every* `ProbePredicate` variant with an exhaustive match, +//! so adding a new variant without classifying it as canary-bearing or +//! structural fails to compile here. +//! +//! `cargo nextest run --features dynamic --test oracle_canary_audit`. + +#![cfg(feature = "dynamic")] + +use std::collections::HashSet; + +use nyx_scanner::dynamic::corpus::CORPUS; +use nyx_scanner::dynamic::oracle::{Canary, Oracle, ProbePredicate}; + +/// Classify a predicate as canary-bearing (returns its stored canary token) +/// or structural (returns `None`). 
+/// +/// The match is intentionally exhaustive with no `_` arm: a new +/// `ProbePredicate` variant added to the library forces a classification +/// decision here, which is the Phase 30 guard that "CI fails the build if a +/// new ad-hoc canary lands". Structural predicates carry header names, +/// allowlists, thresholds, or needles — intentionally low-entropy, public +/// values that are *not* secret sentinels and must not be treated as +/// canaries. +fn canary_token(p: &ProbePredicate) -> Option<&str> { + match p { + // The one secret-sentinel predicate: its `canary` is the property a + // prototype-pollution sink writes onto `Object.prototype` and the + // oracle matches against the drained probe. + ProbePredicate::PrototypeCanaryTouched { canary } => Some(canary), + + // Structural predicates — no secret sentinel. + ProbePredicate::ArgContains { .. } + | ProbePredicate::ArgEquals { .. } + | ProbePredicate::AnyArgContains(_) + | ProbePredicate::CalleeEquals(_) + | ProbePredicate::MinArgs(_) + | ProbePredicate::StubEventMatches { .. } + | ProbePredicate::DeserializeGadgetInvoked { .. } + | ProbePredicate::TemplateEvalEqual { .. } + | ProbePredicate::XxeEntityExpanded { .. } + | ProbePredicate::HeaderInjected { .. } + | ProbePredicate::HeaderSmuggledInWire { .. } + | ProbePredicate::RedirectHostNotIn { .. } + | ProbePredicate::WeakKeyEntropy { .. } + | ProbePredicate::IdorBoundaryCrossed + | ProbePredicate::OutboundHostNotIn { .. } + | ProbePredicate::QueryResultCountGreaterThan { .. } + | ProbePredicate::JsonParseExcessiveDepth { .. } => None, + } +} + +/// Visit every `ProbePredicate` the corpus carries — both the active +/// `Oracle::SinkProbe { predicates }` slice and the parallel +/// `CuratedPayload::probe_predicates` slice — for every `(cap, lang)` entry. 
+fn for_each_corpus_predicate(mut visit: impl FnMut(&str /*label*/, &[u8] /*bytes*/, &ProbePredicate)) { + for &(_cap, _lang, slice) in CORPUS.entries { + for payload in slice { + if let Oracle::SinkProbe { predicates } = &payload.oracle { + for p in *predicates { + visit(payload.label, payload.bytes, p); + } + } + for p in payload.probe_predicates { + visit(payload.label, payload.bytes, p); + } + } + } +} + +/// No corpus predicate may carry an ad-hoc canary literal: every +/// canary-bearing predicate must reference [`Canary::PLACEHOLDER`], and the +/// owning payload's bytes must embed that placeholder so the runner's +/// run-time substitution actually has a token to rewrite. +#[test] +fn corpus_canaries_use_placeholder_and_are_substitutable() { + let mut canary_predicates = 0usize; + for_each_corpus_predicate(|label, bytes, p| { + let Some(token) = canary_token(p) else { + return; + }; + canary_predicates += 1; + assert_eq!( + token, + Canary::PLACEHOLDER, + "payload {label:?} carries an ad-hoc canary literal {token:?}; \ + canary-bearing predicates must use Canary::PLACEHOLDER so the \ + runner can substitute a per-spec canary", + ); + let needle = Canary::PLACEHOLDER.as_bytes(); + let embedded = bytes.windows(needle.len()).any(|w| w == needle); + assert!( + embedded, + "payload {label:?} carries a PrototypeCanaryTouched predicate but \ + its bytes do not embed Canary::PLACEHOLDER ({:?}); run-time \ + substitution would have nothing to rewrite and the harness trap \ + would never match", + Canary::PLACEHOLDER, + ); + }); + // Sanity: the prototype-pollution + json_parse slices contribute these, + // so the audit must actually have inspected some. A zero here means the + // corpus walk silently stopped finding canary predicates. 
+ assert!( + canary_predicates > 0, + "expected at least one canary-bearing predicate in the corpus", + ); +} + +/// A generated canary is 32 bytes / 256 bits; its rendered form is 64 +/// lowercase-hex characters, clears the 128-bit floor, and is deterministic +/// within a process (the harness env, the payload bytes, and the oracle must +/// all see the same token for a given `spec_hash`). +#[test] +fn canary_entropy_and_determinism() { + assert!( + Canary::ENTROPY_BITS >= 128, + "Canary::ENTROPY_BITS must clear the 128-bit floor", + ); + + let bytes = Canary::generate("spec-hash-under-audit"); + assert_eq!(bytes.len(), 32, "canary is 256 bits of BLAKE3 output"); + + let rendered = Canary::render(&bytes); + assert_eq!(rendered.len(), 64, "render encodes all 32 bytes as hex"); + assert!( + rendered.len() * 4 >= 128, + "rendered canary must carry at least 128 bits", + ); + assert!( + rendered + .bytes() + .all(|b| b.is_ascii_hexdigit() && !b.is_ascii_uppercase()), + "rendered canary must be lowercase hex (safe as a JSON key / JS \ + property / header token): {rendered}", + ); + + // Deterministic within the process. + assert_eq!(bytes, Canary::generate("spec-hash-under-audit")); + assert_eq!( + Canary::for_spec("spec-hash-under-audit"), + Canary::for_spec("spec-hash-under-audit"), + ); + + // Not a fixed string: the rendered canary differs from the historical + // placeholder sentinel. + assert_ne!(Canary::for_spec("anything"), Canary::PLACEHOLDER); +} + +/// Distinct findings get distinct canaries: a large sweep of distinct +/// `spec_hash` values produces no collisions. This is the "no oracle +/// collision in any of the eval corpora" guarantee — every finding in a run +/// has a unique `spec_hash`, hence a unique canary, hence one finding's probe +/// record can never satisfy another's oracle.
+#[test] +fn canary_is_collision_free_across_spec_hash_sweep() { + let mut seen = HashSet::new(); + let n = 50_000u32; + for i in 0..n { + // Shape each input the way real spec hashes look (16 hex chars); the + // sweep varies only the value, never the length. + let spec_hash = format!("{i:016x}"); + let canary = Canary::for_spec(&spec_hash); + assert!( + seen.insert(canary), + "canary collision at spec_hash {spec_hash}", + ); + } + assert_eq!(seen.len() as u32, n, "every spec_hash produced a unique canary"); +} + +/// The byte output of `generate` exercises the full space: across many +/// samples every byte position takes both low and high values, so no position +/// is stuck (a coarse but effective check that the BLAKE3 mixing is wired up +/// rather than, say, a zero-fill). +#[test] +fn canary_byte_positions_are_not_stuck() { + let mut saw_low = [false; 32]; + let mut saw_high = [false; 32]; + for i in 0..512u32 { + let b = Canary::generate(&format!("stuck-check-{i}")); + for (pos, byte) in b.iter().enumerate() { + if *byte < 0x40 { + saw_low[pos] = true; + } + if *byte >= 0xc0 { + saw_high[pos] = true; + } + } + } + for pos in 0..32 { + assert!( + saw_low[pos] && saw_high[pos], + "byte position {pos} looks stuck (low={}, high={})", + saw_low[pos], + saw_high[pos], + ); + } +}