//! Track-security cross-cutting policy module (Phase 08 — Track C.4 + C.5).
//!
//! Centralises the deny rules and byte-bound limits that the per-run
//! [`crate::dynamic::probe::ProbeWitness`] construction uses to keep
//! captured forensic data both privacy-safe and bounded in size.
//!
//! Two responsibilities, intentionally kept in one module so the security
//! envelope is auditable in a single file:
//!
//! 1. **Env scrubbing** — [`scrub_env`] redacts the host environment when
//!    snapshotted onto a [`crate::dynamic::probe::ProbeWitness`].  Any key
//!    matching a [`DENY_KEY_SUBSTRINGS`] entry (case-insensitive substring
//!    match against the upper-cased key) has its value replaced with
//!    [`REDACTED_VALUE`].  Whitelist semantics (allow-list) were rejected
//!    because the harness env is heterogeneous across CI / local /
//!    container runs; a deny-substring list matches the common-suffix
//!    naming used in practice (`*_TOKEN`, `*_KEY`, `*_SECRET`, …) with no
//!    false negatives on the cases we have evidence for.
//! 2. **Byte bounds** — [`PAYLOAD_CAPTURE_LIMIT_BYTES`] caps the
//!    `payload_bytes` field at 16 KiB so a fuzzer-emitted megabyte payload
//!    does not turn the probe file into a memory hog or balloon downstream
//!    repro artifacts.  [`truncate_payload_bytes`] is the only sanctioned
//!    truncation entry point — every probe construction path goes through
//!    it so the bound is enforced uniformly.
//!
//! The module deliberately depends on `std` only (no third-party crates)
//! so `cargo deny check` and `cargo doc` both see it as a leaf with no
//! transitive license risk.
//!
//! # Phase 28 extension (Track H.5 — PII scrubber)
//!
//! [`Scrubber`] hashes probe-witness values whose textual shape matches a
//! project secret pattern.  The pattern set is the one
//! [`crate::utils::redact`] already applies to dynamic sandbox output —
//! repro bundle `outcome.json` redaction and telemetry payload scrubbing
//! before they hit disk.  Covered shapes: AWS access key IDs, GitHub /
//! Slack / OpenAI tokens, PEM blocks, `password=` / `api_key=` / `secret=`
//! query strings, and `Bearer` headers.  Re-using the redactor's pattern
//! list keeps the rule "what counts as PII" defined in exactly one place
//! across the project — adding a new pattern in `redact.rs` also tightens
//! probe-witness scrubbing without a second registry to maintain.
//!
//! Note on the `--show-suppressed` CLI flag: that flag is a boolean
//! toggle for inline-comment suppression of static findings
//! ([`crate::commands::scan`] `show_suppressed`); it does not consume
//! the secret-pattern set defined here.  A future user-configurable
//! "what counts as a secret in this project" regex list (e.g. a
//! `[scrubber]` section in `default-nyx.conf`) would plug into
//! [`Scrubber::project_default`] alongside the static
//! [`crate::utils::redact`] patterns, not the suppression flag.
//!
//! The witness scrubber differs from the redactor in one respect: instead
//! of erasing the secret behind a fixed redaction placeholder, it replaces
//! the whole value with [`SCRUB_HASH_PREFIX`], followed by a hex prefix and
//! a closing `>`, where the hex prefix is the first 16 hex chars of the
//! BLAKE3 digest of the value.  This preserves enough signal to (a)
//! correlate the same secret across multiple witness fields without
//! exposing it and (b) detect via dedup analysis that two probe runs
//! observed the same credential when a leaked token gets cycled into
//! payloads.

use std::collections::BTreeMap;

use crate::utils::redact;

/// Maximum number of bytes retained in
/// [`crate::dynamic::probe::ProbeWitness::payload_bytes`].
///
/// 16 KiB is the cap the Phase 08 plan calls for; it matches the upper bound
/// any reasonable injection payload will need (the existing curated corpus
/// peaks under 200 B).  Anything larger is cut down to its head by
/// [`truncate_payload_bytes`] — the leading bytes are kept and the tail is
/// dropped — because the head is the prefix the sink actually sees first.
pub const PAYLOAD_CAPTURE_LIMIT_BYTES: usize = 16 * 1024;

/// Placeholder written in place of a denied environment variable's value
/// when [`scrub_env`] redacts it.  Lower-case so it is visually distinct
/// from a real CI env value (which is overwhelmingly upper-snake), and
/// non-empty so a redacted entry cannot be mistaken for a variable that was
/// simply set to the empty string.
// NOTE(review): the literal arrived here as an empty string, apparently
// because its angle-bracketed content was stripped in transit; `<redacted>`
// is the reconstructed value — confirm against version control.
pub const REDACTED_VALUE: &str = "<redacted>";

/// Substrings that mark a key as carrying credential-shaped data.
///
/// Matched case-insensitively: [`is_denied_env_key`] upper-cases the key
/// (ASCII only) and looks for each entry as a substring, so every entry
/// here must itself be upper-case ASCII.  Order is not significant — the
/// first match wins because all matches lead to the same redaction.
///
/// The list is intentionally short and high-precision: false-positive
/// redactions just remove a value from a forensic snapshot, but false
/// negatives leak credentials into a probe file that may be persisted as
/// a repro artifact.
pub const DENY_KEY_SUBSTRINGS: &[&str] = &[
    "TOKEN",
    "SECRET",
    "PASSWORD",
    "PASSWD",
    "API_KEY",
    "APIKEY",
    "PRIVATE_KEY",
    "CREDENTIAL",
    "SESSION",
    "COOKIE",
    "AUTH",
    "BEARER",
    // Provider-specific names, spelled out so the audit trail names them
    // explicitly.  Several are already caught by the generic entries above
    // (`GH_TOKEN` / `GITHUB_TOKEN` / `NPM_TOKEN` / `PYPI_TOKEN` by `TOKEN`,
    // `AWS_SESSION` by `SESSION`); `AWS_ACCESS` and `DOCKER_PASS` are not.
    "AWS_ACCESS",
    "AWS_SESSION",
    "GH_TOKEN",
    "GITHUB_TOKEN",
    "NPM_TOKEN",
    "PYPI_TOKEN",
    "DOCKER_PASS",
];

/// True iff `key` matches any [`DENY_KEY_SUBSTRINGS`] entry under
/// case-insensitive (ASCII) substring comparison.
///
/// Exposed as a standalone predicate so [`crate::dynamic::probe`] tests can
/// reason about individual keys without round-tripping through
/// [`scrub_env`].
pub fn is_denied_env_key(key: &str) -> bool {
    // Upper-case once, then scan; the deny entries are already upper-case.
    let upper = key.to_ascii_uppercase();
    DENY_KEY_SUBSTRINGS
        .iter()
        .any(|needle| upper.contains(needle))
}

/// Redact denied keys' values in an env iterator and collect into a
/// [`BTreeMap`].  `BTreeMap` rather than `HashMap` so the serialised
/// witness is byte-deterministic across runs — repro reproducibility
/// depends on it.
pub fn scrub_env(iter: I) -> BTreeMap where I: IntoIterator, S: Into, { let mut out = BTreeMap::new(); for (k, v) in iter { let k: String = k.into(); let v: String = v.into(); if is_denied_env_key(&k) { out.insert(k, REDACTED_VALUE.to_owned()); } else { out.insert(k, v); } } out } /// Prefix written before the BLAKE3 hex digest by [`Scrubber::scrub_string`] /// when a witness value matches a project secret pattern. Operators /// grepping for leaked credentials in a probe witness see /// `` and know the bytes were classified as PII before /// the file landed on disk. pub const SCRUB_HASH_PREFIX: &str = ">`. Hashing /// rather than dropping the value lets downstream forensic analysis /// dedup repeated occurrences of the same credential across witness /// fields without exposing the credential itself. /// /// Constructed via [`Scrubber::project_default`] for the standard /// pattern set; the type is left as a struct (rather than a free /// function) so future per-project allow-listing can attach to the same /// API surface without breaking call sites. #[derive(Debug, Default, Clone)] pub struct Scrubber { _private: (), } impl Scrubber { /// Scrubber wired to the project-default secret regex set. Cheap to /// construct — holds no compiled state because [`crate::utils::redact`] /// is stateless. pub fn project_default() -> Self { Self { _private: () } } /// True iff `text` contains any project secret pattern (regex set or /// literal substring). Useful for tests asserting that a witness /// field would be scrubbed without allocating the rewritten string. pub fn matches_any(&self, text: &str) -> bool { if redact::contains_secret(text.as_bytes()) { return true; } let lower = text.to_ascii_lowercase(); PII_LITERAL_SUBSTRINGS .iter() .any(|needle| lower.contains(*needle)) } /// Scrub `text`, returning a new `String` whose value is either the /// input unchanged (no pattern matched) or `>` /// (hashes the whole value). 
Hashing the whole value rather than /// each matched substring keeps the rewrite mechanism trivial — the /// witness fields are short forensic strings, not long log lines, /// and shipping the entire field plus a marker is what downstream /// repro tooling expects. pub fn scrub_string(&self, text: &str) -> String { if self.matches_any(text) { hash_token(text) } else { text.to_owned() } } /// Scrub raw bytes from a sink-side payload capture. Returns the /// input unchanged when no project secret pattern matches; on a hit, /// returns a deterministic same-length placeholder derived from the /// blake3 digest of the input so downstream forensic tooling that /// keys on payload length (e.g. corpus-promote diffing) keeps its /// invariants. /// /// The deferred Phase 28 follow-up flagged this gap: the textual /// scrubber already covers `env_snapshot` / `cwd` / `args_repr` / /// `callee`, but `ProbeWitness::payload_bytes` was passed through /// raw because curated corpus payloads are deterministic literals /// known not to contain credentials. Real-world Track B sinks can /// surface attacker-controlled bytes that contain credentials, and /// this routes that path through the same regex set as everything /// else. pub fn scrub_bytes(&self, bytes: &[u8]) -> Vec { if !redact::contains_secret(bytes) { return bytes.to_vec(); } // Same-length deterministic placeholder: tile the input's blake3 // hex digest across `bytes.len()`. Length is preserved so any // downstream tooling that asserts on payload length (the // `events.jsonl` size budget, the corpus-promote diff) keeps // working; content is replaced with a fixed-vocabulary marker // derived from a one-way hash of the original. let digest = blake3::hash(bytes).to_hex(); let hex = digest.as_bytes(); debug_assert!(!hex.is_empty(), "blake3 hex digest is never empty"); (0..bytes.len()).map(|i| hex[i % hex.len()]).collect() } } /// Hash a matched secret into the `>` shape. 
fn hash_token(secret: &str) -> String { let digest = blake3::hash(secret.as_bytes()); let hex = digest.to_hex(); let prefix: String = hex.chars().take(SCRUB_HASH_PREFIX_LEN).collect(); format!("{SCRUB_HASH_PREFIX}{prefix}>") } /// Outcome of [`evaluate`]. /// /// Either `Allow` (let the verifier execute the finding) or `Deny` with /// the rule that fired and an evidence excerpt that triage can quote in /// the audit log. `Deny` is the second security layer above the /// per-witness [`Scrubber`]: the scrubber redacts already-captured /// bytes, while `Deny` short-circuits execution before the sandbox ever /// loads the payload, so the credential never touches the harness in /// the first place. #[derive(Debug, Clone, PartialEq, Eq)] pub enum PolicyDecision { /// Finding cleared every deny rule; the verifier may proceed. Allow, /// Finding matched a deny rule. Deny { /// Stable rule identifier — one of [`DenyRule::CREDENTIALS`], /// [`DenyRule::PRIVATE_KEY`], [`DenyRule::PRODUCTION_ENDPOINT`]. rule: &'static str, /// Logical name of the diag field that produced the matched text /// (e.g. `path`, `message`, `evidence.notes[2]`, /// `flow_steps[1].snippet`). Lets operators triage *where* the /// rule fired without having to re-derive the match from the /// scrubbed excerpt alone. field: String, /// Short text excerpt (max 120 chars, scrubbed via /// [`Scrubber::scrub_string`]) of the offending field so an /// operator can identify *why* the deny fired without having to /// re-derive the match. excerpt: String, }, } impl PolicyDecision { /// Convenience accessor; lets call sites match on the boolean /// outcome before unpacking the typed reason. pub fn is_deny(&self) -> bool { matches!(self, PolicyDecision::Deny { .. }) } } /// Rule-name constants exposed for the /// [`crate::evidence::InconclusiveReason::PolicyDeniedDynamic`] field /// and for tests that need to assert *which* deny rule fired. 
Strings /// rather than an enum so they read identically in JSON output, audit /// logs, and the `Display` impl on `InconclusiveReason`. pub struct DenyRule; impl DenyRule { /// Finding mentions a credential-shaped token (AWS key, GitHub / /// Slack / OpenAI token, `password=` query string, `Bearer` /// header) — re-uses the project-wide secret regex set via /// [`crate::utils::redact::contains_secret`]. pub const CREDENTIALS: &'static str = "credentials"; /// Finding mentions a private key (PEM block opener, OpenSSH /// private key block, base64-shaped key payload). pub const PRIVATE_KEY: &'static str = "private-key"; /// Finding's path or evidence references a production endpoint /// (e.g. `api.prod.example.com`, `*.production.*`, /// `*-prod.amazonaws.com`). Conservative: matched against the /// short list in [`PROD_ENDPOINT_REGEXES`]. pub const PRODUCTION_ENDPOINT: &'static str = "production-endpoint"; } /// Substrings that mark a [`DenyRule::PRIVATE_KEY`] hit on their own, /// independent of the [`crate::utils::redact`] regex set. The redact /// regex covers the `-----BEGIN ... PRIVATE KEY-----` shape; the /// literals below add coverage for evidence-snippet excerpts where the /// trailing newline has been stripped (a common occurrence in CLI /// output that gets folded into a one-line `notes` entry). const PRIVATE_KEY_LITERALS: &[&str] = &[ "-----begin rsa private key", "-----begin openssh private key", "-----begin ec private key", "-----begin private key", "-----begin dsa private key", "-----begin pgp private key", "ssh-rsa aaaa", "ssh-ed25519 aaaa", ]; /// Substrings that mark a [`DenyRule::PRODUCTION_ENDPOINT`] hit. /// /// Conservative starter set: the regex shapes most security teams ban /// from a dynamic re-execution sandbox. Matched case-insensitively as /// a substring of the diag's path / sink callee / flow-step snippets. 
///
/// Despite the `_REGEXES` suffix, the entries are plain substrings: the
/// `*.production.*` and `*-prod.*` shapes are expressed as literal host
/// fragments (`api.prod.`, `.production.`, `-prod.amazonaws.com`, …)
/// rather than through a regex engine.  Every entry must be lower-case,
/// because it is compared against the ASCII-lower-cased text.
///
/// NOTE(review): earlier wording described a looser `".prod"` / `"-prod"` /
/// `"production"` substring set that deliberately false-positives on
/// `productionalize` / `reproduction`.  The entries below do not behave
/// that way — each `production` needle carries a leading `.` or `-` and a
/// trailing `.`, so neither word matches.  Confirm which behaviour is
/// intended before widening or documenting the list further.
const PROD_ENDPOINT_REGEXES: &[&str] = &[
    "api.prod.",
    "api-prod.",
    ".production.",
    "-production.",
    "-prod.amazonaws.com",
    "prod.example.com",
    "prod-api.",
    "prod-db.",
    "prod-cluster.",
];

/// Evaluate `diag` against the cross-cutting security deny list.
///
/// Walks the finding's id, path, message, evidence notes and explanation,
/// the source/sink span paths and snippets, the guard and sanitizer
/// snippets, and each flow step's file, snippet and callee.  Each text is
/// fed to three predicates in this order —
/// [`DenyRule::PRIVATE_KEY`] (via [`PRIVATE_KEY_LITERALS`]),
/// [`DenyRule::CREDENTIALS`] (via [`crate::utils::redact::contains_secret`]),
/// [`DenyRule::PRODUCTION_ENDPOINT`] (via [`PROD_ENDPOINT_REGEXES`]).
/// The first match wins and the verifier short-circuits to
/// [`crate::evidence::InconclusiveReason::PolicyDeniedDynamic`].
///
/// Fields are scanned in collection order, so a hit in an earlier field
/// is reported even if a later field would match a different rule.
/// Within a single text, private-key is checked first (most precise — PEM
/// blocks also satisfy the credentials regex set, so checking it first
/// avoids burying the precise label under a generic one), credentials
/// second, production-endpoint third — the ordering surfaces the most
/// actionable rule label given the leak shape.
pub fn evaluate(diag: &crate::commands::scan::Diag) -> PolicyDecision { let texts = collect_diag_texts(diag); for (field, text) in &texts { if let Some(hit) = match_text(text) { return PolicyDecision::Deny { rule: hit.0, field: field.clone(), excerpt: excerpt_with_scrubber(hit.1), }; } } PolicyDecision::Allow } /// Collect every text fragment from `diag` paired with a stable name for /// the source field. The returned field names are intentionally /// human-readable (e.g. `evidence.notes[2]`, `flow_steps[1].snippet`) /// rather than enum variants so they read identically in audit logs and /// in `Display` output. fn collect_diag_texts(diag: &crate::commands::scan::Diag) -> Vec<(String, String)> { let mut out: Vec<(String, String)> = Vec::new(); if !diag.id.is_empty() { out.push(("id".into(), diag.id.clone())); } if !diag.path.is_empty() { out.push(("path".into(), diag.path.clone())); } if let Some(msg) = diag.message.as_ref() { out.push(("message".into(), msg.clone())); } if let Some(ev) = diag.evidence.as_ref() { for (i, note) in ev.notes.iter().enumerate() { out.push((format!("evidence.notes[{i}]"), note.clone())); } if let Some(exp) = ev.explanation.as_ref() { out.push(("evidence.explanation".into(), exp.clone())); } for (label, s) in [("source", &ev.source), ("sink", &ev.sink)] { if let Some(span) = s.as_ref() { out.push((format!("evidence.{label}.path"), span.path.clone())); if let Some(sn) = span.snippet.as_ref() { out.push((format!("evidence.{label}.snippet"), sn.clone())); } } } for (i, span) in ev.guards.iter().enumerate() { if let Some(sn) = span.snippet.as_ref() { out.push((format!("evidence.guards[{i}].snippet"), sn.clone())); } } for (i, span) in ev.sanitizers.iter().enumerate() { if let Some(sn) = span.snippet.as_ref() { out.push((format!("evidence.sanitizers[{i}].snippet"), sn.clone())); } } for (i, step) in ev.flow_steps.iter().enumerate() { if !step.file.is_empty() { out.push((format!("flow_steps[{i}].file"), step.file.clone())); } if let Some(sn) 
= step.snippet.as_ref() { out.push((format!("flow_steps[{i}].snippet"), sn.clone())); } if let Some(callee) = step.callee.as_ref() { out.push((format!("flow_steps[{i}].callee"), callee.clone())); } } } out } /// Match a single text against the deny set. Returns /// `Some((rule_name, matched_text))` on hit, `None` otherwise. Matched /// text is the original text (not the rule needle) so the excerpt /// surfaced on the verdict shows the operator *which* field caused the /// refusal, not just the rule that fired. fn match_text(text: &str) -> Option<(&'static str, &str)> { if text.is_empty() { return None; } let lower = text.to_ascii_lowercase(); // Private-key literals checked first: PEM blocks also satisfy the // generic credentials regex set in [`crate::utils::redact`], so a // PEM hit would otherwise misclassify as `credentials`. Surfacing // the more precise rule lets operators triage the leak shape from // the verdict alone. if PRIVATE_KEY_LITERALS.iter().any(|n| lower.contains(*n)) { return Some((DenyRule::PRIVATE_KEY, text)); } if redact::contains_secret(text.as_bytes()) { return Some((DenyRule::CREDENTIALS, text)); } if PROD_ENDPOINT_REGEXES.iter().any(|n| lower.contains(*n)) { return Some((DenyRule::PRODUCTION_ENDPOINT, text)); } None } /// Build a short excerpt suitable for embedding in a /// [`crate::evidence::InconclusiveReason::PolicyDeniedDynamic`]. /// /// Routes the text through [`Scrubber::scrub_string`] first so the /// excerpt itself cannot leak the credential, then truncates to 120 /// `chars` to keep the audit log compact. Truncation walks /// codepoints (not bytes) because PROD_ENDPOINT hits pass through the /// scrubber unchanged — a long file-path or snippet with non-ASCII /// content (e.g. Unicode in a source comment) would otherwise panic /// the verifier on a mid-codepoint byte slice. 
fn excerpt_with_scrubber(text: &str) -> String { let scrubbed = Scrubber::project_default().scrub_string(text); let mut indices = scrubbed.char_indices(); match indices.nth(120) { None => scrubbed, Some((cut, _)) => format!("{}…", &scrubbed[..cut]), } } /// Truncate `bytes` to at most [`PAYLOAD_CAPTURE_LIMIT_BYTES`]. /// /// Head-keeping: the prefix the sink reads first is retained; the tail is /// dropped. Returns `bytes` unchanged when it already fits the cap so /// callers can use the return value without allocating in the common case. pub fn truncate_payload_bytes(bytes: &[u8]) -> &[u8] { if bytes.len() <= PAYLOAD_CAPTURE_LIMIT_BYTES { bytes } else { &bytes[..PAYLOAD_CAPTURE_LIMIT_BYTES] } } #[cfg(test)] mod tests { use super::*; #[test] fn deny_substring_match_is_case_insensitive() { assert!(is_denied_env_key("AWS_SECRET_ACCESS_KEY")); assert!(is_denied_env_key("aws_secret_access_key")); assert!(is_denied_env_key("MyToken")); assert!(is_denied_env_key("DATABASE_PASSWORD")); } #[test] fn non_credential_keys_pass_through() { assert!(!is_denied_env_key("PATH")); assert!(!is_denied_env_key("HOME")); assert!(!is_denied_env_key("NYX_PAYLOAD")); } #[test] fn scrub_redacts_denied_keys_and_keeps_others() { let env = vec![ ("PATH".to_owned(), "/usr/bin".to_owned()), ("AWS_SECRET_ACCESS_KEY".to_owned(), "AKIA...".to_owned()), ("HOME".to_owned(), "/home/x".to_owned()), ]; let scrubbed = scrub_env(env); assert_eq!(scrubbed.get("PATH").map(String::as_str), Some("/usr/bin")); assert_eq!(scrubbed.get("HOME").map(String::as_str), Some("/home/x")); assert_eq!( scrubbed.get("AWS_SECRET_ACCESS_KEY").map(String::as_str), Some(REDACTED_VALUE) ); } #[test] fn truncate_keeps_short_payloads_unchanged() { let bytes = b"short payload"; assert_eq!(truncate_payload_bytes(bytes), bytes); } #[test] fn truncate_caps_long_payloads_at_limit() { let bytes = vec![b'A'; PAYLOAD_CAPTURE_LIMIT_BYTES + 100]; let truncated = truncate_payload_bytes(&bytes); assert_eq!(truncated.len(), 
PAYLOAD_CAPTURE_LIMIT_BYTES); assert!(truncated.iter().all(|b| *b == b'A')); } #[test] fn truncate_at_exact_boundary_unchanged() { let bytes = vec![0u8; PAYLOAD_CAPTURE_LIMIT_BYTES]; assert_eq!( truncate_payload_bytes(&bytes).len(), PAYLOAD_CAPTURE_LIMIT_BYTES ); } #[test] fn scrubber_passes_through_clean_value() { let s = Scrubber::project_default(); let out = s.scrub_string("hello world"); assert_eq!(out, "hello world"); assert!(!s.matches_any("hello world")); } #[test] fn scrubber_hashes_aws_key_value() { let s = Scrubber::project_default(); let value = "key=AKIAFAKETEST00000000"; assert!(s.matches_any(value)); let out = s.scrub_string(value); assert!(out.starts_with(SCRUB_HASH_PREFIX), "got {out}"); assert!(out.ends_with('>')); assert!(!out.contains("AKIAFAKETEST00000000")); } #[test] fn scrubber_hashes_project_stub_secret() { let s = Scrubber::project_default(); let value = "nyx-stub-secret-abc123-deadbeef"; assert!(s.matches_any(value)); let out = s.scrub_string(value); assert!(out.starts_with(SCRUB_HASH_PREFIX), "got {out}"); assert!(!out.contains("abc123-deadbeef")); } #[test] fn scrubber_hash_is_stable_for_same_input() { let s = Scrubber::project_default(); let a = s.scrub_string("AKIAFAKETEST00000000"); let b = s.scrub_string("AKIAFAKETEST00000000"); assert_eq!(a, b); } #[test] fn scrubber_hash_differs_for_different_inputs() { let s = Scrubber::project_default(); let a = s.scrub_string("AKIAFAKETEST00000000"); let b = s.scrub_string("AKIAFAKETEST11111111"); assert_ne!(a, b); } #[test] fn scrub_bytes_passes_through_clean_payload() { let s = Scrubber::project_default(); let original = b"".to_vec(); let out = s.scrub_bytes(&original); assert_eq!(out, original); } #[test] fn scrub_bytes_replaces_credential_payload_same_length() { let s = Scrubber::project_default(); let original = b"username=admin&token=AKIAFAKETEST00000000&action=login".to_vec(); let out = s.scrub_bytes(&original); assert_eq!(out.len(), original.len(), "same-length contract"); 
assert!(!out.windows(20).any(|w| w == b"AKIAFAKETEST00000000")); assert!(out.iter().all(|b| b.is_ascii_hexdigit())); } #[test] fn scrub_bytes_is_deterministic() { let s = Scrubber::project_default(); let original = b"AKIAFAKETEST00000000 payload tail".to_vec(); let a = s.scrub_bytes(&original); let b = s.scrub_bytes(&original); assert_eq!(a, b); } #[test] fn scrub_bytes_differs_for_different_inputs() { let s = Scrubber::project_default(); let a = s.scrub_bytes(b"AKIAFAKETEST00000000 alpha"); let b = s.scrub_bytes(b"AKIAFAKETEST11111111 alpha"); assert_ne!(a, b); } #[test] fn scrub_bytes_handles_empty() { let s = Scrubber::project_default(); assert_eq!(s.scrub_bytes(&[]), Vec::::new()); } #[test] fn scrub_is_deterministic_btree() { // Same iterator yields the same map; BTreeMap guarantees iteration order. let env = vec![ ("B".to_owned(), "1".to_owned()), ("A".to_owned(), "2".to_owned()), ]; let m = scrub_env(env); let keys: Vec<&str> = m.keys().map(String::as_str).collect(); assert_eq!(keys, vec!["A", "B"]); } }