diff --git a/benches/dynamic_bench.rs b/benches/dynamic_bench.rs index 678ea330..4dae488b 100644 --- a/benches/dynamic_bench.rs +++ b/benches/dynamic_bench.rs @@ -18,7 +18,7 @@ use criterion::{Criterion, criterion_group, criterion_main}; #[cfg(feature = "dynamic")] -use nyx_scanner::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot}; +use nyx_scanner::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot, SpecDerivationStrategy}; #[cfg(feature = "dynamic")] use nyx_scanner::labels::Cap; #[cfg(feature = "dynamic")] @@ -39,6 +39,8 @@ fn make_rust_sqli_spec() -> HarnessSpec { sink_file: "tests/dynamic_fixtures/rust/sqli_positive.rs".into(), sink_line: 18, spec_hash: "benchrustsqli0001".into(), + derivation: SpecDerivationStrategy::FromFlowSteps, + stubs_required: vec![], } } @@ -57,6 +59,8 @@ fn make_sqli_spec() -> HarnessSpec { sink_file: "tests/dynamic_fixtures/python/sqli_positive.py".into(), sink_line: 7, spec_hash: "benchsqli000001".into(), + derivation: SpecDerivationStrategy::FromFlowSteps, + stubs_required: vec![], } } @@ -101,7 +105,7 @@ fn bench_sandbox_run_payload(c: &mut Criterion) { }; c.bench_function("sandbox_run_payload", |b| { - b.iter(|| sandbox::run(&harness, payload, &opts).expect("sandbox run")); + b.iter(|| sandbox::run(&harness, &payload.bytes, &opts).expect("sandbox run")); }); } @@ -213,7 +217,7 @@ fn bench_docker_payload_cost(c: &mut Criterion) { c.bench_function("docker_payload_cost", |b| { b.iter(|| { - let _ = sandbox::run(&built, payload, &opts); + let _ = sandbox::run(&built, &payload.bytes, &opts); }); }); } @@ -253,6 +257,8 @@ fn make_js_sqli_spec() -> HarnessSpec { sink_file: "tests/dynamic_fixtures/js/sqli_positive.js".into(), sink_line: 8, spec_hash: "benchjssqli000001".into(), + derivation: SpecDerivationStrategy::FromFlowSteps, + stubs_required: vec![], } } @@ -271,6 +277,8 @@ fn make_go_sqli_spec() -> HarnessSpec { sink_file: "tests/dynamic_fixtures/go/sqli_positive.go".into(), sink_line: 12, spec_hash: "benchgosqli000001".into(), + derivation: SpecDerivationStrategy::FromFlowSteps, + stubs_required: vec![], } } @@ -289,6 +297,8 @@ fn make_java_sqli_spec() -> HarnessSpec { sink_file: "tests/dynamic_fixtures/java/sqli_positive.java".into(), sink_line: 9, spec_hash: "benchjavasqli00001".into(), + derivation: SpecDerivationStrategy::FromFlowSteps, + stubs_required: vec![], } } @@ -307,6 +317,8 @@ fn make_php_sqli_spec() -> HarnessSpec { sink_file: "tests/dynamic_fixtures/php/sqli_positive.php".into(), sink_line: 9, spec_hash: "benchphpsqli000001".into(), + derivation: SpecDerivationStrategy::FromFlowSteps, + stubs_required: vec![], } } diff --git a/src/dynamic/lang/c.rs b/src/dynamic/lang/c.rs index 8fa0e152..f8d4fa7e 100644 --- a/src/dynamic/lang/c.rs +++ b/src/dynamic/lang/c.rs @@ -314,12 +314,20 @@ impl LangEmitter for CEmitter { } /// Phase 26 — C chain-step harness. +/// +/// Shell-wraps `cc` + run so the compiled binary actually executes after +/// the build completes — `ChainStepHarness.command` models a single +/// process, so the build-then-run sequence must collapse to one `sh -c`. fn chain_step(prev_output: Option<&[u8]>) -> ChainStepHarness { let source = "#include \n#include \n\nint main(void) {\n const char *prev = getenv(\"NYX_PREV_OUTPUT\");\n if (prev) fputs(prev, stdout);\n return 0;\n}\n".to_owned(); ChainStepHarness { source, filename: "step.c".to_owned(), - command: vec!["cc".to_owned(), "step.c".to_owned(), "-o".to_owned(), "step".to_owned()], + command: vec![ + "sh".to_owned(), + "-c".to_owned(), + "cc step.c -o step && ./step".to_owned(), + ], extra_env: prev_output .map(|bytes| { vec![( @@ -356,6 +364,7 @@ pub fn emit(spec: &HarnessSpec) -> Result { /// Generate the harness `main.c` for the resolved shape. fn generate_main_c(spec: &HarnessSpec, shape: CShape) -> String { let invocation = invoke_for_shape(spec, shape); + let (entry_open, entry_close) = entry_include_guards(spec); format!( r#"/* Nyx dynamic harness — auto-generated, do not edit (Phase 16 — CShape::{shape:?}). */ @@ -370,8 +379,8 @@ fn generate_main_c(spec: &HarnessSpec, shape: CShape) -> String { * compilation unit. */ static char *nyx_payload(void); -#include "entry.c" - +{entry_open}#include "entry.c" +{entry_close} int main(int argc, char *argv[]) {{ (void)argc; (void)argv; char *payload = nyx_payload(); @@ -430,11 +439,33 @@ static char *nyx_payload(void) {{ "#, shape = shape, invocation = invocation, + entry_open = entry_open, + entry_close = entry_close, ) } +/// Preprocessor wrapper around `#include "entry.c"` that renames the user's +/// `int main(...)` to `__nyx_entry_main(...)` when the spec's entry symbol IS +/// `main` (i.e. a real CLI under Track B). Without this, the entry's `main` +/// collides with the harness's own `main` at link time. +/// +/// Fixture authors who already expose a non-`main` entry name (e.g. +/// `nyx_entry_main` under `tests/dynamic_fixtures/c/main_argv/`) get +/// empty guards. +fn entry_include_guards(spec: &HarnessSpec) -> (&'static str, &'static str) { + if spec.entry_name == "main" { + ("#define main __nyx_entry_main\n", "#undef main\n") + } else { + ("", "") + } +} + fn invoke_for_shape(spec: &HarnessSpec, shape: CShape) -> String { - let entry_fn = &spec.entry_name; + let entry_fn: &str = if spec.entry_name == "main" { + "__nyx_entry_main" + } else { + spec.entry_name.as_str() + }; match shape { CShape::FreeFn => match &spec.payload_slot { PayloadSlot::EnvVar(name) => format!( @@ -450,14 +481,15 @@ fn invoke_for_shape(spec: &HarnessSpec, shape: CShape) -> String { ) } CShape::MainArgv => { - // Rename the user-supplied entry to `nyx_entry_main` via macro so - // it does not collide with the harness `main` symbol when the - // entry source defines `int main(...)`. Fixture authors should - // expose the entry as a function named in `spec.entry_name`. - // // Heap-allocate `new_argv` so a future `PayloadSlot::Argv(n)` with // `n >= 6` cannot overrun a fixed stack array. Slots: 1 // ("nyx_harness") + pad + 1 (payload) + 1 (NULL terminator). + // + // When `spec.entry_name == "main"` the entry's `int main(...)` is + // renamed to `__nyx_entry_main` via the preprocessor guards on + // `#include "entry.c"`, and the call site below targets that + // renamed symbol. Fixtures that already expose a non-`main` + // entry symbol are called by name unchanged. let pad = match &spec.payload_slot { PayloadSlot::Argv(n) => *n, _ => 0, @@ -607,6 +639,40 @@ mod tests { assert!(h6.source.contains("free(new_argv);")); } + #[test] + fn emit_main_argv_renames_main_when_entry_named_main() { + // Real-world Track B CLI vuln: the spec.entry_name IS "main", and the + // entry source defines `int main(int argc, char *argv[])`. Without + // preprocessor rename guards, the entry's `main` collides with the + // harness's own `main` at link time. + let mut spec = make_spec(PayloadSlot::Argv(0)); + spec.entry_kind = EntryKind::CliSubcommand; + spec.entry_name = "main".into(); + let h = emit(&spec).unwrap(); + assert!( + h.source.contains("#define main __nyx_entry_main"), + "rename guard missing from emitted source", + ); + assert!( + h.source.contains("#undef main"), + "undef guard missing — harness `int main(...)` definition follows the include", + ); + assert!( + h.source.contains("__nyx_entry_main(new_argc, new_argv)"), + "harness call site must target the renamed symbol", + ); + // The harness's own `main` must remain a real entry point. + assert!(h.source.contains("int main(int argc, char *argv[])")); + // Guards must NOT fire for fixture-style non-main entry names. + let mut fixture_spec = make_spec(PayloadSlot::Argv(0)); + fixture_spec.entry_kind = EntryKind::CliSubcommand; + fixture_spec.entry_name = "nyx_entry_main".into(); + let fh = emit(&fixture_spec).unwrap(); + assert!(!fh.source.contains("#define main")); + assert!(!fh.source.contains("#undef main")); + assert!(fh.source.contains("nyx_entry_main(new_argc, new_argv)")); + } + #[test] fn emit_libfuzzer_shape_passes_bytes() { let mut spec = make_spec(PayloadSlot::Param(0)); diff --git a/src/dynamic/lang/cpp.rs b/src/dynamic/lang/cpp.rs index 28bab4c5..779242b7 100644 --- a/src/dynamic/lang/cpp.rs +++ b/src/dynamic/lang/cpp.rs @@ -287,12 +287,19 @@ impl LangEmitter for CppEmitter { } /// Phase 26 — C++ chain-step harness. +/// +/// Shell-wraps `c++` + run so the compiled binary actually executes +/// after the build completes (see C-side commentary for the rationale). fn chain_step(prev_output: Option<&[u8]>) -> ChainStepHarness { let source = "#include \n#include \n\nint main() {\n const char *prev = std::getenv(\"NYX_PREV_OUTPUT\");\n if (prev) std::fputs(prev, stdout);\n return 0;\n}\n".to_owned(); ChainStepHarness { source, filename: "step.cpp".to_owned(), - command: vec!["c++".to_owned(), "step.cpp".to_owned(), "-o".to_owned(), "step".to_owned()], + command: vec![ + "sh".to_owned(), + "-c".to_owned(), + "c++ step.cpp -o step && ./step".to_owned(), + ], extra_env: prev_output .map(|bytes| { vec![( @@ -328,6 +335,7 @@ pub fn emit(spec: &HarnessSpec) -> Result { fn generate_main_cpp(spec: &HarnessSpec, shape: CppShape) -> String { let invocation = invoke_for_shape(spec, shape); + let (entry_open, entry_close) = entry_include_guards(spec); format!( r#"// Nyx dynamic harness — auto-generated, do not edit (Phase 16 — CppShape::{shape:?}). @@ -341,8 +349,8 @@ fn generate_main_cpp(spec: &HarnessSpec, shape: CppShape) -> String { static std::string nyx_payload(); -#include "entry.cpp" - +{entry_open}#include "entry.cpp" +{entry_close} int main(int argc, char *argv[]) {{ (void)argc; (void)argv; std::string payload = nyx_payload(); @@ -390,11 +398,29 @@ static std::string nyx_payload() {{ "#, shape = shape, invocation = invocation, + entry_open = entry_open, + entry_close = entry_close, ) } +/// Preprocessor guards that rename the entry source's `int main(...)` to +/// `__nyx_entry_main(...)` when the spec entry symbol IS `main`. Mirrors +/// the C-side fix; without it the user's `main` collides with the harness's +/// own `main` at link time. +fn entry_include_guards(spec: &HarnessSpec) -> (&'static str, &'static str) { + if spec.entry_name == "main" { + ("#define main __nyx_entry_main\n", "#undef main\n") + } else { + ("", "") + } +} + fn invoke_for_shape(spec: &HarnessSpec, shape: CppShape) -> String { - let entry_fn = &spec.entry_name; + let entry_fn: &str = if spec.entry_name == "main" { + "__nyx_entry_main" + } else { + spec.entry_name.as_str() + }; match shape { CppShape::FreeFn => match &spec.payload_slot { PayloadSlot::EnvVar(name) => format!( @@ -539,6 +565,35 @@ mod tests { assert!(h.source.contains("nyx_entry_main(static_cast(argv_storage.size()), new_argv.data())")); } + #[test] + fn emit_main_argv_renames_main_when_entry_named_main() { + // Real-world Track B CLI vuln: spec.entry_name IS "main". Without + // preprocessor rename guards, the entry's `int main(...)` collides + // with the harness's own `main` at link time. + let mut spec = make_spec(PayloadSlot::Argv(0)); + spec.entry_kind = EntryKind::CliSubcommand; + spec.entry_name = "main".into(); + let h = emit(&spec).unwrap(); + assert!( + h.source.contains("#define main __nyx_entry_main"), + "rename guard missing", + ); + assert!(h.source.contains("#undef main"), "undef guard missing"); + assert!( + h.source.contains("__nyx_entry_main(static_cast(argv_storage.size()), new_argv.data())"), + "harness call site must target the renamed symbol", + ); + assert!(h.source.contains("int main(int argc, char *argv[])")); + // Guards must not fire for fixture-style non-main entry names. + let mut fixture_spec = make_spec(PayloadSlot::Argv(0)); + fixture_spec.entry_kind = EntryKind::CliSubcommand; + fixture_spec.entry_name = "nyx_entry_main".into(); + let fh = emit(&fixture_spec).unwrap(); + assert!(!fh.source.contains("#define main")); + assert!(!fh.source.contains("#undef main")); + assert!(fh.source.contains("nyx_entry_main(static_cast(argv_storage.size()), new_argv.data())")); + } + #[test] fn emit_cmake_in_extra_files() { let spec = make_spec(PayloadSlot::Param(0)); diff --git a/src/dynamic/lang/java.rs b/src/dynamic/lang/java.rs index de344eed..64a2f30e 100644 --- a/src/dynamic/lang/java.rs +++ b/src/dynamic/lang/java.rs @@ -83,16 +83,23 @@ impl LangEmitter for JavaEmitter { /// Phase 26 — Java chain-step harness. /// /// Emits a `Step.java` class whose `main` reads `NYX_PREV_OUTPUT` and -/// forwards it on stdout. The Java probe shim is class-level and -/// requires `System`/`java.io.*` imports the chain step already pulls in -/// implicitly; wiring the full shim is tracked alongside the Phase 14 -/// emitter follow-up about probe shim splicing. +/// forwards it on stdout. The command shell-wraps `javac` + `java` so +/// the step actually runs after the build step completes (the +/// `ChainStepHarness.command` slot models a single process). The Java +/// probe shim is class-level and requires `System` / `java.io.*` imports +/// the chain step already pulls in implicitly; wiring the full shim is +/// tracked alongside the Phase 14 emitter follow-up about probe shim +/// splicing. fn chain_step(prev_output: Option<&[u8]>) -> ChainStepHarness { let source = "public class Step {\n public static void main(String[] args) {\n String prev = System.getenv(\"NYX_PREV_OUTPUT\");\n if (prev == null) prev = \"\";\n System.out.print(prev);\n }\n}\n".to_owned(); ChainStepHarness { source, filename: "Step.java".to_owned(), - command: vec!["java".to_owned(), "Step".to_owned()], + command: vec![ + "sh".to_owned(), + "-c".to_owned(), + "javac Step.java && java Step".to_owned(), + ], extra_env: prev_output .map(|bytes| { vec![( diff --git a/src/dynamic/lang/js_shared.rs b/src/dynamic/lang/js_shared.rs index 46a93aa3..fc34de98 100644 --- a/src/dynamic/lang/js_shared.rs +++ b/src/dynamic/lang/js_shared.rs @@ -403,10 +403,21 @@ pub fn emit(spec: &HarnessSpec, is_typescript: bool) -> Result, is_typescript: bool) -> ChainStepHarness { let probe = probe_shim(); let driver = "\nprocess.stdout.write(process.env.NYX_PREV_OUTPUT || '');\n"; + // The chain-step source is pure JS even under the TypeScript emitter + // — the probe shim uses no TS-specific syntax — so we keep the `.ts` + // filename intent (so the workdir reflects which emitter produced + // the step) but stage a `.js` sibling and run that. Without this, + // `node step.ts` fails on stock Node before 22.6 (the + // `--experimental-strip-types` flag) and on any host that has not + // installed `tsx` / `ts-node`. let (filename, command) = if is_typescript { ( "step.ts".to_owned(), - vec!["node".to_owned(), "step.ts".to_owned()], + vec![ + "sh".to_owned(), + "-c".to_owned(), + "cp step.ts step.js && node step.js".to_owned(), + ], ) } else { ( diff --git a/src/dynamic/lang/rust.rs b/src/dynamic/lang/rust.rs index 2a0fe1ad..dca65071 100644 --- a/src/dynamic/lang/rust.rs +++ b/src/dynamic/lang/rust.rs @@ -78,10 +78,17 @@ impl LangEmitter for RustEmitter { /// via the standard emit path. fn chain_step(prev_output: Option<&[u8]>) -> ChainStepHarness { let source = "use std::env;\nuse std::io::{self, Write};\n\nfn main() {\n let prev = env::var(\"NYX_PREV_OUTPUT\").unwrap_or_default();\n let _ = io::stdout().write_all(prev.as_bytes());\n}\n".to_owned(); + // Shell-wrap build + run so the step actually executes the compiled binary. + // `ChainStepHarness.command` models a single process; without the wrap the + // step ends after `rustc` exits and the next chain member sees no output. ChainStepHarness { source, filename: "step.rs".to_owned(), - command: vec!["rustc".to_owned(), "step.rs".to_owned(), "-o".to_owned(), "step".to_owned()], + command: vec![ + "sh".to_owned(), + "-c".to_owned(), + "rustc step.rs -o step && ./step".to_owned(), + ], extra_env: prev_output .map(|bytes| { vec![( diff --git a/src/dynamic/policy.rs b/src/dynamic/policy.rs index c78f0c06..6432baea 100644 --- a/src/dynamic/policy.rs +++ b/src/dynamic/policy.rs @@ -218,6 +218,37 @@ impl Scrubber { text.to_owned() } } + + /// Scrub raw bytes from a sink-side payload capture. Returns the + /// input unchanged when no project secret pattern matches; on a hit, + /// returns a deterministic same-length placeholder derived from the + /// blake3 digest of the input so downstream forensic tooling that + /// keys on payload length (e.g. corpus-promote diffing) keeps its + /// invariants. + /// + /// The deferred Phase 28 follow-up flagged this gap: the textual + /// scrubber already covers `env_snapshot` / `cwd` / `args_repr` / + /// `callee`, but `ProbeWitness::payload_bytes` was passed through + /// raw because curated corpus payloads are deterministic literals + /// known not to contain credentials. Real-world Track B sinks can + /// surface attacker-controlled bytes that contain credentials, and + /// this routes that path through the same regex set as everything + /// else. + pub fn scrub_bytes(&self, bytes: &[u8]) -> Vec { + if !redact::contains_secret(bytes) { + return bytes.to_vec(); + } + // Same-length deterministic placeholder: tile the input's blake3 + // hex digest across `bytes.len()`. Length is preserved so any + // downstream tooling that asserts on payload length (the + // `events.jsonl` size budget, the corpus-promote diff) keeps + // working; content is replaced with a fixed-vocabulary marker + // derived from a one-way hash of the original. + let digest = blake3::hash(bytes).to_hex(); + let hex = digest.as_bytes(); + debug_assert!(!hex.is_empty(), "blake3 hex digest is never empty"); + (0..bytes.len()).map(|i| hex[i % hex.len()]).collect() + } } /// Hash a matched secret into the `>` shape. @@ -562,6 +593,47 @@ mod tests { assert_ne!(a, b); } + #[test] + fn scrub_bytes_passes_through_clean_payload() { + let s = Scrubber::project_default(); + let original = b"".to_vec(); + let out = s.scrub_bytes(&original); + assert_eq!(out, original); + } + + #[test] + fn scrub_bytes_replaces_credential_payload_same_length() { + let s = Scrubber::project_default(); + let original = b"username=admin&token=AKIAFAKETEST00000000&action=login".to_vec(); + let out = s.scrub_bytes(&original); + assert_eq!(out.len(), original.len(), "same-length contract"); + assert!(!out.windows(20).any(|w| w == b"AKIAFAKETEST00000000")); + assert!(out.iter().all(|b| b.is_ascii_hexdigit())); + } + + #[test] + fn scrub_bytes_is_deterministic() { + let s = Scrubber::project_default(); + let original = b"AKIAFAKETEST00000000 payload tail".to_vec(); + let a = s.scrub_bytes(&original); + let b = s.scrub_bytes(&original); + assert_eq!(a, b); + } + + #[test] + fn scrub_bytes_differs_for_different_inputs() { + let s = Scrubber::project_default(); + let a = s.scrub_bytes(b"AKIAFAKETEST00000000 alpha"); + let b = s.scrub_bytes(b"AKIAFAKETEST11111111 alpha"); + assert_ne!(a, b); + } + + #[test] + fn scrub_bytes_handles_empty() { + let s = Scrubber::project_default(); + assert_eq!(s.scrub_bytes(&[]), Vec::::new()); + } + #[test] fn scrub_is_deterministic_btree() { // Same iterator yields the same map; BTreeMap guarantees iteration order. diff --git a/src/dynamic/probe.rs b/src/dynamic/probe.rs index 3be976df..c3ca2818 100644 --- a/src/dynamic/probe.rs +++ b/src/dynamic/probe.rs @@ -185,10 +185,12 @@ impl ProbeWitness { /// the host-side constructor cannot accidentally produce an /// unscrubbed / unbounded witness. Every textual field /// (`env_snapshot` values, `cwd`, each `args_repr` entry) is routed - /// through the scrubber before the witness is serialised; the - /// `payload_bytes` field is left as raw bytes because the curated - /// payload corpus is checked into the repo and grepping it is the - /// only reliable forensic signal for triage. + /// through the scrubber before the witness is serialised, and the + /// truncated `payload_bytes` slice is routed through the + /// byte-aware [`crate::dynamic::policy::Scrubber::scrub_bytes`] so + /// real-world payloads carrying credential tokens are replaced with + /// a deterministic same-length placeholder while curated corpus + /// payloads pass through unchanged. pub fn from_inputs( env: I, cwd: impl Into, @@ -211,10 +213,12 @@ impl ProbeWitness { .collect(); let scrubbed_callee = scrubber.scrub_string(&callee.into()); let scrubbed_cwd = scrubber.scrub_string(&cwd.into()); + let truncated = policy::truncate_payload_bytes(payload); + let scrubbed_payload = scrubber.scrub_bytes(truncated); Self { env_snapshot, cwd: scrubbed_cwd, - payload_bytes: policy::truncate_payload_bytes(payload).to_vec(), + payload_bytes: scrubbed_payload, callee: scrubbed_callee, args_repr: scrubbed_args, }