feat(dynamic): enhance corpus sync script with improved payload parsing, registry checks, and expanded validation logic

2026-06-21 20:18:06 +02:00 · 2026-06-01 22:51:05 -05:00 · 2026-06-01 22:51:05 -05:00 · 8ee6e3af7c
commit 8ee6e3af7c
parent 467d41dcfb
22 changed files with 810 additions and 230 deletions
--- a/src/cfg/literals.rs
+++ b/src/cfg/literals.rs
@ -2544,6 +2544,37 @@ pub(super) fn def_use(
                    }
                }
            }
+            // Java `enhanced_for_statement` binds the loop variable on the
+            // `name` field and the iterable on the `value` field; Ruby's
+            // `for x in coll` uses `pattern`/`value`.  Neither uses the
+            // JS/Python `left`/`right` convention, so without this mapping
+            // the loop binding was never recorded as a define and taint on
+            // the iterable could not reach the loop variable (OWASP's
+            // dominant `for (Cookie c : req.getCookies())` shape).
+            if left.is_none() && right.is_none() {
+                if let Some(v) = ast.child_by_field_name("value") {
+                    left = ast
+                        .child_by_field_name("name")
+                        .or_else(|| ast.child_by_field_name("pattern"));
+                    right = Some(v);
+                }
+            }
+            // PHP `foreach ($coll as $v)` / `foreach ($coll as $k => $v)`:
+            // the iterable and binding are unnamed children separated by the
+            // `as` keyword (only `body` is a named field).  Map the binding
+            // onto `left` and the iterable onto `right` so the shared
+            // define/use logic below records the loop variable.
+            if left.is_none() && right.is_none() && ast.kind() == "foreach_statement" {
+                let mut cursor = ast.walk();
+                let kids: Vec<Node> = ast.children(&mut cursor).collect();
+                if let Some(as_pos) = kids.iter().position(|c| c.kind() == "as") {
+                    right = kids[..as_pos].iter().rev().find(|c| c.is_named()).copied();
+                    left = kids[as_pos + 1..]
+                        .iter()
+                        .find(|c| c.is_named() && lookup(lang, c.kind()) != Kind::Block)
+                        .copied();
+                }
+            }
            if left.is_none() && right.is_none() {
                // C-style for, defer to default ident collection.
                let mut idents = Vec::new();
--- a/src/cfg/mod.rs
+++ b/src/cfg/mod.rs
@ -2067,6 +2067,32 @@ fn is_binary_expr_kind(kind: &str, lang: &str) -> bool {
    }
 }

+/// Text a for-each loop's iterable expression is classified against.
+///
+/// A subscript / index iterable (`$_GET['x']`, `params[:list]`, `arr[i]`)
+/// yields the text of its **base object**: taint sources are keyed on the
+/// base name (`$_GET`, `params`), and the trailing index would otherwise
+/// break the word-boundary suffix match in `classify`.  Any other iterable
+/// (method call, member chain, bare identifier) yields its full text.
+fn iterable_label_text(iter: Node, code: &[u8]) -> Option<String> {
+    let is_indexed = matches!(
+        iter.kind(),
+        "subscript_expression" | "subscript" | "index_expression" | "element_reference"
+    );
+    // The base is looked up under `object`, `operand`, then `value`; the
+    // first child is the fallback when none of those fields is present.
+    let base_text = is_indexed
+        .then(|| {
+            ["object", "operand", "value"]
+                .into_iter()
+                .find_map(|field| iter.child_by_field_name(field))
+                .or_else(|| iter.child(0))
+                .and_then(|base| text_of(base, code))
+        })
+        .flatten();
+    base_text.or_else(|| text_of(iter, code))
+}
+
 /// Create a node in one short borrow and optionally attach a taint label.
 #[allow(clippy::too_many_arguments)]
 pub(super) fn push_node<'a>(
@ -2208,6 +2234,51 @@ pub(super) fn push_node<'a>(
        text = iter_text;
    }

+    // Java `for (T x : iter)`: tree-sitter-java emits `enhanced_for_statement`
+    // with the iterable on the `value` field.  Classify against the iterable
+    // text so a source-returning call (`req.getCookies()`,
+    // `req.getParameterValues(..)`) lights up a Source on the loop node and
+    // the loop binding inherits its taint — the same loop-binding-inherits-
+    // iterator-taint contract the JS/Python rewrites above provide.  The
+    // loop variable itself is recorded as a define by `def_use`'s Kind::For
+    // arm (via the `name`/`value` mapping), so the Source-labeled loop node
+    // taints the binding directly.
+    if lang == "java"
+        && ast.kind() == "enhanced_for_statement"
+        && let Some(value) = ast.child_by_field_name("value")
+        && let Some(iter_text) = iterable_label_text(value, code)
+    {
+        text = iter_text;
+    }
+
+    // PHP `foreach ($iter as $v)` / `foreach ($iter as $k => $v)`: the
+    // iterable is the named child immediately preceding the `as` keyword
+    // (only `body` is a named field).  Classify against the iterable text so
+    // a superglobal/source iterable (`$_GET[..]`, `$_POST[..]`) taints the
+    // loop binding, matching the JS/Python/Java rewrites.
+    if lang == "php" && ast.kind() == "foreach_statement" {
+        let mut cursor = ast.walk();
+        let kids: Vec<Node> = ast.children(&mut cursor).collect();
+        if let Some(as_pos) = kids.iter().position(|c| c.kind() == "as")
+            && let Some(iter_node) = kids[..as_pos].iter().rev().find(|c| c.is_named()).copied()
+            && let Some(iter_text) = iterable_label_text(iter_node, code)
+        {
+            text = iter_text;
+        }
+    }
+
+    // Ruby `for x in coll`: tree-sitter-ruby's `for` node carries the
+    // iterable on the `value` field.  (The idiomatic `coll.each { |x| }`
+    // form is a method call with a block and is handled by the call/block
+    // machinery, not here.)
+    if lang == "ruby"
+        && ast.kind() == "for"
+        && let Some(value) = ast.child_by_field_name("value")
+        && let Some(iter_text) = iterable_label_text(value, code)
+    {
+        text = iter_text;
+    }
+
    // If this is a declaration/expression wrapper or an assignment that
    // *contains* a call, prefer the first inner call identifier instead of
    // the whole line.  Track the inner call's byte span so we can populate
--- a/src/cfg_analysis/guards.rs
+++ b/src/cfg_analysis/guards.rs
@ -2493,6 +2493,18 @@ fn local_is_param_derived<'a>(
            continue;
        }
        found_def = true;
+        // A `foreach` / `for-each` loop binding iterates collection
+        // *elements*, not a direct parameter pass-through.  Even when the
+        // iterable is a bare parameter (`foreach ($param as $v)`), the
+        // per-element values are not simple wrapper plumbing, so do not
+        // clear them as parameter-derived — keep the structural finding
+        // for `foreach ($param as $v) { sink($v) }` shapes (literal-keyed
+        // arrays are already suppressed earlier by
+        // `sink_arg_uses_safe_foreach_key`).
+        if info.kind == StmtKind::Loop {
+            all_def_clear = false;
+            break;
+        }
        if info
            .taint
            .labels
--- a/src/dynamic/build_pool/ruby.rs
+++ b/src/dynamic/build_pool/ruby.rs
@ -53,7 +53,23 @@ impl BuildPool for RubyPool {
        let start = Instant::now();

        // `bundle check` short-circuits when the host already has every gem.
-        if let Ok(o) = self.bundle(workdir).arg("check").output()
+        //
+        // Run the check with the *runtime* environment — plain system gems, no
+        // `GEM_HOME`/`BUNDLE_PATH` override.  The harness is executed as
+        // `ruby harness.rb`, whose `require 'bundler/setup'` resolves against
+        // the system gem path, so the build-time check must consult that same
+        // path to predict whether the run will succeed.  The hermetic
+        // `GEM_HOME` override (below) exists only to give `bundle install` a
+        // writable, sudo-free target for *missing* gems; applying it to the
+        // check breaks Bundler 1.x's ability to see an already-installed system
+        // gem (e.g. `rack`), turning a satisfiable Gemfile into a spurious
+        // BuildFailed.
+        let mut check = base_command(&self.bundle_bin);
+        check.current_dir(workdir);
+        if let Some(cache) = pool_cache_dir("ruby", "bootsnap") {
+            check.env("BOOTSNAP_CACHE_DIR", cache);
+        }
+        if let Ok(o) = check.arg("check").output()
            && o.status.success()
        {
            return PoolCompileResult {
--- a/src/dynamic/build_pool/rust.rs
+++ b/src/dynamic/build_pool/rust.rs
@ -18,8 +18,8 @@

 use super::{BuildPool, PoolCompileResult, base_command, binary_runnable, pool_cache_dir};
 use blake3::Hasher;
-use std::path::Path;
-use std::time::Instant;
+use std::path::{Path, PathBuf};
+use std::time::{Duration, Instant};

 pub struct RustPool {
    cargo_bin: String,
@ -79,6 +79,23 @@ impl BuildPool for RustPool {
            }
        };

+        // Serialise build + copy across processes for this shared target dir.
+        //
+        // The target dir is keyed only on the Cargo manifest hash, so every
+        // fixture that shares a `Cargo.toml` compiles the same bin name
+        // (`nyx_harness`) into the same `release/nyx_harness` path here.
+        // `cargo` already serialises the *build* across processes via its own
+        // target lock, but releases that lock the moment it exits — before the
+        // copy below moves `release/nyx_harness` to the caller's per-fixture
+        // cache slot.  A second process's `cargo build` landing in that window
+        // overwrites `release/nyx_harness`, so we copy a *different* fixture's
+        // binary into our slot and poison its build cache (observed as
+        // cross-fixture verdict corruption under a parallel `cargo test`).
+        // Holding this lock across build+copy folds the copy into the existing
+        // serialised section, so it adds the copy's few milliseconds, not a
+        // new build barrier.
+        let _build_lock = TargetDirLock::acquire(&target_dir);
+
        let mut cmd = base_command(&self.cargo_bin);
        cmd.args(["build", "--release"])
            .current_dir(workdir)
@ -143,6 +160,78 @@ fn default_cargo_home() -> String {
        .unwrap_or_else(|_| ".cargo".to_owned())
 }

+/// Cross-process advisory lock guarding build+copy for a shared
+/// `CARGO_TARGET_DIR` (see the call site in [`RustPool::compile_batch`]).
+///
+/// Implemented as an atomic `create_new` (O_EXCL) lockfile so it works across
+/// the separate processes a parallel `cargo test` spawns — an in-process
+/// `Mutex` would not.  A lockfile older than `STALE_AFTER` is stolen so a
+/// crashed holder cannot wedge the pool (a live holder that runs longer is
+/// stolen from too), and acquisition gives up after `MAX_WAIT`, proceeding
+/// unlocked, so a pathological case degrades instead of deadlocking.
+struct TargetDirLock {
+    path: PathBuf,
+    /// Holder tag (pid + thread id) stored in the lockfile; `None` if not
+    /// acquired.  `Drop` removes the file only while it still carries it.
+    token: Option<String>,
+}
+
+impl TargetDirLock {
+    fn acquire(target_dir: &Path) -> Self {
+        const MAX_WAIT: Duration = Duration::from_secs(300);
+        const STALE_AFTER: Duration = Duration::from_secs(180);
+        let path = target_dir.join(".nyx-pool-build.lock");
+        let token = format!("{}-{:?}", std::process::id(), std::thread::current().id());
+        let start = Instant::now();
+        let mut spins: u64 = 0;
+        loop {
+            match std::fs::OpenOptions::new()
+                .write(true)
+                .create_new(true)
+                .open(&path)
+            {
+                Ok(mut f) => {
+                    use std::io::Write;
+                    let _ = writeln!(f, "{token}");
+                    return Self { path, token: Some(token) };
+                }
+                Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
+                    // Steal a stale lock left behind by a crashed holder.
+                    if let Ok(meta) = std::fs::metadata(&path)
+                        && let Ok(mtime) = meta.modified()
+                        && mtime.elapsed().map(|d| d > STALE_AFTER).unwrap_or(false)
+                    {
+                        let _ = std::fs::remove_file(&path);
+                        continue;
+                    }
+                    if start.elapsed() > MAX_WAIT {
+                        // Best-effort: a slow build beats a deadlock.
+                        return Self { path, token: None };
+                    }
+                    let nap = 10u64.saturating_add(spins.min(40).saturating_mul(2));
+                    std::thread::sleep(Duration::from_millis(nap));
+                    spins = spins.saturating_add(1);
+                }
+                // Cannot create the lockfile (perms / race on dir) — proceed
+                // unlocked rather than fail the build outright.
+                Err(_) => return Self { path, token: None },
+            }
+        }
+    }
+}
+
+impl Drop for TargetDirLock {
+    fn drop(&mut self) {
+        // A *different* tag means the lock was stolen as stale: not ours now.
+        if let Some(token) = self.token.as_deref()
+            && !std::fs::read_to_string(&self.path)
+                .is_ok_and(|s| !s.trim().is_empty() && s.trim() != token)
+        {
+            let _ = std::fs::remove_file(&self.path);
+        }
+    }
+}
+
 /// Stable short hash of the named manifest files under `workdir`.
 fn hash_files(workdir: &Path, files: &[&str]) -> String {
    let mut h = Hasher::new();
--- a/src/dynamic/build_sandbox.rs
+++ b/src/dynamic/build_sandbox.rs
@ -535,8 +535,19 @@ fn try_bundle_install(workdir: &Path) -> Result<(), String> {
 }

 fn bundle_check(bundle: &str, workdir: &Path) -> Result<bool, String> {
-    let output = ruby_build_command(bundle, workdir)
+    // Run with the runtime environment (plain system gems), NOT the hermetic
+    // `GEM_HOME`/`BUNDLE_PATH` override that `ruby_build_command` applies.  The
+    // harness runs as `ruby harness.rb` and resolves its `require`s against the
+    // system gem path, so the check must too; the override only breaks Bundler
+    // 1.x's view of already-installed system gems and produces spurious
+    // BuildFailed for a Gemfile the host can already satisfy.  See the parallel
+    // comment in `RubyPool::compile_batch`.
+    let output = Command::new(bundle)
        .arg("check")
+        .current_dir(workdir)
+        .env_clear()
+        .env("PATH", std::env::var("PATH").unwrap_or_default())
+        .env("HOME", std::env::var("HOME").unwrap_or_default())
        .output()
        .map_err(|e| format!("bundle check: {e}"))?;
    Ok(output.status.success())
@ -1103,8 +1114,37 @@ fn try_compile_java_with_toolchain(
        args.push(rel.to_string());
    }
    if lib_on_cp {
+        // Build an explicit, absolute classpath: `<workdir>` plus every jar
+        // under `<workdir>/lib`.  Two independent reasons rule out the
+        // shorthand `.:lib/*`:
+        //   1. The javac pool worker is a long-lived JVM and the JDK compiler
+        //      API has no per-task working directory (it sets `user.dir`
+        //      defensively, but that does not change file/classpath
+        //      resolution in an already-running JVM), so a *relative* entry
+        //      resolves against the worker's launch dir, not `<workdir>`.
+        //   2. The `lib/*` classpath wildcard is expanded by the `javac`
+        //      launcher, not by `ToolProvider.getSystemJavaCompiler().run`
+        //      (the in-process path the pool uses), so a `*` entry silently
+        //      contributes no jars there.
+        // Either way the Maven-resolved framework jars under `<workdir>/lib`
+        // go missing and framework imports fail to compile
+        // ("package ... does not exist").  Enumerating the jars explicitly is
+        // unambiguous for both the pool and the direct-spawn javac path.
+        let mut cp = workdir.to_string_lossy().into_owned();
+        let mut jars: Vec<PathBuf> = std::fs::read_dir(workdir.join("lib"))
+            .into_iter()
+            .flatten()
+            .flatten()
+            .map(|e| e.path())
+            .filter(|p| p.extension().map(|x| x == "jar").unwrap_or(false))
+            .collect();
+        jars.sort();
+        for jar in &jars {
+            cp.push(':');
+            cp.push_str(&jar.to_string_lossy());
+        }
        args.push("-cp".to_owned());
-        args.push(".:lib/*".to_owned());
+        args.push(cp);
    }
    for src in &sources {
        args.push(src.to_string_lossy().into_owned());
--- a/src/dynamic/framework/adapters/js_routes.rs
+++ b/src/dynamic/framework/adapters/js_routes.rs
@ -963,10 +963,8 @@ fn collect_options_middleware_names(args: Node<'_>, bytes: &[u8], target: &str)
            };
            let key = key_raw.trim_matches(['\'', '"', '`']);
            match key {
-                "handler" => {
-                    if view_arg_references(value, bytes, target) {
-                        handler_matches = true;
-                    }
+                "handler" if view_arg_references(value, bytes, target) => {
+                    handler_matches = true;
                }
                "onRequest" | "preParsing" | "preValidation" | "preHandler" => {
                    collect_hook_value_names(value, bytes, &mut hook_names);
@ -1052,10 +1050,8 @@ fn parse_options_route(args: Node<'_>, bytes: &[u8], target: &str) -> Option<(Ht
                    let text = value.utf8_text(bytes).ok().unwrap_or("");
                    url = Some(strip_quotes(text).to_owned());
                }
-                "handler" => {
-                    if view_arg_references(value, bytes, target) {
-                        handler_matches = true;
-                    }
+                "handler" if view_arg_references(value, bytes, target) => {
+                    handler_matches = true;
                }
                _ => {}
            }
--- a/src/dynamic/lang/java.rs
+++ b/src/dynamic/lang/java.rs
@ -2399,7 +2399,7 @@ public class NyxHarness {{
            "NyxHarness".to_owned(),
        ],
        extra_files: Vec::new(),
-        entry_subpath: None,
+        entry_subpath: Some(format!("{entry_class}.java")),
    }
 }

@ -6418,7 +6418,7 @@ mod tests {
    #[test]
    fn emit_dispatches_to_crypto_harness_when_cap_is_crypto() {
        let h = emit(&make_crypto_spec(
-            "tests/dynamic_fixtures/crypto/java/Vuln.java",
+            "tests/dynamic_fixtures/crypto/java/vuln.java",
            "run",
        ))
        .unwrap();
@ -6435,7 +6435,7 @@ mod tests {
    #[test]
    fn emit_crypto_harness_routes_through_reflective_entry_invocation() {
        let h = emit_crypto_harness(&make_crypto_spec(
-            "tests/dynamic_fixtures/crypto/java/Vuln.java",
+            "tests/dynamic_fixtures/crypto/java/vuln.java",
            "run",
        ));
        assert!(
@ -6460,12 +6460,17 @@ mod tests {
            h.extra_files.is_empty(),
            "Java CRYPTO harness must not stage extra files — java.util.Random + SecureRandom are JDK built-ins",
        );
+        assert!(
+            matches!(h.entry_subpath.as_deref(), Some(p) if p == "Vuln.java"),
+            "Java CRYPTO harness must stage the fixture under its public-class filename for javac on case-sensitive filesystems: {:?}",
+            h.entry_subpath,
+        );
    }

    #[test]
    fn emit_crypto_harness_emits_weak_key_probe_kind() {
        let h = emit_crypto_harness(&make_crypto_spec(
-            "tests/dynamic_fixtures/crypto/java/Vuln.java",
+            "tests/dynamic_fixtures/crypto/java/vuln.java",
            "run",
        ));
        assert!(
@ -6483,7 +6488,7 @@ mod tests {
    #[test]
    fn emit_crypto_harness_reduces_byte_array_returns_via_byte_buffer() {
        let h = emit_crypto_harness(&make_crypto_spec(
-            "tests/dynamic_fixtures/crypto/java/Benign.java",
+            "tests/dynamic_fixtures/crypto/java/benign.java",
            "run",
        ));
        assert!(
@ -6504,7 +6509,7 @@ mod tests {
    #[test]
    fn emit_crypto_harness_falls_back_when_reflection_fails() {
        let h = emit_crypto_harness(&make_crypto_spec(
-            "tests/dynamic_fixtures/crypto/java/Vuln.java",
+            "tests/dynamic_fixtures/crypto/java/vuln.java",
            "run",
        ));
        assert!(
--- a/src/dynamic/oracle.rs
+++ b/src/dynamic/oracle.rs
@ -1135,10 +1135,8 @@ fn extract_redirect_host(location: &str) -> Option<String> {
    }
    let rest = if let Some(after_scheme) = trimmed.find("://") {
        &trimmed[after_scheme + 3..]
-    } else if let Some(stripped) = trimmed.strip_prefix("//") {
-        stripped
    } else {
-        return None;
+        trimmed.strip_prefix("//")?
    };
    // Strip path / query / fragment from the host segment.
    let end = rest.find(['/', '?', '#']).unwrap_or(rest.len());
--- a/src/dynamic/spec.rs
+++ b/src/dynamic/spec.rs
@ -563,7 +563,7 @@ impl HarnessSpec {
        // that order within equal scores — so the final element is the
        // highest-scoring candidate, and on a score tie it is the
        // highest-precedence one (legacy ladder tie-break).
-        scored.sort_by(|a, b| a.1.cmp(&b.1));
+        scored.sort_by_key(|a| a.1);
        let (winner, _winner_score) = scored.pop().expect("non-empty checked above");
        let mut runners_up: Vec<(SpecDerivationStrategy, SpecScore)> = scored
            .into_iter()
--- a/src/dynamic/stubs/broker.rs
+++ b/src/dynamic/stubs/broker.rs
@ -2157,10 +2157,7 @@ fn handle_rabbit_amqp_connection(
    let mut owned_consumer_tags = Vec::new();
    let mut confirms_enabled = false;
    let mut next_publish_tag = 0_u64;
-    loop {
-        let Some(frame) = amqp_read_frame(&mut reader) else {
-            break;
-        };
+    while let Some(frame) = amqp_read_frame(&mut reader) {
        if frame.frame_type == AMQP_FRAME_HEARTBEAT {
            let _ = amqp_write_frame(&mut writer, AMQP_FRAME_HEARTBEAT, 0, &[]);
            continue;
--- a/src/labels/java.rs
+++ b/src/labels/java.rs
@ -14,8 +14,17 @@ pub static RULES: &[LabelRule] = &[
    LabelRule {
        matchers: &[
            "getParameter",
+            // Iterable/collection-returning request accessors.  `getParameter`
+            // (word-boundary suffix match) does NOT cover `getParameterValues`
+            // etc., and these are the dominant untrusted-input shapes inside
+            // for-each loops (`for (String s : req.getParameterValues("v"))`).
+            "getParameterValues",
+            "getParameterMap",
+            "getParameterNames",
            "getInputStream",
            "getHeader",
+            "getHeaders",
+            "getHeaderNames",
            "getCookies",
            "getReader",
            "getQueryString",