From 8ee6e3af7c727834b74d7acae799323375d54c79 Mon Sep 17 00:00:00 2001 From: elipeter Date: Mon, 1 Jun 2026 22:51:05 -0500 Subject: [PATCH] feat(dynamic): enhance corpus sync script with improved payload parsing, registry checks, and expanded validation logic --- CHANGELOG.md | 4 + scripts/check_corpus_sync.py | 112 ++-- scripts/corpus_dashboard.py | 560 +++++++++++++----- src/cfg/literals.rs | 31 + src/cfg/mod.rs | 71 +++ src/cfg_analysis/guards.rs | 12 + src/dynamic/build_pool/ruby.rs | 18 +- src/dynamic/build_pool/rust.rs | 93 ++- src/dynamic/build_sandbox.rs | 44 +- src/dynamic/framework/adapters/js_routes.rs | 12 +- src/dynamic/lang/java.rs | 17 +- src/dynamic/oracle.rs | 4 +- src/dynamic/spec.rs | 2 +- src/dynamic/stubs/broker.rs | 5 +- src/labels/java.rs | 9 + .../java/fileio_adversarial.java | 11 +- .../java/fileio_negative.java | 12 +- .../java/fileio_positive.java | 15 +- .../java/servlet_doget/Benign.java | 2 +- .../java/servlet_doget/Vuln.java | 2 +- .../java/servlet_dopost/Benign.java | 2 +- .../java/servlet_dopost/Vuln.java | 2 +- 22 files changed, 810 insertions(+), 230 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b79a6ac..898e12ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -248,6 +248,9 @@ A precision pass on auth and resource analysis plus three fresh CVE corpus pairs - Short-circuit branch condition CFG nodes now mirror `condition_vars` into `taint.uses`, so `apply_branch_predicates` interns the variable for short-circuit-decomposed validators (`if (x == null || !regex.matcher(x).matches()) throw`). Without this, the per-disjunct cond nodes built via `build_condition_chain` silently no-opped and `x` never reached `validated_must` on the surviving branch. - Go `goqu.L(s)` and `goqu.Lit(s)` raw-SQL literal builders modeled as `SQL_QUERY` sinks. Safe siblings (`goqu.I` identifier, `goqu.C` column, `goqu.T` table, `goqu.V` parameterised value, `goqu.SUM`, `goqu.COUNT`, …) stay unlabeled. 
Gin source list extended with the array-returning siblings of the existing scalar helpers: `c.QueryArray`, `c.GetQueryArray`, `c.PostFormArray`, `c.GetPostFormArray`. Closes CVE-2026-41422 (daptin: `c.QueryArray("column")` → `goqu.L(project)` with the loop variable lifted through `for _, project := range columns`). Vulnerable + patched Go corpus pair under `tests/benchmark/cve_corpus/go/CVE-2026-41422/`. - Go `for ident := range iter` def-use lifting. The `range_clause` child of `for_statement` is now consulted when `left`/`right` aren't direct fields of the `for` node, so taint from the iterable reaches the loop binding. Required for the daptin CVE shape above. +- Java `enhanced_for_statement`, PHP `foreach`, and Ruby `for` def-use lifting, completing the loop forms the Go `range_clause` fix above started. The `Kind::For` def-use arm only knew the JS/Python `left`/`right` pair and Go's `range_clause`; Java carries the binding on `name` and the iterable on `value`, Ruby's `for` on `pattern`/`value`, and PHP's `foreach` keeps both as unnamed children split by the `as` keyword, so none recorded the loop variable as a define and taint on the iterable never reached the binding (`for (Cookie c : req.getCookies()) { … c.getValue() … }` lost the flow at `c`). Each form now folds onto the shared define/use path. Lifts Java OWASP Benchmark recall: path_traversal 0.21 → 0.32, sqli 0.16 → 0.28, cmdi 0.04 → 0.08. +- Iterable-expression classification for the loop forms above. The loop node is classified against its iterable text, so a source-returning iterable (`req.getCookies()`, `req.getParameterValues("v")`, `$_GET['list']`) lands a `Source` on the loop node and the binding inherits its taint, the same rewrite JS/Python `for … of` / `for … in` already had. Subscript iterables (`$_GET['x']`, `params[:list]`) classify on their base object since sources key on the base name, not the index. 
+- Java iterable-returning request accessors modeled as sources: `getParameterValues`, `getParameterMap`, `getParameterNames`, `getHeaders`, `getHeaderNames`. The `getParameter` / `getHeader` matchers are word-boundary suffix matches and never covered the plural collection variants that feed for-each loops (`for (String s : req.getParameterValues("v"))`). This is the dominant OWASP Benchmark vulnerable-source shape. - Rust format-string named-argument lifting (`format!("...{x}...")`, stable since 1.58). Identifiers captured by `{name}` / `{name:fmt-spec}` are pulled into the call's `uses` for known format-style macros: `format`, `print`/`println`, `eprint`/`eprintln`, `write`/`writeln`, `panic`, `format_args`, `assert`/`debug_assert`, `todo`, `unimplemented`, `unreachable`, plus log-crate severity macros (`info`, `warn`, `error`, `debug`, `trace`). Recursive descent through one or two layers of expression wrapping (`format!("{x}").to_owned()`, RHS chained method calls). Without this, taint stopped at the macro boundary. `let q = format!("...{x}...")` carried no `x` because the identifier lives in format-string bytes rather than as a separate AST argument node. Mirrors the Python f-string lifter. - Rust CVE corpus extended. CVE-2023-42456, CVE-2024-32884, CVE-2025-53549 vulnerable + patched fixtures under `tests/benchmark/cve_corpus/rust/`. - Java lambda shorthand recognised by `extract_param_meta`. `lambda_expression`'s `parameters` field as a bare `identifier` (`cmd -> …`) or as an `inferred_parameters` wrapper around identifiers (`(a, b) -> …`) was not matching the formal_parameter / spread_parameter kinds in `PARAM_CONFIG`, so the lambda appeared parameterless and the SSA pipeline treated its formals as closure captures. Mirrors the JS/TS arrow shorthand path. 
@@ -258,6 +261,7 @@ A precision pass on auth and resource analysis plus three fresh CVE corpus pairs ### Fixed (false positives) +- `cfg-unguarded-sink` parameter-only trace no longer clears a sink argument whose reaching definition is a loop binding. Once the loop variable resolves to its iterable (the def-use lifting above), a `foreach ($param as $v) { sink($v) }` element looked like a bare `sink($p)` wrapper pass-through and the structural finding was dropped. A loop element over a parameter collection is not wrapper plumbing, so the finding survives for loop-bound sink arguments; literal-keyed arrays stay suppressed through `sink_arg_uses_safe_foreach_key`. Keeps the negative case in `fp_guard_php_foreach_safe_literal_keys` firing. - Go `unit_has_user_input_evidence` framework-request-name allow-list narrowed for Go. `ctx`, `context`, `info`, `body`, `path`, `payload`, `dto`, `form`, `query` are no longer treated as user-input indicators on Go: in Go these are `context.Context` (cancellation/value-bag from the stdlib) or struct-pointer payload params (`info *PackageInfo`, `opts *FooOptions`), not request bindings. Go HTTP frameworks bind the request to per-framework typed params (`r *http.Request`, `c *gin.Context`, `c echo.Context`, `c *fiber.Ctx`); these arrive at the gate via `RouteHandler` kind or the type-aware param filter below. Stdlib `req` / `request` (the `*http.Request` convention) preserved. Other languages keep the broader allow-list. - Go param collection drops `ctx context.Context` and `ctx context.CancelFunc` parameters entirely rather than seeding their names into `unit.params`. Tree-sitter-go's `parameter_declaration` exposes `name` and `type` as named fields; descend only into `name` so type-segment identifiers don't pollute the param-name set (`info *PackageInfo` no longer contributes `PackageInfo`). 
Together with the allow-list narrowing above, closes ~1900 `go.auth.missing_ownership_check` findings on gitea backend helpers whose only "user-input evidence" was the ubiquitous `ctx context.Context` first param. - Ruby controller method visibility + filter-callback gate. Methods marked `private` (bare `private` directive, targeted `private :foo, :bar`, or `protected`) and Rails filter callback targets (`before_action`, `after_action`, `around_action`, their `prepend_*` / `append_*` / `skip_*` siblings, and the legacy `*_filter` aliases) are no longer emitted as `Function` units. Visibility tracking is class-body source-order with two directive forms (bare toggles default visibility, targeted explicitly marks named methods). Block-form filters (`before_action do … end`) carry no symbol arg and are correctly ignored. Closes mastodon / diaspora `rb.auth.missing_ownership_check` flood on `set_X` row-fetch helpers used as `before_action` callbacks. diff --git a/scripts/check_corpus_sync.py b/scripts/check_corpus_sync.py index 88cfff69..e6a28760 100644 --- a/scripts/check_corpus_sync.py +++ b/scripts/check_corpus_sync.py @@ -1,84 +1,106 @@ #!/usr/bin/env python3 # Usage: python3 scripts/check_corpus_sync.py # Run from repo root or any subdirectory; the script relocates to repo root. -# Exits 0 if src/dynamic/corpus.rs and scripts/corpus_dashboard.py agree on -# CORPUS_VERSION and all payload labels. Exits 1 on any divergence. +# Exits 0 if scripts/corpus_dashboard.py reads the same CORPUS_VERSION and +# payload identities as the canonical Rust registry. 
+ +from __future__ import annotations import os import re import sys from pathlib import Path -# ── locate repo root (parent of the scripts/ dir this file lives in) ───────── - SCRIPT_DIR = Path(__file__).resolve().parent REPO_ROOT = SCRIPT_DIR.parent os.chdir(REPO_ROOT) +sys.path.insert(0, str(SCRIPT_DIR)) +import corpus_dashboard # noqa: E402 + CORPUS_RS = REPO_ROOT / "src" / "dynamic" / "corpus.rs" -DASHBOARD_PY = REPO_ROOT / "scripts" / "corpus_dashboard.py" +CORPUS_DIR = REPO_ROOT / "src" / "dynamic" / "corpus" -# ── parse helpers ───────────────────────────────────────────────────────────── -def parse_corpus_rs(path: Path): +def parse_corpus_rs_version(path: Path) -> int | None: text = path.read_text(encoding="utf-8") - version_match = re.search(r'pub const CORPUS_VERSION:\s*u32\s*=\s*(\d+);', text) - version = int(version_match.group(1)) if version_match else None - labels = set(re.findall(r'label:\s*"([^"]+)"', text)) - return version, labels + version_match = re.search(r"pub const CORPUS_VERSION:\s*u32\s*=\s*(\d+);", text) + return int(version_match.group(1)) if version_match else None -def parse_dashboard_py(path: Path): - text = path.read_text(encoding="utf-8") - version_match = re.search(r'CORPUS_VERSION\s*=\s*(\d+)', text) - version = int(version_match.group(1)) if version_match else None - labels = set(re.findall(r'label="([^"]+)"', text)) - return version, labels -# ── main ────────────────────────────────────────────────────────────────────── +def payload_identities(payloads: list[corpus_dashboard.PayloadEntry]) -> set[tuple[str, str, str]]: + return {(p.cap, p.lang, p.label) for p in payloads} + + +def count_raw_payload_blocks(path: Path = CORPUS_DIR) -> int: + count = 0 + for source in path.rglob("*.rs"): + if source.name in {"audit.rs", "mod.rs", "registry.rs"}: + continue + text = source.read_text(encoding="utf-8") + count += len(re.findall(r"\bCuratedPayload\s*\{", text)) + return count + + +def fmt_identity(identity: tuple[str, str, str]) -> 
str: + cap, lang, label = identity + return f"{cap}/{lang}/{label}" + def main() -> int: - rs_version, rs_labels = parse_corpus_rs(CORPUS_RS) - py_version, py_labels = parse_dashboard_py(DASHBOARD_PY) + rs_version = parse_corpus_rs_version(CORPUS_RS) + dashboard_version = corpus_dashboard.CORPUS_VERSION + registry_payloads = corpus_dashboard.load_payloads() + raw_payload_count = count_raw_payload_blocks() ok = True - # version check if rs_version is None: print("ERROR: CORPUS_VERSION not found in corpus.rs") ok = False - if py_version is None: - print("ERROR: CORPUS_VERSION not found in corpus_dashboard.py") + elif rs_version == dashboard_version: + print(f"CORPUS_VERSION: {rs_version} [match]") + else: + print( + "CORPUS_VERSION mismatch: " + f"corpus.rs={rs_version} corpus_dashboard.py={dashboard_version}" + ) ok = False - if rs_version is not None and py_version is not None: - if rs_version == py_version: - print(f"CORPUS_VERSION: {rs_version} [match]") - else: - print(f"CORPUS_VERSION mismatch: corpus.rs={rs_version} corpus_dashboard.py={py_version}") - ok = False - # label check - only_in_rs = rs_labels - py_labels - only_in_py = py_labels - rs_labels - shared = rs_labels & py_labels + registry_ids = payload_identities(registry_payloads) + dashboard_ids = payload_identities(corpus_dashboard.PAYLOADS) + only_in_registry = registry_ids - dashboard_ids + only_in_dashboard = dashboard_ids - registry_ids + shared = registry_ids & dashboard_ids - print(f"Labels in both: {len(shared)}") - if only_in_rs: - print(f"Labels only in corpus.rs: {len(only_in_rs)}") - for lbl in sorted(only_in_rs): - print(f" + {lbl}") + print(f"Payload identities in both: {len(shared)}") + if only_in_registry: + print(f"Payload identities only in Rust registry: {len(only_in_registry)}") + for identity in sorted(only_in_registry): + print(f" + {fmt_identity(identity)}") ok = False - if only_in_py: - print(f"Labels only in corpus_dashboard.py: {len(only_in_py)}") - for lbl in 
sorted(only_in_py): - print(f" - {lbl}") + if only_in_dashboard: + print(f"Payload identities only in dashboard: {len(only_in_dashboard)}") + for identity in sorted(only_in_dashboard): + print(f" - {fmt_identity(identity)}") + ok = False + + if len(corpus_dashboard.PAYLOADS) == raw_payload_count: + print(f"CuratedPayload blocks covered: {raw_payload_count} [match]") + else: + print( + "CuratedPayload block count mismatch: " + f"source_tree={raw_payload_count} dashboard={len(corpus_dashboard.PAYLOADS)}" + ) ok = False if ok: print("Corpus sync: OK") return 0 - else: - print("Corpus sync: FAIL — update corpus_dashboard.py to match corpus.rs") - return 1 + + print("Corpus sync: FAIL - update corpus_dashboard.py to match the Rust registry") + return 1 + if __name__ == "__main__": sys.exit(main()) diff --git a/scripts/corpus_dashboard.py b/scripts/corpus_dashboard.py index db369639..794da776 100755 --- a/scripts/corpus_dashboard.py +++ b/scripts/corpus_dashboard.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Corpus health report for src/dynamic/corpus.rs. +"""Corpus health report for the Rust dynamic payload registry. Produces: - Per-cap coverage table (payload count, benign controls, OOB slots) @@ -7,28 +7,43 @@ Produces: - CVE reference count - Marker collision audit -Exit code 0 = healthy. Non-zero = collision or missing coverage. +Exit code 0 = healthy. Non-zero = collision or missing coverage. Usage: python3 scripts/corpus_dashboard.py [--repro-dir REPRO_DIR] [--json] """ +from __future__ import annotations + import argparse +import ast import json import os +import re import sys from dataclasses import dataclass, field from pathlib import Path from typing import Optional -# ── Payload table (mirrors src/dynamic/corpus.rs) ──────────────────────────── -# Manually synced; CI should flag drift via cargo test no_marker_collisions. 
+SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent +CORPUS_RS = REPO_ROOT / "src" / "dynamic" / "corpus.rs" +CORPUS_DIR = REPO_ROOT / "src" / "dynamic" / "corpus" +REGISTRY_RS = CORPUS_DIR / "registry.rs" + + +@dataclass(frozen=True) +class RegistryEntry: + cap: str + lang: str + module_path: str + source_path: Path -CORPUS_VERSION = 6 @dataclass class PayloadEntry: cap: str + lang: str label: str bytes_repr: str oracle_kind: str @@ -39,132 +54,402 @@ class PayloadEntry: deprecated_at_corpus_version: Optional[int] fixture_paths: list[str] oob_nonce_slot: bool + source_path: str cve_refs: list[str] = field(default_factory=list) -PAYLOADS: list[PayloadEntry] = [ - # ── SQL_QUERY ────────────────────────────────────────────────────────────── - PayloadEntry( - cap="SQL_QUERY", label="sqli-tautology", - bytes_repr="' OR '1'='1", oracle_kind="OutputContains", - oracle_value="NYX_SQL_CONFIRMED", is_benign=False, - provenance="Curated", since_corpus_version=1, - deprecated_at_corpus_version=None, - fixture_paths=["tests/benchmark/corpus/rust/sqli/sqli_rusqlite_format.rs"], - oob_nonce_slot=False, - ), - PayloadEntry( - cap="SQL_QUERY", label="sqli-union-nyx", - bytes_repr="' UNION SELECT 'NYX_SQL_CONFIRMED'--", - oracle_kind="OutputContains", oracle_value="NYX_SQL_CONFIRMED", - is_benign=False, provenance="Curated", since_corpus_version=1, - deprecated_at_corpus_version=None, - fixture_paths=["tests/benchmark/corpus/rust/sqli/sqli_rusqlite_format.rs"], - oob_nonce_slot=False, - ), - # ── CODE_EXEC ────────────────────────────────────────────────────────────── - PayloadEntry( - cap="CODE_EXEC", label="cmdi-echo-marker", - bytes_repr="; echo NYX_PWN_CMDI", - oracle_kind="OutputContains", oracle_value="NYX_PWN_CMDI", - is_benign=False, provenance="Curated", since_corpus_version=1, - deprecated_at_corpus_version=None, - fixture_paths=[ - "tests/benchmark/corpus/rust/cmdi/cmdi_command.rs", - "tests/benchmark/corpus/rust/cmdi/cmdi_args.rs", - ], - 
oob_nonce_slot=False, - ), - # ── FILE_IO ──────────────────────────────────────────────────────────────── - PayloadEntry( - cap="FILE_IO", label="path-traversal-passwd", - bytes_repr="../../../../etc/passwd", - oracle_kind="OutputContains", oracle_value="root:", - is_benign=False, provenance="Curated", since_corpus_version=1, - deprecated_at_corpus_version=None, - fixture_paths=[ - "tests/benchmark/corpus/rust/path_traversal/path_file_open.rs", - "tests/benchmark/corpus/rust/path_traversal/path_read.rs", - ], - oob_nonce_slot=False, - ), - PayloadEntry( - cap="FILE_IO", label="path-traversal-benign", - bytes_repr="benign_safe_file_that_does_not_exist_NYX_BENIGN", - oracle_kind="OutputContains", oracle_value="root:", - is_benign=True, provenance="Curated", since_corpus_version=1, - deprecated_at_corpus_version=None, - fixture_paths=["tests/benchmark/corpus/rust/path_traversal/path_file_open.rs"], - oob_nonce_slot=False, - ), - # ── SSRF ─────────────────────────────────────────────────────────────────── - PayloadEntry( - cap="SSRF", label="ssrf-file-scheme", - bytes_repr="file:///etc/passwd", - oracle_kind="OutputContains", oracle_value="daemon:", - is_benign=False, provenance="Curated", since_corpus_version=1, - deprecated_at_corpus_version=None, - fixture_paths=["tests/benchmark/corpus/rust/ssrf/ssrf_reqwest.rs"], - oob_nonce_slot=False, - ), - PayloadEntry( - cap="SSRF", label="ssrf-oob-nonce", - bytes_repr="", - oracle_kind="OobCallback", oracle_value="host=127.0.0.1", - is_benign=False, provenance="Curated", since_corpus_version=2, - deprecated_at_corpus_version=None, - fixture_paths=["tests/benchmark/corpus/rust/ssrf/ssrf_reqwest.rs"], - oob_nonce_slot=True, - ), - # ── HTML_ESCAPE ──────────────────────────────────────────────────────────── - PayloadEntry( - cap="HTML_ESCAPE", label="xss-script-marker", - bytes_repr="", - oracle_kind="OutputContains", - oracle_value="", - is_benign=False, provenance="Curated", since_corpus_version=1, - 
deprecated_at_corpus_version=None, - fixture_paths=["tests/benchmark/corpus/rust/xss/axum_html/main.rs"], - oob_nonce_slot=False, - ), - PayloadEntry( - cap="HTML_ESCAPE", label="xss-benign-text", - bytes_repr="Hello World", - oracle_kind="OutputContains", - oracle_value="", - is_benign=True, provenance="Curated", since_corpus_version=1, - deprecated_at_corpus_version=None, - fixture_paths=["tests/benchmark/corpus/rust/xss/axum_html/main.rs"], - oob_nonce_slot=False, - ), -] -ALL_CAPS = ["SQL_QUERY", "CODE_EXEC", "FILE_IO", "SSRF", "HTML_ESCAPE"] +# Rust source helpers --------------------------------------------------------- -# ── Marker collision audit ──────────────────────────────────────────────────── +def load_corpus_version(path: Path = CORPUS_RS) -> int: + text = path.read_text(encoding="utf-8") + match = re.search(r"pub const CORPUS_VERSION:\s*u32\s*=\s*(\d+);", text) + if not match: + raise ValueError(f"CORPUS_VERSION not found in {path}") + return int(match.group(1)) -def audit_marker_collisions() -> list[tuple[str, str, str]]: - collisions = [] - for p in PAYLOADS: - if p.is_benign or p.oracle_kind != "OutputContains": + +def parse_registry_entries(path: Path = REGISTRY_RS) -> list[RegistryEntry]: + text = path.read_text(encoding="utf-8") + entries: list[RegistryEntry] = [] + pattern = re.compile( + r"\(\s*Cap::([A-Z0-9_]+)\s*,\s*Lang::([A-Za-z0-9_]+)\s*," + r"\s*([A-Za-z0-9_:]+)::PAYLOADS\s*,?\s*\)", + re.DOTALL, + ) + for match in pattern.finditer(text): + cap, lang, module_path = match.groups() + source_path = CORPUS_DIR / f"{module_path.replace('::', '/')}.rs" + entries.append(RegistryEntry(cap, lang, module_path, source_path)) + if not entries: + raise ValueError(f"No registry entries found in {path}") + return entries + + +def _raw_string_bounds(text: str, index: int) -> Optional[tuple[int, int, int]]: + if text.startswith("br", index): + marker_index = index + 2 + elif text.startswith("r", index): + marker_index = index + 1 + else: + return None 
+ + cursor = marker_index + while cursor < len(text) and text[cursor] == "#": + cursor += 1 + if cursor >= len(text) or text[cursor] != '"': + return None + + hashes = text[marker_index:cursor] + body_start = cursor + 1 + terminator = '"' + hashes + body_end = text.find(terminator, body_start) + if body_end < 0: + raise ValueError("unterminated Rust raw string literal") + return body_start, body_end, body_end + len(terminator) + + +def _quoted_literal_end(text: str, index: int) -> Optional[int]: + raw = _raw_string_bounds(text, index) + if raw: + return raw[2] + + if text.startswith('b"', index): + quote = '"' + cursor = index + 2 + elif text[index:index + 1] == '"': + quote = '"' + cursor = index + 1 + elif ( + text[index:index + 1] == "'" + and index + 1 < len(text) + and not (text[index + 1].isalpha() or text[index + 1] == "_") + ): + quote = "'" + cursor = index + 1 + else: + return None + + while cursor < len(text): + char = text[cursor] + if char == "\\": + cursor += 2 continue - marker = p.oracle_value or "" - for other in PAYLOADS: - if other.cap == p.cap: + if char == quote: + return cursor + 1 + cursor += 1 + raise ValueError("unterminated Rust quoted literal") + + +def _skip_ignored(text: str, index: int) -> int: + if text.startswith("//", index): + newline = text.find("\n", index + 2) + return len(text) if newline < 0 else newline + 1 + + if text.startswith("/*", index): + depth = 1 + cursor = index + 2 + while cursor < len(text) and depth: + if text.startswith("/*", cursor): + depth += 1 + cursor += 2 + elif text.startswith("*/", cursor): + depth -= 1 + cursor += 2 + else: + cursor += 1 + if depth: + raise ValueError("unterminated Rust block comment") + return cursor + + literal_end = _quoted_literal_end(text, index) + return literal_end if literal_end is not None else index + + +def _find_matching(text: str, open_index: int, open_char: str, close_char: str) -> int: + depth = 1 + cursor = open_index + 1 + while cursor < len(text): + skipped = 
_skip_ignored(text, cursor) + if skipped != cursor: + cursor = skipped + continue + + char = text[cursor] + if char == open_char: + depth += 1 + elif char == close_char: + depth -= 1 + if depth == 0: + return cursor + cursor += 1 + raise ValueError(f"unterminated {open_char}{close_char} block") + + +def _payload_blocks(text: str) -> list[str]: + blocks: list[str] = [] + for match in re.finditer(r"\bCuratedPayload\s*\{", text): + open_index = match.end() - 1 + close_index = _find_matching(text, open_index, "{", "}") + blocks.append(text[open_index + 1:close_index]) + return blocks + + +def _add_field(segment: str, fields: dict[str, str]) -> None: + match = re.search(r"(^|\n)\s*([A-Za-z_][A-Za-z0-9_]*)\s*:", segment) + if not match: + return + fields[match.group(2)] = segment[match.end():].strip() + + +def _split_top_level_fields(block: str) -> dict[str, str]: + fields: dict[str, str] = {} + start = 0 + cursor = 0 + brace_depth = 0 + bracket_depth = 0 + paren_depth = 0 + + while cursor < len(block): + skipped = _skip_ignored(block, cursor) + if skipped != cursor: + cursor = skipped + continue + + char = block[cursor] + if char == "{": + brace_depth += 1 + elif char == "}": + brace_depth -= 1 + elif char == "[": + bracket_depth += 1 + elif char == "]": + bracket_depth -= 1 + elif char == "(": + paren_depth += 1 + elif char == ")": + paren_depth -= 1 + elif ( + char == "," + and brace_depth == 0 + and bracket_depth == 0 + and paren_depth == 0 + ): + _add_field(block[start:cursor], fields) + start = cursor + 1 + cursor += 1 + + _add_field(block[start:], fields) + return fields + + +def _parse_rust_string_literal(text: str, index: int) -> Optional[tuple[str, int]]: + raw = _raw_string_bounds(text, index) + if raw: + body_start, body_end, literal_end = raw + return text[body_start:body_end], literal_end + + if text.startswith('b"', index): + cursor = index + 2 + elif text[index:index + 1] == '"': + cursor = index + 1 + else: + return None + + while cursor < len(text): + 
char = text[cursor] + if char == "\\": + cursor += 2 + continue + if char == '"': + literal = text[index:cursor + 1] + value = ast.literal_eval(literal) + if isinstance(value, bytes): + return value.decode("latin-1"), cursor + 1 + return str(value), cursor + 1 + cursor += 1 + raise ValueError("unterminated Rust string literal") + + +def _rust_string_literals(expr: str) -> list[str]: + strings: list[str] = [] + cursor = 0 + while cursor < len(expr): + if expr.startswith("//", cursor) or expr.startswith("/*", cursor): + cursor = _skip_ignored(expr, cursor) + continue + + parsed = _parse_rust_string_literal(expr, cursor) + if parsed: + value, cursor = parsed + strings.append(value) + continue + + cursor += 1 + return strings + + +def _parse_string_constants(text: str) -> dict[str, str]: + constants: dict[str, str] = {} + pattern = re.compile(r"(?:pub\s+)?const\s+([A-Z][A-Z0-9_]*):\s*&str\s*=\s*([^;]+);") + for match in pattern.finditer(text): + strings = _rust_string_literals(match.group(2)) + if strings: + constants[match.group(1)] = strings[0] + return constants + + +def _required(fields: dict[str, str], name: str, source_path: Path) -> str: + if name not in fields: + rel = source_path.relative_to(REPO_ROOT) + raise ValueError(f"missing field {name!r} in payload from {rel}") + return fields[name] + + +def _string_expr(expr: str, constants: dict[str, str]) -> str: + expr = expr.strip() + if expr in constants: + return constants[expr] + strings = _rust_string_literals(expr) + if strings: + return strings[0] + return expr + + +def _bool_expr(expr: str) -> bool: + value = expr.strip() + if value == "true": + return True + if value == "false": + return False + raise ValueError(f"expected Rust bool literal, got {value!r}") + + +def _int_expr(expr: str) -> int: + match = re.search(r"\d+", expr) + if not match: + raise ValueError(f"expected integer literal, got {expr!r}") + return int(match.group(0)) + + +def _optional_int_expr(expr: str) -> Optional[int]: + expr = 
expr.strip() + if expr == "None": + return None + match = re.fullmatch(r"Some\(\s*(\d+)\s*\)", expr) + if match: + return int(match.group(1)) + raise ValueError(f"expected Rust Option literal, got {expr!r}") + + +def _oracle_expr(expr: str, constants: dict[str, str]) -> tuple[str, Optional[str]]: + expr = expr.strip() + if expr.startswith("Oracle::OutputContains"): + open_index = expr.find("(") + close_index = _find_matching(expr, open_index, "(", ")") + marker = _string_expr(expr[open_index + 1:close_index], constants) + return "OutputContains", marker + + if expr.startswith("Oracle::OobCallback"): + strings = _rust_string_literals(expr) + return "OobCallback", f"host={strings[0]}" if strings else None + + if expr.startswith("Oracle::SinkCrash"): + return "SinkCrash", "signals=all" + + if expr.startswith("Oracle::SinkProbe"): + predicates = list(dict.fromkeys(re.findall(r"ProbePredicate::([A-Za-z0-9_]+)", expr))) + return "SinkProbe", ",".join(predicates) if predicates else None + + return expr.split("{", 1)[0].split("(", 1)[0].strip(), None + + +def _payload_from_block( + entry: RegistryEntry, + block: str, + constants: dict[str, str], +) -> PayloadEntry: + fields = _split_top_level_fields(block) + source_path = entry.source_path + oracle_kind, oracle_value = _oracle_expr(_required(fields, "oracle", source_path), constants) + rel_source = str(source_path.relative_to(REPO_ROOT)) + + return PayloadEntry( + cap=entry.cap, + lang=entry.lang, + label=_string_expr(_required(fields, "label", source_path), constants), + bytes_repr=_string_expr(_required(fields, "bytes", source_path), constants), + oracle_kind=oracle_kind, + oracle_value=oracle_value, + is_benign=_bool_expr(_required(fields, "is_benign", source_path)), + provenance=_required(fields, "provenance", source_path) + .strip() + .removeprefix("PayloadProvenance::"), + since_corpus_version=_int_expr(_required(fields, "since_corpus_version", source_path)), + deprecated_at_corpus_version=_optional_int_expr( + 
_required(fields, "deprecated_at_corpus_version", source_path) + ), + fixture_paths=_rust_string_literals(_required(fields, "fixture_paths", source_path)), + oob_nonce_slot=_bool_expr(_required(fields, "oob_nonce_slot", source_path)), + source_path=rel_source, + cve_refs=sorted(set(re.findall(r"CVE-\d{4}-\d{4,7}", block))), + ) + + +def load_payloads() -> list[PayloadEntry]: + payloads: list[PayloadEntry] = [] + for entry in parse_registry_entries(): + if not entry.source_path.exists(): + rel = entry.source_path.relative_to(REPO_ROOT) + raise FileNotFoundError(f"registry entry points at missing payload file: {rel}") + + text = entry.source_path.read_text(encoding="utf-8") + constants = _parse_string_constants(text) + blocks = _payload_blocks(text) + if not blocks: + rel = entry.source_path.relative_to(REPO_ROOT) + raise ValueError(f"no CuratedPayload entries found in {rel}") + + for block in blocks: + payloads.append(_payload_from_block(entry, block, constants)) + + return payloads + + +CORPUS_VERSION = load_corpus_version() +PAYLOADS: list[PayloadEntry] = load_payloads() +ALL_CAPS = list(dict.fromkeys(p.cap for p in PAYLOADS)) + + +# Marker collision audit ------------------------------------------------------ + + +def audit_marker_collisions(payloads: list[PayloadEntry] = PAYLOADS) -> list[tuple[str, str, str]]: + collisions = [] + for payload in payloads: + if payload.is_benign or payload.oracle_kind != "OutputContains": + continue + marker = payload.oracle_value or "" + if not marker: + continue + + for other in payloads: + if other.cap == payload.cap: continue if other.is_benign or other.oob_nonce_slot: continue if marker in other.bytes_repr: - collisions.append((p.cap, p.label, other.cap)) + collisions.append((payload.cap, payload.label, other.cap)) return collisions -# ── Coverage table ──────────────────────────────────────────────────────────── +# Coverage table -------------------------------------------------------------- -def build_coverage_table() -> 
dict: + +def build_coverage_table(payloads: list[PayloadEntry] = PAYLOADS) -> dict: result = {} for cap in ALL_CAPS: - cap_payloads = [p for p in PAYLOADS if p.cap == cap] + cap_payloads = [payload for payload in payloads if payload.cap == cap] result[cap] = { "total": len(cap_payloads), "vuln": sum(1 for p in cap_payloads if not p.is_benign), @@ -176,7 +461,8 @@ def build_coverage_table() -> dict: return result -# ── Repro artifact timestamps ───────────────────────────────────────────────── +# Repro artifact timestamps --------------------------------------------------- + def scan_last_confirmed(repro_dir: Path) -> dict[str, str]: """Return {payload_label: iso_timestamp} from repro artifact metadata.""" @@ -189,7 +475,6 @@ def scan_last_confirmed(repro_dir: Path) -> dict[str, str]: label = data.get("payload_label", "") ts = data.get("confirmed_at", "") if label and ts: - # Keep most recent. if label not in timestamps or ts > timestamps[label]: timestamps[label] = ts except (json.JSONDecodeError, KeyError): @@ -197,30 +482,30 @@ def scan_last_confirmed(repro_dir: Path) -> dict[str, str]: return timestamps -# ── fuzz-discovered count ───────────────────────────────────────────────────── +# fuzz-discovered count ------------------------------------------------------- + def count_discovered(discovered_dir: Path) -> int: if not discovered_dir.exists(): return 0 return sum( - 1 for f in discovered_dir.rglob("*") - if f.is_file() and not f.name.endswith(".json") and f.name != ".gitkeep" + 1 for path in discovered_dir.rglob("*") + if path.is_file() and not path.name.endswith(".json") and path.name != ".gitkeep" ) -# ── Main ────────────────────────────────────────────────────────────────────── - def main() -> int: parser = argparse.ArgumentParser(description="Nyx corpus health dashboard") parser.add_argument("--repro-dir", default="repro", help="Path to repro artifacts") - parser.add_argument("--discovered-dir", default="fuzz-discovered", - help="Path to fuzz-discovered/ 
directory") + parser.add_argument( + "--discovered-dir", + default="fuzz-discovered", + help="Path to fuzz-discovered/ directory", + ) parser.add_argument("--json", action="store_true", help="Output JSON instead of text") args = parser.parse_args() - # Change to repo root (parent of scripts/). - repo_root = Path(__file__).parent.parent - os.chdir(repo_root) + os.chdir(REPO_ROOT) collisions = audit_marker_collisions() coverage = build_coverage_table() @@ -229,10 +514,12 @@ def main() -> int: report = { "corpus_version": CORPUS_VERSION, + "registry_entries": len(parse_registry_entries()), "total_payloads": len(PAYLOADS), "coverage": coverage, "marker_collisions": collisions, "last_confirmed": timestamps, + "cve_reference_count": sum(len(p.cve_refs) for p in PAYLOADS), "fuzz_discovered_pending": discovered_count, "healthy": len(collisions) == 0, } @@ -241,44 +528,41 @@ def main() -> int: print(json.dumps(report, indent=2)) return 0 if report["healthy"] else 1 - # Text output. print(f"Nyx Corpus Dashboard (corpus_version={CORPUS_VERSION})") print("=" * 60) print() - # Coverage table. print("Per-cap coverage:") - hdr = f" {'Cap':<18} {'Total':>5} {'Vuln':>5} {'Benign':>6} {'OOB':>4} {'Fixtures':>8}" + hdr = f" {'Cap':<22} {'Total':>5} {'Vuln':>5} {'Benign':>6} {'OOB':>4} {'Fixtures':>8}" print(hdr) - print(" " + "-" * 52) + print(" " + "-" * 56) for cap, info in coverage.items(): fixture_ok = "ok" if info["has_fixture_paths"] else "MISSING" print( - f" {cap:<18} {info['total']:>5} {info['vuln']:>5} " + f" {cap:<22} {info['total']:>5} {info['vuln']:>5} " f"{info['benign']:>6} {info['oob_slots']:>4} {fixture_ok:>8}" ) print() - # Last confirmed timestamps. if timestamps: print("Last confirmed timestamps:") for label, ts in sorted(timestamps.items()): print(f" {label:<35} {ts}") print() - # fuzz-discovered pending. 
+ print(f"Registry entries: {report['registry_entries']}") + print(f"CVE references: {report['cve_reference_count']}") print(f"Fuzz-discovered pending promotion: {discovered_count}") print() - # Marker collisions. if collisions: - print("FAIL: Marker collisions detected (§16.3):") + print("FAIL: Marker collisions detected (section 16.3):") for cap, label, other_cap in collisions: print(f" {cap}/{label} marker appears in {other_cap} payload bytes") return 1 - else: - print("OK: No marker collisions detected.") - return 0 + + print("OK: No marker collisions detected.") + return 0 if __name__ == "__main__": diff --git a/src/cfg/literals.rs b/src/cfg/literals.rs index 20f11318..ac61b811 100644 --- a/src/cfg/literals.rs +++ b/src/cfg/literals.rs @@ -2544,6 +2544,37 @@ pub(super) fn def_use( } } } + // Java `enhanced_for_statement` binds the loop variable on the + // `name` field and the iterable on the `value` field; Ruby's + // `for x in coll` uses `pattern`/`value`. Neither uses the + // JS/Python `left`/`right` convention, so without this mapping + // the loop binding was never recorded as a define and taint on + // the iterable could not reach the loop variable (OWASP's + // dominant `for (Cookie c : req.getCookies())` shape). + if left.is_none() && right.is_none() { + if let Some(v) = ast.child_by_field_name("value") { + left = ast + .child_by_field_name("name") + .or_else(|| ast.child_by_field_name("pattern")); + right = Some(v); + } + } + // PHP `foreach ($coll as $v)` / `foreach ($coll as $k => $v)`: + // the iterable and binding are unnamed children separated by the + // `as` keyword (only `body` is a named field). Map the binding + // onto `left` and the iterable onto `right` so the shared + // define/use logic below records the loop variable. 
+ if left.is_none() && right.is_none() && ast.kind() == "foreach_statement" { + let mut cursor = ast.walk(); + let kids: Vec = ast.children(&mut cursor).collect(); + if let Some(as_pos) = kids.iter().position(|c| c.kind() == "as") { + right = kids[..as_pos].iter().rev().find(|c| c.is_named()).copied(); + left = kids[as_pos + 1..] + .iter() + .find(|c| c.is_named() && lookup(lang, c.kind()) != Kind::Block) + .copied(); + } + } if left.is_none() && right.is_none() { // C-style for, defer to default ident collection. let mut idents = Vec::new(); diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs index 2a304cc6..70a275c5 100644 --- a/src/cfg/mod.rs +++ b/src/cfg/mod.rs @@ -2067,6 +2067,32 @@ fn is_binary_expr_kind(kind: &str, lang: &str) -> bool { } } +/// Classification text for a for-each loop's iterable expression. +/// +/// Subscript / index iterables (`$_GET['x']`, `params[:list]`, `arr[i]`) +/// classify on their **base object**: taint sources are keyed on the base +/// name (`$_GET`, `params`), and the trailing index would otherwise break +/// the word-boundary suffix match in `classify`. Non-subscript iterables +/// (method calls, member chains, bare identifiers) use their full text. +fn iterable_label_text(iter: Node, code: &[u8]) -> Option { + if matches!( + iter.kind(), + "subscript_expression" | "subscript" | "index_expression" | "element_reference" + ) { + let base = iter + .child_by_field_name("object") + .or_else(|| iter.child_by_field_name("operand")) + .or_else(|| iter.child_by_field_name("value")) + .or_else(|| iter.child(0)); + if let Some(b) = base + && let Some(t) = text_of(b, code) + { + return Some(t); + } + } + text_of(iter, code) +} + /// Create a node in one short borrow and optionally attach a taint label. 
#[allow(clippy::too_many_arguments)] pub(super) fn push_node<'a>( @@ -2208,6 +2234,51 @@ pub(super) fn push_node<'a>( text = iter_text; } + // Java `for (T x : iter)`: tree-sitter-java emits `enhanced_for_statement` + // with the iterable on the `value` field. Classify against the iterable + // text so a source-returning call (`req.getCookies()`, + // `req.getParameterValues(..)`) lights up a Source on the loop node and + // the loop binding inherits its taint — the same loop-binding-inherits- + // iterator-taint contract the JS/Python rewrites above provide. The + // loop variable itself is recorded as a define by `def_use`'s Kind::For + // arm (via the `name`/`value` mapping), so the Source-labeled loop node + // taints the binding directly. + if lang == "java" + && ast.kind() == "enhanced_for_statement" + && let Some(value) = ast.child_by_field_name("value") + && let Some(iter_text) = iterable_label_text(value, code) + { + text = iter_text; + } + + // PHP `foreach ($iter as $v)` / `foreach ($iter as $k => $v)`: the + // iterable is the named child immediately preceding the `as` keyword + // (only `body` is a named field). Classify against the iterable text so + // a superglobal/source iterable (`$_GET[..]`, `$_POST[..]`) taints the + // loop binding, matching the JS/Python/Java rewrites. + if lang == "php" && ast.kind() == "foreach_statement" { + let mut cursor = ast.walk(); + let kids: Vec = ast.children(&mut cursor).collect(); + if let Some(as_pos) = kids.iter().position(|c| c.kind() == "as") + && let Some(iter_node) = kids[..as_pos].iter().rev().find(|c| c.is_named()).copied() + && let Some(iter_text) = iterable_label_text(iter_node, code) + { + text = iter_text; + } + } + + // Ruby `for x in coll`: tree-sitter-ruby's `for` node carries the + // iterable on the `value` field. (The idiomatic `coll.each { |x| }` + // form is a method call with a block and is handled by the call/block + // machinery, not here.) 
+ if lang == "ruby" + && ast.kind() == "for" + && let Some(value) = ast.child_by_field_name("value") + && let Some(iter_text) = iterable_label_text(value, code) + { + text = iter_text; + } + // If this is a declaration/expression wrapper or an assignment that // *contains* a call, prefer the first inner call identifier instead of // the whole line. Track the inner call's byte span so we can populate diff --git a/src/cfg_analysis/guards.rs b/src/cfg_analysis/guards.rs index 57fa1dcc..fd3b1816 100644 --- a/src/cfg_analysis/guards.rs +++ b/src/cfg_analysis/guards.rs @@ -2493,6 +2493,18 @@ fn local_is_param_derived<'a>( continue; } found_def = true; + // A `foreach` / `for-each` loop binding iterates collection + // *elements*, not a direct parameter pass-through. Even when the + // iterable is a bare parameter (`foreach ($param as $v)`), the + // per-element values are not simple wrapper plumbing, so do not + // clear them as parameter-derived — keep the structural finding + // for `foreach ($param as $v) { sink($v) }` shapes (literal-keyed + // arrays are already suppressed earlier by + // `sink_arg_uses_safe_foreach_key`). + if info.kind == StmtKind::Loop { + all_def_clear = false; + break; + } if info .taint .labels diff --git a/src/dynamic/build_pool/ruby.rs b/src/dynamic/build_pool/ruby.rs index 082124d0..e0948c9b 100644 --- a/src/dynamic/build_pool/ruby.rs +++ b/src/dynamic/build_pool/ruby.rs @@ -53,7 +53,23 @@ impl BuildPool for RubyPool { let start = Instant::now(); // `bundle check` short-circuits when the host already has every gem. - if let Ok(o) = self.bundle(workdir).arg("check").output() + // + // Run the check with the *runtime* environment — plain system gems, no + // `GEM_HOME`/`BUNDLE_PATH` override. The harness is executed as + // `ruby harness.rb`, whose `require 'bundler/setup'` resolves against + // the system gem path, so the build-time check must consult that same + // path to predict whether the run will succeed. 
The hermetic + // `GEM_HOME` override (below) exists only to give `bundle install` a + // writable, sudo-free target for *missing* gems; applying it to the + // check breaks Bundler 1.x's ability to see an already-installed system + // gem (e.g. `rack`), turning a satisfiable Gemfile into a spurious + // BuildFailed. + let mut check = base_command(&self.bundle_bin); + check.current_dir(workdir); + if let Some(cache) = pool_cache_dir("ruby", "bootsnap") { + check.env("BOOTSNAP_CACHE_DIR", cache); + } + if let Ok(o) = check.arg("check").output() && o.status.success() { return PoolCompileResult { diff --git a/src/dynamic/build_pool/rust.rs b/src/dynamic/build_pool/rust.rs index 3f210ffa..ea7b4592 100644 --- a/src/dynamic/build_pool/rust.rs +++ b/src/dynamic/build_pool/rust.rs @@ -18,8 +18,8 @@ use super::{BuildPool, PoolCompileResult, base_command, binary_runnable, pool_cache_dir}; use blake3::Hasher; -use std::path::Path; -use std::time::Instant; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; pub struct RustPool { cargo_bin: String, @@ -79,6 +79,23 @@ impl BuildPool for RustPool { } }; + // Serialise build + copy across processes for this shared target dir. + // + // The target dir is keyed only on the Cargo manifest hash, so every + // fixture that shares a `Cargo.toml` compiles the same bin name + // (`nyx_harness`) into the same `release/nyx_harness` path here. + // `cargo` already serialises the *build* across processes via its own + // target lock, but releases that lock the moment it exits — before the + // copy below moves `release/nyx_harness` to the caller's per-fixture + // cache slot. A second process's `cargo build` landing in that window + // overwrites `release/nyx_harness`, so we copy a *different* fixture's + // binary into our slot and poison its build cache (observed as + // cross-fixture verdict corruption under a parallel `cargo test`). 
+ // Holding this lock across build+copy folds the copy into the existing + // serialised section, so it adds the copy's few milliseconds, not a + // new build barrier. + let _build_lock = TargetDirLock::acquire(&target_dir); + let mut cmd = base_command(&self.cargo_bin); cmd.args(["build", "--release"]) .current_dir(workdir) @@ -143,6 +160,78 @@ fn default_cargo_home() -> String { .unwrap_or_else(|_| ".cargo".to_owned()) } +/// Cross-process advisory lock guarding build+copy for a shared +/// `CARGO_TARGET_DIR` (see the call site in [`RustPool::compile_batch`]). +/// +/// Implemented as an atomic `create_new` (O_EXCL) lockfile so it works across +/// the separate processes a parallel `cargo test` spawns — an in-process +/// `Mutex` would not. A lock older than `STALE_AFTER` is stolen so a crashed +/// holder cannot wedge the pool, and acquisition gives up after `MAX_WAIT` +/// (proceeding unlocked) so a pathological case degrades to the pre-fix +/// behaviour rather than deadlocking. +struct TargetDirLock { + path: PathBuf, + /// Only the process that created the lockfile removes it on drop, so a + /// give-up / steal path never deletes another holder's lock. + owned: bool, +} + +impl TargetDirLock { + fn acquire(target_dir: &Path) -> Self { + const MAX_WAIT: Duration = Duration::from_secs(300); + const STALE_AFTER: Duration = Duration::from_secs(180); + let path = target_dir.join(".nyx-pool-build.lock"); + let start = Instant::now(); + let mut spins: u64 = 0; + loop { + match std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&path) + { + Ok(mut f) => { + use std::io::Write; + let _ = writeln!(f, "{}", std::process::id()); + return Self { path, owned: true }; + } + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + // Steal a stale lock left behind by a crashed holder. 
+ if let Ok(meta) = std::fs::metadata(&path) + && let Ok(mtime) = meta.modified() + && mtime.elapsed().map(|d| d > STALE_AFTER).unwrap_or(false) + { + let _ = std::fs::remove_file(&path); + continue; + } + if start.elapsed() > MAX_WAIT { + // Best-effort: a slow build beats a deadlock. + return Self { path, owned: false }; + } + let nap = 10u64.saturating_add(spins.min(40).saturating_mul(2)); + std::thread::sleep(Duration::from_millis(nap)); + spins = spins.saturating_add(1); + } + Err(_) => { + // Cannot create the lockfile (perms / race on dir) — proceed + // unlocked rather than fail the build outright. + return Self { + path, + owned: false, + }; + } + } + } + } +} + +impl Drop for TargetDirLock { + fn drop(&mut self) { + if self.owned { + let _ = std::fs::remove_file(&self.path); + } + } +} + /// Stable short hash of the named manifest files under `workdir`. fn hash_files(workdir: &Path, files: &[&str]) -> String { let mut h = Hasher::new(); diff --git a/src/dynamic/build_sandbox.rs b/src/dynamic/build_sandbox.rs index cc727148..ac62c82f 100644 --- a/src/dynamic/build_sandbox.rs +++ b/src/dynamic/build_sandbox.rs @@ -535,8 +535,19 @@ fn try_bundle_install(workdir: &Path) -> Result<(), String> { } fn bundle_check(bundle: &str, workdir: &Path) -> Result { - let output = ruby_build_command(bundle, workdir) + // Run with the runtime environment (plain system gems), NOT the hermetic + // `GEM_HOME`/`BUNDLE_PATH` override that `ruby_build_command` applies. The + // harness runs as `ruby harness.rb` and resolves its `require`s against the + // system gem path, so the check must too; the override only breaks Bundler + // 1.x's view of already-installed system gems and produces spurious + // BuildFailed for a Gemfile the host can already satisfy. See the parallel + // comment in `RubyPool::compile_batch`. 
+ let output = Command::new(bundle) .arg("check") + .current_dir(workdir) + .env_clear() + .env("PATH", std::env::var("PATH").unwrap_or_default()) + .env("HOME", std::env::var("HOME").unwrap_or_default()) .output() .map_err(|e| format!("bundle check: {e}"))?; Ok(output.status.success()) @@ -1103,8 +1114,37 @@ fn try_compile_java_with_toolchain( args.push(rel.to_string()); } if lib_on_cp { + // Build an explicit, absolute classpath: `` plus every jar + // under `/lib`. Two independent reasons rule out the + // shorthand `.:lib/*`: + // 1. The javac pool worker is a long-lived JVM and the JDK compiler + // API has no per-task working directory (it sets `user.dir` + // defensively, but that does not change file/classpath + // resolution in an already-running JVM), so a *relative* entry + // resolves against the worker's launch dir, not ``. + // 2. The `lib/*` classpath wildcard is expanded by the `javac` + // launcher, not by `ToolProvider.getSystemJavaCompiler().run` + // (the in-process path the pool uses), so a `*` entry silently + // contributes no jars there. + // Either way the Maven-resolved framework jars under `/lib` + // go missing and framework imports fail to compile + // ("package ... does not exist"). Enumerating the jars explicitly is + // unambiguous for both the pool and the direct-spawn javac path. 
+ let mut cp = workdir.to_string_lossy().into_owned(); + let mut jars: Vec = std::fs::read_dir(workdir.join("lib")) + .into_iter() + .flatten() + .flatten() + .map(|e| e.path()) + .filter(|p| p.extension().map(|x| x == "jar").unwrap_or(false)) + .collect(); + jars.sort(); + for jar in &jars { + cp.push(':'); + cp.push_str(&jar.to_string_lossy()); + } args.push("-cp".to_owned()); - args.push(".:lib/*".to_owned()); + args.push(cp); } for src in &sources { args.push(src.to_string_lossy().into_owned()); diff --git a/src/dynamic/framework/adapters/js_routes.rs b/src/dynamic/framework/adapters/js_routes.rs index f625fbcd..9998bab7 100644 --- a/src/dynamic/framework/adapters/js_routes.rs +++ b/src/dynamic/framework/adapters/js_routes.rs @@ -963,10 +963,8 @@ fn collect_options_middleware_names(args: Node<'_>, bytes: &[u8], target: &str) }; let key = key_raw.trim_matches(['\'', '"', '`']); match key { - "handler" => { - if view_arg_references(value, bytes, target) { - handler_matches = true; - } + "handler" if view_arg_references(value, bytes, target) => { + handler_matches = true; } "onRequest" | "preParsing" | "preValidation" | "preHandler" => { collect_hook_value_names(value, bytes, &mut hook_names); @@ -1052,10 +1050,8 @@ fn parse_options_route(args: Node<'_>, bytes: &[u8], target: &str) -> Option<(Ht let text = value.utf8_text(bytes).ok().unwrap_or(""); url = Some(strip_quotes(text).to_owned()); } - "handler" => { - if view_arg_references(value, bytes, target) { - handler_matches = true; - } + "handler" if view_arg_references(value, bytes, target) => { + handler_matches = true; } _ => {} } diff --git a/src/dynamic/lang/java.rs b/src/dynamic/lang/java.rs index 57b2bcb0..69a7c560 100644 --- a/src/dynamic/lang/java.rs +++ b/src/dynamic/lang/java.rs @@ -2399,7 +2399,7 @@ public class NyxHarness {{ "NyxHarness".to_owned(), ], extra_files: Vec::new(), - entry_subpath: None, + entry_subpath: Some(format!("{entry_class}.java")), } } @@ -6418,7 +6418,7 @@ mod tests { #[test] fn 
emit_dispatches_to_crypto_harness_when_cap_is_crypto() { let h = emit(&make_crypto_spec( - "tests/dynamic_fixtures/crypto/java/Vuln.java", + "tests/dynamic_fixtures/crypto/java/vuln.java", "run", )) .unwrap(); @@ -6435,7 +6435,7 @@ mod tests { #[test] fn emit_crypto_harness_routes_through_reflective_entry_invocation() { let h = emit_crypto_harness(&make_crypto_spec( - "tests/dynamic_fixtures/crypto/java/Vuln.java", + "tests/dynamic_fixtures/crypto/java/vuln.java", "run", )); assert!( @@ -6460,12 +6460,17 @@ mod tests { h.extra_files.is_empty(), "Java CRYPTO harness must not stage extra files — java.util.Random + SecureRandom are JDK built-ins", ); + assert!( + matches!(h.entry_subpath.as_deref(), Some(p) if p == "Vuln.java"), + "Java CRYPTO harness must stage the fixture under its public-class filename for javac on case-sensitive filesystems: {:?}", + h.entry_subpath, + ); } #[test] fn emit_crypto_harness_emits_weak_key_probe_kind() { let h = emit_crypto_harness(&make_crypto_spec( - "tests/dynamic_fixtures/crypto/java/Vuln.java", + "tests/dynamic_fixtures/crypto/java/vuln.java", "run", )); assert!( @@ -6483,7 +6488,7 @@ mod tests { #[test] fn emit_crypto_harness_reduces_byte_array_returns_via_byte_buffer() { let h = emit_crypto_harness(&make_crypto_spec( - "tests/dynamic_fixtures/crypto/java/Benign.java", + "tests/dynamic_fixtures/crypto/java/benign.java", "run", )); assert!( @@ -6504,7 +6509,7 @@ mod tests { #[test] fn emit_crypto_harness_falls_back_when_reflection_fails() { let h = emit_crypto_harness(&make_crypto_spec( - "tests/dynamic_fixtures/crypto/java/Vuln.java", + "tests/dynamic_fixtures/crypto/java/vuln.java", "run", )); assert!( diff --git a/src/dynamic/oracle.rs b/src/dynamic/oracle.rs index d8466621..a10bf143 100644 --- a/src/dynamic/oracle.rs +++ b/src/dynamic/oracle.rs @@ -1135,10 +1135,8 @@ fn extract_redirect_host(location: &str) -> Option { } let rest = if let Some(after_scheme) = trimmed.find("://") { &trimmed[after_scheme + 3..] 
- } else if let Some(stripped) = trimmed.strip_prefix("//") { - stripped } else { - return None; + trimmed.strip_prefix("//")? }; // Strip path / query / fragment from the host segment. let end = rest.find(['/', '?', '#']).unwrap_or(rest.len()); diff --git a/src/dynamic/spec.rs b/src/dynamic/spec.rs index ad9fd809..ea86b702 100644 --- a/src/dynamic/spec.rs +++ b/src/dynamic/spec.rs @@ -563,7 +563,7 @@ impl HarnessSpec { // that order within equal scores — so the final element is the // highest-scoring candidate, and on a score tie it is the // highest-precedence one (legacy ladder tie-break). - scored.sort_by(|a, b| a.1.cmp(&b.1)); + scored.sort_by_key(|a| a.1); let (winner, _winner_score) = scored.pop().expect("non-empty checked above"); let mut runners_up: Vec<(SpecDerivationStrategy, SpecScore)> = scored .into_iter() diff --git a/src/dynamic/stubs/broker.rs b/src/dynamic/stubs/broker.rs index 09cd05cf..eaf3ec0b 100644 --- a/src/dynamic/stubs/broker.rs +++ b/src/dynamic/stubs/broker.rs @@ -2157,10 +2157,7 @@ fn handle_rabbit_amqp_connection( let mut owned_consumer_tags = Vec::new(); let mut confirms_enabled = false; let mut next_publish_tag = 0_u64; - loop { - let Some(frame) = amqp_read_frame(&mut reader) else { - break; - }; + while let Some(frame) = amqp_read_frame(&mut reader) { if frame.frame_type == AMQP_FRAME_HEARTBEAT { let _ = amqp_write_frame(&mut writer, AMQP_FRAME_HEARTBEAT, 0, &[]); continue; diff --git a/src/labels/java.rs b/src/labels/java.rs index 9064915d..3921e0f8 100644 --- a/src/labels/java.rs +++ b/src/labels/java.rs @@ -14,8 +14,17 @@ pub static RULES: &[LabelRule] = &[ LabelRule { matchers: &[ "getParameter", + // Iterable/collection-returning request accessors. `getParameter` + // (word-boundary suffix match) does NOT cover `getParameterValues` + // etc., and these are the dominant untrusted-input shapes inside + // for-each loops (`for (String s : req.getParameterValues("v"))`). 
+ "getParameterValues", + "getParameterMap", + "getParameterNames", "getInputStream", "getHeader", + "getHeaders", + "getHeaderNames", "getCookies", "getReader", "getQueryString", diff --git a/tests/dynamic_fixtures/java/fileio_adversarial.java b/tests/dynamic_fixtures/java/fileio_adversarial.java index 4046b6d1..8bbfe553 100644 --- a/tests/dynamic_fixtures/java/fileio_adversarial.java +++ b/tests/dynamic_fixtures/java/fileio_adversarial.java @@ -1,13 +1,16 @@ // File I/O — adversarial collision fixture. -// Prints "root:" unconditionally without reading any file -// and without emitting __NYX_SINK_HIT__. +// Prints the path-traversal canary marker unconditionally without reading any +// file and without emitting __NYX_SINK_HIT__, so the oracle observes a marker +// hit with no sink-reachability. // Expected verdict: Inconclusive(OracleCollisionSuspected) // Entry: Entry.readFile(String) Cap: FILE_IO public class Entry { public static void readFile(String userPath) { - // Coincidental oracle match — not a file read sink. - System.out.println("root: present"); + // Coincidental oracle match — emits the marker string but is not a + // file-read sink and never reaches the planted canary. Must match the + // CANARY_MARKER in src/dynamic/corpus/path_trav/java.rs. + System.out.println("NYX_PATHTRAVERSAL_R34D_a7f3c1d8 present"); int x = userPath.length(); } } diff --git a/tests/dynamic_fixtures/java/fileio_negative.java b/tests/dynamic_fixtures/java/fileio_negative.java index ae2f8668..3e6e9df9 100644 --- a/tests/dynamic_fixtures/java/fileio_negative.java +++ b/tests/dynamic_fixtures/java/fileio_negative.java @@ -1,5 +1,6 @@ // File I/O — negative fixture. -// Safe: normalizes path and checks it stays within the base directory. +// Safe: normalizes the path and checks it stays within the base directory, so +// the traversal payload cannot escape `testfiles/` to reach the planted canary. 
// Entry: Entry.readFile(String) Cap: FILE_IO // Expected verdict: NotConfirmed @@ -7,13 +8,10 @@ import java.io.*; import java.nio.file.*; public class Entry { - // `/tmp` exists on Linux and macOS so `toRealPath()` resolves cleanly on - // both. The traversal payload still escapes the base (which is the point - // of the safe-path check) so the verdict stays NotConfirmed. - private static final String BASE_DIR = "/tmp"; - public static void readFile(String userPath) throws Exception { - Path base = Paths.get(BASE_DIR).toRealPath(); + // Same base the harness plants the canary one level above; the + // containment check is what makes this safe. + Path base = Paths.get(System.getProperty("user.dir"), "testfiles").toRealPath(); Path resolved = base.resolve(userPath).normalize(); if (!resolved.startsWith(base)) { System.out.println("Access denied"); diff --git a/tests/dynamic_fixtures/java/fileio_positive.java b/tests/dynamic_fixtures/java/fileio_positive.java index 4deab781..1b5ecebd 100644 --- a/tests/dynamic_fixtures/java/fileio_positive.java +++ b/tests/dynamic_fixtures/java/fileio_positive.java @@ -1,18 +1,23 @@ // File I/O — positive fixture. // Vulnerable: reads file at user-controlled path without sanitization. // Entry: Entry.readFile(String) Cap: FILE_IO -// Expected verdict: Confirmed (../../../../etc/passwd → "root:" in output) +// Expected verdict: Confirmed. // -// Base directory is `/tmp` rather than `/var/data` so the harness can resolve -// the traversal payload on both Linux and macOS hosts (`/var/data` is absent -// on macOS, which makes `Files.readAllBytes` throw before reaching the file). +// The harness plants a canary file (`nyx_pt_canary`, whose CONTENT is the +// collision-resistant marker) at the workdir root and an empty `testfiles/` +// directory beside it. 
Reading `testfiles/ + userPath` without normalising is +// the traversal sink: the vuln payload `../nyx_pt_canary` escapes `testfiles/` +// back to the planted canary, so its marker content reaches stdout. import java.io.*; import java.nio.file.*; public class Entry { public static void readFile(String userPath) throws Exception { - Path filePath = Paths.get("/tmp", userPath); + // Base directory the harness plants the canary one level above. + Path testfileDir = Paths.get(System.getProperty("user.dir"), "testfiles"); + // Vulnerable: no normalisation / containment check on `userPath`. + Path filePath = testfileDir.resolve(userPath); System.out.print("__NYX_SINK_HIT__\n"); try { String content = new String(Files.readAllBytes(filePath)); diff --git a/tests/dynamic_fixtures/java/servlet_doget/Benign.java b/tests/dynamic_fixtures/java/servlet_doget/Benign.java index 6d9b19ec..0cbae1a3 100644 --- a/tests/dynamic_fixtures/java/servlet_doget/Benign.java +++ b/tests/dynamic_fixtures/java/servlet_doget/Benign.java @@ -17,7 +17,7 @@ public class Benign { BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); String line; while ((line = reader.readLine()) != null) { - System.out.println(line); + resp.write(line + "\n"); } p.waitFor(); } diff --git a/tests/dynamic_fixtures/java/servlet_doget/Vuln.java b/tests/dynamic_fixtures/java/servlet_doget/Vuln.java index fd8d0cbe..2abdedbc 100644 --- a/tests/dynamic_fixtures/java/servlet_doget/Vuln.java +++ b/tests/dynamic_fixtures/java/servlet_doget/Vuln.java @@ -17,7 +17,7 @@ public class Vuln { BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); String line; while ((line = reader.readLine()) != null) { - System.out.println(line); + resp.write(line + "\n"); } p.waitFor(); } diff --git a/tests/dynamic_fixtures/java/servlet_dopost/Benign.java b/tests/dynamic_fixtures/java/servlet_dopost/Benign.java index ee539f98..061ba222 100644 --- 
a/tests/dynamic_fixtures/java/servlet_dopost/Benign.java +++ b/tests/dynamic_fixtures/java/servlet_dopost/Benign.java @@ -13,7 +13,7 @@ public class Benign { BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); String line; while ((line = reader.readLine()) != null) { - System.out.println(line); + resp.write(line + "\n"); } p.waitFor(); } diff --git a/tests/dynamic_fixtures/java/servlet_dopost/Vuln.java b/tests/dynamic_fixtures/java/servlet_dopost/Vuln.java index 8b113085..a068d8c7 100644 --- a/tests/dynamic_fixtures/java/servlet_dopost/Vuln.java +++ b/tests/dynamic_fixtures/java/servlet_dopost/Vuln.java @@ -16,7 +16,7 @@ public class Vuln { BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); String line; while ((line = reader.readLine()) != null) { - System.out.println(line); + resp.write(line + "\n"); } p.waitFor(); }