From dd607fb4b3aab3dd27f1bbf1135bc3a285c9f876 Mon Sep 17 00:00:00 2001 From: pitboss Date: Fri, 15 May 2026 19:22:40 -0500 Subject: [PATCH] =?UTF-8?q?[pitboss]=20phase=2029:=20Track=20I=20=E2=80=94?= =?UTF-8?q?=20Per-cell=20budgets,=20`--diff`,=20fixture=20prerequisites,?= =?UTF-8?q?=20CI=20matrix=20expansion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/dynamic.yml | 152 ++++++++++ scripts/m7_ship_gate.sh | 51 +++- tests/common/fixture_harness.rs | 138 +++++++++ tests/eval_corpus/budget.toml | 210 +++++++++++++ tests/eval_corpus/report.py | 238 ++++++++++++++- tests/eval_corpus/run.sh | 23 +- tests/eval_corpus/tabulate.py | 281 +++++++++++++++++- tests/eval_corpus/test_tabulate_regression.py | 241 +++++++++++++++ tests/python_fixtures.rs | 12 +- tests/rust_fixtures.rs | 11 +- 10 files changed, 1325 insertions(+), 32 deletions(-) create mode 100644 .github/workflows/dynamic.yml create mode 100644 tests/eval_corpus/budget.toml create mode 100644 tests/eval_corpus/test_tabulate_regression.py diff --git a/.github/workflows/dynamic.yml b/.github/workflows/dynamic.yml new file mode 100644 index 00000000..1e060e0d --- /dev/null +++ b/.github/workflows/dynamic.yml @@ -0,0 +1,152 @@ +# Phase 29 (Track I): dedicated dynamic-verification matrix. +# +# Three rows exercise the dynamic harness pipeline (`cargo nextest run +# --features dynamic`) under the host configurations the Phase 17–28 +# tracks documented as supported: +# +# linux-process-only — Ubuntu host, no docker daemon. Forces the +# process backend and exercises the Phase 17 +# Linux hardening primitives (chroot, seccomp, +# unshare, no_new_privs). `libc6-dev` is +# installed so the hardening probe + escape +# suite can `cc -static`; without it the +# chroot-leg of the escape suite skips silently +# (Phase 20 follow-up #4 in deferred.md). +# +# linux-with-docker — Ubuntu host with docker-in-docker. 
Exercises +# the docker backend (Phase 19) and the +# differential-confirmation parity tests. +# +# macos — macOS-latest, no docker. Exercises the +# Phase-18 `sandbox-exec` primitives plus the +# process backend on Darwin. Track-I acceptance +# literal: "cargo nextest run --features dynamic +# is green on macOS without docker." + +name: dynamic + +permissions: + contents: read + +on: + push: + branches: ["master"] + pull_request: + branches: ["master"] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + linux-process-only: + name: dynamic / linux-process-only + runs-on: ubuntu-latest + env: + # Force the process backend even when callers default to Auto so + # docker-unavailable paths cannot accidentally hide a regression. + NYX_SANDBOX_BACKEND: process + steps: + - uses: actions/checkout@v6 + + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: stable + cache: true + + - uses: taiki-e/install-action@nextest + + # Phase 17 / Phase 20 follow-up: the hardening probe + escape + # suite chroot leg need static glibc. Without these packages the + # `cc -static probe.c` step in tests/sandbox_hardening_linux.rs + + # tests/sandbox_escape_suite.rs falls back to dynamic linking and + # the chroot leg silently skips. 
+ - name: Install fixture prerequisites (static libc) + run: | + sudo apt-get update -y + sudo apt-get install -y --no-install-recommends libc6-dev libc-dev-bin + + - name: Smoke-test interpreter availability + run: | + python3 --version + node --version || sudo apt-get install -y --no-install-recommends nodejs + ruby --version || true + php --version || true + + - name: Dynamic suite (process backend only) + run: cargo nextest run --features dynamic + + linux-with-docker: + name: dynamic / linux-with-docker + runs-on: ubuntu-latest + services: + docker: + image: docker:dind + options: --privileged + env: + DOCKER_TLS_CERTDIR: "" + DOCKER_HOST: tcp://docker:2375 + steps: + - uses: actions/checkout@v6 + + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: stable + cache: true + + - uses: taiki-e/install-action@nextest + + - name: Install fixture prerequisites (static libc) + run: | + sudo apt-get update -y + sudo apt-get install -y --no-install-recommends libc6-dev libc-dev-bin + + - name: Pull language images for sandbox tests + run: | + docker pull python:3-slim + docker pull node:20-slim + docker pull eclipse-temurin:21-jre-jammy + docker pull php:8-cli + + - name: Smoke-test docker interpreter availability + run: | + docker run --rm python:3-slim python3 --version + docker run --rm node:20-slim node --version + docker run --rm eclipse-temurin:21-jre-jammy java -version + docker run --rm php:8-cli php --version + + - name: Dynamic suite (process + docker backends) + run: cargo nextest run --features dynamic + + macos: + name: dynamic / macos + runs-on: macos-latest + env: + # macOS runners ship without docker; force process backend so the + # `Auto` resolver in src/dynamic/sandbox.rs cannot accidentally + # pick up a stray Lima/Colima daemon and confuse the matrix. 
+ NYX_SANDBOX_BACKEND: process + steps: + - uses: actions/checkout@v6 + + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: stable + cache: true + + - uses: taiki-e/install-action@nextest + + - name: Smoke-test sandbox-exec availability + run: | + /usr/bin/sandbox-exec -p '(version 1)(allow default)' /bin/echo ok + + - name: Smoke-test interpreter availability + run: | + python3 --version + node --version + ruby --version + + # Phase 29 acceptance literal: "cargo nextest run --features + # dynamic is green on macOS without docker (process-only row)." + - name: Dynamic suite (macOS, process backend) + run: cargo nextest run --features dynamic diff --git a/scripts/m7_ship_gate.sh b/scripts/m7_ship_gate.sh index 862a3944..0af72295 100755 --- a/scripts/m7_ship_gate.sh +++ b/scripts/m7_ship_gate.sh @@ -6,6 +6,7 @@ # # Usage: # scripts/m7_ship_gate.sh [--nyx BIN] [--corpus-dir DIR] [--skip GATE,...] +# [--budget FILE] [--diff FILE] # # Gates: # 1. unsupported-rate — per-cell (cap × lang) Unsupported% within budget @@ -13,6 +14,11 @@ # 3. wall-clock — default scan ≤ 2× static-only on bench suite # 4. sandbox-escape — sandbox escape suite green for all langs # 5. repro-stability — repro artifact regenerates identical verdict ≥ 95% +# +# Phase 29 (Track I): Gate 1 consumes per-cell budgets from +# `tests/eval_corpus/budget.toml` and, when `--diff PREV.json` is +# supplied, fails on any monotonic-improvement regression vs the +# previous run. set -euo pipefail @@ -23,12 +29,17 @@ CORPUS_DIR="${CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}" SKIP_GATES="" GATE_ERRORS=0 GATE_LOG="${REPO_ROOT}/target/m7_gate.log" +# Phase 29 (Track I): per-cell budgets + monotonic diff. 
+BUDGET_FILE="${BUDGET_FILE:-${REPO_ROOT}/tests/eval_corpus/budget.toml}" +DIFF_FILE="${DIFF_FILE:-}" while [[ $# -gt 0 ]]; do case "$1" in --nyx) NYX_BIN="$2"; shift 2 ;; --corpus-dir) CORPUS_DIR="$2"; shift 2 ;; --skip) SKIP_GATES="$2"; shift 2 ;; + --budget) BUDGET_FILE="$2"; shift 2 ;; + --diff) DIFF_FILE="$2"; shift 2 ;; *) shift ;; esac done @@ -45,28 +56,46 @@ mkdir -p "$(dirname "$GATE_LOG")" echo "# M7 ship gate — $(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$GATE_LOG" info "nyx: $NYX_BIN" info "corpus: $CORPUS_DIR" +info "budget: $BUDGET_FILE" +info "diff: ${DIFF_FILE:-}" info "" -# ── Gate 1: Unsupported-rate budget ───────────────────────────────────────── +# ── Gate 1: Per-cell budget + monotonic-improvement diff ─────────────────── +# +# Phase 29 (Track I): the single global Unsupported threshold is replaced +# by per-cell (cap × lang) budgets in tests/eval_corpus/budget.toml. +# `tests/eval_corpus/run.sh` invokes `tabulate.py` per set and `report.py` +# at the end with `--budget` (and `--diff` when DIFF_FILE is set), so +# any per-cell failure (or any regression vs the prior run) propagates +# back as exit 2. if skip unsupported-rate; then info "Gate 1 (unsupported-rate): SKIPPED" else - info "Gate 1: per-cell Unsupported rate within budget..." + info "Gate 1: per-cell budget within tolerance + no monotonic regressions..." EVAL_RESULTS="${REPO_ROOT}/target/eval_results.json" echo "[]" > "$EVAL_RESULTS" - # Run eval corpus runner (in-house set always present). - if bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \ + if [[ ! -f "$BUDGET_FILE" ]]; then + die "Gate 1: budget file not found at $BUDGET_FILE" + else + # Run eval corpus runner (in-house set always present). + set +e + bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \ --nyx "$NYX_BIN" \ --sets inhouse \ - --output "$(dirname "$EVAL_RESULTS")" 2>>"$GATE_LOG"; then - # Copy result to our location. 
- cp "$(dirname "$EVAL_RESULTS")/eval_results.json" "$EVAL_RESULTS" 2>/dev/null || true - pass "Gate 1: unsupported-rate check passed" - else + --output "$(dirname "$EVAL_RESULTS")" \ + --budget "$BUDGET_FILE" \ + ${DIFF_FILE:+--diff "$DIFF_FILE"} \ + >>"$GATE_LOG" 2>>"$GATE_LOG" RC=$? - if [[ $RC -eq 2 ]]; then - die "Gate 1: Unsupported rate exceeds budget for one or more (cap, lang) cells" + set -e + cp "$(dirname "$EVAL_RESULTS")/eval_results.json" "$EVAL_RESULTS" 2>/dev/null || true + if [[ $RC -eq 0 ]]; then + pass "Gate 1: per-cell budget + diff check passed" + elif [[ $RC -eq 2 ]]; then + die "Gate 1: per-cell budget exceeded OR monotonic-improvement regression (see $GATE_LOG)" + elif [[ $RC -eq 3 ]]; then + die "Gate 1: budget/diff configuration is malformed (see $GATE_LOG)" else info "Gate 1: eval runner returned $RC (corpus may not be downloaded; treating as SKIP)" fi diff --git a/tests/common/fixture_harness.rs b/tests/common/fixture_harness.rs index b0d0dd73..4e776714 100644 --- a/tests/common/fixture_harness.rs +++ b/tests/common/fixture_harness.rs @@ -48,6 +48,131 @@ pub enum CopyStrategy { RustEntry, } +/// Phase 29 (Track I): host-environment prerequisite a fixture needs in +/// order to run. The harness consults the list before staging the +/// fixture; any unsatisfied prerequisite triggers a structured skip +/// rather than a panic, so non-applicable matrix rows (process-only +/// macOS, dockerless CI, missing static libc) still see green ticks. +#[derive(Debug, Clone, PartialEq, Eq)] +#[allow(dead_code)] +pub enum Prerequisite { + /// A binary must resolve on `PATH` and respond to `--version` with + /// exit code 0 (e.g. `python3`, `node`, `go`, `cargo`). + CommandAvailable(&'static str), + /// A specific env var must be set (used to gate feature-flagged + /// suites — e.g. `NYX_ENABLE_FLAKY_FIXTURES=1`). + EnvVar(&'static str), + /// The docker daemon must be reachable. Equivalent to + /// `docker info` returning exit 0. 
+ DockerAvailable, + /// A static C library archive (e.g. `libc.a`) must be linkable. + /// Used by the Phase-17/20 hardening probe fixtures. + StaticLib(&'static str), +} + +/// Phase 29 (Track I): why the harness skipped a fixture. Carried by +/// every skip so callers can distinguish "host did not have python3" from +/// "host has docker but daemon refused" from "intentional env-var gate". +#[derive(Debug, Clone, PartialEq, Eq)] +#[allow(dead_code)] +pub enum SkipReason { + MissingCommand(&'static str), + MissingEnvVar(&'static str), + DockerUnavailable, + MissingStaticLib(&'static str), +} + +impl std::fmt::Display for SkipReason { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SkipReason::MissingCommand(c) => write!(f, "missing command on PATH: {c}"), + SkipReason::MissingEnvVar(v) => write!(f, "env var not set: {v}"), + SkipReason::DockerUnavailable => write!(f, "docker daemon unavailable"), + SkipReason::MissingStaticLib(l) => write!(f, "static lib not linkable: {l}"), + } + } +} + +/// Returns the first unsatisfied prerequisite, or `Ok(())` when every +/// requirement holds. Exposed for tests that want to gate their own +/// per-shape helpers without going through `FixtureSpec`. 
+#[allow(dead_code)]
+pub fn check_prerequisites(reqs: &[Prerequisite]) -> Result<(), SkipReason> {
+    use std::io::Write;
+
+    // Runs `program arg` with output captured; true only when the process
+    // could be spawned and exited with status 0.
+    fn succeeds(program: &str, arg: &str) -> bool {
+        std::process::Command::new(program)
+            .arg(arg)
+            .output()
+            .map(|o| o.status.success())
+            .unwrap_or(false)
+    }
+
+    for req in reqs {
+        match req {
+            Prerequisite::CommandAvailable(cmd) => {
+                // Not every toolchain accepts `--version`: `go` only has a
+                // `version` subcommand and Java 8 only `-version`, so try
+                // the common spellings before declaring the command missing.
+                let found = ["--version", "version", "-version"]
+                    .iter()
+                    .any(|flag| succeeds(cmd, flag));
+                if !found {
+                    return Err(SkipReason::MissingCommand(cmd));
+                }
+            }
+            Prerequisite::EnvVar(var) => {
+                // `var_os`: a set-but-non-UTF-8 value still counts as set.
+                if std::env::var_os(var).is_none() {
+                    return Err(SkipReason::MissingEnvVar(var));
+                }
+            }
+            Prerequisite::DockerAvailable => {
+                if !succeeds("docker", "info") {
+                    return Err(SkipReason::DockerUnavailable);
+                }
+            }
+            Prerequisite::StaticLib(lib) => {
+                // Treat the lib as linkable iff `cc -static -l<lib>` on a
+                // trivial TU succeeds. Slow but reliable; only called by
+                // the small Phase-17 hardening suite.
+                // Source and output share one scratch dir that is removed
+                // on drop, so nothing leaks on early return and no temp
+                // name is released before `cc` writes to it.
+                let dir = match tempfile::Builder::new().prefix("nyx-prereq-").tempdir() {
+                    Ok(d) => d,
+                    Err(_) => return Err(SkipReason::MissingStaticLib(lib)),
+                };
+                let src = dir.path().join("probe.c");
+                let out = dir.path().join("probe");
+                // A probe we could not write must not reach `cc`.
+                let wrote = std::fs::File::create(&src)
+                    .and_then(|mut f| writeln!(f, "int main(void) {{ return 0; }}"));
+                if wrote.is_err() {
+                    return Err(SkipReason::MissingStaticLib(lib));
+                }
+                // Paths go through as `OsStr`, so a non-UTF-8 temp dir is
+                // not silently replaced by an empty argument.
+                let linked = std::process::Command::new("cc")
+                    .args(["-x", "c", "-static"])
+                    .arg(&src)
+                    .arg("-o")
+                    .arg(&out)
+                    .arg(format!("-l{lib}"))
+                    .output()
+                    .map(|o| o.status.success())
+                    .unwrap_or(false);
+                if !linked {
+                    return Err(SkipReason::MissingStaticLib(lib));
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
 /// Per-fixture specification.
pub struct FixtureSpec<'a> { /// Subdirectory under `tests/dynamic_fixtures/` (e.g. `"python"`, `"rust"`). @@ -67,6 +192,11 @@ pub struct FixtureSpec<'a> { pub confidence: Confidence, /// File-layout strategy for the temp-dir copy. pub copy: CopyStrategy, + /// Phase 29 (Track I): host-environment prerequisites. Empty means + /// "always runs"; otherwise the harness checks each entry before + /// staging the fixture and skips with a structured [`SkipReason`] + /// when any prerequisite is unmet. + pub requires: Vec<Prerequisite>, } /// Trimmed verdict shape persisted in the `.golden.json` file. @@ -100,6 +230,14 @@ impl From<&VerifyResult> for GoldenVerdict { /// stored golden or — when `NYX_UPDATE_GOLDENS=1` — overwrite the golden /// with the current verdict. pub fn run_fixture_and_compare_to_golden(spec: &FixtureSpec<'_>) { + if let Err(reason) = check_prerequisites(&spec.requires) { + eprintln!( + "SKIP {}/{}: prerequisite unmet — {reason}", + spec.lang_dir, spec.fixture + ); + return; + } + let _guard = FIXTURE_LOCK.lock().unwrap_or_else(|e| e.into_inner()); let fixture_root = fixture_dir(spec.lang_dir); diff --git a/tests/eval_corpus/budget.toml b/tests/eval_corpus/budget.toml new file mode 100644 index 00000000..cfff4353 --- /dev/null +++ b/tests/eval_corpus/budget.toml @@ -0,0 +1,210 @@ +# Per-cell (cap × lang) budgets for the dynamic-verification eval corpus. +# +# Phase 29 (Track I): replaces the single global Unsupported-rate gate in +# tests/eval_corpus/report.py with per-cell targets. Each cell records the +# largest tolerated rate today plus a deadline date for the next ratchet.
+# +# Schema: +# +# [default] +# unsupported_rate = 0.80 # max(Unsupported / total) per cell +# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cell +# repro_stability = 0.95 # min(stable / Confirmed) per cell +# ratchet_deadline = "2026-08-01" +# +# [[cell]] +# cap = "sqli" +# lang = "python" +# unsupported_rate = 0.50 +# false_confirmed_rate = 0.02 +# repro_stability = 0.97 +# ratchet_deadline = "2026-07-15" +# +# `cap` matches tabulate.py's _CAP_BIT_TABLE / _CAP_RULE_TABLE labels. +# `lang` matches the ext_map values (`python`, `javascript`, …). +# A wildcard `"*"` matches any cell that does not have an exact entry. + +[default] +# Inherited by any cell not overridden below. Aligned with the legacy +# Gate-1 / Gate-2 / Gate-5 thresholds in scripts/m7_ship_gate.sh. +unsupported_rate = 0.80 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-08-01" + +# Python verticals (Phase 12 — most mature; tightest budgets). + +[[cell]] +cap = "sqli" +lang = "python" +unsupported_rate = 0.40 +false_confirmed_rate = 0.02 +repro_stability = 0.97 +ratchet_deadline = "2026-07-15" + +[[cell]] +cap = "cmdi" +lang = "python" +unsupported_rate = 0.40 +false_confirmed_rate = 0.02 +repro_stability = 0.97 +ratchet_deadline = "2026-07-15" + +[[cell]] +cap = "path_traversal" +lang = "python" +unsupported_rate = 0.50 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-07-15" + +[[cell]] +cap = "ssrf" +lang = "python" +unsupported_rate = 0.50 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-07-15" + +[[cell]] +cap = "deserialize" +lang = "python" +unsupported_rate = 0.60 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-08-01" + +# JavaScript / TypeScript (Phase 13 — second-most-mature). 
+ +[[cell]] +cap = "sqli" +lang = "javascript" +unsupported_rate = 0.55 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-08-01" + +[[cell]] +cap = "cmdi" +lang = "javascript" +unsupported_rate = 0.55 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-08-01" + +[[cell]] +cap = "ssrf" +lang = "javascript" +unsupported_rate = 0.60 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-08-01" + +[[cell]] +cap = "xss" +lang = "javascript" +unsupported_rate = 0.70 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-08-15" + +[[cell]] +cap = "sqli" +lang = "typescript" +unsupported_rate = 0.60 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-08-15" + +# Java (Phase 14). + +[[cell]] +cap = "sqli" +lang = "java" +unsupported_rate = 0.65 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-08-15" + +[[cell]] +cap = "deserialize" +lang = "java" +unsupported_rate = 0.70 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-09-01" + +# Phase 15 / 16 verticals (Go, PHP, Ruby, Rust, C, C++) — newer; broader +# tolerance until their probe-shim splicing follow-ups land. 
+ +[[cell]] +cap = "cmdi" +lang = "go" +unsupported_rate = 0.75 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-09-01" + +[[cell]] +cap = "sqli" +lang = "go" +unsupported_rate = 0.75 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-09-01" + +[[cell]] +cap = "cmdi" +lang = "php" +unsupported_rate = 0.75 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-09-01" + +[[cell]] +cap = "deserialize" +lang = "php" +unsupported_rate = 0.75 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-09-01" + +[[cell]] +cap = "cmdi" +lang = "ruby" +unsupported_rate = 0.75 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-09-01" + +[[cell]] +cap = "sqli" +lang = "rust" +unsupported_rate = 0.80 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-09-15" + +[[cell]] +cap = "fmt_string" +lang = "c" +unsupported_rate = 0.85 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-09-15" + +[[cell]] +cap = "memory" +lang = "c" +unsupported_rate = 0.90 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-10-01" + +[[cell]] +cap = "memory" +lang = "cpp" +unsupported_rate = 0.90 +false_confirmed_rate = 0.02 +repro_stability = 0.95 +ratchet_deadline = "2026-10-01" diff --git a/tests/eval_corpus/report.py b/tests/eval_corpus/report.py index 9d67e1c4..b940c83f 100644 --- a/tests/eval_corpus/report.py +++ b/tests/eval_corpus/report.py @@ -2,6 +2,11 @@ """ Aggregate eval results across all corpus sets and emit a summary table. Used by run.sh after all corpus sets have been tabulated. + +Phase 29 (Track I) extensions: + --budget tests/eval_corpus/budget.toml per-cell budget enforcement + --diff previous.json monotonic-improvement diff; + CI fails on any regression. 
""" import argparse @@ -9,10 +14,105 @@ import json import sys from collections import defaultdict +try: + import tomllib # Python 3.11+ +except ModuleNotFoundError: # pragma: no cover — older interpreters only + import tomli as tomllib # type: ignore[no-redef] + + +def load_budget(path: str) -> dict: + try: + with open(path, "rb") as f: + raw = tomllib.load(f) + except FileNotFoundError: + print(f"ERROR budget file not found: {path}", file=sys.stderr) + sys.exit(3) + except tomllib.TOMLDecodeError as e: + print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr) + sys.exit(3) + default = raw.get("default", {}) or {} + cells = {} + for row in raw.get("cell", []) or []: + cap = row.get("cap") + lang = row.get("lang") + if not cap or not lang: + print(f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr) + sys.exit(3) + cells[(cap, lang)] = row + return {"default": default, "cells": cells} + + +def budget_for_cell(budget: dict, cap: str, lang: str) -> dict: + merged = dict(budget.get("default", {}) or {}) + cell = budget.get("cells", {}).get((cap, lang)) + if cell: + merged.update({k: v for k, v in cell.items() if k not in ("cap", "lang")}) + if not cell: + wildcard = ( + budget.get("cells", {}).get((cap, "*")) + or budget.get("cells", {}).get(("*", lang)) + or budget.get("cells", {}).get(("*", "*")) + ) + if wildcard: + merged.update( + {k: v for k, v in wildcard.items() if k not in ("cap", "lang")} + ) + return merged + + +def load_previous_agg(path: str) -> dict: + """Aggregate a previous results file the same way main() does.""" + try: + with open(path) as f: + data = json.load(f) + except FileNotFoundError: + print(f"ERROR diff file not found: {path}", file=sys.stderr) + sys.exit(3) + except json.JSONDecodeError as e: + print(f"ERROR diff file malformed: {path}: {e}", file=sys.stderr) + sys.exit(3) + agg: dict[tuple[str, str], dict] = defaultdict( + lambda: { + "tp": 0, + "fp": 0, + "fn": 0, + "unsupported": 0, + "confirmed": 0, + 
"wrong_confirmed": 0, + "stable_replays": 0, + "total": 0, + } + ) + for r in data: + for c in r.get("cells", []): + k = (c["cap"], c["lang"]) + for field in ( + "tp", + "fp", + "fn", + "unsupported", + "confirmed", + "wrong_confirmed", + "stable_replays", + "total", + ): + agg[k][field] += c.get(field, 0) + return agg + def main() -> int: p = argparse.ArgumentParser() p.add_argument("--results", required=True) + p.add_argument( + "--budget", + default="", + help="path to budget.toml (per-(cap,lang) thresholds)", + ) + p.add_argument( + "--diff", + default="", + help="path to a previous results.json; fail on monotonic-improvement regression", + ) args = p.parse_args() with open(args.results) as f: @@ -24,12 +124,30 @@ def main() -> int: # Aggregate across sets. agg: dict[tuple[str, str], dict] = defaultdict( - lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0} + lambda: { + "tp": 0, + "fp": 0, + "fn": 0, + "unsupported": 0, + "confirmed": 0, + "wrong_confirmed": 0, + "stable_replays": 0, + "total": 0, + } ) for r in results: for c in r.get("cells", []): k = (c["cap"], c["lang"]) - for field in ("tp", "fp", "fn", "unsupported", "total"): + for field in ( + "tp", + "fp", + "fn", + "unsupported", + "confirmed", + "wrong_confirmed", + "stable_replays", + "total", + ): agg[k][field] += c.get(field, 0) print("\n=== Aggregated eval corpus report ===") @@ -46,18 +164,114 @@ def main() -> int: f"{unsup*100:>6.1f}%" ) - # Gate check: per-cap Unsupported rate <= 80% gate_failed = False - print("\n=== Gate checks ===") - UNSUPPORTED_BUDGET = 0.80 - for k, v in sorted(agg.items()): - unsup = v["unsupported"] / max(v["total"], 1) - if unsup > UNSUPPORTED_BUDGET: - print(f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}% > {UNSUPPORTED_BUDGET*100:.0f}% budget") - gate_failed = True - if not gate_failed: - print(" All gate thresholds met.") + # ── Phase 29: per-cell budget enforcement ──────────────────────────── + if args.budget: + budget = 
load_budget(args.budget) + print(f"\n=== Per-cell budget ({args.budget}) ===") + cell_fails: list[str] = [] + for k, v in sorted(agg.items()): + b = budget_for_cell(budget, k[0], k[1]) + if not b: + continue + max_unsup = b.get("unsupported_rate") + max_false = b.get("false_confirmed_rate") + min_stable = b.get("repro_stability") + + if isinstance(max_unsup, (int, float)) and v["total"] > 0: + rate = v["unsupported"] / v["total"] + if rate > max_unsup: + cell_fails.append( + f" FAIL {k[0]}/{k[1]}: Unsupported {rate*100:.1f}%" + f" > budget {max_unsup*100:.1f}%" + ) + if isinstance(max_false, (int, float)) and v["confirmed"] > 0: + rate = v["wrong_confirmed"] / v["confirmed"] + if rate > max_false: + cell_fails.append( + f" FAIL {k[0]}/{k[1]}: false-Confirmed {rate*100:.1f}%" + f" > budget {max_false*100:.1f}%" + ) + if ( + isinstance(min_stable, (int, float)) + and v["confirmed"] > 0 + and v.get("stable_replays", 0) > 0 + ): + rate = v["stable_replays"] / v["confirmed"] + if rate < min_stable: + cell_fails.append( + f" FAIL {k[0]}/{k[1]}: repro stability {rate*100:.1f}%" + f" < budget {min_stable*100:.1f}%" + ) + if cell_fails: + for line in cell_fails: + print(line) + gate_failed = True + else: + print(" All per-cell budgets met.") + else: + # Legacy fallback: per-cap Unsupported rate <= 80%. 
+ print("\n=== Gate checks ===") + UNSUPPORTED_BUDGET = 0.80 + cell_fails: list[str] = [] + for k, v in sorted(agg.items()): + unsup = v["unsupported"] / max(v["total"], 1) + if unsup > UNSUPPORTED_BUDGET: + cell_fails.append( + f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}%" + f" > {UNSUPPORTED_BUDGET*100:.0f}% budget" + ) + if cell_fails: + for line in cell_fails: + print(line) + gate_failed = True + else: + print(" All gate thresholds met.") + + # ── Phase 29: monotonic-improvement diff ───────────────────────────── + if args.diff: + prev = load_previous_agg(args.diff) + print(f"\n=== Monotonic-improvement diff vs {args.diff} ===") + diff_fails: list[str] = [] + EPS = 0.005 + for k, v in sorted(agg.items()): + old = prev.get(k) + if not old: + continue + old_unsup = old["unsupported"] / max(old["total"], 1) + new_unsup = v["unsupported"] / max(v["total"], 1) + if new_unsup > old_unsup + EPS: + diff_fails.append( + f" REGRESSION {k[0]}/{k[1]}: Unsupported" + f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%" + ) + old_conf = old.get("confirmed", 0) + new_conf = v.get("confirmed", 0) + old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None + new_false = (v.get("wrong_confirmed", 0) / new_conf) if new_conf else None + if old_false is not None and new_false is not None and new_false > old_false + EPS: + diff_fails.append( + f" REGRESSION {k[0]}/{k[1]}: false-Confirmed" + f" {old_false*100:.1f}% → {new_false*100:.1f}%" + ) + old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None + new_stable = (v.get("stable_replays", 0) / new_conf) if new_conf else None + if ( + old_stable is not None + and new_stable is not None + and new_stable < old_stable - EPS + ): + diff_fails.append( + f" REGRESSION {k[0]}/{k[1]}: repro stability" + f" {old_stable*100:.1f}% → {new_stable*100:.1f}%" + ) + if diff_fails: + for line in diff_fails: + print(line) + gate_failed = True + else: + print(" No regressions vs previous run.") return 2 if 
gate_failed else 0 diff --git a/tests/eval_corpus/run.sh b/tests/eval_corpus/run.sh index ab1e061d..3426c4f5 100755 --- a/tests/eval_corpus/run.sh +++ b/tests/eval_corpus/run.sh @@ -29,12 +29,17 @@ OUTPUT_DIR="" NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}" CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}" SETS="owasp,sard,inhouse" +# Phase 29 (Track I): per-cell budgets + monotonic-improvement diff. +BUDGET_FILE="" +DIFF_FILE="" while [[ $# -gt 0 ]]; do case "$1" in --output) OUTPUT_DIR="$2"; shift 2 ;; --nyx) NYX_BIN="$2"; shift 2 ;; --sets) SETS="$2"; shift 2 ;; + --budget) BUDGET_FILE="$2"; shift 2 ;; + --diff) DIFF_FILE="$2"; shift 2 ;; *) shift ;; esac done @@ -83,6 +88,8 @@ if [[ "$SETS" == *owasp* ]]; then --scan /tmp/nyx_owasp.json \ --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \ --append "$RESULTS_JSON" \ + ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ + ${DIFF_FILE:+--diff "$DIFF_FILE"} \ || info " tabulate.py failed; ground truth file may be absent" fi fi @@ -111,6 +118,8 @@ if [[ "$SETS" == *sard* ]]; then --scan /tmp/nyx_sard.json \ --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \ --append "$RESULTS_JSON" \ + ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ + ${DIFF_FILE:+--diff "$DIFF_FILE"} \ || info " tabulate.py failed; ground truth file may be absent" fi fi @@ -140,6 +149,8 @@ if [[ "$SETS" == *inhouse* ]]; then --scan "/tmp/nyx_${label}.json" \ --inhouse \ --append "$RESULTS_JSON" \ + ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ + ${DIFF_FILE:+--diff "$DIFF_FILE"} \ || info " tabulate.py failed on $label" done fi @@ -156,12 +167,20 @@ if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then fi set +e -python3 "${SCRIPT_DIR}/report.py" --results "$RESULTS_JSON" +python3 "${SCRIPT_DIR}/report.py" \ + --results "$RESULTS_JSON" \ + ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ + ${DIFF_FILE:+--diff "$DIFF_FILE"} REPORT_RC=$? set -e -# Propagate gate-fail (exit 2). Treat other non-zero as setup error (exit 1). 
+# Propagate gate-fail (exit 2) and malformed-config (exit 3) so the +# m7_ship_gate.sh Gate-1 dispatch can tell them apart. Treat other +# non-zero as setup error (exit 1). if [[ $REPORT_RC -eq 2 ]]; then exit 2 +elif [[ $REPORT_RC -eq 3 ]]; then + info "report.py: budget/diff configuration malformed; see $RESULTS_JSON" + exit 3 elif [[ $REPORT_RC -ne 0 ]]; then info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON" exit 1 diff --git a/tests/eval_corpus/tabulate.py b/tests/eval_corpus/tabulate.py index 86957137..8ad3e2c4 100644 --- a/tests/eval_corpus/tabulate.py +++ b/tests/eval_corpus/tabulate.py @@ -9,6 +9,17 @@ For in-house sets (--inhouse): counts findings by cap x language; reports Unsupported rate only (no ground truth required). Output: appends a result record to --append FILE. + +Phase 29 (Track I) extensions: + --budget tests/eval_corpus/budget.toml enforce per-cell budget thresholds + --diff previous.json compare against prior result file, + fail on monotonic-improvement + regression + +Exit codes: + 0 all rows pass. + 2 one or more per-cell budgets exceeded OR a diff regression was found. + 3 malformed budget / diff input (callers must fix configuration). """ import argparse @@ -17,6 +28,11 @@ import sys from collections import defaultdict from pathlib import Path +try: + import tomllib # Python 3.11+ +except ModuleNotFoundError: # pragma: no cover — older interpreters only + import tomli as tomllib # type: ignore[no-redef] + LINE_TOLERANCE = 5 # Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label. @@ -97,6 +113,203 @@ def lang_of(finding: dict) -> str: return "unknown" +# ── Budget loading ────────────────────────────────────────────────────────── + + +def load_budget(path: str) -> dict: + """Parse a budget.toml file. 
+ + Returns a dict:: + + { + "default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02, + "repro_stability": 0.95, "ratchet_deadline": "..."}, + "cells": {(cap, lang): {...overrides...}, ...}, + } + + Raises SystemExit(3) on a malformed file. + """ + + try: + with open(path, "rb") as f: + raw = tomllib.load(f) + except FileNotFoundError: + print(f"ERROR budget file not found: {path}", file=sys.stderr) + sys.exit(3) + except tomllib.TOMLDecodeError as e: + print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr) + sys.exit(3) + + default = raw.get("default", {}) or {} + cells = {} + for row in raw.get("cell", []) or []: + cap = row.get("cap") + lang = row.get("lang") + if not cap or not lang: + print( + f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr + ) + sys.exit(3) + cells[(cap, lang)] = row + + return {"default": default, "cells": cells} + + +def budget_for_cell(budget: dict, cap: str, lang: str) -> dict: + """Merge cell-specific overrides on top of [default].""" + merged = dict(budget.get("default", {}) or {}) + cell = budget.get("cells", {}).get((cap, lang)) + if cell: + merged.update({k: v for k, v in cell.items() if k not in ("cap", "lang")}) + # Fall back to a wildcard override if present. + if not cell: + wildcard = budget.get("cells", {}).get((cap, "*")) or \ + budget.get("cells", {}).get(("*", lang)) or \ + budget.get("cells", {}).get(("*", "*")) + if wildcard: + merged.update({k: v for k, v in wildcard.items() if k not in ("cap", "lang")}) + return merged + + +def enforce_budget(cells: list, budget: dict) -> list: + """Return a list of human-readable failure strings. + + Each cell's measured Unsupported / false-Confirmed / repro-stability + rate is compared against its merged budget row. A missing measurement + (e.g. no Confirmed findings → false-Confirmed denominator = 0) is + treated as "no data" and skipped, never as a failure. 
+ """ + + failures = [] + for c in cells: + b = budget_for_cell(budget, c["cap"], c["lang"]) + if not b: + continue + cap, lang = c["cap"], c["lang"] + max_unsup = b.get("unsupported_rate") + max_false = b.get("false_confirmed_rate") + min_stable = b.get("repro_stability") + + if isinstance(max_unsup, (int, float)) and c.get("total", 0) > 0: + if c["unsupported_rate"] > max_unsup: + failures.append( + f" FAIL {cap}/{lang}: Unsupported {c['unsupported_rate']*100:.1f}%" + f" > budget {max_unsup*100:.1f}%" + ) + if isinstance(max_false, (int, float)) and c.get("confirmed", 0) > 0: + rate = c.get("wrong_confirmed", 0) / c["confirmed"] + if rate > max_false: + failures.append( + f" FAIL {cap}/{lang}: false-Confirmed {rate*100:.1f}%" + f" > budget {max_false*100:.1f}%" + ) + # Repro stability is only enforced when callers stamped at least + # one `replay_stable: true` flag — otherwise stable_replays == 0 + # is indistinguishable from "we did not measure stability for + # this row" and the gate would fire vacuously on every clean run. + if ( + isinstance(min_stable, (int, float)) + and c.get("confirmed", 0) > 0 + and c.get("stable_replays", 0) > 0 + ): + rate = c["stable_replays"] / c["confirmed"] + if rate < min_stable: + failures.append( + f" FAIL {cap}/{lang}: repro stability {rate*100:.1f}%" + f" < budget {min_stable*100:.1f}%" + ) + return failures + + +# ── Diff loading ──────────────────────────────────────────────────────────── + + +def load_previous_cells(path: str, label: str) -> dict: + """Index a previous results file by (cap, lang) → cell. + + The previous file is the same shape as `--append`'s output. We pick the + record whose `label` matches the current run; if no exact match, fall + back to the first record. Missing/unreadable files exit 3. 
+ """ + + try: + with open(path) as f: + data = json.load(f) + except FileNotFoundError: + print(f"ERROR diff file not found: {path}", file=sys.stderr) + sys.exit(3) + except json.JSONDecodeError as e: + print(f"ERROR diff file malformed: {path}: {e}", file=sys.stderr) + sys.exit(3) + + records = data if isinstance(data, list) else [data] + chosen = None + for r in records: + if r.get("label") == label: + chosen = r + break + if chosen is None and records: + chosen = records[0] + if not chosen: + return {} + return {(c["cap"], c["lang"]): c for c in chosen.get("cells", [])} + + +def diff_regressions(cells: list, prev: dict) -> list: + """Compare current cells against previous. Returns failure strings. + + Three monotonicity rules: + * Unsupported% must not increase. + * False-Confirmed% must not increase. + * Repro-stability% must not decrease. + + Cells absent from `prev` are treated as new (skipped). + A small epsilon (0.5 percentage points) absorbs flake noise. + """ + EPS = 0.005 + failures = [] + for c in cells: + key = (c["cap"], c["lang"]) + old = prev.get(key) + if not old: + continue + # Unsupported. + old_unsup = old.get("unsupported_rate", 0.0) + new_unsup = c.get("unsupported_rate", 0.0) + if new_unsup > old_unsup + EPS: + failures.append( + f" REGRESSION {key[0]}/{key[1]}: Unsupported" + f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%" + ) + # False-Confirmed. + old_conf = old.get("confirmed", 0) + old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None + new_conf = c.get("confirmed", 0) + new_false = (c.get("wrong_confirmed", 0) / new_conf) if new_conf else None + if old_false is not None and new_false is not None and new_false > old_false + EPS: + failures.append( + f" REGRESSION {key[0]}/{key[1]}: false-Confirmed" + f" {old_false*100:.1f}% → {new_false*100:.1f}%" + ) + # Repro stability (higher is better). 
+ old_stable = ( + (old.get("stable_replays", 0) / old_conf) if old_conf else None + ) + new_stable = ( + (c.get("stable_replays", 0) / new_conf) if new_conf else None + ) + if ( + old_stable is not None + and new_stable is not None + and new_stable < old_stable - EPS + ): + failures.append( + f" REGRESSION {key[0]}/{key[1]}: repro stability" + f" {old_stable*100:.1f}% → {new_stable*100:.1f}%" + ) + return failures + + def main() -> int: p = argparse.ArgumentParser() p.add_argument("--label", required=True) @@ -104,14 +317,34 @@ def main() -> int: p.add_argument("--ground-truth", default="", help="ground truth JSON") p.add_argument("--inhouse", action="store_true") p.add_argument("--append", required=True, help="results accumulator JSON") + p.add_argument( + "--budget", + default="", + help="path to budget.toml (per-(cap,lang) thresholds)", + ) + p.add_argument( + "--diff", + default="", + help="path to a previous results JSON; fail on monotonic-improvement regression", + ) args = p.parse_args() scan_data = load_json(args.scan) findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", []) - # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported}} + # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed, + # wrong_confirmed, stable_replays, total}} cells: dict[tuple[str, str], dict] = defaultdict( - lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0} + lambda: { + "tp": 0, + "fp": 0, + "fn": 0, + "unsupported": 0, + "confirmed": 0, + "wrong_confirmed": 0, + "stable_replays": 0, + "total": 0, + } ) for f in findings: @@ -121,8 +354,19 @@ def main() -> int: ev = f.get("evidence", {}) or {} dv = ev.get("dynamic_verdict") if ev else None cells[key]["total"] += 1 - if dv and dv.get("status") == "Unsupported": - cells[key]["unsupported"] += 1 + if dv: + status = dv.get("status") + if status == "Unsupported": + cells[key]["unsupported"] += 1 + elif status == "Confirmed": + cells[key]["confirmed"] += 1 + # 
Repro-stability and false-Confirmed counts are optional + # fields tabulate.py reads off the verdict when callers + # (m7_ship_gate.sh / corpus_promote.yml) have stamped them. + if dv.get("wrong") is True: + cells[key]["wrong_confirmed"] += 1 + if dv.get("replay_stable") is True: + cells[key]["stable_replays"] += 1 if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists(): gt = load_json(args.ground_truth) @@ -201,7 +445,34 @@ def main() -> int: f"{c['precision']:>6.2f} {c['recall']:>6.2f} " f"{c['unsupported_rate']*100:>6.1f}%" ) - return 0 + + exit_rc = 0 + + # ── Phase 29: per-cell budget enforcement ───────────────────────────── + if args.budget: + budget = load_budget(args.budget) + failures = enforce_budget(result["cells"], budget) + if failures: + print(f"\n=== Per-cell budget regressions ({args.budget}) ===") + for line in failures: + print(line) + exit_rc = 2 + else: + print(f"\nPer-cell budget ({args.budget}): OK") + + # ── Phase 29: diff against previous run ─────────────────────────────── + if args.diff: + prev = load_previous_cells(args.diff, args.label) + failures = diff_regressions(result["cells"], prev) + if failures: + print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===") + for line in failures: + print(line) + exit_rc = 2 + else: + print(f"\nDiff vs {args.diff}: no regressions") + + return exit_rc if __name__ == "__main__": diff --git a/tests/eval_corpus/test_tabulate_regression.py b/tests/eval_corpus/test_tabulate_regression.py new file mode 100644 index 00000000..cdad3ba6 --- /dev/null +++ b/tests/eval_corpus/test_tabulate_regression.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Phase 29 (Track I) regression test for tests/eval_corpus/tabulate.py. + +Exercises --budget and --diff against hand-crafted scan + ground-truth +fixtures so the per-cell budget gate and monotonic-improvement diff are +demonstrably non-vacuous. 
+ +Run with:: + + python3 tests/eval_corpus/test_tabulate_regression.py + +Exits 0 when every assertion holds, non-zero otherwise. The asserts are +plain `assert` statements so the file works both as a stand-alone script +and under unittest discovery. +""" + +from __future__ import annotations + +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +REPO = Path(__file__).resolve().parents[2] +TABULATE = REPO / "tests/eval_corpus/tabulate.py" +BUDGET = REPO / "tests/eval_corpus/budget.toml" + + +def run_tabulate(*args: str) -> subprocess.CompletedProcess: + cmd = [sys.executable, str(TABULATE), *args] + return subprocess.run(cmd, capture_output=True, text=True) + + +def write_json(path: Path, data: object) -> None: + path.write_text(json.dumps(data, indent=2)) + + +# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs. +SINK_BIT_SQL = 1 << 7 # SQL_QUERY +SINK_BIT_CMDI = 1 << 10 # CODE_EXEC + + +def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict: + finding = { + "path": path, + "line": line, + "col": 0, + "id": "py.sqli.cursor_execute", + "evidence": {"sink_caps": cap_bit}, + } + if status: + finding["evidence"]["dynamic_verdict"] = {"status": status} + return finding + + +def test_budget_passes_on_clean_scan(tmp: Path) -> None: + scan = tmp / "scan_clean.json" + write_json( + scan, + { + "findings": [ + python_finding(SINK_BIT_SQL, "app.py", 10, "Confirmed"), + python_finding(SINK_BIT_SQL, "app.py", 20, "Confirmed"), + python_finding(SINK_BIT_SQL, "app.py", 30, "NotConfirmed"), + ] + }, + ) + append = tmp / "results_clean.json" + write_json(append, []) + proc = run_tabulate( + "--label", "test", + "--scan", str(scan), + "--inhouse", + "--append", str(append), + "--budget", str(BUDGET), + ) + assert proc.returncode == 0, f"clean scan must pass budget, got rc={proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}" + assert "Per-cell budget" in proc.stdout and "OK" in proc.stdout, 
proc.stdout + + +def test_budget_fails_when_unsupported_exceeds(tmp: Path) -> None: + # SQL_QUERY/python budget is 40% Unsupported. Hand-craft a scan with + # 100% Unsupported in that cell so the gate must trip. + scan = tmp / "scan_unsup.json" + write_json( + scan, + { + "findings": [ + python_finding(SINK_BIT_SQL, "app.py", i, "Unsupported") + for i in (10, 20, 30, 40, 50) + ] + }, + ) + append = tmp / "results_unsup.json" + write_json(append, []) + proc = run_tabulate( + "--label", "test", + "--scan", str(scan), + "--inhouse", + "--append", str(append), + "--budget", str(BUDGET), + ) + assert proc.returncode == 2, ( + f"budget breach must exit 2, got {proc.returncode}\n" + f"stdout: {proc.stdout}\nstderr: {proc.stderr}" + ) + assert "FAIL" in proc.stdout and "sqli/python" in proc.stdout, proc.stdout + + +def test_diff_fails_on_regression(tmp: Path) -> None: + # Previous run: 1/4 Unsupported = 25%. Current run: 3/4 = 75%. The + # default cell budget tolerates 80%, but the monotonic-improvement + # diff must still flag the +50pp regression. 
+ prev_findings = [ + python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"), + python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"), + python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"), + python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"), + ] + prev_scan = tmp / "prev_scan.json" + write_json(prev_scan, {"findings": prev_findings}) + prev_results = tmp / "prev_results.json" + write_json(prev_results, []) + rc_prev = run_tabulate( + "--label", "diff-test", + "--scan", str(prev_scan), + "--inhouse", + "--append", str(prev_results), + ).returncode + assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}" + + cur_findings = [ + python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"), + python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"), + python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"), + python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"), + ] + cur_scan = tmp / "cur_scan.json" + write_json(cur_scan, {"findings": cur_findings}) + cur_results = tmp / "cur_results.json" + write_json(cur_results, []) + proc = run_tabulate( + "--label", "diff-test", + "--scan", str(cur_scan), + "--inhouse", + "--append", str(cur_results), + "--diff", str(prev_results), + ) + assert proc.returncode == 2, ( + f"regression diff must exit 2, got {proc.returncode}\n" + f"stdout: {proc.stdout}\nstderr: {proc.stderr}" + ) + assert "REGRESSION" in proc.stdout and "Unsupported" in proc.stdout, proc.stdout + + +def test_diff_passes_on_improvement(tmp: Path) -> None: + # Previous: 3/4 Unsupported. Current: 1/4. Monotonic improvement + # must not flag any regression. 
+ prev_findings = [ + python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"), + python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"), + python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"), + python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"), + ] + prev_scan = tmp / "prev_scan.json" + write_json(prev_scan, {"findings": prev_findings}) + prev_results = tmp / "prev_results.json" + write_json(prev_results, []) + run_tabulate( + "--label", "improve-test", + "--scan", str(prev_scan), + "--inhouse", + "--append", str(prev_results), + ) + + cur_findings = [ + python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"), + python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"), + python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"), + python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"), + ] + cur_scan = tmp / "cur_scan.json" + write_json(cur_scan, {"findings": cur_findings}) + cur_results = tmp / "cur_results.json" + write_json(cur_results, []) + proc = run_tabulate( + "--label", "improve-test", + "--scan", str(cur_scan), + "--inhouse", + "--append", str(cur_results), + "--diff", str(prev_results), + ) + assert proc.returncode == 0, ( + f"improvement diff must exit 0, got {proc.returncode}\n" + f"stdout: {proc.stdout}\nstderr: {proc.stderr}" + ) + assert "no regressions" in proc.stdout, proc.stdout + + +def test_budget_malformed_exits_3(tmp: Path) -> None: + bad = tmp / "bad.toml" + bad.write_text("[default]\nunsupported_rate = not_a_number\n") + scan = tmp / "scan.json" + write_json(scan, {"findings": []}) + append = tmp / "results.json" + write_json(append, []) + proc = run_tabulate( + "--label", "test", + "--scan", str(scan), + "--inhouse", + "--append", str(append), + "--budget", str(bad), + ) + assert proc.returncode == 3, ( + f"malformed budget must exit 3, got {proc.returncode}\nstderr: {proc.stderr}" + ) + + +def main() -> int: + with tempfile.TemporaryDirectory() as td: + tmp = Path(td) + for fn in ( + 
test_budget_passes_on_clean_scan, + test_budget_fails_when_unsupported_exceeds, + test_diff_fails_on_regression, + test_diff_passes_on_improvement, + test_budget_malformed_exits_3, + ): + sub = tmp / fn.__name__ + sub.mkdir() + print(f"... {fn.__name__}") + fn(sub) + print(f" OK") + print("\nAll tabulate.py regression checks passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/python_fixtures.rs b/tests/python_fixtures.rs index 7e8d0df8..74ed8c34 100644 --- a/tests/python_fixtures.rs +++ b/tests/python_fixtures.rs @@ -15,7 +15,7 @@ mod common; mod python_fixture_tests { use crate::common::fixture_harness::{ run_fixture_and_compare_to_golden, run_harness_snapshot, run_shape_fixture, - CopyStrategy, FixtureSpec, + CopyStrategy, FixtureSpec, Prerequisite, }; use nyx_scanner::commands::scan::Diag; use nyx_scanner::dynamic::spec::PayloadSlot; @@ -48,6 +48,12 @@ mod python_fixture_tests { sink_line, confidence: Confidence::High, copy: CopyStrategy::PreserveName, + // Phase 29 (Track I): the Python harness emitter shells out + // to `python3` during verify, so the host must have it. + // The harness short-circuits with a structured skip when + // missing; CI rows that intentionally omit Python still go + // green. + requires: vec![Prerequisite::CommandAvailable("python3")], } } @@ -65,6 +71,10 @@ mod python_fixture_tests { sink_line, confidence: Confidence::Low, copy: CopyStrategy::PreserveName, + // Low-confidence rows short-circuit to + // `Unsupported(ConfidenceTooLow)` before the harness ever + // shells out to python3, so no prerequisite is needed. 
+ requires: vec![], } } diff --git a/tests/rust_fixtures.rs b/tests/rust_fixtures.rs index 0ad367e9..cddbd9da 100644 --- a/tests/rust_fixtures.rs +++ b/tests/rust_fixtures.rs @@ -12,7 +12,7 @@ mod common; #[cfg(feature = "dynamic")] mod rust_fixture_tests { use crate::common::fixture_harness::{ - run_fixture_and_compare_to_golden, CopyStrategy, FixtureSpec, + run_fixture_and_compare_to_golden, CopyStrategy, FixtureSpec, Prerequisite, }; use nyx_scanner::commands::scan::Diag; use nyx_scanner::dynamic::verify::{verify_finding, VerifyOptions}; @@ -32,6 +32,11 @@ mod rust_fixture_tests { sink_line, confidence: Confidence::High, copy: CopyStrategy::RustEntry, + // Phase 29 (Track I): the Rust harness emitter shells out + // to `cargo` during verify, so the host must have a Rust + // toolchain on PATH. Missing cargo triggers a structured + // skip rather than a panic. + requires: vec![Prerequisite::CommandAvailable("cargo")], } } @@ -49,6 +54,10 @@ mod rust_fixture_tests { sink_line, confidence: Confidence::Low, copy: CopyStrategy::RustEntry, + // Low-confidence rows short-circuit to + // `Unsupported(ConfidenceTooLow)` before the harness ever + // shells out to cargo. + requires: vec![], } }