[pitboss] phase 29: Track I — Per-cell budgets, --diff, fixture prerequisites, CI matrix expansion

This commit is contained in:
pitboss 2026-05-15 19:22:40 -05:00
parent 760bc1beb2
commit dd607fb4b3
10 changed files with 1325 additions and 32 deletions

152
.github/workflows/dynamic.yml vendored Normal file
View file

@ -0,0 +1,152 @@
# Phase 29 (Track I): dedicated dynamic-verification matrix.
#
# Three rows exercise the dynamic harness pipeline (`cargo nextest run
# --features dynamic`) under the host configurations the Phase 17–28
# tracks documented as supported:
#
# linux-process-only — Ubuntu host, no docker daemon. Forces the
# process backend and exercises the Phase 17
# Linux hardening primitives (chroot, seccomp,
# unshare, no_new_privs). `libc6-dev` is
# installed so the hardening probe + escape
# suite can `cc -static`; without it the
# chroot-leg of the escape suite skips silently
# (Phase 20 follow-up #4 in deferred.md).
#
# linux-with-docker — Ubuntu host with docker-in-docker. Exercises
# the docker backend (Phase 19) and the
# differential-confirmation parity tests.
#
# macos — macOS-latest, no docker. Exercises the
# Phase-18 `sandbox-exec` primitives plus the
# process backend on Darwin. Track-I acceptance
# literal: "cargo nextest run --features dynamic
# is green on macOS without docker."
name: dynamic

# Least privilege: the jobs below only need to read the repository.
permissions:
  contents: read

on:
  push:
    branches: ["master"]
  pull_request:
    branches: ["master"]

# One active run per pull request (or per ref for pushes); a newer push
# cancels the run it supersedes.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  linux-process-only:
    name: dynamic / linux-process-only
    runs-on: ubuntu-latest
    env:
      # Force the process backend even when callers default to Auto so
      # docker-unavailable paths cannot accidentally hide a regression.
      NYX_SANDBOX_BACKEND: process
    steps:
      - uses: actions/checkout@v6
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: stable
          cache: true
      - uses: taiki-e/install-action@nextest
      # Phase 17 / Phase 20 follow-up: the hardening probe + escape
      # suite chroot leg need static glibc. Without these packages the
      # `cc -static probe.c` step in tests/sandbox_hardening_linux.rs +
      # tests/sandbox_escape_suite.rs falls back to dynamic linking and
      # the chroot leg silently skips.
      - name: Install fixture prerequisites (static libc)
        run: |
          sudo apt-get update -y
          sudo apt-get install -y --no-install-recommends libc6-dev libc-dev-bin
      # python3 is mandatory; node is installed on demand; ruby and php are
      # best-effort (their absence must not fail the job).
      - name: Smoke-test interpreter availability
        run: |
          python3 --version
          node --version || sudo apt-get install -y --no-install-recommends nodejs
          ruby --version || true
          php --version || true
      - name: Dynamic suite (process backend only)
        run: cargo nextest run --features dynamic

  linux-with-docker:
    name: dynamic / linux-with-docker
    runs-on: ubuntu-latest
    services:
      docker:
        image: docker:dind
        env:
          # Must be set inside the dind container: an empty value disables
          # TLS so the daemon listens on plain tcp/2375.
          DOCKER_TLS_CERTDIR: ""
        ports:
          # This job runs directly on the runner (no `container:`), so the
          # service is reachable only through a port mapped onto localhost;
          # the service name `docker` does not resolve from the steps.
          - "2375:2375"
        options: --privileged
    env:
      DOCKER_HOST: tcp://localhost:2375
    steps:
      - uses: actions/checkout@v6
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: stable
          cache: true
      - uses: taiki-e/install-action@nextest
      - name: Install fixture prerequisites (static libc)
        run: |
          sudo apt-get update -y
          sudo apt-get install -y --no-install-recommends libc6-dev libc-dev-bin
      - name: Pull language images for sandbox tests
        run: |
          docker pull python:3-slim
          docker pull node:20-slim
          docker pull eclipse-temurin:21-jre-jammy
          docker pull php:8-cli
      - name: Smoke-test docker interpreter availability
        run: |
          docker run --rm python:3-slim python3 --version
          docker run --rm node:20-slim node --version
          docker run --rm eclipse-temurin:21-jre-jammy java -version
          docker run --rm php:8-cli php --version
      - name: Dynamic suite (process + docker backends)
        run: cargo nextest run --features dynamic

  macos:
    name: dynamic / macos
    runs-on: macos-latest
    env:
      # macOS runners ship without docker; force process backend so the
      # `Auto` resolver in src/dynamic/sandbox.rs cannot accidentally
      # pick up a stray Lima/Colima daemon and confuse the matrix.
      NYX_SANDBOX_BACKEND: process
    steps:
      - uses: actions/checkout@v6
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: stable
          cache: true
      - uses: taiki-e/install-action@nextest
      - name: Smoke-test sandbox-exec availability
        run: |
          /usr/bin/sandbox-exec -p '(version 1)(allow default)' /bin/echo ok
      - name: Smoke-test interpreter availability
        run: |
          python3 --version
          node --version
          ruby --version
      # Phase 29 acceptance literal: "cargo nextest run --features
      # dynamic is green on macOS without docker (process-only row)."
      - name: Dynamic suite (macOS, process backend)
        run: cargo nextest run --features dynamic

View file

@ -6,6 +6,7 @@
#
# Usage:
# scripts/m7_ship_gate.sh [--nyx BIN] [--corpus-dir DIR] [--skip GATE,...]
# [--budget FILE] [--diff FILE]
#
# Gates:
# 1. unsupported-rate — per-cell (cap × lang) Unsupported% within budget
@ -13,6 +14,11 @@
# 3. wall-clock — default scan ≤ 2× static-only on bench suite
# 4. sandbox-escape — sandbox escape suite green for all langs
# 5. repro-stability — repro artifact regenerates identical verdict ≥ 95%
#
# Phase 29 (Track I): Gate 1 consumes per-cell budgets from
# `tests/eval_corpus/budget.toml` and, when `--diff PREV.json` is
# supplied, fails on any monotonic-improvement regression vs the
# previous run.
set -euo pipefail
@ -23,12 +29,17 @@ CORPUS_DIR="${CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
SKIP_GATES=""
GATE_ERRORS=0
GATE_LOG="${REPO_ROOT}/target/m7_gate.log"
# Phase 29 (Track I): per-cell budgets + monotonic diff.
BUDGET_FILE="${BUDGET_FILE:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
DIFF_FILE="${DIFF_FILE:-}"
# Parse command-line flags. Each recognised flag consumes exactly one value.
# `${2?...}` aborts with a readable message when that value is missing
# (instead of an opaque "unbound variable" error), while still accepting an
# explicitly empty value such as `--skip ""`.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --nyx)        NYX_BIN="${2?--nyx requires a value}"; shift 2 ;;
        --corpus-dir) CORPUS_DIR="${2?--corpus-dir requires a value}"; shift 2 ;;
        --skip)       SKIP_GATES="${2?--skip requires a value}"; shift 2 ;;
        --budget)     BUDGET_FILE="${2?--budget requires a value}"; shift 2 ;;
        --diff)       DIFF_FILE="${2?--diff requires a value}"; shift 2 ;;
        *)
            # Unknown arguments are still skipped (backward compatible), but
            # no longer silently: a typo such as `--dif prev.json` would
            # otherwise disable the regression diff without any trace.
            echo "warning: ignoring unrecognised argument: $1" >&2
            shift
            ;;
    esac
done
@ -45,28 +56,46 @@ mkdir -p "$(dirname "$GATE_LOG")"
echo "# M7 ship gate — $(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$GATE_LOG"
info "nyx: $NYX_BIN"
info "corpus: $CORPUS_DIR"
info "budget: $BUDGET_FILE"
info "diff: ${DIFF_FILE:-<none>}"
info ""
# ── Gate 1: Unsupported-rate budget ─────────────────────────────────────────
# ── Gate 1: Per-cell budget + monotonic-improvement diff ───────────────────
#
# Phase 29 (Track I): the single global Unsupported threshold is replaced
# by per-cell (cap × lang) budgets in tests/eval_corpus/budget.toml.
# `tests/eval_corpus/run.sh` invokes `tabulate.py` per set and `report.py`
# at the end with `--budget` (and `--diff` when DIFF_FILE is set), so
# any per-cell failure (or any regression vs the prior run) propagates
# back as exit 2.
if skip unsupported-rate; then
info "Gate 1 (unsupported-rate): SKIPPED"
else
info "Gate 1: per-cell Unsupported rate within budget..."
info "Gate 1: per-cell budget within tolerance + no monotonic regressions..."
EVAL_RESULTS="${REPO_ROOT}/target/eval_results.json"
echo "[]" > "$EVAL_RESULTS"
# Run eval corpus runner (in-house set always present).
if bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \
if [[ ! -f "$BUDGET_FILE" ]]; then
die "Gate 1: budget file not found at $BUDGET_FILE"
else
# Run eval corpus runner (in-house set always present).
set +e
bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \
--nyx "$NYX_BIN" \
--sets inhouse \
--output "$(dirname "$EVAL_RESULTS")" 2>>"$GATE_LOG"; then
# Copy result to our location.
cp "$(dirname "$EVAL_RESULTS")/eval_results.json" "$EVAL_RESULTS" 2>/dev/null || true
pass "Gate 1: unsupported-rate check passed"
else
--output "$(dirname "$EVAL_RESULTS")" \
--budget "$BUDGET_FILE" \
${DIFF_FILE:+--diff "$DIFF_FILE"} \
>>"$GATE_LOG" 2>>"$GATE_LOG"
RC=$?
if [[ $RC -eq 2 ]]; then
die "Gate 1: Unsupported rate exceeds budget for one or more (cap, lang) cells"
set -e
cp "$(dirname "$EVAL_RESULTS")/eval_results.json" "$EVAL_RESULTS" 2>/dev/null || true
if [[ $RC -eq 0 ]]; then
pass "Gate 1: per-cell budget + diff check passed"
elif [[ $RC -eq 2 ]]; then
die "Gate 1: per-cell budget exceeded OR monotonic-improvement regression (see $GATE_LOG)"
elif [[ $RC -eq 3 ]]; then
die "Gate 1: budget/diff configuration is malformed (see $GATE_LOG)"
else
info "Gate 1: eval runner returned $RC (corpus may not be downloaded; treating as SKIP)"
fi

View file

@ -48,6 +48,131 @@ pub enum CopyStrategy {
RustEntry,
}
/// Phase 29 (Track I): host-environment prerequisite a fixture needs in
/// order to run. The harness consults the list before staging the
/// fixture; any unsatisfied prerequisite triggers a structured skip
/// rather than a panic, so non-applicable matrix rows (process-only
/// macOS, dockerless CI, missing static libc) still see green ticks.
///
/// Each variant is evaluated by `check_prerequisites`, which reports the
/// first unmet entry as the corresponding `SkipReason`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(dead_code)]
pub enum Prerequisite {
    /// A binary must resolve on `PATH` and respond to `--version` with
    /// exit code 0 (e.g. `python3`, `node`, `go`, `cargo`).
    CommandAvailable(&'static str),
    /// A specific env var must be set (used to gate feature-flagged
    /// suites — e.g. `NYX_ENABLE_FLAKY_FIXTURES=1`). Only presence is
    /// checked; the value itself is not inspected.
    EnvVar(&'static str),
    /// The docker daemon must be reachable. Equivalent to
    /// `docker info` returning exit 0.
    DockerAvailable,
    /// A static C library must be linkable. The payload is the bare
    /// library name as given to the linker's `-l` flag (e.g. `c` for
    /// `libc.a`), because the check runs `cc -static ... -l<name>`.
    /// Used by the Phase-17/20 hardening probe fixtures.
    StaticLib(&'static str),
}
/// Phase 29 (Track I): why the harness skipped a fixture. Carried by
/// every skip so callers can distinguish "host did not have python3" from
/// "host has docker but daemon refused" from "intentional env-var gate".
///
/// Each variant mirrors one `Prerequisite` variant and carries the same
/// payload (command, variable or library name) that failed the check.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(dead_code)]
pub enum SkipReason {
    /// The named command could not be run, or `<cmd> --version` exited
    /// non-zero.
    MissingCommand(&'static str),
    /// The named environment variable was not set.
    MissingEnvVar(&'static str),
    /// `docker info` could not be run or exited non-zero.
    DockerUnavailable,
    /// A static link against the named library (`-l<name>`) failed.
    MissingStaticLib(&'static str),
}
impl std::fmt::Display for SkipReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
SkipReason::MissingCommand(c) => write!(f, "missing command on PATH: {c}"),
SkipReason::MissingEnvVar(v) => write!(f, "env var not set: {v}"),
SkipReason::DockerUnavailable => write!(f, "docker daemon unavailable"),
SkipReason::MissingStaticLib(l) => write!(f, "static lib not linkable: {l}"),
}
}
}
/// Returns the first unsatisfied prerequisite, or `Ok(())` when every
/// requirement holds. Exposed for tests that want to gate their own
/// per-shape helpers without going through `FixtureSpec`.
///
/// Entries are checked in slice order and evaluation stops at the first
/// failure, so later (possibly slower) probes are not run once a skip is
/// already certain. An empty slice is trivially satisfied.
///
/// Every probe is best-effort: a probe that cannot even be started (binary
/// missing, temp dir not creatable, ...) counts as "prerequisite unmet"
/// rather than an error, because the caller's only reaction is to skip.
#[allow(dead_code)]
pub fn check_prerequisites(reqs: &[Prerequisite]) -> Result<(), SkipReason> {
    for req in reqs {
        match req {
            Prerequisite::CommandAvailable(cmd) => {
                if !prereq_command_succeeds(std::process::Command::new(cmd).arg("--version")) {
                    return Err(SkipReason::MissingCommand(cmd));
                }
            }
            Prerequisite::EnvVar(var) => {
                // `var_os` rather than `var`: a variable that is set to a
                // non-Unicode value is still *set* and must not be reported
                // as missing.
                if std::env::var_os(var).is_none() {
                    return Err(SkipReason::MissingEnvVar(var));
                }
            }
            Prerequisite::DockerAvailable => {
                if !prereq_command_succeeds(std::process::Command::new("docker").arg("info")) {
                    return Err(SkipReason::DockerUnavailable);
                }
            }
            Prerequisite::StaticLib(lib) => {
                if !prereq_static_lib_links(lib) {
                    return Err(SkipReason::MissingStaticLib(lib));
                }
            }
        }
    }
    Ok(())
}

/// Runs `cmd` with captured output and reports whether it exited with
/// status 0. A spawn failure (e.g. binary not on `PATH`) yields `false`.
#[allow(dead_code)]
fn prereq_command_succeeds(cmd: &mut std::process::Command) -> bool {
    cmd.output().map(|o| o.status.success()).unwrap_or(false)
}

/// Treats `lib` as linkable iff `cc -static -l<lib>` succeeds on a trivial
/// translation unit. Slow but reliable; only called by the small Phase-17
/// hardening suite.
///
/// Source and output both live in a private temporary directory that is
/// removed when this function returns. This avoids handing `cc` an output
/// path whose temp file has already been deleted, and passes the paths as
/// `OsStr` so non-UTF-8 temp locations keep working.
#[allow(dead_code)]
fn prereq_static_lib_links(lib: &str) -> bool {
    let dir = match tempfile::tempdir() {
        Ok(d) => d,
        Err(_) => return false,
    };
    let src = dir.path().join("nyx-prereq-probe.c");
    let out = dir.path().join("nyx-prereq-probe");
    // A probe that could not be written must not be compiled: an empty
    // source would fail for reasons unrelated to the library.
    if std::fs::write(&src, "int main(void) { return 0; }\n").is_err() {
        return false;
    }
    prereq_command_succeeds(
        std::process::Command::new("cc")
            .args(["-x", "c", "-static"])
            .arg(&src)
            .arg("-o")
            .arg(&out)
            // `-l` must follow the source so the linker resolves against it.
            .arg(format!("-l{lib}")),
    )
}
/// Per-fixture specification.
pub struct FixtureSpec<'a> {
/// Subdirectory under `tests/dynamic_fixtures/` (e.g. `"python"`, `"rust"`).
@ -67,6 +192,11 @@ pub struct FixtureSpec<'a> {
pub confidence: Confidence,
/// File-layout strategy for the temp-dir copy.
pub copy: CopyStrategy,
/// Phase 29 (Track I): host-environment prerequisites. Empty means
/// "always runs"; otherwise the harness checks each entry before
/// staging the fixture and skips with a structured [`SkipReason`]
/// when any prerequisite is unmet.
pub requires: Vec<Prerequisite>,
}
/// Trimmed verdict shape persisted in the `.golden.json` file.
@ -100,6 +230,14 @@ impl From<&VerifyResult> for GoldenVerdict {
/// stored golden or — when `NYX_UPDATE_GOLDENS=1` — overwrite the golden
/// with the current verdict.
pub fn run_fixture_and_compare_to_golden(spec: &FixtureSpec<'_>) {
if let Err(reason) = check_prerequisites(&spec.requires) {
eprintln!(
"SKIP {}/{}: prerequisite unmet — {reason}",
spec.lang_dir, spec.fixture
);
return;
}
let _guard = FIXTURE_LOCK.lock().unwrap_or_else(|e| e.into_inner());
let fixture_root = fixture_dir(spec.lang_dir);

View file

@ -0,0 +1,210 @@
# Per-cell (cap × lang) budgets for the dynamic-verification eval corpus.
#
# Phase 29 (Track I): replaces the single global Unsupported-rate gate in
# tests/eval_corpus/report.py with per-cell targets. Each cell records the
# largest tolerated rate today plus a deadline date for the next ratchet.
#
# Schema:
#
# [default]
# unsupported_rate = 0.80 # max(Unsupported / total) per cell
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cell
# repro_stability = 0.95 # min(stable / Confirmed) per cell
# ratchet_deadline = "2026-08-01"
#
# [[cell]]
# cap = "sqli"
# lang = "python"
# unsupported_rate = 0.50
# false_confirmed_rate = 0.02
# repro_stability = 0.97
# ratchet_deadline = "2026-07-15"
#
# `cap` matches tabulate.py's _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
# `lang` matches the ext_map values (`python`, `javascript`, …).
# A wildcard `"*"` matches any cell that does not have an exact entry.
[default]
# Inherited by any cell not overridden below. Aligned with the legacy
# Gate-1 / Gate-2 / Gate-5 thresholds in scripts/m7_ship_gate.sh.
unsupported_rate = 0.80
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-08-01"
# Python verticals (Phase 12 — most mature; tightest budgets).
[[cell]]
cap = "sqli"
lang = "python"
unsupported_rate = 0.40
false_confirmed_rate = 0.02
repro_stability = 0.97
ratchet_deadline = "2026-07-15"
[[cell]]
cap = "cmdi"
lang = "python"
unsupported_rate = 0.40
false_confirmed_rate = 0.02
repro_stability = 0.97
ratchet_deadline = "2026-07-15"
[[cell]]
cap = "path_traversal"
lang = "python"
unsupported_rate = 0.50
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-07-15"
[[cell]]
cap = "ssrf"
lang = "python"
unsupported_rate = 0.50
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-07-15"
[[cell]]
cap = "deserialize"
lang = "python"
unsupported_rate = 0.60
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-08-01"
# JavaScript / TypeScript (Phase 13 — second-most-mature).
[[cell]]
cap = "sqli"
lang = "javascript"
unsupported_rate = 0.55
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-08-01"
[[cell]]
cap = "cmdi"
lang = "javascript"
unsupported_rate = 0.55
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-08-01"
[[cell]]
cap = "ssrf"
lang = "javascript"
unsupported_rate = 0.60
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-08-01"
[[cell]]
cap = "xss"
lang = "javascript"
unsupported_rate = 0.70
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-08-15"
[[cell]]
cap = "sqli"
lang = "typescript"
unsupported_rate = 0.60
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-08-15"
# Java (Phase 14).
[[cell]]
cap = "sqli"
lang = "java"
unsupported_rate = 0.65
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-08-15"
[[cell]]
cap = "deserialize"
lang = "java"
unsupported_rate = 0.70
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-09-01"
# Phase 15 / 16 verticals (Go, PHP, Ruby, Rust, C, C++) — newer; broader
# tolerance until their probe-shim splicing follow-ups land.
[[cell]]
cap = "cmdi"
lang = "go"
unsupported_rate = 0.75
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-09-01"
[[cell]]
cap = "sqli"
lang = "go"
unsupported_rate = 0.75
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-09-01"
[[cell]]
cap = "cmdi"
lang = "php"
unsupported_rate = 0.75
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-09-01"
[[cell]]
cap = "deserialize"
lang = "php"
unsupported_rate = 0.75
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-09-01"
[[cell]]
cap = "cmdi"
lang = "ruby"
unsupported_rate = 0.75
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-09-01"
[[cell]]
cap = "sqli"
lang = "rust"
unsupported_rate = 0.80
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-09-15"
[[cell]]
cap = "fmt_string"
lang = "c"
unsupported_rate = 0.85
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-09-15"
[[cell]]
cap = "memory"
lang = "c"
unsupported_rate = 0.90
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-10-01"
[[cell]]
cap = "memory"
lang = "cpp"
unsupported_rate = 0.90
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-10-01"

View file

@ -2,6 +2,11 @@
"""
Aggregate eval results across all corpus sets and emit a summary table.
Used by run.sh after all corpus sets have been tabulated.
Phase 29 (Track I) extensions:
--budget tests/eval_corpus/budget.toml per-cell budget enforcement
--diff previous.json monotonic-improvement diff;
CI fails on any regression.
"""
import argparse
@ -9,10 +14,105 @@ import json
import sys
from collections import defaultdict
try:
import tomllib # Python 3.11+
except ModuleNotFoundError: # pragma: no cover — older interpreters only
import tomli as tomllib # type: ignore[no-redef]
def load_budget(path: str) -> dict:
try:
with open(path, "rb") as f:
raw = tomllib.load(f)
except FileNotFoundError:
print(f"ERROR budget file not found: {path}", file=sys.stderr)
sys.exit(3)
except tomllib.TOMLDecodeError as e:
print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr)
sys.exit(3)
default = raw.get("default", {}) or {}
cells = {}
for row in raw.get("cell", []) or []:
cap = row.get("cap")
lang = row.get("lang")
if not cap or not lang:
print(f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr)
sys.exit(3)
cells[(cap, lang)] = row
return {"default": default, "cells": cells}
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Return the effective thresholds for one (cap, lang) cell.

    Starts from a copy of ``budget["default"]`` and layers one override row
    on top: the exact ``(cap, lang)`` row if present, otherwise the first
    existing wildcard row among ``(cap, "*")``, ``("*", lang)`` and
    ``("*", "*")``. The ``cap``/``lang`` keys of the row are not copied.
    """
    effective = dict(budget.get("default", {}) or {})
    table = budget.get("cells", {})

    override = table.get((cap, lang))
    if not override:
        # No exact row: probe wildcards from most to least specific.
        for key in ((cap, "*"), ("*", lang), ("*", "*")):
            override = table.get(key)
            if override:
                break

    for field, value in (override or {}).items():
        if field not in ("cap", "lang"):
            effective[field] = value
    return effective
def load_previous_agg(path: str) -> dict:
    """Aggregate a previous results file the same way main() does.

    The file holds either a list of result records or a single record; each
    record's ``cells`` are summed per ``(cap, lang)`` key across records.
    Counters absent from a cell count as 0.

    Returns a mapping ``(cap, lang) -> counters``. Look cells up with
    ``.get()``: the mapping is a defaultdict, so indexing an unknown key
    would create an all-zero entry.

    Exits with status 3 (configuration error) when the file is missing,
    unreadable, not valid JSON, or a record/cell does not have the
    expected shape.
    """
    fields = (
        "tp",
        "fp",
        "fn",
        "unsupported",
        "confirmed",
        "wrong_confirmed",
        "stable_replays",
        "total",
    )

    def _fail(message: str) -> None:
        print(f"ERROR diff file malformed: {path}: {message}", file=sys.stderr)
        sys.exit(3)

    try:
        with open(path) as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"ERROR diff file not found: {path}", file=sys.stderr)
        sys.exit(3)
    except OSError as e:
        print(f"ERROR diff file unreadable: {path}: {e}", file=sys.stderr)
        sys.exit(3)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"ERROR diff file malformed: {path}: {e}", file=sys.stderr)
        sys.exit(3)

    # Accept a bare record as well as the usual list of records.
    records = data if isinstance(data, list) else [data]

    agg: dict[tuple[str, str], dict] = defaultdict(lambda: dict.fromkeys(fields, 0))
    for r in records:
        if not isinstance(r, dict):
            _fail(f"record is not an object: {r!r}")
        for c in r.get("cells", []) or []:
            if not isinstance(c, dict) or "cap" not in c or "lang" not in c:
                _fail(f"cell missing cap/lang: {c!r}")
            k = (c["cap"], c["lang"])
            for field in fields:
                agg[k][field] += c.get(field, 0)
    return agg
def main() -> int:
p = argparse.ArgumentParser()
p.add_argument("--results", required=True)
p.add_argument(
"--budget",
default="",
help="path to budget.toml (per-(cap,lang) thresholds)",
)
p.add_argument(
"--diff",
default="",
help="path to a previous results.json; fail on monotonic-improvement regression",
)
args = p.parse_args()
with open(args.results) as f:
@ -24,12 +124,30 @@ def main() -> int:
# Aggregate across sets.
agg: dict[tuple[str, str], dict] = defaultdict(
lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
lambda: {
"tp": 0,
"fp": 0,
"fn": 0,
"unsupported": 0,
"confirmed": 0,
"wrong_confirmed": 0,
"stable_replays": 0,
"total": 0,
}
)
for r in results:
for c in r.get("cells", []):
k = (c["cap"], c["lang"])
for field in ("tp", "fp", "fn", "unsupported", "total"):
for field in (
"tp",
"fp",
"fn",
"unsupported",
"confirmed",
"wrong_confirmed",
"stable_replays",
"total",
):
agg[k][field] += c.get(field, 0)
print("\n=== Aggregated eval corpus report ===")
@ -46,18 +164,114 @@ def main() -> int:
f"{unsup*100:>6.1f}%"
)
# Gate check: per-cap Unsupported rate <= 80%
gate_failed = False
print("\n=== Gate checks ===")
UNSUPPORTED_BUDGET = 0.80
for k, v in sorted(agg.items()):
unsup = v["unsupported"] / max(v["total"], 1)
if unsup > UNSUPPORTED_BUDGET:
print(f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}% > {UNSUPPORTED_BUDGET*100:.0f}% budget")
gate_failed = True
if not gate_failed:
print(" All gate thresholds met.")
# ── Phase 29: per-cell budget enforcement ────────────────────────────
if args.budget:
budget = load_budget(args.budget)
print(f"\n=== Per-cell budget ({args.budget}) ===")
cell_fails: list[str] = []
for k, v in sorted(agg.items()):
b = budget_for_cell(budget, k[0], k[1])
if not b:
continue
max_unsup = b.get("unsupported_rate")
max_false = b.get("false_confirmed_rate")
min_stable = b.get("repro_stability")
if isinstance(max_unsup, (int, float)) and v["total"] > 0:
rate = v["unsupported"] / v["total"]
if rate > max_unsup:
cell_fails.append(
f" FAIL {k[0]}/{k[1]}: Unsupported {rate*100:.1f}%"
f" > budget {max_unsup*100:.1f}%"
)
if isinstance(max_false, (int, float)) and v["confirmed"] > 0:
rate = v["wrong_confirmed"] / v["confirmed"]
if rate > max_false:
cell_fails.append(
f" FAIL {k[0]}/{k[1]}: false-Confirmed {rate*100:.1f}%"
f" > budget {max_false*100:.1f}%"
)
if (
isinstance(min_stable, (int, float))
and v["confirmed"] > 0
and v.get("stable_replays", 0) > 0
):
rate = v["stable_replays"] / v["confirmed"]
if rate < min_stable:
cell_fails.append(
f" FAIL {k[0]}/{k[1]}: repro stability {rate*100:.1f}%"
f" < budget {min_stable*100:.1f}%"
)
if cell_fails:
for line in cell_fails:
print(line)
gate_failed = True
else:
print(" All per-cell budgets met.")
else:
# Legacy fallback: per-cap Unsupported rate <= 80%.
print("\n=== Gate checks ===")
UNSUPPORTED_BUDGET = 0.80
cell_fails: list[str] = []
for k, v in sorted(agg.items()):
unsup = v["unsupported"] / max(v["total"], 1)
if unsup > UNSUPPORTED_BUDGET:
cell_fails.append(
f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}%"
f" > {UNSUPPORTED_BUDGET*100:.0f}% budget"
)
if cell_fails:
for line in cell_fails:
print(line)
gate_failed = True
else:
print(" All gate thresholds met.")
# ── Phase 29: monotonic-improvement diff ─────────────────────────────
if args.diff:
prev = load_previous_agg(args.diff)
print(f"\n=== Monotonic-improvement diff vs {args.diff} ===")
diff_fails: list[str] = []
EPS = 0.005
for k, v in sorted(agg.items()):
old = prev.get(k)
if not old:
continue
old_unsup = old["unsupported"] / max(old["total"], 1)
new_unsup = v["unsupported"] / max(v["total"], 1)
if new_unsup > old_unsup + EPS:
diff_fails.append(
f" REGRESSION {k[0]}/{k[1]}: Unsupported"
f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
)
old_conf = old.get("confirmed", 0)
new_conf = v.get("confirmed", 0)
old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None
new_false = (v.get("wrong_confirmed", 0) / new_conf) if new_conf else None
if old_false is not None and new_false is not None and new_false > old_false + EPS:
diff_fails.append(
f" REGRESSION {k[0]}/{k[1]}: false-Confirmed"
f" {old_false*100:.1f}% → {new_false*100:.1f}%"
)
old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None
new_stable = (v.get("stable_replays", 0) / new_conf) if new_conf else None
if (
old_stable is not None
and new_stable is not None
and new_stable < old_stable - EPS
):
diff_fails.append(
f" REGRESSION {k[0]}/{k[1]}: repro stability"
f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
)
if diff_fails:
for line in diff_fails:
print(line)
gate_failed = True
else:
print(" No regressions vs previous run.")
return 2 if gate_failed else 0

View file

@ -29,12 +29,17 @@ OUTPUT_DIR=""
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
SETS="owasp,sard,inhouse"
# Phase 29 (Track I): per-cell budgets + monotonic-improvement diff.
BUDGET_FILE=""
DIFF_FILE=""
# Parse command-line flags. Each recognised flag consumes exactly one value.
# `${2?...}` aborts with a readable message when that value is missing;
# without the guard a trailing flag makes `shift 2` fail, which either kills
# the script without explanation (errexit) or leaves `$#` unchanged and spins
# this loop forever. An explicitly empty value (`--sets ""`) is still accepted.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --output) OUTPUT_DIR="${2?--output requires a value}"; shift 2 ;;
        --nyx)    NYX_BIN="${2?--nyx requires a value}"; shift 2 ;;
        --sets)   SETS="${2?--sets requires a value}"; shift 2 ;;
        --budget) BUDGET_FILE="${2?--budget requires a value}"; shift 2 ;;
        --diff)   DIFF_FILE="${2?--diff requires a value}"; shift 2 ;;
        *)
            # Unknown arguments are still skipped (backward compatible), but
            # reported on stderr so a mistyped flag does not silently turn
            # off budget or diff enforcement.
            echo "warning: ignoring unrecognised argument: $1" >&2
            shift
            ;;
    esac
done
@ -83,6 +88,8 @@ if [[ "$SETS" == *owasp* ]]; then
--scan /tmp/nyx_owasp.json \
--ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
--append "$RESULTS_JSON" \
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|| info " tabulate.py failed; ground truth file may be absent"
fi
fi
@ -111,6 +118,8 @@ if [[ "$SETS" == *sard* ]]; then
--scan /tmp/nyx_sard.json \
--ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
--append "$RESULTS_JSON" \
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|| info " tabulate.py failed; ground truth file may be absent"
fi
fi
@ -140,6 +149,8 @@ if [[ "$SETS" == *inhouse* ]]; then
--scan "/tmp/nyx_${label}.json" \
--inhouse \
--append "$RESULTS_JSON" \
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|| info " tabulate.py failed on $label"
done
fi
@ -156,12 +167,20 @@ if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
fi
set +e
python3 "${SCRIPT_DIR}/report.py" --results "$RESULTS_JSON"
python3 "${SCRIPT_DIR}/report.py" \
--results "$RESULTS_JSON" \
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
${DIFF_FILE:+--diff "$DIFF_FILE"}
REPORT_RC=$?
set -e
# Propagate gate-fail (exit 2). Treat other non-zero as setup error (exit 1).
# Propagate gate-fail (exit 2) and malformed-config (exit 3) so the
# m7_ship_gate.sh Gate-1 dispatch can tell them apart. Treat other
# non-zero as setup error (exit 1).
if [[ $REPORT_RC -eq 2 ]]; then
exit 2
elif [[ $REPORT_RC -eq 3 ]]; then
info "report.py: budget/diff configuration malformed; see $RESULTS_JSON"
exit 3
elif [[ $REPORT_RC -ne 0 ]]; then
info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
exit 1

View file

@ -9,6 +9,17 @@ For in-house sets (--inhouse): counts findings by cap x language; reports
Unsupported rate only (no ground truth required).
Output: appends a result record to --append FILE.
Phase 29 (Track I) extensions:
--budget tests/eval_corpus/budget.toml enforce per-cell budget thresholds
--diff previous.json compare against prior result file,
fail on monotonic-improvement
regression
Exit codes:
0 all rows pass.
2 one or more per-cell budgets exceeded OR a diff regression was found.
3 malformed budget / diff input (callers must fix configuration).
"""
import argparse
@ -17,6 +28,11 @@ import sys
from collections import defaultdict
from pathlib import Path
try:
import tomllib # Python 3.11+
except ModuleNotFoundError: # pragma: no cover — older interpreters only
import tomli as tomllib # type: ignore[no-redef]
LINE_TOLERANCE = 5
# Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
@ -97,6 +113,203 @@ def lang_of(finding: dict) -> str:
return "unknown"
# ── Budget loading ──────────────────────────────────────────────────────────
def load_budget(path: str) -> dict:
"""Parse a budget.toml file.
Returns a dict::
{
"default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02,
"repro_stability": 0.95, "ratchet_deadline": "..."},
"cells": {(cap, lang): {...overrides...}, ...},
}
Raises SystemExit(3) on a malformed file.
"""
try:
with open(path, "rb") as f:
raw = tomllib.load(f)
except FileNotFoundError:
print(f"ERROR budget file not found: {path}", file=sys.stderr)
sys.exit(3)
except tomllib.TOMLDecodeError as e:
print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr)
sys.exit(3)
default = raw.get("default", {}) or {}
cells = {}
for row in raw.get("cell", []) or []:
cap = row.get("cap")
lang = row.get("lang")
if not cap or not lang:
print(
f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr
)
sys.exit(3)
cells[(cap, lang)] = row
return {"default": default, "cells": cells}
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Resolve the thresholds that apply to the (cap, lang) cell.

    The result is a copy of ``[default]`` updated with a single override
    row: the exact match when one exists, else the first wildcard row found
    in the order ``(cap, "*")``, ``("*", lang)``, ``("*", "*")``. The row's
    own ``cap``/``lang`` keys never leak into the result.
    """
    resolved = dict(budget.get("default", {}) or {})
    rows = budget.get("cells", {})

    chosen = rows.get((cap, lang))
    if not chosen:
        # Wildcards are consulted only when there is no exact row.
        candidates = (rows.get(key) for key in ((cap, "*"), ("*", lang), ("*", "*")))
        chosen = next((row for row in candidates if row), None)

    if chosen:
        resolved.update(
            {name: value for name, value in chosen.items() if name not in ("cap", "lang")}
        )
    return resolved
def enforce_budget(cells: list, budget: dict) -> list:
    """Return a list of human-readable failure strings.

    Each cell's measured Unsupported / false-Confirmed / repro-stability
    rate is compared against its merged budget row. A missing measurement
    (e.g. no Confirmed findings → false-Confirmed denominator = 0) is
    treated as "no data" and skipped, never as a failure.

    The Unsupported rate is taken from the cell's ``unsupported_rate`` when
    present and otherwise derived from ``unsupported / total``, so a cell
    that only carries raw counts is still checked instead of raising
    ``KeyError``.
    """
    failures = []
    for c in cells:
        cap, lang = c["cap"], c["lang"]
        b = budget_for_cell(budget, cap, lang)
        if not b:
            # No [default] and no matching row: nothing to enforce.
            continue
        max_unsup = b.get("unsupported_rate")
        max_false = b.get("false_confirmed_rate")
        min_stable = b.get("repro_stability")

        total = c.get("total", 0)
        if isinstance(max_unsup, (int, float)) and total > 0:
            rate = c.get("unsupported_rate")
            if rate is None:
                rate = c.get("unsupported", 0) / total
            if rate > max_unsup:
                failures.append(
                    f" FAIL {cap}/{lang}: Unsupported {rate*100:.1f}%"
                    f" > budget {max_unsup*100:.1f}%"
                )

        confirmed = c.get("confirmed", 0)
        if isinstance(max_false, (int, float)) and confirmed > 0:
            rate = c.get("wrong_confirmed", 0) / confirmed
            if rate > max_false:
                failures.append(
                    f" FAIL {cap}/{lang}: false-Confirmed {rate*100:.1f}%"
                    f" > budget {max_false*100:.1f}%"
                )

        # Repro stability is only enforced when callers stamped at least
        # one `replay_stable: true` flag — otherwise stable_replays == 0
        # is indistinguishable from "we did not measure stability for
        # this row" and the gate would fire vacuously on every clean run.
        stable = c.get("stable_replays", 0)
        if isinstance(min_stable, (int, float)) and confirmed > 0 and stable > 0:
            rate = stable / confirmed
            if rate < min_stable:
                failures.append(
                    f" FAIL {cap}/{lang}: repro stability {rate*100:.1f}%"
                    f" < budget {min_stable*100:.1f}%"
                )
    return failures
# ── Diff loading ────────────────────────────────────────────────────────────
def load_previous_cells(path: str, label: str) -> dict:
    """Index a previous results file by (cap, lang) → cell.

    The previous file is the same shape as `--append`'s output. We pick the
    record whose `label` matches the current run; if no exact match, fall
    back to the first record. A file with no records yields an empty dict.

    Missing, unreadable or malformed files exit 3; "malformed" includes
    records or cells that are not JSON objects and cells without
    ``cap``/``lang``.
    """

    def _fail(message: str) -> None:
        print(f"ERROR diff file malformed: {path}: {message}", file=sys.stderr)
        sys.exit(3)

    try:
        with open(path) as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"ERROR diff file not found: {path}", file=sys.stderr)
        sys.exit(3)
    except OSError as e:
        # Permission denied, path is a directory, ...
        print(f"ERROR diff file unreadable: {path}: {e}", file=sys.stderr)
        sys.exit(3)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"ERROR diff file malformed: {path}: {e}", file=sys.stderr)
        sys.exit(3)

    records = data if isinstance(data, list) else [data]
    for r in records:
        if not isinstance(r, dict):
            _fail(f"record is not an object: {r!r}")

    # Prefer the record for this label; otherwise use the first one.
    chosen = next((r for r in records if r.get("label") == label), None)
    if chosen is None and records:
        chosen = records[0]
    if not chosen:
        return {}

    indexed = {}
    for c in chosen.get("cells", []) or []:
        if not isinstance(c, dict) or "cap" not in c or "lang" not in c:
            _fail(f"cell missing cap/lang: {c!r}")
        indexed[(c["cap"], c["lang"])] = c
    return indexed
def diff_regressions(cells: list, prev: dict) -> list:
    """Report cells whose metrics got worse relative to a previous run.

    Each current cell is looked up in `prev` by `(cap, lang)`; cells with
    no previous entry are new and are skipped. Three checks run per cell,
    in this order:

      * Unsupported rate must not go up.
      * False-Confirmed rate (wrong_confirmed / confirmed) must not go up.
      * Repro-stability rate (stable_replays / confirmed) must not go down.

    The two Confirmed-based rates are only compared when both runs have a
    non-zero `confirmed` count. A change of at most 0.005 (0.5 percentage
    points) is tolerated as flake noise.

    Returns:
        A list of human-readable regression lines; empty when clean.
    """
    # NOTE(review): a cell with stable_replays == 0 is compared as 0%
    # stable here, while the budget gate treats that as "not measured" —
    # confirm this asymmetry is intended.
    tolerance = 0.005

    def per_confirmed(cell, counter):
        # Rate of `counter` over Confirmed findings; None when there are none.
        confirmed = cell.get("confirmed", 0)
        return (cell.get(counter, 0) / confirmed) if confirmed else None

    regressions = []
    for current in cells:
        cap, lang = current["cap"], current["lang"]
        baseline = prev.get((cap, lang))
        if not baseline:
            continue

        def report(metric, before, after):
            regressions.append(
                f" REGRESSION {cap}/{lang}: {metric}"
                f" {before*100:.1f}% → {after*100:.1f}%"
            )

        # Unsupported rate: lower is better.
        unsup_before = baseline.get("unsupported_rate", 0.0)
        unsup_after = current.get("unsupported_rate", 0.0)
        if unsup_after > unsup_before + tolerance:
            report("Unsupported", unsup_before, unsup_after)

        # False-Confirmed rate: lower is better.
        false_before = per_confirmed(baseline, "wrong_confirmed")
        false_after = per_confirmed(current, "wrong_confirmed")
        if not (false_before is None or false_after is None):
            if false_after > false_before + tolerance:
                report("false-Confirmed", false_before, false_after)

        # Repro stability: higher is better.
        stable_before = per_confirmed(baseline, "stable_replays")
        stable_after = per_confirmed(current, "stable_replays")
        if not (stable_before is None or stable_after is None):
            if stable_after < stable_before - tolerance:
                report("repro stability", stable_before, stable_after)

    return regressions
def main() -> int:
p = argparse.ArgumentParser()
p.add_argument("--label", required=True)
@ -104,14 +317,34 @@ def main() -> int:
p.add_argument("--ground-truth", default="", help="ground truth JSON")
p.add_argument("--inhouse", action="store_true")
p.add_argument("--append", required=True, help="results accumulator JSON")
p.add_argument(
"--budget",
default="",
help="path to budget.toml (per-(cap,lang) thresholds)",
)
p.add_argument(
"--diff",
default="",
help="path to a previous results JSON; fail on monotonic-improvement regression",
)
args = p.parse_args()
scan_data = load_json(args.scan)
findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported}}
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
# wrong_confirmed, stable_replays, total}}
cells: dict[tuple[str, str], dict] = defaultdict(
lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
lambda: {
"tp": 0,
"fp": 0,
"fn": 0,
"unsupported": 0,
"confirmed": 0,
"wrong_confirmed": 0,
"stable_replays": 0,
"total": 0,
}
)
for f in findings:
@ -121,8 +354,19 @@ def main() -> int:
ev = f.get("evidence", {}) or {}
dv = ev.get("dynamic_verdict") if ev else None
cells[key]["total"] += 1
if dv and dv.get("status") == "Unsupported":
cells[key]["unsupported"] += 1
if dv:
status = dv.get("status")
if status == "Unsupported":
cells[key]["unsupported"] += 1
elif status == "Confirmed":
cells[key]["confirmed"] += 1
# Repro-stability and false-Confirmed counts are optional
# fields tabulate.py reads off the verdict when callers
# (m7_ship_gate.sh / corpus_promote.yml) have stamped them.
if dv.get("wrong") is True:
cells[key]["wrong_confirmed"] += 1
if dv.get("replay_stable") is True:
cells[key]["stable_replays"] += 1
if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
gt = load_json(args.ground_truth)
@ -201,7 +445,34 @@ def main() -> int:
f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
f"{c['unsupported_rate']*100:>6.1f}%"
)
return 0
exit_rc = 0
# ── Phase 29: per-cell budget enforcement ─────────────────────────────
if args.budget:
budget = load_budget(args.budget)
failures = enforce_budget(result["cells"], budget)
if failures:
print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
for line in failures:
print(line)
exit_rc = 2
else:
print(f"\nPer-cell budget ({args.budget}): OK")
# ── Phase 29: diff against previous run ───────────────────────────────
if args.diff:
prev = load_previous_cells(args.diff, args.label)
failures = diff_regressions(result["cells"], prev)
if failures:
print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
for line in failures:
print(line)
exit_rc = 2
else:
print(f"\nDiff vs {args.diff}: no regressions")
return exit_rc
if __name__ == "__main__":

View file

@ -0,0 +1,241 @@
#!/usr/bin/env python3
"""
Phase 29 (Track I) regression test for tests/eval_corpus/tabulate.py.
Exercises --budget and --diff against hand-crafted scan + ground-truth
fixtures so the per-cell budget gate and monotonic-improvement diff are
demonstrably non-vacuous.
Run with::
python3 tests/eval_corpus/test_tabulate_regression.py
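Exits 0 when every assertion holds, non-zero otherwise. The asserts are
plain `assert` statements and every test takes its scratch directory as a
`tmp` argument supplied by `main()`, so run the file as a stand-alone
script; a generic test runner would have to provide that argument itself.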
"""
from __future__ import annotations
import json
import subprocess
import sys
import tempfile
from pathlib import Path
# Repository root: this file lives at tests/eval_corpus/<name>.py, so the
# root is two directories above the file's own directory.
REPO = Path(__file__).resolve().parents[2]
# Script under test, and the budget file the tests pass via --budget.
TABULATE = REPO / "tests/eval_corpus/tabulate.py"
BUDGET = REPO / "tests/eval_corpus/budget.toml"
def run_tabulate(*args: str) -> subprocess.CompletedProcess:
    """Run tabulate.py with the current interpreter and the given CLI args.

    stdout and stderr are captured as text; the return code is left for the
    caller to inspect (a non-zero exit does not raise).
    """
    return subprocess.run(
        [sys.executable, str(TABULATE)] + list(args),
        capture_output=True,
        text=True,
    )
def write_json(path: Path, data: object) -> None:
    """Serialize `data` as 2-space-indented JSON and write it to `path`."""
    serialized = json.dumps(data, indent=2)
    with path.open("w") as handle:
        handle.write(serialized)
# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs.
# NOTE(review): the second constant is named CMDI but annotated CODE_EXEC —
# confirm which capability bit 10 maps to in tabulate.py.
SINK_BIT_SQL = 1 << 7 # SQL_QUERY
SINK_BIT_CMDI = 1 << 10 # CODE_EXEC
def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
    """Build a minimal scan finding for the fixtures in this file.

    Args:
        cap_bit: value stored as the finding's `evidence.sink_caps`.
        path: reported file path of the finding.
        line: reported line number (the column is always 0).
        status: dynamic-verdict status to attach; when falsy, the finding
            carries no `dynamic_verdict` at all.

    Returns:
        The finding as a plain dict, ready to be JSON-serialized.
    """
    evidence: dict = {"sink_caps": cap_bit}
    if status:
        evidence["dynamic_verdict"] = {"status": status}
    return {
        "path": path,
        "line": line,
        "col": 0,
        "id": "py.sqli.cursor_execute",
        "evidence": evidence,
    }
def test_budget_passes_on_clean_scan(tmp: Path) -> None:
    """A scan with no Unsupported verdicts must pass the --budget gate (rc 0)."""
    verdict_by_line = {10: "Confirmed", 20: "Confirmed", 30: "NotConfirmed"}
    scan = tmp / "scan_clean.json"
    write_json(
        scan,
        {
            "findings": [
                python_finding(SINK_BIT_SQL, "app.py", line_no, verdict)
                for line_no, verdict in verdict_by_line.items()
            ]
        },
    )
    # Start from an empty accumulator so the run only sees this scan.
    append = tmp / "results_clean.json"
    write_json(append, [])

    argv = [
        "--label", "test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--budget", str(BUDGET),
    ]
    proc = run_tabulate(*argv)

    assert proc.returncode == 0, (
        f"clean scan must pass budget, got rc={proc.returncode}\n"
        f"stdout: {proc.stdout}\n"
        f"stderr: {proc.stderr}"
    )
    assert all(marker in proc.stdout for marker in ("Per-cell budget", "OK")), proc.stdout
def test_budget_fails_when_unsupported_exceeds(tmp: Path) -> None:
    """A cell that is 100% Unsupported must trip the --budget gate (rc 2)."""
    # The original author notes the SQL_QUERY/python budget is 40%
    # Unsupported; five Unsupported findings out of five put that cell at
    # 100%, so the gate has to fire.
    scan = tmp / "scan_unsup.json"
    all_unsupported = [
        python_finding(SINK_BIT_SQL, "app.py", line_no, "Unsupported")
        for line_no in range(10, 60, 10)
    ]
    write_json(scan, {"findings": all_unsupported})
    append = tmp / "results_unsup.json"
    write_json(append, [])

    argv = [
        "--label", "test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--budget", str(BUDGET),
    ]
    proc = run_tabulate(*argv)

    assert proc.returncode == 2, (
        f"budget breach must exit 2, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert all(marker in proc.stdout for marker in ("FAIL", "sqli/python")), proc.stdout
def test_diff_fails_on_regression(tmp: Path) -> None:
    """--diff must exit 2 when a cell's Unsupported rate rises between runs."""
    # Baseline run: 1/4 Unsupported = 25%. Current run: 3/4 = 75%. No
    # --budget is passed here, so only the monotonic-improvement diff can
    # flag the +50pp regression (the author notes the default cell budget
    # would tolerate 80%).

    def cmdi_scan(statuses):
        # One finding per status, on consecutive lines starting at 1.
        return {
            "findings": [
                python_finding(SINK_BIT_CMDI, "x.unknown", line_no, verdict)
                for line_no, verdict in enumerate(statuses, start=1)
            ]
        }

    # Seed the "previous" results file by running tabulate once.
    prev_scan = tmp / "prev_scan.json"
    write_json(prev_scan, cmdi_scan(["Confirmed"] * 3 + ["Unsupported"]))
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    seed = run_tabulate(
        "--label", "diff-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    )
    rc_prev = seed.returncode
    assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}"

    # Current run, diffed against the seeded file.
    cur_scan = tmp / "cur_scan.json"
    write_json(cur_scan, cmdi_scan(["Unsupported"] * 3 + ["Confirmed"]))
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    proc = run_tabulate(
        "--label", "diff-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )

    assert proc.returncode == 2, (
        f"regression diff must exit 2, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert all(marker in proc.stdout for marker in ("REGRESSION", "Unsupported")), proc.stdout
def test_diff_passes_on_improvement(tmp: Path) -> None:
    """--diff must exit 0 when a cell's Unsupported rate drops between runs."""
    # Previous: 3/4 Unsupported. Current: 1/4. Monotonic improvement
    # must not flag any regression.
    prev_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"),
    ]
    prev_scan = tmp / "prev_scan.json"
    write_json(prev_scan, {"findings": prev_findings})
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    seed = run_tabulate(
        "--label", "improve-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    )
    # If the seed run fails, prev_results.json stays `[]`, the diff has
    # nothing to compare against, and "no regressions" would be reported
    # vacuously. Fail loudly instead, as test_diff_fails_on_regression does.
    assert seed.returncode == 0, (
        f"prev seed run must succeed, got {seed.returncode}\n"
        f"stdout: {seed.stdout}\nstderr: {seed.stderr}"
    )

    cur_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"),
    ]
    cur_scan = tmp / "cur_scan.json"
    write_json(cur_scan, {"findings": cur_findings})
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    proc = run_tabulate(
        "--label", "improve-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )
    assert proc.returncode == 0, (
        f"improvement diff must exit 0, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "no regressions" in proc.stdout, proc.stdout
def test_budget_malformed_exits_3(tmp: Path) -> None:
    """A budget file that is not valid TOML must make tabulate.py exit 3."""
    # `not_a_number` is a bare word, which is not a valid TOML value.
    malformed_budget = tmp / "bad.toml"
    malformed_budget.write_text("[default]\nunsupported_rate = not_a_number\n")

    # An empty scan is enough: the run should fail on the budget file alone.
    scan = tmp / "scan.json"
    append = tmp / "results.json"
    write_json(scan, {"findings": []})
    write_json(append, [])

    argv = [
        "--label", "test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--budget", str(malformed_budget),
    ]
    proc = run_tabulate(*argv)

    assert proc.returncode == 3, (
        f"malformed budget must exit 3, got {proc.returncode}\nstderr: {proc.stderr}"
    )
def main() -> int:
    """Run every regression check, each in its own scratch sub-directory.

    A failing check raises `AssertionError`, which propagates and stops the
    run. Returns 0 once every check has passed.
    """
    checks = (
        test_budget_passes_on_clean_scan,
        test_budget_fails_when_unsupported_exceeds,
        test_diff_fails_on_regression,
        test_diff_passes_on_improvement,
        test_budget_malformed_exits_3,
    )
    with tempfile.TemporaryDirectory() as scratch:
        root = Path(scratch)
        for check in checks:
            # Separate directory per check so fixture file names cannot clash.
            workdir = root / check.__name__
            workdir.mkdir()
            print(f"... {check.__name__}")
            check(workdir)
            print(" OK")
        print("\nAll tabulate.py regression checks passed.")
        return 0
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -15,7 +15,7 @@ mod common;
mod python_fixture_tests {
use crate::common::fixture_harness::{
run_fixture_and_compare_to_golden, run_harness_snapshot, run_shape_fixture,
CopyStrategy, FixtureSpec,
CopyStrategy, FixtureSpec, Prerequisite,
};
use nyx_scanner::commands::scan::Diag;
use nyx_scanner::dynamic::spec::PayloadSlot;
@ -48,6 +48,12 @@ mod python_fixture_tests {
sink_line,
confidence: Confidence::High,
copy: CopyStrategy::PreserveName,
// Phase 29 (Track I): the Python harness emitter shells out
// to `python3` during verify, so the host must have it.
// The harness short-circuits with a structured skip when
// missing; CI rows that intentionally omit Python still go
// green.
requires: vec![Prerequisite::CommandAvailable("python3")],
}
}
@ -65,6 +71,10 @@ mod python_fixture_tests {
sink_line,
confidence: Confidence::Low,
copy: CopyStrategy::PreserveName,
// Low-confidence rows short-circuit to
// `Unsupported(ConfidenceTooLow)` before the harness ever
// shells out to python3, so no prerequisite is needed.
requires: vec![],
}
}

View file

@ -12,7 +12,7 @@ mod common;
#[cfg(feature = "dynamic")]
mod rust_fixture_tests {
use crate::common::fixture_harness::{
run_fixture_and_compare_to_golden, CopyStrategy, FixtureSpec,
run_fixture_and_compare_to_golden, CopyStrategy, FixtureSpec, Prerequisite,
};
use nyx_scanner::commands::scan::Diag;
use nyx_scanner::dynamic::verify::{verify_finding, VerifyOptions};
@ -32,6 +32,11 @@ mod rust_fixture_tests {
sink_line,
confidence: Confidence::High,
copy: CopyStrategy::RustEntry,
// Phase 29 (Track I): the Rust harness emitter shells out
// to `cargo` during verify, so the host must have a Rust
// toolchain on PATH. Missing cargo triggers a structured
// skip rather than a panic.
requires: vec![Prerequisite::CommandAvailable("cargo")],
}
}
@ -49,6 +54,10 @@ mod rust_fixture_tests {
sink_line,
confidence: Confidence::Low,
copy: CopyStrategy::RustEntry,
// Low-confidence rows short-circuit to
// `Unsupported(ConfidenceTooLow)` before the harness ever
// shells out to cargo.
requires: vec![],
}
}