[pitboss] phase 29: Track I — Per-cell budgets, --diff, fixture prerequisites, CI matrix expansion

2026-07-24 21:41:02 +02:00 · 2026-05-15 19:22:40 -05:00 · 2026-05-15 19:22:40 -05:00 · dd607fb4b3
commit dd607fb4b3
parent 760bc1beb2
10 changed files with 1325 additions and 32 deletions
--- a/.github/workflows/dynamic.yml
+++ b/.github/workflows/dynamic.yml
@ -0,0 +1,152 @@
+# Phase 29 (Track I): dedicated dynamic-verification matrix.
+#
+# Three rows exercise the dynamic harness pipeline (`cargo nextest run
+# --features dynamic`) under the host configurations the Phase 17–28
+# tracks documented as supported:
+#
+#   linux-process-only — Ubuntu host, no docker daemon.  Forces the
+#                        process backend and exercises the Phase 17
+#                        Linux hardening primitives (chroot, seccomp,
+#                        unshare, no_new_privs).  `libc6-dev` is
+#                        installed so the hardening probe + escape
+#                        suite can `cc -static`; without it the
+#                        chroot-leg of the escape suite skips silently
+#                        (Phase 20 follow-up #4 in deferred.md).
+#
+#   linux-with-docker  — Ubuntu host with docker-in-docker.  Exercises
+#                        the docker backend (Phase 19) and the
+#                        differential-confirmation parity tests.
+#
+#   macos              — macOS-latest, no docker.  Exercises the
+#                        Phase-18 `sandbox-exec` primitives plus the
+#                        process backend on Darwin.  Track-I acceptance
+#                        literal: "cargo nextest run --features dynamic
+#                        is green on macOS without docker."
+
+name: dynamic
+
+# Least privilege: every job in this workflow only reads the repository.
+permissions:
+  contents: read
+
+# Runs on pushes to master and on pull requests that target master.
+on:
+  push:
+    branches: ["master"]
+  pull_request:
+    branches: ["master"]
+
+# One live run per pull request (or per ref when there is no PR number);
+# a newer run cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  linux-process-only:
+    name: dynamic / linux-process-only
+    runs-on: ubuntu-latest
+    env:
+      # Force the process backend even when callers default to Auto so
+      # docker-unavailable paths cannot accidentally hide a regression.
+      NYX_SANDBOX_BACKEND: process
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: stable
+          cache: true
+
+      - uses: taiki-e/install-action@nextest
+
+      # Phase 17 / Phase 20 follow-up: the hardening probe + escape
+      # suite chroot leg need static glibc.  Without these packages the
+      # `cc -static probe.c` step in tests/sandbox_hardening_linux.rs +
+      # tests/sandbox_escape_suite.rs falls back to dynamic linking and
+      # the chroot leg silently skips.
+      - name: Install fixture prerequisites (static libc)
+        run: |
+          sudo apt-get update -y
+          sudo apt-get install -y --no-install-recommends libc6-dev libc-dev-bin
+
+      # python3 must already be present (its check has no fallback);
+      # node is installed on demand when missing; ruby and php are
+      # optional, so `|| true` keeps their absence from failing the step.
+      - name: Smoke-test interpreter availability
+        run: |
+          python3 --version
+          node --version || sudo apt-get install -y --no-install-recommends nodejs
+          ruby --version || true
+          php --version || true
+
+      - name: Dynamic suite (process backend only)
+        run: cargo nextest run --features dynamic
+
+  linux-with-docker:
+    name: dynamic / linux-with-docker
+    runs-on: ubuntu-latest
+    services:
+      docker:
+        image: docker:dind
+        # Service containers do not inherit the job-level `env`, so the
+        # TLS opt-out has to be set on the dind container itself for
+        # dockerd to serve plain TCP on 2375.
+        env:
+          DOCKER_TLS_CERTDIR: ""
+        # The steps run directly on the runner (no `container:` key),
+        # where the service is reachable only through a published port
+        # on localhost, not through the `docker` hostname.
+        ports:
+          - "2375:2375"
+        # The health check makes the runner wait until dockerd answers
+        # before the first step starts.
+        options: >-
+          --privileged
+          --health-cmd "docker info"
+          --health-interval 5s
+          --health-timeout 5s
+          --health-retries 12
+    env:
+      DOCKER_HOST: tcp://localhost:2375
+    # NOTE(review): with dind, `docker run -v <path>` resolves <path>
+    # inside the service container, not on the runner.  Confirm the
+    # docker-backend tests do not bind-mount runner paths.
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: stable
+          cache: true
+
+      - uses: taiki-e/install-action@nextest
+
+      # Same static-libc prerequisite as the process-only row: the
+      # process-backend tests also run in this job.
+      - name: Install fixture prerequisites (static libc)
+        run: |
+          sudo apt-get update -y
+          sudo apt-get install -y --no-install-recommends libc6-dev libc-dev-bin
+
+      # Pull up front so image download time is not charged to the
+      # individual sandbox tests.
+      - name: Pull language images for sandbox tests
+        run: |
+          docker pull python:3-slim
+          docker pull node:20-slim
+          docker pull eclipse-temurin:21-jre-jammy
+          docker pull php:8-cli
+
+      # Fails fast if the daemon is unreachable or an image lacks its
+      # interpreter, before the test suite starts.
+      - name: Smoke-test docker interpreter availability
+        run: |
+          docker run --rm python:3-slim python3 --version
+          docker run --rm node:20-slim node --version
+          docker run --rm eclipse-temurin:21-jre-jammy java -version
+          docker run --rm php:8-cli php --version
+
+      - name: Dynamic suite (process + docker backends)
+        run: cargo nextest run --features dynamic
+
+  macos:
+    name: dynamic / macos
+    runs-on: macos-latest
+    env:
+      # macOS runners ship without docker; force process backend so the
+      # `Auto` resolver in src/dynamic/sandbox.rs cannot accidentally
+      # pick up a stray Lima/Colima daemon and confuse the matrix.
+      NYX_SANDBOX_BACKEND: process
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: stable
+          cache: true
+
+      - uses: taiki-e/install-action@nextest
+
+      # Runs /bin/echo under an allow-everything profile; a failure here
+      # means sandbox-exec itself is unusable on the runner, which is
+      # reported before any test runs.
+      - name: Smoke-test sandbox-exec availability
+        run: |
+          /usr/bin/sandbox-exec -p '(version 1)(allow default)' /bin/echo ok
+
+      # Unlike the Linux row there is no `|| true` fallback: all three
+      # interpreters are required on this runner.
+      - name: Smoke-test interpreter availability
+        run: |
+          python3 --version
+          node --version
+          ruby --version
+
+      # Phase 29 acceptance literal: "cargo nextest run --features
+      # dynamic is green on macOS without docker (process-only row)."
+      - name: Dynamic suite (macOS, process backend)
+        run: cargo nextest run --features dynamic
--- a/scripts/m7_ship_gate.sh
+++ b/scripts/m7_ship_gate.sh
@ -6,6 +6,7 @@
 #
 # Usage:
 #   scripts/m7_ship_gate.sh [--nyx BIN] [--corpus-dir DIR] [--skip GATE,...]
+#                           [--budget FILE] [--diff FILE]
 #
 # Gates:
 #   1. unsupported-rate   — per-cell (cap × lang) Unsupported% within budget
@ -13,6 +14,11 @@
 #   3. wall-clock         — default scan ≤ 2× static-only on bench suite
 #   4. sandbox-escape     — sandbox escape suite green for all langs
 #   5. repro-stability    — repro artifact regenerates identical verdict ≥ 95%
+#
+# Phase 29 (Track I): Gate 1 consumes per-cell budgets from
+# `tests/eval_corpus/budget.toml` and, when `--diff PREV.json` is
+# supplied, fails on any monotonic-improvement regression vs the
+# previous run.

 set -euo pipefail

@ -23,12 +29,17 @@ CORPUS_DIR="${CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
 SKIP_GATES=""
 GATE_ERRORS=0
 GATE_LOG="${REPO_ROOT}/target/m7_gate.log"
+# Phase 29 (Track I): per-cell budgets + monotonic diff.
+BUDGET_FILE="${BUDGET_FILE:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
+DIFF_FILE="${DIFF_FILE:-}"

 while [[ $# -gt 0 ]]; do
  case "$1" in
    --nyx)         NYX_BIN="$2"; shift 2 ;;
    --corpus-dir)  CORPUS_DIR="$2"; shift 2 ;;
    --skip)        SKIP_GATES="$2"; shift 2 ;;
+    --budget)      BUDGET_FILE="$2"; shift 2 ;;
+    --diff)        DIFF_FILE="$2"; shift 2 ;;
    *)             shift ;;
  esac
 done
@ -45,28 +56,46 @@ mkdir -p "$(dirname "$GATE_LOG")"
 echo "# M7 ship gate — $(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$GATE_LOG"
 info "nyx: $NYX_BIN"
 info "corpus: $CORPUS_DIR"
+info "budget: $BUDGET_FILE"
+info "diff:   ${DIFF_FILE:-<none>}"
 info ""

-# ── Gate 1: Unsupported-rate budget ─────────────────────────────────────────
+# ── Gate 1: Per-cell budget + monotonic-improvement diff ───────────────────
+#
+# Phase 29 (Track I): the single global Unsupported threshold is replaced
+# by per-cell (cap × lang) budgets in tests/eval_corpus/budget.toml.
+# `tests/eval_corpus/run.sh` invokes `tabulate.py` per set and `report.py`
+# at the end with `--budget` (and `--diff` when DIFF_FILE is set), so
+# any per-cell failure (or any regression vs the prior run) propagates
+# back as exit 2.
 if skip unsupported-rate; then
  info "Gate 1 (unsupported-rate): SKIPPED"
 else
-  info "Gate 1: per-cell Unsupported rate within budget..."
+  info "Gate 1: per-cell budget within tolerance + no monotonic regressions..."
  EVAL_RESULTS="${REPO_ROOT}/target/eval_results.json"
  echo "[]" > "$EVAL_RESULTS"

-  # Run eval corpus runner (in-house set always present).
-  if bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \
+  if [[ ! -f "$BUDGET_FILE" ]]; then
+    die "Gate 1: budget file not found at $BUDGET_FILE"
+  else
+    # Run eval corpus runner (in-house set always present).
+    set +e
+    bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \
      --nyx "$NYX_BIN" \
      --sets inhouse \
-      --output "$(dirname "$EVAL_RESULTS")" 2>>"$GATE_LOG"; then
-    # Copy result to our location.
-    cp "$(dirname "$EVAL_RESULTS")/eval_results.json" "$EVAL_RESULTS" 2>/dev/null || true
-    pass "Gate 1: unsupported-rate check passed"
-  else
+      --output "$(dirname "$EVAL_RESULTS")" \
+      --budget "$BUDGET_FILE" \
+      ${DIFF_FILE:+--diff "$DIFF_FILE"} \
+      >>"$GATE_LOG" 2>>"$GATE_LOG"
    RC=$?
-    if [[ $RC -eq 2 ]]; then
-      die "Gate 1: Unsupported rate exceeds budget for one or more (cap, lang) cells"
+    set -e
+    cp "$(dirname "$EVAL_RESULTS")/eval_results.json" "$EVAL_RESULTS" 2>/dev/null || true
+    if [[ $RC -eq 0 ]]; then
+      pass "Gate 1: per-cell budget + diff check passed"
+    elif [[ $RC -eq 2 ]]; then
+      die "Gate 1: per-cell budget exceeded OR monotonic-improvement regression (see $GATE_LOG)"
+    elif [[ $RC -eq 3 ]]; then
+      die "Gate 1: budget/diff configuration is malformed (see $GATE_LOG)"
    else
      info "Gate 1: eval runner returned $RC (corpus may not be downloaded; treating as SKIP)"
    fi
--- a/tests/common/fixture_harness.rs
+++ b/tests/common/fixture_harness.rs
@ -48,6 +48,131 @@ pub enum CopyStrategy {
    RustEntry,
 }

+/// Phase 29 (Track I): host-environment prerequisite a fixture needs in
+/// order to run. The harness consults the list before staging the
+/// fixture; any unsatisfied prerequisite triggers a structured skip
+/// rather than a panic, so non-applicable matrix rows (process-only
+/// macOS, dockerless CI, missing static libc) still see green ticks.
+#[derive(Debug, Clone, PartialEq, Eq)]
+#[allow(dead_code)]
+pub enum Prerequisite {
+    /// A binary must resolve on `PATH` and respond to `--version` with
+    /// exit code 0 (e.g. `python3`, `node`, `cargo`).
+    ///
+    /// NOTE(review): the probe only ever passes `--version`, so a tool
+    /// without that flag is reported as missing even when installed —
+    /// `go`, for example, exposes `go version` instead.  Confirm no
+    /// fixture gates on such a tool through this variant.
+    CommandAvailable(&'static str),
+    /// A specific env var must be set (used to gate feature-flagged
+    /// suites — e.g. `NYX_ENABLE_FLAKY_FIXTURES=1`).  Only presence is
+    /// checked; the value itself is not inspected.
+    EnvVar(&'static str),
+    /// The docker daemon must be reachable.  Equivalent to
+    /// `docker info` returning exit 0.
+    DockerAvailable,
+    /// A static C library must be linkable.  The payload is the bare
+    /// library name handed to the linker as `-l<name>` (e.g. `"c"`
+    /// for `libc.a`), not the archive file name.
+    /// Used by the Phase-17/20 hardening probe fixtures.
+    StaticLib(&'static str),
+}
+
+/// Phase 29 (Track I): why the harness skipped a fixture.  Carried by
+/// every skip so callers can distinguish "host did not have python3" from
+/// "host has docker but daemon refused" from "intentional env-var gate".
+#[derive(Debug, Clone, PartialEq, Eq)]
+#[allow(dead_code)]
+pub enum SkipReason {
+    /// A `Prerequisite::CommandAvailable` probe failed; carries the
+    /// command name that was probed.
+    MissingCommand(&'static str),
+    /// A `Prerequisite::EnvVar` gate was not satisfied; carries the
+    /// variable name.
+    MissingEnvVar(&'static str),
+    /// `docker info` could not be run or exited non-zero.
+    DockerUnavailable,
+    /// The static-link probe for a `Prerequisite::StaticLib` failed;
+    /// carries the library name that was passed to the linker.
+    MissingStaticLib(&'static str),
+}
+
+/// Human-readable rendering of a skip, as printed in the harness's
+/// `SKIP …` lines.
+impl std::fmt::Display for SkipReason {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Each arm hands its pre-built arguments straight to the
+        // formatter; the emitted text is one fixed phrase per variant,
+        // followed by the offending name where the variant carries one.
+        match self {
+            Self::MissingCommand(c) => {
+                f.write_fmt(format_args!("missing command on PATH: {c}"))
+            }
+            Self::MissingEnvVar(v) => {
+                f.write_fmt(format_args!("env var not set: {v}"))
+            }
+            Self::DockerUnavailable => {
+                f.write_fmt(format_args!("docker daemon unavailable"))
+            }
+            Self::MissingStaticLib(l) => {
+                f.write_fmt(format_args!("static lib not linkable: {l}"))
+            }
+        }
+    }
+}
+
+/// Checks `reqs` in order and reports the first one the host fails.
+///
+/// Returns `Ok(())` when every requirement holds (an empty slice is
+/// trivially satisfied), otherwise `Err` with the [`SkipReason`] for the
+/// first unmet entry; entries after it are not probed.  Exposed for
+/// tests that want to gate their own per-shape helpers without going
+/// through `FixtureSpec`.
+#[allow(dead_code)]
+pub fn check_prerequisites(reqs: &[Prerequisite]) -> Result<(), SkipReason> {
+    for req in reqs {
+        match req {
+            Prerequisite::CommandAvailable(cmd) => {
+                if !command_succeeds(cmd, &["--version"]) {
+                    return Err(SkipReason::MissingCommand(cmd));
+                }
+            }
+            Prerequisite::EnvVar(var) => {
+                // `var_os` rather than `var`: a variable that is set to
+                // a non-Unicode value is still set and must not skip.
+                if std::env::var_os(var).is_none() {
+                    return Err(SkipReason::MissingEnvVar(var));
+                }
+            }
+            Prerequisite::DockerAvailable => {
+                if !command_succeeds("docker", &["info"]) {
+                    return Err(SkipReason::DockerUnavailable);
+                }
+            }
+            Prerequisite::StaticLib(lib) => {
+                if !static_lib_links(lib) {
+                    return Err(SkipReason::MissingStaticLib(lib));
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+/// True iff `program args…` can be spawned and exits with status 0.
+/// A spawn failure (binary not on `PATH`) counts as "not available".
+#[allow(dead_code)]
+fn command_succeeds(program: &str, args: &[&str]) -> bool {
+    std::process::Command::new(program)
+        .args(args)
+        .output()
+        .map(|out| out.status.success())
+        .unwrap_or(false)
+}
+
+/// True iff `cc -static -l<lib>` links a trivial `main`.  Slow but
+/// reliable; only called by the small Phase-17 hardening suite.
+///
+/// Source and output both live in a private temp directory that is
+/// removed when `scratch` drops, so no path is ever released and then
+/// reused, and nothing is left behind on any exit path.  Any failure to
+/// set the probe up is reported as "not linkable".
+#[allow(dead_code)]
+fn static_lib_links(lib: &str) -> bool {
+    let scratch = match tempfile::Builder::new().prefix("nyx-prereq-").tempdir() {
+        Ok(dir) => dir,
+        Err(_) => return false,
+    };
+    let source = scratch.path().join("probe.c");
+    let binary = scratch.path().join("probe");
+    if std::fs::write(&source, "int main(void) { return 0; }\n").is_err() {
+        return false;
+    }
+    // Paths are passed as OS strings so a non-UTF-8 temp dir cannot
+    // silently turn into an empty argument.
+    std::process::Command::new("cc")
+        .args(["-x", "c", "-static"])
+        .arg(&source)
+        .arg("-o")
+        .arg(&binary)
+        .arg(format!("-l{lib}"))
+        .output()
+        .map(|out| out.status.success())
+        .unwrap_or(false)
+}
+
 /// Per-fixture specification.
 pub struct FixtureSpec<'a> {
    /// Subdirectory under `tests/dynamic_fixtures/` (e.g. `"python"`, `"rust"`).
@ -67,6 +192,11 @@ pub struct FixtureSpec<'a> {
    pub confidence: Confidence,
    /// File-layout strategy for the temp-dir copy.
    pub copy: CopyStrategy,
+    /// Phase 29 (Track I): host-environment prerequisites. Empty means
+    /// "always runs"; otherwise the harness checks each entry before
+    /// staging the fixture and skips with a structured [`SkipReason`]
+    /// when any prerequisite is unmet.
+    pub requires: Vec<Prerequisite>,
 }

 /// Trimmed verdict shape persisted in the `.golden.json` file.
@ -100,6 +230,14 @@ impl From<&VerifyResult> for GoldenVerdict {
 /// stored golden or — when `NYX_UPDATE_GOLDENS=1` — overwrite the golden
 /// with the current verdict.
 pub fn run_fixture_and_compare_to_golden(spec: &FixtureSpec<'_>) {
+    if let Err(reason) = check_prerequisites(&spec.requires) {
+        eprintln!(
+            "SKIP {}/{}: prerequisite unmet — {reason}",
+            spec.lang_dir, spec.fixture
+        );
+        return;
+    }
+
    let _guard = FIXTURE_LOCK.lock().unwrap_or_else(|e| e.into_inner());

    let fixture_root = fixture_dir(spec.lang_dir);
--- a/tests/eval_corpus/budget.toml
+++ b/tests/eval_corpus/budget.toml
@ -0,0 +1,210 @@
+# Per-cell (cap × lang) budgets for the dynamic-verification eval corpus.
+#
+# Phase 29 (Track I): replaces the single global Unsupported-rate gate in
+# tests/eval_corpus/report.py with per-cell targets. Each cell records the
+# largest tolerated rate today plus a deadline date for the next ratchet.
+#
+# Schema:
+#
+#   [default]
+#   unsupported_rate    = 0.80   # max(Unsupported / total) per cell
+#   false_confirmed_rate = 0.02  # max(wrong / Confirmed) per cell
+#   repro_stability     = 0.95   # min(stable / Confirmed) per cell
+#   ratchet_deadline    = "2026-08-01"
+#
+#   [[cell]]
+#   cap                 = "sqli"
+#   lang                = "python"
+#   unsupported_rate    = 0.50
+#   false_confirmed_rate = 0.02
+#   repro_stability     = 0.97
+#   ratchet_deadline    = "2026-07-15"
+#
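+# `cap` matches tabulate.py's _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
+# `lang` matches the ext_map values (`python`, `javascript`, …).
+# Either field may be the wildcard `"*"`.  A cell with no exact
+# (cap, lang) entry uses the first wildcard entry that exists, tried in
+# the order (cap, "*"), ("*", lang), ("*", "*"); whichever entry applies
+# is layered over [default].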
+
+[default]
+# Inherited by any cell not overridden below.  Aligned with the legacy
+# Gate-1 / Gate-2 / Gate-5 thresholds in scripts/m7_ship_gate.sh.
+unsupported_rate     = 0.80
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-08-01"
+
+# Python verticals (Phase 12 — most mature; tightest budgets).
+
+[[cell]]
+cap = "sqli"
+lang = "python"
+unsupported_rate     = 0.40
+false_confirmed_rate = 0.02
+repro_stability      = 0.97
+ratchet_deadline     = "2026-07-15"
+
+[[cell]]
+cap = "cmdi"
+lang = "python"
+unsupported_rate     = 0.40
+false_confirmed_rate = 0.02
+repro_stability      = 0.97
+ratchet_deadline     = "2026-07-15"
+
+[[cell]]
+cap = "path_traversal"
+lang = "python"
+unsupported_rate     = 0.50
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-07-15"
+
+[[cell]]
+cap = "ssrf"
+lang = "python"
+unsupported_rate     = 0.50
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-07-15"
+
+[[cell]]
+cap = "deserialize"
+lang = "python"
+unsupported_rate     = 0.60
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-08-01"
+
+# JavaScript / TypeScript (Phase 13 — second-most-mature).
+
+[[cell]]
+cap = "sqli"
+lang = "javascript"
+unsupported_rate     = 0.55
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-08-01"
+
+[[cell]]
+cap = "cmdi"
+lang = "javascript"
+unsupported_rate     = 0.55
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-08-01"
+
+[[cell]]
+cap = "ssrf"
+lang = "javascript"
+unsupported_rate     = 0.60
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-08-01"
+
+[[cell]]
+cap = "xss"
+lang = "javascript"
+unsupported_rate     = 0.70
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-08-15"
+
+[[cell]]
+cap = "sqli"
+lang = "typescript"
+unsupported_rate     = 0.60
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-08-15"
+
+# Java (Phase 14).
+
+[[cell]]
+cap = "sqli"
+lang = "java"
+unsupported_rate     = 0.65
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-08-15"
+
+[[cell]]
+cap = "deserialize"
+lang = "java"
+unsupported_rate     = 0.70
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-09-01"
+
+# Phase 15 / 16 verticals (Go, PHP, Ruby, Rust, C, C++) — newer; broader
+# tolerance until their probe-shim splicing follow-ups land.
+
+[[cell]]
+cap = "cmdi"
+lang = "go"
+unsupported_rate     = 0.75
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-09-01"
+
+[[cell]]
+cap = "sqli"
+lang = "go"
+unsupported_rate     = 0.75
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-09-01"
+
+[[cell]]
+cap = "cmdi"
+lang = "php"
+unsupported_rate     = 0.75
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-09-01"
+
+[[cell]]
+cap = "deserialize"
+lang = "php"
+unsupported_rate     = 0.75
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-09-01"
+
+[[cell]]
+cap = "cmdi"
+lang = "ruby"
+unsupported_rate     = 0.75
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-09-01"
+
+[[cell]]
+cap = "sqli"
+lang = "rust"
+unsupported_rate     = 0.80
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-09-15"
+
+[[cell]]
+cap = "fmt_string"
+lang = "c"
+unsupported_rate     = 0.85
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-09-15"
+
+[[cell]]
+cap = "memory"
+lang = "c"
+unsupported_rate     = 0.90
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-10-01"
+
+[[cell]]
+cap = "memory"
+lang = "cpp"
+unsupported_rate     = 0.90
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-10-01"
--- a/tests/eval_corpus/report.py
+++ b/tests/eval_corpus/report.py
@ -2,6 +2,11 @@
 """
 Aggregate eval results across all corpus sets and emit a summary table.
 Used by run.sh after all corpus sets have been tabulated.
+
+Phase 29 (Track I) extensions:
+  --budget tests/eval_corpus/budget.toml   per-cell budget enforcement
+  --diff   previous.json                   monotonic-improvement diff;
+                                           CI fails on any regression.
 """

 import argparse
@ -9,10 +14,105 @@ import json
 import sys
 from collections import defaultdict

+try:
+    import tomllib  # Python 3.11+
+except ModuleNotFoundError:  # pragma: no cover — older interpreters only
+    import tomli as tomllib  # type: ignore[no-redef]
+
+
+def load_budget(path: str) -> dict:
+    try:
+        with open(path, "rb") as f:
+            raw = tomllib.load(f)
+    except FileNotFoundError:
+        print(f"ERROR  budget file not found: {path}", file=sys.stderr)
+        sys.exit(3)
+    except tomllib.TOMLDecodeError as e:
+        print(f"ERROR  budget file malformed: {path}: {e}", file=sys.stderr)
+        sys.exit(3)
+    default = raw.get("default", {}) or {}
+    cells = {}
+    for row in raw.get("cell", []) or []:
+        cap = row.get("cap")
+        lang = row.get("lang")
+        if not cap or not lang:
+            print(f"ERROR  budget cell missing cap/lang: {row!r}", file=sys.stderr)
+            sys.exit(3)
+        cells[(cap, lang)] = row
+    return {"default": default, "cells": cells}
+
+
+def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
+    merged = dict(budget.get("default", {}) or {})
+    cell = budget.get("cells", {}).get((cap, lang))
+    if cell:
+        merged.update({k: v for k, v in cell.items() if k not in ("cap", "lang")})
+    if not cell:
+        wildcard = (
+            budget.get("cells", {}).get((cap, "*"))
+            or budget.get("cells", {}).get(("*", lang))
+            or budget.get("cells", {}).get(("*", "*"))
+        )
+        if wildcard:
+            merged.update(
+                {k: v for k, v in wildcard.items() if k not in ("cap", "lang")}
+            )
+    return merged
+
+
+def load_previous_agg(path: str) -> dict:
+    """Aggregate a previous results file the same way main() does."""
+    try:
+        with open(path) as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        print(f"ERROR  diff file not found: {path}", file=sys.stderr)
+        sys.exit(3)
+    except json.JSONDecodeError as e:
+        print(f"ERROR  diff file malformed: {path}: {e}", file=sys.stderr)
+        sys.exit(3)
+    agg: dict[tuple[str, str], dict] = defaultdict(
+        lambda: {
+            "tp": 0,
+            "fp": 0,
+            "fn": 0,
+            "unsupported": 0,
+            "confirmed": 0,
+            "wrong_confirmed": 0,
+            "stable_replays": 0,
+            "total": 0,
+        }
+    )
+    for r in data:
+        for c in r.get("cells", []):
+            k = (c["cap"], c["lang"])
+            for field in (
+                "tp",
+                "fp",
+                "fn",
+                "unsupported",
+                "confirmed",
+                "wrong_confirmed",
+                "stable_replays",
+                "total",
+            ):
+                agg[k][field] += c.get(field, 0)
+    return agg
+

 def main() -> int:
    p = argparse.ArgumentParser()
    p.add_argument("--results", required=True)
+    p.add_argument(
+        "--budget",
+        default="",
+        help="path to budget.toml (per-(cap,lang) thresholds)",
+    )
+    p.add_argument(
+        "--diff",
+        default="",
+        help="path to a previous results.json; fail on monotonic-improvement regression",
+    )
    args = p.parse_args()

    with open(args.results) as f:
@ -24,12 +124,30 @@ def main() -> int:

    # Aggregate across sets.
    agg: dict[tuple[str, str], dict] = defaultdict(
-        lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
+        lambda: {
+            "tp": 0,
+            "fp": 0,
+            "fn": 0,
+            "unsupported": 0,
+            "confirmed": 0,
+            "wrong_confirmed": 0,
+            "stable_replays": 0,
+            "total": 0,
+        }
    )
    for r in results:
        for c in r.get("cells", []):
            k = (c["cap"], c["lang"])
-            for field in ("tp", "fp", "fn", "unsupported", "total"):
+            for field in (
+                "tp",
+                "fp",
+                "fn",
+                "unsupported",
+                "confirmed",
+                "wrong_confirmed",
+                "stable_replays",
+                "total",
+            ):
                agg[k][field] += c.get(field, 0)

    print("\n=== Aggregated eval corpus report ===")
@ -46,18 +164,114 @@ def main() -> int:
            f"{unsup*100:>6.1f}%"
        )

-    # Gate check: per-cap Unsupported rate <= 80%
    gate_failed = False
-    print("\n=== Gate checks ===")
-    UNSUPPORTED_BUDGET = 0.80
-    for k, v in sorted(agg.items()):
-        unsup = v["unsupported"] / max(v["total"], 1)
-        if unsup > UNSUPPORTED_BUDGET:
-            print(f"  FAIL  {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}% > {UNSUPPORTED_BUDGET*100:.0f}% budget")
-            gate_failed = True

-    if not gate_failed:
-        print("  All gate thresholds met.")
+    # ── Phase 29: per-cell budget enforcement ────────────────────────────
+    if args.budget:
+        budget = load_budget(args.budget)
+        print(f"\n=== Per-cell budget ({args.budget}) ===")
+        cell_fails: list[str] = []
+        for k, v in sorted(agg.items()):
+            b = budget_for_cell(budget, k[0], k[1])
+            if not b:
+                continue
+            max_unsup = b.get("unsupported_rate")
+            max_false = b.get("false_confirmed_rate")
+            min_stable = b.get("repro_stability")
+
+            if isinstance(max_unsup, (int, float)) and v["total"] > 0:
+                rate = v["unsupported"] / v["total"]
+                if rate > max_unsup:
+                    cell_fails.append(
+                        f"  FAIL  {k[0]}/{k[1]}: Unsupported {rate*100:.1f}%"
+                        f" > budget {max_unsup*100:.1f}%"
+                    )
+            if isinstance(max_false, (int, float)) and v["confirmed"] > 0:
+                rate = v["wrong_confirmed"] / v["confirmed"]
+                if rate > max_false:
+                    cell_fails.append(
+                        f"  FAIL  {k[0]}/{k[1]}: false-Confirmed {rate*100:.1f}%"
+                        f" > budget {max_false*100:.1f}%"
+                    )
+            if (
+                isinstance(min_stable, (int, float))
+                and v["confirmed"] > 0
+                and v.get("stable_replays", 0) > 0
+            ):
+                rate = v["stable_replays"] / v["confirmed"]
+                if rate < min_stable:
+                    cell_fails.append(
+                        f"  FAIL  {k[0]}/{k[1]}: repro stability {rate*100:.1f}%"
+                        f" < budget {min_stable*100:.1f}%"
+                    )
+        if cell_fails:
+            for line in cell_fails:
+                print(line)
+            gate_failed = True
+        else:
+            print("  All per-cell budgets met.")
+    else:
+        # Legacy fallback: per-cap Unsupported rate <= 80%.
+        print("\n=== Gate checks ===")
+        UNSUPPORTED_BUDGET = 0.80
+        cell_fails: list[str] = []
+        for k, v in sorted(agg.items()):
+            unsup = v["unsupported"] / max(v["total"], 1)
+            if unsup > UNSUPPORTED_BUDGET:
+                cell_fails.append(
+                    f"  FAIL  {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}%"
+                    f" > {UNSUPPORTED_BUDGET*100:.0f}% budget"
+                )
+        if cell_fails:
+            for line in cell_fails:
+                print(line)
+            gate_failed = True
+        else:
+            print("  All gate thresholds met.")
+
+    # ── Phase 29: monotonic-improvement diff ─────────────────────────────
+    if args.diff:
+        prev = load_previous_agg(args.diff)
+        print(f"\n=== Monotonic-improvement diff vs {args.diff} ===")
+        diff_fails: list[str] = []
+        EPS = 0.005
+        for k, v in sorted(agg.items()):
+            old = prev.get(k)
+            if not old:
+                continue
+            old_unsup = old["unsupported"] / max(old["total"], 1)
+            new_unsup = v["unsupported"] / max(v["total"], 1)
+            if new_unsup > old_unsup + EPS:
+                diff_fails.append(
+                    f"  REGRESSION  {k[0]}/{k[1]}: Unsupported"
+                    f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
+                )
+            old_conf = old.get("confirmed", 0)
+            new_conf = v.get("confirmed", 0)
+            old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None
+            new_false = (v.get("wrong_confirmed", 0) / new_conf) if new_conf else None
+            if old_false is not None and new_false is not None and new_false > old_false + EPS:
+                diff_fails.append(
+                    f"  REGRESSION  {k[0]}/{k[1]}: false-Confirmed"
+                    f" {old_false*100:.1f}% → {new_false*100:.1f}%"
+                )
+            old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None
+            new_stable = (v.get("stable_replays", 0) / new_conf) if new_conf else None
+            if (
+                old_stable is not None
+                and new_stable is not None
+                and new_stable < old_stable - EPS
+            ):
+                diff_fails.append(
+                    f"  REGRESSION  {k[0]}/{k[1]}: repro stability"
+                    f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
+                )
+        if diff_fails:
+            for line in diff_fails:
+                print(line)
+            gate_failed = True
+        else:
+            print("  No regressions vs previous run.")

    return 2 if gate_failed else 0

--- a/tests/eval_corpus/run.sh
+++ b/tests/eval_corpus/run.sh
@ -29,12 +29,17 @@ OUTPUT_DIR=""
 NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
 CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
 SETS="owasp,sard,inhouse"
+# Phase 29 (Track I): per-cell budgets + monotonic-improvement diff.
+BUDGET_FILE=""
+DIFF_FILE=""

 while [[ $# -gt 0 ]]; do
  case "$1" in
    --output) OUTPUT_DIR="$2"; shift 2 ;;
    --nyx)    NYX_BIN="$2"; shift 2 ;;
    --sets)   SETS="$2"; shift 2 ;;
+    --budget) BUDGET_FILE="$2"; shift 2 ;;
+    --diff)   DIFF_FILE="$2"; shift 2 ;;
    *)        shift ;;
  esac
 done
@ -83,6 +88,8 @@ if [[ "$SETS" == *owasp* ]]; then
        --scan /tmp/nyx_owasp.json \
        --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
        --append "$RESULTS_JSON" \
+        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
        || info "  tabulate.py failed; ground truth file may be absent"
    fi
  fi
@ -111,6 +118,8 @@ if [[ "$SETS" == *sard* ]]; then
        --scan /tmp/nyx_sard.json \
        --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
        --append "$RESULTS_JSON" \
+        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
        || info "  tabulate.py failed; ground truth file may be absent"
    fi
  fi
@ -140,6 +149,8 @@ if [[ "$SETS" == *inhouse* ]]; then
      --scan "/tmp/nyx_${label}.json" \
      --inhouse \
      --append "$RESULTS_JSON" \
+      ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+      ${DIFF_FILE:+--diff "$DIFF_FILE"} \
      || info "  tabulate.py failed on $label"
  done
 fi
@ -156,12 +167,20 @@ if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
 fi

 set +e
-python3 "${SCRIPT_DIR}/report.py" --results "$RESULTS_JSON"
+python3 "${SCRIPT_DIR}/report.py" \
+  --results "$RESULTS_JSON" \
+  ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+  ${DIFF_FILE:+--diff "$DIFF_FILE"}
 REPORT_RC=$?
 set -e
-# Propagate gate-fail (exit 2). Treat other non-zero as setup error (exit 1).
+# Propagate gate-fail (exit 2) and malformed-config (exit 3) so the
+# m7_ship_gate.sh Gate-1 dispatch can tell them apart.  Treat other
+# non-zero as setup error (exit 1).
 if [[ $REPORT_RC -eq 2 ]]; then
  exit 2
+elif [[ $REPORT_RC -eq 3 ]]; then
+  info "report.py: budget/diff configuration malformed; see $RESULTS_JSON"
+  exit 3
 elif [[ $REPORT_RC -ne 0 ]]; then
  info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
  exit 1
--- a/tests/eval_corpus/tabulate.py
+++ b/tests/eval_corpus/tabulate.py
@ -9,6 +9,17 @@ For in-house sets (--inhouse): counts findings by cap x language; reports
 Unsupported rate only (no ground truth required).

 Output: appends a result record to --append FILE.
+
+Phase 29 (Track I) extensions:
+  --budget tests/eval_corpus/budget.toml   enforce per-cell budget thresholds
+  --diff   previous.json                   compare against prior result file,
+                                           fail on monotonic-improvement
+                                           regression
+
+Exit codes:
+  0  all rows pass.
+  2  one or more per-cell budgets exceeded OR a diff regression was found.
+  3  malformed budget / diff input (callers must fix configuration).
 """

 import argparse
@ -17,6 +28,11 @@ import sys
 from collections import defaultdict
 from pathlib import Path

+try:
+    import tomllib  # Python 3.11+
+except ModuleNotFoundError:  # pragma: no cover — older interpreters only
+    import tomli as tomllib  # type: ignore[no-redef]
+
 LINE_TOLERANCE = 5

 # Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
@ -97,6 +113,203 @@ def lang_of(finding: dict) -> str:
    return "unknown"


+# ── Budget loading ──────────────────────────────────────────────────────────
+
+
+def load_budget(path: str) -> dict:
+    """Parse a budget.toml file into default and per-cell threshold tables.
+
+    Returns a dict::
+
+        {
+            "default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02,
+                        "repro_stability": 0.95, "ratchet_deadline": "..."},
+            "cells": {(cap, lang): {...overrides...}, ...},
+        }
+
+    ``default`` is read from the ``[default]`` table and ``cells`` from the
+    ``[[cell]]`` array of tables; every cell row must carry non-empty string
+    ``cap`` and ``lang`` keys.
+
+    Raises SystemExit(3) when the file is missing, unreadable, not valid
+    TOML, or structurally wrong (``default`` is not a table, ``cell`` is not
+    an array of tables, or a row lacks a usable cap/lang).
+    """
+
+    def _reject(message: str) -> None:
+        # Exit code 3 is the "fix your configuration" signal documented in
+        # the module docstring; never let a bad file surface as a traceback.
+        print(f"ERROR  {message}", file=sys.stderr)
+        sys.exit(3)
+
+    try:
+        with open(path, "rb") as f:
+            raw = tomllib.load(f)
+    except FileNotFoundError:
+        _reject(f"budget file not found: {path}")
+    except (tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
+        _reject(f"budget file malformed: {path}: {e}")
+    except OSError as e:
+        # Directory passed as the path, permission denied, and similar.
+        _reject(f"budget file unreadable: {path}: {e}")
+
+    default = raw.get("default", {}) or {}
+    if not isinstance(default, dict):
+        _reject(f"budget [default] must be a table: {default!r}")
+
+    rows = raw.get("cell", []) or []
+    if not isinstance(rows, list):
+        # `[cell]` (a single table) instead of `[[cell]]` (array of tables).
+        _reject(f"budget 'cell' must be an array of tables ([[cell]]): {rows!r}")
+
+    cells = {}
+    for row in rows:
+        if not isinstance(row, dict):
+            _reject(f"budget cell must be a table: {row!r}")
+        cap = row.get("cap")
+        lang = row.get("lang")
+        if not (isinstance(cap, str) and cap and isinstance(lang, str) and lang):
+            _reject(f"budget cell missing cap/lang: {row!r}")
+        cells[(cap, lang)] = row
+
+    return {"default": default, "cells": cells}
+
+
+def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
+    """Resolve the effective thresholds for one (cap, lang) cell.
+
+    Starts from a copy of ``[default]`` and layers a single override row on
+    top: the exact ``(cap, lang)`` row when it exists, otherwise the first
+    non-empty wildcard row among ``(cap, "*")``, ``("*", lang)`` and
+    ``("*", "*")``. The ``cap`` / ``lang`` selector keys of the override are
+    never copied into the result.
+    """
+    table = budget.get("cells", {})
+    override = table.get((cap, lang))
+    if not override:
+        # No exact row: try the wildcards from most to least specific.
+        for selector in ((cap, "*"), ("*", lang), ("*", "*")):
+            override = table.get(selector)
+            if override:
+                break
+
+    thresholds = dict(budget.get("default", {}) or {})
+    if override:
+        for field, value in override.items():
+            if field not in ("cap", "lang"):
+                thresholds[field] = value
+    return thresholds
+
+
+def enforce_budget(cells: list, budget: dict) -> list:
+    """Check every measured cell against its merged budget row.
+
+    Returns human-readable failure strings, one per breached threshold, in
+    the order Unsupported / false-Confirmed / repro-stability for each cell.
+    A rate whose denominator is zero (no findings, or no Confirmed
+    findings) counts as "no data" and is skipped rather than failed.
+    """
+
+    problems = []
+    for cell in cells:
+        limits = budget_for_cell(budget, cell["cap"], cell["lang"])
+        if not limits:
+            # Neither [default] nor any override applies to this cell.
+            continue
+        cap = cell["cap"]
+        lang = cell["lang"]
+
+        unsup_ceiling = limits.get("unsupported_rate")
+        false_ceiling = limits.get("false_confirmed_rate")
+        stable_floor = limits.get("repro_stability")
+
+        # Unsupported share of all findings in the cell.
+        if isinstance(unsup_ceiling, (int, float)) and cell.get("total", 0) > 0:
+            if cell["unsupported_rate"] > unsup_ceiling:
+                problems.append(
+                    f"  FAIL  {cap}/{lang}: Unsupported {cell['unsupported_rate']*100:.1f}%"
+                    f" > budget {unsup_ceiling*100:.1f}%"
+                )
+
+        # Share of Confirmed findings that were stamped as wrong.
+        if isinstance(false_ceiling, (int, float)) and cell.get("confirmed", 0) > 0:
+            wrong_share = cell.get("wrong_confirmed", 0) / cell["confirmed"]
+            if wrong_share > false_ceiling:
+                problems.append(
+                    f"  FAIL  {cap}/{lang}: false-Confirmed {wrong_share*100:.1f}%"
+                    f" > budget {false_ceiling*100:.1f}%"
+                )
+
+        # Repro stability needs at least one stable replay on record: a
+        # count of zero cannot be told apart from "stability was never
+        # measured for this row", and enforcing it would trip the gate on
+        # every run that simply did not stamp `replay_stable`.
+        if not isinstance(stable_floor, (int, float)):
+            continue
+        if cell.get("confirmed", 0) > 0 and cell.get("stable_replays", 0) > 0:
+            stable_share = cell["stable_replays"] / cell["confirmed"]
+            if stable_share < stable_floor:
+                problems.append(
+                    f"  FAIL  {cap}/{lang}: repro stability {stable_share*100:.1f}%"
+                    f" < budget {stable_floor*100:.1f}%"
+                )
+    return problems
+
+
+# ── Diff loading ────────────────────────────────────────────────────────────
+
+
+def load_previous_cells(path: str, label: str) -> dict:
+    """Index a previous results file by (cap, lang) → cell.
+
+    The previous file is the same shape as `--append`'s output: a record
+    (or list of records) carrying a `label` and a `cells` list. We pick the
+    record whose `label` matches the current run; if no exact match, fall
+    back to the first record. An empty file/record yields an empty index.
+
+    Raises SystemExit(3) when the file is missing, unreadable, not valid
+    JSON, or structurally wrong (a record that is not an object, or a cell
+    without `cap` / `lang`).
+    """
+
+    def _reject(message: str) -> None:
+        # Exit code 3 marks a configuration problem the caller must fix.
+        print(f"ERROR  {message}", file=sys.stderr)
+        sys.exit(3)
+
+    try:
+        with open(path) as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        _reject(f"diff file not found: {path}")
+    except (json.JSONDecodeError, UnicodeDecodeError) as e:
+        _reject(f"diff file malformed: {path}: {e}")
+    except OSError as e:
+        # Directory passed as the path, permission denied, and similar.
+        _reject(f"diff file unreadable: {path}: {e}")
+
+    records = data if isinstance(data, list) else [data]
+    if not all(isinstance(r, dict) for r in records):
+        _reject(f"diff file malformed: {path}: records must be JSON objects")
+
+    chosen = next((r for r in records if r.get("label") == label), None)
+    if chosen is None and records:
+        chosen = records[0]
+    if not chosen:
+        return {}
+
+    index = {}
+    for c in chosen.get("cells", []) or []:
+        if not isinstance(c, dict) or "cap" not in c or "lang" not in c:
+            _reject(f"diff file malformed: {path}: cell missing cap/lang: {c!r}")
+        index[(c["cap"], c["lang"])] = c
+    return index
+
+
+def diff_regressions(cells: list, prev: dict) -> list:
+    """Report cells that got worse relative to a previous run.
+
+    `prev` maps (cap, lang) to the earlier cell record. For each current
+    cell that also exists in `prev`, three rules are checked in order:
+
+      * Unsupported% must not go up.
+      * False-Confirmed% must not go up.
+      * Repro-stability% must not go down.
+
+    The two per-Confirmed rates are only compared when both runs have at
+    least one Confirmed finding. Cells with no previous record are new and
+    skipped. Changes within 0.5 percentage points are ignored as noise.
+    Returns the list of failure strings (empty when nothing regressed).
+    """
+    tolerance = 0.005
+
+    def _per_confirmed(cell: dict, field: str):
+        # Share of Confirmed findings counted under `field`; None when the
+        # cell has no Confirmed findings to divide by.
+        confirmed = cell.get("confirmed", 0)
+        if not confirmed:
+            return None
+        return cell.get(field, 0) / confirmed
+
+    regressions = []
+    for current in cells:
+        cap, lang = current["cap"], current["lang"]
+        baseline = prev.get((cap, lang))
+        if not baseline:
+            continue
+
+        # Unsupported (lower is better).
+        before = baseline.get("unsupported_rate", 0.0)
+        after = current.get("unsupported_rate", 0.0)
+        if after > before + tolerance:
+            regressions.append(
+                f"  REGRESSION  {cap}/{lang}: Unsupported"
+                f" {before*100:.1f}% → {after*100:.1f}%"
+            )
+
+        # False-Confirmed (lower is better).
+        before = _per_confirmed(baseline, "wrong_confirmed")
+        after = _per_confirmed(current, "wrong_confirmed")
+        if before is not None and after is not None and after > before + tolerance:
+            regressions.append(
+                f"  REGRESSION  {cap}/{lang}: false-Confirmed"
+                f" {before*100:.1f}% → {after*100:.1f}%"
+            )
+
+        # Repro stability (higher is better).
+        before = _per_confirmed(baseline, "stable_replays")
+        after = _per_confirmed(current, "stable_replays")
+        if before is not None and after is not None and after < before - tolerance:
+            regressions.append(
+                f"  REGRESSION  {cap}/{lang}: repro stability"
+                f" {before*100:.1f}% → {after*100:.1f}%"
+            )
+    return regressions
+
+
 def main() -> int:
    p = argparse.ArgumentParser()
    p.add_argument("--label", required=True)
@ -104,14 +317,34 @@ def main() -> int:
    p.add_argument("--ground-truth", default="", help="ground truth JSON")
    p.add_argument("--inhouse", action="store_true")
    p.add_argument("--append", required=True, help="results accumulator JSON")
+    p.add_argument(
+        "--budget",
+        default="",
+        help="path to budget.toml (per-(cap,lang) thresholds)",
+    )
+    p.add_argument(
+        "--diff",
+        default="",
+        help="path to a previous results JSON; fail on monotonic-improvement regression",
+    )
    args = p.parse_args()

    scan_data = load_json(args.scan)
    findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])

-    # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported}}
+    # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
+    # wrong_confirmed, stable_replays, total}}
    cells: dict[tuple[str, str], dict] = defaultdict(
-        lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
+        lambda: {
+            "tp": 0,
+            "fp": 0,
+            "fn": 0,
+            "unsupported": 0,
+            "confirmed": 0,
+            "wrong_confirmed": 0,
+            "stable_replays": 0,
+            "total": 0,
+        }
    )

    for f in findings:
@ -121,8 +354,19 @@ def main() -> int:
        ev = f.get("evidence", {}) or {}
        dv = ev.get("dynamic_verdict") if ev else None
        cells[key]["total"] += 1
-        if dv and dv.get("status") == "Unsupported":
-            cells[key]["unsupported"] += 1
+        if dv:
+            status = dv.get("status")
+            if status == "Unsupported":
+                cells[key]["unsupported"] += 1
+            elif status == "Confirmed":
+                cells[key]["confirmed"] += 1
+                # Repro-stability and false-Confirmed counts are optional
+                # fields tabulate.py reads off the verdict when callers
+                # (m7_ship_gate.sh / corpus_promote.yml) have stamped them.
+                if dv.get("wrong") is True:
+                    cells[key]["wrong_confirmed"] += 1
+                if dv.get("replay_stable") is True:
+                    cells[key]["stable_replays"] += 1

    if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
        gt = load_json(args.ground_truth)
@ -201,7 +445,34 @@ def main() -> int:
            f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
            f"{c['unsupported_rate']*100:>6.1f}%"
        )
-    return 0
+
+    exit_rc = 0
+
+    # ── Phase 29: per-cell budget enforcement ─────────────────────────────
+    if args.budget:
+        budget = load_budget(args.budget)
+        failures = enforce_budget(result["cells"], budget)
+        if failures:
+            print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
+            for line in failures:
+                print(line)
+            exit_rc = 2
+        else:
+            print(f"\nPer-cell budget ({args.budget}): OK")
+
+    # ── Phase 29: diff against previous run ───────────────────────────────
+    if args.diff:
+        prev = load_previous_cells(args.diff, args.label)
+        failures = diff_regressions(result["cells"], prev)
+        if failures:
+            print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
+            for line in failures:
+                print(line)
+            exit_rc = 2
+        else:
+            print(f"\nDiff vs {args.diff}: no regressions")
+
+    return exit_rc


 if __name__ == "__main__":
--- a/tests/eval_corpus/test_tabulate_regression.py
+++ b/tests/eval_corpus/test_tabulate_regression.py
@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+"""
+Phase 29 (Track I) regression test for tests/eval_corpus/tabulate.py.
+
+Exercises --budget and --diff against hand-crafted scan + ground-truth
+fixtures so the per-cell budget gate and monotonic-improvement diff are
+demonstrably non-vacuous.
+
+Run with::
+
+    python3 tests/eval_corpus/test_tabulate_regression.py
+
+Exits 0 when every assertion holds, non-zero otherwise.  The checks are
+plain `assert` statements inside functions that take a scratch directory,
+so they are driven by this file's own `main()`; unittest / pytest
+discovery will not execute them as written.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[2]
+TABULATE = REPO / "tests/eval_corpus/tabulate.py"
+BUDGET = REPO / "tests/eval_corpus/budget.toml"
+
+
+def run_tabulate(*args: str) -> subprocess.CompletedProcess:
+    """Run tabulate.py under the current interpreter with `args`.
+
+    stdout and stderr are captured as text so callers can assert on them;
+    a non-zero exit status is returned, not raised.
+    """
+    return subprocess.run(
+        [sys.executable, str(TABULATE), *args],
+        capture_output=True,
+        text=True,
+    )
+
+
+def write_json(path: Path, data: object) -> None:
+    """Serialise `data` as 2-space-indented JSON and write it to `path`."""
+    serialized = json.dumps(data, indent=2)
+    path.write_text(serialized)
+
+
+# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs.
+SINK_BIT_SQL = 1 << 7   # SQL_QUERY
+SINK_BIT_CMDI = 1 << 10  # CODE_EXEC
+
+
+def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
+    """Build one synthetic scan finding for the fixtures.
+
+    `cap_bit` is stored as the finding's `sink_caps`; a `dynamic_verdict`
+    with the given `status` is attached only when `status` is truthy.
+    """
+    evidence: dict = {"sink_caps": cap_bit}
+    if status:
+        evidence["dynamic_verdict"] = {"status": status}
+    return {
+        "path": path,
+        "line": line,
+        "col": 0,
+        "id": "py.sqli.cursor_execute",
+        "evidence": evidence,
+    }
+
+
+def test_budget_passes_on_clean_scan(tmp: Path) -> None:
+    """A scan with no Unsupported verdicts must pass the repository budget."""
+    verdict_by_line = {10: "Confirmed", 20: "Confirmed", 30: "NotConfirmed"}
+    scan_path = tmp / "scan_clean.json"
+    write_json(
+        scan_path,
+        {
+            "findings": [
+                python_finding(SINK_BIT_SQL, "app.py", line_no, verdict)
+                for line_no, verdict in verdict_by_line.items()
+            ]
+        },
+    )
+    results_path = tmp / "results_clean.json"
+    write_json(results_path, [])
+
+    argv = [
+        "--label", "test",
+        "--scan", str(scan_path),
+        "--inhouse",
+        "--append", str(results_path),
+        "--budget", str(BUDGET),
+    ]
+    proc = run_tabulate(*argv)
+
+    assert proc.returncode == 0, f"clean scan must pass budget, got rc={proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
+    assert "Per-cell budget" in proc.stdout and "OK" in proc.stdout, proc.stdout
+
+
+def test_budget_fails_when_unsupported_exceeds(tmp: Path) -> None:
+    """An all-Unsupported sqli/python cell must trip the budget gate (exit 2).
+
+    The scan is hand-crafted at 100% Unsupported so it sits above the
+    sqli/python Unsupported ceiling in budget.toml (40% at time of writing).
+    """
+    findings = []
+    for line_no in (10, 20, 30, 40, 50):
+        findings.append(python_finding(SINK_BIT_SQL, "app.py", line_no, "Unsupported"))
+    scan_path = tmp / "scan_unsup.json"
+    write_json(scan_path, {"findings": findings})
+    results_path = tmp / "results_unsup.json"
+    write_json(results_path, [])
+
+    argv = [
+        "--label", "test",
+        "--scan", str(scan_path),
+        "--inhouse",
+        "--append", str(results_path),
+        "--budget", str(BUDGET),
+    ]
+    proc = run_tabulate(*argv)
+
+    assert proc.returncode == 2, (
+        f"budget breach must exit 2, got {proc.returncode}\n"
+        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
+    )
+    assert "FAIL" in proc.stdout and "sqli/python" in proc.stdout, proc.stdout
+
+
+def test_diff_fails_on_regression(tmp: Path) -> None:
+    """Unsupported rising from 1/4 to 3/4 between runs must exit 2.
+
+    No budget is passed, so only the monotonic-improvement diff can flag
+    the +50pp jump (25% → 75%).
+    """
+
+    def _findings(statuses: list) -> list:
+        # One finding per status, on consecutive lines starting at 1.
+        return [
+            python_finding(SINK_BIT_CMDI, "x.unknown", line_no, status)
+            for line_no, status in enumerate(statuses, start=1)
+        ]
+
+    # Seed the baseline: three Confirmed, one Unsupported.
+    prev_scan = tmp / "prev_scan.json"
+    write_json(prev_scan, {"findings": _findings(["Confirmed"] * 3 + ["Unsupported"])})
+    prev_results = tmp / "prev_results.json"
+    write_json(prev_results, [])
+    seed = run_tabulate(
+        "--label", "diff-test",
+        "--scan", str(prev_scan),
+        "--inhouse",
+        "--append", str(prev_results),
+    )
+    rc_prev = seed.returncode
+    assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}"
+
+    # Current run: three Unsupported, one Confirmed.
+    cur_scan = tmp / "cur_scan.json"
+    write_json(cur_scan, {"findings": _findings(["Unsupported"] * 3 + ["Confirmed"])})
+    cur_results = tmp / "cur_results.json"
+    write_json(cur_results, [])
+    proc = run_tabulate(
+        "--label", "diff-test",
+        "--scan", str(cur_scan),
+        "--inhouse",
+        "--append", str(cur_results),
+        "--diff", str(prev_results),
+    )
+
+    assert proc.returncode == 2, (
+        f"regression diff must exit 2, got {proc.returncode}\n"
+        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
+    )
+    assert "REGRESSION" in proc.stdout and "Unsupported" in proc.stdout, proc.stdout
+
+
+def test_diff_passes_on_improvement(tmp: Path) -> None:
+    """Unsupported falling from 3/4 to 1/4 between runs must exit 0.
+
+    A monotonic improvement must not be reported as a regression.
+    """
+    # Previous: 3/4 Unsupported.
+    prev_findings = [
+        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"),
+    ]
+    prev_scan = tmp / "prev_scan.json"
+    write_json(prev_scan, {"findings": prev_findings})
+    prev_results = tmp / "prev_results.json"
+    write_json(prev_results, [])
+    seed = run_tabulate(
+        "--label", "improve-test",
+        "--scan", str(prev_scan),
+        "--inhouse",
+        "--append", str(prev_results),
+    )
+    # If the seed run fails, the baseline stays empty and the diff below
+    # would report "no regressions" vacuously, so check it explicitly.
+    assert seed.returncode == 0, (
+        f"prev seed run must succeed, got {seed.returncode}\n"
+        f"stdout: {seed.stdout}\nstderr: {seed.stderr}"
+    )
+
+    # Current: 1/4 Unsupported.
+    cur_findings = [
+        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"),
+    ]
+    cur_scan = tmp / "cur_scan.json"
+    write_json(cur_scan, {"findings": cur_findings})
+    cur_results = tmp / "cur_results.json"
+    write_json(cur_results, [])
+    proc = run_tabulate(
+        "--label", "improve-test",
+        "--scan", str(cur_scan),
+        "--inhouse",
+        "--append", str(cur_results),
+        "--diff", str(prev_results),
+    )
+    assert proc.returncode == 0, (
+        f"improvement diff must exit 0, got {proc.returncode}\n"
+        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
+    )
+    assert "no regressions" in proc.stdout, proc.stdout
+
+
+def test_budget_malformed_exits_3(tmp: Path) -> None:
+    """A budget file that is not valid TOML must exit 3, not 2 or a crash."""
+    # A bare word is not a valid TOML value, so parsing fails.
+    broken_budget = tmp / "bad.toml"
+    broken_budget.write_text("[default]\nunsupported_rate = not_a_number\n")
+
+    scan_path = tmp / "scan.json"
+    write_json(scan_path, {"findings": []})
+    results_path = tmp / "results.json"
+    write_json(results_path, [])
+
+    argv = [
+        "--label", "test",
+        "--scan", str(scan_path),
+        "--inhouse",
+        "--append", str(results_path),
+        "--budget", str(broken_budget),
+    ]
+    proc = run_tabulate(*argv)
+
+    assert proc.returncode == 3, (
+        f"malformed budget must exit 3, got {proc.returncode}\nstderr: {proc.stderr}"
+    )
+
+
+def main() -> int:
+    """Run every regression check in order, each in its own scratch directory.
+
+    Returns 0 when all checks pass; a failing `assert` propagates as an
+    AssertionError and aborts the run.
+    """
+    checks = (
+        test_budget_passes_on_clean_scan,
+        test_budget_fails_when_unsupported_exceeds,
+        test_diff_fails_on_regression,
+        test_diff_passes_on_improvement,
+        test_budget_malformed_exits_3,
+    )
+    with tempfile.TemporaryDirectory() as td:
+        root = Path(td)
+        for check in checks:
+            # Separate directory per check so fixture file names cannot clash.
+            workdir = root / check.__name__
+            workdir.mkdir()
+            print(f"... {check.__name__}")
+            check(workdir)
+            print("    OK")
+    print("\nAll tabulate.py regression checks passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/python_fixtures.rs
+++ b/tests/python_fixtures.rs
@ -15,7 +15,7 @@ mod common;
 mod python_fixture_tests {
    use crate::common::fixture_harness::{
        run_fixture_and_compare_to_golden, run_harness_snapshot, run_shape_fixture,
-        CopyStrategy, FixtureSpec,
+        CopyStrategy, FixtureSpec, Prerequisite,
    };
    use nyx_scanner::commands::scan::Diag;
    use nyx_scanner::dynamic::spec::PayloadSlot;
@ -48,6 +48,12 @@ mod python_fixture_tests {
            sink_line,
            confidence: Confidence::High,
            copy: CopyStrategy::PreserveName,
+            // Phase 29 (Track I): the Python harness emitter shells out
+            // to `python3` during verify, so the host must have it.
+            // The harness short-circuits with a structured skip when
+            // missing; CI rows that intentionally omit Python still go
+            // green.
+            requires: vec![Prerequisite::CommandAvailable("python3")],
        }
    }

@ -65,6 +71,10 @@ mod python_fixture_tests {
            sink_line,
            confidence: Confidence::Low,
            copy: CopyStrategy::PreserveName,
+            // Low-confidence rows short-circuit to
+            // `Unsupported(ConfidenceTooLow)` before the harness ever
+            // shells out to python3, so no prerequisite is needed.
+            requires: vec![],
        }
    }

--- a/tests/rust_fixtures.rs
+++ b/tests/rust_fixtures.rs
@ -12,7 +12,7 @@ mod common;
 #[cfg(feature = "dynamic")]
 mod rust_fixture_tests {
    use crate::common::fixture_harness::{
-        run_fixture_and_compare_to_golden, CopyStrategy, FixtureSpec,
+        run_fixture_and_compare_to_golden, CopyStrategy, FixtureSpec, Prerequisite,
    };
    use nyx_scanner::commands::scan::Diag;
    use nyx_scanner::dynamic::verify::{verify_finding, VerifyOptions};
@ -32,6 +32,11 @@ mod rust_fixture_tests {
            sink_line,
            confidence: Confidence::High,
            copy: CopyStrategy::RustEntry,
+            // Phase 29 (Track I): the Rust harness emitter shells out
+            // to `cargo` during verify, so the host must have a Rust
+            // toolchain on PATH.  Missing cargo triggers a structured
+            // skip rather than a panic.
+            requires: vec![Prerequisite::CommandAvailable("cargo")],
        }
    }

@ -49,6 +54,10 @@ mod rust_fixture_tests {
            sink_line,
            confidence: Confidence::Low,
            copy: CopyStrategy::RustEntry,
+            // Low-confidence rows short-circuit to
+            // `Unsupported(ConfidenceTooLow)` before the harness ever
+            // shells out to cargo.
+            requires: vec![],
        }
    }