Dynamic (#77)

2026-06-27 20:29:39 +02:00 · 2026-06-05 10:16:30 -05:00 · 2026-06-05 10:16:30 -05:00 · 991c84a1eb
commit 991c84a1eb
parent 55247b7fcd
1464 changed files with 225448 additions and 1985 deletions
--- a/tests/eval_corpus/budget.toml
+++ b/tests/eval_corpus/budget.toml
@ -0,0 +1,352 @@
+# Eval corpus budget.
+#
+# `report.py` enforces these values when `run.sh` or `run_full.sh` pass
+# `--budget`. Each (cap, lang) cell uses the default row unless a specific
+# override appears below.
+#
+# Wall-clock cost is measured separately from this per-cell budget.
+#
+# Schema:
+#
+#   [default]
+#   unsupported_rate     = 0.20   # max(Unsupported / total) per cell
+#   false_confirmed_rate = 0.02   # max(wrong / Confirmed) per cap
+#   repro_stability      = 0.95   # min(stable / Confirmed) per cell
+#   confirmed_rate       = 0.40   # min(Confirmed / total) per cell (omit to skip)
+#   ratchet_deadline     = "..."  # informational; cells already at headline
+#
+#   [[cell]]
+#   cap   = "..."
+#   lang  = "..."
+#   <overrides as above>
+#
+# `cap` matches `tabulate.py`'s _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
+# `lang` matches the ext_map values (`python`, `javascript`, …).
+# A wildcard `"*"` matches any cell that does not have an exact entry.
+#
+# Each rate is enforced only when the relevant denominator is non-zero, so a
+# cell with no findings (or no Confirmed findings) never trips a budget
+# vacuously.  `confirmed_rate` is a *minimum* (a ratchet floor); the others are
+# maxima.  Per-cell overrides are calibrated to the measured frontier on the
+# real corpus so the gate locks in current performance and catches regressions
+# (see the OWASP cells below).
+
+[default]
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+repro_stability      = 0.95
+ratchet_deadline     = "2026-05-15"
+
+# ── OWASP Benchmark v1.2 (Java) — Track R.0 ratchet ──────────────────────────
+#
+# Calibrated against the pinned 1.2beta corpus, nyx c0501884, 2026-05-29
+# (`nyx scan --verify` over all 2740 BenchmarkTest files; 5812 findings).
+#
+# Measured frontier at calibration:
+#   verdicts  : Confirmed 0 | NotConfirmed 4077 | Inconclusive 1725
+#               (BuildFailed 952 + SpecDerivationFailed 773) | Unsupported 10
+#   per cell  : unsupported_rate <= 1.7%  (headline <= 20%  -> MET)
+#               false_confirmed   = 0%     (headline <= 2%   -> MET, 0 confirms)
+#               confirmed_rate    = 0%     (headline >= 40%  -> NOT met)
+#
+# The verifier confirms nothing on OWASP yet: every BenchmarkTest is a servlet
+# whose harness lands in BuildFailed / SpecDerivationFailed (Java servlet entry
+# wiring + classpath are Track L.12 / Track O.0 work).  So the enforced limits
+# below are the two headline maxima the verifier already satisfies
+# (unsupported_rate, false_confirmed_rate).  `confirmed_rate` is intentionally
+# left UNSET — the headline >= 40% is the ratchet's destination, recorded here
+# and in scripts/m7_ship_gate.sh (NYX_OWASP_FLOOR_CAPS), not a floor we can
+# honestly assert at 0 confirms.  Promote a cap into the gated set (and add its
+# `confirmed_rate`) the moment it starts Confirming.
+#
+# Caps split two ways:
+#   sound-oracle (injection): cmdi, sqli, path_traversal, ldap_injection,
+#     xpath_injection — once their servlet harnesses build, a runtime oracle
+#     exists; these are the GATE6_FLOOR_CAPS candidates.
+#   no-sound-oracle (config/usage smell): crypto (weak rand/hash), auth
+#     (insecure cookie), xss/trustbound — Phase-11 routes these to
+#     Unsupported(SoundOracleUnavailable); they stay report-only.  When that
+#     routing lands their unsupported_rate will rise and these cells must be
+#     relaxed accordingly.
+
+[[cell]]
+cap = "cmdi"
+lang = "java"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "sqli"
+lang = "java"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "path_traversal"
+lang = "java"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "ldap_injection"
+lang = "java"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "xpath_injection"
+lang = "java"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "xss"
+lang = "java"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "crypto"
+lang = "java"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "auth"
+lang = "java"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+# ── NodeGoat / Juice Shop (JS/TS) — Track R.1 ratchet ────────────────────────
+#
+# Phase 28 wires two intentionally-vulnerable JS/TS apps into the same
+# acceptance machinery as OWASP Benchmark: OWASP NodeGoat (Express, .js)
+# and OWASP Juice Shop (TypeScript, .ts).  Unlike OWASP Benchmark, neither
+# app ships vuln/benign *pairs* — every labelled file is `vuln = true` (see
+# ground_truth/{nodegoat,juiceshop}.manifest.toml).  Two consequences for
+# these cells:
+#
+#   * false_confirmed_rate (<= 2%) is the headline maximum the verifier
+#     already satisfies and is HARD-enforced: it only trips when a Confirmed
+#     finding lands on a file with no ground-truth positive, i.e. an
+#     over-confirm.  With the verifier confirming little on real corpora yet
+#     it is satisfied, and it ratchets precision as confirms grow.
+#   * unsupported_rate (<= 20%) is HARD-enforced too.  `Unsupported` counts
+#     only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable —
+#     a narrow bucket that Tracks J + M shrank — *not* BuildFailed /
+#     SpecDerivationFailed (those are Inconclusive), so it stays low.
+#
+# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the
+# Phase 28 acceptance DESTINATIONS.  They are intentionally left UNSET here
+# and published report-only by Gate 7 (NYX_JSTS_FLOOR_CAPS empty by default,
+# mirroring NYX_OWASP_FLOOR_CAPS) because (a) the verifier does not yet
+# Confirm these corpora end to end and (b) the manifest labels canonical
+# vulns only, so precision vs partial ground truth is informational until
+# the labels are completed.  Promote a cap into the floor set the moment it
+# starts Confirming, exactly as for OWASP.
+
+# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml.
+[[cell]]
+cap = "cmdi"
+lang = "javascript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "xss"
+lang = "javascript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "unauthorized_id"
+lang = "javascript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "crypto"
+lang = "javascript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml.
+[[cell]]
+cap = "sqli"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "path_traversal"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "redirect"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "ssrf"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "crypto"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ──────────────
+#
+# Phase 29 wires five more intentionally-vulnerable real corpora, one per
+# remaining language family, into the same acceptance machinery as OWASP /
+# NodeGoat / Juice Shop:
+#
+#   * railsgoat  — OWASP RailsGoat (Rails, .rb)
+#   * dvwa       — Damn Vulnerable Web Application (PHP); ships graded
+#                  source variants, so low.php = vuln and impossible.php =
+#                  benign control — real vuln/benign PAIRS like OWASP.
+#   * dvpwa      — Damn Vulnerable Python Web App (aiohttp, .py); its
+#                  parameterized DAO siblings are benign controls for the
+#                  one `%`-formatted SQL sink.
+#   * gosec      — the Go SAST tool's own repo; the scannable, `// want`-
+#                  annotated sample under goanalysis/testdata is the curated
+#                  ground truth (its embedded-string rule samples are not
+#                  scannable, so they are unlabelled).
+#   * rustsec    — RustSec advisory-db: a NEGATIVE CONTROL.  It ships
+#                  advisory metadata, not vulnerable .rs source, so its
+#                  ground truth is empty by construction; the row asserts the
+#                  Rust scan/verify path runs at scale within wall-clock and
+#                  Confirms NOTHING (any Confirmed Rust finding there is a
+#                  false confirm and trips the default false_confirmed_rate).
+#
+# Each row is gated with the SAME policy as Gates 6/7 (scripts/m7_ship_gate.sh
+# Gate 8): wall-clock + the per-(cap,lang) budget below are HARD-enforced;
+# per-cap confirmed-rate / precision / recall are published report-only
+# (NYX_POLYGLOT_FLOOR_CAPS empty by default).  Because each corpus targets a
+# single language, Gate 8 scopes tabulation to that language (tabulate.py
+# --lang), so the vendored third-party JavaScript these Ruby/Python apps
+# bundle (bootstrap-colorpicker, materialize, …) — which nyx confirms as
+# prototype_pollution — does not pollute the corpus's per-cap metrics.  Those
+# JS findings are still emitted; they are simply out of scope for a Ruby /
+# Python corpus.
+#
+# Calibrated against the pinned corpora (nyx HEAD of the Phase 29 branch,
+# 2026-05-31) with `nyx scan --verify --index off`.  Measured frontier
+# (target-language scope): every curated cell sits at <= the headline maxima
+# below EXCEPT cmdi, where findings carrying a SHELL_ESCAPE sanitizer cap (all
+# of them for ruby and go, ~69% for php) are routed to
+# Unsupported(SoundOracleUnavailable) — the same
+# no-sound-oracle treatment OWASP's crypto/auth cells get.  RailsGoat's
+# deserialize (Marshal.load) and redirect (open redirect) cells Confirm end to
+# end with zero false confirms — the first real polyglot confirms.
+
+# railsgoat (ruby): caps with a ground-truth label in railsgoat.manifest.toml.
+[[cell]]
+cap = "auth"
+lang = "ruby"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "crypto"
+lang = "ruby"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "deserialize"
+lang = "ruby"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "redirect"
+lang = "ruby"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "path_traversal"
+lang = "ruby"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+# cmdi/ruby is incidental (RailsGoat's `self.try(params[:graph])` reflection
+# sink); the lone finding carries a SHELL_ESCAPE sanitizer cap and routes to
+# Unsupported(SoundOracleUnavailable), so unsupported_rate is locked at the
+# measured frontier (1/1).  The false-confirm guard stays at the headline 2%.
+[[cell]]
+cap = "cmdi"
+lang = "ruby"
+unsupported_rate     = 1.00
+false_confirmed_rate = 0.02
+
+# dvwa (php): caps with a ground-truth label in dvwa.manifest.toml.
+[[cell]]
+cap = "sqli"
+lang = "php"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "redirect"
+lang = "php"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "header_injection"
+lang = "php"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+# cmdi/php: DVWA's ping handlers reach shell_exec through a SHELL_ESCAPE
+# sanitizer cap, so ~69% of the cell's findings route to
+# Unsupported(SoundOracleUnavailable).  unsupported_rate is locked to that
+# frontier with margin (a regression above 75% fails); false-confirm at 2%.
+[[cell]]
+cap = "cmdi"
+lang = "php"
+unsupported_rate     = 0.75
+false_confirmed_rate = 0.02
+
+# dvpwa (python): caps with a ground-truth label in dvpwa.manifest.toml.
+[[cell]]
+cap = "sqli"
+lang = "python"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "crypto"
+lang = "python"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "auth"
+lang = "python"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+# gosec (go): caps with a ground-truth label in gosec.manifest.toml.
+[[cell]]
+cap = "crypto"
+lang = "go"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+# cmdi/go: the goanalysis/testdata exec.Command sample reaches the sink
+# through a SHELL_ESCAPE sanitizer cap, so every cmdi/go finding routes to
+# Unsupported(SoundOracleUnavailable).  unsupported_rate locked to the
+# measured frontier (3/3); false-confirm at the headline 2%.
+[[cell]]
+cap = "cmdi"
+lang = "go"
+unsupported_rate     = 1.00
+false_confirmed_rate = 0.02
--- a/tests/eval_corpus/check_surface.sh
+++ b/tests/eval_corpus/check_surface.sh
@ -0,0 +1,173 @@
+#!/usr/bin/env bash
+# Phase 31 acceptance walker: assert `nyx surface` produces a usable
+# map on every downloaded eval-corpus fixture root.
+#
+# Walks the project trees under $NYX_EVAL_CORPUS_DIR and, with --also-inhouse,
+# the in-house `tests/benchmark/corpus` and `tests/dynamic_fixtures` trees, runs
+# `nyx surface --build --format json <root>` against each, and asserts
+# the resulting JSON contains at least one EntryPoint plus at least
+# one DataStore / ExternalService / DangerousLocal node.
+#
+# `--build` forces the inline pass-1 + call-graph path so the walker
+# does not depend on a prior `nyx index build` or `nyx scan`.
+#
+# Usage:
+#   tests/eval_corpus/check_surface.sh [--nyx BIN] [--corpus-dir DIR]
+#                                      [--also-inhouse]
+#                                      [--report FILE]
+#
+# Environment:
+#   NYX_EVAL_CORPUS_DIR  — path to pre-downloaded corpus roots
+#                          (default: ~/.cache/nyx/eval_corpus).  When
+#                          missing or empty, only the in-house trees are walked
+#                          (and only with --also-inhouse); otherwise exit is 0 so
+#                          CI without a corpus mirror does not block on Phase 31.
+#
+# Exit codes:
+#   0  every walked project produced a usable SurfaceMap (or no
+#      projects were available — see corpus-missing note above).
+#   1  setup / I/O / missing-binary error.
+#   2  one or more projects produced an empty or unusable SurfaceMap.
+
+set -euo pipefail
+
+# Resolve paths relative to this script so the walker works from any cwd.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# Defaults.  NYX_BIN and NYX_EVAL_CORPUS_DIR may be preset in the environment;
+# each value can also be overridden by the flags parsed below.
+NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
+CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
+ALSO_INHOUSE="false"
+REPORT_FILE=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --nyx)          NYX_BIN="$2"; shift 2 ;;
+    --corpus-dir)   CORPUS_CACHE="$2"; shift 2 ;;
+    --also-inhouse) ALSO_INHOUSE="true"; shift ;;
+    --report)       REPORT_FILE="$2"; shift 2 ;;
+    -h|--help)
+      # Print only the leading comment block: skip the shebang, stop at the
+      # first non-comment line, and strip the "# " prefix.  A fixed line
+      # range would leak script code into the help text and go stale
+      # whenever the header grows or shrinks.
+      awk 'NR == 1 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "$0"
+      exit 0
+      ;;
+    *)
+      echo "unknown flag: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+die()  { echo "error: $*" >&2; exit 1; }
+info() { echo "[surface-check] $*"; }
+warn() { echo "[surface-check] WARN: $*" >&2; }
+
+[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
+command -v jq >/dev/null 2>&1 || die "required command not found: jq"
+
+# Collect project roots.  Each corpus directory is treated as a single
+# project; the in-house corpus trees are handled the same way (each
+# language vertical is a project root).
+PROJECTS=()
+if [[ -d "$CORPUS_CACHE" ]]; then
+  for entry in "$CORPUS_CACHE"/*; do
+    [[ -d "$entry" ]] && PROJECTS+=("$entry")
+  done
+else
+  warn "corpus directory missing: $CORPUS_CACHE (run tests/eval_corpus/run.sh to bootstrap)"
+fi
+if [[ "$ALSO_INHOUSE" == "true" ]]; then
+  for dir in \
+    "${REPO_ROOT}/tests/benchmark/corpus" \
+    "${REPO_ROOT}/tests/dynamic_fixtures"
+  do
+    [[ -d "$dir" ]] && PROJECTS+=("$dir")
+  done
+fi
+
+if [[ ${#PROJECTS[@]} -eq 0 ]]; then
+  info "no project roots to walk (eval corpus not downloaded, in-house trees absent)"
+  exit 0
+fi
+
+PASS_COUNT=0
+FAIL_COUNT=0
+FAIL_PROJECTS=()
+declare -a REPORT_ROWS=()
+
+# Record one failed project.
+#   $1  "<project> (<reason>)" line printed in the final summary
+#   $2  pre-rendered JSON object appended to the --report rows
+record_failure() {
+  FAIL_COUNT=$((FAIL_COUNT + 1))
+  FAIL_PROJECTS+=("$1")
+  REPORT_ROWS+=("$2")
+}
+
+for project in "${PROJECTS[@]}"; do
+  info "walking: $project"
+  # JSON-quote the path once; every report row for this project reuses it.
+  project_json="$(jq -Rn --arg p "$project" '$p')"
+
+  # nyx may fail on a project; capture its status instead of letting
+  # `set -e` abort the whole walk.
+  set +e
+  out="$("$NYX_BIN" surface --build --format json "$project" 2>/dev/null)"
+  rc=$?
+  set -e
+  if [[ $rc -ne 0 ]]; then
+    warn "nyx surface --build exited $rc on $project"
+    record_failure "$project (nyx exit=$rc)" \
+      "$(printf '{"project":%s,"status":"nyx-error","exit":%d}' "$project_json" "$rc")"
+    continue
+  fi
+  if [[ -z "$out" ]]; then
+    warn "empty output on $project"
+    record_failure "$project (empty output)" \
+      "$(printf '{"project":%s,"status":"empty-output"}' "$project_json")"
+    continue
+  fi
+
+  # Count nodes by kind in a single jq pass.  SurfaceMap serialises each
+  # node as a flat object with a `node` discriminator: `entry_point`,
+  # `data_store`, `external_service`, `dangerous_local`.  Running jq as an
+  # `if` condition keeps `set -e` from killing the script when the output
+  # is not the expected JSON; that project is recorded as a failure so the
+  # walk, the report and the documented exit codes stay intact.
+  if ! counts="$(jq -r '
+      [.nodes[]] as $nodes
+      | ["entry_point", "data_store", "external_service", "dangerous_local"]
+      | map(. as $kind | [$nodes[] | select(.node == $kind)] | length)
+      | @tsv' <<<"$out")" || [[ -z "$counts" ]]; then
+    warn "unparseable SurfaceMap JSON on $project"
+    record_failure "$project (invalid output)" \
+      "$(printf '{"project":%s,"status":"invalid-output"}' "$project_json")"
+    continue
+  fi
+  read -r entry_count ds_count es_count dl_count <<<"$counts"
+  sink_count=$((ds_count + es_count + dl_count))
+
+  if [[ "$entry_count" -lt 1 ]]; then
+    warn "no EntryPoint nodes on $project"
+    record_failure "$project (no entry-points)" \
+      "$(printf '{"project":%s,"status":"no-entry-points","entry_count":%d}' \
+        "$project_json" "$entry_count")"
+    continue
+  fi
+  if [[ "$sink_count" -lt 1 ]]; then
+    warn "no DataStore / ExternalService / DangerousLocal nodes on $project"
+    record_failure "$project (no sinks: ds=$ds_count es=$es_count dl=$dl_count)" \
+      "$(printf '{"project":%s,"status":"no-sinks","entry_count":%d,"ds":%d,"es":%d,"dl":%d}' \
+        "$project_json" "$entry_count" "$ds_count" "$es_count" "$dl_count")"
+    continue
+  fi
+
+  info "  ok: ${entry_count} entry-points, ${ds_count} data stores, ${es_count} external, ${dl_count} dangerous"
+  PASS_COUNT=$((PASS_COUNT + 1))
+  REPORT_ROWS+=("$(printf '{"project":%s,"status":"ok","entry_count":%d,"ds":%d,"es":%d,"dl":%d}' \
+    "$project_json" "$entry_count" "$ds_count" "$es_count" "$dl_count")")
+done
+
+# Optional machine-readable report: pass/fail totals plus one JSON row per
+# walked project, comma-separated except after the final row.
+if [[ -n "$REPORT_FILE" ]]; then
+  last_index=$((${#REPORT_ROWS[@]} - 1))
+  {
+    printf '{\n'
+    printf '  "pass": %s,\n' "$PASS_COUNT"
+    printf '  "fail": %s,\n' "$FAIL_COUNT"
+    printf '  "projects": [\n'
+    for idx in "${!REPORT_ROWS[@]}"; do
+      if [[ $idx -eq $last_index ]]; then
+        printf '    %s\n' "${REPORT_ROWS[$idx]}"
+      else
+        printf '    %s,\n' "${REPORT_ROWS[$idx]}"
+      fi
+    done
+    printf '  ]\n'
+    printf '}\n'
+  } > "$REPORT_FILE"
+  info "report written: $REPORT_FILE"
+fi
+
+# Human-readable summary; exit 2 when any project failed, 0 otherwise.
+info ""
+total_count=$((PASS_COUNT + FAIL_COUNT))
+info "summary: ${PASS_COUNT} pass, ${FAIL_COUNT} fail (of ${total_count} projects)"
+if [[ $FAIL_COUNT -eq 0 ]]; then
+  exit 0
+fi
+for failed in "${FAIL_PROJECTS[@]}"; do
+  info "  fail: $failed"
+done
+exit 2
--- a/tests/eval_corpus/ground_truth/README.md
+++ b/tests/eval_corpus/ground_truth/README.md
@ -0,0 +1,106 @@
+# Ground truth files
+
+Place corpus ground truth JSON files here before running `tests/eval_corpus/run.sh`.
+
+## OWASP Benchmark v1.2
+
+File: `owasp_benchmark_v1.2.json` (checked in; complete — one record per
+BenchmarkTest file, 2740 total).
+
+Format:
+```json
+[
+  {"path": "src/main/java/org/owasp/.../BenchmarkTest00001.java", "line": 0, "cap": "sqli", "vuln": true},
+  ...
+]
+```
+
+`path` is **relative to the corpus root** (the BenchmarkJava clone), with POSIX
+separators. `tabulate.py` suffix-matches it against the absolute paths nyx
+emits, so the committed JSON is portable: it matches whether the corpus lives at
+`~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2` on a laptop or at a CI checkout
+path. `line` is `0` (the expected-results CSV does not pin a line; matching
+falls back to file+cap).
+
+Regenerate from `expectedresults-1.2beta.csv` shipped with the benchmark repo:
+```sh
+python3 tests/eval_corpus/owasp_gt_convert.py \
+    --corpus-dir ~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2 \
+    --output     tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
+```
+
+## NIST SARD subset
+
+File: `nist_sard.json`
+
+Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`.
+
+## OWASP NodeGoat / OWASP Juice Shop (JS/TS — Track R.1)
+
+Files: `nodegoat.json` (Express, `.js`), `juiceshop.json` (TypeScript, `.ts`).
+Same four-field format as above; all records are `vuln: true`.
+
+These two apps are intentionally vulnerable end to end, so — unlike OWASP
+Benchmark — they ship no machine-readable per-file vuln labels and have no
+benign-control files to pair against. The authoritative source is a curated
+TOML manifest committed here, one `[[entry]]` per known-vulnerable handler
+with a `note` citing why:
+
+- `nodegoat.manifest.toml`
+- `juiceshop.manifest.toml`
+
+`manifest_gt_convert.py` turns a manifest into the committed `.json`:
+
+```sh
+python3 tests/eval_corpus/manifest_gt_convert.py \
+    --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \
+    --output   tests/eval_corpus/ground_truth/nodegoat.json
+```
+
+Pass `--corpus-dir <clone>` to validate every labelled path against a real
+checkout. The converter exits non-zero if any path is missing, so a corpus
+bump that moves a handler fails loudly instead of silently dropping recall.
+CI (`.github/workflows/eval.yml`, `jsts` job) regenerates each `.json`
+against a fresh clone of the pinned ref and asserts it matches the committed
+file.
+
+Because the manifests label canonical vulns only, recall (did nyx catch the
+known vulns) is the meaningful metric; precision vs this partial ground
+truth is informational. Gate 7 publishes per-cap precision/recall/confirmed
+report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP
+gate.
+
+## Polyglot real corpora (Ruby/PHP/Python/Go/Rust — Track R.2)
+
+Phase 29 wires the remaining language families into the same machinery, one
+corpus per family, each with a curated `*.manifest.toml` → committed `*.json`:
+
+- `railsgoat.{manifest.toml,json}` — OWASP RailsGoat (Rails, `.rb`).
+- `dvwa.{manifest.toml,json}` — Damn Vulnerable Web Application (PHP). DVWA
+  ships graded source variants (`source/{low,impossible}.php`), so this is
+  the one Track R corpus besides OWASP with real vuln/benign **pairs**
+  (`low.php` = vuln, `impossible.php` = benign control) — precision is
+  meaningful here, not just informational.
+- `dvpwa.{manifest.toml,json}` — Damn Vulnerable Python Web App (aiohttp,
+  `.py`). Its parameterized DAO siblings are benign controls for the one
+  `%`-formatted SQL sink.
+- `gosec.{manifest.toml,json}` — the gosec Go SAST tool repo; the scannable,
+  `// want`-annotated sample under `goanalysis/testdata` is the curated
+  ground truth (gosec's string-embedded rule samples are not scannable, so
+  they are deliberately unlabelled).
+- `rustsec.{manifest.toml,json}` — RustSec advisory-db, a **negative
+  control**. advisory-db ships advisory metadata, not vulnerable `.rs`
+  source, so its committed ground truth is empty (`[]`) by construction. The
+  manifest sets `negative_control = true` (mutually exclusive with
+  `[[entry]]` tables); `manifest_gt_convert.py` emits the empty JSON and the
+  row asserts the Rust scan/verify path runs at scale within wall-clock and
+  Confirms nothing there (any Confirmed Rust finding is a false confirm).
+
+These are converted, validated and asserted-in-sync exactly like NodeGoat /
+Juice Shop (the `polyglot` job in `.github/workflows/eval.yml`). Because each
+corpus targets a single language, Gate 8 scopes tabulation to that language
+(`tabulate.py --lang`) so the vendored third-party JavaScript these Ruby /
+Python apps bundle does not pollute their per-cap metrics. Gate 8 publishes
+per-cap precision/recall/confirmed report-only by default
+(`NYX_POLYGLOT_FLOOR_CAPS` empty), matching the OWASP and JS/TS gates. See
+`tests/eval_corpus/budget.toml` for the per-(cap,lang) gate policy.
--- a/tests/eval_corpus/ground_truth/dvpwa.json
+++ b/tests/eval_corpus/ground_truth/dvpwa.json
@ -0,0 +1,38 @@
+[
+  {
+    "path": "sqli/dao/course.py",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": false
+  },
+  {
+    "path": "sqli/dao/mark.py",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": false
+  },
+  {
+    "path": "sqli/dao/review.py",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": false
+  },
+  {
+    "path": "sqli/dao/student.py",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": true
+  },
+  {
+    "path": "sqli/dao/user.py",
+    "line": 0,
+    "cap": "crypto",
+    "vuln": true
+  },
+  {
+    "path": "sqli/views.py",
+    "line": 0,
+    "cap": "auth",
+    "vuln": true
+  }
+]
--- a/tests/eval_corpus/ground_truth/dvpwa.manifest.toml
+++ b/tests/eval_corpus/ground_truth/dvpwa.manifest.toml
@ -0,0 +1,70 @@
+# DVPWA (Damn Vulnerable Python Web Application) — curated ground-truth
+# manifest (Phase 29, Track R.2).
+#
+# DVPWA is an intentionally-vulnerable aiohttp app whose headline flaw is
+# SQL injection (the package is literally named `sqli`).  It ships no
+# machine-readable per-file labels, so this manifest IS the authoritative
+# source.  Its DAO layer is convenient: one method builds a query with
+# Python `%` string-formatting (the injectable sink) while its siblings use
+# proper parameterized `cur.execute(q, params)` — so the parameterized DAOs
+# serve as genuine benign controls (vuln = false) for the sqli cell, making
+# precision there meaningful, not just informational.
+#
+# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
+# ground_truth/dvpwa.json.  CI regenerates it against a fresh clone of the
+# pinned ref and asserts byte-equality; the converter HARD-ERRORS on any
+# path that no longer exists, so a corpus bump that moves a DAO fails the
+# job loudly rather than silently dropping recall.
+#
+# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies each
+# sink (the request-scoped ownership lookups in views.py surface as `auth`).
+# `path` is relative to the DVPWA clone root, POSIX separators.  Lang is
+# inferred from the extension (.py -> python).  See
+# tests/eval_corpus/budget.toml for the gate policy on these cells.
+
+corpus = "dvpwa"
+upstream = "https://github.com/anxolerd/dvpwa"
+# DVPWA publishes no release tags; the eval job pins the default branch via
+# the CI cache key (clone HEAD a1d8f89fac2e57093189853c6527c2b01fc1d9c1).
+# The sqli/ package layout has been stable; re-validate if the cache key is
+# bumped.
+pinned_ref = "master"
+
+# ── SQL injection (sqli) — one injectable sink + parameterized controls ──────
+[[entry]]
+path = "sqli/dao/student.py"
+cap = "sqli"
+vuln = true
+note = "Student.create builds the INSERT with Python `%` formatting (\"... VALUES ('%(name)s')\" % {'name': name}) on the request-supplied student name, then cur.execute(q) — SQL injection."
+
+[[entry]]
+path = "sqli/dao/course.py"
+cap = "sqli"
+vuln = false
+note = "benign control: every Course query uses parameterized cur.execute(q, params) / VALUES (%(title)s, %(description)s) — not injectable."
+
+[[entry]]
+path = "sqli/dao/review.py"
+cap = "sqli"
+vuln = false
+note = "benign control: Review.create / get_for_course bind via cur.execute(q, params) with %(course_id)s / %s placeholders — parameterized."
+
+[[entry]]
+path = "sqli/dao/mark.py"
+cap = "sqli"
+vuln = false
+note = "benign control: Mark.create / get_for_student bind via parameterized cur.execute(q, params) — not injectable."
+
+# ── Weak crypto (crypto) ─────────────────────────────────────────────────────
+[[entry]]
+path = "sqli/dao/user.py"
+cap = "crypto"
+vuln = true
+note = "User.check_password compares against md5(password).hexdigest() — unsalted MD5 for credential storage (weak cryptography)."
+
+# ── Broken access control (auth) ─────────────────────────────────────────────
+[[entry]]
+path = "sqli/views.py"
+cap = "auth"
+vuln = true
+note = "request handlers resolve the acting user from a client-controlled session id and act on objects without an ownership/authorization check — broken access control."
--- a/tests/eval_corpus/ground_truth/dvwa.json
+++ b/tests/eval_corpus/ground_truth/dvwa.json
@ -0,0 +1,50 @@
+[
+  {
+    "path": "vulnerabilities/exec/source/impossible.php",
+    "line": 0,
+    "cap": "cmdi",
+    "vuln": false
+  },
+  {
+    "path": "vulnerabilities/exec/source/low.php",
+    "line": 0,
+    "cap": "cmdi",
+    "vuln": true
+  },
+  {
+    "path": "vulnerabilities/open_redirect/source/impossible.php",
+    "line": 0,
+    "cap": "header_injection",
+    "vuln": false
+  },
+  {
+    "path": "vulnerabilities/open_redirect/source/impossible.php",
+    "line": 0,
+    "cap": "redirect",
+    "vuln": false
+  },
+  {
+    "path": "vulnerabilities/open_redirect/source/low.php",
+    "line": 0,
+    "cap": "header_injection",
+    "vuln": true
+  },
+  {
+    "path": "vulnerabilities/open_redirect/source/low.php",
+    "line": 0,
+    "cap": "redirect",
+    "vuln": true
+  },
+  {
+    "path": "vulnerabilities/sqli/source/impossible.php",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": false
+  },
+  {
+    "path": "vulnerabilities/sqli/source/low.php",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": true
+  }
+]
--- a/tests/eval_corpus/ground_truth/dvwa.manifest.toml
+++ b/tests/eval_corpus/ground_truth/dvwa.manifest.toml
@ -0,0 +1,84 @@
+# DVWA (Damn Vulnerable Web Application) — curated ground-truth manifest
+# (Phase 29, Track R.2).
+#
+# DVWA is an intentionally-vulnerable PHP app.  Unlike the other Track R
+# apps it ships its vulnerabilities as graded source variants under
+# vulnerabilities/<module>/source/{low,medium,high,impossible}.php, where
+# `low.php` is the textbook-vulnerable handler and `impossible.php` is the
+# hardened, secure rewrite of the SAME sink.  That gives DVWA real
+# vuln/benign PAIRS (low = vuln, impossible = benign control) the way OWASP
+# Benchmark does — so precision against this manifest is meaningful, not
+# just informational: a Confirmed finding on an `impossible.php` control is
+# a genuine false confirm.
+#
+# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
+# ground_truth/dvwa.json.  CI regenerates it against a fresh clone of the
+# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any
+# path that no longer exists, so a DVWA bump that restructures a module
+# fails loudly rather than silently dropping recall.  Re-pin `pinned_ref`
+# and re-validate the paths together.
+#
+# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies the
+# sink.  `path` is relative to the DVWA clone root, POSIX separators.  Lang
+# is inferred from the extension (.php -> php).  See
+# tests/eval_corpus/budget.toml for the gate policy on these cells.
+
+corpus = "dvwa"
+upstream = "https://github.com/digininja/DVWA"
+# Pinned to release tag 2.5 (clone HEAD
+# a96943dc1f52f390ee5df72144660636c4b7dd06).  The
+# vulnerabilities/<module>/source/{low,impossible}.php layout has been stable
+# for years; re-validate if the tag is bumped.
+pinned_ref = "2.5"
+
+# ── SQL injection (sqli) ─────────────────────────────────────────────────────
+[[entry]]
+path = "vulnerabilities/sqli/source/low.php"
+cap = "sqli"
+vuln = true
+note = "id = $_REQUEST['id'] is concatenated straight into \"... WHERE user_id = '$id'\" and run via mysqli_query — classic SQL injection."
+
+[[entry]]
+path = "vulnerabilities/sqli/source/impossible.php"
+cap = "sqli"
+vuln = false
+note = "benign control: same query via PDO prepare + bindParam(:id, PDO::PARAM_INT) with is_numeric/intval validation — parameterized, not injectable."
+
+# ── OS command injection (cmdi) ──────────────────────────────────────────────
+[[entry]]
+path = "vulnerabilities/exec/source/low.php"
+cap = "cmdi"
+vuln = true
+note = "target = $_REQUEST['ip'] is concatenated into shell_exec('ping -c 4 ' . $target) with no validation — OS command injection."
+
+[[entry]]
+path = "vulnerabilities/exec/source/impossible.php"
+cap = "cmdi"
+vuln = false
+note = "benign control: the IP is split into 4 octets and each is_numeric-checked before being reassembled and passed to shell_exec — not injectable."
+
+# ── Open redirect (redirect) ─────────────────────────────────────────────────
+[[entry]]
+path = "vulnerabilities/open_redirect/source/low.php"
+cap = "redirect"
+vuln = true
+note = "header('location: ' . $_GET['redirect']) forwards to an unvalidated user-supplied URL — open redirect."
+
+[[entry]]
+path = "vulnerabilities/open_redirect/source/impossible.php"
+cap = "redirect"
+vuln = false
+note = "benign control: redirect target is chosen by an integer switch on is_numeric($_GET['redirect']) — no user-controlled URL reaches the Location header."
+
+# ── CRLF / HTTP header injection (header_injection) ──────────────────────────
+[[entry]]
+path = "vulnerabilities/open_redirect/source/low.php"
+cap = "header_injection"
+vuln = true
+note = "the same unvalidated $_GET['redirect'] flows into a raw header() call, so CRLF in the value splits/injects response headers — HTTP header injection."
+
+[[entry]]
+path = "vulnerabilities/open_redirect/source/impossible.php"
+cap = "header_injection"
+vuln = false
+note = "benign control: only a fixed, integer-selected target string reaches header() — no user bytes, so no CRLF injection."
--- a/tests/eval_corpus/ground_truth/gosec.json
+++ b/tests/eval_corpus/ground_truth/gosec.json
@ -0,0 +1,14 @@
+[
+  {
+    "path": "goanalysis/testdata/src/a/basic_output.go",
+    "line": 0,
+    "cap": "cmdi",
+    "vuln": true
+  },
+  {
+    "path": "goanalysis/testdata/src/a/basic_output.go",
+    "line": 0,
+    "cap": "crypto",
+    "vuln": true
+  }
+]
--- a/tests/eval_corpus/ground_truth/gosec.manifest.toml
+++ b/tests/eval_corpus/ground_truth/gosec.manifest.toml
@ -0,0 +1,42 @@
+# gosec — curated Go ground-truth manifest (Phase 29, Track R.2).
+#
+# gosec is the Go SAST tool; its repo doubles as the de-facto Go security
+# corpus.  Most of gosec's rule samples live as Go source embedded in
+# backtick string literals inside testutils/g*_samples.go — those are NOT
+# scannable by a taint analyzer (the vulnerable code is string data, not
+# real AST), so they are deliberately NOT labelled here.  gosec also ships a
+# small set of REAL, compilable sample programs under goanalysis/testdata
+# that carry the tool's OWN inline `// want 'GNNN ...'` expectations — that
+# is the authoritative, scannable ground truth this manifest pins.
+#
+# Because the eval scans the whole gosec checkout (the tool's own source
+# included), unlabelled findings are expected and are NOT false positives —
+# precision against this manifest is informational, recall on the curated
+# samples is the meaningful floor (same policy as the all-vulnerable apps;
+# see tests/eval_corpus/budget.toml).
+#
+# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
+# ground_truth/gosec.json.  CI regenerates it against a fresh clone of the
+# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any
+# path that no longer exists, so a gosec bump that moves the testdata fails
+# the job loudly.  `cap` is a nyx cap label (tabulate.py); `path` is relative
+# to the gosec clone root, POSIX separators; lang is inferred (.go -> go).
+
+corpus = "gosec"
+upstream = "https://github.com/securego/gosec"
+# Pinned to release tag v2.26.1 (clone HEAD
+# 4a3bd8af174872c778439083ded7adbf3747e770).  goanalysis/testdata/src/a/ has
+# been stable; re-validate if the tag is bumped.
+pinned_ref = "v2.26.1"
+
+[[entry]]
+path = "goanalysis/testdata/src/a/basic_output.go"
+cap = "cmdi"
+vuln = true
+note = "VulnerableFunction runs exec.Command(\"sh\", \"-c\", getUserInput()) — subprocess launched with a non-constant argument (gosec's own `// want G204 [CWE-78]` expectation)."
+
+[[entry]]
+path = "goanalysis/testdata/src/a/basic_output.go"
+cap = "crypto"
+vuln = true
+note = "VulnerableFunction imports crypto/md5 and calls md5.New() — weak cryptographic primitive (gosec's own `// want G401/G501` expectations)."
--- a/tests/eval_corpus/ground_truth/juiceshop.json
+++ b/tests/eval_corpus/ground_truth/juiceshop.json
@ -0,0 +1,38 @@
+[
+  {
+    "path": "lib/insecurity.ts",
+    "line": 0,
+    "cap": "crypto",
+    "vuln": true
+  },
+  {
+    "path": "routes/fileServer.ts",
+    "line": 0,
+    "cap": "path_traversal",
+    "vuln": true
+  },
+  {
+    "path": "routes/login.ts",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": true
+  },
+  {
+    "path": "routes/profileImageUrlUpload.ts",
+    "line": 0,
+    "cap": "ssrf",
+    "vuln": true
+  },
+  {
+    "path": "routes/redirect.ts",
+    "line": 0,
+    "cap": "redirect",
+    "vuln": true
+  },
+  {
+    "path": "routes/search.ts",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": true
+  }
+]
--- a/tests/eval_corpus/ground_truth/juiceshop.manifest.toml
+++ b/tests/eval_corpus/ground_truth/juiceshop.manifest.toml
@ -0,0 +1,66 @@
+# OWASP Juice Shop — curated vuln ground-truth manifest (Phase 28, Track R.1).
+#
+# Juice Shop is an intentionally-vulnerable TypeScript/Express + Angular
+# app.  Its `data/static/challenges.yml` enumerates challenges but pins no
+# source file/line, so it cannot drive file-level ground truth on its own.
+# This manifest IS the authoritative source: one [[entry]] per known-
+# vulnerable server-side handler, curated from the project's own challenge
+# definitions + companion guide, each with a `note` citing the challenge.
+#
+# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
+# ground_truth/juiceshop.json.  CI regenerates it against a fresh clone of
+# the pinned tag and asserts byte-equality; the converter HARD-ERRORS on
+# any path that no longer exists in the corpus, so a Juice Shop bump that
+# refactors a route fails the eval job loudly instead of silently dropping
+# recall.  Re-pin `pinned_ref` and re-validate the paths together.
+#
+# `cap` is a nyx cap label (tabulate.py).  `path` is relative to the Juice
+# Shop clone root, POSIX separators.  Lang is inferred from the extension
+# (.ts -> typescript).  All `vuln = true`: Juice Shop is all-vulnerable, so
+# there is no benign-control file to pair against.  As with NodeGoat,
+# precision vs this manifest is informational (an unlabelled finding may be
+# a real uncurated vuln, not a false positive) while recall is the
+# meaningful floor.  See tests/eval_corpus/budget.toml for the gate policy.
+
+corpus = "juiceshop"
+upstream = "https://github.com/juice-shop/juice-shop"
+# Pinned to a stable release tag.  The server-side handlers below
+# (routes/*.ts, lib/insecurity.ts) have been stable across the TypeScript
+# era of Juice Shop; re-validate if the tag is bumped.
+pinned_ref = "v15.0.0"
+
+[[entry]]
+path = "routes/login.ts"
+cap = "sqli"
+vuln = true
+note = "login builds a raw `models.sequelize.query(\"... WHERE email = '\" + req.body.email + \"' ...\")` — SQL injection auth bypass (challenge: loginAdmin / loginBender)."
+
+[[entry]]
+path = "routes/search.ts"
+cap = "sqli"
+vuln = true
+note = "product search concatenates the `q` criteria into a raw `models.sequelize.query` LIKE clause — UNION-based SQL injection (challenge: unionSqlInjection / dbSchema)."
+
+[[entry]]
+path = "routes/fileServer.ts"
+cap = "path_traversal"
+vuln = true
+note = "serveKeyFiles / file download resolves a user-controlled filename under the ftp dir without containment — path traversal (challenge: accessLogDisclosure / forgottenDevBackup)."
+
+[[entry]]
+path = "routes/redirect.ts"
+cap = "redirect"
+vuln = true
+note = "redirect endpoint forwards to the `to` query param via an allow-list that is bypassable by substring — open redirect (challenge: redirectCryptoCurrency / redirect)."
+
+[[entry]]
+path = "routes/profileImageUrlUpload.ts"
+cap = "ssrf"
+vuln = true
+note = "profile image upload fetches an arbitrary user-supplied imageUrl server-side — SSRF (challenge: ssrf)."
+
+[[entry]]
+path = "lib/insecurity.ts"
+cap = "crypto"
+vuln = true
+note = "hardcoded HMAC/JWT key material and weak hashing (md5-based `hash`) — broken cryptography / hardcoded secret (challenge: weakCryptography / jwt*)."
--- a/tests/eval_corpus/ground_truth/nodegoat.json
+++ b/tests/eval_corpus/ground_truth/nodegoat.json
@ -0,0 +1,32 @@
+[
+  {
+    "path": "app/routes/allocations.js",
+    "line": 0,
+    "cap": "unauthorized_id",
+    "vuln": true
+  },
+  {
+    "path": "app/routes/contributions.js",
+    "line": 0,
+    "cap": "cmdi",
+    "vuln": true
+  },
+  {
+    "path": "app/routes/memos.js",
+    "line": 0,
+    "cap": "xss",
+    "vuln": true
+  },
+  {
+    "path": "app/routes/profile.js",
+    "line": 0,
+    "cap": "xss",
+    "vuln": true
+  },
+  {
+    "path": "config/env/all.js",
+    "line": 0,
+    "cap": "crypto",
+    "vuln": true
+  }
+]
--- a/tests/eval_corpus/ground_truth/nodegoat.manifest.toml
+++ b/tests/eval_corpus/ground_truth/nodegoat.manifest.toml
@ -0,0 +1,62 @@
+# OWASP NodeGoat — curated vuln ground-truth manifest (Phase 28, Track R.1).
+#
+# NodeGoat is an intentionally-vulnerable Express/Node app that maps the
+# OWASP Top 10 to concrete handlers.  It ships no machine-readable per-file
+# vuln labels (unlike OWASP Benchmark's expectedresults CSV), so this
+# manifest IS the authoritative source: one [[entry]] per known-vulnerable
+# location, each curated from the project's own tutorial + the canonical
+# vuln walk-through, with a `note` citing why.
+#
+# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
+# ground_truth/nodegoat.json.  CI regenerates it against a fresh clone of
+# the pinned ref and asserts byte-equality, and the converter HARD-ERRORS
+# on any path that no longer exists in the corpus, so a NodeGoat bump that
+# moves a handler fails the eval job loudly rather than silently dropping
+# recall.  Update `pinned_ref` + the paths together when re-pinning.
+#
+# `cap` is a nyx cap label (tabulate.py).  `path` is relative to the
+# NodeGoat clone root, POSIX separators.  Lang is inferred from the
+# extension (.js -> javascript).  These are all `vuln = true`: NodeGoat is
+# all-vulnerable, so there is no benign-control file to pair against (the
+# OWASP Benchmark vuln/benign pairing does not exist here).  Precision vs
+# this manifest is therefore informational (an unlabelled finding is not
+# necessarily a false positive — it may be a real vuln we did not curate),
+# while recall (did nyx catch the canonical vulns) is the meaningful floor.
+# See tests/eval_corpus/budget.toml for how the gate treats these cells.
+
+corpus = "nodegoat"
+upstream = "https://github.com/OWASP/NodeGoat"
+# NodeGoat publishes no semver tags; the eval job pins the default branch
+# via the CI cache key.  The `app/` + `config/` layout below has been
+# stable for years; re-validate the paths if the cache key is bumped.
+pinned_ref = "master"
+
+[[entry]]
+path = "app/routes/contributions.js"
+cap = "cmdi"
+vuln = true
+note = "handleContributionsUpdate eval()s the pre-tax/after-tax/roth form fields — server-side JS injection (OWASP A1 Injection); the textbook NodeGoat RCE."
+
+[[entry]]
+path = "app/routes/profile.js"
+cap = "xss"
+vuln = true
+note = "profile fields (firstName/lastName/bankAcc/...) are persisted then rendered unescaped — stored XSS (OWASP A3 / A7 XSS)."
+
+[[entry]]
+path = "app/routes/memos.js"
+cap = "xss"
+vuln = true
+note = "memo body is stored and echoed back into the memos view without output encoding — stored XSS."
+
+[[entry]]
+path = "app/routes/allocations.js"
+cap = "unauthorized_id"
+vuln = true
+note = "allocations are looked up by a userId taken from the request with no ownership check — insecure direct object reference / broken access control (OWASP A4)."
+
+[[entry]]
+path = "config/env/all.js"
+cap = "crypto"
+vuln = true
+note = "hardcoded cookieSecret / session secret committed in source — sensitive-data / weak-secret smell (OWASP A6)."
--- a/tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
+++ b/tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
--- a/tests/eval_corpus/ground_truth/railsgoat.json
+++ b/tests/eval_corpus/ground_truth/railsgoat.json
@ -0,0 +1,56 @@
+[
+  {
+    "path": "app/controllers/admin_controller.rb",
+    "line": 0,
+    "cap": "auth",
+    "vuln": true
+  },
+  {
+    "path": "app/controllers/benefit_forms_controller.rb",
+    "line": 0,
+    "cap": "deserialize",
+    "vuln": true
+  },
+  {
+    "path": "app/controllers/benefit_forms_controller.rb",
+    "line": 0,
+    "cap": "path_traversal",
+    "vuln": true
+  },
+  {
+    "path": "app/controllers/messages_controller.rb",
+    "line": 0,
+    "cap": "auth",
+    "vuln": true
+  },
+  {
+    "path": "app/controllers/password_resets_controller.rb",
+    "line": 0,
+    "cap": "crypto",
+    "vuln": true
+  },
+  {
+    "path": "app/controllers/password_resets_controller.rb",
+    "line": 0,
+    "cap": "deserialize",
+    "vuln": true
+  },
+  {
+    "path": "app/controllers/sessions_controller.rb",
+    "line": 0,
+    "cap": "redirect",
+    "vuln": true
+  },
+  {
+    "path": "app/controllers/users_controller.rb",
+    "line": 0,
+    "cap": "auth",
+    "vuln": true
+  },
+  {
+    "path": "app/models/user.rb",
+    "line": 0,
+    "cap": "crypto",
+    "vuln": true
+  }
+]
--- a/tests/eval_corpus/ground_truth/railsgoat.manifest.toml
+++ b/tests/eval_corpus/ground_truth/railsgoat.manifest.toml
@ -0,0 +1,88 @@
+# OWASP RailsGoat — curated vuln ground-truth manifest (Phase 29, Track R.2).
+#
+# RailsGoat is an intentionally-vulnerable Ruby on Rails app that maps the
+# OWASP Top 10 to concrete controllers/models.  Like NodeGoat / Juice Shop
+# (Phase 28) it ships no machine-readable per-file vuln labels, so this
+# manifest IS the authoritative source: one [[entry]] per known-vulnerable
+# location, curated from the project's own tutorial walk-throughs, each with
+# a `note` citing why.
+#
+# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
+# ground_truth/railsgoat.json.  CI regenerates it against a fresh clone of
+# the pinned tag and asserts byte-equality, and the converter HARD-ERRORS on
+# any path that no longer exists in the corpus, so a RailsGoat bump that
+# moves a controller fails the eval job loudly rather than silently dropping
+# recall.  Update `pinned_ref` + the paths together when re-pinning.
+#
+# `cap` is a nyx cap label (tabulate.py); it is aligned with how nyx
+# classifies the sink in each file (e.g. a missing ownership check on a
+# direct-object lookup surfaces as `auth`, not `unauthorized_id`), so recall
+# (did nyx catch the canonical vuln) is meaningful.  `path` is relative to
+# the RailsGoat clone root, POSIX separators.  Lang is inferred from the
+# extension (.rb -> ruby).  All `vuln = true`: RailsGoat is all-vulnerable,
+# so there is no benign-control file to pair against — precision vs this
+# manifest is informational (an unlabelled finding may be a real uncurated
+# vuln), while recall is the meaningful floor.  See
+# tests/eval_corpus/budget.toml for how the gate treats these cells.
+
+corpus = "railsgoat"
+upstream = "https://github.com/OWASP/railsgoat"
+# Pinned to the stable Rails 5 release tag (clone HEAD
+# 0766ca80bf2d94acbde1dd4aaf7baf9b86afe4eb).  The app/controllers + app/models
+# layout below has been stable across this tag; re-validate the paths if the
+# ref is bumped.
+pinned_ref = "rails.5.0.0"
+
+[[entry]]
+path = "app/controllers/users_controller.rb"
+cap = "auth"
+vuln = true
+note = "update looks up the account with User.where(\"id = '#{params[:user][:id]}'\") and mass-assigns user_params (params.require(:user).permit!) with no ownership check — broken access control / mass-assignment privilege escalation (OWASP A4/A5)."
+
+[[entry]]
+path = "app/controllers/messages_controller.rb"
+cap = "auth"
+vuln = true
+note = "show / destroy fetch Message.where(id: params[:id]) with no check that the message belongs to current_user — insecure direct object reference (OWASP A4 broken access control)."
+
+[[entry]]
+path = "app/controllers/admin_controller.rb"
+cap = "auth"
+vuln = true
+note = "administrative actions are gated by a bypassable admin_param check (params[:admin_id] != \"1\"); update_user / delete_user act on any admin_id — broken access control / privilege escalation (OWASP A5)."
+
+[[entry]]
+path = "app/models/user.rb"
+cap = "crypto"
+vuln = true
+note = "passwords are hashed with Digest::MD5.hexdigest (hash_password / authenticate) — unsalted weak hash for credential storage (OWASP A2 cryptographic failure)."
+
+[[entry]]
+path = "app/controllers/password_resets_controller.rb"
+cap = "crypto"
+vuln = true
+note = "generate_token derives the reset token as Digest::MD5.hexdigest(email) — a predictable, forgeable password-reset token (weak cryptography)."
+
+[[entry]]
+path = "app/controllers/password_resets_controller.rb"
+cap = "deserialize"
+vuln = true
+note = "reset_password runs Marshal.load(Base64.decode64(params[:user])) on attacker-controlled input — insecure deserialization leading to RCE (OWASP A8)."
+
+[[entry]]
+path = "app/controllers/sessions_controller.rb"
+cap = "redirect"
+vuln = true
+note = "create redirects to params[:url] with no allow-list (path = params[:url] then redirect_to path) — open redirect (OWASP unvalidated redirects)."
+
+[[entry]]
+path = "app/controllers/benefit_forms_controller.rb"
+cap = "path_traversal"
+vuln = true
+note = "download builds send_file from a user-controlled params[:name] path with no containment — arbitrary file read / path traversal."
+
+[[entry]]
+path = "app/controllers/benefit_forms_controller.rb"
+cap = "deserialize"
+vuln = true
+note = "download calls params[:type].constantize.new(path), constantizing a user-supplied class name — unsafe reflection / object injection."
--- a/tests/eval_corpus/ground_truth/rustsec.json
+++ b/tests/eval_corpus/ground_truth/rustsec.json
@ -0,0 +1 @@
+[]
--- a/tests/eval_corpus/ground_truth/rustsec.manifest.toml
+++ b/tests/eval_corpus/ground_truth/rustsec.manifest.toml
@ -0,0 +1,37 @@
+# RustSec advisory-db — Rust negative-control corpus (Phase 29, Track R.2).
+#
+# The plan's Rust real-corpus row is the RustSec advisory database.  Unlike
+# RailsGoat / DVWA / DVPWA / gosec, advisory-db ships advisory METADATA
+# (TOML + Markdown under crates/<crate>/RUSTSEC-*.md), not vulnerable Rust
+# SOURCE.  A static scan of it therefore encounters zero `.rs` files and nyx
+# correctly produces zero findings — so there are no source-level vuln
+# positives to label, and no canonical scannable "RustGoat" exists to
+# substitute without fabricating paths (which the CI byte-equality + path
+# existence guards would reject outright).
+#
+# advisory-db is still worth pinning and scanning as a NEGATIVE CONTROL for
+# the Rust language path:
+#   * it exercises the Rust scan + verify pipeline (Phase 23 Rust build
+#     pool) end to end on a large real-world tree (thousands of files) and
+#     asserts it stays within the wall-clock budget without crashing, and
+#   * it is an over-confirmation guard: nyx must Confirm NOTHING on a corpus
+#     with no real source vulns.  Any Confirmed finding here is provably a
+#     false confirm and trips the per-cell false_confirmed_rate budget
+#     (tests/eval_corpus/budget.toml) — a genuine regression sentinel if a
+#     future change makes nyx treat advisory text as scannable code.
+#
+# `negative_control = true` tells manifest_gt_convert.py to emit an empty
+# `[]` ground truth.  It is mutually exclusive with `[[entry]]` tables, so a
+# real Rust vuln can never be silently hidden behind the flag.  When a
+# scannable advisory-backed Rust corpus (a vulnerable crate pinned at its
+# affected version with a source-level taint sink) is curated, drop the flag
+# and add [[entry]] tables here exactly as the other Track R.2 manifests do.
+
+corpus = "rustsec"
+upstream = "https://github.com/rustsec/advisory-db"
+# advisory-db publishes no release tags; the eval job pins the default
+# branch via the CI cache key (clone HEAD
+# eaf48e749baa3d5e27d304107d8abf175fd756bb).
+pinned_ref = "main"
+
+negative_control = true
--- a/tests/eval_corpus/manifest_gt_convert.py
+++ b/tests/eval_corpus/manifest_gt_convert.py
@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+"""Convert a curated TOML vuln manifest into nyx ground-truth JSON.
+
+Used for real-world corpora that ship **no** machine-readable per-file vuln
+labels of their own (e.g. OWASP NodeGoat, OWASP Juice Shop).  OWASP
+Benchmark ships `expectedresults-1.2beta.csv` (see owasp_gt_convert.py);
+NIST SARD ships `manifest.xml` (see sard_gt_convert.py).  The corpora
+handled here have no equivalent, so the authoritative source is a curated
+manifest committed *in this repo* — one `[[entry]]` table per labelled
+location (a known vuln, or a benign control with `vuln = false`), each
+carrying a provenance `note` so a reviewer can trace why the label is what
+it is.
+
+Manifest schema (TOML)::
+
+    # provenance comments at the top
+    corpus = "nodegoat"          # informational label
+    upstream = "https://github.com/OWASP/NodeGoat"
+    pinned_ref = "master@<sha>"  # the ref the paths were curated against
+
+    [[entry]]
+    path = "app/routes/contributions.js"   # relative to the corpus root, POSIX
+    cap  = "cmdi"                           # a nyx cap label (tabulate.py)
+    vuln = true                             # true = real vuln, false = benign control
+    note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)"
+
+Negative-control corpora.  A few real corpora carry **no** scannable
+source-level vulnerabilities of their own — most notably the RustSec
+`advisory-db`, which ships advisory *metadata* (TOML/Markdown), not
+vulnerable `.rs` source.  Such a corpus has zero ground-truth positives by
+construction, yet it is still worth scanning: it exercises the language's
+scan + verify path end to end on a large real-world tree and acts as an
+over-confirmation guard (nyx must Confirm nothing on a corpus with no real
+source vulns).  Declare it with a top-level ``negative_control = true`` and
+**zero** ``[[entry]]`` tables; the converter then emits an empty ``[]``
+ground truth.  ``negative_control`` and ``[[entry]]`` are mutually
+exclusive — a manifest that sets the flag *and* lists entries is rejected,
+so a real vuln can never be silently dropped behind the flag.
+
+Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}`
+records, sorted by `(path, cap)` for deterministic, diff-stable JSON.
+`note` is intentionally dropped — the ground-truth JSON keeps the exact
+same four-field schema OWASP/SARD produce, so tabulate.py needs no special
+casing.  `line` is always 0 (the manifest pins a file, not a line;
+tabulate.py matches file+cap and treats line 0 as "any line").
+
+Path validation (the no-compromise guard).  When `--corpus-dir` is given,
+**every** manifest path must resolve to a real file under that root or the
+converter exits non-zero.  CI runs the converter against a fresh clone of
+the pinned corpus and then asserts the committed JSON byte-matches the
+regenerated JSON, so a corpus bump that moves/renames/deletes a labelled
+file (or a typo'd path) fails the build loudly instead of silently
+degrading recall.  Authoring the committed JSON offline (no corpus on
+hand) is done by omitting `--corpus-dir`: the transform is identical, only
+the existence check is skipped.
+
+Usage::
+
+    # author / regenerate the committed JSON offline (no validation):
+    tests/eval_corpus/manifest_gt_convert.py \\
+        --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
+        --output   tests/eval_corpus/ground_truth/nodegoat.json
+
+    # CI: validate every path against a real checkout, then diff vs committed:
+    tests/eval_corpus/manifest_gt_convert.py \\
+        --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
+        --corpus-dir ~/.cache/nyx/eval_corpus/nodegoat \\
+        --output   /tmp/nodegoat_regen.json
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+try:
+    import tomllib  # Python 3.11+
+except ModuleNotFoundError:  # pragma: no cover — older interpreters only
+    import tomli as tomllib  # type: ignore[no-redef]
+
+# The nyx cap labels a manifest entry may use (see tabulate.py
+# _CAP_BIT_TABLE / _CAP_RULE_TABLE).  An unknown cap is almost always a
+# typo and would never match a finding, silently sinking recall, so the
+# converter rejects it up front.  Listed alphabetically; membership is all
+# that matters.
+VALID_CAPS = {
+    "auth",
+    "cmdi",
+    "crypto",
+    "data_exfil",
+    "deserialize",
+    "fmt_string",
+    "header_injection",
+    "ldap_injection",
+    "memory",
+    "path_traversal",
+    "prototype_pollution",
+    "redirect",
+    "sqli",
+    "ssrf",
+    "unauthorized_id",
+    "validation",
+    "xpath_injection",
+    "xss",
+    "xxe",
+}
+
+
+def load_manifest(path: Path) -> dict:
+    """Parse the TOML manifest at *path* and return it as a dict.
+
+    A missing file or a TOML syntax error is reported as a one-line
+    diagnostic on stderr and terminates the process with exit status 1.
+    """
+    try:
+        with open(path, "rb") as handle:
+            parsed = tomllib.load(handle)
+    except FileNotFoundError:
+        print(f"error: manifest not found: {path}", file=sys.stderr)
+        raise SystemExit(1)
+    except tomllib.TOMLDecodeError as exc:
+        print(f"error: manifest malformed: {path}: {exc}", file=sys.stderr)
+        raise SystemExit(1)
+    else:
+        return parsed
+
+def main() -> int:
+    """Convert the manifest named on the command line into ground-truth JSON.
+
+    Reads ``--manifest``, validates every ``[[entry]]`` table, optionally
+    checks each path against ``--corpus-dir``, and writes the sorted
+    ``{path, line, cap, vuln}`` records to ``--output``.
+
+    Returns the process exit status:
+      0  success;
+      1  malformed manifest (bad/missing fields, unknown cap, duplicate
+         entry, non-relative path, negative_control misuse) or a
+         ``--corpus-dir`` that is not a directory;
+      2  ``--corpus-dir`` was given and one or more manifest paths are not
+         files under it.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--manifest", required=True, help="curated TOML manifest path")
+    p.add_argument("--output", required=True, help="output ground-truth JSON path")
+    p.add_argument(
+        "--corpus-dir",
+        default="",
+        help=(
+            "when set, every manifest path must resolve to a real file under "
+            "this root or the converter exits 2 (the CI corpus-drift guard)"
+        ),
+    )
+    args = p.parse_args()
+
+    manifest = load_manifest(Path(args.manifest).expanduser())
+    entries = manifest.get("entry", []) or []
+    if not isinstance(entries, list):
+        # `entry = "..."` (a scalar) instead of [[entry]] tables would
+        # otherwise crash on the per-entry `.get` calls below.
+        print(
+            f"error: `entry` must be an array of [[entry]] tables: {args.manifest}",
+            file=sys.stderr,
+        )
+        return 1
+    negative_control = bool(manifest.get("negative_control", False))
+    if negative_control and entries:
+        print(
+            f"error: negative_control manifest must declare zero [[entry]] "
+            f"tables (found {len(entries)}): {args.manifest}",
+            file=sys.stderr,
+        )
+        return 1
+    if not entries and not negative_control:
+        print(f"error: manifest has no [[entry]] tables: {args.manifest}", file=sys.stderr)
+        return 1
+
+    corpus = Path(args.corpus_dir).expanduser().resolve() if args.corpus_dir else None
+    if corpus is not None and not corpus.is_dir():
+        print(f"error: corpus dir not found: {args.corpus_dir}", file=sys.stderr)
+        return 1
+
+    records: list[dict] = []
+    missing: list[str] = []
+    seen: set[tuple[str, str]] = set()
+    for i, e in enumerate(entries):
+        fields = e if isinstance(e, dict) else {}
+        path = fields.get("path")
+        cap = fields.get("cap")
+        vuln = fields.get("vuln")
+        # Enforce the types the message promises: a non-string path would
+        # crash in `.replace` and an unhashable cap in the set lookup.
+        if (
+            not isinstance(path, str)
+            or not path
+            or not isinstance(cap, str)
+            or not cap
+            or not isinstance(vuln, bool)
+        ):
+            print(
+                f"error: entry #{i} needs string path, string cap, bool vuln: {e!r}",
+                file=sys.stderr,
+            )
+            return 1
+        if cap not in VALID_CAPS:
+            print(
+                f"error: entry #{i} cap {cap!r} is not a known nyx cap "
+                f"(path {path!r}); fix the manifest",
+                file=sys.stderr,
+            )
+            return 1
+        norm = path.replace("\\", "/")
+        # Manifest paths are relative to the corpus root.  An absolute path
+        # or a `..` segment could satisfy the existence check below while
+        # pointing outside the corpus, so reject it in every mode.
+        if norm.startswith("/") or ".." in norm.split("/"):
+            print(
+                f"error: entry #{i} path {norm!r} must be relative to the "
+                f"corpus root (no leading '/' or '..' segments)",
+                file=sys.stderr,
+            )
+            return 1
+        key = (norm, cap)
+        if key in seen:
+            print(
+                f"error: duplicate (path, cap) entry: {norm!r} / {cap!r}",
+                file=sys.stderr,
+            )
+            return 1
+        seen.add(key)
+        # Collect every missing path rather than stopping at the first, so
+        # one run reports the full extent of the corpus drift.
+        if corpus is not None and not (corpus / norm).is_file():
+            missing.append(norm)
+        records.append({"path": norm, "line": 0, "cap": cap, "vuln": vuln})
+
+    if missing:
+        print(
+            f"error: {len(missing)} manifest path(s) absent from {corpus} "
+            f"(corpus drift or typo) — regenerate the manifest against the "
+            f"pinned ref:",
+            file=sys.stderr,
+        )
+        for m in missing:
+            print(f"  missing: {m}", file=sys.stderr)
+        return 2
+
+    # Deterministic order so the committed JSON is diff-stable and the CI
+    # byte-equality guard is meaningful regardless of manifest ordering.
+    records.sort(key=lambda r: (r["path"], r["cap"]))
+
+    out = Path(args.output).expanduser().resolve()
+    out.parent.mkdir(parents=True, exist_ok=True)
+    # Pin encoding and line endings: the CI guard compares bytes, so the
+    # output must not vary with the platform's locale or newline translation.
+    with open(out, "w", encoding="utf-8", newline="\n") as f:
+        json.dump(records, f, indent=2)
+        f.write("\n")
+
+    vuln_count = sum(1 for r in records if r["vuln"])
+    print(f"wrote {len(records)} records to {out}")
+    if negative_control:
+        print("  negative-control corpus: zero ground-truth positives by construction")
+    print(f"  vulns:    {vuln_count}")
+    print(f"  non-vuln: {len(records) - vuln_count}")
+    if corpus is not None:
+        print(f"  validated against: {corpus}")
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/tests/eval_corpus/owasp_gt_convert.py
+++ b/tests/eval_corpus/owasp_gt_convert.py
@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Convert OWASP Benchmark v1.2 expectedresults-*.csv into nyx ground-truth JSON.
+
+Source: `expectedresults-1.2beta.csv` shipped in the BenchmarkJava repo.
+Output: list of `{path, line, cap, vuln}` records, where:
+  - `path` is the BenchmarkTest*.java path **relative to --corpus-dir**, with
+    POSIX separators (e.g. `src/main/java/org/owasp/benchmark/testcode/
+    BenchmarkTest00001.java`).  Relative paths keep the committed ground truth
+    portable: `tabulate.py` suffix-matches them against the absolute paths nyx
+    emits, so the same JSON works on the dev laptop and on CI regardless of
+    where the corpus was cloned.
+  - `line` is 0 (CSV does not pin a line; tabulate uses LINE_TOLERANCE on findings).
+  - `cap` is a nyx cap label mapped from the OWASP category column.
+  - `vuln` is True for `real vulnerability == true`, else False.
+
+Usage:
+  tests/eval_corpus/owasp_gt_convert.py \\
+      --corpus-dir ~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2 \\
+      --output     tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
+"""
+
+import argparse
+import csv
+import json
+import sys
+from pathlib import Path
+
+# Maps the category column of the expected-results CSV to a nyx cap label.
+# main() counts a row whose category is not a key here as skipped.  The
+# mapping is many-to-one: crypto / hash / weakrand all land on "crypto", and
+# trustbound / xss both land on "xss".
+OWASP_TO_NYX_CAP = {
+    "cmdi":        "cmdi",
+    "crypto":      "crypto",
+    "hash":        "crypto",
+    "ldapi":       "ldap_injection",
+    "pathtraver":  "path_traversal",
+    "securecookie": "auth",
+    "sqli":        "sqli",
+    "trustbound":  "xss",
+    "weakrand":    "crypto",
+    "xpathi":      "xpath_injection",
+    "xss":         "xss",
+}
+
+
+def main() -> int:
+    """Convert the OWASP expected-results CSV into nyx ground-truth JSON.
+
+    Reads ``--csv`` (default ``<corpus-dir>/expectedresults-1.2beta.csv``),
+    maps each row's category through ``OWASP_TO_NYX_CAP`` and writes the
+    resulting ``{path, line, cap, vuln}`` records to ``--output`` in CSV row
+    order.  Rows with an unmapped category, or without a matching
+    ``<name>.java`` in the testcode directory, are counted as skipped; rows
+    with fewer than three columns are ignored without being counted.
+
+    Returns:
+        0 on success, 1 when the CSV or the testcode directory is missing.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--corpus-dir", required=True,
+                   help="Path to BenchmarkJava clone root.")
+    p.add_argument("--output", required=True,
+                   help="Output ground-truth JSON path.")
+    p.add_argument("--csv", default="",
+                   help="Override CSV path (default: <corpus-dir>/expectedresults-1.2beta.csv).")
+    args = p.parse_args()
+
+    corpus = Path(args.corpus_dir).expanduser().resolve()
+    # Expand `~` in --csv too, matching --corpus-dir and --output.
+    if args.csv:
+        csv_path = Path(args.csv).expanduser()
+    else:
+        csv_path = corpus / "expectedresults-1.2beta.csv"
+    if not csv_path.exists():
+        print(f"error: csv not found: {csv_path}", file=sys.stderr)
+        return 1
+
+    java_root = corpus / "src" / "main" / "java" / "org" / "owasp" / "benchmark" / "testcode"
+    if not java_root.is_dir():
+        print(f"error: java testcode dir not found: {java_root}", file=sys.stderr)
+        return 1
+
+    records: list[dict] = []
+    skipped = 0
+    # newline="" lets the csv module do its own line-ending handling (as the
+    # csv documentation requires); the explicit encoding keeps parsing
+    # independent of the machine's locale.
+    with open(csv_path, newline="", encoding="utf-8") as f:
+        reader = csv.reader(f)
+        next(reader, None)  # drop the header row
+        for row in reader:
+            if len(row) < 3:
+                continue
+            name, category, real_vuln = row[0].strip(), row[1].strip(), row[2].strip().lower()
+            cap = OWASP_TO_NYX_CAP.get(category)
+            if cap is None:
+                skipped += 1
+                continue
+            java_file = java_root / f"{name}.java"
+            if not java_file.exists():
+                skipped += 1
+                continue
+            records.append({
+                # Stored relative to the corpus root with POSIX separators.
+                "path": java_file.relative_to(corpus).as_posix(),
+                "line": 0,
+                "cap":  cap,
+                "vuln": real_vuln == "true",
+            })
+
+    out = Path(args.output).expanduser().resolve()
+    out.parent.mkdir(parents=True, exist_ok=True)
+    # json.dump escapes non-ASCII by default, so the explicit encoding does not
+    # change the bytes written.
+    with open(out, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2)
+
+    vuln_count = sum(1 for r in records if r["vuln"])
+    print(f"wrote {len(records)} records to {out}")
+    print(f"  vulns:    {vuln_count}")
+    print(f"  non-vuln: {len(records) - vuln_count}")
+    print(f"  skipped:  {skipped}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/eval_corpus/report.py
+++ b/tests/eval_corpus/report.py
@ -0,0 +1,483 @@
+#!/usr/bin/env python3
+"""
+Aggregate eval results across all corpus sets and emit a summary table.
+Used by run.sh after all corpus sets have been tabulated.
+
+Phase 29 (Track I) extensions:
+  --budget tests/eval_corpus/budget.toml   per-cell budget enforcement
+  --diff   previous.json                   monotonic-improvement diff;
+                                           CI fails on any regression.
+"""
+
+import argparse
+import json
+import os
+import sys
+from collections import defaultdict
+
+try:
+    import tomllib  # Python 3.11+
+except ModuleNotFoundError:  # pragma: no cover — older interpreters only
+    import tomli as tomllib  # type: ignore[no-redef]
+
+# Caps with no sound runtime oracle: config / usage smells (weak crypto,
+# insecure-cookie auth, reflected XSS / trust-boundary) route to
+# Unsupported(SoundOracleUnavailable) by design, and the catch-all `other`
+# bucket holds unclassified findings with no curated payloads.  Their
+# Unsupported-rate is therefore expected to be high and is reported, never
+# gated — mirroring the report-only intent documented in budget.toml.
+# main() consults this set when a cell exceeds its Unsupported-rate budget: a
+# cap listed here is printed as a "soft" overrun and does not fail the gate.
+NO_SOUND_ORACLE_CAPS = {"auth", "crypto", "xss", "trustbound", "other"}
+
+
+def _soft_unsupported() -> bool:
+    """Report whether the per-cell Unsupported-rate budget is report-only.
+
+    The switch is the `NYX_EVAL_SOFT_UNSUPPORTED` environment variable: the
+    values 1 / true / yes / on (any case, surrounding whitespace ignored) turn
+    it on, anything else — including an unset variable — leaves the budget
+    hard.  CI sets it because dynamic confirmation is environment-constrained
+    there, so a budget calibrated on a dev box would fail vacuously; the
+    false-Confirmed and confirmed-rate ratchets are unaffected by this switch.
+    """
+    raw_flag = os.environ.get("NYX_EVAL_SOFT_UNSUPPORTED", "")
+    normalized = raw_flag.strip().lower()
+    return normalized in ("1", "true", "yes", "on")
+
+
+def load_budget(path: str) -> dict:
+    """Load a budget TOML file.
+
+    Args:
+        path: location of the budget file.
+
+    Returns:
+        ``{"default": <[default] table>, "cells": {(cap, lang): <row>}}``.
+        Each row is the full ``[[cell]]`` table, including its ``cap`` and
+        ``lang`` keys.  When two rows share a (cap, lang) pair the later one
+        replaces the earlier one.
+
+    Exits the process with status 3 when the file is missing, is not valid
+    TOML, has a ``default`` / ``cell`` section of the wrong shape, or contains
+    a cell row without ``cap`` or ``lang``.
+    """
+    try:
+        with open(path, "rb") as f:
+            raw = tomllib.load(f)
+    except FileNotFoundError:
+        print(f"ERROR  budget file not found: {path}", file=sys.stderr)
+        sys.exit(3)
+    except tomllib.TOMLDecodeError as e:
+        print(f"ERROR  budget file malformed: {path}: {e}", file=sys.stderr)
+        sys.exit(3)
+    default = raw.get("default", {}) or {}
+    rows = raw.get("cell", []) or []
+    # `[cell]` (a single table) instead of `[[cell]]` (an array of tables) is
+    # valid TOML but the wrong shape; report it as malformed config rather
+    # than crashing on `row.get` below.
+    if not isinstance(default, dict) or not isinstance(rows, list):
+        print(
+            f"ERROR  budget file malformed: {path}: `default` must be a table "
+            f"and `cell` an array of tables",
+            file=sys.stderr,
+        )
+        sys.exit(3)
+    cells = {}
+    for row in rows:
+        if not isinstance(row, dict):
+            print(
+                f"ERROR  budget file malformed: {path}: cell entry is not a "
+                f"table: {row!r}",
+                file=sys.stderr,
+            )
+            sys.exit(3)
+        cap = row.get("cap")
+        lang = row.get("lang")
+        if not cap or not lang:
+            print(f"ERROR  budget cell missing cap/lang: {row!r}", file=sys.stderr)
+            sys.exit(3)
+        cells[(cap, lang)] = row
+    return {"default": default, "cells": cells}
+
+
+def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
+    """Resolve the effective thresholds for one (cap, lang) cell.
+
+    Starts from a copy of the ``default`` row and overlays exactly one
+    override row: the exact ``(cap, lang)`` entry if there is one, otherwise
+    the first wildcard entry found in the order ``(cap, "*")``,
+    ``("*", lang)``, ``("*", "*")``.  The ``cap`` / ``lang`` keys of the
+    override row are not copied into the result.
+    """
+    cells = budget.get("cells", {})
+    thresholds = dict(budget.get("default", {}) or {})
+
+    override = cells.get((cap, lang))
+    if not override:
+        # No exact row: fall back to the most specific wildcard available.
+        for key in ((cap, "*"), ("*", lang), ("*", "*")):
+            override = cells.get(key)
+            if override:
+                break
+
+    if override:
+        for name, value in override.items():
+            if name not in ("cap", "lang"):
+                thresholds[name] = value
+    return thresholds
+
+
+def load_previous_agg(path: str) -> dict:
+    """Aggregate a previous results file the same way main() does.
+
+    Args:
+        path: JSON file holding a list of per-set results, each with an
+            optional ``cells`` list whose entries carry ``cap``, ``lang`` and
+            integer counters.
+
+    Returns:
+        A mapping keyed by ``(cap, lang)`` whose values are the counters
+        summed over every set.  A counter missing from a cell counts as 0.
+
+    Exits the process with status 3 when the file is missing, is not valid
+    JSON, or does not have the structure described above.
+    """
+    try:
+        with open(path) as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        print(f"ERROR  diff file not found: {path}", file=sys.stderr)
+        sys.exit(3)
+    except json.JSONDecodeError as e:
+        print(f"ERROR  diff file malformed: {path}: {e}", file=sys.stderr)
+        sys.exit(3)
+    fields = (
+        "tp",
+        "fp",
+        "fn",
+        "unsupported",
+        "confirmed",
+        "partially_confirmed",
+        "wrong_confirmed",
+        "stable_replays",
+        "confirmed_tp",
+        "confirmed_fp",
+        "total",
+    )
+    agg: dict[tuple[str, str], dict] = defaultdict(
+        lambda: dict.fromkeys(fields, 0)
+    )
+    try:
+        for r in data:
+            for c in r.get("cells", []):
+                k = (c["cap"], c["lang"])
+                for field in fields:
+                    agg[k][field] += c.get(field, 0)
+    except (AttributeError, KeyError, TypeError) as e:
+        # Valid JSON of the wrong shape (an object instead of a list, a cell
+        # without cap/lang, a non-numeric counter) is a malformed input, so it
+        # gets the same exit status as a decode error instead of a traceback.
+        print(
+            f"ERROR  diff file malformed: {path}: unexpected structure ({e!r})",
+            file=sys.stderr,
+        )
+        sys.exit(3)
+    return agg
+
+
+def main() -> int:
+    """Print the aggregated eval report and evaluate every configured gate.
+
+    Sums the per-cell counters of every set in ``--results`` by (cap, lang),
+    prints the aggregate table, then runs in order: the per-cell budget check
+    (``--budget``; without it a legacy 80% Unsupported ceiling), the per-cap
+    Confirmed-rate table (``--min-confirmed-rate``), the per-cap
+    precision/recall table (``--min-precision`` / ``--min-recall``) and the
+    monotonic-improvement diff (``--diff``).
+
+    Returns:
+        0 when the results list is empty or no gate failed, 2 when at least
+        one gate failed.  Budget/diff loading errors exit with status 3 from
+        inside ``load_budget`` / ``load_previous_agg``.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--results", required=True)
+    p.add_argument(
+        "--budget",
+        default="",
+        help="path to budget.toml (per-(cap,lang) thresholds)",
+    )
+    p.add_argument(
+        "--diff",
+        default="",
+        help="path to a previous results.json; fail on monotonic-improvement regression",
+    )
+    p.add_argument(
+        "--min-confirmed-rate",
+        type=float,
+        default=None,
+        help=(
+            "minimum Confirmed / total rate per cap; exits 2 when any cap "
+            "with findings falls below the threshold"
+        ),
+    )
+    p.add_argument(
+        "--min-precision",
+        type=float,
+        default=None,
+        help=(
+            "minimum precision (tp / (tp+fp)) per cap; exits 2 when any cap "
+            "with at least one finding falls below the threshold. Phase 27 "
+            "OWASP acceptance floor (>= 0.85)."
+        ),
+    )
+    p.add_argument(
+        "--min-recall",
+        type=float,
+        default=None,
+        help=(
+            "minimum recall (tp / (tp+fn)) per cap; exits 2 when any cap "
+            "with at least one ground-truth positive falls below the "
+            "threshold. Phase 27 OWASP acceptance floor (>= 0.40)."
+        ),
+    )
+    p.add_argument(
+        "--floor-caps",
+        default="",
+        help=(
+            "comma-separated cap allowlist. When set, the --min-confirmed-rate, "
+            "--min-precision and --min-recall floors are ENFORCED only for these "
+            "caps; other caps are still measured and printed but not gated. Used "
+            "to exempt caps with no sound runtime oracle (e.g. crypto weak "
+            "randomness, secure-cookie config smells) from dynamic-confirmation "
+            "floors that they fundamentally cannot meet. Empty = gate every cap."
+        ),
+    )
+    args = p.parse_args()
+    # Empty set means "no allowlist": every cap is in scope for the floors.
+    floor_caps = {c.strip() for c in args.floor_caps.split(",") if c.strip()}
+
+    with open(args.results) as f:
+        results = json.load(f)
+
+    # An empty results list short-circuits before any gate is evaluated.
+    if not results:
+        print("No results to report.")
+        return 0
+
+    # Aggregate across sets.
+    agg: dict[tuple[str, str], dict] = defaultdict(
+        lambda: {
+            "tp": 0,
+            "fp": 0,
+            "fn": 0,
+            "unsupported": 0,
+            "confirmed": 0,
+            "partially_confirmed": 0,
+            "wrong_confirmed": 0,
+            "stable_replays": 0,
+            "confirmed_tp": 0,
+            "confirmed_fp": 0,
+            "total": 0,
+        }
+    )
+    for r in results:
+        for c in r.get("cells", []):
+            k = (c["cap"], c["lang"])
+            for field in (
+                "tp",
+                "fp",
+                "fn",
+                "unsupported",
+                "confirmed",
+                "partially_confirmed",
+                "wrong_confirmed",
+                "stable_replays",
+                "confirmed_tp",
+                "confirmed_fp",
+                "total",
+            ):
+                agg[k][field] += c.get(field, 0)
+
+    print("\n=== Aggregated eval corpus report ===")
+    print(
+        f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} "
+        f"{'Prec':>6} {'Rec':>6} {'Unsup%':>7} {'Conf%':>7} {'Part%':>7}"
+    )
+    print("-" * 88)
+    for k, v in sorted(agg.items()):
+        # max(..., 1) keeps the display ratios defined when a denominator is 0.
+        prec = v["tp"] / max(v["tp"] + v["fp"], 1)
+        rec = v["tp"] / max(v["tp"] + v["fn"], 1)
+        unsup = v["unsupported"] / max(v["total"], 1)
+        conf = v["confirmed"] / max(v["total"], 1)
+        part = v["partially_confirmed"] / max(v["total"], 1)
+        print(
+            f"{k[0]:<20} {k[1]:<12} "
+            f"{v['tp']:>5} {v['fp']:>5} {v['fn']:>5} "
+            f"{prec:>6.2f} {rec:>6.2f} "
+            f"{unsup*100:>6.1f}% {conf*100:>6.1f}% {part*100:>6.1f}%"
+        )
+
+    # Set by any failing gate below; decides the return value.
+    gate_failed = False
+
+    # ── Phase 29: per-cell budget enforcement ────────────────────────────
+    if args.budget:
+        budget = load_budget(args.budget)
+        print(f"\n=== Per-cell budget ({args.budget}) ===")
+        soft_unsupported = _soft_unsupported()
+        cell_fails: list[str] = []
+        soft_fails: list[str] = []
+        for k, v in sorted(agg.items()):
+            b = budget_for_cell(budget, k[0], k[1])
+            if not b:
+                continue
+            max_unsup = b.get("unsupported_rate")
+            max_false = b.get("false_confirmed_rate")
+            min_stable = b.get("repro_stability")
+            min_confirmed = b.get("confirmed_rate")
+
+            # Each threshold is checked only when it is numeric and its
+            # denominator is non-zero.
+            if isinstance(max_unsup, (int, float)) and v["total"] > 0:
+                rate = v["unsupported"] / v["total"]
+                if rate > max_unsup:
+                    msg = (
+                        f"{k[0]}/{k[1]}: Unsupported {rate*100:.1f}%"
+                        f" > budget {max_unsup*100:.1f}%"
+                    )
+                    # Report-only for no-sound-oracle caps, or for every cap
+                    # when NYX_EVAL_SOFT_UNSUPPORTED is enabled.
+                    if k[0] in NO_SOUND_ORACLE_CAPS or soft_unsupported:
+                        soft_fails.append(f"  soft  {msg}")
+                    else:
+                        cell_fails.append(f"  FAIL  {msg}")
+            if isinstance(max_false, (int, float)) and v["confirmed"] > 0:
+                rate = v["wrong_confirmed"] / v["confirmed"]
+                if rate > max_false:
+                    cell_fails.append(
+                        f"  FAIL  {k[0]}/{k[1]}: false-Confirmed {rate*100:.1f}%"
+                        f" > budget {max_false*100:.1f}%"
+                    )
+            # NOTE(review): a cell with Confirmed findings but zero stable
+            # replays is skipped rather than failed — presumably because
+            # stable_replays == 0 means stability was not measured; confirm
+            # against the producer of the results file.
+            if (
+                isinstance(min_stable, (int, float))
+                and v["confirmed"] > 0
+                and v.get("stable_replays", 0) > 0
+            ):
+                rate = v["stable_replays"] / v["confirmed"]
+                if rate < min_stable:
+                    cell_fails.append(
+                        f"  FAIL  {k[0]}/{k[1]}: repro stability {rate*100:.1f}%"
+                        f" < budget {min_stable*100:.1f}%"
+                    )
+            if isinstance(min_confirmed, (int, float)) and v["total"] > 0:
+                rate = v["confirmed"] / v["total"]
+                if rate < min_confirmed:
+                    cell_fails.append(
+                        f"  FAIL  {k[0]}/{k[1]}: Confirmed {rate*100:.1f}%"
+                        f" < budget {min_confirmed*100:.1f}%"
+                    )
+        # Soft overruns are printed but never set gate_failed.
+        if soft_fails:
+            print(
+                "  Unsupported-rate over budget (report-only: no-sound-oracle "
+                "cap or environment-constrained dynamic confirmation):"
+            )
+            for line in soft_fails:
+                print(line)
+        if cell_fails:
+            for line in cell_fails:
+                print(line)
+            gate_failed = True
+        else:
+            print("  All hard per-cell budgets met.")
+    else:
+        # Legacy fallback: per-cap Unsupported rate <= 80%.
+        print("\n=== Gate checks ===")
+        UNSUPPORTED_BUDGET = 0.80
+        cell_fails: list[str] = []
+        for k, v in sorted(agg.items()):
+            unsup = v["unsupported"] / max(v["total"], 1)
+            if unsup > UNSUPPORTED_BUDGET:
+                cell_fails.append(
+                    f"  FAIL  {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}%"
+                    f" > {UNSUPPORTED_BUDGET*100:.0f}% budget"
+                )
+        if cell_fails:
+            for line in cell_fails:
+                print(line)
+            gate_failed = True
+        else:
+            print("  All gate thresholds met.")
+
+    # ── Per-cap Confirmed-rate (published always; gated when a floor given) ──
+    # Aggregated per cap across languages.  The table is always printed so the
+    # corpus's confirmation profile is visible ("publish per-cap …"); the floor
+    # only FAILS the run when --min-confirmed-rate is supplied and the cap is in
+    # scope (floor_caps empty = every cap in scope).
+    cap_totals: dict[str, dict] = defaultdict(lambda: {"confirmed": 0, "total": 0})
+    for (cap, _lang), v in agg.items():
+        cap_totals[cap]["confirmed"] += v.get("confirmed", 0)
+        cap_totals[cap]["total"] += v.get("total", 0)
+    if cap_totals:
+        floor_txt = (
+            f" (floor {args.min_confirmed_rate*100:.1f}%)"
+            if args.min_confirmed_rate is not None
+            else " (report-only)"
+        )
+        print(f"\n=== Per-cap Confirmed-rate{floor_txt} ===")
+        confirmed_fails: list[str] = []
+        for cap, v in sorted(cap_totals.items()):
+            if v["total"] <= 0:
+                continue
+            rate = v["confirmed"] / v["total"]
+            gated = args.min_confirmed_rate is not None and (
+                (not floor_caps) or (cap in floor_caps)
+            )
+            line = (
+                f"  {cap:<20} {v['confirmed']:>5}/{v['total']:<5} "
+                f"{rate*100:>6.1f}%"
+            )
+            # FAIL rows are collected here and printed after the other rows.
+            if gated and rate < args.min_confirmed_rate:
+                confirmed_fails.append(f"{line}  FAIL")
+            elif args.min_confirmed_rate is None:
+                print(line)
+            else:
+                print(f"{line}  {'OK' if gated else 'skip (no floor)'}")
+        if confirmed_fails:
+            for line in confirmed_fails:
+                print(line)
+            gate_failed = True
+        elif args.min_confirmed_rate is not None:
+            print("  All confirmed-rate floors met.")
+
+    # ── Per-cap precision / recall (published always; gated when a floor given) ──
+    # OWASP acceptance: per-cap precision ≥ 0.85, recall ≥ 0.40.  Aggregated per
+    # cap across languages (tp/fp/fn summed over every lang cell).  The table is
+    # always printed ("publish per-cap precision/recall"); a cap FAILS only when
+    # the matching --min-* floor is supplied and the cap is in scope (floor_caps
+    # empty = every cap in scope).
+    cap_pr: dict[str, dict] = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
+    for (cap, _lang), v in agg.items():
+        cap_pr[cap]["tp"] += v.get("tp", 0)
+        cap_pr[cap]["fp"] += v.get("fp", 0)
+        cap_pr[cap]["fn"] += v.get("fn", 0)
+    if cap_pr:
+        floors = []
+        if args.min_precision is not None:
+            floors.append(f"precision ≥ {args.min_precision*100:.1f}%")
+        if args.min_recall is not None:
+            floors.append(f"recall ≥ {args.min_recall*100:.1f}%")
+        floor_txt = f" (floors: {', '.join(floors)})" if floors else " (report-only)"
+        print(f"\n=== Per-cap precision/recall{floor_txt} ===")
+        print(f"  {'Cap':<20} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>7} {'Rec':>7}  Status")
+        pr_failed = False
+        # True once at least one in-scope cap passed, so the closing "all met"
+        # line is printed only when something was actually gated.
+        any_gated = False
+        for cap, v in sorted(cap_pr.items()):
+            tp, fp, fn = v["tp"], v["fp"], v["fn"]
+            # No findings and no GT positives → cap not present in this corpus.
+            if tp + fp + fn == 0:
+                continue
+            prec = tp / max(tp + fp, 1)
+            rec = tp / max(tp + fn, 1)
+            gated = (not floor_caps) or (cap in floor_caps)
+            tags = []
+            # A floor applies only when its denominator is non-zero.
+            if gated and args.min_precision is not None and (tp + fp) > 0 and prec < args.min_precision:
+                tags.append("PRECISION")
+            if gated and args.min_recall is not None and (tp + fn) > 0 and rec < args.min_recall:
+                tags.append("RECALL")
+            if tags:
+                status = "FAIL " + "+".join(tags)
+            elif not floors:
+                status = "—"
+            elif gated:
+                status = "OK"
+                any_gated = True
+            else:
+                status = "skip (no floor)"
+            print(
+                f"  {cap:<20} {tp:>5} {fp:>5} {fn:>5} "
+                f"{prec:>7.2f} {rec:>7.2f}  {status}"
+            )
+            if tags:
+                pr_failed = True
+        if pr_failed:
+            gate_failed = True
+        elif floors and any_gated:
+            print("  All per-cap precision/recall floors met.")
+
+    # ── Phase 29: monotonic-improvement diff ─────────────────────────────
+    if args.diff:
+        prev = load_previous_agg(args.diff)
+        print(f"\n=== Monotonic-improvement diff vs {args.diff} ===")
+        diff_fails: list[str] = []
+        # Tolerance: a rate must move by more than 0.5 percentage points in
+        # the wrong direction to count as a regression.
+        EPS = 0.005
+        for k, v in sorted(agg.items()):
+            old = prev.get(k)
+            # Cells absent from the previous run have no baseline to compare.
+            if not old:
+                continue
+            old_unsup = old["unsupported"] / max(old["total"], 1)
+            new_unsup = v["unsupported"] / max(v["total"], 1)
+            if new_unsup > old_unsup + EPS:
+                diff_fails.append(
+                    f"  REGRESSION  {k[0]}/{k[1]}: Unsupported"
+                    f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
+                )
+            old_conf = old.get("confirmed", 0)
+            new_conf = v.get("confirmed", 0)
+            # The Confirmed-based rates are compared only when both runs have
+            # at least one Confirmed finding in the cell.
+            old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None
+            new_false = (v.get("wrong_confirmed", 0) / new_conf) if new_conf else None
+            if old_false is not None and new_false is not None and new_false > old_false + EPS:
+                diff_fails.append(
+                    f"  REGRESSION  {k[0]}/{k[1]}: false-Confirmed"
+                    f" {old_false*100:.1f}% → {new_false*100:.1f}%"
+                )
+            # NOTE(review): unlike the budget check above, this comparison
+            # still runs when stable_replays is 0 — confirm that is intended.
+            old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None
+            new_stable = (v.get("stable_replays", 0) / new_conf) if new_conf else None
+            if (
+                old_stable is not None
+                and new_stable is not None
+                and new_stable < old_stable - EPS
+            ):
+                diff_fails.append(
+                    f"  REGRESSION  {k[0]}/{k[1]}: repro stability"
+                    f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
+                )
+        if diff_fails:
+            for line in diff_fails:
+                print(line)
+            gate_failed = True
+        else:
+            print("  No regressions vs previous run.")
+
+    return 2 if gate_failed else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/eval_corpus/run.sh
+++ b/tests/eval_corpus/run.sh
@ -0,0 +1,300 @@
+#!/usr/bin/env bash
+# Eval corpus runner.
+#
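+# Usage:
+#   tests/eval_corpus/run.sh [--output DIR] [--nyx BIN] [--sets owasp,sard,inhouse]
+#                            [--budget FILE] [--diff FILE]
+#
+# Runs `nyx scan --verify` on every selected corpus set that is present in the
+# cache (OWASP Benchmark v1.2, the NIST SARD subset, NodeGoat, Juice Shop,
+# RailsGoat, DVWA, DVPWA, gosec, RustSec) and on the in-repo Nyx benchmark
+# fixtures. A set that has not been downloaded is skipped after printing how
+# to fetch it. Emits a per-cell (cap x language) precision/recall table and
+# the per-cap Unsupported rate to stdout, and writes the raw results JSON
+# under --output DIR if given.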
+#
+# Environment:
+#   NYX_EVAL_CORPUS_DIR  - path to pre-downloaded corpus roots
+#                          (default: ~/.cache/nyx/eval_corpus)
+#   NYX_BIN              - path to nyx binary (default: ./target/release/nyx)
+#
+# Exit codes:
+#   0 - all budget thresholds met
+#   1 - setup or I/O error
+#   2 - one or more budget thresholds exceeded (see output for details)
+#   3 - budget/diff input missing or malformed
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# Defaults
+OUTPUT_DIR=""
+NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
+CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
+SETS="owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse"
+# Optional per-cell budgets and monotonic-improvement diff.
+BUDGET_FILE=""
+DIFF_FILE=""
+
+# Parse command-line flags.  Every recognised flag consumes the following word
+# as its value; any unrecognised argument is discarded without a diagnostic.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --output) OUTPUT_DIR="$2"; shift 2 ;;
+    --nyx)    NYX_BIN="$2"; shift 2 ;;
+    --sets)   SETS="$2"; shift 2 ;;
+    --budget) BUDGET_FILE="$2"; shift 2 ;;
+    --diff)   DIFF_FILE="$2"; shift 2 ;;
+    *)        shift ;;
+  esac
+done
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+die()  { echo "error: $*" >&2; exit 1; }
+info() { echo "[eval] $*"; }
+
+require_cmd() { command -v "$1" >/dev/null 2>&1 || die "required command not found: $1"; }
+require_cmd jq
+require_cmd python3
+
+# Scan one ground-truth-labelled real corpus (NodeGoat / Juice Shop) and
+# tabulate it against its committed ground truth.  Self-skips when the
+# corpus has not been cloned into the cache.
+run_jsts_corpus() {
+  # $1 label  $2 corpus directory  $3 ground-truth json
+  # Reads the globals NYX_BIN, SCRIPT_DIR, RESULTS_JSON, BUDGET_FILE and
+  # DIFF_FILE.  Always returns 0, so a missing corpus, a failed scan or a
+  # failed tabulation never aborts the overall run.
+  local label="$1" dir="$2" gt="$3"
+  if [[ ! -d "$dir" ]]; then
+    # The corpus is never fetched automatically: print the clone command for
+    # the operator and skip the set.
+    info "Bootstrapping $label..."
+    info "  Clone the corpus into ${dir} then re-run this script:"
+    if [[ "$label" == "nodegoat" ]]; then
+      info "    git clone --depth 1 https://github.com/OWASP/NodeGoat ${dir}"
+    else
+      info "    git clone --depth 1 --branch v15.0.0 \\"
+      info "      https://github.com/juice-shop/juice-shop ${dir}"
+    fi
+    info "Skipping $label set (not yet downloaded)."
+    return 0
+  fi
+  info "Running nyx scan on $label..."
+  # Suspend errexit so a non-zero nyx status can be inspected instead of
+  # terminating the script.
+  set +e
+  "$NYX_BIN" scan --format json --verify --no-index "$dir" \
+    > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
+  local rc=$?
+  set -e
+  # Statuses 0 and 1 proceed to tabulation (presumably 1 signals that findings
+  # were reported — confirm against the nyx CLI); anything else is treated as
+  # a failed scan and the captured stderr is replayed.
+  if [[ $rc -ne 0 && $rc -ne 1 ]]; then
+    info "  nyx exited $rc on $label set (stderr follows):"
+    cat "/tmp/nyx_${label}.stderr" >&2
+    return 0
+  fi
+  # --budget / --diff are forwarded only when the corresponding variable is
+  # non-empty.
+  python3 "${SCRIPT_DIR}/tabulate.py" \
+    --label "$label" \
+    --scan "/tmp/nyx_${label}.json" \
+    --ground-truth "$gt" \
+    --append "$RESULTS_JSON" \
+    ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+    ${DIFF_FILE:+--diff "$DIFF_FILE"} \
+    || info "  tabulate.py failed on $label; ground truth file may be absent"
+}
+
+# Scan one Track R.2 polyglot real corpus and tabulate it against its
+# committed ground truth, SCOPED to its target language (tabulate --lang) so
+# incidental other-language assets (e.g. vendored JS in a Rails / aiohttp app)
+# do not pollute the corpus's per-cap metrics.  Self-skips when the corpus has
+# not been cloned into the cache; prints the exact clone command if so.
+#   $1 label  $2 dir  $3 ground-truth json  $4 target lang  $5 repo  $6 ref
+run_polyglot_corpus() {
+  # Reads the globals NYX_BIN, SCRIPT_DIR, RESULTS_JSON, BUDGET_FILE and
+  # DIFF_FILE.  Always returns 0, so a missing corpus, a failed scan or a
+  # failed tabulation never aborts the overall run.
+  local label="$1" dir="$2" gt="$3" lang="$4" repo="$5" ref="$6"
+  if [[ ! -d "$dir" ]]; then
+    # The corpus is never fetched automatically: print the clone command for
+    # the operator and skip the set.
+    info "Bootstrapping $label..."
+    info "  git clone --depth 1 --branch ${ref} ${repo} ${dir}"
+    info "Skipping $label set (not yet downloaded)."
+    return 0
+  fi
+  info "Running nyx scan on $label (lang scope: ${lang})..."
+  # Suspend errexit so a non-zero nyx status can be inspected instead of
+  # terminating the script.
+  set +e
+  "$NYX_BIN" scan --format json --verify --no-index "$dir" \
+    > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
+  local rc=$?
+  set -e
+  # Statuses 0 and 1 proceed to tabulation (presumably 1 signals that findings
+  # were reported — confirm against the nyx CLI); anything else is treated as
+  # a failed scan and the captured stderr is replayed.
+  if [[ $rc -ne 0 && $rc -ne 1 ]]; then
+    info "  nyx exited $rc on $label set (stderr follows):"
+    cat "/tmp/nyx_${label}.stderr" >&2
+    return 0
+  fi
+  # The whole directory is scanned; the language scoping is applied by
+  # tabulate.py through --lang.
+  python3 "${SCRIPT_DIR}/tabulate.py" \
+    --label "$label" \
+    --scan "/tmp/nyx_${label}.json" \
+    --ground-truth "$gt" \
+    --lang "$lang" \
+    --append "$RESULTS_JSON" \
+    ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+    ${DIFF_FILE:+--diff "$DIFF_FILE"} \
+    || info "  tabulate.py failed on $label; ground truth file may be absent"
+}
+
+[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
+
+mkdir -p "$CORPUS_CACHE"
+[[ -n "$OUTPUT_DIR" ]] && mkdir -p "$OUTPUT_DIR"
+
+RESULTS_JSON="${OUTPUT_DIR:-/tmp}/eval_results_$(date +%Y%m%d_%H%M%S).json"
+echo "[]" > "$RESULTS_JSON"
+
+# ── OWASP Benchmark v1.2 bootstrap ───────────────────────────────────────────
+OWASP_DIR="${CORPUS_CACHE}/owasp_benchmark_v1.2"
+# Substring match: the set is selected whenever "owasp" occurs anywhere in
+# the --sets value.
+if [[ "$SETS" == *owasp* ]]; then
+  if [[ ! -d "$OWASP_DIR" ]]; then
+    # The corpus is never fetched automatically: print the clone command for
+    # the operator and skip the set.
+    info "Bootstrapping OWASP Benchmark v1.2..."
+    info "  Clone from https://github.com/OWASP-Benchmark/BenchmarkJava"
+    info "  into ${OWASP_DIR}"
+    info "  then re-run this script."
+    info "  git clone --depth 1 --branch 1.2beta \\"
+    info "    https://github.com/OWASP-Benchmark/BenchmarkJava \\"
+    info "    ${OWASP_DIR}"
+    info "Skipping OWASP set (not yet downloaded)."
+  else
+    info "Running nyx scan on OWASP Benchmark v1.2..."
+    # Suspend errexit so a non-zero nyx status can be inspected instead of
+    # terminating the script.
+    set +e
+    "$NYX_BIN" scan --format json --verify --no-index "$OWASP_DIR" \
+      > /tmp/nyx_owasp.json 2>/tmp/nyx_owasp.stderr
+    NYX_EXIT=$?
+    set -e
+    # Statuses 0 and 1 proceed to tabulation (presumably 1 signals that
+    # findings were reported — confirm against the nyx CLI); anything else is
+    # treated as a failed scan.
+    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
+      info "  nyx exited $NYX_EXIT on OWASP set (stderr follows):"
+      cat /tmp/nyx_owasp.stderr >&2
+    else
+      # A tabulate.py failure is logged and does not abort the run.
+      python3 "${SCRIPT_DIR}/tabulate.py" \
+        --label owasp \
+        --scan /tmp/nyx_owasp.json \
+        --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
+        --append "$RESULTS_JSON" \
+        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
+        || info "  tabulate.py failed; ground truth file may be absent"
+    fi
+  fi
+fi
+
+# ── NodeGoat / Juice Shop (JS/TS) bootstrap — Track R.1 ───────────────────────
+if [[ "$SETS" == *nodegoat* ]]; then
+  run_jsts_corpus nodegoat "${CORPUS_CACHE}/nodegoat" \
+    "${SCRIPT_DIR}/ground_truth/nodegoat.json"
+fi
+if [[ "$SETS" == *juiceshop* ]]; then
+  run_jsts_corpus juiceshop "${CORPUS_CACHE}/juiceshop" \
+    "${SCRIPT_DIR}/ground_truth/juiceshop.json"
+fi
+
+# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ───────────────
+if [[ "$SETS" == *railsgoat* ]]; then
+  run_polyglot_corpus railsgoat "${CORPUS_CACHE}/railsgoat" \
+    "${SCRIPT_DIR}/ground_truth/railsgoat.json" ruby \
+    https://github.com/OWASP/railsgoat rails.5.0.0
+fi
+if [[ "$SETS" == *dvwa* ]]; then
+  run_polyglot_corpus dvwa "${CORPUS_CACHE}/dvwa" \
+    "${SCRIPT_DIR}/ground_truth/dvwa.json" php \
+    https://github.com/digininja/DVWA 2.5
+fi
+if [[ "$SETS" == *dvpwa* ]]; then
+  run_polyglot_corpus dvpwa "${CORPUS_CACHE}/dvpwa" \
+    "${SCRIPT_DIR}/ground_truth/dvpwa.json" python \
+    https://github.com/anxolerd/dvpwa master
+fi
+if [[ "$SETS" == *gosec* ]]; then
+  run_polyglot_corpus gosec "${CORPUS_CACHE}/gosec" \
+    "${SCRIPT_DIR}/ground_truth/gosec.json" go \
+    https://github.com/securego/gosec v2.26.1
+fi
+# RustSec advisory-db is the Rust negative control (empty ground truth): the
+# row asserts the Rust scan/verify path runs and Confirms nothing there.
+if [[ "$SETS" == *rustsec* ]]; then
+  run_polyglot_corpus rustsec "${CORPUS_CACHE}/rustsec" \
+    "${SCRIPT_DIR}/ground_truth/rustsec.json" rust \
+    https://github.com/rustsec/advisory-db main
+fi
+
+# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
+SARD_DIR="${CORPUS_CACHE}/nist_sard"
+if [[ "$SETS" == *sard* ]]; then
+  if [[ ! -d "$SARD_DIR" ]]; then
+    # The subset is never fetched automatically: tell the operator where to
+    # get it and skip the set.
+    info "Bootstrapping NIST SARD subset..."
+    info "  Download from https://samate.nist.gov/SARD/"
+    info "  into ${SARD_DIR} then re-run this script."
+    info "Skipping SARD set (not yet downloaded)."
+  else
+    info "Running nyx scan on NIST SARD subset..."
+    # Suspend errexit so a non-zero nyx status can be inspected instead of
+    # terminating the script.
+    set +e
+    "$NYX_BIN" scan --format json --verify --no-index "$SARD_DIR" \
+      > /tmp/nyx_sard.json 2>/tmp/nyx_sard.stderr
+    NYX_EXIT=$?
+    set -e
+    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
+      # Replay the captured stderr, as the OWASP section does, so a failed
+      # scan can be diagnosed from the run log alone.
+      info "  nyx exited $NYX_EXIT on SARD set (stderr follows):"
+      cat /tmp/nyx_sard.stderr >&2
+    else
+      # A tabulate.py failure is logged and does not abort the run.
+      python3 "${SCRIPT_DIR}/tabulate.py" \
+        --label sard \
+        --scan /tmp/nyx_sard.json \
+        --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
+        --append "$RESULTS_JSON" \
+        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
+        || info "  tabulate.py failed; ground truth file may be absent"
+    fi
+  fi
+fi
+
+# ── In-house bughunt-curated set ──────────────────────────────────────────────
+if [[ "$SETS" == *inhouse* ]]; then
+  INHOUSE_DIRS=(
+    "${REPO_ROOT}/tests/benchmark/corpus"
+    "${REPO_ROOT}/tests/dynamic_fixtures"
+  )
+  for dir in "${INHOUSE_DIRS[@]}"; do
+    # Directories that do not exist in this checkout are skipped silently.
+    [[ -d "$dir" ]] || continue
+    label="inhouse_$(basename "$dir")"
+    info "Running nyx scan on in-house set: $dir"
+    # Suspend errexit so a non-zero nyx status can be inspected instead of
+    # terminating the script.
+    set +e
+    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
+      > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
+    NYX_EXIT=$?
+    set -e
+    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
+      # Replay the captured stderr, as the OWASP section does, so a failed
+      # scan can be diagnosed from the run log alone.
+      info "  nyx exited $NYX_EXIT on $label (stderr follows):"
+      cat "/tmp/nyx_${label}.stderr" >&2
+      continue
+    fi
+    # These sets are tabulated with --inhouse and no --ground-truth file.
+    python3 "${SCRIPT_DIR}/tabulate.py" \
+      --label "$label" \
+      --scan "/tmp/nyx_${label}.json" \
+      --inhouse \
+      --append "$RESULTS_JSON" \
+      ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+      ${DIFF_FILE:+--diff "$DIFF_FILE"} \
+      || info "  tabulate.py failed on $label"
+  done
+fi
+
+# ── Emit summary table ────────────────────────────────────────────────────────
+info ""
+info "Results written to: $RESULTS_JSON"
+
+[[ -n "$OUTPUT_DIR" ]] && cp "$RESULTS_JSON" "${OUTPUT_DIR}/eval_results.json"
+
+if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
+  info "report.py not available; raw results at $RESULTS_JSON"
+  exit 0
+fi
+
+set +e
+python3 "${SCRIPT_DIR}/report.py" \
+  --results "$RESULTS_JSON" \
+  ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+  ${DIFF_FILE:+--diff "$DIFF_FILE"}
+REPORT_RC=$?
+set -e
+# Propagate budget failures (exit 2) and malformed config (exit 3). Treat other
+# non-zero exits as setup errors.
+if [[ $REPORT_RC -eq 2 ]]; then
+  exit 2
+elif [[ $REPORT_RC -eq 3 ]]; then
+  info "report.py: budget/diff configuration malformed; see $RESULTS_JSON"
+  exit 3
+elif [[ $REPORT_RC -ne 0 ]]; then
+  info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
+  exit 1
+fi
+exit 0
--- a/tests/eval_corpus/run_full.sh
+++ b/tests/eval_corpus/run_full.sh
@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# Full eval-corpus orchestrator.
+#
+# Drives a complete pass against every corpus set the project knows about
+# (OWASP Benchmark v1.2, the NIST SARD subset, OWASP NodeGoat + Juice Shop,
+# the Track R.2 polyglot corpora — RailsGoat / DVWA / DVPWA / gosec / RustSec —
+# and the Nyx benchmark fixtures), then emits `tests/eval_corpus/results.json`
+# for reports, diffs, and docs.
+#
+# Usage:
+#   tests/eval_corpus/run_full.sh [--nyx BIN] [--budget FILE] [--diff FILE]
+#                                 [--output DIR] [--corpus-dir DIR]
+#                                 [-h|--help]
+#
+# Differences vs `run.sh`:
+#   * Always runs every set (no `--sets` selector).
+#   * Always passes `--budget` (default `tests/eval_corpus/budget.toml`) so the
+#     configured per-cell limits are checked on every pass.
+#   * Copies the timestamped results file to
+#     `tests/eval_corpus/results.json`.
+#
+# Exit codes (run.sh's exit status is propagated unchanged):
+#   0  every set ran and the merged result met the per-cell budget.
+#   1  setup or I/O error.
+#   2  budget exceeded OR monotonic-improvement regression.
+#   3  budget/diff input malformed.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# Defaults; each may be pre-set through the environment and overridden by the
+# matching command-line flag below.
+NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
+BUDGET_FILE="${BUDGET_FILE:-${SCRIPT_DIR}/budget.toml}"
+DIFF_FILE="${DIFF_FILE:-}"
+OUTPUT_DIR=""
+CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
+
+die()  { echo "error: $*" >&2; exit 1; }
+info() { echo "[full] $*"; }
+
+# Print the leading comment header of this script (everything after the
+# shebang up to the first non-comment line) with the "# " prefix stripped, so
+# --help never leaks script code regardless of how long the header is.
+usage() {
+  awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
+}
+
+# Fail with a setup error (exit 1) when a value-taking flag is the last
+# argument; otherwise `set -u` would abort with a bare "unbound variable".
+need_value() {
+  [[ $# -ge 2 ]] || die "$1 requires a value"
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --nyx)         need_value "$@"; NYX_BIN="$2"; shift 2 ;;
+    --budget)      need_value "$@"; BUDGET_FILE="$2"; shift 2 ;;
+    --diff)        need_value "$@"; DIFF_FILE="$2"; shift 2 ;;
+    --output)      need_value "$@"; OUTPUT_DIR="$2"; shift 2 ;;
+    --corpus-dir)  need_value "$@"; CORPUS_CACHE="$2"; shift 2 ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "unknown flag: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
+[[ -f "$BUDGET_FILE" ]] || die "budget file not found: $BUDGET_FILE"
+
+OUTPUT_DIR="${OUTPUT_DIR:-${SCRIPT_DIR}/.run-out}"
+mkdir -p "$OUTPUT_DIR"
+
+info "nyx:    $NYX_BIN"
+info "budget: $BUDGET_FILE"
+info "diff:   ${DIFF_FILE:-<none>}"
+info "output: $OUTPUT_DIR"
+
+# errexit is suspended so run.sh's exit status can be captured: the results
+# file is still published below even when the budget gate fails.
+set +e
+NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \
+  bash "${SCRIPT_DIR}/run.sh" \
+    --nyx     "$NYX_BIN" \
+    --sets    owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse \
+    --output  "$OUTPUT_DIR" \
+    --budget  "$BUDGET_FILE" \
+    ${DIFF_FILE:+--diff "$DIFF_FILE"}
+RC=$?
+set -e
+
+RESULTS_SRC="${OUTPUT_DIR}/eval_results.json"
+RESULTS_DST="${SCRIPT_DIR}/results.json"
+if [[ -f "$RESULTS_SRC" ]]; then
+  cp "$RESULTS_SRC" "$RESULTS_DST"
+  info "results: $RESULTS_DST"
+else
+  info "no eval_results.json produced; corpus may not be downloaded"
+fi
+
+exit "$RC"
--- a/tests/eval_corpus/sard_gt_convert.py
+++ b/tests/eval_corpus/sard_gt_convert.py
@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""Convert NIST SARD manifest XML into nyx ground-truth JSON.
+
+SARD ships per-test-case `manifest.xml` files alongside source. Each
+`<testcase>` lists one or more `<file path="…">` entries with optional
+`<flaw line="…" name="CWE-XXX_…"/>` children.
+
+Output schema (consumed by tabulate.py):
+  list of {"path", "line", "cap", "vuln"} records.
+
+Usage:
+  tests/eval_corpus/sard_gt_convert.py \\
+      --corpus-dir ~/.cache/nyx/eval_corpus/nist_sard \\
+      --output     tests/eval_corpus/ground_truth/nist_sard.json
+"""
+
+import argparse
+import json
+import re
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+# CWE id (decimal string, no leading zeros) → nyx cap label.
+CWE_TO_NYX_CAP = {
+    "20":  "validation",
+    "22":  "path_traversal",
+    "78":  "cmdi",
+    "79":  "xss",
+    "89":  "sqli",
+    "90":  "ldap_injection",
+    "91":  "xpath_injection",
+    "94":  "cmdi",
+    "113": "header_injection",
+    "117": "header_injection",
+    "190": "memory",
+    "200": "data_exfil",
+    "287": "auth",
+    "295": "crypto",
+    "311": "crypto",
+    "327": "crypto",
+    "328": "crypto",
+    "330": "crypto",
+    "352": "auth",
+    "434": "path_traversal",
+    "476": "memory",
+    "502": "deserialize",
+    "601": "redirect",
+    "611": "xxe",
+    "643": "xpath_injection",
+    "798": "crypto",
+    "918": "ssrf",
+}
+
+# Matches "CWE-78", "CWE_078", "cwe-89", …; group 1 is the numeric id.
+CWE_RE = re.compile(r"CWE[-_](\d+)", re.IGNORECASE)
+
+
+def cap_for_flaw(name: str) -> str | None:
+    """Map a SARD flaw name to a nyx cap label.
+
+    The first ``CWE-<n>`` / ``CWE_<n>`` token in *name* (case-insensitive) is
+    looked up in ``CWE_TO_NYX_CAP``.
+
+    Returns the cap label, or ``None`` when *name* is empty/``None``, carries
+    no CWE token, or names a CWE that has no nyx cap mapping.
+    """
+    m = CWE_RE.search(name or "")
+    if not m:
+        return None
+    # Normalise zero-padded ids ("078" → "78"): the table is keyed by the
+    # unpadded decimal id, so a padded id would otherwise never match and the
+    # flaw would be silently counted as "skipped (cap)".
+    cwe_id = str(int(m.group(1)))
+    return CWE_TO_NYX_CAP.get(cwe_id)
+
+
+def _portable_path(abs_path: Path, root: Path) -> str:
+    """Return *abs_path* relative to *root* in POSIX form.
+
+    tabulate.py matches ground-truth paths as a component-aligned suffix of
+    the scanner's absolute paths, so a corpus-relative path keeps the emitted
+    JSON free of host-specific prefixes.  Files that resolve outside *root*
+    fall back to the absolute path.
+    """
+    try:
+        return abs_path.relative_to(root).as_posix()
+    except ValueError:
+        return str(abs_path)
+
+
+def main() -> int:
+    """Convert every ``manifest.xml`` under ``--corpus-dir`` to ground truth.
+
+    For each ``<file>`` of each ``<testcase>``:
+      * a file with no ``<flaw>`` / ``<mixed>`` descendants yields one
+        ``vuln: False`` record with cap ``"other"`` and line 0;
+      * each flaw whose CWE maps to a nyx cap yields one ``vuln: True``
+        record; flaws with no mapping are counted as "skipped (cap)";
+      * files that do not exist on disk are counted as "skipped (file)".
+
+    Record paths are written relative to the corpus dir where possible.
+
+    Returns 0 on success, 1 when the corpus directory does not exist.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--corpus-dir", required=True)
+    p.add_argument("--output", required=True)
+    args = p.parse_args()
+
+    root = Path(args.corpus_dir).expanduser().resolve()
+    if not root.is_dir():
+        print(f"error: corpus dir not found: {root}", file=sys.stderr)
+        return 1
+
+    records: list[dict] = []
+    skipped_files = 0
+    skipped_caps = 0
+
+    # Sorted so the emitted JSON is stable across filesystems and runs.
+    for manifest in sorted(root.rglob("manifest.xml")):
+        try:
+            tree = ET.parse(manifest)
+        except ET.ParseError as e:
+            # A broken manifest is reported and skipped; the rest still convert.
+            print(f"warn: parse failed {manifest}: {e}", file=sys.stderr)
+            continue
+        for tc in tree.iter("testcase"):
+            for fnode in tc.iter("file"):
+                rel = fnode.get("path") or ""
+                if not rel:
+                    continue
+                # File paths are interpreted relative to the manifest's folder.
+                abs_path = (manifest.parent / rel).resolve()
+                if not abs_path.exists():
+                    skipped_files += 1
+                    continue
+                record_path = _portable_path(abs_path, root)
+                flaws = list(fnode.iter("flaw")) + list(fnode.iter("mixed"))
+                if not flaws:
+                    records.append({
+                        "path": record_path,
+                        "line": 0,
+                        "cap":  "other",
+                        "vuln": False,
+                    })
+                    continue
+                for flaw in flaws:
+                    cap = cap_for_flaw(flaw.get("name", ""))
+                    if cap is None:
+                        skipped_caps += 1
+                        continue
+                    try:
+                        line = int(flaw.get("line", "0") or 0)
+                    except ValueError:
+                        # Non-numeric line attribute: 0 means "any line" to
+                        # tabulate.py's matcher.
+                        line = 0
+                    records.append({
+                        "path": record_path,
+                        "line": line,
+                        "cap":  cap,
+                        "vuln": True,
+                    })
+
+    out = Path(args.output).expanduser().resolve()
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with open(out, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2)
+
+    vuln_count = sum(1 for r in records if r["vuln"])
+    print(f"wrote {len(records)} records to {out}")
+    print(f"  vulns:           {vuln_count}")
+    print(f"  non-vuln:        {len(records) - vuln_count}")
+    print(f"  skipped (file):  {skipped_files}")
+    print(f"  skipped (cap):   {skipped_caps}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/eval_corpus/tabulate.py
+++ b/tests/eval_corpus/tabulate.py
@ -0,0 +1,688 @@
+#!/usr/bin/env python3
+"""
+Tabulate nyx scan results against a ground-truth file.
+
+For OWASP / SARD sets: compares nyx findings against known-true/known-false
+labels from the ground truth JSON.
+
+For in-house sets (--inhouse): counts findings by cap x language; reports
+Unsupported rate only (no ground truth required).
+
+Output: appends a result record to --append FILE.
+
+Phase 29 (Track I) extensions:
+  --budget tests/eval_corpus/budget.toml   enforce per-cell budget thresholds
+  --diff   previous.json                   compare against prior result file,
+                                           fail on monotonic-improvement
+                                           regression
+
+Exit codes:
+  0  all rows pass.
+  2  one or more per-cell budgets exceeded OR a diff regression was found.
+  3  malformed budget / diff input (callers must fix configuration).
+"""
+
+import argparse
+import json
+import os
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+try:
+    import tomllib  # Python 3.11+
+except ModuleNotFoundError:  # pragma: no cover — older interpreters only
+    import tomli as tomllib  # type: ignore[no-redef]
+
+# Maximum line distance allowed when pairing a finding with a ground-truth or
+# manual-triage entry.
+LINE_TOLERANCE = 5
+
+# Caps with no sound runtime oracle (config / usage smells) and the catch-all
+# `other` bucket route to Unsupported by design, so their Unsupported-rate is
+# report-only, never gated.  Mirrors report.py / the budget.toml intent.
+NO_SOUND_ORACLE_CAPS = {"auth", "crypto", "xss", "trustbound", "other"}
+
+
+def _soft_unsupported() -> bool:
+    """Report whether the Unsupported-rate budget is advisory for this run.
+
+    The switch is the `NYX_EVAL_SOFT_UNSUPPORTED` environment variable, which
+    CI sets because dynamic confirmation is environment-constrained there (the
+    budget is calibrated on a dev box where confirmation runs fully).  The
+    precision / confirmed-rate ratchets stay hard either way; with the variable
+    unset (local dev) the Unsupported budget is hard too.
+
+    Accepted "on" spellings, ignoring case and surrounding whitespace:
+    1, true, yes, on.
+    """
+    raw_flag = os.environ.get("NYX_EVAL_SOFT_UNSUPPORTED", "")
+    normalized = raw_flag.strip().lower()
+    return normalized in ("1", "true", "yes", "on")
+
+# Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
+# cap_of() walks this list in order and returns the label of the FIRST entry
+# whose bit is set in a finding's `evidence.sink_caps`, so list order is the
+# bucketing priority when a finding carries several sink bits.
+_CAP_BIT_TABLE = [
+    (1 << 5,  "path_traversal"),  # FILE_IO
+    (1 << 6,  "fmt_string"),
+    (1 << 7,  "sqli"),             # SQL_QUERY
+    (1 << 8,  "deserialize"),
+    (1 << 9,  "ssrf"),
+    (1 << 10, "cmdi"),             # CODE_EXEC
+    (1 << 11, "crypto"),
+    (1 << 12, "unauthorized_id"),
+    (1 << 13, "data_exfil"),
+    (1 << 14, "ldap_injection"),
+    (1 << 15, "xpath_injection"),
+    (1 << 16, "header_injection"),
+    (1 << 17, "redirect"),         # OPEN_REDIRECT
+    (1 << 18, "xss"),              # SSTI (template_injection); also covers XSS sinks
+    (1 << 19, "xxe"),
+    (1 << 20, "prototype_pollution"),
+    # HTML_ESCAPE (1<<1) is the universal reflected-XSS *sink* cap across every
+    # language (`grep 'Sink(Cap::HTML_ESCAPE)' src/labels/` — PHP echo, JS
+    # innerHTML, Java servlet writers, etc.); the same bit is the html-escape
+    # *sanitizer* cap, so a finding only carries it as a sink when an un-encoded
+    # tainted value reached an HTML output.  Placed LAST so any higher-priority
+    # sink bit (SQL_QUERY, FILE_IO, ...) on the same finding wins; a finding
+    # carrying only HTML_ESCAPE is reflected XSS.  Without this, every
+    # taint-based reflected-XSS finding mis-buckets to "other".
+    (1 << 1, "xss"),
+]
+
+# Static lens (see --static): SHELL_ESCAPE (1<<2) is the command-injection sink
+# cap for *every* language (`grep SHELL_ESCAPE src/labels/` — all Sink uses are
+# command-exec; CODE_EXEC=1<<10 is the eval/code-exec variant, also cmdi).  In a
+# normal `nyx scan` (no dynamic confirmation) a Java cmdi finding carries only
+# SHELL_ESCAPE; the SHELL_ESCAPE→CODE_EXEC remap that buckets it as cmdi is gated
+# on VerifyStatus::Confirmed (src/commands/scan.rs), so with 0 confirmations the
+# default table leaves these in "other" and the cmdi cell reads 0/0/N.  The
+# static lens appends SHELL_ESCAPE→cmdi at the LOWEST priority (after every other
+# bit) so a SHELL_ESCAPE-only finding buckets as cmdi while a finding that also
+# carries a higher-priority sink bit (e.g. FILE_IO) keeps its existing bucket.
+# Opt-in via --static so the default confirmed-recall bucketing is byte-identical.
+# Built as a new list, so _CAP_BIT_TABLE itself is left unmodified.
+_CAP_BIT_TABLE_STATIC = _CAP_BIT_TABLE + [(1 << 2, "cmdi")]  # SHELL_ESCAPE
+
+# Substring → cap lookup for rule IDs. Order matters: most specific first.
+# Used by cap_of() as the fallback when no sink bit matched: the rule id is
+# lower-cased, cut at the first space, and the first needle contained in the
+# remainder decides the cap.
+_CAP_RULE_TABLE = [
+    ("path_traversal", "path_traversal"),
+    ("sql",           "sqli"),
+    ("xss",           "xss"),
+    ("ssrf",          "ssrf"),
+    ("cmdi",          "cmdi"),
+    ("cmd_exec",      "cmdi"),
+    ("code_exec",     "cmdi"),
+    ("deser",         "deserialize"),
+    ("unserialize",   "deserialize"),
+    ("redirect",      "redirect"),
+    ("xxe",           "xxe"),
+    ("template",      "xss"),
+    ("auth",          "auth"),
+    ("memory",        "memory"),
+    ("crypto",        "crypto"),
+    ("data-exfil",    "data_exfil"),
+    ("data_exfil",    "data_exfil"),
+    ("header",        "header_injection"),
+]
+
+
+def load_json(path: str) -> object:
+    """Read the file at *path* and return the decoded JSON document.
+
+    The file is opened explicitly as UTF-8 (the JSON interchange encoding) so
+    decoding does not depend on the host's locale default.
+
+    Raises ``OSError`` when the file cannot be opened and
+    ``json.JSONDecodeError`` when its content is not valid JSON.
+    """
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def cap_of(finding: dict, static_lens: bool = False) -> str:
+    """Bucket *finding* into a cap label.
+
+    Resolution order:
+      1. `evidence.sink_caps` — when it is a non-zero int, the first entry of
+         the bit table (static-lens variant when *static_lens* is true) whose
+         bit is set decides the label.
+      2. Otherwise the rule id (e.g. py.cmdi.os_system, java.deser.readobject):
+         lower-cased, cut at the first space, first matching needle wins.
+      3. "other" when neither source yields a label.
+    """
+    evidence = finding.get("evidence", {}) or {}
+    mask = evidence.get("sink_caps")
+    if isinstance(mask, int) and mask:
+        bit_table = _CAP_BIT_TABLE_STATIC if static_lens else _CAP_BIT_TABLE
+        by_bit = next((label for flag, label in bit_table if mask & flag), None)
+        if by_bit is not None:
+            return by_bit
+
+    rule_head = (finding.get("id") or "").lower().split(" ", 1)[0]
+    return next(
+        (label for needle, label in _CAP_RULE_TABLE if needle in rule_head),
+        "other",
+    )
+
+
+def lang_of(finding: dict) -> str:
+    """Infer the source language of *finding* from its `path` suffix.
+
+    Returns the language name for the first known suffix the path ends with,
+    or "unknown" when the path is missing or has no recognised suffix.
+    """
+    suffix_to_lang = (
+        (".py", "python"),
+        (".js", "javascript"),
+        (".ts", "typescript"),
+        (".java", "java"),
+        (".go", "go"),
+        (".php", "php"),
+        (".rb", "ruby"),
+        (".rs", "rust"),
+        (".c", "c"),
+        (".cpp", "cpp"),
+    )
+    source_path = finding.get("path", "")
+    return next(
+        (lang for suffix, lang in suffix_to_lang if source_path.endswith(suffix)),
+        "unknown",
+    )
+
+
+def _norm_path(p: str) -> str:
+    """Return *p* with every backslash turned into a forward slash."""
+    return "/".join(p.split("\\"))
+
+
+def path_matches(gt_path: str, finding_path: str) -> bool:
+    """Decide whether a ground-truth path and a finding path name the same file.
+
+    Ground-truth paths are kept *relative to the corpus root* so the checked-in
+    JSON is portable, whereas nyx reports absolute paths rooted wherever the
+    corpus was cloned.  After separator normalisation the two match when they
+    are equal, or when either one is a suffix of the other that starts on a
+    `/` boundary (the reverse direction keeps a legacy absolute GT file
+    working).  The boundary requirement is what prevents
+    `.../BenchmarkTest1.java` from matching `.../xBenchmarkTest1.java`.
+    """
+    truth = _norm_path(gt_path)
+    reported = _norm_path(finding_path)
+    if truth == reported:
+        return True
+    if reported.endswith("/" + truth):
+        return True
+    return truth.endswith("/" + reported)
+
+
+# ── Budget loading ──────────────────────────────────────────────────────────
+
+
+def load_budget(path: str) -> dict:
+    """Parse a budget.toml file.
+
+    Returns a dict::
+
+        {
+            "default": {"unsupported_rate": 0.20, "false_confirmed_rate": 0.02,
+                        "repro_stability": 0.95, "ratchet_deadline": "..."},
+            "cells": {(cap, lang): {...overrides...}, ...},
+        }
+
+    Each `[[cell]]` row is stored whole (including its `cap` / `lang` keys)
+    under its `(cap, lang)` key; a later row with the same key replaces an
+    earlier one.
+
+    Raises SystemExit(3) on a missing, unreadable, or malformed file —
+    including a file that is valid TOML but has the wrong shape (`[default]`
+    not a table, `cell` not an array of tables, a row without cap/lang).
+    """
+
+    def _malformed(message: str) -> None:
+        # Single exit point for configuration errors so every one maps to 3.
+        print(f"ERROR  {message}", file=sys.stderr)
+        sys.exit(3)
+
+    try:
+        with open(path, "rb") as f:
+            raw = tomllib.load(f)
+    except FileNotFoundError:
+        _malformed(f"budget file not found: {path}")
+    except (tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
+        # tomllib decodes the bytes as UTF-8 before parsing, so invalid bytes
+        # surface as UnicodeDecodeError rather than TOMLDecodeError.
+        _malformed(f"budget file malformed: {path}: {e}")
+    except OSError as e:
+        _malformed(f"budget file unreadable: {path}: {e}")
+
+    default = raw.get("default", {}) or {}
+    if not isinstance(default, dict):
+        _malformed(f"budget [default] must be a table: {default!r}")
+
+    rows = raw.get("cell", []) or []
+    if not isinstance(rows, list):
+        _malformed(f"budget `cell` must be an array of tables: {rows!r}")
+
+    cells = {}
+    for row in rows:
+        if not isinstance(row, dict):
+            _malformed(f"budget cell must be a table: {row!r}")
+        cap = row.get("cap")
+        lang = row.get("lang")
+        if not cap or not lang:
+            _malformed(f"budget cell missing cap/lang: {row!r}")
+        cells[(cap, lang)] = row
+
+    return {"default": default, "cells": cells}
+
+
+def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
+    """Return the effective budget row for the (cap, lang) cell.
+
+    Starts from a copy of `[default]` and layers one override row on top: the
+    exact `(cap, lang)` row when present, otherwise the first wildcard row
+    found among `(cap, "*")`, `("*", lang)`, `("*", "*")`.  The `cap` / `lang`
+    keys of the override row are not copied into the result.
+    """
+    overrides = budget.get("cells", {})
+    row = overrides.get((cap, lang))
+    if not row:
+        # No exact entry: fall back to a wildcard override if present.
+        row = (
+            overrides.get((cap, "*"))
+            or overrides.get(("*", lang))
+            or overrides.get(("*", "*"))
+        )
+
+    effective = dict(budget.get("default", {}) or {})
+    if row:
+        for field, value in row.items():
+            if field not in ("cap", "lang"):
+                effective[field] = value
+    return effective
+
+
+def enforce_budget(cells: list, budget: dict) -> list:
+    """Check every cell against its merged budget row.
+
+    Returns human-readable failure strings (empty when everything passes).
+    Four gates are applied per cell, each only when its threshold is numeric
+    and its denominator is non-zero, so a missing measurement counts as
+    "no data" rather than a failure:
+
+      * Unsupported rate   — maximum, over `total`
+      * Confirmed rate     — minimum, over `total`
+      * false-Confirmed    — maximum, over `confirmed`
+      * repro stability    — minimum, over `confirmed`
+    """
+
+    def _numeric(value) -> bool:
+        return isinstance(value, (int, float))
+
+    problems = []
+    unsupported_is_soft = _soft_unsupported()
+    for cell in cells:
+        limits = budget_for_cell(budget, cell["cap"], cell["lang"])
+        if not limits:
+            continue
+        cap, lang = cell["cap"], cell["lang"]
+        unsup_ceiling = limits.get("unsupported_rate")
+        false_ceiling = limits.get("false_confirmed_rate")
+        stable_floor = limits.get("repro_stability")
+        confirmed_floor = limits.get("confirmed_rate")
+
+        if (
+            _numeric(unsup_ceiling)
+            and cell.get("total", 0) > 0
+            and cell["unsupported_rate"] > unsup_ceiling
+        ):
+            # No-sound-oracle caps (and `other`) are report-only by design;
+            # the rest are report-only when dynamic confirmation is known to
+            # be environment-constrained (NYX_EVAL_SOFT_UNSUPPORTED, set by
+            # CI).  Hard otherwise so local dev still ratchets coverage.
+            gated = cap not in NO_SOUND_ORACLE_CAPS and not unsupported_is_soft
+            if gated:
+                problems.append(
+                    f"  FAIL  {cap}/{lang}: Unsupported"
+                    f" {cell['unsupported_rate']*100:.1f}%"
+                    f" > budget {unsup_ceiling*100:.1f}%"
+                )
+
+        if _numeric(confirmed_floor) and cell.get("total", 0) > 0:
+            confirmed_share = cell.get("confirmed", 0) / cell["total"]
+            if confirmed_share < confirmed_floor:
+                problems.append(
+                    f"  FAIL  {cap}/{lang}: Confirmed {confirmed_share*100:.1f}%"
+                    f" < budget {confirmed_floor*100:.1f}%"
+                )
+
+        if _numeric(false_ceiling) and cell.get("confirmed", 0) > 0:
+            false_share = cell.get("wrong_confirmed", 0) / cell["confirmed"]
+            if false_share > false_ceiling:
+                problems.append(
+                    f"  FAIL  {cap}/{lang}: false-Confirmed {false_share*100:.1f}%"
+                    f" > budget {false_ceiling*100:.1f}%"
+                )
+
+        # Repro stability needs at least one `replay_stable: true` stamp:
+        # with stable_replays == 0 there is no way to tell "unstable" from
+        # "stability was not measured for this row", and gating on it would
+        # fire vacuously on every clean run.
+        if (
+            _numeric(stable_floor)
+            and cell.get("confirmed", 0) > 0
+            and cell.get("stable_replays", 0) > 0
+        ):
+            stable_share = cell["stable_replays"] / cell["confirmed"]
+            if stable_share < stable_floor:
+                problems.append(
+                    f"  FAIL  {cap}/{lang}: repro stability {stable_share*100:.1f}%"
+                    f" < budget {stable_floor*100:.1f}%"
+                )
+    return problems
+
+
+# ── Diff loading ────────────────────────────────────────────────────────────
+
+
+def load_previous_cells(path: str, label: str) -> dict:
+    """Index a previous results file by (cap, lang) → cell.
+
+    The previous file is the same shape as `--append`'s output: one record or
+    a list of records, each with a `label` and a `cells` list.  The record
+    whose `label` equals *label* is used; if none matches, the first record is
+    used instead.  An empty file list (or an empty record) yields `{}`.
+
+    Exits with status 3 when the file is missing, unreadable, not valid JSON,
+    or not shaped like a results file (a record that is not an object, or a
+    cell without `cap` / `lang`).
+    """
+
+    def _malformed(message: str) -> None:
+        # Every diff-input problem maps to the "fix your configuration" code.
+        print(f"ERROR  {message}", file=sys.stderr)
+        sys.exit(3)
+
+    try:
+        with open(path, encoding="utf-8") as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        _malformed(f"diff file not found: {path}")
+    except (json.JSONDecodeError, UnicodeDecodeError) as e:
+        _malformed(f"diff file malformed: {path}: {e}")
+    except OSError as e:
+        _malformed(f"diff file unreadable: {path}: {e}")
+
+    records = data if isinstance(data, list) else [data]
+    if not all(isinstance(r, dict) for r in records):
+        _malformed(f"diff file malformed: {path}: records must be JSON objects")
+
+    chosen = next((r for r in records if r.get("label") == label), None)
+    if chosen is None and records:
+        chosen = records[0]
+    if not chosen:
+        return {}
+
+    try:
+        return {(c["cap"], c["lang"]): c for c in chosen.get("cells") or []}
+    except (KeyError, TypeError) as e:
+        _malformed(
+            f"diff file malformed: {path}: cell without cap/lang ({e!r})"
+        )
+
+
+def diff_regressions(cells: list, prev: dict) -> list:
+    """Report cells whose metrics got worse than in the previous run.
+
+    *prev* maps (cap, lang) to the previous cell.  A current cell with no
+    previous counterpart is new and is skipped.  For the rest, three
+    monotonicity rules apply, each with a 0.5 percentage-point tolerance to
+    absorb flake noise:
+
+      * Unsupported% must not rise.
+      * False-Confirmed% must not rise (compared only when both runs have
+        Confirmed findings).
+      * Repro-stability% must not fall (same condition).
+
+    Returns the list of failure strings, empty when nothing regressed.
+    """
+    tolerance = 0.005
+
+    def _share(cell: dict, field: str, denominator):
+        # None means "no Confirmed findings, nothing to compare".
+        return (cell.get(field, 0) / denominator) if denominator else None
+
+    regressions = []
+    for current in cells:
+        cap, lang = current["cap"], current["lang"]
+        before = prev.get((cap, lang))
+        if not before:
+            continue
+        tag = f"  REGRESSION  {cap}/{lang}:"
+
+        # Unsupported (lower is better).
+        unsup_before = before.get("unsupported_rate", 0.0)
+        unsup_now = current.get("unsupported_rate", 0.0)
+        if unsup_now > unsup_before + tolerance:
+            regressions.append(
+                f"{tag} Unsupported"
+                f" {unsup_before*100:.1f}% → {unsup_now*100:.1f}%"
+            )
+
+        confirmed_before = before.get("confirmed", 0)
+        confirmed_now = current.get("confirmed", 0)
+
+        # False-Confirmed (lower is better).
+        false_before = _share(before, "wrong_confirmed", confirmed_before)
+        false_now = _share(current, "wrong_confirmed", confirmed_now)
+        if not (false_before is None or false_now is None):
+            if false_now > false_before + tolerance:
+                regressions.append(
+                    f"{tag} false-Confirmed"
+                    f" {false_before*100:.1f}% → {false_now*100:.1f}%"
+                )
+
+        # Repro stability (higher is better).
+        stable_before = _share(before, "stable_replays", confirmed_before)
+        stable_now = _share(current, "stable_replays", confirmed_now)
+        if not (stable_before is None or stable_now is None):
+            if stable_now < stable_before - tolerance:
+                regressions.append(
+                    f"{tag} repro stability"
+                    f" {stable_before*100:.1f}% → {stable_now*100:.1f}%"
+                )
+    return regressions
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--label", required=True)
+    p.add_argument("--scan", required=True, help="nyx scan --format json output")
+    p.add_argument("--ground-truth", default="", help="ground truth JSON")
+    p.add_argument("--inhouse", action="store_true")
+    p.add_argument("--append", required=True, help="results accumulator JSON")
+    p.add_argument(
+        "--manual-triage",
+        default="",
+        help=(
+            "path to a manual-triage JSON file (list of "
+            "{path, line, cap, vuln: bool}).  Confirmed findings matching a "
+            "`vuln: false` entry are stamped with `wrong: true` before "
+            "tabulation so the per-cell False-Confirmed budget becomes "
+            "non-vacuous without depending on the host's `nyx verify-feedback` "
+            "log.  Matching uses LINE_TOLERANCE (=5) — line == 0 in the triage "
+            "entry matches any line."
+        ),
+    )
+    p.add_argument(
+        "--budget",
+        default="",
+        help="path to budget.toml (per-(cap,lang) thresholds)",
+    )
+    p.add_argument(
+        "--lang",
+        default="",
+        help=(
+            "comma-separated language allowlist (python, javascript, php, "
+            "ruby, go, rust, ...).  When set, only findings AND ground-truth "
+            "entries whose source language is in the list are tabulated; "
+            "everything else is dropped before tallying.  Used by the Phase 29 "
+            "polyglot corpora (Track R.2) to scope a single-language corpus to "
+            "its target language so incidental third-party assets in other "
+            "languages — e.g. the vendored JavaScript a Rails or aiohttp app "
+            "bundles — do not pollute that corpus's per-cap metrics.  Empty = "
+            "no language filter (every finding tabulated, the OWASP/JSTS "
+            "default)."
+        ),
+    )
+    p.add_argument(
+        "--diff",
+        default="",
+        help="path to a previous results JSON; fail on monotonic-improvement regression",
+    )
+    p.add_argument(
+        "--static",
+        action="store_true",
+        help=(
+            "static lens: bucket SHELL_ESCAPE (1<<2) findings as cmdi even when "
+            "they are unconfirmed.  Java (and other) command-exec sinks carry "
+            "SHELL_ESCAPE and only get remapped to CODE_EXEC on dynamic Confirm; "
+            "without this flag, an env with 0 confirmations reads the cmdi cell "
+            "as 0/0/N regardless of static quality.  SHELL_ESCAPE is the "
+            "command-injection sink cap for every language, so this is sound "
+            "globally; it is opt-in only so the default confirmed-recall "
+            "bucketing stays byte-identical."
+        ),
+    )
+    args = p.parse_args()
+    lang_filter = {l.strip() for l in args.lang.split(",") if l.strip()}
+
+    scan_data = load_json(args.scan)
+    findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
+    # Score only Security-category findings against the security ground truth.
+    # Reliability defects (resource leaks, error-handling fallthrough) and
+    # Quality findings are real bugs but not the injection / crypto / auth
+    # vulns the corpus ground truth enumerates, so counting them as security
+    # false-positives is a category error that wrecks precision with pure
+    # noise.  Findings with no explicit category (legacy fixtures) default to
+    # Security and are kept.
+    findings = [
+        f for f in findings
+        if f.get("category", "Security") not in ("Reliability", "Quality")
+    ]
+    if lang_filter:
+        findings = [f for f in findings if lang_of(f) in lang_filter]
+
+    # ── Manual-triage stamping (Phase 31 follow-up) ───────────────────────
+    # Cross-reference Confirmed rows against a manual-triage file before
+    # tabulation.  Each `vuln: false` entry whose `(path, cap)` matches a
+    # Confirmed finding (with LINE_TOLERANCE, or any line when triage
+    # entry's `line == 0`) stamps `wrong: true` on the finding's
+    # `dynamic_verdict`, which the existing wrong_confirmed counter picks
+    # up below.  Decouples the False-Confirmed budget from the host-local
+    # `nyx verify-feedback` log so CI on a fresh eval corpus can still
+    # gate the headline target.
+    if args.manual_triage and Path(args.manual_triage).exists():
+        triage = load_json(args.manual_triage)
+        not_vuln: list[dict] = []
+        for entry in triage if isinstance(triage, list) else []:
+            if entry.get("vuln") is False:
+                not_vuln.append({
+                    "path": entry.get("path", ""),
+                    "line": entry.get("line", 0),
+                    "cap": entry.get("cap", ""),
+                })
+        used: set[int] = set()
+        for f in findings:
+            ev = f.get("evidence") or {}
+            dv = ev.get("dynamic_verdict") or {}
+            if dv.get("status") != "Confirmed":
+                continue
+            f_path = f.get("path", "")
+            f_line = f.get("line", 0)
+            f_cap = cap_of(f, static_lens=args.static)
+            for idx, entry in enumerate(not_vuln):
+                if idx in used:
+                    continue
+                if (path_matches(entry["path"], f_path)
+                        and entry["cap"] == f_cap
+                        and (entry["line"] == 0
+                             or abs(entry["line"] - f_line) <= LINE_TOLERANCE)):
+                    used.add(idx)
+                    dv["wrong"] = True
+                    ev["dynamic_verdict"] = dv
+                    f["evidence"] = ev
+                    break
+
+    # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
+    # partially_confirmed, wrong_confirmed, stable_replays, total}}
+    cells: dict[tuple[str, str], dict] = defaultdict(
+        lambda: {
+            "tp": 0,
+            "fp": 0,
+            "fn": 0,
+            "unsupported": 0,
+            "confirmed": 0,
+            "partially_confirmed": 0,
+            "wrong_confirmed": 0,
+            "stable_replays": 0,
+            # Confirmed-verdict precision/recall accounting, ground-truth-derived
+            # (only populated when --ground-truth is supplied): confirmed_tp =
+            # Confirmed findings that match a GT positive; confirmed_fp =
+            # Confirmed findings that match no GT positive (false confirms).
+            "confirmed_tp": 0,
+            "confirmed_fp": 0,
+            "total": 0,
+        }
+    )
+
+    for f in findings:
+        cap = cap_of(f, static_lens=args.static)
+        lang = lang_of(f)
+        key = (cap, lang)
+        ev = f.get("evidence", {}) or {}
+        dv = ev.get("dynamic_verdict") if ev else None
+        cells[key]["total"] += 1
+        if dv:
+            status = dv.get("status")
+            if status == "Unsupported":
+                cells[key]["unsupported"] += 1
+            elif status == "PartiallyConfirmed":
+                cells[key]["partially_confirmed"] += 1
+            elif status == "Confirmed":
+                cells[key]["confirmed"] += 1
+                # Repro-stability and false-Confirmed counts are optional
+                # fields tabulate.py reads off the verdict when callers have
+                # stamped them.
+                if dv.get("wrong") is True:
+                    cells[key]["wrong_confirmed"] += 1
+                if dv.get("replay_stable") is True:
+                    cells[key]["stable_replays"] += 1
+
+    if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
+        gt = load_json(args.ground_truth)
+        # Ground truth format: list of {"path": ..., "line": ..., "cap": ..., "vuln": bool}
+        gt_true: list[dict] = []
+        for entry in gt if isinstance(gt, list) else []:
+            # Honour the same language scope as the findings filter so recall
+            # is measured only over the corpus's target language.
+            if lang_filter and lang_of(entry) not in lang_filter:
+                continue
+            if entry.get("vuln"):
+                gt_true.append({
+                    "path": entry.get("path", ""),
+                    "line": entry.get("line", 0),
+                    "cap": entry.get("cap", ""),
+                })
+
+        # Track which GT entries were matched (by index) to avoid double-counting.
+        matched_gt: set[int] = set()
+        # Track (path, cap) pairs that had at least one finding match.
+        found_path_caps: set[tuple[str, str]] = set()
+
+        for f in findings:
+            f_path = f.get("path", "")
+            f_line = f.get("line", 0)
+            f_cap = cap_of(f, static_lens=args.static)
+            cap = f_cap
+            lang = lang_of(f)
+            cell_key = (cap, lang)
+            dv = (f.get("evidence") or {}).get("dynamic_verdict") or {}
+            is_confirmed = dv.get("status") == "Confirmed"
+            matched_idx = None
+            for idx, gt_entry in enumerate(gt_true):
+                if (path_matches(gt_entry["path"], f_path)
+                        and gt_entry["cap"] == f_cap
+                        and idx not in matched_gt
+                        and (gt_entry["line"] == 0
+                             or abs(gt_entry["line"] - f_line) <= LINE_TOLERANCE)):
+                    matched_idx = idx
+                    break
+            if matched_idx is not None:
+                matched_gt.add(matched_idx)
+                found_path_caps.add((f_path, f_cap))
+                cells[cell_key]["tp"] += 1
+                if is_confirmed:
+                    cells[cell_key]["confirmed_tp"] += 1
+            else:
+                cells[cell_key]["fp"] += 1
+                if is_confirmed:
+                    cells[cell_key]["confirmed_fp"] += 1
+
+        for idx, gt_entry in enumerate(gt_true):
+            if idx not in matched_gt:
+                cap = gt_entry["cap"]
+                # Land the FN in the cell its source language implies (from the
+                # GT path extension) so per-(cap,lang) recall is meaningful and
+                # OWASP misses show up in the java cell, not a stray "unknown".
+                cells[(cap, lang_of(gt_entry))]["fn"] += 1
+
+        # Ground-truth-derived false-confirm accounting.  When a corpus ships a
+        # vuln/benign label per file (OWASP, SARD), a Confirmed finding that
+        # matches no GT positive is a false confirm — authoritative, so it
+        # overrides any manual-triage stamping for these labelled sets.  This is
+        # what makes the per-cell `false_confirmed_rate` budget non-vacuous on a
+        # fresh eval corpus without a host-local verify-feedback log.
+        for v in cells.values():
+            if v["confirmed_tp"] or v["confirmed_fp"]:
+                v["wrong_confirmed"] = v["confirmed_fp"]
+
+    result = {
+        "label": args.label,
+        "total_findings": len(findings),
+        "cells": [
+            {
+                "cap": k[0],
+                "lang": k[1],
+                **v,
+                "precision": v["tp"] / max(v["tp"] + v["fp"], 1),
+                "recall": v["tp"] / max(v["tp"] + v["fn"], 1),
+                "unsupported_rate": v["unsupported"] / max(v["total"], 1),
+            }
+            for k, v in sorted(cells.items())
+        ],
+    }
+
+    existing = load_json(args.append) if Path(args.append).exists() else []
+    existing.append(result)
+    with open(args.append, "w") as f:
+        json.dump(existing, f, indent=2)
+
+    # Print summary
+    print(f"\n=== {args.label} ===")
+    print(f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>6} {'Rec':>6} {'Unsup%':>7}")
+    print("-" * 72)
+    for c in result["cells"]:
+        print(
+            f"{c['cap']:<20} {c['lang']:<12} "
+            f"{c['tp']:>5} {c['fp']:>5} {c['fn']:>5} "
+            f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
+            f"{c['unsupported_rate']*100:>6.1f}%"
+        )
+
+    exit_rc = 0
+
+    # ── Phase 29: per-cell budget enforcement ─────────────────────────────
+    if args.budget:
+        budget = load_budget(args.budget)
+        failures = enforce_budget(result["cells"], budget)
+        if failures:
+            print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
+            for line in failures:
+                print(line)
+            exit_rc = 2
+        else:
+            print(f"\nPer-cell budget ({args.budget}): OK")
+
+    # ── Phase 29: diff against previous run ───────────────────────────────
+    if args.diff:
+        prev = load_previous_cells(args.diff, args.label)
+        failures = diff_regressions(result["cells"], prev)
+        if failures:
+            print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
+            for line in failures:
+                print(line)
+            exit_rc = 2
+        else:
+            print(f"\nDiff vs {args.diff}: no regressions")
+
+    return exit_rc
+
+
+# Script entry point: main() returns the exit code for the run, which is
+# passed on as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/eval_corpus/test_manifest_gt_convert.py
+++ b/tests/eval_corpus/test_manifest_gt_convert.py
@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""
+Phase 28 (Track R.1) regression test for tests/eval_corpus/manifest_gt_convert.py.
+
+Proves the manifest -> ground-truth converter is non-vacuous:
+  * a well-formed manifest converts to the expected sorted JSON,
+  * --corpus-dir validation passes when every labelled path exists and
+    produces byte-identical output to the no-corpus transform (so the CI
+    in-sync guard, which diffs committed vs a validated regen, is sound),
+  * --corpus-dir validation HARD-ERRORS (exit 2) on a missing path,
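+  * an unknown cap / duplicate (path,cap) / malformed TOML / a manifest with
+    no [[entry]] tables are rejected,
+  * the committed ground-truth JSON files (nodegoat, juiceshop and the
+    Track R.2 polyglot corpora) are exactly what a fresh conversion of their
+    manifests produces (offline half of the CI guard),
+  * a negative-control manifest converts to an empty list, and is rejected
+    if it also declares entries.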
+
+Run with::
+
+    python3 tests/eval_corpus/test_manifest_gt_convert.py
+
+Exits 0 when every assertion holds, non-zero otherwise.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+# Repository root: this file sits in tests/eval_corpus/, two directories down.
+REPO = Path(__file__).resolve().parents[2]
+# The converter script under test.
+CONVERT = REPO / "tests/eval_corpus/manifest_gt_convert.py"
+# Directory of committed <name>.manifest.toml / <name>.json ground-truth pairs.
+GT_DIR = REPO / "tests/eval_corpus/ground_truth"
+
+# Well-formed manifest with three entries (two vuln, one benign).  The entries
+# are listed out of path order, so the sorted-output assertion is non-trivial.
+GOOD_MANIFEST = """\
+corpus = "demo"
+upstream = "https://example.test/demo"
+pinned_ref = "v1"
+
+[[entry]]
+path = "routes/login.ts"
+cap = "sqli"
+vuln = true
+note = "raw SQL string-concat in login"
+
+[[entry]]
+path = "app/routes/contributions.js"
+cap = "cmdi"
+vuln = true
+note = "eval of user input"
+
+[[entry]]
+path = "lib/insecurity.ts"
+cap = "crypto"
+vuln = false
+note = "benign control example"
+"""
+
+
+def run_convert(*args: str) -> subprocess.CompletedProcess:
+    """Run manifest_gt_convert.py with *args* under the current interpreter.
+
+    stdout/stderr are captured as text.  The exit code is not checked here so
+    that each test can assert on it.
+    """
+    command = [sys.executable, str(CONVERT)]
+    command.extend(args)
+    return subprocess.run(command, capture_output=True, text=True)
+
+
+def test_transform_is_sorted_and_schema_clean(tmp: Path) -> None:
+    """Convert GOOD_MANIFEST and check ordering, schema and every label.
+
+    The output must be sorted by (path, cap), carry exactly the four
+    ground-truth fields (`note` dropped) with `line` set to 0, and keep the
+    cap / vuln label of each of the three entries.
+    """
+    man = tmp / "demo.manifest.toml"
+    man.write_text(GOOD_MANIFEST)
+    out = tmp / "demo.json"
+    proc = run_convert("--manifest", str(man), "--output", str(out))
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+    records = json.loads(out.read_text())
+    # Sorted by (path, cap); only the 4 GT fields; `note` dropped.
+    assert [r["path"] for r in records] == [
+        "app/routes/contributions.js",
+        "lib/insecurity.ts",
+        "routes/login.ts",
+    ], records
+    for r in records:
+        assert set(r) == {"path", "line", "cap", "vuln"}, r
+        assert r["line"] == 0, r
+    # Check the label of every record, including the last one; the path
+    # assertion above already pins the record count to three.
+    expected_labels = [("cmdi", True), ("crypto", False), ("sqli", True)]
+    for rec, (cap, vuln) in zip(records, expected_labels):
+        assert rec["cap"] == cap and rec["vuln"] is vuln, rec
+
+
+def test_corpus_validation_passes_and_matches_no_corpus(tmp: Path) -> None:
+    """--corpus-dir validation succeeds and leaves the output text unchanged."""
+    manifest_path = tmp / "demo.manifest.toml"
+    manifest_path.write_text(GOOD_MANIFEST)
+    # Materialise a stub file for every labelled path so validation can pass.
+    corpus_root = tmp / "corpus"
+    labelled = ("routes/login.ts", "app/routes/contributions.js", "lib/insecurity.ts")
+    for stub in (corpus_root / rel for rel in labelled):
+        stub.parent.mkdir(parents=True, exist_ok=True)
+        stub.write_text("// stub\n")
+    plain_out = tmp / "no_corpus.json"
+    validated_out = tmp / "with_corpus.json"
+    plain = run_convert("--manifest", str(manifest_path), "--output", str(plain_out))
+    assert plain.returncode == 0
+    validated = run_convert(
+        "--manifest", str(manifest_path),
+        "--corpus-dir", str(corpus_root),
+        "--output", str(validated_out),
+    )
+    assert validated.returncode == 0, validated.stdout + validated.stderr
+    # The CI guard diffs the committed file against a validated regen, so
+    # validation must not alter what gets written.
+    assert plain_out.read_text() == validated_out.read_text()
+    assert "validated against" in validated.stdout, validated.stdout
+
+
+def test_missing_path_exits_2(tmp: Path) -> None:
+    """A labelled path absent from --corpus-dir makes the converter exit 2."""
+    manifest_path = tmp / "demo.manifest.toml"
+    manifest_path.write_text(GOOD_MANIFEST)
+    corpus_root = tmp / "corpus"
+    # lib/insecurity.ts is left out of the corpus tree, so it must trip.
+    present = ("routes/login.ts", "app/routes/contributions.js")
+    for stub in (corpus_root / rel for rel in present):
+        stub.parent.mkdir(parents=True, exist_ok=True)
+        stub.write_text("// stub\n")
+    result = run_convert(
+        "--manifest", str(manifest_path),
+        "--corpus-dir", str(corpus_root),
+        "--output", str(tmp / "demo.json"),
+    )
+    assert result.returncode == 2, result.stdout + result.stderr
+    stderr = result.stderr
+    assert "lib/insecurity.ts" in stderr and "missing" in stderr, stderr
+
+
+def test_unknown_cap_rejected(tmp: Path) -> None:
+    """An entry whose cap is not a known nyx cap makes the converter exit 1."""
+    manifest_path = tmp / "bad_cap.manifest.toml"
+    manifest_path.write_text(
+        '[[entry]]\n'
+        'path = "a.js"\n'
+        'cap = "not_a_cap"\n'
+        'vuln = true\n'
+    )
+    result = run_convert(
+        "--manifest", str(manifest_path), "--output", str(tmp / "out.json")
+    )
+    assert result.returncode == 1, result.stdout + result.stderr
+    assert "not a known nyx cap" in result.stderr, result.stderr
+
+
+def test_duplicate_path_cap_rejected(tmp: Path) -> None:
+    """Two entries sharing the same (path, cap) make the converter exit 1."""
+    manifest_path = tmp / "dup.manifest.toml"
+    entry = '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
+    # The same entry written twice back to back.
+    manifest_path.write_text(entry * 2)
+    result = run_convert(
+        "--manifest", str(manifest_path), "--output", str(tmp / "out.json")
+    )
+    assert result.returncode == 1, result.stdout + result.stderr
+    assert "duplicate" in result.stderr, result.stderr
+
+
+def test_malformed_manifest_exits_1(tmp: Path) -> None:
+    """A manifest that is not valid TOML makes the converter exit 1."""
+    manifest_path = tmp / "broken.toml"
+    invalid_toml = "[[entry]\npath = \n"
+    manifest_path.write_text(invalid_toml)
+    result = run_convert(
+        "--manifest", str(manifest_path), "--output", str(tmp / "out.json")
+    )
+    assert result.returncode == 1, result.stdout + result.stderr
+    assert "malformed" in result.stderr, result.stderr
+
+
+def test_empty_manifest_exits_1(tmp: Path) -> None:
+    """A manifest with no [[entry]] tables makes the converter exit 1."""
+    manifest_path = tmp / "empty.toml"
+    # Valid TOML, but only a top-level key and no entries.
+    manifest_path.write_text('corpus = "x"\n')
+    result = run_convert(
+        "--manifest", str(manifest_path), "--output", str(tmp / "out.json")
+    )
+    assert result.returncode == 1, result.stdout + result.stderr
+    assert "no [[entry]]" in result.stderr, result.stderr
+
+
+def test_committed_gt_matches_manifest(tmp: Path) -> None:
+    """Each committed ground-truth JSON equals a fresh conversion of its manifest.
+
+    Offline half of the CI in-sync guard: catches a manifest edit that was
+    not followed by a regenerate.  The comparison is on the parsed JSON
+    values, not on the raw file text.
+    """
+    corpora = ["nodegoat", "juiceshop"]
+    # Track R.2 polyglot corpora (Phase 29).
+    corpora += ["railsgoat", "dvwa", "dvpwa", "gosec", "rustsec"]
+    for corpus in corpora:
+        manifest_path = GT_DIR / f"{corpus}.manifest.toml"
+        committed_path = GT_DIR / f"{corpus}.json"
+        assert manifest_path.exists(), f"missing manifest: {manifest_path}"
+        assert committed_path.exists(), f"missing committed GT: {committed_path}"
+        fresh_path = tmp / f"{corpus}.json"
+        result = run_convert(
+            "--manifest", str(manifest_path), "--output", str(fresh_path)
+        )
+        assert result.returncode == 0, result.stdout + result.stderr
+        fresh = json.loads(fresh_path.read_text())
+        committed = json.loads(committed_path.read_text())
+        assert fresh == committed, (
+            f"{committed_path} is stale — regenerate with manifest_gt_convert.py"
+        )
+
+
+def test_negative_control_emits_empty(tmp: Path) -> None:
+    """A negative-control manifest converts to an empty ground-truth list.
+
+    Such a manifest (no scannable source vulns, e.g. RustSec advisory-db)
+    declares `negative_control = true` and zero [[entry]] tables.
+    """
+    manifest_lines = (
+        'corpus = "rustsec"\n',
+        'upstream = "https://example.test/advisory-db"\n',
+        'pinned_ref = "main"\n',
+        "negative_control = true\n",
+    )
+    manifest_path = tmp / "neg.manifest.toml"
+    manifest_path.write_text("".join(manifest_lines))
+    out_path = tmp / "neg.json"
+    result = run_convert("--manifest", str(manifest_path), "--output", str(out_path))
+    assert result.returncode == 0, result.stdout + result.stderr
+    assert json.loads(out_path.read_text()) == [], out_path.read_text()
+    assert "negative-control corpus" in result.stdout, result.stdout
+
+
+def test_negative_control_with_entries_rejected(tmp: Path) -> None:
+    """negative_control combined with an [[entry]] makes the converter exit 1.
+
+    The flag and entries are mutually exclusive, so a real positive can never
+    be silently hidden behind the flag.
+    """
+    flag = "negative_control = true\n"
+    entry = '[[entry]]\npath = "a.rs"\ncap = "cmdi"\nvuln = true\n'
+    manifest_path = tmp / "neg_bad.manifest.toml"
+    manifest_path.write_text(flag + entry)
+    result = run_convert(
+        "--manifest", str(manifest_path), "--output", str(tmp / "neg_bad.json")
+    )
+    assert result.returncode == 1, result.stdout + result.stderr
+    stderr = result.stderr
+    assert "negative_control" in stderr and "zero" in stderr, stderr
+
+
+def main() -> int:
+    """Run every regression check in order, each in its own scratch directory.
+
+    Returns 0 when all checks pass.  A failing assert is not caught here, so
+    it propagates as an AssertionError.
+    """
+    checks = (
+        test_transform_is_sorted_and_schema_clean,
+        test_corpus_validation_passes_and_matches_no_corpus,
+        test_missing_path_exits_2,
+        test_unknown_cap_rejected,
+        test_duplicate_path_cap_rejected,
+        test_malformed_manifest_exits_1,
+        test_empty_manifest_exits_1,
+        test_committed_gt_matches_manifest,
+        test_negative_control_emits_empty,
+        test_negative_control_with_entries_rejected,
+    )
+    with tempfile.TemporaryDirectory() as scratch:
+        root = Path(scratch)
+        for check in checks:
+            name = check.__name__
+            # One sub-directory per check, named after the check function.
+            workdir = root / name
+            workdir.mkdir()
+            print(f"... {name}")
+            check(workdir)
+            print("    OK")
+    print("\nAll manifest_gt_convert.py regression checks passed.")
+    return 0
+
+
+# Script entry point: exit with main()'s return value (0 when every check
+# passed; a failed assert ends the run with a traceback instead).
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/eval_corpus/test_tabulate_regression.py
+++ b/tests/eval_corpus/test_tabulate_regression.py
@ -0,0 +1,771 @@
+#!/usr/bin/env python3
+"""
+Phase 29 (Track I) regression test for tests/eval_corpus/tabulate.py.
+
+Exercises --budget and --diff against hand-crafted scan + ground-truth
+fixtures so the per-cell budget gate and monotonic-improvement diff are
+demonstrably non-vacuous.
+
+Run with::
+
+    python3 tests/eval_corpus/test_tabulate_regression.py
+
+Exits 0 when every assertion holds, non-zero otherwise.  The asserts are
+plain `assert` statements so the file works both as a stand-alone script
+and under unittest discovery.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+# Repository root: this file sits in tests/eval_corpus/, two directories down.
+REPO = Path(__file__).resolve().parents[2]
+# Script under test; run as a subprocess by run_tabulate().
+TABULATE = REPO / "tests/eval_corpus/tabulate.py"
+# Report script; run as a subprocess by run_report().
+REPORT = REPO / "tests/eval_corpus/report.py"
+# Committed per-cell budget file that the budget tests pass to --budget.
+BUDGET = REPO / "tests/eval_corpus/budget.toml"
+
+
+def run_tabulate(*args: str) -> subprocess.CompletedProcess:
+    """Run tabulate.py with *args* under the current interpreter.
+
+    stdout/stderr are captured as text; the exit code is left for the caller
+    to assert on.
+    """
+    return subprocess.run(
+        [sys.executable, str(TABULATE), *args],
+        capture_output=True,
+        text=True,
+    )
+
+
+def run_report(*args: str) -> subprocess.CompletedProcess:
+    """Run report.py with *args* under the current interpreter.
+
+    stdout/stderr are captured as text; the exit code is left for the caller
+    to assert on.
+    """
+    return subprocess.run(
+        [sys.executable, str(REPORT), *args],
+        capture_output=True,
+        text=True,
+    )
+
+
+def write_json(path: Path, data: object) -> None:
+    """Serialise *data* as 2-space-indented JSON and write it to *path*."""
+    serialized = json.dumps(data, indent=2)
+    path.write_text(serialized)
+
+
+# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs.  The values
+# are duplicated here rather than imported — NOTE(review): keep them in sync
+# with those tables by hand.
+SINK_BIT_SQL = 1 << 7   # SQL_QUERY
+SINK_BIT_CMDI = 1 << 10  # CODE_EXEC
+SINK_BIT_SHELL = 1 << 2  # SHELL_ESCAPE (Java/other command-exec sink)
+SINK_BIT_FILE = 1 << 5   # FILE_IO (path_traversal)
+
+
+def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
+    """Build a minimal scan finding for the fixtures in this file.
+
+    The rule id is always "py.sqli.cursor_execute" and `col` is always 0.
+    `cap_bit` is stored as `evidence.sink_caps`.  A truthy `status` adds an
+    `evidence.dynamic_verdict` with that status; otherwise no verdict is
+    attached.
+    """
+    evidence: dict = {"sink_caps": cap_bit}
+    if status:
+        evidence["dynamic_verdict"] = {"status": status}
+    return {
+        "path": path,
+        "line": line,
+        "col": 0,
+        "id": "py.sqli.cursor_execute",
+        "evidence": evidence,
+    }
+
+
+def test_budget_passes_on_clean_scan(tmp: Path) -> None:
+    """A scan with no Unsupported verdicts must pass --budget with exit 0."""
+    verdicts = ((10, "Confirmed"), (20, "Confirmed"), (30, "NotConfirmed"))
+    scan_path = tmp / "scan_clean.json"
+    write_json(
+        scan_path,
+        {
+            "findings": [
+                python_finding(SINK_BIT_SQL, "app.py", line, status)
+                for line, status in verdicts
+            ]
+        },
+    )
+    results_path = tmp / "results_clean.json"
+    write_json(results_path, [])
+    cli = [
+        "--label", "test",
+        "--scan", str(scan_path),
+        "--inhouse",
+        "--append", str(results_path),
+        "--budget", str(BUDGET),
+    ]
+    proc = run_tabulate(*cli)
+    assert proc.returncode == 0, f"clean scan must pass budget, got rc={proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
+    assert "Per-cell budget" in proc.stdout and "OK" in proc.stdout, proc.stdout
+
+
+def test_budget_fails_when_unsupported_exceeds(tmp: Path) -> None:
+    """A cell that is 100% Unsupported must trip the budget gate (exit 2).
+
+    Every finding in the hand-crafted sqli/python cell is Unsupported, which
+    the test expects to exceed that cell's unsupported_rate in budget.toml.
+    """
+    unsupported = [
+        python_finding(SINK_BIT_SQL, "app.py", line, "Unsupported")
+        for line in range(10, 60, 10)
+    ]
+    scan_path = tmp / "scan_unsup.json"
+    write_json(scan_path, {"findings": unsupported})
+    results_path = tmp / "results_unsup.json"
+    write_json(results_path, [])
+    cli = [
+        "--label", "test",
+        "--scan", str(scan_path),
+        "--inhouse",
+        "--append", str(results_path),
+        "--budget", str(BUDGET),
+    ]
+    proc = run_tabulate(*cli)
+    assert proc.returncode == 2, (
+        f"budget breach must exit 2, got {proc.returncode}\n"
+        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
+    )
+    assert "FAIL" in proc.stdout and "sqli/python" in proc.stdout, proc.stdout
+
+
+def test_diff_fails_on_regression(tmp: Path) -> None:
+    """--diff must exit 2 when a cell's Unsupported rate rises between runs.
+
+    Previous run: 1/4 Unsupported = 25%.  Current run: 3/4 = 75%.  No
+    --budget is passed, so the monotonic-improvement diff alone has to flag
+    the +50pp regression.
+    """
+    def cmdi_findings(statuses: list[str]) -> list[dict]:
+        # One finding per status, on lines 1..n of x.unknown.
+        return [
+            python_finding(SINK_BIT_CMDI, "x.unknown", line, status)
+            for line, status in enumerate(statuses, start=1)
+        ]
+
+    # Seed the baseline results file with the "previous" run.
+    baseline_scan = tmp / "prev_scan.json"
+    write_json(
+        baseline_scan,
+        {"findings": cmdi_findings(["Confirmed"] * 3 + ["Unsupported"])},
+    )
+    baseline_results = tmp / "prev_results.json"
+    write_json(baseline_results, [])
+    seed = run_tabulate(
+        "--label", "diff-test",
+        "--scan", str(baseline_scan),
+        "--inhouse",
+        "--append", str(baseline_results),
+    )
+    seed_rc = seed.returncode
+    assert seed_rc == 0, f"prev seed run must succeed, got {seed_rc}"
+
+    # Current run: the verdict mix is inverted, then diffed against the seed.
+    current_scan = tmp / "cur_scan.json"
+    write_json(
+        current_scan,
+        {"findings": cmdi_findings(["Unsupported"] * 3 + ["Confirmed"])},
+    )
+    current_results = tmp / "cur_results.json"
+    write_json(current_results, [])
+    proc = run_tabulate(
+        "--label", "diff-test",
+        "--scan", str(current_scan),
+        "--inhouse",
+        "--append", str(current_results),
+        "--diff", str(baseline_results),
+    )
+    assert proc.returncode == 2, (
+        f"regression diff must exit 2, got {proc.returncode}\n"
+        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
+    )
+    assert "REGRESSION" in proc.stdout and "Unsupported" in proc.stdout, proc.stdout
+
+
+def test_diff_passes_on_improvement(tmp: Path) -> None:
+    """--diff must exit 0 when a cell's Unsupported rate falls between runs.
+
+    Previous: 3/4 Unsupported.  Current: 1/4.  Monotonic improvement must
+    not flag any regression.
+    """
+    prev_findings = [
+        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"),
+    ]
+    prev_scan = tmp / "prev_scan.json"
+    write_json(prev_scan, {"findings": prev_findings})
+    prev_results = tmp / "prev_results.json"
+    write_json(prev_results, [])
+    seed = run_tabulate(
+        "--label", "improve-test",
+        "--scan", str(prev_scan),
+        "--inhouse",
+        "--append", str(prev_results),
+    )
+    # Without this check a failed seed run would leave prev_results empty and
+    # the diff below would report "no regressions" vacuously.
+    assert seed.returncode == 0, (
+        f"prev seed run must succeed, got {seed.returncode}\n"
+        f"stdout: {seed.stdout}\nstderr: {seed.stderr}"
+    )
+
+    cur_findings = [
+        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"),
+        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"),
+    ]
+    cur_scan = tmp / "cur_scan.json"
+    write_json(cur_scan, {"findings": cur_findings})
+    cur_results = tmp / "cur_results.json"
+    write_json(cur_results, [])
+    proc = run_tabulate(
+        "--label", "improve-test",
+        "--scan", str(cur_scan),
+        "--inhouse",
+        "--append", str(cur_results),
+        "--diff", str(prev_results),
+    )
+    assert proc.returncode == 0, (
+        f"improvement diff must exit 0, got {proc.returncode}\n"
+        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
+    )
+    assert "no regressions" in proc.stdout, proc.stdout
+
+
+def test_manual_triage_stamps_wrong_confirmed(tmp: Path) -> None:
+    """--manual-triage counts a Confirmed finding near a vuln:false row as wrong.
+
+    Phase 31 follow-up: Confirmed findings are cross-referenced against a
+    list of {path, line, cap, vuln: false} entries so the per-cell
+    wrong_confirmed counter is non-vacuous without the host's
+    verify-feedback log.
+
+    The Confirmed finding at line 10 matches the triage row at line 12
+    (within LINE_TOLERANCE=5); the one at line 100 matches nothing, so the
+    cell must end at wrong_confirmed 1 out of 2 Confirmed.
+    """
+    scan_path = tmp / "scan.json"
+    confirmed = [
+        python_finding(SINK_BIT_SQL, "app.py", line, "Confirmed")
+        for line in (10, 100)
+    ]
+    write_json(scan_path, {"findings": confirmed})
+    triage_path = tmp / "triage.json"
+    benign_row = {"path": "app.py", "line": 12, "cap": "sqli", "vuln": False}
+    write_json(triage_path, [benign_row])
+    results_path = tmp / "results.json"
+    write_json(results_path, [])
+    proc = run_tabulate(
+        "--label", "triage-test",
+        "--scan", str(scan_path),
+        "--inhouse",
+        "--append", str(results_path),
+        "--manual-triage", str(triage_path),
+    )
+    assert proc.returncode == 0, (
+        f"manual-triage run must succeed without budget, got {proc.returncode}\n"
+        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
+    )
+    # Index the cells of the run that was just appended by (cap, lang).
+    latest_run = json.loads(results_path.read_text())[-1]
+    cells = {}
+    for cell in latest_run["cells"]:
+        cells[(cell["cap"], cell["lang"])] = cell
+    sqli_py = cells.get(("sqli", "python"))
+    assert sqli_py is not None, f"expected sqli/python cell, got {list(cells)}"
+    assert sqli_py["confirmed"] == 2, sqli_py
+    assert sqli_py["wrong_confirmed"] == 1, (
+        "exactly one Confirmed finding must be stamped wrong via the triage match; "
+        f"got {sqli_py}"
+    )
+
+
+def test_manual_triage_ignores_vuln_true_entries(tmp: Path) -> None:
+    """--manual-triage must not count a finding matching a vuln:true row as wrong.
+
+    Triage entries with `vuln: true` are ground-truth-positive markers, not
+    False-Confirmed evidence, so a real Confirmed-on-vuln-true row must not
+    be downgraded.
+    """
+    scan = tmp / "scan.json"
+    write_json(
+        scan,
+        {
+            "findings": [
+                python_finding(SINK_BIT_SQL, "app.py", 10, "Confirmed"),
+            ]
+        },
+    )
+    triage = tmp / "triage.json"
+    write_json(
+        triage,
+        [
+            {"path": "app.py", "line": 10, "cap": "sqli", "vuln": True},
+        ],
+    )
+    append = tmp / "results.json"
+    write_json(append, [])
+    proc = run_tabulate(
+        "--label", "triage-true-test",
+        "--scan", str(scan),
+        "--inhouse",
+        "--append", str(append),
+        "--manual-triage", str(triage),
+    )
+    # Carry the process output in the failure message so a crash in the
+    # script under test is diagnosable from the test log.
+    assert proc.returncode == 0, (
+        f"manual-triage run must succeed, got {proc.returncode}\n"
+        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
+    )
+    results = json.loads(append.read_text())
+    cells = {(c["cap"], c["lang"]): c for c in results[-1]["cells"]}
+    # Report the cells that do exist instead of raising a bare KeyError.
+    sqli_py = cells.get(("sqli", "python"))
+    assert sqli_py is not None, f"expected sqli/python cell, got {list(cells)}"
+    assert sqli_py["confirmed"] == 1, sqli_py
+    assert sqli_py["wrong_confirmed"] == 0, (
+        f"vuln:true triage rows must not stamp wrong; got {sqli_py}"
+    )
+
+
+def test_lang_filter_scopes_findings_and_gt(tmp: Path) -> None:
+    """--lang must drop both findings and ground-truth entries outside its scope.
+
+    Phase 29 (Track R.2): a single-language corpus is scoped to its target
+    language so incidental other-language assets (e.g. the vendored
+    JavaScript a Rails app bundles) do not pollute its per-cap metrics.
+    """
+    gt_path = tmp / "gt.json"
+    gt_entries = [
+        {"path": rel, "line": 0, "cap": "sqli", "vuln": True}
+        for rel in ("app/models/user.rb", "app/assets/lib.js")
+    ]
+    write_json(gt_path, gt_entries)
+    ruby_finding = python_finding(
+        SINK_BIT_SQL, "/x/app/models/user.rb", 10, "NotConfirmed"
+    )
+    # A Confirmed vendored-JS finding; `--lang ruby` must exclude it entirely.
+    js_finding = python_finding(
+        SINK_BIT_SQL, "/x/app/assets/lib.js", 10, "Confirmed"
+    )
+    scan_path = tmp / "scan.json"
+    write_json(scan_path, {"findings": [ruby_finding, js_finding]})
+    base_cli = [
+        "--label", "railsgoat",
+        "--scan", str(scan_path),
+        "--ground-truth", str(gt_path),
+    ]
+
+    # Without --lang, a cell appears for each of the two languages.
+    unscoped_out = tmp / "unscoped.json"
+    write_json(unscoped_out, [])
+    proc = run_tabulate(*base_cli, "--append", str(unscoped_out))
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+    latest = json.loads(unscoped_out.read_text())[-1]
+    cells = {(c["cap"], c["lang"]) for c in latest["cells"]}
+    assert ("sqli", "ruby") in cells and ("sqli", "javascript") in cells, cells
+
+    # With --lang ruby, the JS finding AND the JS ground-truth positive vanish.
+    scoped_out = tmp / "scoped.json"
+    write_json(scoped_out, [])
+    proc = run_tabulate(*base_cli, "--lang", "ruby", "--append", str(scoped_out))
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+    latest = json.loads(scoped_out.read_text())[-1]
+    cells = {(c["cap"], c["lang"]): c for c in latest["cells"]}
+    assert ("sqli", "javascript") not in cells, f"JS must be filtered out: {list(cells)}"
+    ruby = cells[("sqli", "ruby")]
+    assert ruby["tp"] == 1 and ruby["fn"] == 0, ruby
+    # The dropped JS positive must NOT resurface as a phantom FN in any cell.
+    assert not any(lang == "javascript" for _cap, lang in cells), cells
+
+
+def test_static_lens_buckets_shell_escape_as_cmdi(tmp: Path) -> None:
+    """--static must bucket a SHELL_ESCAPE-only finding as cmdi, not other.
+
+    Caveat-1 fix: with 0 dynamic confirmations a Java command-exec finding
+    carries only SHELL_ESCAPE (1<<2), which the default bit table leaves in
+    "other", so the cmdi cell reads 0 TP / N FN regardless of static quality.
+    --static appends SHELL_ESCAPE→cmdi so static recall is measurable.
+    """
+    gt_path = tmp / "gt.json"
+    gt_entry = {"path": "testcode/Cmd.java", "line": 0, "cap": "cmdi", "vuln": True}
+    write_json(gt_path, [gt_entry])
+    # Built by hand rather than via python_finding: real Java taint findings
+    # carry id "taint-unsanitised-flow" (no cap substring), so the rule-id
+    # fallback yields "other" instead of the sqli/cmdi the helper's id implies.
+    verdict = {"status": "NotConfirmed"}
+    java_cmdi = {
+        "path": "/x/testcode/Cmd.java",
+        "line": 10,
+        "col": 0,
+        "id": "taint-unsanitised-flow",
+        "evidence": {"sink_caps": SINK_BIT_SHELL, "dynamic_verdict": verdict},
+    }
+    scan_path = tmp / "scan.json"
+    write_json(scan_path, {"findings": [java_cmdi]})
+    base_cli = [
+        "--label", "owasp",
+        "--scan", str(scan_path),
+        "--ground-truth", str(gt_path),
+    ]
+    cmdi_java = ("cmdi", "java")
+    other_java = ("other", "java")
+
+    # Default lens: the finding lands in "other", so the cmdi GT positive is
+    # a pure FN (recall 0) — the measurement gap.
+    default_out = tmp / "default.json"
+    write_json(default_out, [])
+    proc = run_tabulate(*base_cli, "--append", str(default_out))
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+    latest = json.loads(default_out.read_text())[-1]
+    cells = {(c["cap"], c["lang"]): c for c in latest["cells"]}
+    assert cmdi_java in cells and cells[cmdi_java]["tp"] == 0, cells
+    assert cells[cmdi_java]["fn"] == 1, cells[cmdi_java]
+    assert other_java in cells, f"SHELL_ESCAPE must bucket as other by default: {list(cells)}"
+
+    # Static lens: the finding lands in cmdi, so recall is measurable (TP=1, FN=0).
+    static_out = tmp / "static.json"
+    write_json(static_out, [])
+    proc = run_tabulate(*base_cli, "--static", "--append", str(static_out))
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+    latest = json.loads(static_out.read_text())[-1]
+    cells = {(c["cap"], c["lang"]): c for c in latest["cells"]}
+    cmdi = cells[cmdi_java]
+    assert cmdi["tp"] == 1 and cmdi["fn"] == 0, cmdi
+    assert other_java not in cells, f"static lens must reclaim the other-bucketed finding: {list(cells)}"
+
+
+def test_static_lens_preserves_higher_priority_bits(tmp: Path) -> None:
+    """A FILE_IO|SHELL_ESCAPE finding stays path_traversal with or without --static.
+
+    SHELL_ESCAPE is appended at lowest priority under the static lens, so the
+    lens must never steal a finding from a non-cmdi cell.
+    """
+    both_bits = SINK_BIT_FILE | SINK_BIT_SHELL
+    scan_path = tmp / "scan.json"
+    write_json(
+        scan_path,
+        {"findings": [python_finding(both_bits, "B.java", 10, "NotConfirmed")]},
+    )
+    # Run once with the default lens and once with --static.
+    for flag in ([], ["--static"]):
+        # out0.json for the default lens, out1.json for the static lens.
+        results_path = tmp / f"out{len(flag)}.json"
+        write_json(results_path, [])
+        cli = [
+            "--label", "x",
+            "--scan", str(scan_path),
+            "--inhouse",
+            "--append", str(results_path),
+        ]
+        cli.extend(flag)
+        proc = run_tabulate(*cli)
+        assert proc.returncode == 0, proc.stdout + proc.stderr
+        latest = json.loads(results_path.read_text())[-1]
+        caps = {cell["cap"] for cell in latest["cells"]}
+        assert caps == {"path_traversal"}, f"flag={flag}: {caps}"
+
+
+def test_budget_malformed_exits_3(tmp: Path) -> None:
+    """A --budget file that is not valid TOML must make tabulate.py exit 3."""
+    bad_budget = tmp / "bad.toml"
+    # `not_a_number` is a bare word, which is not a valid TOML value.
+    bad_budget.write_text("[default]\nunsupported_rate = not_a_number\n")
+    scan_path = tmp / "scan.json"
+    write_json(scan_path, {"findings": []})
+    results_path = tmp / "results.json"
+    write_json(results_path, [])
+    cli = [
+        "--label", "test",
+        "--scan", str(scan_path),
+        "--inhouse",
+        "--append", str(results_path),
+        "--budget", str(bad_budget),
+    ]
+    proc = run_tabulate(*cli)
+    assert proc.returncode == 3, (
+        f"malformed budget must exit 3, got {proc.returncode}\nstderr: {proc.stderr}"
+    )
+
+
+def test_relative_gt_path_suffix_matches_absolute_finding(tmp: Path) -> None:
+    """A corpus-relative GT path must suffix-match an absolute finding path.
+
+    Phase 27: ground truth stores corpus-relative paths while nyx emits
+    absolute ones; suffix matching keeps the committed JSON portable across
+    machines / CI checkouts.
+    """
+    testcode_dir = "src/main/java/org/owasp/benchmark/testcode"
+    checkout = "/home/ci/work/owasp"
+    gt_path = tmp / "gt.json"
+    write_json(
+        gt_path,
+        [
+            {
+                "path": f"{testcode_dir}/BenchmarkTest1.java",
+                "line": 0,
+                "cap": "sqli",
+                "vuln": True,
+            }
+        ],
+    )
+    # Absolute path with the GT relative path as a suffix → TP.
+    labelled_hit = python_finding(
+        SINK_BIT_SQL, f"{checkout}/{testcode_dir}/BenchmarkTest1.java", 10, "Confirmed"
+    )
+    # Different file under the same corpus → no GT positive → FP.
+    unlabelled_hit = python_finding(
+        SINK_BIT_SQL, f"{checkout}/{testcode_dir}/BenchmarkTest2.java", 10, "NotConfirmed"
+    )
+    scan_path = tmp / "scan.json"
+    write_json(scan_path, {"findings": [labelled_hit, unlabelled_hit]})
+    results_path = tmp / "results.json"
+    write_json(results_path, [])
+    proc = run_tabulate(
+        "--label", "owasp",
+        "--scan", str(scan_path),
+        "--ground-truth", str(gt_path),
+        "--append", str(results_path),
+    )
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+    latest = json.loads(results_path.read_text())[-1]
+    cells = {(c["cap"], c["lang"]): c for c in latest["cells"]}
+    sqli_java = cells[("sqli", "java")]
+    assert sqli_java["tp"] == 1, f"relative GT path must suffix-match absolute finding: {sqli_java}"
+    assert sqli_java["fp"] == 1, f"benign-file finding must count as FP: {sqli_java}"
+    assert sqli_java["fn"] == 0, sqli_java
+
+
+def test_unmatched_gt_positive_lands_in_lang_cell(tmp: Path) -> None:
+    # Phase 27: a ground-truth positive that no finding matches counts as a FN.
+    # That FN has to be filed under the language its file extension implies
+    # (java) instead of a stray "unknown" lang cell; otherwise per-cap recall
+    # aggregation is not meaningful.
+    missed_positive = {
+        "path": "src/main/java/org/owasp/benchmark/testcode/BenchmarkTest9.java",
+        "line": 0,
+        "cap": "sqli",
+        "vuln": True,
+    }
+    gt = tmp / "gt.json"
+    write_json(gt, [missed_positive])
+
+    # The scan reports nothing, so the GT positive above stays unmatched.
+    scan = tmp / "scan.json"
+    write_json(scan, {"findings": []})
+
+    append = tmp / "results.json"
+    write_json(append, [])
+
+    cli_args = [
+        "--label", "owasp",
+        "--scan", str(scan),
+        "--ground-truth", str(gt),
+        "--append", str(append),
+    ]
+    proc = run_tabulate(*cli_args)
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+
+    latest_run = json.loads(append.read_text())[-1]
+    cells = {(c["cap"], c["lang"]): c for c in latest_run["cells"]}
+    java_key = ("sqli", "java")
+    assert java_key in cells, f"FN must land in the java cell: {list(cells)}"
+    assert cells[java_key]["fn"] == 1, cells[java_key]
+    assert ("sqli", "unknown") not in cells, f"no stray unknown-lang cell: {list(cells)}"
+
+
+def test_gt_grounded_false_confirm(tmp: Path) -> None:
+    # Phase 27: given full ground truth, a Confirmed finding with no matching
+    # GT positive is a false confirm.  It is derived from the GT alone, so no
+    # manual-triage file is involved.  Expected bookkeeping: the vuln file
+    # yields confirmed_tp, the benign file yields confirmed_fp and therefore
+    # wrong_confirmed.  This keeps false_confirmed_rate non-vacuous on a fresh
+    # corpus.
+    gt_rows = [
+        {"path": "testcode/Vuln.java", "line": 0, "cap": "sqli", "vuln": True},
+        {"path": "testcode/Benign.java", "line": 0, "cap": "sqli", "vuln": False},
+    ]
+    gt = tmp / "gt.json"
+    write_json(gt, gt_rows)
+
+    # Both findings are Confirmed; only the first sits on a GT positive.
+    correct_confirm = python_finding(SINK_BIT_SQL, "/x/testcode/Vuln.java", 10, "Confirmed")
+    false_confirm = python_finding(SINK_BIT_SQL, "/x/testcode/Benign.java", 10, "Confirmed")
+    scan = tmp / "scan.json"
+    write_json(scan, {"findings": [correct_confirm, false_confirm]})
+
+    append = tmp / "results.json"
+    write_json(append, [])
+
+    cli_args = [
+        "--label", "owasp",
+        "--scan", str(scan),
+        "--ground-truth", str(gt),
+        "--append", str(append),
+    ]
+    proc = run_tabulate(*cli_args)
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+
+    latest_run = json.loads(append.read_text())[-1]
+    cells = {(c["cap"], c["lang"]): c for c in latest_run["cells"]}
+    sqli_java = cells[("sqli", "java")]
+    assert sqli_java["confirmed_tp"] == 1, sqli_java
+    assert sqli_java["confirmed_fp"] == 1, sqli_java
+    assert sqli_java["wrong_confirmed"] == 1, (
+        f"benign-file Confirmed must be a GT-derived false confirm: {sqli_java}"
+    )
+
+
+def test_budget_confirmed_rate_floor(tmp: Path) -> None:
+    # Phase 27: a budget.toml cell may set `confirmed_rate`, a per-cell minimum.
+    # Here the sqli/java cell demands at least 40% Confirmed.
+    budget = tmp / "budget.toml"
+    budget_lines = (
+        "[default]\n",
+        "[[cell]]\n",
+        'cap = "sqli"\n',
+        'lang = "java"\n',
+        "confirmed_rate = 0.40\n",
+    )
+    budget.write_text("".join(budget_lines))
+
+    def run_case(scan_name, results_name, verdicts):
+        # One finding per verdict, all in B.java, on lines 10, 20, 30, ...
+        findings = [
+            python_finding(SINK_BIT_SQL, "B.java", line, verdict)
+            for line, verdict in zip(range(10, 10 * len(verdicts) + 1, 10), verdicts)
+        ]
+        scan_path = tmp / scan_name
+        write_json(scan_path, {"findings": findings})
+        results_path = tmp / results_name
+        write_json(results_path, [])
+        return run_tabulate(
+            "--label", "owasp",
+            "--scan", str(scan_path),
+            "--inhouse",
+            "--append", str(results_path),
+            "--budget", str(budget),
+        )
+
+    # 1 Confirmed of 5 (20%) is below the 40% floor → exit 2 with a report line.
+    proc = run_case(
+        "scan_fail.json",
+        "results_fail.json",
+        ["Confirmed"] + ["NotConfirmed"] * 4,
+    )
+    assert proc.returncode == 2, proc.stdout + proc.stderr
+    assert "Confirmed" in proc.stdout and "sqli/java" in proc.stdout, proc.stdout
+
+    # 3 Confirmed of 5 (60%) clears the floor.
+    proc = run_case(
+        "scan_ok.json",
+        "results_ok.json",
+        ["Confirmed"] * 3 + ["NotConfirmed"] * 2,
+    )
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+
+
+def test_report_precision_recall_floors(tmp: Path) -> None:
+    # Phase 27: report.py's --min-precision / --min-recall options enforce
+    # per-cap floors aggregated across langs.  With floors 0.85 / 0.40:
+    #   cmdi            precision 0.20              → trips the precision floor
+    #   ldap_injection  recall    0.10              → trips the recall floor
+    #   sqli            precision 1.0, recall 0.90  → clears both
+    def make_cell(cap, lang, tp, fp, fn):
+        # Only tp/fp/fn differ between cells; every other counter is zero.
+        record = {"cap": cap, "lang": lang, "tp": tp, "fp": fp, "fn": fn}
+        for zeroed in (
+            "unsupported",
+            "confirmed",
+            "partially_confirmed",
+            "wrong_confirmed",
+            "stable_replays",
+        ):
+            record[zeroed] = 0
+        record["total"] = tp + fp + fn
+        return record
+
+    floor_args = ("--min-precision", "0.85", "--min-recall", "0.40")
+
+    results = tmp / "results.json"
+    mixed_cells = [
+        make_cell("sqli", "java", 9, 0, 1),   # prec 1.00, rec 0.90 → OK
+        make_cell("cmdi", "java", 1, 4, 0),   # prec 0.20 → FAIL precision
+        make_cell("ldap_injection", "java", 1, 0, 9),  # rec 0.10 → FAIL recall
+    ]
+    write_json(
+        results,
+        [{"label": "owasp", "total_findings": 0, "cells": mixed_cells}],
+    )
+    proc = run_report("--results", str(results), *floor_args)
+    assert proc.returncode == 2, proc.stdout + proc.stderr
+    assert "PRECISION" in proc.stdout and "cmdi" in proc.stdout, proc.stdout
+    assert "RECALL" in proc.stdout and "ldap_injection" in proc.stdout, proc.stdout
+
+    # Clean: only the passing sqli cap.
+    clean = tmp / "clean.json"
+    passing_cells = [make_cell("sqli", "java", 9, 0, 1)]
+    write_json(
+        clean,
+        [{"label": "owasp", "total_findings": 0, "cells": passing_cells}],
+    )
+    proc = run_report("--results", str(clean), *floor_args)
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+    assert "All per-cap precision/recall floors met" in proc.stdout, proc.stdout
+
+
+def test_report_confirmed_rate_floor(tmp: Path) -> None:
+    # report.py --min-confirmed-rate against a single sqli/java cell holding
+    # 2 Confirmed out of 5 total (a rate of 0.40).
+    lone_cell = {
+        "cap": "sqli",
+        "lang": "java",
+        "tp": 0,
+        "fp": 0,
+        "fn": 0,
+        "unsupported": 0,
+        "confirmed": 2,
+        "wrong_confirmed": 0,
+        "stable_replays": 0,
+        "total": 5,
+    }
+    results = tmp / "results.json"
+    write_json(
+        results,
+        [{"label": "owasp", "total_findings": 5, "cells": [lone_cell]}],
+    )
+
+    def report_with_floor(floor):
+        return run_report("--results", str(results), "--min-confirmed-rate", floor)
+
+    # A floor equal to the measured rate is still met.
+    proc = report_with_floor("0.40")
+    assert proc.returncode == 0, proc.stdout + proc.stderr
+    assert "All confirmed-rate floors met" in proc.stdout, proc.stdout
+
+    # A floor above the measured rate fails with exit code 2 and names the cap.
+    proc = report_with_floor("0.50")
+    assert proc.returncode == 2, proc.stdout + proc.stderr
+    assert "FAIL" in proc.stdout and "sqli" in proc.stdout, proc.stdout
+
+
+def main() -> int:
+    # Run every regression check in the order listed below and return 0 once
+    # all of them have passed.
+    #
+    # Each check gets its own empty sub-directory (named after the check) of a
+    # single temporary directory, which is removed when the `with` block
+    # exits.  No exception is caught here, so the first failing assertion
+    # propagates out of main() and the remaining checks do not run.
+    with tempfile.TemporaryDirectory() as td:
+        tmp = Path(td)
+        for fn in (
+            test_budget_passes_on_clean_scan,
+            test_budget_fails_when_unsupported_exceeds,
+            test_diff_fails_on_regression,
+            test_diff_passes_on_improvement,
+            test_manual_triage_stamps_wrong_confirmed,
+            test_manual_triage_ignores_vuln_true_entries,
+            test_lang_filter_scopes_findings_and_gt,
+            test_static_lens_buckets_shell_escape_as_cmdi,
+            test_static_lens_preserves_higher_priority_bits,
+            test_budget_malformed_exits_3,
+            test_relative_gt_path_suffix_matches_absolute_finding,
+            test_unmatched_gt_positive_lands_in_lang_cell,
+            test_gt_grounded_false_confirm,
+            test_budget_confirmed_rate_floor,
+            test_report_precision_recall_floors,
+            test_report_confirmed_rate_floor,
+        ):
+            # A per-check directory keeps identically named fixture files
+            # (scan.json, results.json, ...) of different checks apart.
+            sub = tmp / fn.__name__
+            sub.mkdir()
+            print(f"... {fn.__name__}")
+            fn(sub)
+            # Plain literal: the string has no placeholders, so no f-prefix.
+            print("    OK")
+    print("\nAll tabulate.py regression checks passed.")
+    return 0
+
+
+# Script entry point: run the checks only when this file is executed directly,
+# and hand main()'s return value to the shell as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())