[pitboss] phase 29: Track I — Per-cell budgets, --diff, fixture prerequisites, CI matrix expansion

2026-06-21 20:18:06 +02:00 · 2026-05-15 19:22:40 -05:00 · 2026-05-15 19:22:40 -05:00 · dd607fb4b3
commit dd607fb4b3
parent 760bc1beb2
10 changed files with 1325 additions and 32 deletions
--- a/tests/eval_corpus/tabulate.py
+++ b/tests/eval_corpus/tabulate.py
@ -9,6 +9,17 @@ For in-house sets (--inhouse): counts findings by cap x language; reports
 Unsupported rate only (no ground truth required).

 Output: appends a result record to --append FILE.
+
+Phase 29 (Track I) extensions:
+  --budget tests/eval_corpus/budget.toml   enforce per-cell budget thresholds
+  --diff   previous.json                   compare against prior result file,
+                                           fail on monotonic-improvement
+                                           regression
+
+Exit codes:
+  0  all rows pass.
+  2  one or more per-cell budgets exceeded OR a diff regression was found.
+  3  malformed budget / diff input (callers must fix configuration).
 """

 import argparse
@ -17,6 +28,11 @@ import sys
 from collections import defaultdict
 from pathlib import Path

+try:
+    import tomllib  # Python 3.11+
+except ModuleNotFoundError:  # pragma: no cover — older interpreters only
+    import tomli as tomllib  # type: ignore[no-redef]
+
 LINE_TOLERANCE = 5

 # Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
@ -97,6 +113,203 @@ def lang_of(finding: dict) -> str:
    return "unknown"


+# ── Budget loading ──────────────────────────────────────────────────────────
+
+
+def load_budget(path: str) -> dict:
+    """Parse a budget.toml file into default thresholds and per-cell overrides.
+
+    Returns a dict::
+
+        {
+            "default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02,
+                        "repro_stability": 0.95, "ratchet_deadline": "..."},
+            "cells": {(cap, lang): {...overrides...}, ...},
+        }
+
+    ``default`` must be a table and ``cell`` an array of tables (``[[cell]]``)
+    whose rows each carry a non-empty string ``cap`` and ``lang``. If two rows
+    name the same (cap, lang) pair, the later row wins.
+
+    Raises SystemExit(3) when the file is missing, unreadable, not valid
+    TOML, or does not have the shape described above.
+    """
+
+    def fail(detail: str) -> None:
+        # Exit code 3 is the script's "malformed budget / diff input" code.
+        print(f"ERROR  {detail}", file=sys.stderr)
+        sys.exit(3)
+
+    try:
+        with open(path, "rb") as f:
+            raw = tomllib.load(f)
+    except FileNotFoundError:
+        fail(f"budget file not found: {path}")
+    except tomllib.TOMLDecodeError as e:
+        fail(f"budget file malformed: {path}: {e}")
+    except (OSError, UnicodeDecodeError) as e:
+        # A directory, a permission problem or non-UTF-8 bytes would otherwise
+        # surface as a traceback instead of the documented exit code.
+        fail(f"budget file unreadable: {path}: {e}")
+
+    default = raw.get("default") or {}
+    if not isinstance(default, dict):
+        fail(f"budget [default] must be a table: {default!r}")
+
+    rows = raw.get("cell") or []
+    if not isinstance(rows, list):
+        # `[cell]` (single table) instead of `[[cell]]` parses to a dict;
+        # iterating it would yield key strings rather than rows.
+        fail(f"budget cells must be an array of tables ([[cell]]): {rows!r}")
+
+    cells = {}
+    for row in rows:
+        if not isinstance(row, dict):
+            fail(f"budget cell is not a table: {row!r}")
+        cap = row.get("cap")
+        lang = row.get("lang")
+        # Both halves of the key must be non-empty strings so the row can be
+        # matched against a measured (cap, lang) cell.
+        if not (isinstance(cap, str) and cap and isinstance(lang, str) and lang):
+            fail(f"budget cell missing cap/lang: {row!r}")
+        cells[(cap, lang)] = row
+
+    return {"default": default, "cells": cells}
+
+
+def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
+    """Return the effective thresholds for one (cap, lang) cell.
+
+    Starts from a copy of the ``default`` table and layers on at most one
+    override row: the exact (cap, lang) row if present, otherwise the first
+    wildcard row found in the order (cap, "*"), ("*", lang), ("*", "*").
+    The ``cap`` / ``lang`` key fields of the row are not copied.
+    """
+    table = budget.get("cells", {})
+    thresholds = dict(budget.get("default", {}) or {})
+
+    # Candidate keys in precedence order; only the first hit is applied.
+    override = None
+    for key in ((cap, lang), (cap, "*"), ("*", lang), ("*", "*")):
+        override = table.get(key)
+        if override:
+            break
+
+    if override:
+        for name, value in override.items():
+            if name not in ("cap", "lang"):
+                thresholds[name] = value
+    return thresholds
+
+
+def enforce_budget(cells: list, budget: dict) -> list:
+    """Check each measured cell against its merged budget row.
+
+    Returns a list of human-readable failure strings, in cell order and,
+    within a cell, in the order Unsupported, false-Confirmed, repro
+    stability. A threshold is only applied when it is numeric and the cell
+    has data for it; a missing measurement (e.g. no Confirmed findings, so
+    the false-Confirmed denominator is 0) is skipped, never failed. Cells
+    whose merged budget is empty are skipped entirely.
+    """
+
+    def is_number(value) -> bool:
+        return isinstance(value, (int, float))
+
+    problems = []
+    for cell in cells:
+        limits = budget_for_cell(budget, cell["cap"], cell["lang"])
+        if not limits:
+            continue
+
+        tag = f"  FAIL  {cell['cap']}/{cell['lang']}:"
+        unsup_ceiling = limits.get("unsupported_rate")
+        false_ceiling = limits.get("false_confirmed_rate")
+        stable_floor = limits.get("repro_stability")
+
+        # Unsupported rate: upper bound, needs at least one finding.
+        if is_number(unsup_ceiling) and cell.get("total", 0) > 0:
+            observed = cell["unsupported_rate"]
+            if observed > unsup_ceiling:
+                problems.append(
+                    f"{tag} Unsupported {observed*100:.1f}%"
+                    f" > budget {unsup_ceiling*100:.1f}%"
+                )
+
+        # False-Confirmed rate: upper bound, needs at least one Confirmed.
+        if is_number(false_ceiling) and cell.get("confirmed", 0) > 0:
+            false_rate = cell.get("wrong_confirmed", 0) / cell["confirmed"]
+            if false_rate > false_ceiling:
+                problems.append(
+                    f"{tag} false-Confirmed {false_rate*100:.1f}%"
+                    f" > budget {false_ceiling*100:.1f}%"
+                )
+
+        # Repro stability: lower bound. A stable_replays count of 0 cannot be
+        # told apart from "stability was never stamped for this row", so the
+        # check only runs once at least one stable replay has been recorded;
+        # otherwise it would fail every run that did not measure stability.
+        stability_measured = (
+            is_number(stable_floor)
+            and cell.get("confirmed", 0) > 0
+            and cell.get("stable_replays", 0) > 0
+        )
+        if stability_measured:
+            stable_rate = cell["stable_replays"] / cell["confirmed"]
+            if stable_rate < stable_floor:
+                problems.append(
+                    f"{tag} repro stability {stable_rate*100:.1f}%"
+                    f" < budget {stable_floor*100:.1f}%"
+                )
+    return problems
+
+
+# ── Diff loading ────────────────────────────────────────────────────────────
+
+
+def load_previous_cells(path: str, label: str) -> dict:
+    """Index a previous results file by (cap, lang) → cell.
+
+    The previous file is the same shape as `--append`'s output: either one
+    result record or a list of them. We pick the first record whose `label`
+    matches the current run; if none matches, fall back to the first record.
+    An empty file body (`[]` or `{}`) yields an empty index.
+
+    Exits with status 3 when the file is missing, unreadable, not valid
+    JSON, or when a record / cell does not have the expected object shape
+    (records and cells must be objects; each cell needs `cap` and `lang`).
+    """
+
+    def fail(detail: str) -> None:
+        # Exit code 3 is the script's "malformed budget / diff input" code.
+        print(f"ERROR  {detail}", file=sys.stderr)
+        sys.exit(3)
+
+    try:
+        # JSON interchange is UTF-8; do not depend on the locale encoding.
+        with open(path, encoding="utf-8") as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        fail(f"diff file not found: {path}")
+    except json.JSONDecodeError as e:
+        fail(f"diff file malformed: {path}: {e}")
+    except (OSError, UnicodeDecodeError) as e:
+        fail(f"diff file unreadable: {path}: {e}")
+
+    records = data if isinstance(data, list) else [data]
+    if not all(isinstance(r, dict) for r in records):
+        fail(f"diff file malformed: {path}: every record must be a JSON object")
+
+    # NOTE(review): with several records sharing a label, the first one in
+    # file order is used — confirm that is the intended baseline.
+    chosen = next((r for r in records if r.get("label") == label), None)
+    if chosen is None and records:
+        chosen = records[0]
+    if not chosen:
+        return {}
+
+    rows = chosen.get("cells", [])
+    if not isinstance(rows, list):
+        fail(f"diff file malformed: {path}: 'cells' must be a list")
+
+    indexed = {}
+    for row in rows:
+        if not isinstance(row, dict) or "cap" not in row or "lang" not in row:
+            fail(f"diff file malformed: {path}: cell missing cap/lang: {row!r}")
+        try:
+            indexed[(row["cap"], row["lang"])] = row
+        except TypeError:
+            # cap/lang were unhashable values such as lists or objects.
+            fail(f"diff file malformed: {path}: cell missing cap/lang: {row!r}")
+    return indexed
+
+
+def diff_regressions(cells: list, prev: dict, *, eps: float = 0.005) -> list:
+    """Compare current cells against previous. Returns failure strings.
+
+    Three monotonicity rules:
+      * Unsupported% must not increase.
+      * False-Confirmed% must not increase.
+      * Repro-stability% must not decrease.
+
+    Cells absent from `prev` are treated as new (skipped). A rate whose
+    denominator (`confirmed`) is 0 on either side is treated as "no data"
+    and skipped. Repro stability is additionally skipped unless both runs
+    recorded at least one stable replay: a `stable_replays` count of 0 is
+    indistinguishable from "stability was not measured", the same rule the
+    budget gate applies, so an unstamped run is not reported as a drop to 0%.
+
+    `eps` is the tolerance, as a fraction (default 0.005 = 0.5 percentage
+    points), that absorbs flake noise before a change counts as a regression.
+    """
+
+    def ratio(cell: dict, numerator: str):
+        # Share of Confirmed findings; None when there were no Confirmed.
+        confirmed = cell.get("confirmed", 0)
+        return (cell.get(numerator, 0) / confirmed) if confirmed else None
+
+    failures = []
+    for c in cells:
+        cap, lang = c["cap"], c["lang"]
+        old = prev.get((cap, lang))
+        if not old:
+            continue
+
+        # Unsupported (lower is better).
+        old_unsup = old.get("unsupported_rate", 0.0)
+        new_unsup = c.get("unsupported_rate", 0.0)
+        if new_unsup > old_unsup + eps:
+            failures.append(
+                f"  REGRESSION  {cap}/{lang}: Unsupported"
+                f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
+            )
+
+        # False-Confirmed (lower is better).
+        old_false = ratio(old, "wrong_confirmed")
+        new_false = ratio(c, "wrong_confirmed")
+        if (
+            old_false is not None
+            and new_false is not None
+            and new_false > old_false + eps
+        ):
+            failures.append(
+                f"  REGRESSION  {cap}/{lang}: false-Confirmed"
+                f" {old_false*100:.1f}% → {new_false*100:.1f}%"
+            )
+
+        # Repro stability (higher is better); only when measured on both sides.
+        stability_measured = (
+            old.get("stable_replays", 0) > 0 and c.get("stable_replays", 0) > 0
+        )
+        old_stable = ratio(old, "stable_replays") if stability_measured else None
+        new_stable = ratio(c, "stable_replays") if stability_measured else None
+        if (
+            old_stable is not None
+            and new_stable is not None
+            and new_stable < old_stable - eps
+        ):
+            failures.append(
+                f"  REGRESSION  {cap}/{lang}: repro stability"
+                f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
+            )
+    return failures
+
+
 def main() -> int:
    p = argparse.ArgumentParser()
    p.add_argument("--label", required=True)
@ -104,14 +317,34 @@ def main() -> int:
    p.add_argument("--ground-truth", default="", help="ground truth JSON")
    p.add_argument("--inhouse", action="store_true")
    p.add_argument("--append", required=True, help="results accumulator JSON")
+    p.add_argument(
+        "--budget",
+        default="",
+        help="path to budget.toml (per-(cap,lang) thresholds)",
+    )
+    p.add_argument(
+        "--diff",
+        default="",
+        help="path to a previous results JSON; fail on monotonic-improvement regression",
+    )
    args = p.parse_args()

    scan_data = load_json(args.scan)
    findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])

-    # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported}}
+    # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
+    # wrong_confirmed, stable_replays, total}}
    cells: dict[tuple[str, str], dict] = defaultdict(
-        lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
+        lambda: {
+            "tp": 0,
+            "fp": 0,
+            "fn": 0,
+            "unsupported": 0,
+            "confirmed": 0,
+            "wrong_confirmed": 0,
+            "stable_replays": 0,
+            "total": 0,
+        }
    )

    for f in findings:
@ -121,8 +354,19 @@ def main() -> int:
        ev = f.get("evidence", {}) or {}
        dv = ev.get("dynamic_verdict") if ev else None
        cells[key]["total"] += 1
-        if dv and dv.get("status") == "Unsupported":
-            cells[key]["unsupported"] += 1
+        if dv:
+            status = dv.get("status")
+            if status == "Unsupported":
+                cells[key]["unsupported"] += 1
+            elif status == "Confirmed":
+                cells[key]["confirmed"] += 1
+                # Repro-stability and false-Confirmed counts are optional
+                # fields tabulate.py reads off the verdict when callers
+                # (m7_ship_gate.sh / corpus_promote.yml) have stamped them.
+                if dv.get("wrong") is True:
+                    cells[key]["wrong_confirmed"] += 1
+                if dv.get("replay_stable") is True:
+                    cells[key]["stable_replays"] += 1

    if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
        gt = load_json(args.ground_truth)
@ -201,7 +445,34 @@ def main() -> int:
            f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
            f"{c['unsupported_rate']*100:>6.1f}%"
        )
-    return 0
+
+    exit_rc = 0
+
+    # ── Phase 29: per-cell budget enforcement ─────────────────────────────
+    if args.budget:
+        budget = load_budget(args.budget)
+        failures = enforce_budget(result["cells"], budget)
+        if failures:
+            print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
+            for line in failures:
+                print(line)
+            exit_rc = 2
+        else:
+            print(f"\nPer-cell budget ({args.budget}): OK")
+
+    # ── Phase 29: diff against previous run ───────────────────────────────
+    if args.diff:
+        prev = load_previous_cells(args.diff, args.label)
+        failures = diff_regressions(result["cells"], prev)
+        if failures:
+            print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
+            for line in failures:
+                print(line)
+            exit_rc = 2
+        else:
+            print(f"\nDiff vs {args.diff}: no regressions")
+
+    return exit_rc


 if __name__ == "__main__":