mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-21 20:18:06 +02:00
[pitboss] phase 29: Track I — Per-cell budgets, --diff, fixture prerequisites, CI matrix expansion
This commit is contained in:
parent
760bc1beb2
commit
dd607fb4b3
10 changed files with 1325 additions and 32 deletions
|
|
@ -9,6 +9,17 @@ For in-house sets (--inhouse): counts findings by cap x language; reports
|
|||
Unsupported rate only (no ground truth required).
|
||||
|
||||
Output: appends a result record to --append FILE.
|
||||
|
||||
Phase 29 (Track I) extensions:
|
||||
--budget tests/eval_corpus/budget.toml enforce per-cell budget thresholds
|
||||
--diff previous.json compare against prior result file,
|
||||
fail on monotonic-improvement
|
||||
regression
|
||||
|
||||
Exit codes:
|
||||
0 all rows pass.
|
||||
2 one or more per-cell budgets exceeded OR a diff regression was found.
|
||||
3 malformed budget / diff input (callers must fix configuration).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -17,6 +28,11 @@ import sys
|
|||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
LINE_TOLERANCE = 5
|
||||
|
||||
# Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
|
||||
|
|
@ -97,6 +113,203 @@ def lang_of(finding: dict) -> str:
|
|||
return "unknown"
|
||||
|
||||
|
||||
# ── Budget loading ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def load_budget(path: str) -> dict:
|
||||
"""Parse a budget.toml file.
|
||||
|
||||
Returns a dict::
|
||||
|
||||
{
|
||||
"default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02,
|
||||
"repro_stability": 0.95, "ratchet_deadline": "..."},
|
||||
"cells": {(cap, lang): {...overrides...}, ...},
|
||||
}
|
||||
|
||||
Raises SystemExit(3) on a malformed file.
|
||||
"""
|
||||
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
raw = tomllib.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"ERROR budget file not found: {path}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
except tomllib.TOMLDecodeError as e:
|
||||
print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
|
||||
default = raw.get("default", {}) or {}
|
||||
cells = {}
|
||||
for row in raw.get("cell", []) or []:
|
||||
cap = row.get("cap")
|
||||
lang = row.get("lang")
|
||||
if not cap or not lang:
|
||||
print(
|
||||
f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr
|
||||
)
|
||||
sys.exit(3)
|
||||
cells[(cap, lang)] = row
|
||||
|
||||
return {"default": default, "cells": cells}
|
||||
|
||||
|
||||
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Resolve the effective thresholds for one (cap, lang) cell.

    Starts from a copy of the ``default`` table and layers exactly one
    override row on top: the exact ``(cap, lang)`` row if there is one,
    otherwise the first wildcard row found among ``(cap, "*")``,
    ``("*", lang)`` and ``("*", "*")``, in that order. The ``cap`` and
    ``lang`` keys of the override row are never copied into the result.
    """
    thresholds = dict(budget.get("default", {}) or {})
    overrides = budget.get("cells", {})

    # Most specific first; only the first non-empty row is applied.
    candidates = ((cap, lang), (cap, "*"), ("*", lang), ("*", "*"))
    for candidate in candidates:
        row = overrides.get(candidate)
        if not row:
            continue
        for name, value in row.items():
            if name not in ("cap", "lang"):
                thresholds[name] = value
        break

    return thresholds
|
||||
|
||||
|
||||
def enforce_budget(cells: list, budget: dict) -> list:
    """Return a list of human-readable failure strings.

    Each cell's measured Unsupported / false-Confirmed / repro-stability
    rate is compared against its merged budget row (see
    ``budget_for_cell``). A missing measurement (e.g. no Confirmed
    findings → false-Confirmed denominator = 0, or a cell without an
    ``unsupported_rate`` value) is treated as "no data" and skipped, never
    as a failure.

    Args:
        cells: Result cells; each must carry ``cap`` and ``lang`` and may
            carry ``total``, ``unsupported_rate``, ``confirmed``,
            ``wrong_confirmed`` and ``stable_replays``.
        budget: Parsed budget with ``default`` and ``cells`` entries.

    Returns:
        One formatted ``FAIL`` line per exceeded threshold; empty when every
        cell is within budget.
    """

    def numeric(mapping, key):
        # Only genuine numbers count. bool is an int subclass, so a stray
        # `true` would otherwise be read as a 100% threshold.
        value = mapping.get(key)
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        return value

    failures = []
    for c in cells:
        cap, lang = c["cap"], c["lang"]
        b = budget_for_cell(budget, cap, lang)
        if not b:
            continue
        max_unsup = numeric(b, "unsupported_rate")
        max_false = numeric(b, "false_confirmed_rate")
        min_stable = numeric(b, "repro_stability")
        confirmed = c.get("confirmed", 0)

        # Reading the measurement defensively keeps a cell that lacks
        # `unsupported_rate` in the "no data" bucket instead of a KeyError.
        unsup = numeric(c, "unsupported_rate")
        if max_unsup is not None and unsup is not None and c.get("total", 0) > 0:
            if unsup > max_unsup:
                failures.append(
                    f" FAIL {cap}/{lang}: Unsupported {unsup*100:.1f}%"
                    f" > budget {max_unsup*100:.1f}%"
                )

        if max_false is not None and confirmed > 0:
            rate = c.get("wrong_confirmed", 0) / confirmed
            if rate > max_false:
                failures.append(
                    f" FAIL {cap}/{lang}: false-Confirmed {rate*100:.1f}%"
                    f" > budget {max_false*100:.1f}%"
                )

        # Repro stability is only enforced when callers stamped at least
        # one `replay_stable: true` flag — otherwise stable_replays == 0
        # is indistinguishable from "we did not measure stability for
        # this row" and the gate would fire vacuously on every clean run.
        stable = c.get("stable_replays", 0)
        if min_stable is not None and confirmed > 0 and stable > 0:
            rate = stable / confirmed
            if rate < min_stable:
                failures.append(
                    f" FAIL {cap}/{lang}: repro stability {rate*100:.1f}%"
                    f" < budget {min_stable*100:.1f}%"
                )
    return failures
|
||||
|
||||
|
||||
# ── Diff loading ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def load_previous_cells(path: str, label: str) -> dict:
    """Index a previous results file by (cap, lang) → cell.

    The previous file is the same shape as `--append`'s output: either one
    result record or a list of them. We pick the record whose `label`
    matches the current run; if no exact match, fall back to the first
    record.

    Args:
        path: Path of the previous results JSON.
        label: Label of the current run, used to select the record.

    Returns:
        Mapping of ``(cap, lang)`` to that cell's dict; empty when the file
        holds no records or the chosen record has no cells.

    Raises:
        SystemExit: With code 3 when the file is missing, unreadable, not
            valid JSON, or does not have the expected record/cell shape.
    """

    def fail(message):
        # Exit 3 = "fix the diff input", distinct from a real regression (2).
        print(message, file=sys.stderr)
        sys.exit(3)

    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        fail(f"ERROR diff file not found: {path}")
    except OSError as e:
        fail(f"ERROR diff file unreadable: {path}: {e}")
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        fail(f"ERROR diff file malformed: {path}: {e}")

    records = data if isinstance(data, list) else [data]
    if not all(isinstance(r, dict) for r in records):
        fail(
            f"ERROR diff file malformed: {path}: expected a result record"
            f" or a list of result records"
        )

    chosen = next((r for r in records if r.get("label") == label), None)
    if chosen is None and records:
        chosen = records[0]
    if not chosen:
        return {}

    rows = chosen.get("cells", [])
    if not isinstance(rows, list):
        fail(f"ERROR diff file malformed: {path}: 'cells' must be a list")

    indexed = {}
    for row in rows:
        if (
            not isinstance(row, dict)
            or not isinstance(row.get("cap"), str)
            or not isinstance(row.get("lang"), str)
        ):
            fail(
                f"ERROR diff file malformed: {path}: cell without string"
                f" cap/lang: {row!r}"
            )
        indexed[(row["cap"], row["lang"])] = row
    return indexed
|
||||
|
||||
|
||||
def diff_regressions(cells: list, prev: dict) -> list:
    """Compare current cells against previous. Returns failure strings.

    Three monotonicity rules:
      * Unsupported% must not increase.
      * False-Confirmed% must not increase.
      * Repro-stability% must not decrease.

    Cells absent from `prev` are treated as new (skipped).
    A small epsilon (0.5 percentage points) absorbs flake noise.

    A rule is only applied when both runs actually carry the measurement:
      * Unsupported needs an ``unsupported_rate`` on both sides.
      * False-Confirmed needs ``confirmed > 0`` on both sides.
      * Repro stability needs ``confirmed > 0`` and ``stable_replays > 0``
        on both sides. A zero count cannot be told apart from "stability
        was not stamped for this run", so it is treated as no data, the
        same way the budget gate treats it. The cost is that a genuine
        drop to exactly 0% is not reported here.

    Args:
        cells: Current result cells (dicts carrying ``cap`` and ``lang``).
        prev: Previous cells indexed by ``(cap, lang)``.

    Returns:
        One formatted ``REGRESSION`` line per violated rule.
    """
    EPS = 0.005

    def ratio(cell, numerator_key, require_nonzero=False):
        # None means "no data" so the caller can skip the comparison.
        confirmed = cell.get("confirmed", 0)
        if not confirmed:
            return None
        count = cell.get(numerator_key, 0)
        if require_nonzero and not count:
            return None
        return count / confirmed

    failures = []
    for c in cells:
        key = (c["cap"], c["lang"])
        old = prev.get(key)
        if not old:
            continue
        name = f"{key[0]}/{key[1]}"

        # Unsupported (lower is better). A side without the field has no
        # measurement; defaulting it to 0% would invent a regression.
        old_unsup = old.get("unsupported_rate")
        new_unsup = c.get("unsupported_rate")
        if (
            old_unsup is not None
            and new_unsup is not None
            and new_unsup > old_unsup + EPS
        ):
            failures.append(
                f" REGRESSION {name}: Unsupported"
                f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
            )

        # False-Confirmed (lower is better).
        old_false = ratio(old, "wrong_confirmed")
        new_false = ratio(c, "wrong_confirmed")
        if (
            old_false is not None
            and new_false is not None
            and new_false > old_false + EPS
        ):
            failures.append(
                f" REGRESSION {name}: false-Confirmed"
                f" {old_false*100:.1f}% → {new_false*100:.1f}%"
            )

        # Repro stability (higher is better).
        old_stable = ratio(old, "stable_replays", require_nonzero=True)
        new_stable = ratio(c, "stable_replays", require_nonzero=True)
        if (
            old_stable is not None
            and new_stable is not None
            and new_stable < old_stable - EPS
        ):
            failures.append(
                f" REGRESSION {name}: repro stability"
                f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
            )
    return failures
|
||||
|
||||
|
||||
def main() -> int:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--label", required=True)
|
||||
|
|
@ -104,14 +317,34 @@ def main() -> int:
|
|||
p.add_argument("--ground-truth", default="", help="ground truth JSON")
|
||||
p.add_argument("--inhouse", action="store_true")
|
||||
p.add_argument("--append", required=True, help="results accumulator JSON")
|
||||
p.add_argument(
|
||||
"--budget",
|
||||
default="",
|
||||
help="path to budget.toml (per-(cap,lang) thresholds)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--diff",
|
||||
default="",
|
||||
help="path to a previous results JSON; fail on monotonic-improvement regression",
|
||||
)
|
||||
args = p.parse_args()
|
||||
|
||||
scan_data = load_json(args.scan)
|
||||
findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
|
||||
|
||||
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported}}
|
||||
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
|
||||
# wrong_confirmed, stable_replays, total}}
|
||||
cells: dict[tuple[str, str], dict] = defaultdict(
|
||||
lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
|
||||
lambda: {
|
||||
"tp": 0,
|
||||
"fp": 0,
|
||||
"fn": 0,
|
||||
"unsupported": 0,
|
||||
"confirmed": 0,
|
||||
"wrong_confirmed": 0,
|
||||
"stable_replays": 0,
|
||||
"total": 0,
|
||||
}
|
||||
)
|
||||
|
||||
for f in findings:
|
||||
|
|
@ -121,8 +354,19 @@ def main() -> int:
|
|||
ev = f.get("evidence", {}) or {}
|
||||
dv = ev.get("dynamic_verdict") if ev else None
|
||||
cells[key]["total"] += 1
|
||||
if dv and dv.get("status") == "Unsupported":
|
||||
cells[key]["unsupported"] += 1
|
||||
if dv:
|
||||
status = dv.get("status")
|
||||
if status == "Unsupported":
|
||||
cells[key]["unsupported"] += 1
|
||||
elif status == "Confirmed":
|
||||
cells[key]["confirmed"] += 1
|
||||
# Repro-stability and false-Confirmed counts are optional
|
||||
# fields tabulate.py reads off the verdict when callers
|
||||
# (m7_ship_gate.sh / corpus_promote.yml) have stamped them.
|
||||
if dv.get("wrong") is True:
|
||||
cells[key]["wrong_confirmed"] += 1
|
||||
if dv.get("replay_stable") is True:
|
||||
cells[key]["stable_replays"] += 1
|
||||
|
||||
if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
|
||||
gt = load_json(args.ground_truth)
|
||||
|
|
@ -201,7 +445,34 @@ def main() -> int:
|
|||
f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
|
||||
f"{c['unsupported_rate']*100:>6.1f}%"
|
||||
)
|
||||
return 0
|
||||
|
||||
exit_rc = 0
|
||||
|
||||
# ── Phase 29: per-cell budget enforcement ─────────────────────────────
|
||||
if args.budget:
|
||||
budget = load_budget(args.budget)
|
||||
failures = enforce_budget(result["cells"], budget)
|
||||
if failures:
|
||||
print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
|
||||
for line in failures:
|
||||
print(line)
|
||||
exit_rc = 2
|
||||
else:
|
||||
print(f"\nPer-cell budget ({args.budget}): OK")
|
||||
|
||||
# ── Phase 29: diff against previous run ───────────────────────────────
|
||||
if args.diff:
|
||||
prev = load_previous_cells(args.diff, args.label)
|
||||
failures = diff_regressions(result["cells"], prev)
|
||||
if failures:
|
||||
print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
|
||||
for line in failures:
|
||||
print(line)
|
||||
exit_rc = 2
|
||||
else:
|
||||
print(f"\nDiff vs {args.diff}: no regressions")
|
||||
|
||||
return exit_rc
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue