# nyx/tests/eval_corpus/budget.toml

# Phase 31: ratchet values set to the headline targets.
#
# These are the published acceptance numbers behind the dynamic-verification
# overhaul (see `docs/dynamic.md` "Headline metrics"). The ratchet schedule
# from Phase 29 collapsed into a single target row: every (cap, lang) cell is
# now gated against the same headline thresholds. Per-cell carve-outs were
# dropped in Phase 31; if a cell still misses these thresholds in practice,
# it shows up as a per-cell `FAIL` in `report.py` and as a gate-1 failure in
# `scripts/m7_ship_gate.sh`, which is the intended forcing function for the
# remaining engine follow-ups tracked in `.pitboss/play/deferred.md`.
#
# Wall-clock cost (≤ 2× static-only) is enforced separately by Gate 3 of
# `scripts/m7_ship_gate.sh` against `benches/fixtures/`; it is not a per-cell
# budget knob and has no entry in this file.
#
# Schema:
#
# [default]
# unsupported_rate = 0.20 # max(Unsupported / total) per cell
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cap
# repro_stability = 0.95 # min(stable / Confirmed) per cell
# ratchet_deadline = "..." # informational; every cell is already gated at the headline targets
#
# [[cell]]
# cap = "..."
# lang = "..."
# <overrides as above>
#
# `cap` matches `tabulate.py`'s _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
# `lang` matches the ext_map values (`python`, `javascript`, …).
# A wildcard `"*"` matches any cell that does not have an exact entry.
# Headline thresholds that every (cap, lang) cell is gated against; a
# `[[cell]]` entry, when present, overrides individual values for that cell.
[default]
# Ceiling on Unsupported / total, evaluated per cell.
unsupported_rate = 0.20
# Ceiling on wrong / Confirmed, evaluated per cap.
false_confirmed_rate = 0.02
# Floor on stable / Confirmed, evaluated per cell.
repro_stability = 0.95
# Informational per the schema notes. Kept as a quoted string rather than a
# native TOML date, matching the documented schema.
ratchet_deadline = "2026-05-15"