# nyx/tests/eval_corpus/budget.toml

# Eval corpus budget.
#
# `report.py` enforces these values when `run.sh` or `run_full.sh` passes
# `--budget`. Each (cap, lang) cell uses the default row unless a specific
# override appears below.
#
# Wall-clock cost is measured separately from this per-cell budget.
#
# Schema:
#
# [default]
# unsupported_rate = 0.20 # max(Unsupported / total) per cell
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cap
# repro_stability = 0.95 # min(stable / Confirmed) per cell
# ratchet_deadline = "..." # informational; cells already at headline
#
# [[cell]]
# cap = "..."
# lang = "..."
# <overrides as above>
#
# `cap` matches `tabulate.py`'s _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
# `lang` matches the ext_map values (`python`, `javascript`, …).
# A wildcard `"*"` matches any cell that does not have an exact entry.
[default]
# Baseline budget row: used by every (cap, lang) cell that has no specific
# override of its own.
#
# Upper bound on Unsupported / total, evaluated per cell.
unsupported_rate = 0.20
# Upper bound on wrong / Confirmed, evaluated per cap.
false_confirmed_rate = 0.02
# Lower bound on stable / Confirmed, evaluated per cell.
repro_stability = 0.95
# Informational, per the schema notes. Kept as a quoted string rather than a
# native TOML date.
# NOTE(review): confirm `report.py` expects a string before changing the type.
ratchet_deadline = "2026-05-15"