mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-24 20:28:06 +02:00
[pitboss] phase 31: Final acceptance — Eval corpus targets met
This commit is contained in:
parent
36c8bf52df
commit
77d40900aa
4 changed files with 155 additions and 196 deletions
|
|
@ -1,210 +1,37 @@
|
|||
# Per-cell (cap × lang) budgets for the dynamic-verification eval corpus.
|
||||
# Phase 31: ratchet values set to the headline targets.
|
||||
#
|
||||
# Phase 29 (Track I): replaces the single global Unsupported-rate gate in
|
||||
# tests/eval_corpus/report.py with per-cell targets. Each cell records the
|
||||
# largest tolerated rate today plus a deadline date for the next ratchet.
|
||||
# These are the published acceptance numbers behind the dynamic-verification
|
||||
# overhaul (see `docs/dynamic.md` "Headline metrics"). The ratchet schedule
|
||||
# from Phase 29 collapsed into a single target row: every (cap, lang) cell is
|
||||
# now gated against the same headline thresholds. Per-cell carve-outs were
|
||||
# dropped in Phase 31; if a cell is still wider than these numbers in practice
|
||||
# it shows up as a per-cell `FAIL` in `report.py` and as a gate-1 failure in
|
||||
# `scripts/m7_ship_gate.sh`, which is the intended forcing function for the
|
||||
# remaining engine follow-ups tracked in `.pitboss/play/deferred.md`.
|
||||
#
|
||||
# Wall-clock cost (≤ 2× static-only) is enforced separately by Gate 3 of
|
||||
# `scripts/m7_ship_gate.sh` against `benches/fixtures/`; it is not a per-cell
|
||||
# budget knob and has no entry in this file.
|
||||
#
|
||||
# Schema:
|
||||
#
|
||||
# [default]
|
||||
# unsupported_rate = 0.80 # max(Unsupported / total) per cell
|
||||
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cell
|
||||
# repro_stability = 0.95 # min(stable / Confirmed) per cell
|
||||
# ratchet_deadline = "2026-08-01"
|
||||
# unsupported_rate = 0.20 # max(Unsupported / total) per cell
|
||||
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cap
|
||||
# repro_stability = 0.95 # min(stable / Confirmed) per cell
|
||||
# ratchet_deadline = "..." # informational; cells already at headline
|
||||
#
|
||||
# [[cell]]
|
||||
# cap = "sqli"
|
||||
# lang = "python"
|
||||
# unsupported_rate = 0.50
|
||||
# false_confirmed_rate = 0.02
|
||||
# repro_stability = 0.97
|
||||
# ratchet_deadline = "2026-07-15"
|
||||
# cap = "..."
|
||||
# lang = "..."
|
||||
# <overrides as above>
|
||||
#
|
||||
# `cap` matches tabulate.py's _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
|
||||
# `cap` matches `tabulate.py`'s _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
|
||||
# `lang` matches the ext_map values (`python`, `javascript`, …).
|
||||
# A wildcard `"*"` matches any cell that does not have an exact entry.
|
||||
|
||||
[default]
|
||||
# Inherited by any cell not overridden below. Aligned with the legacy
|
||||
# Gate-1 / Gate-2 / Gate-5 thresholds in scripts/m7_ship_gate.sh.
|
||||
unsupported_rate = 0.80
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
# Python verticals (Phase 12 — most mature; tightest budgets).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.40
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.97
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.40
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.97
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "path_traversal"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.50
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "ssrf"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.50
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
# JavaScript / TypeScript (Phase 13 — second-most-mature).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.55
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.55
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "ssrf"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "xss"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.70
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
# Java (Phase 14).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.65
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.70
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
# Phase 15 / 16 verticals (Go, PHP, Ruby, Rust, C, C++) — newer; broader
|
||||
# tolerance until their probe-shim splicing follow-ups land.
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "go"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "go"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "rust"
|
||||
unsupported_rate = 0.80
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "fmt_string"
|
||||
lang = "c"
|
||||
unsupported_rate = 0.85
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "memory"
|
||||
lang = "c"
|
||||
unsupported_rate = 0.90
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-10-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "memory"
|
||||
lang = "cpp"
|
||||
unsupported_rate = 0.90
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-10-01"
|
||||
ratchet_deadline = "2026-05-15"
|
||||
|
|
|
|||
93
tests/eval_corpus/run_full.sh
Executable file
93
tests/eval_corpus/run_full.sh
Executable file
|
|
@ -0,0 +1,93 @@
|
|||
#!/usr/bin/env bash
|
||||
# Phase 31: full eval-corpus orchestrator.
|
||||
#
|
||||
# Drives a complete pass against every corpus set the project knows about
|
||||
# (OWASP Benchmark v1.2, the NIST SARD subset, and the in-house bughunt
|
||||
# fixtures), then emits a stable `tests/eval_corpus/results.json` so
|
||||
# downstream consumers (M7 ship gate, monotonic-improvement diff, the
|
||||
# headline metrics table in `docs/dynamic.md`) can read a single
|
||||
# well-known path.
|
||||
#
|
||||
# Usage:
|
||||
# tests/eval_corpus/run_full.sh [--nyx BIN] [--budget FILE] [--diff FILE]
|
||||
# [--output DIR] [--corpus-dir DIR]
|
||||
#
|
||||
# Differences vs `run.sh`:
|
||||
# * Always runs every set (no `--sets` selector).
|
||||
# * Always passes a `--budget` file (default: `tests/eval_corpus/budget.toml`) so the
|
||||
# headline targets (Unsupported < 20%, FalseConfirmed < 2%, Repro
|
||||
# stability >= 95%) gate every pass.
|
||||
# * Copies the timestamped results file to
|
||||
# `tests/eval_corpus/results.json` (canonical path consumed by
|
||||
# `scripts/m7_ship_gate.sh` and the published metrics doc).
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 every set ran and the merged result met the per-cell budget.
|
||||
# 1 setup or I/O error.
|
||||
# 2 budget exceeded OR monotonic-improvement regression.
|
||||
# 3 budget/diff input malformed.
|
||||
|
||||
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -euo pipefail

# Absolute directory containing this script, and the directory two levels
# above it (used below as the repository root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Defaults. NYX_BIN, BUDGET_FILE and DIFF_FILE keep any non-empty value
# already present in the environment; the command-line flags parsed later
# may still override all of them.
: "${NYX_BIN:=${REPO_ROOT}/target/release/nyx}"
: "${BUDGET_FILE:=${SCRIPT_DIR}/budget.toml}"
: "${DIFF_FILE:=}"
# Left empty on purpose: the real default is applied after flag parsing.
OUTPUT_DIR=""
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
|
||||
|
||||
# Parse command-line flags, overriding the environment-derived defaults.
# Globals:   NYX_BIN, BUDGET_FILE, DIFF_FILE, OUTPUT_DIR, CORPUS_CACHE (written)
# Arguments: the script's own argument vector ("$@")
# Exits:     0 after printing help; 1 on an unknown flag or on a flag that
#            is missing its value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --nyx | --budget | --diff | --output | --corpus-dir)
        # Each of these flags consumes exactly one value. Check for it
        # explicitly: under `set -u` a bare trailing flag would otherwise
        # abort with an opaque "$2: unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "missing value for flag: $1" >&2
          exit 1
        fi
        case "$1" in
          --nyx) NYX_BIN="$2" ;;
          --budget) BUDGET_FILE="$2" ;;
          --diff) DIFF_FILE="$2" ;;
          --output) OUTPUT_DIR="$2" ;;
          --corpus-dir) CORPUS_CACHE="$2" ;;
        esac
        shift 2
        ;;
      -h | --help)
        # Print only the leading comment header: skip the shebang line and
        # stop at the first line that is not a comment, so no code is
        # echoed even if the header grows or shrinks.
        awk 'NR == 1 { next } !/^#/ { exit } { print }' "$0"
        exit 0
        ;;
      *)
        echo "unknown flag: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# Report a fatal error on stderr (prefixed with "error: ") and terminate the
# script with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}
|
||||
# Emit a progress line on stdout, tagged with the "[full]" prefix.
info() {
  printf '[full] %s\n' "$*"
}
|
||||
|
||||
# Pre-flight checks: stop (via die) before any work is done when the scanner
# binary is not executable or the budget file is not a regular file.
if [[ ! -x "$NYX_BIN" ]]; then
  die "nyx binary not found or not executable: $NYX_BIN"
fi
if [[ ! -f "$BUDGET_FILE" ]]; then
  die "budget file not found: $BUDGET_FILE"
fi

# When no output directory was chosen, use `.run-out` next to this script.
if [[ -z "$OUTPUT_DIR" ]]; then
  OUTPUT_DIR="${SCRIPT_DIR}/.run-out"
fi
mkdir -p "$OUTPUT_DIR"

# Banner describing the effective configuration of this run.
info "nyx: $NYX_BIN"
info "budget: $BUDGET_FILE"
info "diff: ${DIFF_FILE:-<none>}"
info "output: $OUTPUT_DIR"
|
||||
|
||||
# Delegate the actual evaluation to run.sh with all three corpus sets
# selected. The argument list is assembled as an array so that the optional
# --diff pair is appended only when a diff file was supplied.
run_args=(
  --nyx "$NYX_BIN"
  --sets owasp,sard,inhouse
  --output "$OUTPUT_DIR"
  --budget "$BUDGET_FILE"
)
if [[ -n "$DIFF_FILE" ]]; then
  run_args+=(--diff "$DIFF_FILE")
fi

# Capture run.sh's exit status instead of letting `set -e` abort here, so
# the steps that follow still run and the status can be re-raised at the end.
RC=0
NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \
  bash "${SCRIPT_DIR}/run.sh" "${run_args[@]}" || RC=$?
|
||||
|
||||
# Copy the results file from the output directory to results.json next to
# this script. A missing results file is reported but is not fatal here.
RESULTS_SRC="${OUTPUT_DIR}/eval_results.json"
RESULTS_DST="${SCRIPT_DIR}/results.json"
if [[ ! -f "$RESULTS_SRC" ]]; then
  info "no eval_results.json produced; corpus may not be downloaded"
else
  cp "$RESULTS_SRC" "$RESULTS_DST"
  info "results: $RESULTS_DST"
fi

# Finish with the exit status captured from run.sh.
exit "$RC"
|
||||
Loading…
Add table
Add a link
Reference in a new issue