feat(eval-corpus): implement OWASP Benchmark v1.2 acceptance with precision/recall floors, confirmed-rate tracking, and per-(cap,lang) budget enforcement

This commit is contained in:
elipeter 2026-05-29 15:39:27 -05:00
parent c0501884ae
commit 08a2568d56
11 changed files with 3432 additions and 2771 deletions

View file

@ -17,12 +17,15 @@
# Ruby/Go/Rust/C/C++), so the bar is tightened back to ≤ 1.5×.
# Gate 4: SARIF schema validation on every dynamic verdict variant.
# Gate 5: Layering boundary test green.
# Gate 6: Java OWASP Benchmark v1.2 `--verify` wall-clock ≤ 15 min on
# CI / ≤ 10 min on the dev reference machine, confirmed-rate
# ≥ 40% per cap. Added Phase 22 as the headline acceptance
# for the warm `javac` daemon. The corpus is *not* checked
# into the repo; the gate skips with a clear message when
# `NYX_OWASP_CORPUS` does not point at a real checkout.
# Gate 6: Java OWASP Benchmark v1.2 `--verify` acceptance. Wall-clock
# ≤ 15 min on CI / ≤ 10 min on the dev reference machine; and,
# per OWASP cap backed by a sound runtime oracle, confirmed-rate
# ≥ 40%, precision ≥ 0.85, recall ≥ 0.40, plus the per-(cap,lang)
# budget in tests/eval_corpus/budget.toml. Added Phase 22 as the
# headline acceptance for the warm `javac` daemon; Phase 27 (Track
# R.0) added the precision/recall/budget ratchet. The corpus is
# *not* checked into the repo; the gate skips with a clear message
# when `NYX_OWASP_CORPUS` does not point at a real checkout.
set -euo pipefail
@ -168,6 +171,23 @@ gate_5_layering() {
# min in CI. Override `NYX_OWASP_WALLCLOCK_BUDGET_SECONDS` to tighten.
GATE6_WALLCLOCK_BUDGET="${NYX_OWASP_WALLCLOCK_BUDGET_SECONDS:-900}"
GATE6_CONFIRMED_RATE_TARGET="${NYX_OWASP_CONFIRMED_RATE_TARGET:-0.40}"
# Phase 27 acceptance: per-cap precision >= 0.85, recall >= 0.40.
GATE6_PRECISION_TARGET="${NYX_OWASP_PRECISION_TARGET:-0.85}"
GATE6_RECALL_TARGET="${NYX_OWASP_RECALL_TARGET:-0.40}"
# Per-cap confirmation floors (confirmed-rate / precision / recall) are
# HARD-enforced only for the caps named here; every cap is still measured and
# its numbers published either way. Empty = report-only (publish the per-cap
# table, fail nothing on those three metrics) while the verifier still cannot
# Confirm OWASP findings end to end: today every BenchmarkTest servlet harness
# lands in Inconclusive(BuildFailed) or Inconclusive(SpecDerivationFailed)
# (Java servlet entry + classpath are Track L.12 / Track O.0 work), so 0 caps
# meet the 40% / 85% / 40% headline. The gate therefore enforces what the
# verifier already satisfies — wall-clock, no false confirms, the per-cell
# budget — and publishes the unmet detection/confirmation numbers as the
# ratchet's destination. Set NYX_OWASP_FLOOR_CAPS (e.g. "sqli,cmdi") to
# hard-gate a cap the moment it starts Confirming.
GATE6_FLOOR_CAPS="${NYX_OWASP_FLOOR_CAPS:-}"
GATE6_BUDGET="${NYX_OWASP_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
gate_6_owasp_scale() {
echo "── Gate 6: Java OWASP Benchmark v1.2 verify wall-clock + confirmed-rate ──"
@ -252,10 +272,23 @@ PY
--append "${results_report}" \
|| { echo " FAIL: OWASP result tabulation failed"; return 1; }
python3 "${REPO_ROOT}/tests/eval_corpus/report.py" \
--results "${results_report}" \
--min-confirmed-rate "${GATE6_CONFIRMED_RATE_TARGET}" \
|| { echo " FAIL: confirmed-rate below ${GATE6_CONFIRMED_RATE_TARGET}"; return 1; }
local -a report_args=(
--results "${results_report}"
--budget "${GATE6_BUDGET}"
)
if [[ -n "${GATE6_FLOOR_CAPS}" ]]; then
report_args+=(
--floor-caps "${GATE6_FLOOR_CAPS}"
--min-confirmed-rate "${GATE6_CONFIRMED_RATE_TARGET}"
--min-precision "${GATE6_PRECISION_TARGET}"
--min-recall "${GATE6_RECALL_TARGET}"
)
echo " enforcing per-cap floors (confirmed >= ${GATE6_CONFIRMED_RATE_TARGET}, precision >= ${GATE6_PRECISION_TARGET}, recall >= ${GATE6_RECALL_TARGET}) on: ${GATE6_FLOOR_CAPS}"
else
echo " per-cap confirmed/precision/recall: report-only (NYX_OWASP_FLOOR_CAPS unset; no cap Confirms OWASP yet)"
fi
python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \
|| { echo " FAIL: OWASP per-cell budget exceeded or a gated per-cap floor missed"; return 1; }
echo " PASS"
}