feat(eval-corpus): add Track R.2 polyglot corpora (RailsGoat, DVWA, DVPWA, gosec, RustSec) with curated manifests, negative controls, and CI validation

This commit is contained in:
elipeter 2026-06-01 10:04:38 -05:00
parent 2a4d49b68b
commit e0833537e4
20 changed files with 1181 additions and 53 deletions

View file

@ -8,6 +8,8 @@
# scripts/m7_ship_gate.sh --sets owasp # Java OWASP corpus only
# scripts/m7_ship_gate.sh --sets jsts # NodeGoat + Juice Shop only
# scripts/m7_ship_gate.sh --sets nodegoat # one JS/TS corpus only
# scripts/m7_ship_gate.sh --sets polyglot # RailsGoat+DVWA+DVPWA+gosec+RustSec
# scripts/m7_ship_gate.sh --sets railsgoat # one polyglot corpus only
#
# Gate map (kept in sync with .pitboss/play/plan.md track M.7):
# Gate 1: Static-only scan is green on `tests/benchmark/corpus`.
@ -37,13 +39,21 @@
# (NYX_JSTS_FLOOR_CAPS empty by default). Each corpus row
# self-skips unless its NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS
# points at a real checkout.
# Gate 8: Polyglot real-corpus acceptance (Track R.2 / Phase 29). OWASP
# RailsGoat (Rails, .rb), DVWA (PHP), DVPWA (aiohttp, .py), gosec
# (Go) and the RustSec advisory-db (Rust negative control), one
# row per corpus. Same shape as Gate 7: wall-clock budget + the
# per-(cap,lang) budget hard-enforced; per-cap confirmed/precision/
# recall report-only (NYX_POLYGLOT_FLOOR_CAPS empty by default).
# Each row self-skips unless its NYX_<NAME>_CORPUS points at a real
# checkout.
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${REPO_ROOT}"
GATES="1,2,3,4,5,6,7"
GATES="1,2,3,4,5,6,7,8"
SETS=""
while [[ $# -gt 0 ]]; do
@ -71,9 +81,10 @@ done
# `jsts` (both JS/TS corpora) / `nodegoat` / `juiceshop` -> Gate 7, with the
# corpus name passed through so Gate 7 runs only the requested row.
case "${SETS}" in
owasp) GATES="6" ;;
jsts|nodegoat|juiceshop) GATES="7" ;;
"") ;; # no --sets: run the requested --gates
owasp) GATES="6" ;;
jsts|nodegoat|juiceshop) GATES="7" ;;
polyglot|railsgoat|dvwa|dvpwa|gosec|rustsec) GATES="8" ;;
"") ;; # no --sets: run the requested --gates
*) echo "unknown --sets: ${SETS}" >&2; exit 2 ;;
esac
@ -308,34 +319,31 @@ PY
echo " PASS"
}
# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ──────────────
# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora. Same
# wall-clock split (10 min dev reference / 15 min CI) and the same
# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the
# per-cap confirmed-rate / precision / recall numbers are published but gate
# nothing, while the per-(cap,lang) budget (unsupported_rate,
# false_confirmed_rate) is hard-enforced. Promote a cap into the floor set
# once it starts Confirming end to end.
GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}"
GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}"
GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}"
GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}"
GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}"
GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
# ── Shared real-corpus acceptance runner (Gates 7 + 8) ────────────────────────
# Run one real-corpus `--verify` row: scan under a wall-clock guard,
# tabulate against the committed ground truth, enforce the per-cell budget,
# publish (or, when floor caps are set, enforce) the per-cap floors.
# $1 label $2 corpus dir $3 ground-truth json
# publish (or, when floor caps are set, enforce) the per-cap floors. Every
# random source nyx uses is seeded from spec_hash, so reruns are
# deterministic. Generic across gates — all gate-specific knobs are passed
# in so Gate 7 (JS/TS) and Gate 8 (polyglot) share one code path.
# $1 label $2 corpus dir $3 ground-truth json
# $4 wallclock(s) $5 budget.toml $6 floor caps (may be empty)
# $7 confirmed target $8 precision target $9 recall target
# ${10} floor-unset hint (e.g. "NYX_POLYGLOT_FLOOR_CAPS unset")
# ${11} lang filter (may be empty) — scope tabulation to one language so
# incidental other-language assets (vendored JS in a Rails/aiohttp app)
# do not pollute the corpus's per-cap metrics
# Returns 0 on pass, 1 on fail. Caller decides skip.
_gate7_run_corpus() {
local label="$1" corpus="$2" gt="$3"
local scan_report="/tmp/m7_gate7_${label}_scan.json"
local results_report="/tmp/m7_gate7_${label}_results.json"
local wallclock_report="/tmp/m7_gate7_${label}_wallclock.txt"
local gate_home="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_home"
local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_build_pool"
_run_corpus_acceptance() {
local label="$1" corpus="$2" gt="$3" wallclock_budget="$4" budget_file="$5"
local floor_caps="$6" confirmed_target="$7" precision_target="$8"
local recall_target="$9" floor_hint="${10}" lang_filter="${11:-}"
local scan_report="/tmp/m7_corpus_${label}_scan.json"
local results_report="/tmp/m7_corpus_${label}_results.json"
local wallclock_report="/tmp/m7_corpus_${label}_wallclock.txt"
local gate_home="${TMPDIR:-/tmp}/nyx_m7_corpus_${label}_home"
local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_corpus_${label}_build_pool"
local wallclock
mkdir -p "${gate_home}" "${gate_build_pool}"
@ -344,7 +352,7 @@ _gate7_run_corpus() {
set +e
HOME="${gate_home}" \
NYX_BUILD_POOL_DIR="${gate_build_pool}" \
python3 - "${GATE7_WALLCLOCK_BUDGET}" "${scan_report}" "${wallclock_report}" \
python3 - "${wallclock_budget}" "${scan_report}" "${wallclock_report}" \
"${REPO_ROOT}/target/release/nyx" scan \
--verify \
--index off \
@ -375,9 +383,9 @@ sys.exit(rc)
PY
local nyx_exit=$?
set -e
wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${GATE7_WALLCLOCK_BUDGET}")"
wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${wallclock_budget}")"
echo " ${label} verify wall-clock: ${wallclock}s (budget ${GATE7_WALLCLOCK_BUDGET}s)"
echo " ${label} verify wall-clock: ${wallclock}s (budget ${wallclock_budget}s)"
if [[ ${nyx_exit} -eq 124 ]]; then
echo " FAIL: ${label} scan exceeded wall-clock budget"
@ -391,38 +399,60 @@ PY
echo " FAIL: ${label} scan produced no JSON report"
return 1
fi
awk -v w="${wallclock}" -v b="${GATE7_WALLCLOCK_BUDGET}" \
awk -v w="${wallclock}" -v b="${wallclock_budget}" \
'BEGIN { if (w+0 > b+0) exit 1 }' \
|| { echo " FAIL: ${label} wall-clock exceeds budget"; return 1; }
echo "[]" > "${results_report}"
python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" \
--label "${label}" \
--scan "${scan_report}" \
--ground-truth "${gt}" \
--append "${results_report}" \
local -a tabulate_args=(
--label "${label}"
--scan "${scan_report}"
--ground-truth "${gt}"
--append "${results_report}"
)
if [[ -n "${lang_filter}" ]]; then
tabulate_args+=(--lang "${lang_filter}")
echo " scoping tabulation to language(s): ${lang_filter}"
fi
python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" "${tabulate_args[@]}" \
|| { echo " FAIL: ${label} result tabulation failed"; return 1; }
local -a report_args=(
--results "${results_report}"
--budget "${GATE7_BUDGET}"
--budget "${budget_file}"
)
if [[ -n "${GATE7_FLOOR_CAPS}" ]]; then
if [[ -n "${floor_caps}" ]]; then
report_args+=(
--floor-caps "${GATE7_FLOOR_CAPS}"
--min-confirmed-rate "${GATE7_CONFIRMED_RATE_TARGET}"
--min-precision "${GATE7_PRECISION_TARGET}"
--min-recall "${GATE7_RECALL_TARGET}"
--floor-caps "${floor_caps}"
--min-confirmed-rate "${confirmed_target}"
--min-precision "${precision_target}"
--min-recall "${recall_target}"
)
echo " enforcing per-cap floors (confirmed >= ${GATE7_CONFIRMED_RATE_TARGET}, precision >= ${GATE7_PRECISION_TARGET}, recall >= ${GATE7_RECALL_TARGET}) on: ${GATE7_FLOOR_CAPS}"
echo " enforcing per-cap floors (confirmed >= ${confirmed_target}, precision >= ${precision_target}, recall >= ${recall_target}) on: ${floor_caps}"
else
echo " per-cap confirmed/precision/recall: report-only (NYX_JSTS_FLOOR_CAPS unset)"
echo " per-cap confirmed/precision/recall: report-only (${floor_hint})"
fi
python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \
|| { echo " FAIL: ${label} per-cell budget exceeded or a gated per-cap floor missed"; return 1; }
return 0
}
# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ──────────────
# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora. Same
# wall-clock split (10 min dev reference / 15 min CI) and the same
# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the
# per-cap confirmed-rate / precision / recall numbers are published but gate
# nothing, while the per-(cap,lang) budget (unsupported_rate,
# false_confirmed_rate) is hard-enforced. Promote a cap into the floor set
# once it starts Confirming end to end.
# Gate 7 knobs. Each one takes its value from the NYX_JSTS_* environment
# variable named on its line and falls back to the default after `:-`.
# Wall-clock budget, in seconds, for one corpus scan.
GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}"
# Per-cap floor targets. They are only forwarded to report.py when
# GATE7_FLOOR_CAPS is non-empty.
GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}"
GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}"
GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}"
# Caps whose floors are enforced; empty (the default) means report-only.
GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}"
# Budget file handed to report.py via --budget.
GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
gate_7_jsts_scale() {
echo "── Gate 7: JS/TS real-corpus (NodeGoat + Juice Shop) verify acceptance ──"
cargo build --release --quiet --features dynamic
@ -447,8 +477,13 @@ gate_7_jsts_scale() {
fi
any_ran=1
echo " ── ${name} (${corpus}) ──"
if _gate7_run_corpus "${name}" "${corpus}" \
"${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}"; then
# No --lang scope: NodeGoat/Juice Shop are single-language (js/ts), so
# there is no cross-language asset noise to filter (unchanged Gate 7).
if _run_corpus_acceptance "${name}" "${corpus}" \
"${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}" \
"${GATE7_WALLCLOCK_BUDGET}" "${GATE7_BUDGET}" "${GATE7_FLOOR_CAPS}" \
"${GATE7_CONFIRMED_RATE_TARGET}" "${GATE7_PRECISION_TARGET}" \
"${GATE7_RECALL_TARGET}" "NYX_JSTS_FLOOR_CAPS unset" ""; then
echo " PASS ${name}"
else
any_failed=1
@ -464,6 +499,76 @@ gate_7_jsts_scale() {
echo " PASS"
}
# ── Gate 8: Polyglot real-corpus acceptance (Track R.2 / Phase 29) ────────────
# RailsGoat (Rails, .rb) + DVWA (PHP) + DVPWA (aiohttp, .py) + gosec (Go) +
# the RustSec advisory-db (Rust negative control). Same wall-clock split and
# the same report-only-by-default floor policy as Gates 6/7: the per-(cap,lang)
# budget in tests/eval_corpus/budget.toml is hard-enforced, while per-cap
# confirmed-rate / precision / recall are published but gate nothing until
# NYX_POLYGLOT_FLOOR_CAPS names a cap. Each row self-skips unless its
# corpus env var points at a real checkout. The RustSec row is a NEGATIVE
# CONTROL: advisory-db ships advisory metadata, not vulnerable source, so its
# ground truth is empty by construction and the row asserts nyx Confirms
# nothing there (false_confirmed_rate guard).
# Gate 8 knobs. Each one takes its value from the NYX_POLYGLOT_* environment
# variable named on its line and falls back to the default after `:-`.
# Wall-clock budget, in seconds, for one corpus scan.
GATE8_WALLCLOCK_BUDGET="${NYX_POLYGLOT_WALLCLOCK_BUDGET_SECONDS:-900}"
# Per-cap floor targets. They are only forwarded to report.py when
# GATE8_FLOOR_CAPS is non-empty.
GATE8_CONFIRMED_RATE_TARGET="${NYX_POLYGLOT_CONFIRMED_RATE_TARGET:-0.40}"
GATE8_PRECISION_TARGET="${NYX_POLYGLOT_PRECISION_TARGET:-0.85}"
GATE8_RECALL_TARGET="${NYX_POLYGLOT_RECALL_TARGET:-0.40}"
# Caps whose floors are enforced; empty (the default) means report-only.
GATE8_FLOOR_CAPS="${NYX_POLYGLOT_FLOOR_CAPS:-}"
# Budget file handed to report.py via --budget.
GATE8_BUDGET="${NYX_POLYGLOT_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
gate_8_polyglot_scale() {
  # Gate 8 entry point: build the release binary with the `dynamic` feature,
  # then hand every configured polyglot corpus to _run_corpus_acceptance
  # together with the GATE8_* knobs and the row's target language.
  # Globals (read):
  #   SETS               "" or "polyglot" runs every row; a corpus name runs
  #                      only that row.
  #   REPO_ROOT          locates tests/eval_corpus/ground_truth/<file>.
  #   GATE8_*            wall-clock budget, budget file, floor caps, targets.
  #   NYX_<NAME>_CORPUS  corpus checkout per row (read via indirection).
  # Outputs: progress, SKIP and PASS lines on stdout.
  # Returns: 0 when every row that ran passed, or when no row ran (skip);
  #          1 when at least one row failed.
  echo "── Gate 8: polyglot real-corpus (RailsGoat/DVWA/DVPWA/gosec/RustSec) verify acceptance ──"
  cargo build --release --quiet --features dynamic

  # name : env var holding the corpus dir : committed ground-truth file :
  # target language (tabulation is scoped to it so incidental other-language
  # assets — e.g. vendored JS in the Rails / aiohttp apps — do not pollute
  # the corpus's per-cap metrics).
  local -a rows=(
    "railsgoat:NYX_RAILSGOAT_CORPUS:railsgoat.json:ruby"
    "dvwa:NYX_DVWA_CORPUS:dvwa.json:php"
    "dvpwa:NYX_DVPWA_CORPUS:dvpwa.json:python"
    "gosec:NYX_GOSEC_CORPUS:gosec.json:go"
    "rustsec:NYX_RUSTSEC_CORPUS:rustsec.json:rust"
  )

  # Every loop-scoped name is declared up front, including the loop variable
  # itself, so nothing leaks into the caller's scope.
  local row name envvar gtfile lang corpus
  local any_ran=0 any_failed=0
  for row in "${rows[@]}"; do
    IFS=: read -r name envvar gtfile lang <<<"${row}"

    # When --sets names a single corpus, only run that row.
    if [[ -n "${SETS}" && "${SETS}" != "polyglot" && "${SETS}" != "${name}" ]]; then
      continue
    fi

    corpus="${!envvar:-}"
    if [[ -z "${corpus}" ]]; then
      echo " SKIP ${name}: set ${envvar} to a checkout to run this row."
      continue
    fi
    if [[ ! -d "${corpus}" ]]; then
      # Still a skip, but name the offending value: a typo in the path must
      # not look the same as "never configured".
      echo " SKIP ${name}: ${envvar}=${corpus} is not a directory; set ${envvar} to a checkout to run this row."
      continue
    fi

    any_ran=1
    echo " ── ${name} (${corpus}) ──"
    if _run_corpus_acceptance "${name}" "${corpus}" \
      "${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}" \
      "${GATE8_WALLCLOCK_BUDGET}" "${GATE8_BUDGET}" "${GATE8_FLOOR_CAPS}" \
      "${GATE8_CONFIRMED_RATE_TARGET}" "${GATE8_PRECISION_TARGET}" \
      "${GATE8_RECALL_TARGET}" "NYX_POLYGLOT_FLOOR_CAPS unset" "${lang}"; then
      echo " PASS ${name}"
    else
      # Keep going so every configured corpus is reported in one run.
      any_failed=1
    fi
  done

  if [[ ${any_ran} -eq 0 ]]; then
    echo " SKIP: no polyglot corpus configured (set NYX_RAILSGOAT_CORPUS /"
    echo " NYX_DVWA_CORPUS / NYX_DVPWA_CORPUS / NYX_GOSEC_CORPUS / NYX_RUSTSEC_CORPUS)."
    echo " (Gate 8 is Phase 29's headline acceptance for the polyglot real corpora.)"
    return 0
  fi

  [[ ${any_failed} -eq 0 ]] || return 1
  echo " PASS"
}
# ── Driver ────────────────────────────────────────────────────────────────────
declare -a FAILED=()
@ -483,6 +588,7 @@ run_gate 4 sarif_schema
run_gate 5 layering
run_gate 6 owasp_scale
run_gate 7 jsts_scale
run_gate 8 polyglot_scale
if [[ ${#FAILED[@]} -gt 0 ]]; then
echo