nyx/scripts/m7_ship_gate.sh

493 lines
20 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# m7_ship_gate.sh — milestone-7 ship gates.
#
# Each gate runs as an isolated function so CI can call a subset:
#
# scripts/m7_ship_gate.sh # every gate
# scripts/m7_ship_gate.sh --gates 3,6 # only gates 3 + 6
# scripts/m7_ship_gate.sh --sets owasp # Java OWASP corpus only
# scripts/m7_ship_gate.sh --sets jsts # NodeGoat + Juice Shop only
# scripts/m7_ship_gate.sh --sets nodegoat # one JS/TS corpus only
#
# Gate map (kept in sync with .pitboss/play/plan.md track M.7):
# Gate 1: Static-only scan is green on `tests/benchmark/corpus`.
# Gate 2: `cargo nextest run --features dynamic` is green.
# Gate 3: With-verify / static-only wall-clock ratio ≤ 1.5× on
# `benches/fixtures/`. Phase 22 had relaxed this to ≤ 2×
# while only `javac` had a warm daemon; Phase 23 lands the
# cross-lang build pools (shared caches for Node/Python/PHP/
# Ruby/Go/Rust/C/C++), so the bar is tightened back to ≤ 1.5×.
# Gate 4: SARIF schema validation on every dynamic verdict variant.
# Gate 5: Layering boundary test green.
# Gate 6: Java OWASP Benchmark v1.2 `--verify` acceptance. Wall-clock
# ≤ 15 min on CI / ≤ 10 min on the dev reference machine; and,
# per OWASP cap backed by a sound runtime oracle, confirmed-rate
# ≥ 40%, precision ≥ 0.85, recall ≥ 0.40, plus the per-(cap,lang)
# budget in tests/eval_corpus/budget.toml. Added Phase 22 as the
# headline acceptance for the warm `javac` daemon; Phase 27 (Track
# R.0) added the precision/recall/budget ratchet. The corpus is
# *not* checked into the repo; the gate skips with a clear message
# when `NYX_OWASP_CORPUS` does not point at a real checkout.
# Gate 7: JS/TS real-corpus acceptance (Track R.1 / Phase 28). OWASP
# NodeGoat (Express, .js) + OWASP Juice Shop (TypeScript, .ts)
# `--verify` against the committed ground truth. Same shape as
# Gate 6: wall-clock budget + the per-(cap,lang) budget in
# tests/eval_corpus/budget.toml hard-enforced; per-cap
# confirmed-rate / precision / recall published report-only
# (NYX_JSTS_FLOOR_CAPS empty by default). Each corpus row
# self-skips unless its NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS
# points at a real checkout.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${REPO_ROOT}"

# Every gate id the driver at the bottom of this file dispatches. It doubles
# as the default selection when neither --gates nor --sets is given.
ALL_GATES="1,2,3,4,5,6,7"
GATES="${ALL_GATES}"
SETS=""

#######################################
# Parse the command line into GATES / SETS and validate the selection.
# Globals:   GATES, SETS (written); ALL_GATES (read)
# Arguments: the script's own "$@"
# Outputs:   usage text on -h/--help; diagnostics on stderr
# Exits:     0 after --help, 2 on any usage error (never returns non-zero)
#######################################
parse_cli() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gates | --sets)
        # Under `set -u` a trailing flag without a value would die with an
        # opaque "unbound variable"; report it as the usage error it is.
        if [[ $# -lt 2 ]]; then
          echo "missing value for $1" >&2
          exit 2
        fi
        if [[ "$1" == "--gates" ]]; then
          GATES="$2"
        else
          SETS="$2"
        fi
        shift 2
        ;;
      -h | --help)
        sed -n '2,/^$/p' "${BASH_SOURCE[0]}"
        exit 0
        ;;
      *)
        echo "unknown flag: $1" >&2
        exit 2
        ;;
    esac
  done

  # `--sets` lets CI run a single real-corpus gate. `owasp` -> Gate 6;
  # `jsts` (both JS/TS corpora) / `nodegoat` / `juiceshop` -> Gate 7, with the
  # corpus name passed through so Gate 7 runs only the requested row.
  case "${SETS}" in
    owasp) GATES="6" ;;
    jsts | nodegoat | juiceshop) GATES="7" ;;
    "") ;; # no --sets: run the requested --gates
    *)
      echo "unknown --sets: ${SETS}" >&2
      exit 2
      ;;
  esac

  # Reject ids the driver does not know. Without this, a typo such as
  # `--gates 9` or `--gates "3, 6"` silently drops gates and the run still
  # ends with "All requested gates passed." Empty tokens are tolerated so
  # selections such as "3,,6" keep working as before.
  local remaining="${GATES}" gate_id
  while [[ -n "${remaining}" ]]; do
    gate_id="${remaining%%,*}"
    if [[ "${remaining}" == *,* ]]; then
      remaining="${remaining#*,}"
    else
      remaining=""
    fi
    if [[ -n "${gate_id}" && ",${ALL_GATES}," != *",${gate_id},"* ]]; then
      echo "unknown gate: ${gate_id} (known: ${ALL_GATES})" >&2
      exit 2
    fi
  done
}

parse_cli "$@"
# Succeed when gate id $1 is one of the comma-separated ids held in GATES.
# Both the list and the id are wrapped in commas before matching, so "1" is
# never found inside "13".
want_gate() {
  case ",${GATES}," in
    *",$1,"*) return 0 ;;
    *) return 1 ;;
  esac
}
# ── Gate 1 ────────────────────────────────────────────────────────────────────
#######################################
# Gate 1: run a static-only scan over tests/benchmark/corpus.
# Globals:   REPO_ROOT (read)
# Outputs:   progress lines on stdout; scan JSON in /tmp/m7_gate1.json
# Returns:   0 on pass or skip (corpus directory absent), 1 on failure
#######################################
gate_1_static_corpus() {
  echo "── Gate 1: static-only scan on tests/benchmark/corpus ──"
  local corpus="${REPO_ROOT}/tests/benchmark/corpus"
  if [[ ! -d "${corpus}" ]]; then
    echo " SKIP: tests/benchmark/corpus not present"
    return 0
  fi

  # The driver invokes gates as `if ! gate_…`, and bash ignores `set -e` for
  # everything executed inside a function called from a conditional. The
  # status therefore has to be captured and checked by hand; otherwise a
  # crashed scan (or a failed build) still falls through to the PASS line.
  local scan_rc=0
  cargo run --release --quiet -- scan \
    --format json \
    "${corpus}" > /tmp/m7_gate1.json || scan_rc=$?

  # Exit 1 is tolerated because the real-corpus gates in this script accept
  # 0 and 1 from `nyx scan` as well.
  # NOTE(review): presumably 1 means "findings reported" — confirm against
  # the nyx CLI contract.
  if [[ ${scan_rc} -ne 0 && ${scan_rc} -ne 1 ]]; then
    echo " FAIL: static scan exited ${scan_rc}"
    return 1
  fi
  echo " PASS: static scan completed"
}
# ── Gate 2 ────────────────────────────────────────────────────────────────────
#######################################
# Gate 2: the `dynamic` feature test suite plus the #[ignore]d build-pool
# perf benches must both be green.
# Outputs:   progress lines on stdout (plus whatever cargo prints)
# Returns:   0 when both nextest invocations succeed, 1 otherwise
#######################################
gate_2_dynamic_tests() {
  echo "── Gate 2: cargo nextest run --features dynamic ──"

  # Each invocation is checked explicitly: the driver runs gates from an `if`,
  # where bash ignores `set -e` for the whole function body, so an unchecked
  # test failure would fall straight through to the PASS line.
  cargo nextest run --features dynamic \
    || { echo " FAIL: dynamic test suite failed"; return 1; }

  # The real-toolchain build-pool perf benches (dynamic_*_build_pool +
  # dynamic_java_compile_pool) are #[ignore]d so the default inner-loop
  # suite stays hermetic + fast: no cargo/go/cc/c++/npm/pip/composer/
  # bundle/javac spawns. Run them explicitly here so CI still exercises
  # the warm-pool compile path end to end. They self-skip when a
  # toolchain is missing, so a toolchain-less CI row stays green.
  cargo nextest run --features dynamic --run-ignored ignored-only \
    -E 'binary(~build_pool) | binary(~compile_pool)' \
    || { echo " FAIL: build-pool / compile-pool benches failed"; return 1; }

  echo " PASS: dynamic test suite green"
}
# ── Gate 3: with-verify / static-only ratio ───────────────────────────────────
# Phase 23 target: ratio ≤ 1.5×, now that the cross-lang build pools
# give every shipped language a warm cache (was ≤ 2× under Phase 22).
GATE3_RATIO_TARGET="${GATE3_RATIO_TARGET:-1.5}"

#######################################
# Gate 3: with-verify / static-only wall-clock ratio on benches/fixtures/.
# Globals:   REPO_ROOT, GATE3_RATIO_TARGET (read);
#            NYX_DYNAMIC_BUILD_POOL (exported)
# Outputs:   both timings, the ratio and the verdict on stdout
# Returns:   0 on pass or skip (fixtures absent); 1 when a scan fails, a
#            timing is unusable, or the ratio exceeds GATE3_RATIO_TARGET
#######################################
gate_3_verify_ratio() {
  echo "── Gate 3: with-verify / static-only ratio on benches/fixtures/ ──"
  local fixtures="${REPO_ROOT}/benches/fixtures"
  if [[ ! -d "${fixtures}" ]]; then
    echo " SKIP: ${fixtures} not present"
    return 0
  fi

  # Phase 23: the warm build pools are what buy the ≤ 1.5× ratio, so
  # make sure they are on for both scans even if the caller's env
  # disabled them. Default is already ON for every shipped language.
  export NYX_DYNAMIC_BUILD_POOL="java=1,node=1,python=1,php=1,ruby=1,go=1,rust=1,c=1,cpp=1"

  # A failing command substitution does not abort the script here (the driver
  # calls gates from an `if`, where `set -e` is ignored), so the status of
  # each timing run is checked explicitly.
  local static_seconds verify_seconds
  static_seconds="$(time_scan "${fixtures}" 0)" \
    || { echo " FAIL: static-only scan failed"; return 1; }
  verify_seconds="$(time_scan "${fixtures}" 1)" \
    || { echo " FAIL: with-verify scan failed"; return 1; }

  # An empty or non-numeric timing would otherwise be coerced to 0 by awk and
  # yield a ratio of 0, which trivially "passes".
  local number_re='^[0-9]+([.][0-9]+)?$'
  if [[ ! "${static_seconds}" =~ ${number_re} || ! "${verify_seconds}" =~ ${number_re} ]]; then
    echo " FAIL: could not measure scan wall-clock (static='${static_seconds}', verify='${verify_seconds}')"
    return 1
  fi

  local ratio
  ratio="$(awk -v v="${verify_seconds}" -v s="${static_seconds}" \
    'BEGIN { if (s + 0 <= 0) { print "inf"; exit } printf "%.3f", v / s }')"
  echo " static-only wall-clock: ${static_seconds}s"
  echo " with-verify wall-clock: ${verify_seconds}s"
  echo " ratio: ${ratio} (target ≤ ${GATE3_RATIO_TARGET})"

  # Handle the sentinel in the shell: whether awk turns the string "inf" into
  # infinity or into 0 depends on the awk implementation, and 0 would pass.
  if [[ "${ratio}" == "inf" ]]; then
    echo " FAIL: static-only wall-clock is not positive; ratio is undefined"
    return 1
  fi

  awk -v r="${ratio}" -v t="${GATE3_RATIO_TARGET}" \
    'BEGIN { if (r+0 > t+0) exit 1 }' \
    || { echo " FAIL: ratio exceeds target"; return 1; }
  echo " PASS"
}
# Print wall-clock seconds for a single scan run.
#   $1 = path to scan
#   $2 = 0 for static-only, 1 for --verify
# Stdout:  elapsed seconds formatted "%.3f", only when the scan completed
# Stderr:  a diagnostic when the scan exits with an unexpected status
# Returns: 0 on success; 1 when the clock cannot be read or the scan exits
#          with anything other than 0 or 1
time_scan() {
  local path="$1" verify="$2"
  local -a args=("--format" "json")
  if [[ "${verify}" == "1" ]]; then
    args+=("--verify")
  fi
  args+=("${path}")

  # This function runs inside `$(…)`, where bash clears `set -e`, so every
  # status that matters is checked by hand. Previously a crashed scan still
  # produced a (tiny) timing and the ratio gate compared garbage.
  local start end scan_rc=0
  start="$(python3 -c 'import time;print(time.monotonic())')" || return 1
  cargo run --release --quiet --features dynamic -- scan "${args[@]}" > /dev/null \
    || scan_rc=$?
  end="$(python3 -c 'import time;print(time.monotonic())')" || return 1

  # Exit 1 is tolerated because the real-corpus gates in this script accept
  # 0 and 1 from `nyx scan` as well.
  # NOTE(review): presumably 1 means "findings reported" — confirm against
  # the nyx CLI contract.
  if [[ ${scan_rc} -ne 0 && ${scan_rc} -ne 1 ]]; then
    echo "time_scan: scan of ${path} exited ${scan_rc}" >&2
    return 1
  fi
  awk -v a="${start}" -v b="${end}" 'BEGIN { printf "%.3f", b - a }'
}
# ── Gate 4 ────────────────────────────────────────────────────────────────────
#######################################
# Gate 4: run the sarif_dynamic_verdict_tests integration test.
# Outputs:   progress lines on stdout (plus whatever cargo prints)
# Returns:   0 when the test binary passes, 1 otherwise
#######################################
gate_4_sarif_schema() {
  echo "── Gate 4: SARIF schema validation ──"
  # Checked explicitly: the driver calls gates from an `if`, where bash
  # ignores `set -e`, so an unchecked failure would still print PASS.
  cargo nextest run --features dynamic --test sarif_dynamic_verdict_tests \
    || { echo " FAIL: SARIF schema validation tests failed"; return 1; }
  echo " PASS"
}
# ── Gate 5 ────────────────────────────────────────────────────────────────────
#######################################
# Gate 5: run the dynamic_layering integration test.
# Outputs:   progress lines on stdout (plus whatever cargo prints)
# Returns:   0 when the test binary passes, 1 otherwise
#######################################
gate_5_layering() {
  echo "── Gate 5: dynamic layering boundary ──"
  # Checked explicitly: the driver calls gates from an `if`, where bash
  # ignores `set -e`, so an unchecked failure would still print PASS.
  cargo nextest run --features dynamic --test dynamic_layering \
    || { echo " FAIL: dynamic layering boundary test failed"; return 1; }
  echo " PASS"
}
# ── Gate 6: Java OWASP-scale ratio ────────────────────────────────────────────
# Phase 22 + Phase 27 jointly own this gate. The wall-clock budgets
# are split: 10 min on the dev reference (M1 macOS w/ JDK 21) and 15
# min in CI. Override `NYX_OWASP_WALLCLOCK_BUDGET_SECONDS` to tighten.
GATE6_WALLCLOCK_BUDGET="${NYX_OWASP_WALLCLOCK_BUDGET_SECONDS:-900}"
GATE6_CONFIRMED_RATE_TARGET="${NYX_OWASP_CONFIRMED_RATE_TARGET:-0.40}"
# Phase 27 acceptance: per-cap precision >= 0.85, recall >= 0.40.
GATE6_PRECISION_TARGET="${NYX_OWASP_PRECISION_TARGET:-0.85}"
GATE6_RECALL_TARGET="${NYX_OWASP_RECALL_TARGET:-0.40}"
# Per-cap confirmation floors (confirmed-rate / precision / recall) are
# HARD-enforced only for the caps named here; every cap is still measured and
# its numbers published either way. Empty = report-only (publish the per-cap
# table, fail nothing on those three metrics) while the verifier still cannot
# Confirm OWASP findings end to end: today every BenchmarkTest servlet harness
# lands in Inconclusive(BuildFailed) or Inconclusive(SpecDerivationFailed)
# (Java servlet entry + classpath are Track L.12 / Track O.0 work), so 0 caps
# meet the 40% / 85% / 40% headline. The gate therefore enforces what the
# verifier already satisfies — wall-clock, no false confirms, the per-cell
# budget — and publishes the unmet detection/confirmation numbers as the
# ratchet's destination. Set NYX_OWASP_FLOOR_CAPS (e.g. "sqli,cmdi") to
# hard-gate a cap the moment it starts Confirming.
GATE6_FLOOR_CAPS="${NYX_OWASP_FLOOR_CAPS:-}"
GATE6_BUDGET="${NYX_OWASP_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"

#######################################
# Gate 6: `nyx scan --verify` over the OWASP Benchmark checkout named by
# NYX_OWASP_CORPUS, under a wall-clock guard, followed by tabulation against
# the committed ground truth and the budget / floor report.
# Globals:   REPO_ROOT, NYX_OWASP_CORPUS, TMPDIR, GATE6_* (read)
# Outputs:   progress lines on stdout; reports under /tmp/m7_gate6_*
# Returns:   0 on pass or skip (corpus not configured), 1 on any failure
#######################################
gate_6_owasp_scale() {
  echo "── Gate 6: Java OWASP Benchmark v1.2 verify wall-clock + confirmed-rate ──"
  local corpus="${NYX_OWASP_CORPUS:-}"
  if [[ -z "${corpus}" || ! -d "${corpus}" ]]; then
    echo " SKIP: set NYX_OWASP_CORPUS to a v1.2 checkout to run this gate."
    echo " (Gate 6 is Phase 22's headline acceptance for the warm javac daemon.)"
    return 0
  fi

  local scan_report="/tmp/m7_gate6_scan.json"
  local results_report="/tmp/m7_gate6_results.json"
  local wallclock_report="/tmp/m7_gate6_wallclock.txt"
  local gate_home="${TMPDIR:-/tmp}/nyx_m7_gate6_home"
  local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_gate6_build_pool"
  local wallclock

  # The driver calls gates from an `if`, where bash ignores `set -e` for the
  # whole function body, so the setup steps are checked by hand. An unchecked
  # build failure would let the gate scan with a stale (or missing) binary,
  # and an unchecked `rm` could leave a previous run's report to be tabulated.
  cargo build --release --quiet --features dynamic \
    || { echo " FAIL: cargo build failed; refusing to scan with a stale or missing binary"; return 1; }
  mkdir -p "${gate_home}" "${gate_build_pool}" \
    || { echo " FAIL: cannot create ${gate_home} / ${gate_build_pool}"; return 1; }
  rm -f "${scan_report}" "${results_report}" "${wallclock_report}" \
    || { echo " FAIL: cannot clear reports left by a previous run"; return 1; }

  # The Python wrapper runs the scan with stdout captured into the report and
  # a hard timeout. Its exit status is:
  #   the scanner's own status,
  #   124 when the timeout expired,
  #   125 when the wrapper itself failed (binary could not be started, or a
  #       report file could not be written).
  # 125 keeps wrapper failures apart from the scanner's exit 1, which the
  # checks below accept.
  set +e
  HOME="${gate_home}" \
  NYX_BUILD_POOL_DIR="${gate_build_pool}" \
  python3 - "${GATE6_WALLCLOCK_BUDGET}" "${scan_report}" "${wallclock_report}" \
    "${REPO_ROOT}/target/release/nyx" scan \
    --verify \
    --index off \
    --format json \
    --quiet \
    "${corpus}" <<'PY'
import subprocess
import sys
import time

budget = float(sys.argv[1])
scan_report = sys.argv[2]
wallclock_report = sys.argv[3]
cmd = sys.argv[4:]

start = time.monotonic()
rc = 0
try:
    try:
        with open(scan_report, "wb") as out:
            completed = subprocess.run(cmd, stdout=out, timeout=budget)
        rc = completed.returncode
    except subprocess.TimeoutExpired:
        rc = 124
    finally:
        elapsed = time.monotonic() - start
        with open(wallclock_report, "w") as f:
            f.write(f"{elapsed:.1f}\n")
except OSError as exc:
    print(f"m7_ship_gate: scan wrapper error: {exc}", file=sys.stderr)
    rc = 125
sys.exit(rc)
PY
  local nyx_exit=$?
  set -e

  # Fall back to the budget itself when the wrapper never wrote a timing.
  wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${GATE6_WALLCLOCK_BUDGET}")"
  echo " OWASP verify wall-clock: ${wallclock}s (budget ${GATE6_WALLCLOCK_BUDGET}s)"
  if [[ ${nyx_exit} -eq 124 ]]; then
    echo " FAIL: nyx scan exceeded wall-clock budget"
    return 1
  fi
  if [[ ${nyx_exit} -eq 125 ]]; then
    echo " FAIL: nyx scan wrapper error (nyx could not be launched or a report file could not be written)"
    return 1
  fi
  if [[ ${nyx_exit} -ne 0 && ${nyx_exit} -ne 1 ]]; then
    echo " FAIL: nyx scan exited ${nyx_exit}"
    return 1
  fi
  if [[ ! -s "${scan_report}" ]]; then
    echo " FAIL: nyx scan produced no JSON report"
    return 1
  fi
  awk -v w="${wallclock}" -v b="${GATE6_WALLCLOCK_BUDGET}" \
    'BEGIN { if (w+0 > b+0) exit 1 }' \
    || { echo " FAIL: wall-clock exceeds budget"; return 1; }

  # tabulate.py is invoked with --append, so seed the results file first.
  echo "[]" > "${results_report}"
  python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" \
    --label owasp \
    --scan "${scan_report}" \
    --ground-truth "${REPO_ROOT}/tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json" \
    --append "${results_report}" \
    || { echo " FAIL: OWASP result tabulation failed"; return 1; }

  local -a report_args=(
    --results "${results_report}"
    --budget "${GATE6_BUDGET}"
  )
  if [[ -n "${GATE6_FLOOR_CAPS}" ]]; then
    report_args+=(
      --floor-caps "${GATE6_FLOOR_CAPS}"
      --min-confirmed-rate "${GATE6_CONFIRMED_RATE_TARGET}"
      --min-precision "${GATE6_PRECISION_TARGET}"
      --min-recall "${GATE6_RECALL_TARGET}"
    )
    echo " enforcing per-cap floors (confirmed >= ${GATE6_CONFIRMED_RATE_TARGET}, precision >= ${GATE6_PRECISION_TARGET}, recall >= ${GATE6_RECALL_TARGET}) on: ${GATE6_FLOOR_CAPS}"
  else
    echo " per-cap confirmed/precision/recall: report-only (NYX_OWASP_FLOOR_CAPS unset; no cap Confirms OWASP yet)"
  fi
  python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \
    || { echo " FAIL: OWASP per-cell budget exceeded or a gated per-cap floor missed"; return 1; }
  echo " PASS"
}
# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ──────────────
# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora. Same
# wall-clock split (10 min dev reference / 15 min CI) and the same
# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the
# per-cap confirmed-rate / precision / recall numbers are published but gate
# nothing, while the per-(cap,lang) budget (unsupported_rate,
# false_confirmed_rate) is hard-enforced. Promote a cap into the floor set
# once it starts Confirming end to end.
GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}"
GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}"
GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}"
GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}"
GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}"
GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"

#######################################
# Run one real-corpus `--verify` row: scan under a wall-clock guard,
# tabulate against the committed ground truth, enforce the per-cell budget,
# publish (or, when floor caps are set, enforce) the per-cap floors.
# Globals:   REPO_ROOT, TMPDIR, GATE7_* (read)
# Arguments: $1 label   $2 corpus dir   $3 ground-truth json
# Outputs:   progress lines on stdout; reports under /tmp/m7_gate7_<label>_*
# Returns:   0 on pass, 1 on fail. Caller decides skip, and is expected to
#            have built target/release/nyx already.
#######################################
_gate7_run_corpus() {
  local label="$1" corpus="$2" gt="$3"
  local scan_report="/tmp/m7_gate7_${label}_scan.json"
  local results_report="/tmp/m7_gate7_${label}_results.json"
  local wallclock_report="/tmp/m7_gate7_${label}_wallclock.txt"
  local gate_home="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_home"
  local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_build_pool"
  local wallclock

  # This helper is called from an `if`, where bash ignores `set -e`, so the
  # setup steps are checked by hand. An unchecked `rm` could leave a previous
  # run's report in place to be tabulated as if it were fresh.
  mkdir -p "${gate_home}" "${gate_build_pool}" \
    || { echo " FAIL: ${label} cannot create ${gate_home} / ${gate_build_pool}"; return 1; }
  rm -f "${scan_report}" "${results_report}" "${wallclock_report}" \
    || { echo " FAIL: ${label} cannot clear reports left by a previous run"; return 1; }

  # The Python wrapper runs the scan with stdout captured into the report and
  # a hard timeout. Its exit status is:
  #   the scanner's own status,
  #   124 when the timeout expired,
  #   125 when the wrapper itself failed (binary could not be started, or a
  #       report file could not be written).
  # 125 keeps wrapper failures apart from the scanner's exit 1, which the
  # checks below accept.
  set +e
  HOME="${gate_home}" \
  NYX_BUILD_POOL_DIR="${gate_build_pool}" \
  python3 - "${GATE7_WALLCLOCK_BUDGET}" "${scan_report}" "${wallclock_report}" \
    "${REPO_ROOT}/target/release/nyx" scan \
    --verify \
    --index off \
    --format json \
    --quiet \
    "${corpus}" <<'PY'
import subprocess
import sys
import time

budget = float(sys.argv[1])
scan_report = sys.argv[2]
wallclock_report = sys.argv[3]
cmd = sys.argv[4:]

start = time.monotonic()
rc = 0
try:
    try:
        with open(scan_report, "wb") as out:
            completed = subprocess.run(cmd, stdout=out, timeout=budget)
        rc = completed.returncode
    except subprocess.TimeoutExpired:
        rc = 124
    finally:
        elapsed = time.monotonic() - start
        with open(wallclock_report, "w") as f:
            f.write(f"{elapsed:.1f}\n")
except OSError as exc:
    print(f"m7_ship_gate: scan wrapper error: {exc}", file=sys.stderr)
    rc = 125
sys.exit(rc)
PY
  local nyx_exit=$?
  set -e

  # Fall back to the budget itself when the wrapper never wrote a timing.
  wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${GATE7_WALLCLOCK_BUDGET}")"
  echo " ${label} verify wall-clock: ${wallclock}s (budget ${GATE7_WALLCLOCK_BUDGET}s)"
  if [[ ${nyx_exit} -eq 124 ]]; then
    echo " FAIL: ${label} scan exceeded wall-clock budget"
    return 1
  fi
  if [[ ${nyx_exit} -eq 125 ]]; then
    echo " FAIL: ${label} scan wrapper error (nyx could not be launched or a report file could not be written)"
    return 1
  fi
  if [[ ${nyx_exit} -ne 0 && ${nyx_exit} -ne 1 ]]; then
    echo " FAIL: ${label} scan exited ${nyx_exit}"
    return 1
  fi
  if [[ ! -s "${scan_report}" ]]; then
    echo " FAIL: ${label} scan produced no JSON report"
    return 1
  fi
  awk -v w="${wallclock}" -v b="${GATE7_WALLCLOCK_BUDGET}" \
    'BEGIN { if (w+0 > b+0) exit 1 }' \
    || { echo " FAIL: ${label} wall-clock exceeds budget"; return 1; }

  # tabulate.py is invoked with --append, so seed the results file first.
  echo "[]" > "${results_report}"
  python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" \
    --label "${label}" \
    --scan "${scan_report}" \
    --ground-truth "${gt}" \
    --append "${results_report}" \
    || { echo " FAIL: ${label} result tabulation failed"; return 1; }

  local -a report_args=(
    --results "${results_report}"
    --budget "${GATE7_BUDGET}"
  )
  if [[ -n "${GATE7_FLOOR_CAPS}" ]]; then
    report_args+=(
      --floor-caps "${GATE7_FLOOR_CAPS}"
      --min-confirmed-rate "${GATE7_CONFIRMED_RATE_TARGET}"
      --min-precision "${GATE7_PRECISION_TARGET}"
      --min-recall "${GATE7_RECALL_TARGET}"
    )
    echo " enforcing per-cap floors (confirmed >= ${GATE7_CONFIRMED_RATE_TARGET}, precision >= ${GATE7_PRECISION_TARGET}, recall >= ${GATE7_RECALL_TARGET}) on: ${GATE7_FLOOR_CAPS}"
  else
    echo " per-cap confirmed/precision/recall: report-only (NYX_JSTS_FLOOR_CAPS unset)"
  fi
  python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \
    || { echo " FAIL: ${label} per-cell budget exceeded or a gated per-cap floor missed"; return 1; }
  return 0
}
#######################################
# Gate 7: run the JS/TS real-corpus rows (NodeGoat, Juice Shop) through
# _gate7_run_corpus. A row is skipped unless its environment variable names
# an existing directory; `--sets nodegoat|juiceshop` restricts the run to
# that one row.
# Globals:   REPO_ROOT, SETS, NYX_NODEGOAT_CORPUS, NYX_JUICESHOP_CORPUS (read)
# Outputs:   per-row and overall verdict lines on stdout
# Returns:   0 when every row that ran passed, or when no row ran;
#            1 when the build failed or any row failed
#######################################
gate_7_jsts_scale() {
  echo "── Gate 7: JS/TS real-corpus (NodeGoat + Juice Shop) verify acceptance ──"
  # name : env var holding the corpus dir : committed ground-truth file
  local -a rows=(
    "nodegoat:NYX_NODEGOAT_CORPUS:nodegoat.json"
    "juiceshop:NYX_JUICESHOP_CORPUS:juiceshop.json"
  )
  local row name envvar gtfile corpus
  local any_ran=0 any_failed=0 built=0

  for row in "${rows[@]}"; do
    IFS=: read -r name envvar gtfile <<<"${row}"
    # When --sets names a single corpus, only run that row.
    if [[ -n "${SETS}" && "${SETS}" != "jsts" && "${SETS}" != "${name}" ]]; then
      continue
    fi
    corpus="${!envvar:-}"
    if [[ -z "${corpus}" || ! -d "${corpus}" ]]; then
      echo " SKIP ${name}: set ${envvar} to a checkout to run this row."
      continue
    fi

    # Build once, and only when a row will actually run, so an all-skipped
    # gate stays a cheap skip. The status is checked by hand because the
    # driver calls gates from an `if`, where bash ignores `set -e`; an
    # unchecked failure would let the rows scan with a stale or missing
    # binary.
    if [[ ${built} -eq 0 ]]; then
      cargo build --release --quiet --features dynamic \
        || { echo " FAIL: cargo build failed; refusing to scan with a stale or missing binary"; return 1; }
      built=1
    fi

    any_ran=1
    echo " ── ${name} (${corpus}) ──"
    if _gate7_run_corpus "${name}" "${corpus}" \
      "${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}"; then
      echo " PASS ${name}"
    else
      any_failed=1
    fi
  done

  if [[ ${any_ran} -eq 0 ]]; then
    echo " SKIP: no JS/TS corpus configured (set NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS)."
    echo " (Gate 7 is Phase 28's headline acceptance for the JS/TS real corpora.)"
    return 0
  fi
  [[ ${any_failed} -eq 0 ]] || return 1
  echo " PASS"
}
# ── Driver ────────────────────────────────────────────────────────────────────
# Ids of the gates that reported failure, in the order they ran.
declare -a FAILED=()

# Run gate_<id>_<suffix> when <id> is selected and record the id on failure.
#   $1 = gate id   $2 = function-name suffix
# Always returns 0; failures are collected in FAILED, not propagated.
# Because the gate is invoked on the left of `||`, bash ignores `set -e`
# inside it: a gate signals failure only through its own return status.
run_gate() {
  local gate_id="$1" suffix="$2"
  want_gate "${gate_id}" || return 0
  "gate_${gate_id}_${suffix}" || FAILED+=("${gate_id}")
}

# Dispatch table, in execution order: "<id>:<function-name suffix>".
for gate_spec in \
  1:static_corpus \
  2:dynamic_tests \
  3:verify_ratio \
  4:sarif_schema \
  5:layering \
  6:owasp_scale \
  7:jsts_scale; do
  run_gate "${gate_spec%%:*}" "${gate_spec#*:}"
done

if (( ${#FAILED[@]} )); then
  printf '\nFAILED gates: %s\n' "${FAILED[*]}"
  exit 1
fi
printf '\nAll requested gates passed.\n'