#!/usr/bin/env bash # Eval corpus runner. # # Usage: # tests/eval_corpus/run.sh [--output DIR] [--nyx BIN] [--sets owasp,sard,inhouse] # # Bootstraps OWASP Benchmark v1.2, the NIST SARD subset, and Nyx benchmark # fixtures. Runs `nyx scan --verify` on each. Emits # per-cell (cap x language) precision/recall table and per-cap Unsupported # rate to stdout (and --output DIR if given). # # Environment: # NYX_EVAL_CORPUS_DIR - path to pre-downloaded corpus roots # (default: ~/.cache/nyx/eval_corpus) # NYX_BIN - path to nyx binary (default: ./target/release/nyx) # # Exit codes: # 0 - all budget thresholds met # 1 - setup or I/O error # 2 - one or more budget thresholds exceeded (see output for details) set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" # Defaults OUTPUT_DIR="" NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}" CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}" SETS="owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse" # Optional per-cell budgets and monotonic-improvement diff. BUDGET_FILE="" DIFF_FILE="" while [[ $# -gt 0 ]]; do case "$1" in --output) OUTPUT_DIR="$2"; shift 2 ;; --nyx) NYX_BIN="$2"; shift 2 ;; --sets) SETS="$2"; shift 2 ;; --budget) BUDGET_FILE="$2"; shift 2 ;; --diff) DIFF_FILE="$2"; shift 2 ;; *) shift ;; esac done # ── Helpers ─────────────────────────────────────────────────────────────────── die() { echo "error: $*" >&2; exit 1; } info() { echo "[eval] $*"; } require_cmd() { command -v "$1" >/dev/null 2>&1 || die "required command not found: $1"; } require_cmd jq require_cmd python3 # Scan one ground-truth-labelled real corpus (NodeGoat / Juice Shop) and # tabulate it against its committed ground truth. Self-skips when the # corpus has not been cloned into the cache. run_jsts_corpus() { local label="$1" dir="$2" gt="$3" if [[ ! -d "$dir" ]]; then info "Bootstrapping $label..." info " Clone the corpus into ${dir} then re-run this script:" if [[ "$label" == "nodegoat" ]]; then info " git clone --depth 1 https://github.com/OWASP/NodeGoat ${dir}" else info " git clone --depth 1 --branch v15.0.0 \\" info " https://github.com/juice-shop/juice-shop ${dir}" fi info "Skipping $label set (not yet downloaded)." return 0 fi info "Running nyx scan on $label..." set +e "$NYX_BIN" scan --format json --verify --no-index "$dir" \ > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr" local rc=$? set -e if [[ $rc -ne 0 && $rc -ne 1 ]]; then info " nyx exited $rc on $label set (stderr follows):" cat "/tmp/nyx_${label}.stderr" >&2 return 0 fi python3 "${SCRIPT_DIR}/tabulate.py" \ --label "$label" \ --scan "/tmp/nyx_${label}.json" \ --ground-truth "$gt" \ --append "$RESULTS_JSON" \ ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ ${DIFF_FILE:+--diff "$DIFF_FILE"} \ || info " tabulate.py failed on $label; ground truth file may be absent" } # Scan one Track R.2 polyglot real corpus and tabulate it against its # committed ground truth, SCOPED to its target language (tabulate --lang) so # incidental other-language assets (e.g. vendored JS in a Rails / aiohttp app) # do not pollute the corpus's per-cap metrics. Self-skips when the corpus has # not been cloned into the cache; prints the exact clone command if so. # $1 label $2 dir $3 ground-truth json $4 target lang $5 repo $6 ref run_polyglot_corpus() { local label="$1" dir="$2" gt="$3" lang="$4" repo="$5" ref="$6" if [[ ! -d "$dir" ]]; then info "Bootstrapping $label..." info " git clone --depth 1 --branch ${ref} ${repo} ${dir}" info "Skipping $label set (not yet downloaded)." return 0 fi info "Running nyx scan on $label (lang scope: ${lang})..." set +e "$NYX_BIN" scan --format json --verify --no-index "$dir" \ > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr" local rc=$? set -e if [[ $rc -ne 0 && $rc -ne 1 ]]; then info " nyx exited $rc on $label set (stderr follows):" cat "/tmp/nyx_${label}.stderr" >&2 return 0 fi python3 "${SCRIPT_DIR}/tabulate.py" \ --label "$label" \ --scan "/tmp/nyx_${label}.json" \ --ground-truth "$gt" \ --lang "$lang" \ --append "$RESULTS_JSON" \ ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ ${DIFF_FILE:+--diff "$DIFF_FILE"} \ || info " tabulate.py failed on $label; ground truth file may be absent" } [[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN" mkdir -p "$CORPUS_CACHE" [[ -n "$OUTPUT_DIR" ]] && mkdir -p "$OUTPUT_DIR" RESULTS_JSON="${OUTPUT_DIR:-/tmp}/eval_results_$(date +%Y%m%d_%H%M%S).json" echo "[]" > "$RESULTS_JSON" # ── OWASP Benchmark v1.2 bootstrap ─────────────────────────────────────────── OWASP_DIR="${CORPUS_CACHE}/owasp_benchmark_v1.2" if [[ "$SETS" == *owasp* ]]; then if [[ ! -d "$OWASP_DIR" ]]; then info "Bootstrapping OWASP Benchmark v1.2..." info " Clone from https://github.com/OWASP-Benchmark/BenchmarkJava" info " into ${OWASP_DIR}" info " then re-run this script." info " git clone --depth 1 --branch 1.2beta \\" info " https://github.com/OWASP-Benchmark/BenchmarkJava \\" info " ${OWASP_DIR}" info "Skipping OWASP set (not yet downloaded)." else info "Running nyx scan on OWASP Benchmark v1.2..." set +e "$NYX_BIN" scan --format json --verify --no-index "$OWASP_DIR" \ > /tmp/nyx_owasp.json 2>/tmp/nyx_owasp.stderr NYX_EXIT=$? set -e if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then info " nyx exited $NYX_EXIT on OWASP set (stderr follows):" cat /tmp/nyx_owasp.stderr >&2 else python3 "${SCRIPT_DIR}/tabulate.py" \ --label owasp \ --scan /tmp/nyx_owasp.json \ --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \ --append "$RESULTS_JSON" \ ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ ${DIFF_FILE:+--diff "$DIFF_FILE"} \ || info " tabulate.py failed; ground truth file may be absent" fi fi fi # ── NodeGoat / Juice Shop (JS/TS) bootstrap — Track R.1 ─────────────────────── if [[ "$SETS" == *nodegoat* ]]; then run_jsts_corpus nodegoat "${CORPUS_CACHE}/nodegoat" \ "${SCRIPT_DIR}/ground_truth/nodegoat.json" fi if [[ "$SETS" == *juiceshop* ]]; then run_jsts_corpus juiceshop "${CORPUS_CACHE}/juiceshop" \ "${SCRIPT_DIR}/ground_truth/juiceshop.json" fi # ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ─────────────── if [[ "$SETS" == *railsgoat* ]]; then run_polyglot_corpus railsgoat "${CORPUS_CACHE}/railsgoat" \ "${SCRIPT_DIR}/ground_truth/railsgoat.json" ruby \ https://github.com/OWASP/railsgoat rails.5.0.0 fi if [[ "$SETS" == *dvwa* ]]; then run_polyglot_corpus dvwa "${CORPUS_CACHE}/dvwa" \ "${SCRIPT_DIR}/ground_truth/dvwa.json" php \ https://github.com/digininja/DVWA 2.5 fi if [[ "$SETS" == *dvpwa* ]]; then run_polyglot_corpus dvpwa "${CORPUS_CACHE}/dvpwa" \ "${SCRIPT_DIR}/ground_truth/dvpwa.json" python \ https://github.com/anxolerd/dvpwa master fi if [[ "$SETS" == *gosec* ]]; then run_polyglot_corpus gosec "${CORPUS_CACHE}/gosec" \ "${SCRIPT_DIR}/ground_truth/gosec.json" go \ https://github.com/securego/gosec v2.26.1 fi # RustSec advisory-db is the Rust negative control (empty ground truth): the # row asserts the Rust scan/verify path runs and Confirms nothing there. if [[ "$SETS" == *rustsec* ]]; then run_polyglot_corpus rustsec "${CORPUS_CACHE}/rustsec" \ "${SCRIPT_DIR}/ground_truth/rustsec.json" rust \ https://github.com/rustsec/advisory-db main fi # ── NIST SARD subset bootstrap ──────────────────────────────────────────────── SARD_DIR="${CORPUS_CACHE}/nist_sard" if [[ "$SETS" == *sard* ]]; then if [[ ! -d "$SARD_DIR" ]]; then info "Bootstrapping NIST SARD subset..." info " Download from https://samate.nist.gov/SARD/" info " into ${SARD_DIR} then re-run this script." info "Skipping SARD set (not yet downloaded)." else info "Running nyx scan on NIST SARD subset..." set +e "$NYX_BIN" scan --format json --verify --no-index "$SARD_DIR" \ > /tmp/nyx_sard.json 2>/tmp/nyx_sard.stderr NYX_EXIT=$? set -e if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then info " nyx exited $NYX_EXIT on SARD set" else python3 "${SCRIPT_DIR}/tabulate.py" \ --label sard \ --scan /tmp/nyx_sard.json \ --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \ --append "$RESULTS_JSON" \ ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ ${DIFF_FILE:+--diff "$DIFF_FILE"} \ || info " tabulate.py failed; ground truth file may be absent" fi fi fi # ── In-house bughunt-curated set ────────────────────────────────────────────── if [[ "$SETS" == *inhouse* ]]; then INHOUSE_DIRS=( "${REPO_ROOT}/tests/benchmark/corpus" "${REPO_ROOT}/tests/dynamic_fixtures" ) for dir in "${INHOUSE_DIRS[@]}"; do [[ -d "$dir" ]] || continue label="inhouse_$(basename "$dir")" info "Running nyx scan on in-house set: $dir" set +e "$NYX_BIN" scan --format json --verify --no-index "$dir" \ > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr" NYX_EXIT=$? set -e if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then info " nyx exited $NYX_EXIT on $label" continue fi python3 "${SCRIPT_DIR}/tabulate.py" \ --label "$label" \ --scan "/tmp/nyx_${label}.json" \ --inhouse \ --append "$RESULTS_JSON" \ ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ ${DIFF_FILE:+--diff "$DIFF_FILE"} \ || info " tabulate.py failed on $label" done fi # ── Emit summary table ──────────────────────────────────────────────────────── info "" info "Results written to: $RESULTS_JSON" [[ -n "$OUTPUT_DIR" ]] && cp "$RESULTS_JSON" "${OUTPUT_DIR}/eval_results.json" if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then info "report.py not available; raw results at $RESULTS_JSON" exit 0 fi set +e python3 "${SCRIPT_DIR}/report.py" \ --results "$RESULTS_JSON" \ ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ ${DIFF_FILE:+--diff "$DIFF_FILE"} REPORT_RC=$? set -e # Propagate budget failures (exit 2) and malformed config (exit 3). Treat other # non-zero exits as setup errors. if [[ $REPORT_RC -eq 2 ]]; then exit 2 elif [[ $REPORT_RC -eq 3 ]]; then info "report.py: budget/diff configuration malformed; see $RESULTS_JSON" exit 3 elif [[ $REPORT_RC -ne 0 ]]; then info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON" exit 1 fi exit 0