nyx/tests/eval_corpus/run.sh

#!/usr/bin/env bash
# Eval corpus runner for M7 pre-flip gate calibration.
#
# Usage:
#   tests/eval_corpus/run.sh [--output DIR] [--nyx BIN] [--sets owasp,sard,inhouse]
#                            [--budget FILE] [--diff FILE]
#
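# Evaluates OWASP Benchmark v1.2, a NIST SARD subset, and in-house
# bughunt-curated fixtures. The OWASP and SARD corpora are not downloaded
# automatically: when one is missing from the corpus cache, fetch
# instructions are printed and that set is skipped. Runs `nyx scan --verify`
# on each available set. Emits per-cell (cap x language) precision/recall
# table and per-cap Unsupported rate to stdout (and --output DIR if given).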
#
# Environment:
#   NYX_EVAL_CORPUS_DIR  — path to pre-downloaded corpus roots
#                          (default: ~/.cache/nyx/eval_corpus)
#   NYX_BIN              — path to nyx binary
#                          (default: <repo root>/target/release/nyx)
#
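# Exit codes:
#   0 — all gate thresholds met (also returned when report.py is absent)
#   1 — setup or I/O error
#   2 — one or more gate thresholds exceeded (see output for details)
#   3 — budget/diff configuration malformed (reported by report.py)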

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute directory containing this script, and the directory two levels
# above it, which the rest of the script treats as the repository root.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
REPO_ROOT="$(
  cd "${SCRIPT_DIR}/../.." && pwd
)"

# ── Defaults ──────────────────────────────────────────────────────────────────
# Starting values; several of these are overridden by the command-line flags
# parsed right after this section.
SETS="owasp,sard,inhouse"   # comma-separated corpus sets to evaluate
OUTPUT_DIR=""               # empty means no --output directory was requested

# Environment variables take precedence over the built-in locations.
: "${NYX_BIN:=${REPO_ROOT}/target/release/nyx}"
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"

# Phase 29 (Track I): per-cell budgets + monotonic-improvement diff.
# Both stay empty unless --budget / --diff are passed.
BUDGET_FILE=""
DIFF_FILE=""

# ── Argument parsing ──────────────────────────────────────────────────────────
# parse_args ARG...
#   Applies command-line flags on top of the defaults by assigning the
#   globals OUTPUT_DIR, NYX_BIN, SETS, BUDGET_FILE and DIFF_FILE.
#
#   Every recognised flag takes exactly one value. A flag given without its
#   value aborts with exit status 1 and a readable message, instead of the
#   bare "unbound variable" failure that `set -u` would otherwise produce.
#   Unrecognised arguments are still skipped, so existing callers keep
#   working, but each one is reported on stderr so that a mistyped flag
#   (for example a misspelled --budget) does not silently disable a gate.
parse_args() {
  local flag
  while [[ $# -gt 0 ]]; do
    flag="$1"
    case "$flag" in
      --output|--nyx|--sets|--budget|--diff)
        if [[ $# -lt 2 ]]; then
          # Reported inline: the die() helper is defined later in the file.
          echo "error: option ${flag} requires a value" >&2
          exit 1
        fi
        case "$flag" in
          --output) OUTPUT_DIR="$2" ;;
          --nyx)    NYX_BIN="$2" ;;
          --sets)   SETS="$2" ;;
          --budget) BUDGET_FILE="$2" ;;
          --diff)   DIFF_FILE="$2" ;;
        esac
        shift 2
        ;;
      *)
        echo "warning: ignoring unrecognized argument: ${flag}" >&2
        shift
        ;;
    esac
  done
}
parse_args "$@"

# ── Helpers ───────────────────────────────────────────────────────────────────
# die MSG...
#   Writes "error: MSG" to stderr and terminates the script with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# info MSG...
#   Writes a progress line, prefixed with "[eval] ", to stdout.
info() {
  printf '[eval] %s\n' "$*"
}

# require_cmd NAME
#   Aborts via die() unless NAME resolves to a command in the current shell.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "required command not found: $1"
  fi
}
# Preflight: the external tools and the scanner binary must be present
# before any scan is attempted.
# NOTE(review): jq is required here but is not invoked anywhere in this
# script; presumably one of the Python helpers shells out to it — confirm.
require_cmd jq
require_cmd python3

if [[ ! -x "$NYX_BIN" ]]; then
  die "nyx binary not found or not executable: $NYX_BIN"
fi

# Create the corpus cache, plus the output directory when one was requested.
mkdir -p "$CORPUS_CACHE"
if [[ -n "$OUTPUT_DIR" ]]; then
  mkdir -p "$OUTPUT_DIR"
fi

# Timestamped results file (under /tmp when no output directory is set).
# It starts out as an empty JSON array and is handed to every tabulate.py
# invocation through --append.
RESULTS_JSON="${OUTPUT_DIR:-/tmp}/eval_results_$(date +%Y%m%d_%H%M%S).json"
printf '%s\n' "[]" > "$RESULTS_JSON"

# ── OWASP Benchmark v1.2 bootstrap ───────────────────────────────────────────
# When the owasp set is selected: scan the cached checkout and hand the scan
# output to tabulate.py, or, if the checkout is missing, print how to fetch
# it and skip the set.
OWASP_DIR="${CORPUS_CACHE}/owasp_benchmark_v1.2"
if [[ "$SETS" == *owasp* ]]; then
  if [[ -d "$OWASP_DIR" ]]; then
    info "Running nyx scan on OWASP Benchmark v1.2..."
    # Record the scanner's status without letting errexit end the script.
    NYX_EXIT=0
    "$NYX_BIN" scan --format json --verify --no-index "$OWASP_DIR" \
      > /tmp/nyx_owasp.json 2>/tmp/nyx_owasp.stderr \
      || NYX_EXIT=$?
    case "$NYX_EXIT" in
      0|1)
        # Statuses 0 and 1 are both accepted as a usable scan (presumably 1
        # means findings were reported — confirm against the nyx CLI).
        # A tabulate.py failure is logged and does not stop the run.
        python3 "${SCRIPT_DIR}/tabulate.py" \
          --label owasp \
          --scan /tmp/nyx_owasp.json \
          --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
          --append "$RESULTS_JSON" \
          ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
          ${DIFF_FILE:+--diff "$DIFF_FILE"} \
          || info "  tabulate.py failed; ground truth file may be absent"
        ;;
      *)
        info "  nyx exited $NYX_EXIT on OWASP set (stderr follows):"
        cat /tmp/nyx_owasp.stderr >&2
        ;;
    esac
  else
    info "Bootstrapping OWASP Benchmark v1.2..."
    info "  Clone from https://github.com/OWASP-Benchmark/BenchmarkJava"
    info "  into ${OWASP_DIR}"
    info "  then re-run this script."
    info "  git clone --depth 1 --branch v1.2 \\"
    info "    https://github.com/OWASP-Benchmark/BenchmarkJava \\"
    info "    ${OWASP_DIR}"
    info "Skipping OWASP set (not yet downloaded)."
  fi
fi

# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
# When the sard set is selected: scan the cached SARD tree and hand the scan
# output to tabulate.py, or, if the tree is missing, print where to get it
# and skip the set.
SARD_DIR="${CORPUS_CACHE}/nist_sard"
if [[ "$SETS" == *sard* ]]; then
  if [[ ! -d "$SARD_DIR" ]]; then
    info "Bootstrapping NIST SARD subset..."
    info "  Download from https://samate.nist.gov/SARD/"
    info "  into ${SARD_DIR} then re-run this script."
    info "Skipping SARD set (not yet downloaded)."
  else
    info "Running nyx scan on NIST SARD subset..."
    # errexit is suspended only around the scan so its status can be inspected.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$SARD_DIR" \
      > /tmp/nyx_sard.json 2>/tmp/nyx_sard.stderr
    NYX_EXIT=$?
    set -e
    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
      # Surface the scanner's stderr, as the OWASP section does, so the
      # failure can be diagnosed from the run log alone.
      info "  nyx exited $NYX_EXIT on SARD set (stderr follows):"
      cat /tmp/nyx_sard.stderr >&2
    else
      # Statuses 0 and 1 are both accepted as a usable scan. A tabulate.py
      # failure is logged and does not stop the run.
      python3 "${SCRIPT_DIR}/tabulate.py" \
        --label sard \
        --scan /tmp/nyx_sard.json \
        --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
        --append "$RESULTS_JSON" \
        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
        || info "  tabulate.py failed; ground truth file may be absent"
    fi
  fi
fi

# ── In-house bughunt-curated set ──────────────────────────────────────────────
# When the inhouse set is selected: scan each in-repo fixture directory that
# exists and tabulate it with --inhouse (no external ground-truth file is
# passed). Each directory is labelled "inhouse_<dirname>".
if [[ "$SETS" == *inhouse* ]]; then
  INHOUSE_DIRS=(
    "${REPO_ROOT}/tests/benchmark/corpus"
    "${REPO_ROOT}/tests/dynamic_fixtures"
  )
  for dir in "${INHOUSE_DIRS[@]}"; do
    # Missing fixture directories are skipped silently.
    [[ -d "$dir" ]] || continue
    label="inhouse_$(basename "$dir")"
    info "Running nyx scan on in-house set: $dir"
    # errexit is suspended only around the scan so its status can be inspected.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
      > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
    NYX_EXIT=$?
    set -e
    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
      # Surface the scanner's stderr, as the OWASP section does, so the
      # failure can be diagnosed from the run log alone.
      info "  nyx exited $NYX_EXIT on $label (stderr follows):"
      cat "/tmp/nyx_${label}.stderr" >&2
      continue
    fi
    # A tabulate.py failure is logged and the loop moves on to the next set.
    python3 "${SCRIPT_DIR}/tabulate.py" \
      --label "$label" \
      --scan "/tmp/nyx_${label}.json" \
      --inhouse \
      --append "$RESULTS_JSON" \
      ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
      ${DIFF_FILE:+--diff "$DIFF_FILE"} \
      || info "  tabulate.py failed on $label"
  done
fi

# ── Emit summary table ────────────────────────────────────────────────────────
info ""
info "Results written to: $RESULTS_JSON"

# With an output directory, also keep a copy under a fixed file name.
if [[ -n "$OUTPUT_DIR" ]]; then
  cp "$RESULTS_JSON" "${OUTPUT_DIR}/eval_results.json"
fi

# Without report.py there is nothing to evaluate; finish successfully.
if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
  info "report.py not available; raw results at $RESULTS_JSON"
  exit 0
fi

# Run the report and keep its status instead of letting errexit end the script.
REPORT_RC=0
python3 "${SCRIPT_DIR}/report.py" \
  --results "$RESULTS_JSON" \
  ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
  ${DIFF_FILE:+--diff "$DIFF_FILE"} \
  || REPORT_RC=$?

# Propagate gate-fail (exit 2) and malformed-config (exit 3) so the
# m7_ship_gate.sh Gate-1 dispatch can tell them apart.  Any other
# non-zero status is reported as a setup error (exit 1).
case "$REPORT_RC" in
  0)
    exit 0
    ;;
  2)
    exit 2
    ;;
  3)
    info "report.py: budget/diff configuration malformed; see $RESULTS_JSON"
    exit 3
    ;;
  *)
    info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
    exit 1
    ;;
esac