# nyx/.github/workflows/eval.yml
# (file-viewer header: 105 lines, 3.8 KiB, YAML)

# Phase 27 (Track R.0): OWASP Benchmark v1.2 real-corpus acceptance.
#
# Runs Gate 6 of scripts/m7_ship_gate.sh against a real OWASP BenchmarkJava
# checkout on every push to master and every pull request targeting master
# that touches the dynamic verifier (src/dynamic/), the eval-corpus harness
# (tests/eval_corpus/), the gate script, or this workflow file.
#
# Gate 6 enforces, against the committed ground truth:
#   * verify wall-clock <= 15 min (CI budget; the dev reference is 10 min),
#   * per-cap confirmed-rate >= 40%, precision >= 0.85, recall >= 0.40 for the
#     dynamically-supported OWASP caps,
#   * the per-(cap,lang) budget in tests/eval_corpus/budget.toml.
#
# The corpus is *not* vendored. It is cloned at the pinned 1.2beta tag (the
# tag that produced expectedresults-1.2beta.csv, the source of the ground
# truth) and cached so reruns skip the clone.
# Workflow display name.
name: eval

# The workflow token is granted read access to repository contents only.
permissions:
  contents: read
# Triggers: pushes to master and pull requests targeting master, limited to
# changes that touch one of the listed paths. Both events share the same
# path filter.
on:
  push:
    branches:
      - master
    paths:
      - 'src/dynamic/**'
      - 'tests/eval_corpus/**'
      - 'scripts/m7_ship_gate.sh'
      - '.github/workflows/eval.yml'
  pull_request:
    branches:
      - master
    paths:
      - 'src/dynamic/**'
      - 'tests/eval_corpus/**'
      - 'scripts/m7_ship_gate.sh'
      - '.github/workflows/eval.yml'
# One active run per pull request (or per ref when there is no pull request,
# i.e. pushes); a newer run in the same group cancels the one in progress.
concurrency:
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
  cancel-in-progress: true
jobs:
  # Single job: fetch the pinned OWASP BenchmarkJava corpus, check that the
  # committed ground truth matches it, then run Gate 6 of the ship-gate script.
  owasp:
    name: eval / owasp-benchmark-v1.2
    runs-on: ubuntu-latest
    # Hard stop for a hung gate or compile pool. Without an explicit limit the
    # job is only cancelled at GitHub's default 360-minute job timeout, far
    # beyond the 15-minute verify budget configured below.
    timeout-minutes: 60
    env:
      # Gate 6 self-skips unless this points at a real checkout.
      NYX_OWASP_CORPUS: ${{ github.workspace }}/.eval-corpus/owasp_benchmark_v1.2
      # CI wall-clock budget: 15 min. Override locally to tighten.
      NYX_OWASP_WALLCLOCK_BUDGET_SECONDS: "900"
    steps:
      - uses: actions/checkout@v6
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: stable
          cache: true
      - uses: taiki-e/install-action@nextest
      # The Phase 22 Java compile pool drives `com.sun.tools.javac` out of a
      # warm JDK; temurin 21 ships the compiler module the pool loads.
      - name: Set up JDK 21
        uses: actions/setup-java@v4
        with:
          distribution: temurin
          java-version: "21"
      # The cache key is static, so once a clone has been cached every later
      # run restores it and skips the clone step below.
      - name: Cache OWASP BenchmarkJava (1.2beta)
        id: cache-owasp
        uses: actions/cache@v4
        with:
          path: .eval-corpus/owasp_benchmark_v1.2
          key: owasp-benchmark-1.2beta
      # Only runs on a cache miss; clones into the directory that
      # NYX_OWASP_CORPUS points at.
      - name: Clone OWASP BenchmarkJava (1.2beta tag)
        if: steps.cache-owasp.outputs.cache-hit != 'true'
        run: |
          git clone --depth 1 --branch 1.2beta \
            https://github.com/OWASP-Benchmark/BenchmarkJava \
            .eval-corpus/owasp_benchmark_v1.2
      # No-compromise guard: the committed ground truth must be exactly what a
      # fresh conversion of the pinned CSV produces. Catches GT drift (a
      # corpus bump, a hand-edit) before the gate runs on stale labels.
      - name: Verify ground truth is in sync with the pinned corpus
        run: |
          python3 tests/eval_corpus/owasp_gt_convert.py \
            --corpus-dir .eval-corpus/owasp_benchmark_v1.2 \
            --output /tmp/owasp_gt_regen.json
          python3 - <<'PY'
          import json
          import sys


          def load_json(path):
              # Close the file handle deterministically instead of leaking it.
              with open(path, encoding="utf-8") as fh:
                  return json.load(fh)


          committed = load_json("tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json")
          regen = load_json("/tmp/owasp_gt_regen.json")
          if committed != regen:
              sys.exit("committed ground truth diverges from a fresh conversion of "
                       "the 1.2beta CSV; regenerate with owasp_gt_convert.py")
          print(f"ground truth in sync: {len(committed)} records")
          PY
      - name: eval-corpus harness regression tests
        run: python3 tests/eval_corpus/test_tabulate_regression.py
      - name: Gate 6 — OWASP Benchmark v1.2 acceptance
        run: scripts/m7_ship_gate.sh --sets owasp