nyx/.github/workflows/eval.yml
2026-06-05 10:16:30 -05:00

348 lines
14 KiB
YAML

# Real-corpus acceptance (Track R).
#
# * owasp (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava
# checkout (Java).
# * jsts (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js)
# and OWASP Juice Shop (TypeScript, .ts), one matrix row per corpus.
# * polyglot (Phase 29 / Track R.2): Gate 8 vs OWASP RailsGoat (Rails, .rb),
# DVWA (PHP), DVPWA (aiohttp, .py), gosec (Go) and the RustSec advisory-db
# (Rust negative control), one matrix row per corpus.
#
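# Runs on every pull request against, and every push to, master that touches
# the dynamic verifier (src/dynamic/), the eval-corpus harness
# (tests/eval_corpus/), the gate script or this workflow file itself; it can
# also be dispatched manually.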
#
# Each gate enforces, against the committed ground truth:
# * verify wall-clock <= the job's CI budget (20 min for owasp, 15 min for
# jsts and polyglot; the dev reference is 10 min),
# * the per-(cap,lang) budget in tests/eval_corpus/budget.toml,
# * per-cap confirmed-rate / precision / recall — hard-gated only for caps
# in NYX_*_FLOOR_CAPS (empty by default → published report-only until a
# cap Confirms end to end), with destinations >= 40% / >= 0.85 / >= 0.40.
#
# No corpus is vendored. Each is cloned at a pinned ref and cached so reruns
# skip the clone. Before the gate runs, the committed ground truth is
# regenerated from its source against the fresh clone and asserted in sync,
# and the converter hard-errors on any labelled path missing from the corpus,
# so a corpus bump that drifts the labels fails the job loudly.
# Workflow identity. The job token is restricted to read-only access to the
# repository contents.
name: eval
permissions:
  contents: read

# Triggers: pushes to master and pull requests targeting master, both filtered
# to the same four path globs, plus manual dispatch.
on:
  push:
    branches:
      - master
    paths:
      - 'src/dynamic/**'
      - 'tests/eval_corpus/**'
      - 'scripts/m7_ship_gate.sh'
      - '.github/workflows/eval.yml'
  pull_request:
    branches:
      - master
    paths:
      - 'src/dynamic/**'
      - 'tests/eval_corpus/**'
      - 'scripts/m7_ship_gate.sh'
      - '.github/workflows/eval.yml'
  workflow_dispatch:

# One live run per pull request (keyed by PR number) or, when there is no PR
# number, per ref; a newer run in the same group cancels the one in flight.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
# Gate 6: acceptance run against a checkout of OWASP BenchmarkJava at the
# 1.2beta tag.
owasp:
  name: eval / owasp-benchmark-v1.2
  runs-on: ubuntu-latest
  # Backstop for a wedged build or harness. The gate enforces its own 20-minute
  # verify budget (below), but nothing else bounds the job, so without this a
  # hang would run to GitHub's 360-minute default.
  timeout-minutes: 60
  env:
    # Gate 6 self-skips unless this points at a real checkout.
    NYX_OWASP_CORPUS: ${{ github.workspace }}/.eval-corpus/owasp_benchmark_v1.2
    # CI wall-clock budget: 20 min. The 2740-file OWASP scan+verify lands
    # right at the old 15-min ceiling on the hosted runners (observed 900.2s),
    # so the gate tripped on CI variance alone; 1200s restores headroom. The
    # dev reference stays 10 min — override locally to tighten.
    NYX_OWASP_WALLCLOCK_BUDGET_SECONDS: "1200"
  steps:
    - uses: actions/checkout@v6
    - uses: actions-rust-lang/setup-rust-toolchain@v1
      with:
        toolchain: stable
        cache: true
    - uses: taiki-e/install-action@nextest
    # The Phase 22 Java compile pool drives `com.sun.tools.javac` out of a
    # warm JDK; temurin 21 ships the compiler module the pool loads.
    - name: Set up JDK 21
      uses: actions/setup-java@v4
      with:
        distribution: temurin
        java-version: "21"
    # The corpus is not vendored: restore it from the cache, and clone it only
    # when the cache misses.
    - name: Cache OWASP BenchmarkJava (1.2beta)
      id: cache-owasp
      uses: actions/cache@v4
      with:
        path: .eval-corpus/owasp_benchmark_v1.2
        key: owasp-benchmark-1.2beta
    - name: Clone OWASP BenchmarkJava (1.2beta tag)
      if: steps.cache-owasp.outputs.cache-hit != 'true'
      run: |
        git clone --depth 1 --branch 1.2beta \
          https://github.com/OWASP-Benchmark/BenchmarkJava \
          .eval-corpus/owasp_benchmark_v1.2
    # No-compromise guard: the committed ground truth must be exactly what a
    # fresh conversion of the pinned CSV produces. Catches GT drift (a
    # corpus bump, a hand-edit) before the gate runs on stale labels.
    - name: Verify ground truth is in sync with the pinned corpus
      run: |
        python3 tests/eval_corpus/owasp_gt_convert.py \
          --corpus-dir .eval-corpus/owasp_benchmark_v1.2 \
          --output /tmp/owasp_gt_regen.json
        python3 - <<'PY'
        import json, sys
        committed = json.load(open("tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json"))
        regen = json.load(open("/tmp/owasp_gt_regen.json"))
        if committed != regen:
            sys.exit("committed ground truth diverges from a fresh conversion of "
                     "the 1.2beta CSV; regenerate with owasp_gt_convert.py")
        print(f"ground truth in sync: {len(committed)} records")
        PY
    - name: eval-corpus harness regression tests
      run: |
        python3 tests/eval_corpus/test_tabulate_regression.py
        python3 tests/eval_corpus/test_manifest_gt_convert.py
    - name: Gate 6 — OWASP Benchmark v1.2 acceptance
      run: scripts/m7_ship_gate.sh --sets owasp
# Gate 7: one matrix row per JavaScript/TypeScript corpus (NodeGoat and
# Juice Shop). Each row clones or restores its corpus, checks the committed
# ground truth against it, then runs the gate for that corpus only.
jsts:
  name: eval / ${{ matrix.corpus.name }}
  runs-on: ubuntu-latest
  # Backstop for a wedged build or harness. The gate enforces its own 15-minute
  # verify budget (below), but nothing else bounds the job, so without this a
  # hang would run to GitHub's 360-minute default.
  timeout-minutes: 60
  strategy:
    # Let the other corpus finish even when one row fails.
    fail-fast: false
    matrix:
      corpus:
        - name: nodegoat
          repo: https://github.com/OWASP/NodeGoat
          # NodeGoat ships no release tags, so this row tracks the default
          # branch. The cache only holds a given commit until the entry is
          # evicted; the next clone then picks up whatever master points at.
          # The sync check below fails loudly if a labelled path disappears,
          # and the "Record ... commit" step logs which commit was used. The
          # manifest's path layout (app/, config/) has been constant for years.
          ref: master
          env: NYX_NODEGOAT_CORPUS
          manifest: nodegoat.manifest.toml
          ground_truth: nodegoat.json
        - name: juiceshop
          repo: https://github.com/juice-shop/juice-shop
          ref: v15.0.0
          env: NYX_JUICESHOP_CORPUS
          manifest: juiceshop.manifest.toml
          ground_truth: juiceshop.json
  env:
    # CI wall-clock budget: 15 min. Override locally to tighten.
    NYX_JSTS_WALLCLOCK_BUDGET_SECONDS: "900"
  steps:
    - uses: actions/checkout@v6
    - uses: actions-rust-lang/setup-rust-toolchain@v1
      with:
        toolchain: stable
        cache: true
    - uses: taiki-e/install-action@nextest
    # The dynamic verifier's Node build pool (Phase 23) compiles its
    # harnesses with a real node/npm toolchain.
    - name: Set up Node 20
      uses: actions/setup-node@v4
      with:
        node-version: "20"
    # The corpus is not vendored: restore it from the cache, and clone it only
    # when the cache misses.
    - name: Cache ${{ matrix.corpus.name }}
      id: cache-corpus
      uses: actions/cache@v4
      with:
        path: .eval-corpus/${{ matrix.corpus.name }}
        key: jsts-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}
    - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
      if: steps.cache-corpus.outputs.cache-hit != 'true'
      run: |
        git clone --depth 1 --branch ${{ matrix.corpus.ref }} \
          ${{ matrix.corpus.repo }} \
          .eval-corpus/${{ matrix.corpus.name }}
    # Runs on cache hits and misses alike, so every run's log states the exact
    # commit the gate was evaluated against (matters for the branch-tracking
    # row, whose commit can change when the cache is rebuilt).
    - name: Record ${{ matrix.corpus.name }} commit
      run: git -C .eval-corpus/${{ matrix.corpus.name }} rev-parse HEAD
    # No-compromise guard: the committed ground truth must be exactly what a
    # fresh conversion of the curated manifest produces *against this
    # corpus*. manifest_gt_convert.py hard-errors on any labelled path that
    # no longer exists in the clone (corpus drift / typo), and the diff
    # below catches a stale committed JSON.
    - name: Verify ground truth is in sync with the pinned corpus
      run: |
        python3 tests/eval_corpus/manifest_gt_convert.py \
          --manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \
          --corpus-dir .eval-corpus/${{ matrix.corpus.name }} \
          --output /tmp/${{ matrix.corpus.name }}_gt_regen.json
        python3 - <<'PY'
        import json, sys
        name = "${{ matrix.corpus.ground_truth }}"
        committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}"))
        regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json"))
        if committed != regen:
            sys.exit("committed ground truth diverges from a fresh conversion of "
                     "the manifest against the pinned corpus; regenerate with "
                     "manifest_gt_convert.py")
        print(f"ground truth in sync: {len(committed)} records")
        PY
    - name: eval-corpus harness regression tests
      run: |
        python3 tests/eval_corpus/test_tabulate_regression.py
        python3 tests/eval_corpus/test_manifest_gt_convert.py
    # The gate locates the corpus through the row's environment variable, so
    # export it just before invoking the script.
    - name: Gate 7 — ${{ matrix.corpus.name }} acceptance
      run: |
        export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}"
        scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }}
# Gate 8: one matrix row per corpus across Ruby, PHP, Python, Go and the Rust
# negative control. Each row installs only its own language toolchain, clones
# or restores its corpus, checks the committed ground truth against it, then
# runs the gate for that corpus only.
polyglot:
  name: eval / ${{ matrix.corpus.name }}
  runs-on: ubuntu-latest
  # Backstop for a wedged build or harness. The gate enforces its own 15-minute
  # verify budget (below), but nothing else bounds the job, so without this a
  # hang would run to GitHub's 360-minute default.
  timeout-minutes: 60
  strategy:
    # Let the remaining corpora finish even when one row fails.
    fail-fast: false
    matrix:
      corpus:
        - name: railsgoat
          repo: https://github.com/OWASP/railsgoat
          ref: rails.5.0.0
          lang: ruby
          env: NYX_RAILSGOAT_CORPUS
          manifest: railsgoat.manifest.toml
          ground_truth: railsgoat.json
        - name: dvwa
          repo: https://github.com/digininja/DVWA
          ref: "2.5"
          lang: php
          env: NYX_DVWA_CORPUS
          manifest: dvwa.manifest.toml
          ground_truth: dvwa.json
        - name: dvpwa
          repo: https://github.com/anxolerd/dvpwa
          # DVPWA ships no release tags, so this row tracks the default
          # branch. The cache only holds a given commit until the entry is
          # evicted; the next clone then picks up whatever master points at.
          # The sync check below fails loudly if a labelled path disappears,
          # and the "Record ... commit" step logs which commit was used.
          ref: master
          lang: python
          env: NYX_DVPWA_CORPUS
          manifest: dvpwa.manifest.toml
          ground_truth: dvpwa.json
        - name: gosec
          repo: https://github.com/securego/gosec
          ref: v2.26.1
          lang: go
          env: NYX_GOSEC_CORPUS
          manifest: gosec.manifest.toml
          ground_truth: gosec.json
        - name: rustsec
          repo: https://github.com/rustsec/advisory-db
          # advisory-db ships no release tags, so this row tracks the default
          # branch and, like dvpwa, moves to the current head whenever the
          # cache is rebuilt. This is the Rust NEGATIVE CONTROL (advisory
          # metadata, no scannable source) — its committed ground truth is
          # empty by construction.
          ref: main
          lang: rust
          env: NYX_RUSTSEC_CORPUS
          manifest: rustsec.manifest.toml
          ground_truth: rustsec.json
  env:
    # CI wall-clock budget: 15 min. Override locally to tighten.
    NYX_POLYGLOT_WALLCLOCK_BUDGET_SECONDS: "900"
  steps:
    - uses: actions/checkout@v6
    - uses: actions-rust-lang/setup-rust-toolchain@v1
      with:
        toolchain: stable
        cache: true
    - uses: taiki-e/install-action@nextest
    # The dynamic verifier's per-language build pool (Phase 22/23) compiles
    # its harnesses with a real toolchain. Each matrix row sets up only the
    # toolchain for its corpus's target language; the Rust row needs no extra
    # step (the rust toolchain above covers it, and advisory-db has no
    # buildable source anyway).
    - name: Set up Ruby
      if: matrix.corpus.lang == 'ruby'
      uses: ruby/setup-ruby@v1
      with:
        ruby-version: "3.3"
    - name: Set up PHP
      if: matrix.corpus.lang == 'php'
      uses: shivammathur/setup-php@v2
      with:
        php-version: "8.3"
    - name: Set up Python
      if: matrix.corpus.lang == 'python'
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"
    - name: Set up Go
      if: matrix.corpus.lang == 'go'
      uses: actions/setup-go@v5
      with:
        go-version: "1.22"
    # The corpus is not vendored: restore it from the cache, and clone it only
    # when the cache misses.
    - name: Cache ${{ matrix.corpus.name }}
      id: cache-corpus
      uses: actions/cache@v4
      with:
        path: .eval-corpus/${{ matrix.corpus.name }}
        key: polyglot-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}
    - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
      if: steps.cache-corpus.outputs.cache-hit != 'true'
      run: |
        git clone --depth 1 --branch ${{ matrix.corpus.ref }} \
          ${{ matrix.corpus.repo }} \
          .eval-corpus/${{ matrix.corpus.name }}
    # Runs on cache hits and misses alike, so every run's log states the exact
    # commit the gate was evaluated against (matters for the branch-tracking
    # rows, whose commit can change when the cache is rebuilt).
    - name: Record ${{ matrix.corpus.name }} commit
      run: git -C .eval-corpus/${{ matrix.corpus.name }} rev-parse HEAD
    # No-compromise guard: the committed ground truth must be exactly what a
    # fresh conversion of the curated manifest produces *against this corpus*.
    # manifest_gt_convert.py hard-errors on any labelled path that no longer
    # exists in the clone (corpus drift / typo); the diff below catches a
    # stale committed JSON. For the RustSec negative control the manifest
    # carries `negative_control = true` and zero entries, so the converter
    # emits an empty `[]` — still validated against the real clone.
    - name: Verify ground truth is in sync with the pinned corpus
      run: |
        python3 tests/eval_corpus/manifest_gt_convert.py \
          --manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \
          --corpus-dir .eval-corpus/${{ matrix.corpus.name }} \
          --output /tmp/${{ matrix.corpus.name }}_gt_regen.json
        python3 - <<'PY'
        import json, sys
        name = "${{ matrix.corpus.ground_truth }}"
        committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}"))
        regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json"))
        if committed != regen:
            sys.exit("committed ground truth diverges from a fresh conversion of "
                     "the manifest against the pinned corpus; regenerate with "
                     "manifest_gt_convert.py")
        print(f"ground truth in sync: {len(committed)} records")
        PY
    - name: eval-corpus harness regression tests
      run: |
        python3 tests/eval_corpus/test_tabulate_regression.py
        python3 tests/eval_corpus/test_manifest_gt_convert.py
    # The gate locates the corpus through the row's environment variable, so
    # export it just before invoking the script.
    - name: Gate 8 — ${{ matrix.corpus.name }} acceptance
      run: |
        export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}"
        scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }}