mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
348 lines
14 KiB
YAML
348 lines
14 KiB
YAML
# Real-corpus acceptance (Track R).
|
|
#
|
|
# * owasp (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava
|
|
# checkout (Java).
|
|
# * jsts (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js)
|
|
# and OWASP Juice Shop (TypeScript, .ts), one matrix row per corpus.
|
|
# * polyglot (Phase 29 / Track R.2): Gate 8 vs OWASP RailsGoat (Rails, .rb),
|
|
# DVWA (PHP), DVPWA (aiohttp, .py), gosec (Go) and the RustSec advisory-db
|
|
# (Rust negative control), one matrix row per corpus.
|
|
#
|
|
# Runs on every push or PR against master that touches the dynamic verifier
# (src/dynamic/), the eval-corpus harness (tests/eval_corpus/), the gate script
# (scripts/m7_ship_gate.sh) or this workflow file, and on manual dispatch.
|
|
#
|
|
# Each gate enforces, against the committed ground truth:
|
|
# * verify wall-clock within the CI budget: 20 min for owasp, 15 min for jsts
#   and polyglot (the dev reference is 10 min),
|
|
# * the per-(cap,lang) budget in tests/eval_corpus/budget.toml,
|
|
# * per-cap confirmed-rate / precision / recall — hard-gated only for caps
#   in NYX_*_FLOOR_CAPS (empty by default → published report-only until a
#   cap Confirms end to end), with floors of >= 40% / >= 0.85 / >= 0.40
#   respectively.
|
|
#
|
|
# No corpus is vendored. Each is cloned at a pinned ref and cached so reruns
|
|
# skip the clone. Before the gate runs, the committed ground truth is
|
|
# regenerated from its source against the fresh clone and asserted in sync,
|
|
# and the converter hard-errors on any labelled path missing from the corpus,
|
|
# so a corpus bump that drifts the labels fails the job loudly.
|
|
|
|
name: eval

# The workflow token is limited to read-only access to repository contents;
# scopes that are not listed here are not granted.
permissions:
  contents: read
|
|
|
|
on:
  # Pushes to master, restricted to the paths the gates depend on.
  push:
    branches:
      - master
    paths:
      - 'src/dynamic/**'
      - 'tests/eval_corpus/**'
      - 'scripts/m7_ship_gate.sh'
      - '.github/workflows/eval.yml'
  # Pull requests targeting master, with the same path filter.
  pull_request:
    branches:
      - master
    paths:
      - 'src/dynamic/**'
      - 'tests/eval_corpus/**'
      - 'scripts/m7_ship_gate.sh'
      - '.github/workflows/eval.yml'
  # Manual runs.
  workflow_dispatch:
|
|
|
|
concurrency:
  # One group per workflow and pull request; events without a pull request
  # (pushes, manual dispatches) fall back to the git ref.
  group: >-
    ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  # A newer run in the same group cancels the one still in flight.
  cancel-in-progress: true
|
|
|
|
jobs:
|
|
owasp:
  name: eval / owasp-benchmark-v1.2
  runs-on: ubuntu-latest
  env:
    # Location of the BenchmarkJava checkout. Gate 6 skips itself unless this
    # points at a real checkout.
    NYX_OWASP_CORPUS: ${{ github.workspace }}/.eval-corpus/owasp_benchmark_v1.2
    # CI wall-clock budget of 20 min (1200 s). Scanning and verifying the
    # 2740-file OWASP corpus was observed at 900.2 s on the hosted runners,
    # i.e. right at the previous 15-min ceiling, so runner variance alone
    # tripped the gate. The dev reference remains 10 min — override locally
    # to tighten.
    NYX_OWASP_WALLCLOCK_BUDGET_SECONDS: '1200'
  steps:
    - uses: actions/checkout@v6

    - uses: actions-rust-lang/setup-rust-toolchain@v1
      with:
        toolchain: stable
        cache: true

    - uses: taiki-e/install-action@nextest

    # The Phase 22 Java compile pool runs `com.sun.tools.javac` from a warm
    # JDK; temurin 21 ships the compiler module that the pool loads.
    - name: Set up JDK 21
      uses: actions/setup-java@v4
      with:
        distribution: temurin
        java-version: '21'

    - name: Cache OWASP BenchmarkJava (1.2beta)
      id: cache-owasp
      uses: actions/cache@v4
      with:
        path: .eval-corpus/owasp_benchmark_v1.2
        key: owasp-benchmark-1.2beta

    # Only clone when the cache step above did not restore the checkout.
    - name: Clone OWASP BenchmarkJava (1.2beta tag)
      if: steps.cache-owasp.outputs.cache-hit != 'true'
      run: |
        git clone --depth 1 --branch 1.2beta \
          https://github.com/OWASP-Benchmark/BenchmarkJava .eval-corpus/owasp_benchmark_v1.2

    # No-compromise guard: the committed ground truth has to equal a fresh
    # conversion of the pinned CSV. This surfaces ground-truth drift (a corpus
    # bump, a hand-edit) before the gate runs against stale labels.
    - name: Verify ground truth is in sync with the pinned corpus
      run: |
        python3 tests/eval_corpus/owasp_gt_convert.py \
          --corpus-dir .eval-corpus/owasp_benchmark_v1.2 \
          --output /tmp/owasp_gt_regen.json
        python3 - <<'PY'
        import json
        import sys


        def load(path):
            with open(path) as handle:
                return json.load(handle)


        committed = load("tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json")
        regenerated = load("/tmp/owasp_gt_regen.json")
        if committed != regenerated:
            sys.exit("committed ground truth diverges from a fresh conversion of "
                     "the 1.2beta CSV; regenerate with owasp_gt_convert.py")
        print(f"ground truth in sync: {len(committed)} records")
        PY

    - name: eval-corpus harness regression tests
      run: |
        python3 tests/eval_corpus/test_tabulate_regression.py
        python3 tests/eval_corpus/test_manifest_gt_convert.py

    - name: Gate 6 — OWASP Benchmark v1.2 acceptance
      run: scripts/m7_ship_gate.sh --sets owasp
|
|
|
|
jsts:
  name: eval / ${{ matrix.corpus.name }}
  runs-on: ubuntu-latest
  strategy:
    # Let every corpus report its own result instead of stopping at the
    # first failing row.
    fail-fast: false
    matrix:
      # One row per corpus. `env` is the name of the variable the gate script
      # is given the checkout path through; `manifest` and `ground_truth` are
      # file names under tests/eval_corpus/ground_truth/.
      corpus:
        - name: nodegoat
          repo: https://github.com/OWASP/NodeGoat
          # NodeGoat ships no release tags, so this row tracks the default
          # branch. The manifest's path layout (app/, config/) has been
          # constant for years.
          # NOTE(review): a branch name is not an immutable pin. The cache
          # keeps the checkout stable only while its entry survives; once it
          # is evicted the next run clones whatever HEAD is then. Pin a
          # commit SHA if strict reproducibility is required.
          ref: master
          env: NYX_NODEGOAT_CORPUS
          manifest: nodegoat.manifest.toml
          ground_truth: nodegoat.json
        - name: juiceshop
          repo: https://github.com/juice-shop/juice-shop
          ref: v15.0.0
          env: NYX_JUICESHOP_CORPUS
          manifest: juiceshop.manifest.toml
          ground_truth: juiceshop.json
  env:
    # CI wall-clock budget: 15 min. Override locally to tighten.
    NYX_JSTS_WALLCLOCK_BUDGET_SECONDS: "900"
  steps:
    - uses: actions/checkout@v6

    - uses: actions-rust-lang/setup-rust-toolchain@v1
      with:
        toolchain: stable
        cache: true

    - uses: taiki-e/install-action@nextest

    # The dynamic verifier's Node build pool (Phase 23) compiles its
    # harnesses with a real node/npm toolchain.
    - name: Set up Node 20
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Cache ${{ matrix.corpus.name }}
      id: cache-corpus
      uses: actions/cache@v4
      with:
        path: .eval-corpus/${{ matrix.corpus.name }}
        key: jsts-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}

    # Matrix values reach the scripts below through step-level `env:` and
    # quoted shell expansions rather than being spliced into the script text
    # by ${{ }} substitution, so they are always treated as data.
    - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
      if: steps.cache-corpus.outputs.cache-hit != 'true'
      env:
        CORPUS_REPO: ${{ matrix.corpus.repo }}
        CORPUS_REF: ${{ matrix.corpus.ref }}
        CORPUS_DIR: .eval-corpus/${{ matrix.corpus.name }}
      run: |
        git clone --depth 1 --branch "$CORPUS_REF" "$CORPUS_REPO" "$CORPUS_DIR"

    # No-compromise guard: the committed ground truth must be exactly what a
    # fresh conversion of the curated manifest produces *against this
    # corpus*. manifest_gt_convert.py hard-errors on any labelled path that
    # no longer exists in the clone (corpus drift / typo), and the diff
    # below catches a stale committed JSON.
    - name: Verify ground truth is in sync with the pinned corpus
      env:
        CORPUS_DIR: .eval-corpus/${{ matrix.corpus.name }}
        GT_MANIFEST: tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }}
        GT_COMMITTED: tests/eval_corpus/ground_truth/${{ matrix.corpus.ground_truth }}
        GT_REGEN: /tmp/${{ matrix.corpus.name }}_gt_regen.json
      run: |
        python3 tests/eval_corpus/manifest_gt_convert.py \
          --manifest "$GT_MANIFEST" \
          --corpus-dir "$CORPUS_DIR" \
          --output "$GT_REGEN"
        python3 - <<'PY'
        import json, os, sys


        def load(path):
            with open(path) as handle:
                return json.load(handle)


        committed = load(os.environ["GT_COMMITTED"])
        regen = load(os.environ["GT_REGEN"])
        if committed != regen:
            sys.exit("committed ground truth diverges from a fresh conversion of "
                     "the manifest against the pinned corpus; regenerate with "
                     "manifest_gt_convert.py")
        print(f"ground truth in sync: {len(committed)} records")
        PY

    - name: eval-corpus harness regression tests
      run: |
        python3 tests/eval_corpus/test_tabulate_regression.py
        python3 tests/eval_corpus/test_manifest_gt_convert.py

    # The variable *name* differs per row, so it cannot be a static `env:`
    # key; export it from the row's `env` value before invoking the gate.
    - name: Gate 7 — ${{ matrix.corpus.name }} acceptance
      env:
        CORPUS_NAME: ${{ matrix.corpus.name }}
        CORPUS_ENV_VAR: ${{ matrix.corpus.env }}
        CORPUS_ROOT: ${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}
      run: |
        export "${CORPUS_ENV_VAR}=${CORPUS_ROOT}"
        scripts/m7_ship_gate.sh --sets "$CORPUS_NAME"
|
|
|
|
polyglot:
  name: eval / ${{ matrix.corpus.name }}
  runs-on: ubuntu-latest
  strategy:
    # Let every corpus report its own result instead of stopping at the
    # first failing row.
    fail-fast: false
    matrix:
      # One row per corpus. `lang` selects which toolchain step runs; `env`
      # is the name of the variable the gate script is given the checkout
      # path through; `manifest` and `ground_truth` are file names under
      # tests/eval_corpus/ground_truth/.
      corpus:
        - name: railsgoat
          repo: https://github.com/OWASP/railsgoat
          ref: rails.5.0.0
          lang: ruby
          env: NYX_RAILSGOAT_CORPUS
          manifest: railsgoat.manifest.toml
          ground_truth: railsgoat.json
        - name: dvwa
          repo: https://github.com/digininja/DVWA
          ref: "2.5"
          lang: php
          env: NYX_DVWA_CORPUS
          manifest: dvwa.manifest.toml
          ground_truth: dvwa.json
        - name: dvpwa
          repo: https://github.com/anxolerd/dvpwa
          # DVPWA ships no release tags, so this row tracks the default
          # branch.
          # NOTE(review): a branch name is not an immutable pin. The cache
          # keeps the checkout stable only while its entry survives; once it
          # is evicted the next run clones whatever HEAD is then. Pin a
          # commit SHA if strict reproducibility is required.
          ref: master
          lang: python
          env: NYX_DVPWA_CORPUS
          manifest: dvpwa.manifest.toml
          ground_truth: dvpwa.json
        - name: gosec
          repo: https://github.com/securego/gosec
          ref: v2.26.1
          lang: go
          env: NYX_GOSEC_CORPUS
          manifest: gosec.manifest.toml
          ground_truth: gosec.json
        - name: rustsec
          repo: https://github.com/rustsec/advisory-db
          # advisory-db ships no release tags, so this row tracks the default
          # branch (same caveat as any branch ref: it moves once the cached
          # clone is gone). This is the Rust NEGATIVE CONTROL (advisory
          # metadata, no scannable source) — its committed ground truth is
          # empty by construction.
          ref: main
          lang: rust
          env: NYX_RUSTSEC_CORPUS
          manifest: rustsec.manifest.toml
          ground_truth: rustsec.json
  env:
    # CI wall-clock budget: 15 min. Override locally to tighten.
    NYX_POLYGLOT_WALLCLOCK_BUDGET_SECONDS: "900"
  steps:
    - uses: actions/checkout@v6

    - uses: actions-rust-lang/setup-rust-toolchain@v1
      with:
        toolchain: stable
        cache: true

    - uses: taiki-e/install-action@nextest

    # The dynamic verifier's per-language build pool (Phase 22/23) compiles
    # its harnesses with a real toolchain. Each matrix row sets up only the
    # toolchain for its corpus's target language; the Rust row needs no extra
    # step (the rust toolchain above covers it, and advisory-db has no
    # buildable source anyway).
    - name: Set up Ruby
      if: matrix.corpus.lang == 'ruby'
      uses: ruby/setup-ruby@v1
      with:
        ruby-version: "3.3"

    - name: Set up PHP
      if: matrix.corpus.lang == 'php'
      uses: shivammathur/setup-php@v2
      with:
        php-version: "8.3"

    - name: Set up Python
      if: matrix.corpus.lang == 'python'
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"

    - name: Set up Go
      if: matrix.corpus.lang == 'go'
      uses: actions/setup-go@v5
      with:
        go-version: "1.22"

    - name: Cache ${{ matrix.corpus.name }}
      id: cache-corpus
      uses: actions/cache@v4
      with:
        path: .eval-corpus/${{ matrix.corpus.name }}
        key: polyglot-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}

    # Matrix values reach the scripts below through step-level `env:` and
    # quoted shell expansions rather than being spliced into the script text
    # by ${{ }} substitution, so they are always treated as data.
    - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
      if: steps.cache-corpus.outputs.cache-hit != 'true'
      env:
        CORPUS_REPO: ${{ matrix.corpus.repo }}
        CORPUS_REF: ${{ matrix.corpus.ref }}
        CORPUS_DIR: .eval-corpus/${{ matrix.corpus.name }}
      run: |
        git clone --depth 1 --branch "$CORPUS_REF" "$CORPUS_REPO" "$CORPUS_DIR"

    # No-compromise guard: the committed ground truth must be exactly what a
    # fresh conversion of the curated manifest produces *against this corpus*.
    # manifest_gt_convert.py hard-errors on any labelled path that no longer
    # exists in the clone (corpus drift / typo); the diff below catches a
    # stale committed JSON. For the RustSec negative control the manifest
    # carries `negative_control = true` and zero entries, so the converter
    # emits an empty `[]` — still validated against the real clone.
    - name: Verify ground truth is in sync with the pinned corpus
      env:
        CORPUS_DIR: .eval-corpus/${{ matrix.corpus.name }}
        GT_MANIFEST: tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }}
        GT_COMMITTED: tests/eval_corpus/ground_truth/${{ matrix.corpus.ground_truth }}
        GT_REGEN: /tmp/${{ matrix.corpus.name }}_gt_regen.json
      run: |
        python3 tests/eval_corpus/manifest_gt_convert.py \
          --manifest "$GT_MANIFEST" \
          --corpus-dir "$CORPUS_DIR" \
          --output "$GT_REGEN"
        python3 - <<'PY'
        import json, os, sys


        def load(path):
            with open(path) as handle:
                return json.load(handle)


        committed = load(os.environ["GT_COMMITTED"])
        regen = load(os.environ["GT_REGEN"])
        if committed != regen:
            sys.exit("committed ground truth diverges from a fresh conversion of "
                     "the manifest against the pinned corpus; regenerate with "
                     "manifest_gt_convert.py")
        print(f"ground truth in sync: {len(committed)} records")
        PY

    - name: eval-corpus harness regression tests
      run: |
        python3 tests/eval_corpus/test_tabulate_regression.py
        python3 tests/eval_corpus/test_manifest_gt_convert.py

    # The variable *name* differs per row, so it cannot be a static `env:`
    # key; export it from the row's `env` value before invoking the gate.
    - name: Gate 8 — ${{ matrix.corpus.name }} acceptance
      env:
        CORPUS_NAME: ${{ matrix.corpus.name }}
        CORPUS_ENV_VAR: ${{ matrix.corpus.env }}
        CORPUS_ROOT: ${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}
      run: |
        export "${CORPUS_ENV_VAR}=${CORPUS_ROOT}"
        scripts/m7_ship_gate.sh --sets "$CORPUS_NAME"
|