nyx/.github/workflows/eval.yml

# Real-corpus acceptance (Track R).
#
#   * owasp    (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava
#     checkout (Java).
#   * jsts     (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js)
#     and OWASP Juice Shop (TypeScript, .ts), one matrix row per corpus.
#   * polyglot (Phase 29 / Track R.2): Gate 8 vs OWASP RailsGoat (Rails, .rb),
#     DVWA (PHP), DVPWA (aiohttp, .py), gosec (Go) and the RustSec advisory-db
#     (Rust negative control), one matrix row per corpus.
#
# Runs on every push or PR to master that touches the dynamic verifier
# (src/dynamic/), the eval-corpus harness (tests/eval_corpus/), the gate
# script, or this workflow file; it can also be started manually
# (workflow_dispatch).
#
# Each gate enforces, against the committed ground truth:
#   * verify wall-clock <= the CI budget (20 min for owasp, 15 min for jsts
#     and polyglot; the dev reference is 10 min),
#   * the per-(cap,lang) budget in tests/eval_corpus/budget.toml,
#   * per-cap confirmed-rate / precision / recall — hard-gated only for caps
#     in NYX_*_FLOOR_CAPS (empty by default → published report-only until a
#     cap Confirms end to end), with target floors of >= 40% / >= 0.85 /
#     >= 0.40 respectively.
#
# No corpus is vendored.  Each is cloned at a pinned ref and cached so reruns
# skip the clone.  Before the gate runs, the committed ground truth is
# regenerated from its source against the fresh clone and asserted in sync,
# and the converter hard-errors on any labelled path missing from the corpus,
# so a corpus bump that drifts the labels fails the job loudly.

name: eval

# Grant the workflow token read-only access to repository contents.
permissions:
  contents: read

# Triggers: pushes and pull requests against master that touch one of the
# listed paths, plus manual runs via workflow_dispatch.
on:
  push:
    branches:
      - master
    paths:
      - 'src/dynamic/**'
      - 'tests/eval_corpus/**'
      - 'scripts/m7_ship_gate.sh'
      - '.github/workflows/eval.yml'
  pull_request:
    branches:
      - master
    paths:
      - 'src/dynamic/**'
      - 'tests/eval_corpus/**'
      - 'scripts/m7_ship_gate.sh'
      - '.github/workflows/eval.yml'
  workflow_dispatch:

# One concurrency group per pull request (or per ref when the event carries
# no PR number); a newer run in the same group cancels the in-progress one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Gate 6: OWASP BenchmarkJava acceptance against the 1.2beta tag.
  owasp:
    name: eval / owasp-benchmark-v1.2
    runs-on: ubuntu-latest
    # Hard stop well above the 20-min gate budget below.  Without it a hung
    # harness would hold the runner for the 360-minute default job timeout.
    timeout-minutes: 60
    env:
      # Gate 6 self-skips unless this points at a real checkout.
      NYX_OWASP_CORPUS: ${{ github.workspace }}/.eval-corpus/owasp_benchmark_v1.2
      # CI wall-clock budget: 20 min.  The 2740-file OWASP scan+verify lands
      # right at the old 15-min ceiling on the hosted runners (observed 900.2s),
      # so the gate tripped on CI variance alone; 1200s restores headroom.  The
      # dev reference stays 10 min — override locally to tighten.
      NYX_OWASP_WALLCLOCK_BUDGET_SECONDS: "1200"
    steps:
      - uses: actions/checkout@v6

      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: stable
          cache: true

      - uses: taiki-e/install-action@nextest

      # The Phase 22 Java compile pool drives `com.sun.tools.javac` out of a
      # warm JDK; temurin 21 ships the compiler module the pool loads.
      - name: Set up JDK 21
        uses: actions/setup-java@v4
        with:
          distribution: temurin
          java-version: "21"

      # The cache key names the pinned tag, so bumping the tag (and the key
      # with it) forces a fresh clone.
      - name: Cache OWASP BenchmarkJava (1.2beta)
        id: cache-owasp
        uses: actions/cache@v4
        with:
          path: .eval-corpus/owasp_benchmark_v1.2
          key: owasp-benchmark-1.2beta

      # Only clone when the cache step did not restore the checkout.
      - name: Clone OWASP BenchmarkJava (1.2beta tag)
        if: steps.cache-owasp.outputs.cache-hit != 'true'
        run: |
          git clone --depth 1 --branch 1.2beta \
            https://github.com/OWASP-Benchmark/BenchmarkJava \
            .eval-corpus/owasp_benchmark_v1.2

      # No-compromise guard: the committed ground truth must be exactly what a
      # fresh conversion of the pinned CSV produces.  Catches GT drift (a
      # corpus bump, a hand-edit) before the gate runs on stale labels.
      - name: Verify ground truth is in sync with the pinned corpus
        run: |
          python3 tests/eval_corpus/owasp_gt_convert.py \
            --corpus-dir .eval-corpus/owasp_benchmark_v1.2 \
            --output /tmp/owasp_gt_regen.json
          python3 - <<'PY'
          import json, sys
          committed = json.load(open("tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json"))
          regen = json.load(open("/tmp/owasp_gt_regen.json"))
          if committed != regen:
              sys.exit("committed ground truth diverges from a fresh conversion of "
                       "the 1.2beta CSV; regenerate with owasp_gt_convert.py")
          print(f"ground truth in sync: {len(committed)} records")
          PY

      # Harness self-tests run before the gate itself.
      - name: eval-corpus harness regression tests
        run: |
          python3 tests/eval_corpus/test_tabulate_regression.py
          python3 tests/eval_corpus/test_manifest_gt_convert.py

      - name: Gate 6 — OWASP Benchmark v1.2 acceptance
        run: scripts/m7_ship_gate.sh --sets owasp

  # Gate 7: JavaScript / TypeScript corpora, one matrix row per corpus.
  jsts:
    name: eval / ${{ matrix.corpus.name }}
    runs-on: ubuntu-latest
    # Hard stop well above the 15-min gate budget below.  Without it a hung
    # harness would hold the runner for the 360-minute default job timeout.
    timeout-minutes: 60
    strategy:
      # Let every corpus row finish even when another row fails.
      fail-fast: false
      matrix:
        corpus:
          - name: nodegoat
            repo: https://github.com/OWASP/NodeGoat
            # NodeGoat ships no release tags, so this tracks the default
            # branch.  The manifest's path layout (app/, config/) has been
            # constant for years.
            # NOTE(review): the corpus cache only freezes the checkout until
            # its entry is evicted or missed; a fresh clone then picks up
            # whatever master points at.  Pin a commit SHA if reproducibility
            # matters (the clone step's `--branch` only takes branch/tag
            # names, so it would need to fetch by SHA instead).
            ref: master
            env: NYX_NODEGOAT_CORPUS
            manifest: nodegoat.manifest.toml
            ground_truth: nodegoat.json
          - name: juiceshop
            repo: https://github.com/juice-shop/juice-shop
            ref: v15.0.0
            env: NYX_JUICESHOP_CORPUS
            manifest: juiceshop.manifest.toml
            ground_truth: juiceshop.json
    env:
      # CI wall-clock budget: 15 min.  Override locally to tighten.
      NYX_JSTS_WALLCLOCK_BUDGET_SECONDS: "900"
    steps:
      - uses: actions/checkout@v6

      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: stable
          cache: true

      - uses: taiki-e/install-action@nextest

      # The dynamic verifier's Node build pool (Phase 23) compiles its
      # harnesses with a real node/npm toolchain.
      - name: Set up Node 20
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      # Cache key = corpus name + ref, so changing a row's ref forces a fresh
      # clone for that row.
      - name: Cache ${{ matrix.corpus.name }}
        id: cache-corpus
        uses: actions/cache@v4
        with:
          path: .eval-corpus/${{ matrix.corpus.name }}
          key: jsts-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}

      # Only clone when the cache step did not restore the checkout.
      - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
        if: steps.cache-corpus.outputs.cache-hit != 'true'
        run: |
          git clone --depth 1 --branch ${{ matrix.corpus.ref }} \
            ${{ matrix.corpus.repo }} \
            .eval-corpus/${{ matrix.corpus.name }}

      # No-compromise guard: the committed ground truth must be exactly what a
      # fresh conversion of the curated manifest produces *against this
      # corpus*.  manifest_gt_convert.py hard-errors on any labelled path that
      # no longer exists in the clone (corpus drift / typo), and the diff
      # below catches a stale committed JSON.
      - name: Verify ground truth is in sync with the pinned corpus
        run: |
          python3 tests/eval_corpus/manifest_gt_convert.py \
            --manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \
            --corpus-dir .eval-corpus/${{ matrix.corpus.name }} \
            --output /tmp/${{ matrix.corpus.name }}_gt_regen.json
          python3 - <<'PY'
          import json, sys
          name = "${{ matrix.corpus.ground_truth }}"
          committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}"))
          regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json"))
          if committed != regen:
              sys.exit("committed ground truth diverges from a fresh conversion of "
                       "the manifest against the pinned corpus; regenerate with "
                       "manifest_gt_convert.py")
          print(f"ground truth in sync: {len(committed)} records")
          PY

      # Harness self-tests run before the gate itself.
      - name: eval-corpus harness regression tests
        run: |
          python3 tests/eval_corpus/test_tabulate_regression.py
          python3 tests/eval_corpus/test_manifest_gt_convert.py

      # The env-var name differs per row, so it is exported in the script
      # rather than declared under `env:`.
      - name: Gate 7 — ${{ matrix.corpus.name }} acceptance
        run: |
          export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}"
          scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }}

  # Gate 8: Ruby / PHP / Python / Go corpora plus the Rust negative control,
  # one matrix row per corpus.
  polyglot:
    name: eval / ${{ matrix.corpus.name }}
    runs-on: ubuntu-latest
    # Hard stop well above the 15-min gate budget below.  Without it a hung
    # harness would hold the runner for the 360-minute default job timeout.
    timeout-minutes: 60
    strategy:
      # Let every corpus row finish even when another row fails.
      fail-fast: false
      matrix:
        corpus:
          - name: railsgoat
            repo: https://github.com/OWASP/railsgoat
            ref: rails.5.0.0
            lang: ruby
            env: NYX_RAILSGOAT_CORPUS
            manifest: railsgoat.manifest.toml
            ground_truth: railsgoat.json
          - name: dvwa
            repo: https://github.com/digininja/DVWA
            ref: "2.5"
            lang: php
            env: NYX_DVWA_CORPUS
            manifest: dvwa.manifest.toml
            ground_truth: dvwa.json
          - name: dvpwa
            repo: https://github.com/anxolerd/dvpwa
            # DVPWA ships no release tags, so this tracks the default branch.
            # NOTE(review): the corpus cache only freezes the checkout until
            # its entry is evicted or missed; a fresh clone then picks up
            # whatever master points at.  Pin a commit SHA if reproducibility
            # matters (the clone step's `--branch` only takes branch/tag
            # names, so it would need to fetch by SHA instead).
            ref: master
            lang: python
            env: NYX_DVPWA_CORPUS
            manifest: dvpwa.manifest.toml
            ground_truth: dvpwa.json
          - name: gosec
            repo: https://github.com/securego/gosec
            ref: v2.26.1
            lang: go
            env: NYX_GOSEC_CORPUS
            manifest: gosec.manifest.toml
            ground_truth: gosec.json
          - name: rustsec
            repo: https://github.com/rustsec/advisory-db
            # advisory-db ships no release tags, so this tracks the default
            # branch (a moving ref, like dvpwa above).  This is the Rust
            # NEGATIVE CONTROL (advisory metadata, no scannable source) — its
            # committed ground truth is empty by construction.
            ref: main
            lang: rust
            env: NYX_RUSTSEC_CORPUS
            manifest: rustsec.manifest.toml
            ground_truth: rustsec.json
    env:
      # CI wall-clock budget: 15 min.  Override locally to tighten.
      NYX_POLYGLOT_WALLCLOCK_BUDGET_SECONDS: "900"
    steps:
      - uses: actions/checkout@v6

      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: stable
          cache: true

      - uses: taiki-e/install-action@nextest

      # The dynamic verifier's per-language build pool (Phase 22/23) compiles
      # its harnesses with a real toolchain.  Each matrix row sets up only the
      # toolchain for its corpus's target language; the Rust row needs no extra
      # step (the rust toolchain above covers it, and advisory-db has no
      # buildable source anyway).
      - name: Set up Ruby
        if: matrix.corpus.lang == 'ruby'
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: "3.3"

      - name: Set up PHP
        if: matrix.corpus.lang == 'php'
        uses: shivammathur/setup-php@v2
        with:
          php-version: "8.3"

      - name: Set up Python
        if: matrix.corpus.lang == 'python'
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up Go
        if: matrix.corpus.lang == 'go'
        uses: actions/setup-go@v5
        with:
          go-version: "1.22"

      # Cache key = corpus name + ref, so changing a row's ref forces a fresh
      # clone for that row.
      - name: Cache ${{ matrix.corpus.name }}
        id: cache-corpus
        uses: actions/cache@v4
        with:
          path: .eval-corpus/${{ matrix.corpus.name }}
          key: polyglot-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}

      # Only clone when the cache step did not restore the checkout.
      - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
        if: steps.cache-corpus.outputs.cache-hit != 'true'
        run: |
          git clone --depth 1 --branch ${{ matrix.corpus.ref }} \
            ${{ matrix.corpus.repo }} \
            .eval-corpus/${{ matrix.corpus.name }}

      # No-compromise guard: the committed ground truth must be exactly what a
      # fresh conversion of the curated manifest produces *against this corpus*.
      # manifest_gt_convert.py hard-errors on any labelled path that no longer
      # exists in the clone (corpus drift / typo); the diff below catches a
      # stale committed JSON.  For the RustSec negative control the manifest
      # carries `negative_control = true` and zero entries, so the converter
      # emits an empty `[]` — still validated against the real clone.
      - name: Verify ground truth is in sync with the pinned corpus
        run: |
          python3 tests/eval_corpus/manifest_gt_convert.py \
            --manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \
            --corpus-dir .eval-corpus/${{ matrix.corpus.name }} \
            --output /tmp/${{ matrix.corpus.name }}_gt_regen.json
          python3 - <<'PY'
          import json, sys
          name = "${{ matrix.corpus.ground_truth }}"
          committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}"))
          regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json"))
          if committed != regen:
              sys.exit("committed ground truth diverges from a fresh conversion of "
                       "the manifest against the pinned corpus; regenerate with "
                       "manifest_gt_convert.py")
          print(f"ground truth in sync: {len(committed)} records")
          PY

      # Harness self-tests run before the gate itself.
      - name: eval-corpus harness regression tests
        run: |
          python3 tests/eval_corpus/test_tabulate_regression.py
          python3 tests/eval_corpus/test_manifest_gt_convert.py

      # The env-var name differs per row, so it is exported in the script
      # rather than declared under `env:`.
      - name: Gate 8 — ${{ matrix.corpus.name }} acceptance
        run: |
          export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}"
          scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }}