diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index e35481f1..3f7db77b 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -1,18 +1,25 @@ -# Phase 27 (Track R.0): OWASP Benchmark v1.2 real-corpus acceptance. +# Real-corpus acceptance (Track R). # -# Runs Gate 6 of scripts/m7_ship_gate.sh against a real OWASP BenchmarkJava -# checkout on every PR that touches the dynamic verifier (src/dynamic/), the +# * owasp (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava +# checkout (Java). +# * jsts (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js) +# and OWASP Juice Shop (TypeScript, .ts), one matrix row per corpus. +# +# Runs on every PR that touches the dynamic verifier (src/dynamic/), the # eval-corpus harness (tests/eval_corpus/), or the gate script itself. # -# Gate 6 enforces, against the committed ground truth: +# Each gate enforces, against the committed ground truth: # * verify wall-clock <= 15 min (CI budget; the dev reference is 10 min), -# * per-cap confirmed-rate >= 40%, precision >= 0.85, recall >= 0.40 for the -# dynamically-supported OWASP caps, -# * the per-(cap,lang) budget in tests/eval_corpus/budget.toml. +# * the per-(cap,lang) budget in tests/eval_corpus/budget.toml, +# * per-cap confirmed-rate / precision / recall — hard-gated only for caps +# in NYX_*_FLOOR_CAPS (empty by default → published report-only until a +# cap Confirms end to end), with destinations >= 40% / >= 0.85 / >= 0.40. # -# The corpus is *not* vendored. It is cloned at the pinned 1.2beta tag (the -# tag that produced expectedresults-1.2beta.csv, the source of the ground -# truth) and cached so reruns skip the clone. +# No corpus is vendored. Each is cloned at a pinned ref and cached so reruns +# skip the clone. 
Before the gate runs, the committed ground truth is +# regenerated from its source against the fresh clone and asserted in sync, +# and the converter hard-errors on any labelled path missing from the corpus, +# so a corpus bump that drifts the labels fails the job loudly. name: eval @@ -99,7 +106,98 @@ jobs: PY - name: eval-corpus harness regression tests - run: python3 tests/eval_corpus/test_tabulate_regression.py + run: | + python3 tests/eval_corpus/test_tabulate_regression.py + python3 tests/eval_corpus/test_manifest_gt_convert.py - name: Gate 6 — OWASP Benchmark v1.2 acceptance run: scripts/m7_ship_gate.sh --sets owasp + + jsts: + name: eval / ${{ matrix.corpus.name }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + corpus: + - name: nodegoat + repo: https://github.com/OWASP/NodeGoat + # NodeGoat ships no release tags; pin the default branch and let + # the cache key hold it stable. The manifest's path layout + # (app/, config/) has been constant for years. + ref: master + env: NYX_NODEGOAT_CORPUS + manifest: nodegoat.manifest.toml + ground_truth: nodegoat.json + - name: juiceshop + repo: https://github.com/juice-shop/juice-shop + ref: v15.0.0 + env: NYX_JUICESHOP_CORPUS + manifest: juiceshop.manifest.toml + ground_truth: juiceshop.json + env: + # CI wall-clock budget: 15 min. Override locally to tighten. + NYX_JSTS_WALLCLOCK_BUDGET_SECONDS: "900" + steps: + - uses: actions/checkout@v6 + + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: stable + cache: true + + - uses: taiki-e/install-action@nextest + + # The dynamic verifier's Node build pool (Phase 23) compiles its + # harnesses with a real node/npm toolchain. 
+ - name: Set up Node 20 + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Cache ${{ matrix.corpus.name }} + id: cache-corpus + uses: actions/cache@v4 + with: + path: .eval-corpus/${{ matrix.corpus.name }} + key: jsts-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }} + + - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }}) + if: steps.cache-corpus.outputs.cache-hit != 'true' + run: | + git clone --depth 1 --branch ${{ matrix.corpus.ref }} \ + ${{ matrix.corpus.repo }} \ + .eval-corpus/${{ matrix.corpus.name }} + + # No-compromise guard: the committed ground truth must be exactly what a + # fresh conversion of the curated manifest produces *against this + # corpus*. manifest_gt_convert.py hard-errors on any labelled path that + # no longer exists in the clone (corpus drift / typo), and the diff + # below catches a stale committed JSON. + - name: Verify ground truth is in sync with the pinned corpus + run: | + python3 tests/eval_corpus/manifest_gt_convert.py \ + --manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \ + --corpus-dir .eval-corpus/${{ matrix.corpus.name }} \ + --output /tmp/${{ matrix.corpus.name }}_gt_regen.json + python3 - <<'PY' + import json, sys + name = "${{ matrix.corpus.ground_truth }}" + committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}")) + regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json")) + if committed != regen: + sys.exit("committed ground truth diverges from a fresh conversion of " + "the manifest against the pinned corpus; regenerate with " + "manifest_gt_convert.py") + print(f"ground truth in sync: {len(committed)} records") + PY + + - name: eval-corpus harness regression tests + run: | + python3 tests/eval_corpus/test_tabulate_regression.py + python3 tests/eval_corpus/test_manifest_gt_convert.py + + - name: Gate 7 — ${{ matrix.corpus.name }} acceptance + run: | + export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ 
matrix.corpus.name }}" + scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }} diff --git a/scripts/m7_ship_gate.sh b/scripts/m7_ship_gate.sh index 29341579..7b8a0c28 100755 --- a/scripts/m7_ship_gate.sh +++ b/scripts/m7_ship_gate.sh @@ -6,6 +6,8 @@ # scripts/m7_ship_gate.sh # every gate # scripts/m7_ship_gate.sh --gates 3,6 # only gates 3 + 6 # scripts/m7_ship_gate.sh --sets owasp # Java OWASP corpus only +# scripts/m7_ship_gate.sh --sets jsts # NodeGoat + Juice Shop only +# scripts/m7_ship_gate.sh --sets nodegoat # one JS/TS corpus only # # Gate map (kept in sync with .pitboss/play/plan.md track M.7): # Gate 1: Static-only scan is green on `tests/benchmark/corpus`. @@ -26,13 +28,22 @@ # R.0) added the precision/recall/budget ratchet. The corpus is # *not* checked into the repo; the gate skips with a clear message # when `NYX_OWASP_CORPUS` does not point at a real checkout. +# Gate 7: JS/TS real-corpus acceptance (Track R.1 / Phase 28). OWASP +# NodeGoat (Express, .js) + OWASP Juice Shop (TypeScript, .ts) +# `--verify` against the committed ground truth. Same shape as +# Gate 6: wall-clock budget + the per-(cap,lang) budget in +# tests/eval_corpus/budget.toml hard-enforced; per-cap +# confirmed-rate / precision / recall published report-only +# (NYX_JSTS_FLOOR_CAPS empty by default). Each corpus row +# self-skips unless its NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS +# points at a real checkout. set -euo pipefail REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" cd "${REPO_ROOT}" -GATES="1,2,3,4,5,6" +GATES="1,2,3,4,5,6,7" SETS="" while [[ $# -gt 0 ]]; do @@ -56,10 +67,15 @@ while [[ $# -gt 0 ]]; do esac done -# When `--sets owasp` is passed CI only wants Gate 6. -if [[ "${SETS}" == "owasp" ]]; then - GATES="6" -fi +# `--sets` lets CI run a single real-corpus gate. `owasp` -> Gate 6; +# `jsts` (both JS/TS corpora) / `nodegoat` / `juiceshop` -> Gate 7, with the +# corpus name passed through so Gate 7 runs only the requested row. 
+case "${SETS}" in + owasp) GATES="6" ;; + jsts|nodegoat|juiceshop) GATES="7" ;; + "") ;; # no --sets: run the requested --gates + *) echo "unknown --sets: ${SETS}" >&2; exit 2 ;; +esac want_gate() { [[ ",${GATES}," == *",$1,"* ]] @@ -292,6 +308,162 @@ PY echo " PASS" } +# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ────────────── + +# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora. Same +# wall-clock split (10 min dev reference / 15 min CI) and the same +# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the +# per-cap confirmed-rate / precision / recall numbers are published but gate +# nothing, while the per-(cap,lang) budget (unsupported_rate, +# false_confirmed_rate) is hard-enforced. Promote a cap into the floor set +# once it starts Confirming end to end. +GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}" +GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}" +GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}" +GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}" +GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}" +GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}" + +# Run one real-corpus `--verify` row: scan under a wall-clock guard, +# tabulate against the committed ground truth, enforce the per-cell budget, +# publish (or, when floor caps are set, enforce) the per-cap floors. +# $1 label $2 corpus dir $3 ground-truth json +# Returns 0 on pass, 1 on fail. Caller decides skip. 
_gate7_run_corpus() {
  # Scan one corpus with `nyx scan --verify` under a wall-clock limit, then
  # tabulate the scan against the ground truth and run report.py on it.
  #   $1 label (used in temp-file names and messages)
  #   $2 corpus directory handed to the scanner
  #   $3 ground-truth JSON handed to tabulate.py
  # Every failure path prints a FAIL line and returns 1; success returns 0.
  local label="$1" corpus="$2" gt="$3"
  local scan_report="/tmp/m7_gate7_${label}_scan.json"
  local results_report="/tmp/m7_gate7_${label}_results.json"
  local wallclock_report="/tmp/m7_gate7_${label}_wallclock.txt"
  local gate_home="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_home"
  local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_build_pool"
  local wallclock

  # Per-label HOME / build-pool directories for the scan process, and no
  # stale reports left over from an earlier run of the same label.
  mkdir -p "${gate_home}" "${gate_build_pool}"
  rm -f "${scan_report}" "${results_report}" "${wallclock_report}"

  # The script runs under `set -euo pipefail`; errexit is switched off so the
  # wrapper's exit status can be captured and classified below instead of
  # aborting the whole script. The inline Python wrapper runs the scan with
  # stdout redirected to the scan report, enforces the budget as a timeout
  # (exit 124 on expiry), and always writes the elapsed seconds to the
  # wall-clock report.
  set +e
  HOME="${gate_home}" \
  NYX_BUILD_POOL_DIR="${gate_build_pool}" \
  python3 - "${GATE7_WALLCLOCK_BUDGET}" "${scan_report}" "${wallclock_report}" \
    "${REPO_ROOT}/target/release/nyx" scan \
    --verify \
    --index off \
    --format json \
    --quiet \
    "${corpus}" <<'PY'
import subprocess
import sys
import time

budget = float(sys.argv[1])
scan_report = sys.argv[2]
wallclock_report = sys.argv[3]
cmd = sys.argv[4:]
start = time.monotonic()
rc = 0
try:
    with open(scan_report, "wb") as out:
        completed = subprocess.run(cmd, stdout=out, timeout=budget)
    rc = completed.returncode
except subprocess.TimeoutExpired:
    rc = 124
finally:
    elapsed = time.monotonic() - start
    with open(wallclock_report, "w") as f:
        f.write(f"{elapsed:.1f}\n")
sys.exit(rc)
PY
  # `$?` is expanded before `local` runs, so this is the wrapper's status.
  local nyx_exit=$?
  set -e
  # If the wrapper never wrote a wall-clock report, fall back to the budget
  # value itself so the message and the comparison below still have a number.
  wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${GATE7_WALLCLOCK_BUDGET}")"

  echo " ${label} verify wall-clock: ${wallclock}s (budget ${GATE7_WALLCLOCK_BUDGET}s)"

  # 124 is the wrapper's own timeout code (set in the heredoc above).
  if [[ ${nyx_exit} -eq 124 ]]; then
    echo " FAIL: ${label} scan exceeded wall-clock budget"
    return 1
  fi
  # Exit 0 and exit 1 are both accepted as a completed scan.
  # NOTE(review): presumably exit 1 means "findings reported" — confirm
  # against the nyx CLI's exit-code contract.
  if [[ ${nyx_exit} -ne 0 && ${nyx_exit} -ne 1 ]]; then
    echo " FAIL: ${label} scan exited ${nyx_exit}"
    return 1
  fi
  # An accepted exit status with an empty/missing report is still a failure.
  if [[ ! -s "${scan_report}" ]]; then
    echo " FAIL: ${label} scan produced no JSON report"
    return 1
  fi
  # Numeric comparison of the measured time against the budget; awk is used
  # because the measured value has a fractional part.
  awk -v w="${wallclock}" -v b="${GATE7_WALLCLOCK_BUDGET}" \
    'BEGIN { if (w+0 > b+0) exit 1 }' \
    || { echo " FAIL: ${label} wall-clock exceeds budget"; return 1; }

  # Seed the results file with an empty JSON array for tabulate.py --append.
  echo "[]" > "${results_report}"
  python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" \
    --label "${label}" \
    --scan "${scan_report}" \
    --ground-truth "${gt}" \
    --append "${results_report}" \
    || { echo " FAIL: ${label} result tabulation failed"; return 1; }

  # report.py always receives the results and the budget file; the per-cap
  # floor flags are added only when GATE7_FLOOR_CAPS is non-empty.
  local -a report_args=(
    --results "${results_report}"
    --budget "${GATE7_BUDGET}"
  )
  if [[ -n "${GATE7_FLOOR_CAPS}" ]]; then
    report_args+=(
      --floor-caps "${GATE7_FLOOR_CAPS}"
      --min-confirmed-rate "${GATE7_CONFIRMED_RATE_TARGET}"
      --min-precision "${GATE7_PRECISION_TARGET}"
      --min-recall "${GATE7_RECALL_TARGET}"
    )
    echo " enforcing per-cap floors (confirmed >= ${GATE7_CONFIRMED_RATE_TARGET}, precision >= ${GATE7_PRECISION_TARGET}, recall >= ${GATE7_RECALL_TARGET}) on: ${GATE7_FLOOR_CAPS}"
  else
    echo " per-cap confirmed/precision/recall: report-only (NYX_JSTS_FLOOR_CAPS unset)"
  fi
  python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \
    || { echo " FAIL: ${label} per-cell budget exceeded or a gated per-cap floor missed"; return 1; }
  return 0
}

gate_7_jsts_scale() {
  # Gate 7 entry point: build the release binary with the `dynamic` feature,
  # then run _gate7_run_corpus for each configured JS/TS corpus row.
  # Returns 0 when every row that ran passed, or when no row was configured
  # (the gate then reports SKIP); returns 1 if any row failed.
  echo "── Gate 7: JS/TS real-corpus (NodeGoat + Juice Shop) verify acceptance ──"
  cargo build --release --quiet --features dynamic

  # name : env var holding the corpus dir : committed ground-truth file
  local rows=(
    "nodegoat:NYX_NODEGOAT_CORPUS:nodegoat.json"
    "juiceshop:NYX_JUICESHOP_CORPUS:juiceshop.json"
  )
  local any_ran=0 any_failed=0
  for row in "${rows[@]}"; do
    local name envvar gtfile
    # Split the colon-separated row into its three fields.
    IFS=: read -r name envvar gtfile <<<"${row}"
    # When --sets names a single corpus, only run that row.
    if [[ -n "${SETS}" && "${SETS}" != "jsts" && "${SETS}" != "${name}" ]]; then
      continue
    fi
    # Indirect expansion: read the variable whose *name* is in envvar; the
    # `:-` default keeps an unset variable from tripping `set -u`.
    local corpus="${!envvar:-}"
    if [[ -z "${corpus}" || ! -d "${corpus}" ]]; then
      echo " SKIP ${name}: set ${envvar} to a checkout to run this row."
      continue
    fi
    any_ran=1
    echo " ── ${name} (${corpus}) ──"
    # A failing row is recorded but does not stop the remaining rows.
    if _gate7_run_corpus "${name}" "${corpus}" \
        "${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}"; then
      echo " PASS ${name}"
    else
      any_failed=1
    fi
  done

  # No row ran at all: report a skip and succeed rather than fail the gate.
  if [[ ${any_ran} -eq 0 ]]; then
    echo " SKIP: no JS/TS corpus configured (set NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS)."
    echo " (Gate 7 is Phase 28's headline acceptance for the JS/TS real corpora.)"
    return 0
  fi
  [[ ${any_failed} -eq 0 ]] || return 1
  echo " PASS"
}
Two consequences for +# these cells: +# +# * false_confirmed_rate (<= 2%) is the headline maximum the verifier +# already satisfies and is HARD-enforced: it only trips when a Confirmed +# finding lands on a file with no ground-truth positive, i.e. an +# over-confirm. With the verifier confirming little on real corpora yet +# it is satisfied, and it ratchets precision as confirms grow. +# * unsupported_rate (<= 20%) is HARD-enforced too. `Unsupported` counts +# only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable — +# a narrow bucket that Tracks J + M shrank — *not* BuildFailed / +# SpecDerivationFailed (those are Inconclusive), so it stays low. +# +# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the +# Phase 28 acceptance DESTINATIONS. They are intentionally left UNSET here +# and published report-only by Gate 7 (NYX_JSTS_FLOOR_CAPS empty by default, +# mirroring NYX_OWASP_FLOOR_CAPS) because (a) the verifier does not yet +# Confirm these corpora end to end and (b) the manifest labels canonical +# vulns only, so precision vs partial ground truth is informational until +# the labels are completed. Promote a cap into the floor set the moment it +# starts Confirming, exactly as for OWASP. + +# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml. +[[cell]] +cap = "cmdi" +lang = "javascript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "xss" +lang = "javascript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "unauthorized_id" +lang = "javascript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "crypto" +lang = "javascript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml. 
+[[cell]] +cap = "sqli" +lang = "typescript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "path_traversal" +lang = "typescript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "redirect" +lang = "typescript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "ssrf" +lang = "typescript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "crypto" +lang = "typescript" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 diff --git a/tests/eval_corpus/ground_truth/README.md b/tests/eval_corpus/ground_truth/README.md index 47da8809..663a0be6 100644 --- a/tests/eval_corpus/ground_truth/README.md +++ b/tests/eval_corpus/ground_truth/README.md @@ -34,3 +34,38 @@ python3 tests/eval_corpus/owasp_gt_convert.py \ File: `nist_sard.json` Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`. + +## OWASP NodeGoat / OWASP Juice Shop (JS/TS — Track R.1) + +Files: `nodegoat.json` (Express, `.js`), `juiceshop.json` (TypeScript, `.ts`). +Same four-field format as above; all records are `vuln: true`. + +These two apps are intentionally vulnerable end to end, so — unlike OWASP +Benchmark — they ship no machine-readable per-file vuln labels and have no +benign-control files to pair against. The authoritative source is a curated +TOML manifest committed here, one `[[entry]]` per known-vulnerable handler +with a `note` citing why: + +- `nodegoat.manifest.toml` +- `juiceshop.manifest.toml` + +`manifest_gt_convert.py` turns a manifest into the committed `.json`: + +```sh +python3 tests/eval_corpus/manifest_gt_convert.py \ + --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \ + --output tests/eval_corpus/ground_truth/nodegoat.json +``` + +Pass `--corpus-dir ` to validate every labelled path against a real +checkout. 
The converter exits non-zero if any path is missing, so a corpus +bump that moves a handler fails loudly instead of silently dropping recall. +CI (`.github/workflows/eval.yml`, `jsts` job) regenerates each `.json` +against a fresh clone of the pinned ref and asserts it matches the committed +file. + +Because the manifests label canonical vulns only, recall (did nyx catch the +known vulns) is the meaningful metric; precision vs this partial ground +truth is informational. Gate 7 publishes per-cap precision/recall/confirmed +report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP +gate. diff --git a/tests/eval_corpus/ground_truth/juiceshop.json b/tests/eval_corpus/ground_truth/juiceshop.json new file mode 100644 index 00000000..3981effa --- /dev/null +++ b/tests/eval_corpus/ground_truth/juiceshop.json @@ -0,0 +1,38 @@ +[ + { + "path": "lib/insecurity.ts", + "line": 0, + "cap": "crypto", + "vuln": true + }, + { + "path": "routes/fileServer.ts", + "line": 0, + "cap": "path_traversal", + "vuln": true + }, + { + "path": "routes/login.ts", + "line": 0, + "cap": "sqli", + "vuln": true + }, + { + "path": "routes/profileImageUrlUpload.ts", + "line": 0, + "cap": "ssrf", + "vuln": true + }, + { + "path": "routes/redirect.ts", + "line": 0, + "cap": "redirect", + "vuln": true + }, + { + "path": "routes/search.ts", + "line": 0, + "cap": "sqli", + "vuln": true + } +] diff --git a/tests/eval_corpus/ground_truth/juiceshop.manifest.toml b/tests/eval_corpus/ground_truth/juiceshop.manifest.toml new file mode 100644 index 00000000..c8aeee0d --- /dev/null +++ b/tests/eval_corpus/ground_truth/juiceshop.manifest.toml @@ -0,0 +1,66 @@ +# OWASP Juice Shop — curated vuln ground-truth manifest (Phase 28, Track R.1). +# +# Juice Shop is an intentionally-vulnerable TypeScript/Express + Angular +# app. Its `data/static/challenges.yml` enumerates challenges but pins no +# source file/line, so it cannot drive file-level ground truth on its own. 
+# This manifest IS the authoritative source: one [[entry]] per known- +# vulnerable server-side handler, curated from the project's own challenge +# definitions + companion guide, each with a `note` citing the challenge. +# +# tests/eval_corpus/manifest_gt_convert.py turns this into the committed +# ground_truth/juiceshop.json. CI regenerates it against a fresh clone of +# the pinned tag and asserts byte-equality; the converter HARD-ERRORS on +# any path that no longer exists in the corpus, so a Juice Shop bump that +# refactors a route fails the eval job loudly instead of silently dropping +# recall. Re-pin `pinned_ref` and re-validate the paths together. +# +# `cap` is a nyx cap label (tabulate.py). `path` is relative to the Juice +# Shop clone root, POSIX separators. Lang is inferred from the extension +# (.ts -> typescript). All `vuln = true`: Juice Shop is all-vulnerable, so +# there is no benign-control file to pair against. As with NodeGoat, +# precision vs this manifest is informational (an unlabelled finding may be +# a real uncurated vuln, not a false positive) while recall is the +# meaningful floor. See tests/eval_corpus/budget.toml for the gate policy. + +corpus = "juiceshop" +upstream = "https://github.com/juice-shop/juice-shop" +# Pinned to a stable release tag. The server-side handlers below +# (routes/*.ts, lib/insecurity.ts) have been stable across the TypeScript +# era of Juice Shop; re-validate if the tag is bumped. +pinned_ref = "v15.0.0" + +[[entry]] +path = "routes/login.ts" +cap = "sqli" +vuln = true +note = "login builds a raw `models.sequelize.query(\"... WHERE email = '\" + req.body.email + \"' ...\")` — SQL injection auth bypass (challenge: loginAdmin / loginBender)." + +[[entry]] +path = "routes/search.ts" +cap = "sqli" +vuln = true +note = "product search concatenates the `q` criteria into a raw `models.sequelize.query` LIKE clause — UNION-based SQL injection (challenge: unionSqlInjection / dbSchema)." 
+ +[[entry]] +path = "routes/fileServer.ts" +cap = "path_traversal" +vuln = true +note = "serveKeyFiles / file download resolves a user-controlled filename under the ftp dir without containment — path traversal (challenge: accessLogDisclosure / forgottenDevBackup)." + +[[entry]] +path = "routes/redirect.ts" +cap = "redirect" +vuln = true +note = "redirect endpoint forwards to the `to` query param via an allow-list that is bypassable by substring — open redirect (challenge: redirectCryptoCurrency / redirect)." + +[[entry]] +path = "routes/profileImageUrlUpload.ts" +cap = "ssrf" +vuln = true +note = "profile image upload fetches an arbitrary user-supplied imageUrl server-side — SSRF (challenge: ssrf)." + +[[entry]] +path = "lib/insecurity.ts" +cap = "crypto" +vuln = true +note = "hardcoded HMAC/JWT key material and weak hashing (md5-based `hash`) — broken cryptography / hardcoded secret (challenge: weakCryptography / jwt*)." diff --git a/tests/eval_corpus/ground_truth/nodegoat.json b/tests/eval_corpus/ground_truth/nodegoat.json new file mode 100644 index 00000000..e7a06dc7 --- /dev/null +++ b/tests/eval_corpus/ground_truth/nodegoat.json @@ -0,0 +1,32 @@ +[ + { + "path": "app/routes/allocations.js", + "line": 0, + "cap": "unauthorized_id", + "vuln": true + }, + { + "path": "app/routes/contributions.js", + "line": 0, + "cap": "cmdi", + "vuln": true + }, + { + "path": "app/routes/memos.js", + "line": 0, + "cap": "xss", + "vuln": true + }, + { + "path": "app/routes/profile.js", + "line": 0, + "cap": "xss", + "vuln": true + }, + { + "path": "config/env/all.js", + "line": 0, + "cap": "crypto", + "vuln": true + } +] diff --git a/tests/eval_corpus/ground_truth/nodegoat.manifest.toml b/tests/eval_corpus/ground_truth/nodegoat.manifest.toml new file mode 100644 index 00000000..b51242af --- /dev/null +++ b/tests/eval_corpus/ground_truth/nodegoat.manifest.toml @@ -0,0 +1,62 @@ +# OWASP NodeGoat — curated vuln ground-truth manifest (Phase 28, Track R.1). 
+# +# NodeGoat is an intentionally-vulnerable Express/Node app that maps the +# OWASP Top 10 to concrete handlers. It ships no machine-readable per-file +# vuln labels (unlike OWASP Benchmark's expectedresults CSV), so this +# manifest IS the authoritative source: one [[entry]] per known-vulnerable +# location, each curated from the project's own tutorial + the canonical +# vuln walk-through, with a `note` citing why. +# +# tests/eval_corpus/manifest_gt_convert.py turns this into the committed +# ground_truth/nodegoat.json. CI regenerates it against a fresh clone of +# the pinned ref and asserts byte-equality, and the converter HARD-ERRORS +# on any path that no longer exists in the corpus, so a NodeGoat bump that +# moves a handler fails the eval job loudly rather than silently dropping +# recall. Update `pinned_ref` + the paths together when re-pinning. +# +# `cap` is a nyx cap label (tabulate.py). `path` is relative to the +# NodeGoat clone root, POSIX separators. Lang is inferred from the +# extension (.js -> javascript). These are all `vuln = true`: NodeGoat is +# all-vulnerable, so there is no benign-control file to pair against (the +# OWASP Benchmark vuln/benign pairing does not exist here). Precision vs +# this manifest is therefore informational (an unlabelled finding is not +# necessarily a false positive — it may be a real vuln we did not curate), +# while recall (did nyx catch the canonical vulns) is the meaningful floor. +# See tests/eval_corpus/budget.toml for how the gate treats these cells. + +corpus = "nodegoat" +upstream = "https://github.com/OWASP/NodeGoat" +# NodeGoat publishes no semver tags; the eval job pins the default branch +# via the CI cache key. The `app/` + `config/` layout below has been +# stable for years; re-validate the paths if the cache key is bumped. 
+pinned_ref = "master" + +[[entry]] +path = "app/routes/contributions.js" +cap = "cmdi" +vuln = true +note = "handleContributionsUpdate eval()s the pre-tax/after-tax/roth form fields — server-side JS injection (OWASP A1 Injection); the textbook NodeGoat RCE." + +[[entry]] +path = "app/routes/profile.js" +cap = "xss" +vuln = true +note = "profile fields (firstName/lastName/bankAcc/...) are persisted then rendered unescaped — stored XSS (OWASP A3 / A7 XSS)." + +[[entry]] +path = "app/routes/memos.js" +cap = "xss" +vuln = true +note = "memo body is stored and echoed back into the memos view without output encoding — stored XSS." + +[[entry]] +path = "app/routes/allocations.js" +cap = "unauthorized_id" +vuln = true +note = "allocations are looked up by a userId taken from the request with no ownership check — insecure direct object reference / broken access control (OWASP A4)." + +[[entry]] +path = "config/env/all.js" +cap = "crypto" +vuln = true +note = "hardcoded cookieSecret / session secret committed in source — sensitive-data / weak-secret smell (OWASP A6)." diff --git a/tests/eval_corpus/manifest_gt_convert.py b/tests/eval_corpus/manifest_gt_convert.py new file mode 100755 index 00000000..792338ad --- /dev/null +++ b/tests/eval_corpus/manifest_gt_convert.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""Convert a curated TOML vuln manifest into nyx ground-truth JSON. + +Used for real-world apps that ship **no** machine-readable per-file vuln +labels of their own (OWASP NodeGoat, OWASP Juice Shop). OWASP Benchmark +ships `expectedresults-1.2beta.csv` (see owasp_gt_convert.py); NIST SARD +ships `manifest.xml` (see sard_gt_convert.py). NodeGoat / Juice Shop are +intentionally-vulnerable apps without an equivalent, so the authoritative +source here is a curated manifest committed *in this repo* — one +`[[entry]]` table per known-vulnerable location, each carrying a +provenance `note` so a reviewer can trace why the label is what it is. 
+ +Manifest schema (TOML):: + + # provenance comments at the top + corpus = "nodegoat" # informational label + upstream = "https://github.com/OWASP/NodeGoat" + pinned_ref = "master@<sha>" # the ref the paths were curated against + + [[entry]] + path = "app/routes/contributions.js" # relative to the corpus root, POSIX + cap = "cmdi" # a nyx cap label (tabulate.py) + vuln = true # true = real vuln, false = benign control + note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)" + +Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}` +records, sorted by `(path, cap)` for deterministic, diff-stable JSON. +`note` is intentionally dropped — the ground-truth JSON keeps the exact +same four-field schema OWASP/SARD produce, so tabulate.py needs no special +casing. `line` is always 0 (the manifest pins a file, not a line; +tabulate.py matches file+cap and treats line 0 as "any line"). + +Path validation (the no-compromise guard). When `--corpus-dir` is given, +**every** manifest path must resolve to a real file under that root or the +converter exits non-zero. CI runs the converter against a fresh clone of +the pinned corpus and then asserts the committed JSON, once parsed, equals +the regenerated JSON, so a corpus bump that moves/renames/deletes a labelled +file (or a typo'd path) fails the build loudly instead of silently +degrading recall. Authoring the committed JSON offline (no corpus on +hand) is done by omitting `--corpus-dir`: the transform is identical, only +the existence check is skipped.
+ +Usage:: + + # author / regenerate the committed JSON offline (no validation): + tests/eval_corpus/manifest_gt_convert.py \\ + --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\ + --output tests/eval_corpus/ground_truth/nodegoat.json + + # CI: validate every path against a real checkout, then diff vs committed: + tests/eval_corpus/manifest_gt_convert.py \\ + --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\ + --corpus-dir ~/.cache/nyx/eval_corpus/nodegoat \\ + --output /tmp/nodegoat_regen.json +""" + +import argparse +import json +import sys +from pathlib import Path + +try: + import tomllib # Python 3.11+ +except ModuleNotFoundError: # pragma: no cover — older interpreters only + import tomli as tomllib # type: ignore[no-redef] + +# nyx cap labels (see tabulate.py _CAP_BIT_TABLE / _CAP_RULE_TABLE). A +# manifest cap outside this set is almost always a typo, so reject it at +# conversion time rather than letting a never-matching cap silently sink +# recall. 
+VALID_CAPS = { + "path_traversal", + "fmt_string", + "sqli", + "deserialize", + "ssrf", + "cmdi", + "crypto", + "unauthorized_id", + "data_exfil", + "ldap_injection", + "xpath_injection", + "header_injection", + "redirect", + "xss", + "xxe", + "prototype_pollution", + "auth", + "memory", + "validation", +} + + +def load_manifest(path: Path) -> dict: + try: + with open(path, "rb") as f: + return tomllib.load(f) + except FileNotFoundError: + print(f"error: manifest not found: {path}", file=sys.stderr) + raise SystemExit(1) + except tomllib.TOMLDecodeError as e: + print(f"error: manifest malformed: {path}: {e}", file=sys.stderr) + raise SystemExit(1) + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--manifest", required=True, help="curated TOML manifest path") + p.add_argument("--output", required=True, help="output ground-truth JSON path") + p.add_argument( + "--corpus-dir", + default="", + help=( + "when set, every manifest path must resolve to a real file under " + "this root or the converter exits 2 (the CI corpus-drift guard)" + ), + ) + args = p.parse_args() + + manifest = load_manifest(Path(args.manifest).expanduser()) + entries = manifest.get("entry", []) or [] + if not entries: + print(f"error: manifest has no [[entry]] tables: {args.manifest}", file=sys.stderr) + return 1 + + corpus = Path(args.corpus_dir).expanduser().resolve() if args.corpus_dir else None + if args.corpus_dir and (corpus is None or not corpus.is_dir()): + print(f"error: corpus dir not found: {args.corpus_dir}", file=sys.stderr) + return 1 + + records: list[dict] = [] + missing: list[str] = [] + seen: set[tuple[str, str]] = set() + for i, e in enumerate(entries): + path = e.get("path") + cap = e.get("cap") + vuln = e.get("vuln") + if not path or not cap or not isinstance(vuln, bool): + print( + f"error: entry #{i} needs string path, string cap, bool vuln: {e!r}", + file=sys.stderr, + ) + return 1 + if cap not in VALID_CAPS: + print( + f"error: entry #{i} cap 
{cap!r} is not a known nyx cap " + f"(path {path!r}); fix the manifest", + file=sys.stderr, + ) + return 1 + norm = path.replace("\\", "/") + key = (norm, cap) + if key in seen: + print( + f"error: duplicate (path, cap) entry: {norm!r} / {cap!r}", + file=sys.stderr, + ) + return 1 + seen.add(key) + if corpus is not None and not (corpus / norm).is_file(): + missing.append(norm) + records.append({"path": norm, "line": 0, "cap": cap, "vuln": vuln}) + + if missing: + print( + f"error: {len(missing)} manifest path(s) absent from {corpus} " + f"(corpus drift or typo) — regenerate the manifest against the " + f"pinned ref:", + file=sys.stderr, + ) + for m in missing: + print(f" missing: {m}", file=sys.stderr) + return 2 + + # Deterministic order so the committed JSON is diff-stable and the CI + # byte-equality guard is meaningful regardless of manifest ordering. + records.sort(key=lambda r: (r["path"], r["cap"])) + + out = Path(args.output).expanduser().resolve() + out.parent.mkdir(parents=True, exist_ok=True) + with open(out, "w") as f: + json.dump(records, f, indent=2) + f.write("\n") + + vuln_count = sum(1 for r in records if r["vuln"]) + print(f"wrote {len(records)} records to {out}") + print(f" vulns: {vuln_count}") + print(f" non-vuln: {len(records) - vuln_count}") + if corpus is not None: + print(f" validated against: {corpus}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/eval_corpus/run.sh b/tests/eval_corpus/run.sh index fa2721b8..5ff19001 100755 --- a/tests/eval_corpus/run.sh +++ b/tests/eval_corpus/run.sh @@ -28,7 +28,7 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" OUTPUT_DIR="" NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}" CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}" -SETS="owasp,sard,inhouse" +SETS="owasp,sard,nodegoat,juiceshop,inhouse" # Optional per-cell budgets and monotonic-improvement diff. 
# Scan a single ground-truth-labelled JS/TS corpus (NodeGoat or Juice Shop)
# and feed the scan output to tabulate.py.
#
#   $1  label  set name; also names the /tmp scratch files
#   $2  dir    corpus checkout to scan
#   $3  gt     committed ground-truth JSON for this corpus
#
# Always returns 0 on the skip and scan-failure paths: a corpus that has not
# been cloned yet prints clone instructions, and an unexpected nyx exit code
# prints its stderr, but neither aborts the surrounding run.
run_jsts_corpus() {
    local label="$1"
    local dir="$2"
    local gt="$3"
    local scan_json="/tmp/nyx_${label}.json"
    local scan_err="/tmp/nyx_${label}.stderr"
    local rc

    if [[ ! -d "$dir" ]]; then
        info "Bootstrapping $label..."
        info " Clone the corpus into ${dir} then re-run this script:"
        case "$label" in
            nodegoat)
                info " git clone --depth 1 https://github.com/OWASP/NodeGoat ${dir}"
                ;;
            *)
                info " git clone --depth 1 --branch v15.0.0 \\"
                info " https://github.com/juice-shop/juice-shop ${dir}"
                ;;
        esac
        info "Skipping $label set (not yet downloaded)."
        return 0
    fi

    info "Running nyx scan on $label..."
    # errexit is suspended around the scan so its exit code can be inspected
    # instead of terminating the script.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
        > "$scan_json" 2> "$scan_err"
    rc=$?
    set -e

    # Exit codes 0 and 1 both proceed to tabulation; anything else is
    # reported and the set is skipped.
    if (( rc != 0 && rc != 1 )); then
        info " nyx exited $rc on $label set (stderr follows):"
        cat "$scan_err" >&2
        return 0
    fi

    # --budget / --diff are forwarded only when the corresponding variable
    # is non-empty.
    if ! python3 "${SCRIPT_DIR}/tabulate.py" \
            --label "$label" \
            --scan "$scan_json" \
            --ground-truth "$gt" \
            --append "$RESULTS_JSON" \
            ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
            ${DIFF_FILE:+--diff "$DIFF_FILE"}; then
        info " tabulate.py failed on $label; ground truth file may be absent"
    fi
}
# # Usage: # tests/eval_corpus/run_full.sh [--nyx BIN] [--budget FILE] [--diff FILE] @@ -70,7 +70,7 @@ set +e NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \ bash "${SCRIPT_DIR}/run.sh" \ --nyx "$NYX_BIN" \ - --sets owasp,sard,inhouse \ + --sets owasp,sard,nodegoat,juiceshop,inhouse \ --output "$OUTPUT_DIR" \ --budget "$BUDGET_FILE" \ ${DIFF_FILE:+--diff "$DIFF_FILE"} diff --git a/tests/eval_corpus/test_manifest_gt_convert.py b/tests/eval_corpus/test_manifest_gt_convert.py new file mode 100644 index 00000000..729adde2 --- /dev/null +++ b/tests/eval_corpus/test_manifest_gt_convert.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +""" +Phase 28 (Track R.1) regression test for tests/eval_corpus/manifest_gt_convert.py. + +Proves the manifest -> ground-truth converter is non-vacuous: + * a well-formed manifest converts to the expected sorted JSON, + * --corpus-dir validation passes when every labelled path exists and + produces byte-identical output to the no-corpus transform (so the CI + in-sync guard, which diffs committed vs a validated regen, is sound), + * --corpus-dir validation HARD-ERRORS (exit 2) on a missing path, + * an unknown cap / duplicate (path,cap) / malformed TOML are rejected, + * the committed nodegoat.json / juiceshop.json are exactly what a fresh + conversion of their manifests produces (offline half of the CI guard). + +Run with:: + + python3 tests/eval_corpus/test_manifest_gt_convert.py + +Exits 0 when every assertion holds, non-zero otherwise. 
# Locations are derived from this file so the checks run from any cwd.
REPO = Path(__file__).resolve().parents[2]
CONVERT = REPO / "tests/eval_corpus/manifest_gt_convert.py"
GT_DIR = REPO / "tests/eval_corpus/ground_truth"

# Three well-formed entries, deliberately listed out of (path, cap) order so
# the sorting assertion is not vacuous.
GOOD_MANIFEST = """\
corpus = "demo"
upstream = "https://example.test/demo"
pinned_ref = "v1"

[[entry]]
path = "routes/login.ts"
cap = "sqli"
vuln = true
note = "raw SQL string-concat in login"

[[entry]]
path = "app/routes/contributions.js"
cap = "cmdi"
vuln = true
note = "eval of user input"

[[entry]]
path = "lib/insecurity.ts"
cap = "crypto"
vuln = false
note = "benign control example"
"""


def run_convert(*args: str) -> subprocess.CompletedProcess:
    """Run the converter script in a child interpreter and capture its text output."""
    command = [sys.executable, str(CONVERT)]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True)


def _write_manifest(tmp: Path, filename: str, text: str) -> Path:
    """Create ``tmp/filename`` holding ``text`` and return its path."""
    manifest_path = tmp / filename
    manifest_path.write_text(text)
    return manifest_path


def _stub_corpus(tmp: Path, rel_paths) -> Path:
    """Create ``tmp/corpus`` with a stub file at each relative path; return the root."""
    corpus_root = tmp / "corpus"
    for rel in rel_paths:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")
    return corpus_root


def test_transform_is_sorted_and_schema_clean(tmp: Path) -> None:
    """A good manifest converts to records sorted by (path, cap) with only the GT fields."""
    manifest_path = _write_manifest(tmp, "demo.manifest.toml", GOOD_MANIFEST)
    output_path = tmp / "demo.json"
    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 0, result.stdout + result.stderr
    records = json.loads(output_path.read_text())
    # Sorted by (path, cap); only the 4 GT fields; `note` dropped.
    observed_paths = [rec["path"] for rec in records]
    assert observed_paths == [
        "app/routes/contributions.js",
        "lib/insecurity.ts",
        "routes/login.ts",
    ], records
    for rec in records:
        assert set(rec) == {"path", "line", "cap", "vuln"}, rec
        assert rec["line"] == 0, rec
    first, second = records[0], records[1]
    assert first["cap"] == "cmdi" and first["vuln"] is True
    assert second["cap"] == "crypto" and second["vuln"] is False


def test_corpus_validation_passes_and_matches_no_corpus(tmp: Path) -> None:
    """With every labelled path present, validation succeeds and leaves the output unchanged."""
    manifest_path = _write_manifest(tmp, "demo.manifest.toml", GOOD_MANIFEST)
    # Build a corpus tree containing every labelled path.
    corpus_root = _stub_corpus(
        tmp,
        ("routes/login.ts", "app/routes/contributions.js", "lib/insecurity.ts"),
    )
    plain_out = tmp / "no_corpus.json"
    validated_out = tmp / "with_corpus.json"
    plain = run_convert("--manifest", str(manifest_path), "--output", str(plain_out))
    assert plain.returncode == 0
    validated = run_convert(
        "--manifest", str(manifest_path),
        "--corpus-dir", str(corpus_root),
        "--output", str(validated_out),
    )
    assert validated.returncode == 0, validated.stdout + validated.stderr
    # Validation must not change the output — that is what makes the CI guard
    # (diff committed vs validated regen) meaningful.
    assert plain_out.read_text() == validated_out.read_text()
    assert "validated against" in validated.stdout, validated.stdout


def test_missing_path_exits_2(tmp: Path) -> None:
    """A labelled path absent from the corpus makes the converter exit 2 and name it."""
    manifest_path = _write_manifest(tmp, "demo.manifest.toml", GOOD_MANIFEST)
    # Only two of the three labelled files exist → the third must trip.
    corpus_root = _stub_corpus(
        tmp, ("routes/login.ts", "app/routes/contributions.js")
    )
    output_path = tmp / "demo.json"
    result = run_convert(
        "--manifest", str(manifest_path),
        "--corpus-dir", str(corpus_root),
        "--output", str(output_path),
    )
    assert result.returncode == 2, result.stdout + result.stderr
    assert "lib/insecurity.ts" in result.stderr and "missing" in result.stderr, result.stderr


def test_unknown_cap_rejected(tmp: Path) -> None:
    """A cap outside the known set is rejected with exit 1."""
    manifest_path = _write_manifest(
        tmp,
        "bad_cap.manifest.toml",
        '[[entry]]\npath = "a.js"\ncap = "not_a_cap"\nvuln = true\n',
    )
    output_path = tmp / "out.json"
    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "not a known nyx cap" in result.stderr, result.stderr


def test_duplicate_path_cap_rejected(tmp: Path) -> None:
    """Two entries sharing one (path, cap) pair are rejected with exit 1."""
    repeated_entry = '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
    manifest_path = _write_manifest(
        tmp, "dup.manifest.toml", repeated_entry + repeated_entry
    )
    output_path = tmp / "out.json"
    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "duplicate" in result.stderr, result.stderr


def test_malformed_manifest_exits_1(tmp: Path) -> None:
    """Syntactically invalid TOML is reported as malformed with exit 1."""
    manifest_path = _write_manifest(tmp, "broken.toml", "[[entry]\npath = \n")  # invalid TOML
    output_path = tmp / "out.json"
    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "malformed" in result.stderr, result.stderr


def test_empty_manifest_exits_1(tmp: Path) -> None:
    """A manifest without any [[entry]] table is rejected with exit 1."""
    manifest_path = _write_manifest(tmp, "empty.toml", 'corpus = "x"\n')  # no [[entry]] tables
    output_path = tmp / "out.json"
    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "no [[entry]]" in result.stderr, result.stderr


def test_committed_gt_matches_manifest(tmp: Path) -> None:
    """The committed ground-truth JSON equals a fresh conversion of its manifest."""
    # Offline half of the CI in-sync guard: this catches a manifest edit that
    # was not followed by a regenerate.
    for name in ("nodegoat", "juiceshop"):
        man = GT_DIR / f"{name}.manifest.toml"
        committed = GT_DIR / f"{name}.json"
        assert man.exists(), f"missing manifest: {man}"
        assert committed.exists(), f"missing committed GT: {committed}"
        regenerated = tmp / f"{name}.json"
        result = run_convert("--manifest", str(man), "--output", str(regenerated))
        assert result.returncode == 0, result.stdout + result.stderr
        fresh_records = json.loads(regenerated.read_text())
        committed_records = json.loads(committed.read_text())
        assert fresh_records == committed_records, (
            f"{committed} is stale — regenerate with manifest_gt_convert.py"
        )


def main() -> int:
    """Run every check in order, each in its own scratch subdirectory.

    Returns 0 once all checks have passed; a failing assertion propagates
    out of this function instead.
    """
    checks = (
        test_transform_is_sorted_and_schema_clean,
        test_corpus_validation_passes_and_matches_no_corpus,
        test_missing_path_exits_2,
        test_unknown_cap_rejected,
        test_duplicate_path_cap_rejected,
        test_malformed_manifest_exits_1,
        test_empty_manifest_exits_1,
        test_committed_gt_matches_manifest,
    )
    with tempfile.TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)
        for check in checks:
            workdir = scratch_root / check.__name__
            workdir.mkdir()
            print(f"... {check.__name__}")
            check(workdir)
            print(" OK")
        print("\nAll manifest_gt_convert.py regression checks passed.")
    return 0


if __name__ == "__main__":
    sys.exit(main())