chore: remove stale scheduled_tasks.lock file

2026-06-09 19:45:13 +02:00 · 2026-05-31 21:18:38 -05:00 · 2026-05-31 21:18:38 -05:00 · 2a4d49b68b
commit 2a4d49b68b
parent a5929bb169
12 changed files with 1059 additions and 21 deletions
--- a/.github/workflows/eval.yml
+++ b/.github/workflows/eval.yml
@ -1,18 +1,25 @@
-# Phase 27 (Track R.0): OWASP Benchmark v1.2 real-corpus acceptance.
+# Real-corpus acceptance (Track R).
 #
-# Runs Gate 6 of scripts/m7_ship_gate.sh against a real OWASP BenchmarkJava
-# checkout on every PR that touches the dynamic verifier (src/dynamic/), the
+#   * owasp  (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava
+#     checkout (Java).
+#   * jsts   (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js)
+#     and OWASP Juice Shop (TypeScript, .ts), one matrix row per corpus.
+#
+# Runs on every PR that touches the dynamic verifier (src/dynamic/), the
 # eval-corpus harness (tests/eval_corpus/), or the gate script itself.
 #
-# Gate 6 enforces, against the committed ground truth:
+# Each gate enforces, against the committed ground truth:
 #   * verify wall-clock <= 15 min (CI budget; the dev reference is 10 min),
-#   * per-cap confirmed-rate >= 40%, precision >= 0.85, recall >= 0.40 for the
-#     dynamically-supported OWASP caps,
-#   * the per-(cap,lang) budget in tests/eval_corpus/budget.toml.
+#   * the per-(cap,lang) budget in tests/eval_corpus/budget.toml,
+#   * per-cap confirmed-rate / precision / recall — hard-gated only for caps
+#     in NYX_*_FLOOR_CAPS (empty by default → published report-only until a
+#     cap Confirms end to end), with destinations >= 40% / >= 0.85 / >= 0.40.
 #
-# The corpus is *not* vendored.  It is cloned at the pinned 1.2beta tag (the
-# tag that produced expectedresults-1.2beta.csv, the source of the ground
-# truth) and cached so reruns skip the clone.
+# No corpus is vendored.  Each is cloned at a pinned ref and cached so reruns
+# skip the clone.  Before the gate runs, the committed ground truth is
+# regenerated from its source against the fresh clone and asserted in sync,
+# and the converter hard-errors on any labelled path missing from the corpus,
+# so a corpus bump that drifts the labels fails the job loudly.

 name: eval

@ -99,7 +106,98 @@ jobs:
          PY

      - name: eval-corpus harness regression tests
-        run: python3 tests/eval_corpus/test_tabulate_regression.py
+        run: |
+          python3 tests/eval_corpus/test_tabulate_regression.py
+          python3 tests/eval_corpus/test_manifest_gt_convert.py

      - name: Gate 6 — OWASP Benchmark v1.2 acceptance
        run: scripts/m7_ship_gate.sh --sets owasp
+
+  jsts:
+    name: eval / ${{ matrix.corpus.name }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        corpus:
+          - name: nodegoat
+            repo: https://github.com/OWASP/NodeGoat
+            # NodeGoat ships no release tags, so this tracks the default branch;
+            # the cache key holds a clone only until eviction (not a true pin).
+            # The manifest's path layout (app/, config/) has been constant for years.
+            ref: master
+            env: NYX_NODEGOAT_CORPUS
+            manifest: nodegoat.manifest.toml
+            ground_truth: nodegoat.json
+          - name: juiceshop
+            repo: https://github.com/juice-shop/juice-shop
+            ref: v15.0.0
+            env: NYX_JUICESHOP_CORPUS
+            manifest: juiceshop.manifest.toml
+            ground_truth: juiceshop.json
+    env:
+      # CI wall-clock budget: 15 min.  Override locally to tighten.
+      NYX_JSTS_WALLCLOCK_BUDGET_SECONDS: "900"
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: stable
+          cache: true
+
+      - uses: taiki-e/install-action@nextest
+
+      # The dynamic verifier's Node build pool (Phase 23) compiles its
+      # harnesses with a real node/npm toolchain.
+      - name: Set up Node 20
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Cache ${{ matrix.corpus.name }}
+        id: cache-corpus
+        uses: actions/cache@v4
+        with:
+          path: .eval-corpus/${{ matrix.corpus.name }}
+          key: jsts-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}
+
+      - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
+        if: steps.cache-corpus.outputs.cache-hit != 'true'
+        run: |
+          git clone --depth 1 --branch ${{ matrix.corpus.ref }} \
+            ${{ matrix.corpus.repo }} \
+            .eval-corpus/${{ matrix.corpus.name }}
+
+      # No-compromise guard: the committed ground truth must be exactly what a
+      # fresh conversion of the curated manifest produces *against this
+      # corpus*.  manifest_gt_convert.py hard-errors on any labelled path that
+      # no longer exists in the clone (corpus drift / typo), and the diff
+      # below catches a stale committed JSON.
+      - name: Verify ground truth is in sync with the pinned corpus
+        run: |
+          python3 tests/eval_corpus/manifest_gt_convert.py \
+            --manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \
+            --corpus-dir .eval-corpus/${{ matrix.corpus.name }} \
+            --output /tmp/${{ matrix.corpus.name }}_gt_regen.json
+          python3 - <<'PY'
+          import json, sys
+          name = "${{ matrix.corpus.ground_truth }}"
+          committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}"))
+          regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json"))
+          if committed != regen:
+              sys.exit("committed ground truth diverges from a fresh conversion of "
+                       "the manifest against the pinned corpus; regenerate with "
+                       "manifest_gt_convert.py")
+          print(f"ground truth in sync: {len(committed)} records")
+          PY
+
+      - name: eval-corpus harness regression tests
+        run: |
+          python3 tests/eval_corpus/test_tabulate_regression.py
+          python3 tests/eval_corpus/test_manifest_gt_convert.py
+
+      - name: Gate 7 — ${{ matrix.corpus.name }} acceptance
+        run: |
+          export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}"
+          scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }}
--- a/scripts/m7_ship_gate.sh
+++ b/scripts/m7_ship_gate.sh
@ -6,6 +6,8 @@
 #   scripts/m7_ship_gate.sh                     # every gate
 #   scripts/m7_ship_gate.sh --gates 3,6         # only gates 3 + 6
 #   scripts/m7_ship_gate.sh --sets owasp        # Java OWASP corpus only
+#   scripts/m7_ship_gate.sh --sets jsts         # NodeGoat + Juice Shop only
+#   scripts/m7_ship_gate.sh --sets nodegoat     # one JS/TS corpus only
 #
 # Gate map (kept in sync with .pitboss/play/plan.md track M.7):
 #   Gate 1: Static-only scan is green on `tests/benchmark/corpus`.
@ -26,13 +28,22 @@
 #           R.0) added the precision/recall/budget ratchet.  The corpus is
 #           *not* checked into the repo; the gate skips with a clear message
 #           when `NYX_OWASP_CORPUS` does not point at a real checkout.
+#   Gate 7: JS/TS real-corpus acceptance (Track R.1 / Phase 28).  OWASP
+#           NodeGoat (Express, .js) + OWASP Juice Shop (TypeScript, .ts)
+#           `--verify` against the committed ground truth.  Same shape as
+#           Gate 6: wall-clock budget + the per-(cap,lang) budget in
+#           tests/eval_corpus/budget.toml hard-enforced; per-cap
+#           confirmed-rate / precision / recall published report-only
+#           (NYX_JSTS_FLOOR_CAPS empty by default).  Each corpus row
+#           self-skips unless its NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS
+#           points at a real checkout.

 set -euo pipefail

 REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "${REPO_ROOT}"

-GATES="1,2,3,4,5,6"
+GATES="1,2,3,4,5,6,7"
 SETS=""

 while [[ $# -gt 0 ]]; do
@ -56,10 +67,15 @@ while [[ $# -gt 0 ]]; do
    esac
 done

-# When `--sets owasp` is passed CI only wants Gate 6.
-if [[ "${SETS}" == "owasp" ]]; then
-    GATES="6"
-fi
+# `--sets` lets CI run a single real-corpus gate.  `owasp` -> Gate 6;
+# `jsts` (both JS/TS corpora) / `nodegoat` / `juiceshop` -> Gate 7, with the
+# corpus name passed through so Gate 7 runs only the requested row.
+case "${SETS}" in
+    owasp)                    GATES="6" ;;
+    jsts|nodegoat|juiceshop)  GATES="7" ;;
+    "")                       ;;  # no --sets: run the requested --gates
+    *)                        echo "unknown --sets: ${SETS}" >&2; exit 2 ;;
+esac

 want_gate() {
    [[ ",${GATES}," == *",$1,"* ]]
@ -292,6 +308,162 @@ PY
    echo "  PASS"
 }

+# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ──────────────
+
+# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora.  Same
+# wall-clock split (10 min dev reference / 15 min CI) and the same
+# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the
+# per-cap confirmed-rate / precision / recall numbers are published but gate
+# nothing, while the per-(cap,lang) budget (unsupported_rate,
+# false_confirmed_rate) is hard-enforced.  Promote a cap into the floor set
+# once it starts Confirming end to end.
+GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}"
+GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}"
+GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}"
+GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}"
+GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}"
+GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
+
+# Run one real-corpus `--verify` row: scan under a wall-clock guard,
+# tabulate against the committed ground truth, enforce the per-cell budget,
+# publish (or, when floor caps are set, enforce) the per-cap floors.
+#   $1 label  $2 corpus dir  $3 ground-truth json
+# Returns 0 on pass, 1 on fail.  Caller decides skip.
+_gate7_run_corpus() {
+    local label="$1" corpus="$2" gt="$3"
+    local scan_report="/tmp/m7_gate7_${label}_scan.json"
+    local results_report="/tmp/m7_gate7_${label}_results.json"
+    local wallclock_report="/tmp/m7_gate7_${label}_wallclock.txt"
+    local gate_home="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_home"
+    local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_build_pool"
+    local wallclock
+
+    mkdir -p "${gate_home}" "${gate_build_pool}"
+    rm -f "${scan_report}" "${results_report}" "${wallclock_report}"
+
+    set +e
+    HOME="${gate_home}" \
+    NYX_BUILD_POOL_DIR="${gate_build_pool}" \
+    python3 - "${GATE7_WALLCLOCK_BUDGET}" "${scan_report}" "${wallclock_report}" \
+        "${REPO_ROOT}/target/release/nyx" scan \
+        --verify \
+        --index off \
+        --format json \
+        --quiet \
+        "${corpus}" <<'PY'
+import subprocess
+import sys
+import time
+
+budget = float(sys.argv[1])
+scan_report = sys.argv[2]
+wallclock_report = sys.argv[3]
+cmd = sys.argv[4:]
+start = time.monotonic()
+rc = 0
+try:
+    with open(scan_report, "wb") as out:
+        completed = subprocess.run(cmd, stdout=out, timeout=budget)
+        rc = completed.returncode
+except subprocess.TimeoutExpired:
+    rc = 124
+finally:
+    elapsed = time.monotonic() - start
+    with open(wallclock_report, "w") as f:
+        f.write(f"{elapsed:.1f}\n")
+sys.exit(rc)
+PY
+    local nyx_exit=$?
+    set -e
+    wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${GATE7_WALLCLOCK_BUDGET}")"
+
+    echo "    ${label} verify wall-clock: ${wallclock}s (budget ${GATE7_WALLCLOCK_BUDGET}s)"
+
+    if [[ ${nyx_exit} -eq 124 ]]; then
+        echo "    FAIL: ${label} scan exceeded wall-clock budget"
+        return 1
+    fi
+    if [[ ${nyx_exit} -ne 0 && ${nyx_exit} -ne 1 ]]; then
+        echo "    FAIL: ${label} scan exited ${nyx_exit}"
+        return 1
+    fi
+    if [[ ! -s "${scan_report}" ]]; then
+        echo "    FAIL: ${label} scan produced no JSON report"
+        return 1
+    fi
+    awk -v w="${wallclock}" -v b="${GATE7_WALLCLOCK_BUDGET}" \
+        'BEGIN { if (w+0 > b+0) exit 1 }' \
+        || { echo "    FAIL: ${label} wall-clock exceeds budget"; return 1; }
+
+    echo "[]" > "${results_report}"
+    python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" \
+        --label "${label}" \
+        --scan "${scan_report}" \
+        --ground-truth "${gt}" \
+        --append "${results_report}" \
+        || { echo "    FAIL: ${label} result tabulation failed"; return 1; }
+
+    local -a report_args=(
+        --results "${results_report}"
+        --budget "${GATE7_BUDGET}"
+    )
+    if [[ -n "${GATE7_FLOOR_CAPS}" ]]; then
+        report_args+=(
+            --floor-caps "${GATE7_FLOOR_CAPS}"
+            --min-confirmed-rate "${GATE7_CONFIRMED_RATE_TARGET}"
+            --min-precision "${GATE7_PRECISION_TARGET}"
+            --min-recall "${GATE7_RECALL_TARGET}"
+        )
+        echo "    enforcing per-cap floors (confirmed >= ${GATE7_CONFIRMED_RATE_TARGET}, precision >= ${GATE7_PRECISION_TARGET}, recall >= ${GATE7_RECALL_TARGET}) on: ${GATE7_FLOOR_CAPS}"
+    else
+        echo "    per-cap confirmed/precision/recall: report-only (NYX_JSTS_FLOOR_CAPS unset)"
+    fi
+    python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \
+        || { echo "    FAIL: ${label} per-cell budget exceeded or a gated per-cap floor missed"; return 1; }
+    return 0
+}
+
+gate_7_jsts_scale() {
+    echo "── Gate 7: JS/TS real-corpus (NodeGoat + Juice Shop) verify acceptance ──"
+    cargo build --release --quiet --features dynamic
+
+    # name : env var holding the corpus dir : committed ground-truth file
+    local rows=(
+        "nodegoat:NYX_NODEGOAT_CORPUS:nodegoat.json"
+        "juiceshop:NYX_JUICESHOP_CORPUS:juiceshop.json"
+    )
+    local any_ran=0 any_failed=0
+    for row in "${rows[@]}"; do
+        local name envvar gtfile
+        IFS=: read -r name envvar gtfile <<<"${row}"
+        # When --sets names a single corpus, only run that row.
+        if [[ -n "${SETS}" && "${SETS}" != "jsts" && "${SETS}" != "${name}" ]]; then
+            continue
+        fi
+        local corpus="${!envvar:-}"
+        if [[ -z "${corpus}" || ! -d "${corpus}" ]]; then
+            echo "  SKIP ${name}: set ${envvar} to a checkout to run this row."
+            continue
+        fi
+        any_ran=1
+        echo "  ── ${name} (${corpus}) ──"
+        if _gate7_run_corpus "${name}" "${corpus}" \
+                "${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}"; then
+            echo "  PASS ${name}"
+        else
+            any_failed=1
+        fi
+    done
+
+    if [[ ${any_ran} -eq 0 ]]; then
+        echo "  SKIP: no JS/TS corpus configured (set NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS)."
+        echo "        (Gate 7 is Phase 28's headline acceptance for the JS/TS real corpora.)"
+        return 0
+    fi
+    [[ ${any_failed} -eq 0 ]] || return 1
+    echo "  PASS"
+}
+
 # ── Driver ────────────────────────────────────────────────────────────────────

 declare -a FAILED=()
@ -310,6 +482,7 @@ run_gate 3 verify_ratio
 run_gate 4 sarif_schema
 run_gate 5 layering
 run_gate 6 owasp_scale
+run_gate 7 jsts_scale

 if [[ ${#FAILED[@]} -gt 0 ]]; then
    echo
--- a/tests/eval_corpus/budget.toml
+++ b/tests/eval_corpus/budget.toml
@ -116,3 +116,87 @@ cap = "auth"
 lang = "java"
 unsupported_rate     = 0.20
 false_confirmed_rate = 0.02
+
+# ── NodeGoat / Juice Shop (JS/TS) — Track R.1 ratchet ────────────────────────
+#
+# Phase 28 wires two intentionally-vulnerable JS/TS apps into the same
+# acceptance machinery as OWASP Benchmark: OWASP NodeGoat (Express, .js)
+# and OWASP Juice Shop (TypeScript, .ts).  Unlike OWASP Benchmark, neither
+# app ships vuln/benign *pairs* — every labelled file is `vuln = true` (see
+# ground_truth/{nodegoat,juiceshop}.manifest.toml).  Two consequences for
+# these cells:
+#
+#   * false_confirmed_rate (<= 2%) is the headline maximum the verifier
+#     already satisfies and is HARD-enforced: it only trips when a Confirmed
+#     finding lands on a file with no ground-truth positive, i.e. an
+#     over-confirm.  The verifier confirms little on real corpora so far,
+#     so it is satisfied today, and it ratchets precision as confirms grow.
+#   * unsupported_rate (<= 20%) is HARD-enforced too.  `Unsupported` counts
+#     only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable —
+#     a narrow bucket that Tracks J + M shrank — *not* BuildFailed /
+#     SpecDerivationFailed (those are Inconclusive), so it stays low.
+#
+# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the
+# Phase 28 acceptance DESTINATIONS.  They are intentionally left UNSET here
+# and published report-only by Gate 7 (NYX_JSTS_FLOOR_CAPS empty by default,
+# mirroring NYX_OWASP_FLOOR_CAPS) because (a) the verifier does not yet
+# Confirm these corpora end to end and (b) the manifest labels canonical
+# vulns only, so precision vs partial ground truth is informational until
+# the labels are completed.  Promote a cap into the floor set the moment it
+# starts Confirming, exactly as for OWASP.
+
+# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml.
+[[cell]]
+cap = "cmdi"
+lang = "javascript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "xss"
+lang = "javascript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "unauthorized_id"
+lang = "javascript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "crypto"
+lang = "javascript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml.
+[[cell]]
+cap = "sqli"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "path_traversal"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "redirect"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "ssrf"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
+
+[[cell]]
+cap = "crypto"
+lang = "typescript"
+unsupported_rate     = 0.20
+false_confirmed_rate = 0.02
--- a/tests/eval_corpus/ground_truth/README.md
+++ b/tests/eval_corpus/ground_truth/README.md
@ -34,3 +34,38 @@ python3 tests/eval_corpus/owasp_gt_convert.py \
 File: `nist_sard.json`

 Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`.
+
+## OWASP NodeGoat / OWASP Juice Shop (JS/TS — Track R.1)
+
+Files: `nodegoat.json` (Express, `.js`), `juiceshop.json` (TypeScript, `.ts`).
+Same four-field format as above; all records are `vuln: true`.
+
+These two apps are intentionally vulnerable end to end, so — unlike OWASP
+Benchmark — they ship no machine-readable per-file vuln labels and have no
+benign-control files to pair against. The authoritative source is a curated
+TOML manifest committed here, one `[[entry]]` per known-vulnerable handler
+with a `note` citing why:
+
+- `nodegoat.manifest.toml`
+- `juiceshop.manifest.toml`
+
+`manifest_gt_convert.py` turns a manifest into the committed `.json`:
+
+```sh
+python3 tests/eval_corpus/manifest_gt_convert.py \
+    --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \
+    --output   tests/eval_corpus/ground_truth/nodegoat.json
+```
+
+Pass `--corpus-dir <clone>` to validate every labelled path against a real
+checkout. The converter exits non-zero if any path is missing, so a corpus
+bump that moves a handler fails loudly instead of silently dropping recall.
+CI (`.github/workflows/eval.yml`, `jsts` job) regenerates each `.json`
+against a fresh clone of the pinned ref and asserts it matches the committed
+file.
+
+Because the manifests label canonical vulns only, recall (did nyx catch the
+known vulns) is the meaningful metric; precision vs this partial ground
+truth is informational. Gate 7 publishes per-cap precision/recall/confirmed
+report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP
+gate.
--- a/tests/eval_corpus/ground_truth/juiceshop.json
+++ b/tests/eval_corpus/ground_truth/juiceshop.json
@ -0,0 +1,38 @@
+[
+  {
+    "path": "lib/insecurity.ts",
+    "line": 0,
+    "cap": "crypto",
+    "vuln": true
+  },
+  {
+    "path": "routes/fileServer.ts",
+    "line": 0,
+    "cap": "path_traversal",
+    "vuln": true
+  },
+  {
+    "path": "routes/login.ts",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": true
+  },
+  {
+    "path": "routes/profileImageUrlUpload.ts",
+    "line": 0,
+    "cap": "ssrf",
+    "vuln": true
+  },
+  {
+    "path": "routes/redirect.ts",
+    "line": 0,
+    "cap": "redirect",
+    "vuln": true
+  },
+  {
+    "path": "routes/search.ts",
+    "line": 0,
+    "cap": "sqli",
+    "vuln": true
+  }
+]
--- a/tests/eval_corpus/ground_truth/juiceshop.manifest.toml
+++ b/tests/eval_corpus/ground_truth/juiceshop.manifest.toml
@ -0,0 +1,66 @@
+# OWASP Juice Shop — curated vuln ground-truth manifest (Phase 28, Track R.1).
+#
+# Juice Shop is an intentionally-vulnerable TypeScript/Express + Angular
+# app.  Its `data/static/challenges.yml` enumerates challenges but pins no
+# source file/line, so it cannot drive file-level ground truth on its own.
+# This manifest IS the authoritative source: one [[entry]] per known-
+# vulnerable server-side handler, curated from the project's own challenge
+# definitions + companion guide, each with a `note` citing the challenge.
+#
+# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
+# ground_truth/juiceshop.json.  CI regenerates it against a fresh clone of
+# the pinned tag and asserts parsed-JSON equality; the converter HARD-ERRORS on
+# any path that no longer exists in the corpus, so a Juice Shop bump that
+# refactors a route fails the eval job loudly instead of silently dropping
+# recall.  Re-pin `pinned_ref` and re-validate the paths together.
+#
+# `cap` is a nyx cap label (tabulate.py).  `path` is relative to the Juice
+# Shop clone root, POSIX separators.  Lang is inferred from the extension
+# (.ts -> typescript).  All `vuln = true`: Juice Shop is all-vulnerable, so
+# there is no benign-control file to pair against.  As with NodeGoat,
+# precision vs this manifest is informational (an unlabelled finding may be
+# a real uncurated vuln, not a false positive) while recall is the
+# meaningful floor.  See tests/eval_corpus/budget.toml for the gate policy.
+
+corpus = "juiceshop"
+upstream = "https://github.com/juice-shop/juice-shop"
+# Pinned to a stable release tag.  The server-side handlers below
+# (routes/*.ts, lib/insecurity.ts) have been stable across the TypeScript
+# era of Juice Shop; re-validate if the tag is bumped.
+pinned_ref = "v15.0.0"
+
+[[entry]]
+path = "routes/login.ts"
+cap = "sqli"
+vuln = true
+note = "login builds a raw `models.sequelize.query(\"... WHERE email = '\" + req.body.email + \"' ...\")` — SQL injection auth bypass (challenge: loginAdmin / loginBender)."
+
+[[entry]]
+path = "routes/search.ts"
+cap = "sqli"
+vuln = true
+note = "product search concatenates the `q` criteria into a raw `models.sequelize.query` LIKE clause — UNION-based SQL injection (challenge: unionSqlInjection / dbSchema)."
+
+[[entry]]
+path = "routes/fileServer.ts"
+cap = "path_traversal"
+vuln = true
+note = "serveKeyFiles / file download resolves a user-controlled filename under the ftp dir without containment — path traversal (challenge: accessLogDisclosure / forgottenDevBackup)."
+
+[[entry]]
+path = "routes/redirect.ts"
+cap = "redirect"
+vuln = true
+note = "redirect endpoint forwards to the `to` query param via an allow-list that is bypassable by substring — open redirect (challenge: redirectCryptoCurrency / redirect)."
+
+[[entry]]
+path = "routes/profileImageUrlUpload.ts"
+cap = "ssrf"
+vuln = true
+note = "profile image upload fetches an arbitrary user-supplied imageUrl server-side — SSRF (challenge: ssrf)."
+
+[[entry]]
+path = "lib/insecurity.ts"
+cap = "crypto"
+vuln = true
+note = "hardcoded HMAC/JWT key material and weak hashing (md5-based `hash`) — broken cryptography / hardcoded secret (challenge: weakCryptography / jwt*)."
--- a/tests/eval_corpus/ground_truth/nodegoat.json
+++ b/tests/eval_corpus/ground_truth/nodegoat.json
@ -0,0 +1,32 @@
+[
+  {
+    "path": "app/routes/allocations.js",
+    "line": 0,
+    "cap": "unauthorized_id",
+    "vuln": true
+  },
+  {
+    "path": "app/routes/contributions.js",
+    "line": 0,
+    "cap": "cmdi",
+    "vuln": true
+  },
+  {
+    "path": "app/routes/memos.js",
+    "line": 0,
+    "cap": "xss",
+    "vuln": true
+  },
+  {
+    "path": "app/routes/profile.js",
+    "line": 0,
+    "cap": "xss",
+    "vuln": true
+  },
+  {
+    "path": "config/env/all.js",
+    "line": 0,
+    "cap": "crypto",
+    "vuln": true
+  }
+]
--- a/tests/eval_corpus/ground_truth/nodegoat.manifest.toml
+++ b/tests/eval_corpus/ground_truth/nodegoat.manifest.toml
@ -0,0 +1,62 @@
+# OWASP NodeGoat — curated vuln ground-truth manifest (Phase 28, Track R.1).
+#
+# NodeGoat is an intentionally-vulnerable Express/Node app that maps the
+# OWASP Top 10 to concrete handlers.  It ships no machine-readable per-file
+# vuln labels (unlike OWASP Benchmark's expectedresults CSV), so this
+# manifest IS the authoritative source: one [[entry]] per known-vulnerable
+# location, each curated from the project's own tutorial + the canonical
+# vuln walk-through, with a `note` citing why.
+#
+# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
+# ground_truth/nodegoat.json.  CI regenerates it against a fresh clone of
+# the pinned ref and asserts parsed-JSON equality, and the converter HARD-ERRORS
+# on any path that no longer exists in the corpus, so a NodeGoat bump that
+# moves a handler fails the eval job loudly rather than silently dropping
+# recall.  Update `pinned_ref` + the paths together when re-pinning.
+#
+# `cap` is a nyx cap label (tabulate.py).  `path` is relative to the
+# NodeGoat clone root, POSIX separators.  Lang is inferred from the
+# extension (.js -> javascript).  These are all `vuln = true`: NodeGoat is
+# all-vulnerable, so there is no benign-control file to pair against (the
+# OWASP Benchmark vuln/benign pairing does not exist here).  Precision vs
+# this manifest is therefore informational (an unlabelled finding is not
+# necessarily a false positive — it may be a real vuln we did not curate),
+# while recall (did nyx catch the canonical vulns) is the meaningful floor.
+# See tests/eval_corpus/budget.toml for how the gate treats these cells.
+
+corpus = "nodegoat"
+upstream = "https://github.com/OWASP/NodeGoat"
+# NodeGoat publishes no semver tags; the eval job pins the default branch
+# via the CI cache key.  The `app/` + `config/` layout below has been
+# stable for years; re-validate the paths if the cache key is bumped.
+pinned_ref = "master"
+
+[[entry]]
+path = "app/routes/contributions.js"
+cap = "cmdi"
+vuln = true
+note = "handleContributionsUpdate eval()s the pre-tax/after-tax/roth form fields — server-side JS injection (OWASP A1 Injection); the textbook NodeGoat RCE."
+
+[[entry]]
+path = "app/routes/profile.js"
+cap = "xss"
+vuln = true
+note = "profile fields (firstName/lastName/bankAcc/...) are persisted then rendered unescaped — stored XSS (OWASP A3 / A7 XSS)."
+
+[[entry]]
+path = "app/routes/memos.js"
+cap = "xss"
+vuln = true
+note = "memo body is stored and echoed back into the memos view without output encoding — stored XSS."
+
+[[entry]]
+path = "app/routes/allocations.js"
+cap = "unauthorized_id"
+vuln = true
+note = "allocations are looked up by a userId taken from the request with no ownership check — insecure direct object reference / broken access control (OWASP A4)."
+
+[[entry]]
+path = "config/env/all.js"
+cap = "crypto"
+vuln = true
+note = "hardcoded cookieSecret / session secret committed in source — sensitive-data / weak-secret smell (OWASP A6)."
--- a/tests/eval_corpus/manifest_gt_convert.py
+++ b/tests/eval_corpus/manifest_gt_convert.py
@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""Convert a curated TOML vuln manifest into nyx ground-truth JSON.
+
+Used for real-world apps that ship **no** machine-readable per-file vuln
+labels of their own (OWASP NodeGoat, OWASP Juice Shop).  OWASP Benchmark
+ships `expectedresults-1.2beta.csv` (see owasp_gt_convert.py); NIST SARD
+ships `manifest.xml` (see sard_gt_convert.py).  NodeGoat / Juice Shop are
+intentionally-vulnerable apps without an equivalent, so the authoritative
+source here is a curated manifest committed *in this repo* — one
+`[[entry]]` table per known-vulnerable location, each carrying a
+provenance `note` so a reviewer can trace why the label is what it is.
+
+Manifest schema (TOML)::
+
+    # provenance comments at the top
+    corpus = "nodegoat"          # informational label
+    upstream = "https://github.com/OWASP/NodeGoat"
+    pinned_ref = "master@<sha>"  # the ref the paths were curated against
+
+    [[entry]]
+    path = "app/routes/contributions.js"   # relative to the corpus root, POSIX
+    cap  = "cmdi"                           # a nyx cap label (tabulate.py)
+    vuln = true                             # true = real vuln, false = benign control
+    note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)"
+
+Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}`
+records, sorted by `(path, cap)` for deterministic, diff-stable JSON.
+`note` is intentionally dropped — the ground-truth JSON keeps the exact
+same four-field schema OWASP/SARD produce, so tabulate.py needs no special
+casing.  `line` is always 0 (the manifest pins a file, not a line;
+tabulate.py matches file+cap and treats line 0 as "any line").
+
+Path validation (the no-compromise guard).  When `--corpus-dir` is given,
+**every** manifest path must resolve to a real file under that root or the
+converter exits non-zero.  CI runs the converter against a fresh clone of
+the pinned corpus and then asserts the committed JSON byte-matches the
+regenerated JSON, so a corpus bump that moves/renames/deletes a labelled
+file (or a typo'd path) fails the build loudly instead of silently
+degrading recall.  Authoring the committed JSON offline (no corpus on
+hand) is done by omitting `--corpus-dir`: the transform is identical, only
+the existence check is skipped.
+
+Usage::
+
+    # author / regenerate the committed JSON offline (no validation):
+    tests/eval_corpus/manifest_gt_convert.py \\
+        --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
+        --output   tests/eval_corpus/ground_truth/nodegoat.json
+
+    # CI: validate every path against a real checkout, then diff vs committed:
+    tests/eval_corpus/manifest_gt_convert.py \\
+        --manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
+        --corpus-dir ~/.cache/nyx/eval_corpus/nodegoat \\
+        --output   /tmp/nodegoat_regen.json
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+try:
+    import tomllib  # Python 3.11+
+except ModuleNotFoundError:  # pragma: no cover — older interpreters only
+    import tomli as tomllib  # type: ignore[no-redef]
+
+# nyx cap labels (see tabulate.py _CAP_BIT_TABLE / _CAP_RULE_TABLE).  A
+# manifest cap outside this set is almost always a typo, so reject it at
+# conversion time rather than letting a never-matching cap silently sink
+# recall.  main() checks every entry's `cap` against this set and exits 1
+# on the first label that is not listed here.
+VALID_CAPS = {
+    "path_traversal",
+    "fmt_string",
+    "sqli",
+    "deserialize",
+    "ssrf",
+    "cmdi",
+    "crypto",
+    "unauthorized_id",
+    "data_exfil",
+    "ldap_injection",
+    "xpath_injection",
+    "header_injection",
+    "redirect",
+    "xss",
+    "xxe",
+    "prototype_pollution",
+    "auth",
+    "memory",
+    "validation",
+}
+
+
+def load_manifest(path: Path) -> dict:
+    """Read and parse the TOML manifest at `path`.
+
+    Returns the parsed top-level table.  A missing file or invalid TOML is
+    reported on stderr and ends the process via SystemExit(1).
+    """
+    try:
+        with open(path, "rb") as handle:
+            return tomllib.load(handle)
+    except FileNotFoundError:
+        problem = f"manifest not found: {path}"
+    except tomllib.TOMLDecodeError as err:
+        problem = f"manifest malformed: {path}: {err}"
+    # Only reached when one of the handlers above ran.
+    print(f"error: {problem}", file=sys.stderr)
+    raise SystemExit(1)
+
+
+def main() -> int:
+    """Convert the manifest named by --manifest into ground-truth JSON.
+
+    Returns the process exit code:
+      0  the records were written to --output,
+      1  the manifest has no usable [[entry]] tables, an entry is invalid
+         (wrong field types, unknown cap, non-relative path, duplicate
+         (path, cap)), or --corpus-dir is not a directory,
+      2  --corpus-dir was given and at least one manifest path is not a
+         file under it.
+    A missing or malformed manifest file exits 1 inside load_manifest().
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--manifest", required=True, help="curated TOML manifest path")
+    p.add_argument("--output", required=True, help="output ground-truth JSON path")
+    p.add_argument(
+        "--corpus-dir",
+        default="",
+        help=(
+            "when set, every manifest path must resolve to a real file under "
+            "this root or the converter exits 2 (the CI corpus-drift guard)"
+        ),
+    )
+    args = p.parse_args()
+
+    def fail(msg: str) -> int:
+        """Report a manifest/usage error on stderr and yield exit code 1."""
+        print(f"error: {msg}", file=sys.stderr)
+        return 1
+
+    manifest = load_manifest(Path(args.manifest).expanduser())
+    entries = manifest.get("entry", []) or []
+    # `[entry]` (one table) or a scalar `entry = ...` is a typo for
+    # `[[entry]]`; report it instead of crashing while iterating.
+    if not isinstance(entries, list) or not entries:
+        return fail(f"manifest has no [[entry]] tables: {args.manifest}")
+
+    corpus = Path(args.corpus_dir).expanduser().resolve() if args.corpus_dir else None
+    if corpus is not None and not corpus.is_dir():
+        return fail(f"corpus dir not found: {args.corpus_dir}")
+
+    records: list[dict] = []
+    missing: list[str] = []
+    seen: set[tuple[str, str]] = set()
+    for i, e in enumerate(entries):
+        fields = e if isinstance(e, dict) else {}
+        path, cap, vuln = fields.get("path"), fields.get("cap"), fields.get("vuln")
+        # Check types, not just truthiness: a non-string path or cap would
+        # otherwise raise further down instead of producing this message.
+        valid = (
+            isinstance(path, str) and path != ""
+            and isinstance(cap, str) and cap != ""
+            and isinstance(vuln, bool)
+        )
+        if not valid:
+            return fail(f"entry #{i} needs string path, string cap, bool vuln: {e!r}")
+        if cap not in VALID_CAPS:
+            return fail(
+                f"entry #{i} cap {cap!r} is not a known nyx cap "
+                f"(path {path!r}); fix the manifest"
+            )
+        norm = path.replace("\\", "/")
+        # Labels are relative to the corpus root.  An absolute path or a `..`
+        # segment would let the existence check below pass on a file that
+        # lives outside the corpus, so reject both up front.
+        if norm.startswith("/") or ".." in norm.split("/"):
+            return fail(f"entry #{i} path {norm!r} must be relative to the corpus root")
+        key = (norm, cap)
+        if key in seen:
+            return fail(f"duplicate (path, cap) entry: {norm!r} / {cap!r}")
+        seen.add(key)
+        # Collect every absent path so one run reports the full drift.
+        if corpus is not None and not (corpus / norm).is_file():
+            missing.append(norm)
+        records.append({"path": norm, "line": 0, "cap": cap, "vuln": vuln})
+
+    if missing:
+        print(
+            f"error: {len(missing)} manifest path(s) absent from {corpus} "
+            f"(corpus drift or typo) — regenerate the manifest against the "
+            f"pinned ref:",
+            file=sys.stderr,
+        )
+        for m in missing:
+            print(f"  missing: {m}", file=sys.stderr)
+        return 2
+
+    # Deterministic order so the committed JSON is diff-stable and the CI
+    # byte-equality guard is meaningful regardless of manifest ordering.
+    records.sort(key=lambda r: (r["path"], r["cap"]))
+
+    out = Path(args.output).expanduser().resolve()
+    out.parent.mkdir(parents=True, exist_ok=True)
+    # Pin encoding and line endings so the written bytes do not depend on
+    # the platform running the converter.
+    with open(out, "w", encoding="utf-8", newline="\n") as f:
+        json.dump(records, f, indent=2)
+        f.write("\n")
+
+    vuln_count = sum(1 for r in records if r["vuln"])
+    print(f"wrote {len(records)} records to {out}")
+    print(f"  vulns:    {vuln_count}")
+    print(f"  non-vuln: {len(records) - vuln_count}")
+    if corpus is not None:
+        print(f"  validated against: {corpus}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/eval_corpus/run.sh
+++ b/tests/eval_corpus/run.sh
@ -28,7 +28,7 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 OUTPUT_DIR=""
 NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
 CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
-SETS="owasp,sard,inhouse"
+SETS="owasp,sard,nodegoat,juiceshop,inhouse"
 # Optional per-cell budgets and monotonic-improvement diff.
 BUDGET_FILE=""
 DIFF_FILE=""
@ -52,6 +52,44 @@ require_cmd() { command -v "$1" >/dev/null 2>&1 || die "required command not fou
 require_cmd jq
 require_cmd python3

+# Run `nyx scan --verify` over one real-world JS/TS corpus (NodeGoat or
+# Juice Shop) and pass the findings to tabulate.py together with the
+# committed ground truth.
+#   $1  set label (also names the /tmp scratch files)
+#   $2  corpus checkout directory
+#   $3  ground-truth JSON path
+# A missing checkout prints clone instructions and skips the set.  A nyx exit
+# status other than 0 or 1 dumps nyx's stderr and skips tabulation.  A
+# tabulate.py failure is only logged.
+run_jsts_corpus() {
+  local set_name="$1" checkout="$2" truth="$3"
+  local scan_out="/tmp/nyx_${set_name}.json"
+  local scan_err="/tmp/nyx_${set_name}.stderr"
+
+  if [[ ! -d "$checkout" ]]; then
+    info "Bootstrapping $set_name..."
+    info "  Clone the corpus into ${checkout} then re-run this script:"
+    case "$set_name" in
+      nodegoat)
+        info "    git clone --depth 1 https://github.com/OWASP/NodeGoat ${checkout}"
+        ;;
+      *)
+        info "    git clone --depth 1 --branch v15.0.0 \\"
+        info "      https://github.com/juice-shop/juice-shop ${checkout}"
+        ;;
+    esac
+    info "Skipping $set_name set (not yet downloaded)."
+    return 0
+  fi
+
+  info "Running nyx scan on $set_name..."
+  # Suspend errexit around the scan so its exit status can be inspected.
+  set +e
+  "$NYX_BIN" scan --format json --verify --no-index "$checkout" \
+    > "$scan_out" 2>"$scan_err"
+  local scan_rc=$?
+  set -e
+  case "$scan_rc" in
+    0 | 1) ;;
+    *)
+      info "  nyx exited $scan_rc on $set_name set (stderr follows):"
+      cat "$scan_err" >&2
+      return 0
+      ;;
+  esac
+
+  python3 "${SCRIPT_DIR}/tabulate.py" \
+    --label "$set_name" \
+    --scan "$scan_out" \
+    --ground-truth "$truth" \
+    --append "$RESULTS_JSON" \
+    ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
+    ${DIFF_FILE:+--diff "$DIFF_FILE"} \
+    || info "  tabulate.py failed on $set_name; ground truth file may be absent"
+}
+
 [[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"

 mkdir -p "$CORPUS_CACHE"
@ -95,6 +133,16 @@ if [[ "$SETS" == *owasp* ]]; then
  fi
 fi

+# ── NodeGoat / Juice Shop (JS/TS) bootstrap — Track R.1 ───────────────────────
+if [[ "$SETS" == *nodegoat* ]]; then
+  run_jsts_corpus nodegoat "${CORPUS_CACHE}/nodegoat" \
+    "${SCRIPT_DIR}/ground_truth/nodegoat.json"
+fi
+if [[ "$SETS" == *juiceshop* ]]; then
+  run_jsts_corpus juiceshop "${CORPUS_CACHE}/juiceshop" \
+    "${SCRIPT_DIR}/ground_truth/juiceshop.json"
+fi
+
 # ── NIST SARD subset bootstrap ────────────────────────────────────────────────
 SARD_DIR="${CORPUS_CACHE}/nist_sard"
 if [[ "$SETS" == *sard* ]]; then
--- a/tests/eval_corpus/run_full.sh
+++ b/tests/eval_corpus/run_full.sh
@ -2,9 +2,9 @@
 # Full eval-corpus orchestrator.
 #
 # Drives a complete pass against every corpus set the project knows about
-# (OWASP Benchmark v1.2, the NIST SARD subset, and the Nyx benchmark
-# fixtures), then emits `tests/eval_corpus/results.json` for reports,
-# diffs, and docs.
+# (OWASP Benchmark v1.2, the NIST SARD subset, OWASP NodeGoat + Juice Shop,
+# and the Nyx benchmark fixtures), then emits `tests/eval_corpus/results.json`
+# for reports, diffs, and docs.
 #
 # Usage:
 #   tests/eval_corpus/run_full.sh [--nyx BIN] [--budget FILE] [--diff FILE]
@ -70,7 +70,7 @@ set +e
 NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \
  bash "${SCRIPT_DIR}/run.sh" \
    --nyx     "$NYX_BIN" \
-    --sets    owasp,sard,inhouse \
+    --sets    owasp,sard,nodegoat,juiceshop,inhouse \
    --output  "$OUTPUT_DIR" \
    --budget  "$BUDGET_FILE" \
    ${DIFF_FILE:+--diff "$DIFF_FILE"}
--- a/tests/eval_corpus/test_manifest_gt_convert.py
+++ b/tests/eval_corpus/test_manifest_gt_convert.py
@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+"""
+Phase 28 (Track R.1) regression test for tests/eval_corpus/manifest_gt_convert.py.
+
+Proves the manifest -> ground-truth converter is non-vacuous:
+  * a well-formed manifest converts to the expected sorted JSON,
+  * --corpus-dir validation passes when every labelled path exists and
+    produces byte-identical output to the no-corpus transform (so the CI
+    in-sync guard, which diffs committed vs a validated regen, is sound),
+  * --corpus-dir validation HARD-ERRORS (exit 2) on a missing path,
+  * an unknown cap / duplicate (path,cap) / malformed TOML are rejected,
+  * the committed nodegoat.json / juiceshop.json are exactly what a fresh
+    conversion of their manifests produces (offline half of the CI guard).
+
+Run with::
+
+    python3 tests/eval_corpus/test_manifest_gt_convert.py
+
+Exits 0 when every assertion holds, non-zero otherwise.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+# Repository root: this file sits in tests/eval_corpus/, two levels below it.
+REPO = Path(__file__).resolve().parents[2]
+# The script under test; run_convert() executes it as a subprocess.
+CONVERT = REPO / "tests/eval_corpus/manifest_gt_convert.py"
+# Directory holding the committed manifests and their ground-truth JSON.
+GT_DIR = REPO / "tests/eval_corpus/ground_truth"
+
+# Well-formed three-entry fixture.  The entries are not listed in
+# (path, cap) order and one of them is a `vuln = false` control, which lets
+# the tests observe both the converter's sort and a non-vulnerable record.
+GOOD_MANIFEST = """\
+corpus = "demo"
+upstream = "https://example.test/demo"
+pinned_ref = "v1"
+
+[[entry]]
+path = "routes/login.ts"
+cap = "sqli"
+vuln = true
+note = "raw SQL string-concat in login"
+
+[[entry]]
+path = "app/routes/contributions.js"
+cap = "cmdi"
+vuln = true
+note = "eval of user input"
+
+[[entry]]
+path = "lib/insecurity.ts"
+cap = "crypto"
+vuln = false
+note = "benign control example"
+"""
+
+
+def run_convert(*args: str) -> subprocess.CompletedProcess:
+    """Run the converter script in a child interpreter with `args`.
+
+    stdout and stderr are captured as text; the exit status is not checked
+    here so each test can assert on it.
+    """
+    command = [sys.executable, str(CONVERT)]
+    command.extend(args)
+    return subprocess.run(command, capture_output=True, text=True)
+
+
+def test_transform_is_sorted_and_schema_clean(tmp: Path) -> None:
+    """A valid manifest converts to (path, cap)-sorted four-field records."""
+    manifest_path = tmp / "demo.manifest.toml"
+    output_path = tmp / "demo.json"
+    manifest_path.write_text(GOOD_MANIFEST)
+
+    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
+    assert result.returncode == 0, result.stdout + result.stderr
+
+    records = json.loads(output_path.read_text())
+    # GOOD_MANIFEST lists login.ts first, so this order shows the sort ran.
+    expected_paths = [
+        "app/routes/contributions.js",
+        "lib/insecurity.ts",
+        "routes/login.ts",
+    ]
+    assert [rec["path"] for rec in records] == expected_paths, records
+    for rec in records:
+        # Exactly the ground-truth schema: `note` must not leak through.
+        assert set(rec) == {"path", "line", "cap", "vuln"}, rec
+        assert rec["line"] == 0, rec
+    first, second = records[0], records[1]
+    assert first["cap"] == "cmdi" and first["vuln"] is True
+    assert second["cap"] == "crypto" and second["vuln"] is False
+
+
+def test_corpus_validation_passes_and_matches_no_corpus(tmp: Path) -> None:
+    """--corpus-dir succeeds on a complete tree and leaves the output unchanged."""
+    manifest_path = tmp / "demo.manifest.toml"
+    manifest_path.write_text(GOOD_MANIFEST)
+
+    # Stub out every path GOOD_MANIFEST labels so validation finds them all.
+    corpus_root = tmp / "corpus"
+    labelled = ["routes/login.ts", "app/routes/contributions.js", "lib/insecurity.ts"]
+    for relative in labelled:
+        stub = corpus_root / relative
+        stub.parent.mkdir(parents=True, exist_ok=True)
+        stub.write_text("// stub\n")
+
+    plain_out = tmp / "no_corpus.json"
+    checked_out = tmp / "with_corpus.json"
+    plain = run_convert("--manifest", str(manifest_path), "--output", str(plain_out))
+    assert plain.returncode == 0
+    checked = run_convert(
+        "--manifest", str(manifest_path),
+        "--corpus-dir", str(corpus_root),
+        "--output", str(checked_out),
+    )
+    assert checked.returncode == 0, checked.stdout + checked.stderr
+    # The CI guard diffs the committed JSON against a validated regen, which
+    # is only sound if validation never alters what gets written.
+    assert plain_out.read_text() == checked_out.read_text()
+    assert "validated against" in checked.stdout, checked.stdout
+
+
+def test_missing_path_exits_2(tmp: Path) -> None:
+    """A labelled path absent from --corpus-dir makes the converter exit 2."""
+    manifest_path = tmp / "demo.manifest.toml"
+    manifest_path.write_text(GOOD_MANIFEST)
+
+    # lib/insecurity.ts is left out on purpose; it is the path that must trip.
+    corpus_root = tmp / "corpus"
+    present = ["routes/login.ts", "app/routes/contributions.js"]
+    for relative in present:
+        stub = corpus_root / relative
+        stub.parent.mkdir(parents=True, exist_ok=True)
+        stub.write_text("// stub\n")
+
+    output_path = tmp / "demo.json"
+    result = run_convert(
+        "--manifest", str(manifest_path),
+        "--corpus-dir", str(corpus_root),
+        "--output", str(output_path),
+    )
+    assert result.returncode == 2, result.stdout + result.stderr
+    names_the_file = "lib/insecurity.ts" in result.stderr
+    assert names_the_file and "missing" in result.stderr, result.stderr
+
+
+def test_unknown_cap_rejected(tmp: Path) -> None:
+    """A cap label outside the converter's known set is rejected with exit 1."""
+    manifest_path = tmp / "bad_cap.manifest.toml"
+    lines = ["[[entry]]", 'path = "a.js"', 'cap = "not_a_cap"', "vuln = true"]
+    manifest_path.write_text("\n".join(lines) + "\n")
+
+    output_path = tmp / "out.json"
+    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
+    assert result.returncode == 1, result.stdout + result.stderr
+    assert "not a known nyx cap" in result.stderr, result.stderr
+
+
+def test_duplicate_path_cap_rejected(tmp: Path) -> None:
+    """Two entries with the same (path, cap) are rejected with exit 1."""
+    manifest_path = tmp / "dup.manifest.toml"
+    one_entry = '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
+    # The same table twice is the smallest possible duplicate.
+    manifest_path.write_text(one_entry * 2)
+
+    output_path = tmp / "out.json"
+    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
+    assert result.returncode == 1, result.stdout + result.stderr
+    assert "duplicate" in result.stderr, result.stderr
+
+
+def test_malformed_manifest_exits_1(tmp: Path) -> None:
+    """A manifest that is not valid TOML makes the converter exit 1."""
+    manifest_path = tmp / "broken.toml"
+    output_path = tmp / "out.json"
+    # Unclosed table header plus a key with no value: invalid TOML.
+    manifest_path.write_text("[[entry]\npath = \n")
+
+    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
+    assert result.returncode == 1, result.stdout + result.stderr
+    assert "malformed" in result.stderr, result.stderr
+
+
+def test_empty_manifest_exits_1(tmp: Path) -> None:
+    """A valid manifest without any [[entry]] table makes the converter exit 1."""
+    manifest_path = tmp / "empty.toml"
+    output_path = tmp / "out.json"
+    # Parses fine as TOML but carries nothing to convert.
+    manifest_path.write_text('corpus = "x"\n')
+
+    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
+    assert result.returncode == 1, result.stdout + result.stderr
+    assert "no [[entry]]" in result.stderr, result.stderr
+
+
+def test_committed_gt_matches_manifest(tmp: Path) -> None:
+    """The committed ground-truth JSON equals a fresh conversion of its manifest.
+
+    Offline half of the CI in-sync guard: it catches a manifest edit that was
+    not followed by a regenerate.  The comparison is on file text rather than
+    parsed JSON, so a committed file that differs only in formatting or key
+    order is also flagged here, not first by the CI byte-equality check.
+    """
+    for name in ("nodegoat", "juiceshop"):
+        man = GT_DIR / f"{name}.manifest.toml"
+        committed = GT_DIR / f"{name}.json"
+        assert man.exists(), f"missing manifest: {man}"
+        assert committed.exists(), f"missing committed GT: {committed}"
+        regen = tmp / f"{name}.json"
+        proc = run_convert("--manifest", str(man), "--output", str(regen))
+        assert proc.returncode == 0, proc.stdout + proc.stderr
+        assert regen.read_text() == committed.read_text(), (
+            f"{committed} is stale — regenerate with manifest_gt_convert.py"
+        )
+
+
+def main() -> int:
+    """Run every regression check in order, each in its own scratch directory.
+
+    A failing assertion propagates out of this function; 0 is returned only
+    when every check has passed.
+    """
+    checks = (
+        test_transform_is_sorted_and_schema_clean,
+        test_corpus_validation_passes_and_matches_no_corpus,
+        test_missing_path_exits_2,
+        test_unknown_cap_rejected,
+        test_duplicate_path_cap_rejected,
+        test_malformed_manifest_exits_1,
+        test_empty_manifest_exits_1,
+        test_committed_gt_matches_manifest,
+    )
+    with tempfile.TemporaryDirectory() as scratch_root:
+        root = Path(scratch_root)
+        for check in checks:
+            # A per-check subdirectory keeps same-named fixture files apart.
+            workdir = root / check.__name__
+            workdir.mkdir()
+            print(f"... {check.__name__}")
+            check(workdir)
+            print("    OK")
+    print("\nAll manifest_gt_convert.py regression checks passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())