chore: remove stale scheduled_tasks.lock file

This commit is contained in:
elipeter 2026-05-31 21:18:38 -05:00
parent a5929bb169
commit 2a4d49b68b
12 changed files with 1059 additions and 21 deletions

View file

@ -1,18 +1,25 @@
# Phase 27 (Track R.0): OWASP Benchmark v1.2 real-corpus acceptance.
# Real-corpus acceptance (Track R).
#
# Runs Gate 6 of scripts/m7_ship_gate.sh against a real OWASP BenchmarkJava
# checkout on every PR that touches the dynamic verifier (src/dynamic/), the
# * owasp (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava
# checkout (Java).
# * jsts (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js)
# and OWASP Juice Shop (TypeScript, .ts), one matrix row per corpus.
#
# Runs on every PR that touches the dynamic verifier (src/dynamic/), the
# eval-corpus harness (tests/eval_corpus/), or the gate script itself.
#
# Gate 6 enforces, against the committed ground truth:
# Each gate enforces, against the committed ground truth:
# * verify wall-clock <= 15 min (CI budget; the dev reference is 10 min),
# * per-cap confirmed-rate >= 40%, precision >= 0.85, recall >= 0.40 for the
# dynamically-supported OWASP caps,
# * the per-(cap,lang) budget in tests/eval_corpus/budget.toml.
# * the per-(cap,lang) budget in tests/eval_corpus/budget.toml,
# * per-cap confirmed-rate / precision / recall — hard-gated only for caps
# in NYX_*_FLOOR_CAPS (empty by default → published report-only until a
# cap Confirms end to end), with destinations >= 40% / >= 0.85 / >= 0.40.
#
# The corpus is *not* vendored. It is cloned at the pinned 1.2beta tag (the
# tag that produced expectedresults-1.2beta.csv, the source of the ground
# truth) and cached so reruns skip the clone.
# No corpus is vendored. Each is cloned at a pinned ref and cached so reruns
# skip the clone. Before the gate runs, the committed ground truth is
# regenerated from its source against the fresh clone and asserted in sync,
# and the converter hard-errors on any labelled path missing from the corpus,
# so a corpus bump that drifts the labels fails the job loudly.
name: eval
@ -99,7 +106,98 @@ jobs:
PY
- name: eval-corpus harness regression tests
run: python3 tests/eval_corpus/test_tabulate_regression.py
run: |
python3 tests/eval_corpus/test_tabulate_regression.py
python3 tests/eval_corpus/test_manifest_gt_convert.py
- name: Gate 6 — OWASP Benchmark v1.2 acceptance
run: scripts/m7_ship_gate.sh --sets owasp
jsts:
name: eval / ${{ matrix.corpus.name }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
corpus:
- name: nodegoat
repo: https://github.com/OWASP/NodeGoat
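# NodeGoat ships no release tags, so this row tracks the default
# branch. The cache key only holds the checkout stable while the
# cache entry exists: on a cache miss the clone step below fetches
# the current tip of master. The manifest's path layout
# (app/, config/) has been constant for years.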
ref: master
env: NYX_NODEGOAT_CORPUS
manifest: nodegoat.manifest.toml
ground_truth: nodegoat.json
- name: juiceshop
repo: https://github.com/juice-shop/juice-shop
ref: v15.0.0
env: NYX_JUICESHOP_CORPUS
manifest: juiceshop.manifest.toml
ground_truth: juiceshop.json
env:
# CI wall-clock budget: 15 min. Override locally to tighten.
NYX_JSTS_WALLCLOCK_BUDGET_SECONDS: "900"
steps:
- uses: actions/checkout@v6
- uses: actions-rust-lang/setup-rust-toolchain@v1
with:
toolchain: stable
cache: true
- uses: taiki-e/install-action@nextest
# The dynamic verifier's Node build pool (Phase 23) compiles its
# harnesses with a real node/npm toolchain.
- name: Set up Node 20
uses: actions/setup-node@v4
with:
node-version: "20"
- name: Cache ${{ matrix.corpus.name }}
id: cache-corpus
uses: actions/cache@v4
with:
path: .eval-corpus/${{ matrix.corpus.name }}
key: jsts-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}
- name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
if: steps.cache-corpus.outputs.cache-hit != 'true'
run: |
git clone --depth 1 --branch ${{ matrix.corpus.ref }} \
${{ matrix.corpus.repo }} \
.eval-corpus/${{ matrix.corpus.name }}
# No-compromise guard: the committed ground truth must be exactly what a
# fresh conversion of the curated manifest produces *against this
# corpus*. manifest_gt_convert.py hard-errors on any labelled path that
# no longer exists in the clone (corpus drift / typo), and the diff
# below catches a stale committed JSON.
- name: Verify ground truth is in sync with the pinned corpus
run: |
python3 tests/eval_corpus/manifest_gt_convert.py \
--manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \
--corpus-dir .eval-corpus/${{ matrix.corpus.name }} \
--output /tmp/${{ matrix.corpus.name }}_gt_regen.json
python3 - <<'PY'
import json, sys
name = "${{ matrix.corpus.ground_truth }}"
committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}"))
regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json"))
if committed != regen:
sys.exit("committed ground truth diverges from a fresh conversion of "
"the manifest against the pinned corpus; regenerate with "
"manifest_gt_convert.py")
print(f"ground truth in sync: {len(committed)} records")
PY
- name: eval-corpus harness regression tests
run: |
python3 tests/eval_corpus/test_tabulate_regression.py
python3 tests/eval_corpus/test_manifest_gt_convert.py
- name: Gate 7 — ${{ matrix.corpus.name }} acceptance
run: |
export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}"
scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }}

View file

@ -6,6 +6,8 @@
# scripts/m7_ship_gate.sh # every gate
# scripts/m7_ship_gate.sh --gates 3,6 # only gates 3 + 6
# scripts/m7_ship_gate.sh --sets owasp # Java OWASP corpus only
# scripts/m7_ship_gate.sh --sets jsts # NodeGoat + Juice Shop only
# scripts/m7_ship_gate.sh --sets nodegoat # one JS/TS corpus only
#
# Gate map (kept in sync with .pitboss/play/plan.md track M.7):
# Gate 1: Static-only scan is green on `tests/benchmark/corpus`.
@ -26,13 +28,22 @@
# R.0) added the precision/recall/budget ratchet. The corpus is
# *not* checked into the repo; the gate skips with a clear message
# when `NYX_OWASP_CORPUS` does not point at a real checkout.
# Gate 7: JS/TS real-corpus acceptance (Track R.1 / Phase 28). OWASP
# NodeGoat (Express, .js) + OWASP Juice Shop (TypeScript, .ts)
# `--verify` against the committed ground truth. Same shape as
# Gate 6: wall-clock budget + the per-(cap,lang) budget in
# tests/eval_corpus/budget.toml hard-enforced; per-cap
# confirmed-rate / precision / recall published report-only
# (NYX_JSTS_FLOOR_CAPS empty by default). Each corpus row
# self-skips unless its NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS
# points at a real checkout.
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${REPO_ROOT}"
GATES="1,2,3,4,5,6"
GATES="1,2,3,4,5,6,7"
SETS=""
while [[ $# -gt 0 ]]; do
@ -56,10 +67,15 @@ while [[ $# -gt 0 ]]; do
esac
done
# When `--sets owasp` is passed CI only wants Gate 6.
if [[ "${SETS}" == "owasp" ]]; then
GATES="6"
fi
# `--sets` lets CI run a single real-corpus gate. `owasp` -> Gate 6;
# `jsts` (both JS/TS corpora) / `nodegoat` / `juiceshop` -> Gate 7, with the
# corpus name passed through so Gate 7 runs only the requested row.
case "${SETS}" in
owasp) GATES="6" ;;
jsts|nodegoat|juiceshop) GATES="7" ;;
"") ;; # no --sets: run the requested --gates
*) echo "unknown --sets: ${SETS}" >&2; exit 2 ;;
esac
want_gate() {
[[ ",${GATES}," == *",$1,"* ]]
@ -292,6 +308,162 @@ PY
echo " PASS"
}
# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ──────────────
# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora. Same
# wall-clock split (10 min dev reference / 15 min CI) and the same
# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the
# per-cap confirmed-rate / precision / recall numbers are published but gate
# nothing, while the per-(cap,lang) budget (unsupported_rate,
# false_confirmed_rate) is hard-enforced. Promote a cap into the floor set
# once it starts Confirming end to end.
GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}"
GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}"
GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}"
GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}"
GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}"
GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
# Run one real-corpus `--verify` row: scan under a wall-clock guard,
# tabulate against the committed ground truth, enforce the per-cell budget,
# publish (or, when floor caps are set, enforce) the per-cap floors.
#   $1 label   $2 corpus dir   $3 ground-truth json
# Returns 0 on pass, 1 on fail. Caller decides skip.
#
# Scan exit statuses as seen by this function:
#   0 / 1  accepted as a completed scan
#   124    the Python wrapper killed the scan at the wall-clock budget
#   127    the wrapper could not start the scan or open the report file
#   other  reported verbatim as a failure
_gate7_run_corpus() {
    local label="$1" corpus="$2" gt="$3"
    local scan_report="/tmp/m7_gate7_${label}_scan.json"
    local results_report="/tmp/m7_gate7_${label}_results.json"
    local wallclock_report="/tmp/m7_gate7_${label}_wallclock.txt"
    local gate_home="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_home"
    local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_build_pool"
    local wallclock nyx_exit

    # This function is invoked from an `if`, where errexit is suspended, so
    # failures that matter are checked explicitly rather than left to `set -e`.
    mkdir -p "${gate_home}" "${gate_build_pool}" \
        || { echo " FAIL: ${label} could not create scratch directories"; return 1; }
    rm -f "${scan_report}" "${results_report}" "${wallclock_report}"

    # The scan runs with an isolated HOME and build pool. The Python wrapper
    # enforces the timeout and always records the elapsed time, even when the
    # scan fails or is killed.
    set +e
    HOME="${gate_home}" \
    NYX_BUILD_POOL_DIR="${gate_build_pool}" \
    python3 - "${GATE7_WALLCLOCK_BUDGET}" "${scan_report}" "${wallclock_report}" \
        "${REPO_ROOT}/target/release/nyx" scan \
        --verify \
        --index off \
        --format json \
        --quiet \
        "${corpus}" <<'PY'
import subprocess
import sys
import time

budget = float(sys.argv[1])
scan_report = sys.argv[2]
wallclock_report = sys.argv[3]
cmd = sys.argv[4:]
start = time.monotonic()
rc = 0
try:
    with open(scan_report, "wb") as out:
        completed = subprocess.run(cmd, stdout=out, timeout=budget)
    rc = completed.returncode
except subprocess.TimeoutExpired:
    rc = 124
except OSError as exc:
    # The scan never ran (missing / non-executable binary, unwritable
    # report). Use a distinct status so it cannot be mistaken for the
    # scan's own exit 1, which the caller accepts.
    print(f"error: could not run scan: {exc}", file=sys.stderr)
    rc = 127
finally:
    elapsed = time.monotonic() - start
    with open(wallclock_report, "w") as f:
        f.write(f"{elapsed:.1f}\n")
sys.exit(rc)
PY
    nyx_exit=$?
    set -e

    # Fall back to the budget itself when the wrapper never wrote a timing.
    wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${GATE7_WALLCLOCK_BUDGET}")"
    echo " ${label} verify wall-clock: ${wallclock}s (budget ${GATE7_WALLCLOCK_BUDGET}s)"
    if [[ ${nyx_exit} -eq 124 ]]; then
        echo " FAIL: ${label} scan exceeded wall-clock budget"
        return 1
    fi
    if [[ ${nyx_exit} -ne 0 && ${nyx_exit} -ne 1 ]]; then
        echo " FAIL: ${label} scan exited ${nyx_exit}"
        return 1
    fi
    if [[ ! -s "${scan_report}" ]]; then
        echo " FAIL: ${label} scan produced no JSON report"
        return 1
    fi
    # Numeric comparison in awk: the recorded wall-clock is fractional.
    awk -v w="${wallclock}" -v b="${GATE7_WALLCLOCK_BUDGET}" \
        'BEGIN { if (w+0 > b+0) exit 1 }' \
        || { echo " FAIL: ${label} wall-clock exceeds budget"; return 1; }

    # Start from an empty results list; tabulate.py appends this row to it.
    echo "[]" > "${results_report}"
    python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" \
        --label "${label}" \
        --scan "${scan_report}" \
        --ground-truth "${gt}" \
        --append "${results_report}" \
        || { echo " FAIL: ${label} result tabulation failed"; return 1; }

    local -a report_args=(
        --results "${results_report}"
        --budget "${GATE7_BUDGET}"
    )
    # Per-cap floors are only passed to report.py when a floor-cap set is
    # configured; otherwise the numbers are published without gating.
    if [[ -n "${GATE7_FLOOR_CAPS}" ]]; then
        report_args+=(
            --floor-caps "${GATE7_FLOOR_CAPS}"
            --min-confirmed-rate "${GATE7_CONFIRMED_RATE_TARGET}"
            --min-precision "${GATE7_PRECISION_TARGET}"
            --min-recall "${GATE7_RECALL_TARGET}"
        )
        echo " enforcing per-cap floors (confirmed >= ${GATE7_CONFIRMED_RATE_TARGET}, precision >= ${GATE7_PRECISION_TARGET}, recall >= ${GATE7_RECALL_TARGET}) on: ${GATE7_FLOOR_CAPS}"
    else
        echo " per-cap confirmed/precision/recall: report-only (NYX_JSTS_FLOOR_CAPS unset)"
    fi
    python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \
        || { echo " FAIL: ${label} per-cell budget exceeded or a gated per-cap floor missed"; return 1; }
    return 0
}
# Gate 7 driver: build nyx with the dynamic verifier, then run every JS/TS
# corpus row selected by --sets whose corpus env var points at a directory.
# Returns 0 when every row that ran passed (or when no row was configured),
# 1 when the build failed or any row failed.
gate_7_jsts_scale() {
    echo "── Gate 7: JS/TS real-corpus (NodeGoat + Juice Shop) verify acceptance ──"
    # Check the build explicitly instead of relying on errexit, which bash
    # suspends when a function is called from a conditional context; a broken
    # build must never let the rows run against a stale or missing binary.
    cargo build --release --quiet --features dynamic \
        || { echo " FAIL: cargo build --release --features dynamic failed"; return 1; }

    # name : env var holding the corpus dir : committed ground-truth file
    local -a rows=(
        "nodegoat:NYX_NODEGOAT_CORPUS:nodegoat.json"
        "juiceshop:NYX_JUICESHOP_CORPUS:juiceshop.json"
    )
    local any_ran=0 any_failed=0
    local row name envvar gtfile corpus
    for row in "${rows[@]}"; do
        IFS=: read -r name envvar gtfile <<<"${row}"
        # When --sets names a single corpus, only run that row.
        if [[ -n "${SETS}" && "${SETS}" != "jsts" && "${SETS}" != "${name}" ]]; then
            continue
        fi
        # Indirect expansion: read the variable whose name is in ${envvar}.
        corpus="${!envvar:-}"
        if [[ -z "${corpus}" || ! -d "${corpus}" ]]; then
            echo " SKIP ${name}: set ${envvar} to a checkout to run this row."
            continue
        fi
        any_ran=1
        echo " ── ${name} (${corpus}) ──"
        if _gate7_run_corpus "${name}" "${corpus}" \
            "${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}"; then
            echo " PASS ${name}"
        else
            # Keep going so every configured row is reported before failing.
            any_failed=1
        fi
    done

    if [[ ${any_ran} -eq 0 ]]; then
        echo " SKIP: no JS/TS corpus configured (set NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS)."
        echo " (Gate 7 is Phase 28's headline acceptance for the JS/TS real corpora.)"
        return 0
    fi
    [[ ${any_failed} -eq 0 ]] || return 1
    echo " PASS"
}
# ── Driver ────────────────────────────────────────────────────────────────────
declare -a FAILED=()
@ -310,6 +482,7 @@ run_gate 3 verify_ratio
run_gate 4 sarif_schema
run_gate 5 layering
run_gate 6 owasp_scale
run_gate 7 jsts_scale
if [[ ${#FAILED[@]} -gt 0 ]]; then
echo

View file

@ -116,3 +116,87 @@ cap = "auth"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# ── NodeGoat / Juice Shop (JS/TS) — Track R.1 ratchet ────────────────────────
#
# Phase 28 wires two intentionally-vulnerable JS/TS apps into the same
# acceptance machinery as OWASP Benchmark: OWASP NodeGoat (Express, .js)
# and OWASP Juice Shop (TypeScript, .ts). Unlike OWASP Benchmark, neither
# app ships vuln/benign *pairs* — every labelled file is `vuln = true` (see
# ground_truth/{nodegoat,juiceshop}.manifest.toml). Two consequences for
# these cells:
#
# * false_confirmed_rate (<= 2%) is the headline maximum the verifier
# already satisfies and is HARD-enforced: it only trips when a Confirmed
# finding lands on a file with no ground-truth positive, i.e. an
# over-confirm. With the verifier confirming little on real corpora yet
# it is satisfied, and it ratchets precision as confirms grow.
# * unsupported_rate (<= 20%) is HARD-enforced too. `Unsupported` counts
# only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable —
# a narrow bucket that Tracks J + M shrank — *not* BuildFailed /
# SpecDerivationFailed (those are Inconclusive), so it stays low.
#
# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the
# Phase 28 acceptance DESTINATIONS. They are intentionally left UNSET here
# and published report-only by Gate 7 (NYX_JSTS_FLOOR_CAPS empty by default,
# mirroring NYX_OWASP_FLOOR_CAPS) because (a) the verifier does not yet
# Confirm these corpora end to end and (b) the manifest labels canonical
# vulns only, so precision vs partial ground truth is informational until
# the labels are completed. Promote a cap into the floor set the moment it
# starts Confirming, exactly as for OWASP.
# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml.
[[cell]]
cap = "cmdi"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "xss"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "unauthorized_id"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "crypto"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml.
[[cell]]
cap = "sqli"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "path_traversal"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "redirect"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "ssrf"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "crypto"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

View file

@ -34,3 +34,38 @@ python3 tests/eval_corpus/owasp_gt_convert.py \
File: `nist_sard.json`
Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`.
## OWASP NodeGoat / OWASP Juice Shop (JS/TS — Track R.1)
Files: `nodegoat.json` (Express, `.js`), `juiceshop.json` (TypeScript, `.ts`).
Same four-field format as above; all records are `vuln: true`.
These two apps are intentionally vulnerable end to end, so — unlike OWASP
Benchmark — they ship no machine-readable per-file vuln labels and have no
benign-control files to pair against. The authoritative source is a curated
TOML manifest committed here, one `[[entry]]` per known-vulnerable handler
with a `note` citing why:
- `nodegoat.manifest.toml`
- `juiceshop.manifest.toml`
`manifest_gt_convert.py` turns a manifest into the committed `.json`:
```sh
python3 tests/eval_corpus/manifest_gt_convert.py \
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \
--output tests/eval_corpus/ground_truth/nodegoat.json
```
Pass `--corpus-dir <clone>` to validate every labelled path against a real
checkout. The converter exits non-zero if any path is missing, so a corpus
bump that moves a handler fails loudly instead of silently dropping recall.
CI (`.github/workflows/eval.yml`, `jsts` job) regenerates each `.json`
against a fresh clone of the pinned ref and asserts it matches the committed
file.
Because the manifests label canonical vulns only, recall (did nyx catch the
known vulns) is the meaningful metric; precision vs this partial ground
truth is informational. Gate 7 publishes per-cap precision/recall/confirmed
report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP
gate.

View file

@ -0,0 +1,38 @@
[
{
"path": "lib/insecurity.ts",
"line": 0,
"cap": "crypto",
"vuln": true
},
{
"path": "routes/fileServer.ts",
"line": 0,
"cap": "path_traversal",
"vuln": true
},
{
"path": "routes/login.ts",
"line": 0,
"cap": "sqli",
"vuln": true
},
{
"path": "routes/profileImageUrlUpload.ts",
"line": 0,
"cap": "ssrf",
"vuln": true
},
{
"path": "routes/redirect.ts",
"line": 0,
"cap": "redirect",
"vuln": true
},
{
"path": "routes/search.ts",
"line": 0,
"cap": "sqli",
"vuln": true
}
]

View file

@ -0,0 +1,66 @@
# OWASP Juice Shop — curated vuln ground-truth manifest (Phase 28, Track R.1).
#
# Juice Shop is an intentionally-vulnerable TypeScript/Express + Angular
# app. Its `data/static/challenges.yml` enumerates challenges but pins no
# source file/line, so it cannot drive file-level ground truth on its own.
# This manifest IS the authoritative source: one [[entry]] per known-
# vulnerable server-side handler, curated from the project's own challenge
# definitions + companion guide, each with a `note` citing the challenge.
#
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
# ground_truth/juiceshop.json. CI regenerates it against a fresh clone of
# the pinned tag and asserts the regenerated records equal the committed
# ones (a parsed-JSON comparison, not a byte diff); the converter
# HARD-ERRORS on any path that no longer exists in the corpus, so a Juice
# Shop bump that refactors a route fails the eval job loudly instead of
# silently dropping recall. Re-pin `pinned_ref` and re-validate the paths
# together.
#
# `cap` is a nyx cap label (tabulate.py). `path` is relative to the Juice
# Shop clone root, POSIX separators. Lang is inferred from the extension
# (.ts -> typescript). All `vuln = true`: Juice Shop is all-vulnerable, so
# there is no benign-control file to pair against. As with NodeGoat,
# precision vs this manifest is informational (an unlabelled finding may be
# a real uncurated vuln, not a false positive) while recall is the
# meaningful floor. See tests/eval_corpus/budget.toml for the gate policy.
corpus = "juiceshop"
upstream = "https://github.com/juice-shop/juice-shop"
# Pinned to a stable release tag. The server-side handlers below
# (routes/*.ts, lib/insecurity.ts) have been stable across the TypeScript
# era of Juice Shop; re-validate if the tag is bumped.
pinned_ref = "v15.0.0"
[[entry]]
path = "routes/login.ts"
cap = "sqli"
vuln = true
note = "login builds a raw `models.sequelize.query(\"... WHERE email = '\" + req.body.email + \"' ...\")` — SQL injection auth bypass (challenge: loginAdmin / loginBender)."
[[entry]]
path = "routes/search.ts"
cap = "sqli"
vuln = true
note = "product search concatenates the `q` criteria into a raw `models.sequelize.query` LIKE clause — UNION-based SQL injection (challenge: unionSqlInjection / dbSchema)."
[[entry]]
path = "routes/fileServer.ts"
cap = "path_traversal"
vuln = true
note = "serveKeyFiles / file download resolves a user-controlled filename under the ftp dir without containment — path traversal (challenge: accessLogDisclosure / forgottenDevBackup)."
[[entry]]
path = "routes/redirect.ts"
cap = "redirect"
vuln = true
note = "redirect endpoint forwards to the `to` query param via an allow-list that is bypassable by substring — open redirect (challenge: redirectCryptoCurrency / redirect)."
[[entry]]
path = "routes/profileImageUrlUpload.ts"
cap = "ssrf"
vuln = true
note = "profile image upload fetches an arbitrary user-supplied imageUrl server-side — SSRF (challenge: ssrf)."
[[entry]]
path = "lib/insecurity.ts"
cap = "crypto"
vuln = true
note = "hardcoded HMAC/JWT key material and weak hashing (md5-based `hash`) — broken cryptography / hardcoded secret (challenge: weakCryptography / jwt*)."

View file

@ -0,0 +1,32 @@
[
{
"path": "app/routes/allocations.js",
"line": 0,
"cap": "unauthorized_id",
"vuln": true
},
{
"path": "app/routes/contributions.js",
"line": 0,
"cap": "cmdi",
"vuln": true
},
{
"path": "app/routes/memos.js",
"line": 0,
"cap": "xss",
"vuln": true
},
{
"path": "app/routes/profile.js",
"line": 0,
"cap": "xss",
"vuln": true
},
{
"path": "config/env/all.js",
"line": 0,
"cap": "crypto",
"vuln": true
}
]

View file

@ -0,0 +1,62 @@
# OWASP NodeGoat — curated vuln ground-truth manifest (Phase 28, Track R.1).
#
# NodeGoat is an intentionally-vulnerable Express/Node app that maps the
# OWASP Top 10 to concrete handlers. It ships no machine-readable per-file
# vuln labels (unlike OWASP Benchmark's expectedresults CSV), so this
# manifest IS the authoritative source: one [[entry]] per known-vulnerable
# location, each curated from the project's own tutorial + the canonical
# vuln walk-through, with a `note` citing why.
#
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
# ground_truth/nodegoat.json. CI regenerates it against a fresh clone of
# the pinned ref and asserts the regenerated records equal the committed
# ones (a parsed-JSON comparison, not a byte diff), and the converter
# HARD-ERRORS on any path that no longer exists in the corpus, so a
# NodeGoat bump that moves a handler fails the eval job loudly rather than
# silently dropping recall. Update `pinned_ref` + the paths together when
# re-pinning.
#
# `cap` is a nyx cap label (tabulate.py). `path` is relative to the
# NodeGoat clone root, POSIX separators. Lang is inferred from the
# extension (.js -> javascript). These are all `vuln = true`: NodeGoat is
# all-vulnerable, so there is no benign-control file to pair against (the
# OWASP Benchmark vuln/benign pairing does not exist here). Precision vs
# this manifest is therefore informational (an unlabelled finding is not
# necessarily a false positive — it may be a real vuln we did not curate),
# while recall (did nyx catch the canonical vulns) is the meaningful floor.
# See tests/eval_corpus/budget.toml for how the gate treats these cells.
corpus = "nodegoat"
upstream = "https://github.com/OWASP/NodeGoat"
# NodeGoat publishes no semver tags; the eval job pins the default branch
# via the CI cache key. The `app/` + `config/` layout below has been
# stable for years; re-validate the paths if the cache key is bumped.
pinned_ref = "master"
[[entry]]
path = "app/routes/contributions.js"
cap = "cmdi"
vuln = true
note = "handleContributionsUpdate eval()s the pre-tax/after-tax/roth form fields — server-side JS injection (OWASP A1 Injection); the textbook NodeGoat RCE."
[[entry]]
path = "app/routes/profile.js"
cap = "xss"
vuln = true
note = "profile fields (firstName/lastName/bankAcc/...) are persisted then rendered unescaped — stored XSS (OWASP A3 / A7 XSS)."
[[entry]]
path = "app/routes/memos.js"
cap = "xss"
vuln = true
note = "memo body is stored and echoed back into the memos view without output encoding — stored XSS."
[[entry]]
path = "app/routes/allocations.js"
cap = "unauthorized_id"
vuln = true
note = "allocations are looked up by a userId taken from the request with no ownership check — insecure direct object reference / broken access control (OWASP A4)."
[[entry]]
path = "config/env/all.js"
cap = "crypto"
vuln = true
note = "hardcoded cookieSecret / session secret committed in source — sensitive-data / weak-secret smell (OWASP A6)."

View file

@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""Convert a curated TOML vuln manifest into nyx ground-truth JSON.
Used for real-world apps that ship **no** machine-readable per-file vuln
labels of their own (OWASP NodeGoat, OWASP Juice Shop). OWASP Benchmark
ships `expectedresults-1.2beta.csv` (see owasp_gt_convert.py); NIST SARD
ships `manifest.xml` (see sard_gt_convert.py). NodeGoat / Juice Shop are
intentionally-vulnerable apps without an equivalent, so the authoritative
source here is a curated manifest committed *in this repo*: one
`[[entry]]` table per known-vulnerable location, each carrying a
provenance `note` so a reviewer can trace why the label is what it is.
Manifest schema (TOML)::
# provenance comments at the top
corpus = "nodegoat" # informational label
upstream = "https://github.com/OWASP/NodeGoat"
pinned_ref = "master@<sha>" # the ref the paths were curated against
[[entry]]
path = "app/routes/contributions.js" # relative to the corpus root, POSIX
cap = "cmdi" # a nyx cap label (tabulate.py)
vuln = true # true = real vuln, false = benign control
note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)"
Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}`
records, sorted by `(path, cap)` for deterministic, diff-stable JSON.
`note` is intentionally dropped: the ground-truth JSON keeps the exact
same four-field schema OWASP/SARD produce, so tabulate.py needs no special
casing. `line` is always 0 (the manifest pins a file, not a line;
tabulate.py matches file+cap and treats line 0 as "any line").
Path validation (the no-compromise guard). When `--corpus-dir` is given,
**every** manifest path must resolve to a real file under that root or the
converter exits non-zero. CI runs the converter against a fresh clone of
the pinned corpus and then asserts the committed JSON and the regenerated
JSON decode to equal records (a parsed comparison, not a byte diff), so a
corpus bump that moves/renames/deletes a labelled file (or a typo'd path)
fails the build loudly instead of silently degrading recall. Authoring the
committed JSON offline (no corpus on hand) is done by omitting
`--corpus-dir`: the transform is identical, only the existence check is
skipped.
Usage::
# author / regenerate the committed JSON offline (no validation):
tests/eval_corpus/manifest_gt_convert.py \\
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
--output tests/eval_corpus/ground_truth/nodegoat.json
# CI: validate every path against a real checkout, then diff vs committed:
tests/eval_corpus/manifest_gt_convert.py \\
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
--corpus-dir ~/.cache/nyx/eval_corpus/nodegoat \\
--output /tmp/nodegoat_regen.json
"""
import argparse
import json
import sys
from pathlib import Path
try:
import tomllib # Python 3.11+
except ModuleNotFoundError: # pragma: no cover — older interpreters only
import tomli as tomllib # type: ignore[no-redef]
# nyx cap labels (see tabulate.py _CAP_BIT_TABLE / _CAP_RULE_TABLE). A
# manifest cap outside this set is almost always a typo, so reject it at
# conversion time rather than letting a never-matching cap silently sink
# recall.
VALID_CAPS = {
"path_traversal",
"fmt_string",
"sqli",
"deserialize",
"ssrf",
"cmdi",
"crypto",
"unauthorized_id",
"data_exfil",
"ldap_injection",
"xpath_injection",
"header_injection",
"redirect",
"xss",
"xxe",
"prototype_pollution",
"auth",
"memory",
"validation",
}
def load_manifest(path: Path) -> dict:
try:
with open(path, "rb") as f:
return tomllib.load(f)
except FileNotFoundError:
print(f"error: manifest not found: {path}", file=sys.stderr)
raise SystemExit(1)
except tomllib.TOMLDecodeError as e:
print(f"error: manifest malformed: {path}: {e}", file=sys.stderr)
raise SystemExit(1)
def main() -> int:
    """Convert a curated TOML manifest into ground-truth JSON.

    Reads ``--manifest``, validates every ``[[entry]]`` table, optionally
    checks each labelled path against ``--corpus-dir``, and writes the
    ``{path, line, cap, vuln}`` records to ``--output`` sorted by
    ``(path, cap)``.

    Returns:
        0 on success.
        1 when the manifest has no usable ``[[entry]]`` list, the corpus dir
        does not exist, or an entry is malformed (wrong field types, unknown
        cap, duplicate ``(path, cap)``, or a path that is absolute or
        contains a ``..`` segment).
        2 when ``--corpus-dir`` is set and at least one labelled path is not
        a file under it.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--manifest", required=True, help="curated TOML manifest path")
    p.add_argument("--output", required=True, help="output ground-truth JSON path")
    p.add_argument(
        "--corpus-dir",
        default="",
        help=(
            "when set, every manifest path must resolve to a real file under "
            "this root or the converter exits 2 (the CI corpus-drift guard)"
        ),
    )
    args = p.parse_args()

    manifest = load_manifest(Path(args.manifest).expanduser())
    entries = manifest.get("entry", []) or []
    # `[[entry]]` decodes to a list of tables. A bare `[entry]` table or a
    # scalar `entry = ...` is not a usable entry list, so reject it here
    # instead of crashing inside the loop.
    if not isinstance(entries, list) or not entries:
        print(f"error: manifest has no [[entry]] tables: {args.manifest}", file=sys.stderr)
        return 1

    corpus = Path(args.corpus_dir).expanduser().resolve() if args.corpus_dir else None
    if corpus is not None and not corpus.is_dir():
        print(f"error: corpus dir not found: {args.corpus_dir}", file=sys.stderr)
        return 1

    records: list[dict] = []
    missing: list[str] = []
    seen: set[tuple[str, str]] = set()
    for i, e in enumerate(entries):
        fields = e if isinstance(e, dict) else {}
        path = fields.get("path")
        cap = fields.get("cap")
        vuln = fields.get("vuln")
        # Enforce the types the message promises: a non-string path would
        # crash on .replace() and an unhashable cap on the set lookup below.
        well_formed = (
            isinstance(path, str)
            and bool(path)
            and isinstance(cap, str)
            and bool(cap)
            and isinstance(vuln, bool)
        )
        if not well_formed:
            print(
                f"error: entry #{i} needs string path, string cap, bool vuln: {e!r}",
                file=sys.stderr,
            )
            return 1
        if cap not in VALID_CAPS:
            print(
                f"error: entry #{i} cap {cap!r} is not a known nyx cap "
                f"(path {path!r}); fix the manifest",
                file=sys.stderr,
            )
            return 1
        norm = path.replace("\\", "/")
        # Paths are documented as relative to the corpus root. An absolute
        # path or a `..` segment would let `corpus / norm` point outside the
        # corpus and still pass the existence check below.
        if norm.startswith("/") or ".." in norm.split("/"):
            print(
                f"error: entry #{i} path {norm!r} must be relative to the "
                f"corpus root with no '..' segments",
                file=sys.stderr,
            )
            return 1
        key = (norm, cap)
        if key in seen:
            print(
                f"error: duplicate (path, cap) entry: {norm!r} / {cap!r}",
                file=sys.stderr,
            )
            return 1
        seen.add(key)
        # Collect every missing path so one run reports all of the drift.
        if corpus is not None and not (corpus / norm).is_file():
            missing.append(norm)
        records.append({"path": norm, "line": 0, "cap": cap, "vuln": vuln})

    if missing:
        print(
            f"error: {len(missing)} manifest path(s) absent from {corpus} "
            f"(corpus drift or typo) — regenerate the manifest against the "
            f"pinned ref:",
            file=sys.stderr,
        )
        for m in missing:
            print(f"  missing: {m}", file=sys.stderr)
        return 2

    # Deterministic order so the committed JSON is diff-stable and the CI
    # in-sync guard (which compares the decoded record lists) is meaningful
    # regardless of manifest ordering.
    records.sort(key=lambda r: (r["path"], r["cap"]))

    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    # json.dump escapes non-ASCII by default, so the bytes written do not
    # depend on the locale; the encoding is pinned anyway for clarity.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)
        f.write("\n")

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    print(f"  vulns: {vuln_count}")
    print(f"  non-vuln: {len(records) - vuln_count}")
    if corpus is not None:
        print(f"  validated against: {corpus}")
    return 0
if __name__ == "__main__":
sys.exit(main())

View file

@ -28,7 +28,7 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
OUTPUT_DIR=""
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
SETS="owasp,sard,inhouse"
SETS="owasp,sard,nodegoat,juiceshop,inhouse"
# Optional per-cell budgets and monotonic-improvement diff.
BUDGET_FILE=""
DIFF_FILE=""
@ -52,6 +52,44 @@ require_cmd() { command -v "$1" >/dev/null 2>&1 || die "required command not fou
require_cmd jq
require_cmd python3
# run_jsts_corpus LABEL DIR GROUND_TRUTH
#
# Scans one ground-truth-labelled real corpus (NodeGoat / Juice Shop) with
# nyx and feeds the scan output to tabulate.py together with the committed
# ground-truth file.
#
#   LABEL         corpus name; also used to name the /tmp scratch files
#   DIR           directory the corpus is expected to be cloned into
#   GROUND_TRUTH  path passed to tabulate.py as --ground-truth
#
# The function never aborts the caller: a corpus that has not been cloned
# prints clone instructions and returns 0, a nyx exit status other than 0 or
# 1 dumps the captured stderr and returns 0, and a tabulate.py failure is
# only reported through `info`.
run_jsts_corpus() {
    local label="$1" dir="$2" gt="$3"
    local scan_json="/tmp/nyx_${label}.json"
    local scan_err="/tmp/nyx_${label}.stderr"
    local rc=0

    # Self-skip (with bootstrap instructions) when the corpus is not cached.
    if [[ ! -d "$dir" ]]; then
        info "Bootstrapping $label..."
        info " Clone the corpus into ${dir} then re-run this script:"
        case "$label" in
            nodegoat)
                info " git clone --depth 1 https://github.com/OWASP/NodeGoat ${dir}"
                ;;
            *)
                info " git clone --depth 1 --branch v15.0.0 \\"
                info " https://github.com/juice-shop/juice-shop ${dir}"
                ;;
        esac
        info "Skipping $label set (not yet downloaded)."
        return 0
    fi

    info "Running nyx scan on $label..."
    # errexit is suspended around the scan so its exit status can be
    # inspected instead of terminating the script.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
        > "$scan_json" 2> "$scan_err"
    rc=$?
    set -e

    # Exit statuses 0 and 1 proceed to tabulation; anything else is reported
    # and the corpus is skipped.
    case "$rc" in
        0|1)
            ;;
        *)
            info " nyx exited $rc on $label set (stderr follows):"
            cat "$scan_err" >&2
            return 0
            ;;
    esac

    # --budget / --diff are forwarded only when the matching variable is
    # non-empty.
    local -a tab_args=(
        --label "$label"
        --scan "$scan_json"
        --ground-truth "$gt"
        --append "$RESULTS_JSON"
    )
    if [[ -n "${BUDGET_FILE:-}" ]]; then
        tab_args+=(--budget "$BUDGET_FILE")
    fi
    if [[ -n "${DIFF_FILE:-}" ]]; then
        tab_args+=(--diff "$DIFF_FILE")
    fi

    python3 "${SCRIPT_DIR}/tabulate.py" "${tab_args[@]}" \
        || info " tabulate.py failed on $label; ground truth file may be absent"
}
[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
mkdir -p "$CORPUS_CACHE"
@ -95,6 +133,16 @@ if [[ "$SETS" == *owasp* ]]; then
fi
fi
# ── NodeGoat / Juice Shop (JS/TS) bootstrap — Track R.1 ───────────────────────
# A corpus is selected when its name appears anywhere in $SETS (glob
# substring match). Each call passes the corpus label, its expected clone
# location under $CORPUS_CACHE, and its ground-truth JSON under
# ${SCRIPT_DIR}/ground_truth/.
if [[ "$SETS" == *nodegoat* ]]; then
run_jsts_corpus nodegoat "${CORPUS_CACHE}/nodegoat" \
"${SCRIPT_DIR}/ground_truth/nodegoat.json"
fi
if [[ "$SETS" == *juiceshop* ]]; then
run_jsts_corpus juiceshop "${CORPUS_CACHE}/juiceshop" \
"${SCRIPT_DIR}/ground_truth/juiceshop.json"
fi
# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
SARD_DIR="${CORPUS_CACHE}/nist_sard"
if [[ "$SETS" == *sard* ]]; then

View file

@ -2,9 +2,9 @@
# Full eval-corpus orchestrator.
#
# Drives a complete pass against every corpus set the project knows about
# (OWASP Benchmark v1.2, the NIST SARD subset, and the Nyx benchmark
# fixtures), then emits `tests/eval_corpus/results.json` for reports,
# diffs, and docs.
# (OWASP Benchmark v1.2, the NIST SARD subset, OWASP NodeGoat + Juice Shop,
# and the Nyx benchmark fixtures), then emits `tests/eval_corpus/results.json`
# for reports, diffs, and docs.
#
# Usage:
# tests/eval_corpus/run_full.sh [--nyx BIN] [--budget FILE] [--diff FILE]
@ -70,7 +70,7 @@ set +e
NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \
bash "${SCRIPT_DIR}/run.sh" \
--nyx "$NYX_BIN" \
--sets owasp,sard,inhouse \
--sets owasp,sard,nodegoat,juiceshop,inhouse \
--output "$OUTPUT_DIR" \
--budget "$BUDGET_FILE" \
${DIFF_FILE:+--diff "$DIFF_FILE"}

View file

@ -0,0 +1,207 @@
#!/usr/bin/env python3
"""
Phase 28 (Track R.1) regression test for tests/eval_corpus/manifest_gt_convert.py.
Proves the manifest -> ground-truth converter is non-vacuous:
* a well-formed manifest converts to the expected sorted JSON,
* --corpus-dir validation passes when every labelled path exists and
produces byte-identical output to the no-corpus transform (so the CI
in-sync guard, which diffs committed vs a validated regen, is sound),
* --corpus-dir validation HARD-ERRORS (exit 2) on a missing path,
* an unknown cap / duplicate (path,cap) / malformed TOML are rejected,
* the committed nodegoat.json / juiceshop.json are exactly what a fresh
conversion of their manifests produces (offline half of the CI guard).
Run with::
python3 tests/eval_corpus/test_manifest_gt_convert.py
Exits 0 when every assertion holds, non-zero otherwise.
"""
from __future__ import annotations
import json
import subprocess
import sys
import tempfile
from pathlib import Path
# Repository root, taken as the third ancestor of this file's resolved path.
REPO = Path(__file__).resolve().parents[2]
# The converter script under test; it is run as a subprocess, not imported.
CONVERT = REPO / "tests/eval_corpus/manifest_gt_convert.py"
# Directory holding the committed manifests and their generated JSON.
GT_DIR = REPO / "tests/eval_corpus/ground_truth"
# Well-formed three-entry manifest used as the fixture for the happy-path
# tests. The entries are deliberately not in sorted path order, and each
# carries a `note` key that the tests expect to be absent from the output.
GOOD_MANIFEST = """\
corpus = "demo"
upstream = "https://example.test/demo"
pinned_ref = "v1"
[[entry]]
path = "routes/login.ts"
cap = "sqli"
vuln = true
note = "raw SQL string-concat in login"
[[entry]]
path = "app/routes/contributions.js"
cap = "cmdi"
vuln = true
note = "eval of user input"
[[entry]]
path = "lib/insecurity.ts"
cap = "crypto"
vuln = false
note = "benign control example"
"""
def run_convert(*args: str) -> subprocess.CompletedProcess:
    """Run the converter script in a child interpreter.

    ``args`` are appended verbatim to the command line. stdout and stderr
    are captured as text on the returned ``CompletedProcess``; a non-zero
    exit status does not raise.
    """
    command = [sys.executable, str(CONVERT)]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True)
def test_transform_is_sorted_and_schema_clean(tmp: Path) -> None:
    """A well-formed manifest converts to sorted, four-field records."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)
    output_path = tmp / "demo.json"

    result = run_convert(
        "--manifest", str(manifest_path), "--output", str(output_path)
    )
    assert result.returncode == 0, result.stdout + result.stderr

    records = json.loads(output_path.read_text())

    # Records come back ordered by (path, cap), not in manifest order.
    expected_paths = [
        "app/routes/contributions.js",
        "lib/insecurity.ts",
        "routes/login.ts",
    ]
    observed_paths = [record["path"] for record in records]
    assert observed_paths == expected_paths, records

    # Only the four ground-truth fields survive; the manifest `note` is dropped.
    for record in records:
        assert set(record) == {"path", "line", "cap", "vuln"}, record
        assert record["line"] == 0, record

    first, second = records[0], records[1]
    assert first["cap"] == "cmdi" and first["vuln"] is True
    assert second["cap"] == "crypto" and second["vuln"] is False
def test_corpus_validation_passes_and_matches_no_corpus(tmp: Path) -> None:
    """--corpus-dir validation succeeds and leaves the output unchanged."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)

    # Materialise a stub file for every path the manifest labels.
    corpus_root = tmp / "corpus"
    labelled_paths = (
        "routes/login.ts",
        "app/routes/contributions.js",
        "lib/insecurity.ts",
    )
    for rel in labelled_paths:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")

    plain_out = tmp / "no_corpus.json"
    validated_out = tmp / "with_corpus.json"

    plain = run_convert("--manifest", str(manifest_path), "--output", str(plain_out))
    assert plain.returncode == 0

    validated = run_convert(
        "--manifest", str(manifest_path),
        "--corpus-dir", str(corpus_root),
        "--output", str(validated_out),
    )
    assert validated.returncode == 0, validated.stdout + validated.stderr

    # The CI guard diffs the committed file against a validated regen, so
    # validation must not alter what gets written.
    assert plain_out.read_text() == validated_out.read_text()
    assert "validated against" in validated.stdout, validated.stdout
def test_missing_path_exits_2(tmp: Path) -> None:
    """A labelled path absent from the corpus makes the converter exit 2."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)

    # Create two of the three labelled files; lib/insecurity.ts is left out
    # on purpose so validation has something to trip on.
    corpus_root = tmp / "corpus"
    present = ("routes/login.ts", "app/routes/contributions.js")
    for rel in present:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")

    output_path = tmp / "demo.json"
    result = run_convert(
        "--manifest", str(manifest_path),
        "--corpus-dir", str(corpus_root),
        "--output", str(output_path),
    )
    assert result.returncode == 2, result.stdout + result.stderr

    stderr = result.stderr
    assert all(token in stderr for token in ("lib/insecurity.ts", "missing")), stderr
def test_unknown_cap_rejected(tmp: Path) -> None:
    """An entry whose cap is not a known nyx cap is rejected with exit 1."""
    manifest_path = tmp / "bad_cap.manifest.toml"
    manifest_text = '[[entry]]\npath = "a.js"\ncap = "not_a_cap"\nvuln = true\n'
    manifest_path.write_text(manifest_text)
    output_path = tmp / "out.json"

    result = run_convert(
        "--manifest", str(manifest_path), "--output", str(output_path)
    )

    assert result.returncode == 1, result.stdout + result.stderr
    assert "not a known nyx cap" in result.stderr, result.stderr
def test_duplicate_path_cap_rejected(tmp: Path) -> None:
    """Two entries sharing the same (path, cap) pair are rejected with exit 1."""
    manifest_path = tmp / "dup.manifest.toml"
    # The same entry written twice.
    manifest_text = (
        '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
        '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
    )
    manifest_path.write_text(manifest_text)
    output_path = tmp / "out.json"

    result = run_convert(
        "--manifest", str(manifest_path), "--output", str(output_path)
    )

    assert result.returncode == 1, result.stdout + result.stderr
    assert "duplicate" in result.stderr, result.stderr
def test_malformed_manifest_exits_1(tmp: Path) -> None:
    """Syntactically invalid TOML is reported as malformed with exit 1."""
    manifest_path = tmp / "broken.toml"
    # Unclosed table header and a key with no value: not parseable as TOML.
    manifest_path.write_text("[[entry]\npath = \n")
    output_path = tmp / "out.json"

    result = run_convert(
        "--manifest", str(manifest_path), "--output", str(output_path)
    )

    assert result.returncode == 1, result.stdout + result.stderr
    assert "malformed" in result.stderr, result.stderr
def test_empty_manifest_exits_1(tmp: Path) -> None:
    """A manifest with no [[entry]] tables is rejected with exit 1."""
    manifest_path = tmp / "empty.toml"
    # Valid TOML, but it only carries metadata and no entries.
    manifest_path.write_text('corpus = "x"\n')
    output_path = tmp / "out.json"

    result = run_convert(
        "--manifest", str(manifest_path), "--output", str(output_path)
    )

    assert result.returncode == 1, result.stdout + result.stderr
    assert "no [[entry]]" in result.stderr, result.stderr
def test_committed_gt_matches_manifest(tmp: Path) -> None:
    """The committed ground-truth JSON equals a fresh conversion of its manifest.

    Offline half of the CI in-sync guard: catches a manifest edit that was
    not followed by a regenerate. Two checks are made per corpus:

    * parsed equality, which reports a stale file (content drift), and
    * textual equality, which additionally catches formatting drift
      (indentation, trailing newline) that parsed equality cannot see.

    The second check makes this test agree with the "exactly what a fresh
    conversion produces" contract instead of a weaker semantic comparison.
    """
    for name in ("nodegoat", "juiceshop"):
        man = GT_DIR / f"{name}.manifest.toml"
        committed = GT_DIR / f"{name}.json"
        assert man.exists(), f"missing manifest: {man}"
        assert committed.exists(), f"missing committed GT: {committed}"

        regen = tmp / f"{name}.json"
        proc = run_convert("--manifest", str(man), "--output", str(regen))
        assert proc.returncode == 0, proc.stdout + proc.stderr

        # read_text() opens in text mode, so platform line endings are
        # normalised on both sides before comparison.
        regen_text = regen.read_text()
        committed_text = committed.read_text()

        # Content check first: it yields the most actionable message.
        assert json.loads(regen_text) == json.loads(committed_text), (
            f"{committed} is stale — regenerate with manifest_gt_convert.py"
        )
        # Then the exact check: same records but different serialisation
        # still means the committed file was not produced by the converter.
        assert regen_text == committed_text, (
            f"{committed} has the right records but differs textually from a "
            f"fresh conversion — regenerate with manifest_gt_convert.py"
        )
def main() -> int:
    """Run every regression check in order, each in its own scratch directory.

    Returns 0 once all checks have passed. A failing assertion propagates
    out of this function, so the interpreter exits non-zero with a traceback.
    """
    checks = (
        test_transform_is_sorted_and_schema_clean,
        test_corpus_validation_passes_and_matches_no_corpus,
        test_missing_path_exits_2,
        test_unknown_cap_rejected,
        test_duplicate_path_cap_rejected,
        test_malformed_manifest_exits_1,
        test_empty_manifest_exits_1,
        test_committed_gt_matches_manifest,
    )
    with tempfile.TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)
        for check in checks:
            # One sub-directory per check, named after it, so fixtures
            # written by one check cannot leak into another.
            workdir = scratch_root / check.__name__
            workdir.mkdir()
            print(f"... {check.__name__}")
            check(workdir)
            print(" OK")
    print("\nAll manifest_gt_convert.py regression checks passed.")
    return 0
if __name__ == "__main__":
sys.exit(main())