mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
chore: remove stale scheduled_tasks.lock file
This commit is contained in:
parent
a5929bb169
commit
2a4d49b68b
12 changed files with 1059 additions and 21 deletions
120
.github/workflows/eval.yml
vendored
120
.github/workflows/eval.yml
vendored
|
|
@ -1,18 +1,25 @@
|
|||
# Phase 27 (Track R.0): OWASP Benchmark v1.2 real-corpus acceptance.
|
||||
# Real-corpus acceptance (Track R).
|
||||
#
|
||||
# Runs Gate 6 of scripts/m7_ship_gate.sh against a real OWASP BenchmarkJava
|
||||
# checkout on every PR that touches the dynamic verifier (src/dynamic/), the
|
||||
# * owasp (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava
|
||||
# checkout (Java).
|
||||
# * jsts (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js)
|
||||
# and OWASP Juice Shop (TypeScript, .ts), one matrix row per corpus.
|
||||
#
|
||||
# Runs on every PR that touches the dynamic verifier (src/dynamic/), the
|
||||
# eval-corpus harness (tests/eval_corpus/), or the gate script itself.
|
||||
#
|
||||
# Gate 6 enforces, against the committed ground truth:
|
||||
# Each gate enforces, against the committed ground truth:
|
||||
# * verify wall-clock <= 15 min (CI budget; the dev reference is 10 min),
|
||||
# * per-cap confirmed-rate >= 40%, precision >= 0.85, recall >= 0.40 for the
|
||||
# dynamically-supported OWASP caps,
|
||||
# * the per-(cap,lang) budget in tests/eval_corpus/budget.toml.
|
||||
# * the per-(cap,lang) budget in tests/eval_corpus/budget.toml,
|
||||
# * per-cap confirmed-rate / precision / recall — hard-gated only for caps
|
||||
# in NYX_*_FLOOR_CAPS (empty by default → published report-only until a
|
||||
# cap Confirms end to end), with destinations >= 40% / >= 0.85 / >= 0.40.
|
||||
#
|
||||
# The corpus is *not* vendored. It is cloned at the pinned 1.2beta tag (the
|
||||
# tag that produced expectedresults-1.2beta.csv, the source of the ground
|
||||
# truth) and cached so reruns skip the clone.
|
||||
# No corpus is vendored. Each is cloned at a pinned ref and cached so reruns
|
||||
# skip the clone. Before the gate runs, the committed ground truth is
|
||||
# regenerated from its source against the fresh clone and asserted in sync,
|
||||
# and the converter hard-errors on any labelled path missing from the corpus,
|
||||
# so a corpus bump that drifts the labels fails the job loudly.
|
||||
|
||||
name: eval
|
||||
|
||||
|
|
@ -99,7 +106,98 @@ jobs:
|
|||
PY
|
||||
|
||||
- name: eval-corpus harness regression tests
|
||||
run: python3 tests/eval_corpus/test_tabulate_regression.py
|
||||
run: |
|
||||
python3 tests/eval_corpus/test_tabulate_regression.py
|
||||
python3 tests/eval_corpus/test_manifest_gt_convert.py
|
||||
|
||||
- name: Gate 6 — OWASP Benchmark v1.2 acceptance
|
||||
run: scripts/m7_ship_gate.sh --sets owasp
|
||||
|
||||
jsts:
|
||||
name: eval / ${{ matrix.corpus.name }}
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
corpus:
|
||||
- name: nodegoat
|
||||
repo: https://github.com/OWASP/NodeGoat
|
||||
# NodeGoat ships no release tags; pin the default branch and let
|
||||
# the cache key hold it stable. The manifest's path layout
|
||||
# (app/, config/) has been constant for years. Note: a cache miss re-clones the current tip of master, so this pin is best-effort.
|
||||
ref: master
|
||||
env: NYX_NODEGOAT_CORPUS
|
||||
manifest: nodegoat.manifest.toml
|
||||
ground_truth: nodegoat.json
|
||||
- name: juiceshop
|
||||
repo: https://github.com/juice-shop/juice-shop
|
||||
ref: v15.0.0
|
||||
env: NYX_JUICESHOP_CORPUS
|
||||
manifest: juiceshop.manifest.toml
|
||||
ground_truth: juiceshop.json
|
||||
env:
|
||||
# CI wall-clock budget: 15 min. Override locally to tighten.
|
||||
NYX_JSTS_WALLCLOCK_BUDGET_SECONDS: "900"
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
cache: true
|
||||
|
||||
- uses: taiki-e/install-action@nextest
|
||||
|
||||
# The dynamic verifier's Node build pool (Phase 23) compiles its
|
||||
# harnesses with a real node/npm toolchain.
|
||||
- name: Set up Node 20
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "20"
|
||||
|
||||
- name: Cache ${{ matrix.corpus.name }}
|
||||
id: cache-corpus
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: .eval-corpus/${{ matrix.corpus.name }}
|
||||
key: jsts-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }}
|
||||
|
||||
- name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }})
|
||||
if: steps.cache-corpus.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
git clone --depth 1 --branch ${{ matrix.corpus.ref }} \
|
||||
${{ matrix.corpus.repo }} \
|
||||
.eval-corpus/${{ matrix.corpus.name }}
|
||||
|
||||
# No-compromise guard: the committed ground truth must be exactly what a
|
||||
# fresh conversion of the curated manifest produces *against this
|
||||
# corpus*. manifest_gt_convert.py hard-errors on any labelled path that
|
||||
# no longer exists in the clone (corpus drift / typo), and the diff
|
||||
# below catches a stale committed JSON.
|
||||
- name: Verify ground truth is in sync with the pinned corpus
|
||||
run: |
|
||||
python3 tests/eval_corpus/manifest_gt_convert.py \
|
||||
--manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \
|
||||
--corpus-dir .eval-corpus/${{ matrix.corpus.name }} \
|
||||
--output /tmp/${{ matrix.corpus.name }}_gt_regen.json
|
||||
python3 - <<'PY'
|
||||
import json, sys
|
||||
name = "${{ matrix.corpus.ground_truth }}"
|
||||
committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}"))
|
||||
regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json"))
|
||||
if committed != regen:
|
||||
sys.exit("committed ground truth diverges from a fresh conversion of "
|
||||
"the manifest against the pinned corpus; regenerate with "
|
||||
"manifest_gt_convert.py")
|
||||
print(f"ground truth in sync: {len(committed)} records")
|
||||
PY
|
||||
|
||||
- name: eval-corpus harness regression tests
|
||||
run: |
|
||||
python3 tests/eval_corpus/test_tabulate_regression.py
|
||||
python3 tests/eval_corpus/test_manifest_gt_convert.py
|
||||
|
||||
- name: Gate 7 — ${{ matrix.corpus.name }} acceptance
|
||||
run: |
|
||||
export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}"
|
||||
scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@
|
|||
# scripts/m7_ship_gate.sh # every gate
|
||||
# scripts/m7_ship_gate.sh --gates 3,6 # only gates 3 + 6
|
||||
# scripts/m7_ship_gate.sh --sets owasp # Java OWASP corpus only
|
||||
# scripts/m7_ship_gate.sh --sets jsts # NodeGoat + Juice Shop only
|
||||
# scripts/m7_ship_gate.sh --sets nodegoat # one JS/TS corpus only
|
||||
#
|
||||
# Gate map (kept in sync with .pitboss/play/plan.md track M.7):
|
||||
# Gate 1: Static-only scan is green on `tests/benchmark/corpus`.
|
||||
|
|
@ -26,13 +28,22 @@
|
|||
# R.0) added the precision/recall/budget ratchet. The corpus is
|
||||
# *not* checked into the repo; the gate skips with a clear message
|
||||
# when `NYX_OWASP_CORPUS` does not point at a real checkout.
|
||||
# Gate 7: JS/TS real-corpus acceptance (Track R.1 / Phase 28). OWASP
|
||||
# NodeGoat (Express, .js) + OWASP Juice Shop (TypeScript, .ts)
|
||||
# `--verify` against the committed ground truth. Same shape as
|
||||
# Gate 6: wall-clock budget + the per-(cap,lang) budget in
|
||||
# tests/eval_corpus/budget.toml hard-enforced; per-cap
|
||||
# confirmed-rate / precision / recall published report-only
|
||||
# (NYX_JSTS_FLOOR_CAPS empty by default). Each corpus row
|
||||
# self-skips unless its NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS
|
||||
# points at a real checkout.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
cd "${REPO_ROOT}"
|
||||
|
||||
GATES="1,2,3,4,5,6"
|
||||
GATES="1,2,3,4,5,6,7"
|
||||
SETS=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
|
|
@ -56,10 +67,15 @@ while [[ $# -gt 0 ]]; do
|
|||
esac
|
||||
done
|
||||
|
||||
# When `--sets owasp` is passed CI only wants Gate 6.
|
||||
if [[ "${SETS}" == "owasp" ]]; then
|
||||
GATES="6"
|
||||
fi
|
||||
# `--sets` lets CI run a single real-corpus gate. `owasp` -> Gate 6;
|
||||
# `jsts` (both JS/TS corpora) / `nodegoat` / `juiceshop` -> Gate 7, with the
|
||||
# corpus name passed through so Gate 7 runs only the requested row.
|
||||
case "${SETS}" in
|
||||
owasp) GATES="6" ;;
|
||||
jsts|nodegoat|juiceshop) GATES="7" ;;
|
||||
"") ;; # no --sets: run the requested --gates
|
||||
*) echo "unknown --sets: ${SETS}" >&2; exit 2 ;;
|
||||
esac
|
||||
|
||||
want_gate() {
|
||||
[[ ",${GATES}," == *",$1,"* ]]
|
||||
|
|
@ -292,6 +308,162 @@ PY
|
|||
echo " PASS"
|
||||
}
|
||||
|
||||
# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ──────────────
|
||||
|
||||
# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora. Same
|
||||
# wall-clock split (10 min dev reference / 15 min CI) and the same
|
||||
# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the
|
||||
# per-cap confirmed-rate / precision / recall numbers are published but gate
|
||||
# nothing, while the per-(cap,lang) budget (unsupported_rate,
|
||||
# false_confirmed_rate) is hard-enforced. Promote a cap into the floor set
|
||||
# once it starts Confirming end to end.
|
||||
GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}"
|
||||
GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}"
|
||||
GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}"
|
||||
GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}"
|
||||
GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}"
|
||||
GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
|
||||
|
||||
# Run one real-corpus `--verify` row: scan under a wall-clock guard,
|
||||
# tabulate against the committed ground truth, enforce the per-cell budget,
|
||||
# publish (or, when floor caps are set, enforce) the per-cap floors.
|
||||
# $1 label $2 corpus dir $3 ground-truth json
|
||||
# Returns 0 on pass, 1 on fail. Caller decides skip.
|
||||
_gate7_run_corpus() {
|
||||
local label="$1" corpus="$2" gt="$3"
|
||||
# Per-label report files under /tmp; the scratch HOME and build pool honour TMPDIR.
local scan_report="/tmp/m7_gate7_${label}_scan.json"
|
||||
local results_report="/tmp/m7_gate7_${label}_results.json"
|
||||
local wallclock_report="/tmp/m7_gate7_${label}_wallclock.txt"
|
||||
local gate_home="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_home"
|
||||
local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_build_pool"
|
||||
local wallclock
|
||||
|
||||
# Create the scratch dirs and drop reports left over from a previous run of this label.
mkdir -p "${gate_home}" "${gate_build_pool}"
|
||||
rm -f "${scan_report}" "${results_report}" "${wallclock_report}"
|
||||
|
||||
# errexit is switched off so the wrapper's exit status can be captured in
# nyx_exit below instead of aborting the whole script.
set +e
|
||||
# The inline Python wrapper runs the nyx command with a timeout equal to the
# budget, sends its stdout to the scan report, always writes the elapsed
# seconds to the wall-clock report, and exits 124 on timeout (otherwise with
# the scan command's own return code).
HOME="${gate_home}" \
|
||||
NYX_BUILD_POOL_DIR="${gate_build_pool}" \
|
||||
python3 - "${GATE7_WALLCLOCK_BUDGET}" "${scan_report}" "${wallclock_report}" \
|
||||
"${REPO_ROOT}/target/release/nyx" scan \
|
||||
--verify \
|
||||
--index off \
|
||||
--format json \
|
||||
--quiet \
|
||||
"${corpus}" <<'PY'
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
budget = float(sys.argv[1])
|
||||
scan_report = sys.argv[2]
|
||||
wallclock_report = sys.argv[3]
|
||||
cmd = sys.argv[4:]
|
||||
start = time.monotonic()
|
||||
rc = 0
|
||||
try:
|
||||
with open(scan_report, "wb") as out:
|
||||
completed = subprocess.run(cmd, stdout=out, timeout=budget)
|
||||
rc = completed.returncode
|
||||
except subprocess.TimeoutExpired:
|
||||
rc = 124
|
||||
finally:
|
||||
elapsed = time.monotonic() - start
|
||||
with open(wallclock_report, "w") as f:
|
||||
f.write(f"{elapsed:.1f}\n")
|
||||
sys.exit(rc)
|
||||
PY
|
||||
# `$?` is expanded before `local` executes, so this holds the wrapper's status.
local nyx_exit=$?
|
||||
set -e
|
||||
# Fall back to the budget value when the wall-clock report cannot be read.
wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${GATE7_WALLCLOCK_BUDGET}")"
|
||||
|
||||
echo " ${label} verify wall-clock: ${wallclock}s (budget ${GATE7_WALLCLOCK_BUDGET}s)"
|
||||
|
||||
# 124 is the status the wrapper above uses for a timeout.
if [[ ${nyx_exit} -eq 124 ]]; then
|
||||
echo " FAIL: ${label} scan exceeded wall-clock budget"
|
||||
return 1
|
||||
fi
|
||||
# Exit codes 0 and 1 are both accepted; any other status fails the row.
# NOTE(review): presumably 1 means "findings reported" — confirm against the nyx CLI contract.
if [[ ${nyx_exit} -ne 0 && ${nyx_exit} -ne 1 ]]; then
|
||||
echo " FAIL: ${label} scan exited ${nyx_exit}"
|
||||
return 1
|
||||
fi
|
||||
# -s: the report must exist and be non-empty.
if [[ ! -s "${scan_report}" ]]; then
|
||||
echo " FAIL: ${label} scan produced no JSON report"
|
||||
return 1
|
||||
fi
|
||||
# Numeric re-check of elapsed vs budget; awk is used because the elapsed value is fractional.
awk -v w="${wallclock}" -v b="${GATE7_WALLCLOCK_BUDGET}" \
|
||||
'BEGIN { if (w+0 > b+0) exit 1 }' \
|
||||
|| { echo " FAIL: ${label} wall-clock exceeds budget"; return 1; }
|
||||
|
||||
# Seed the results file with an empty JSON array before tabulate.py is run with --append.
echo "[]" > "${results_report}"
|
||||
python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" \
|
||||
--label "${label}" \
|
||||
--scan "${scan_report}" \
|
||||
--ground-truth "${gt}" \
|
||||
--append "${results_report}" \
|
||||
|| { echo " FAIL: ${label} result tabulation failed"; return 1; }
|
||||
|
||||
# --results/--budget are always passed to report.py; the floor flags are
# appended only when GATE7_FLOOR_CAPS is non-empty.
local -a report_args=(
|
||||
--results "${results_report}"
|
||||
--budget "${GATE7_BUDGET}"
|
||||
)
|
||||
if [[ -n "${GATE7_FLOOR_CAPS}" ]]; then
|
||||
report_args+=(
|
||||
--floor-caps "${GATE7_FLOOR_CAPS}"
|
||||
--min-confirmed-rate "${GATE7_CONFIRMED_RATE_TARGET}"
|
||||
--min-precision "${GATE7_PRECISION_TARGET}"
|
||||
--min-recall "${GATE7_RECALL_TARGET}"
|
||||
)
|
||||
echo " enforcing per-cap floors (confirmed >= ${GATE7_CONFIRMED_RATE_TARGET}, precision >= ${GATE7_PRECISION_TARGET}, recall >= ${GATE7_RECALL_TARGET}) on: ${GATE7_FLOOR_CAPS}"
|
||||
else
|
||||
echo " per-cap confirmed/precision/recall: report-only (NYX_JSTS_FLOOR_CAPS unset)"
|
||||
fi
|
||||
python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \
|
||||
|| { echo " FAIL: ${label} per-cell budget exceeded or a gated per-cap floor missed"; return 1; }
|
||||
return 0
|
||||
}
|
||||
|
||||
# Gate 7 driver. Builds the release binary with the `dynamic` feature, then
# runs _gate7_run_corpus once for each JS/TS corpus row that is selected by
# --sets (SETS) and whose env var names an existing directory.
# Returns 0 when no row ran (skip) or every row that ran passed; 1 if any
# row failed.
gate_7_jsts_scale() {
|
||||
echo "── Gate 7: JS/TS real-corpus (NodeGoat + Juice Shop) verify acceptance ──"
|
||||
cargo build --release --quiet --features dynamic
|
||||
|
||||
# name : env var holding the corpus dir : committed ground-truth file
|
||||
local rows=(
|
||||
"nodegoat:NYX_NODEGOAT_CORPUS:nodegoat.json"
|
||||
"juiceshop:NYX_JUICESHOP_CORPUS:juiceshop.json"
|
||||
)
|
||||
local any_ran=0 any_failed=0
|
||||
for row in "${rows[@]}"; do
|
||||
local name envvar gtfile
|
||||
# Split the colon-separated row into its three fields.
IFS=: read -r name envvar gtfile <<<"${row}"
|
||||
# When --sets names a single corpus, only run that row.
|
||||
if [[ -n "${SETS}" && "${SETS}" != "jsts" && "${SETS}" != "${name}" ]]; then
|
||||
continue
|
||||
fi
|
||||
# Indirect expansion: read the variable whose name is stored in envvar
# (empty when that variable is unset).
local corpus="${!envvar:-}"
|
||||
if [[ -z "${corpus}" || ! -d "${corpus}" ]]; then
|
||||
echo " SKIP ${name}: set ${envvar} to a checkout to run this row."
|
||||
continue
|
||||
fi
|
||||
any_ran=1
|
||||
echo " ── ${name} (${corpus}) ──"
|
||||
if _gate7_run_corpus "${name}" "${corpus}" \
|
||||
"${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}"; then
|
||||
echo " PASS ${name}"
|
||||
else
|
||||
# Record the failure but keep looping so the remaining rows still run.
any_failed=1
|
||||
fi
|
||||
done
|
||||
|
||||
# No row ran: report a skip and succeed.
if [[ ${any_ran} -eq 0 ]]; then
|
||||
echo " SKIP: no JS/TS corpus configured (set NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS)."
|
||||
echo " (Gate 7 is Phase 28's headline acceptance for the JS/TS real corpora.)"
|
||||
return 0
|
||||
fi
|
||||
[[ ${any_failed} -eq 0 ]] || return 1
|
||||
echo " PASS"
|
||||
}
|
||||
|
||||
# ── Driver ────────────────────────────────────────────────────────────────────
|
||||
|
||||
declare -a FAILED=()
|
||||
|
|
@ -310,6 +482,7 @@ run_gate 3 verify_ratio
|
|||
run_gate 4 sarif_schema
|
||||
run_gate 5 layering
|
||||
run_gate 6 owasp_scale
|
||||
run_gate 7 jsts_scale
|
||||
|
||||
if [[ ${#FAILED[@]} -gt 0 ]]; then
|
||||
echo
|
||||
|
|
|
|||
|
|
@ -116,3 +116,87 @@ cap = "auth"
|
|||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# ── NodeGoat / Juice Shop (JS/TS) — Track R.1 ratchet ────────────────────────
|
||||
#
|
||||
# Phase 28 wires two intentionally-vulnerable JS/TS apps into the same
|
||||
# acceptance machinery as OWASP Benchmark: OWASP NodeGoat (Express, .js)
|
||||
# and OWASP Juice Shop (TypeScript, .ts). Unlike OWASP Benchmark, neither
|
||||
# app ships vuln/benign *pairs* — every labelled file is `vuln = true` (see
|
||||
# ground_truth/{nodegoat,juiceshop}.manifest.toml). Two consequences for
|
||||
# these cells:
|
||||
#
|
||||
# * false_confirmed_rate (<= 2%) is the headline maximum the verifier
|
||||
# already satisfies and is HARD-enforced: it only trips when a Confirmed
|
||||
# finding lands on a file with no ground-truth positive, i.e. an
|
||||
# over-confirm. With the verifier confirming little on real corpora yet
|
||||
# it is satisfied, and it ratchets precision as confirms grow.
|
||||
# * unsupported_rate (<= 20%) is HARD-enforced too. `Unsupported` counts
|
||||
# only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable —
|
||||
# a narrow bucket that Tracks J + M shrank — *not* BuildFailed /
|
||||
# SpecDerivationFailed (those are Inconclusive), so it stays low.
|
||||
#
|
||||
# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the
|
||||
# Phase 28 acceptance DESTINATIONS. They are intentionally left UNSET here
|
||||
# and published report-only by Gate 7 (NYX_JSTS_FLOOR_CAPS empty by default,
|
||||
# mirroring NYX_OWASP_FLOOR_CAPS) because (a) the verifier does not yet
|
||||
# Confirm these corpora end to end and (b) the manifest labels canonical
|
||||
# vulns only, so precision vs partial ground truth is informational until
|
||||
# the labels are completed. Promote a cap into the floor set the moment it
|
||||
# starts Confirming, exactly as for OWASP.
|
||||
|
||||
# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "xss"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "unauthorized_id"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "path_traversal"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "redirect"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "ssrf"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
|
|
|||
|
|
@ -34,3 +34,38 @@ python3 tests/eval_corpus/owasp_gt_convert.py \
|
|||
File: `nist_sard.json`
|
||||
|
||||
Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`.
|
||||
|
||||
## OWASP NodeGoat / OWASP Juice Shop (JS/TS — Track R.1)
|
||||
|
||||
Files: `nodegoat.json` (Express, `.js`), `juiceshop.json` (TypeScript, `.ts`).
|
||||
Same four-field format as above; all records are `vuln: true`.
|
||||
|
||||
These two apps are intentionally vulnerable end to end, so — unlike OWASP
|
||||
Benchmark — they ship no machine-readable per-file vuln labels and have no
|
||||
benign-control files to pair against. The authoritative source is a curated
|
||||
TOML manifest committed here, one `[[entry]]` per known-vulnerable handler
|
||||
with a `note` citing why:
|
||||
|
||||
- `nodegoat.manifest.toml`
|
||||
- `juiceshop.manifest.toml`
|
||||
|
||||
`manifest_gt_convert.py` turns a manifest into the committed `.json`:
|
||||
|
||||
```sh
|
||||
python3 tests/eval_corpus/manifest_gt_convert.py \
|
||||
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \
|
||||
--output tests/eval_corpus/ground_truth/nodegoat.json
|
||||
```
|
||||
|
||||
Pass `--corpus-dir <clone>` to validate every labelled path against a real
|
||||
checkout. The converter exits non-zero if any path is missing, so a corpus
|
||||
bump that moves a handler fails loudly instead of silently dropping recall.
|
||||
CI (`.github/workflows/eval.yml`, `jsts` job) regenerates each `.json`
|
||||
against a fresh clone of the pinned ref and asserts it matches the committed
|
||||
file.
|
||||
|
||||
Because the manifests label canonical vulns only, recall (did nyx catch the
|
||||
known vulns) is the meaningful metric; precision vs this partial ground
|
||||
truth is informational. Gate 7 publishes per-cap precision/recall/confirmed
|
||||
report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP
|
||||
gate.
|
||||
|
|
|
|||
38
tests/eval_corpus/ground_truth/juiceshop.json
Normal file
38
tests/eval_corpus/ground_truth/juiceshop.json
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
[
|
||||
{
|
||||
"path": "lib/insecurity.ts",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/fileServer.ts",
|
||||
"line": 0,
|
||||
"cap": "path_traversal",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/login.ts",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/profileImageUrlUpload.ts",
|
||||
"line": 0,
|
||||
"cap": "ssrf",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/redirect.ts",
|
||||
"line": 0,
|
||||
"cap": "redirect",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/search.ts",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
66
tests/eval_corpus/ground_truth/juiceshop.manifest.toml
Normal file
66
tests/eval_corpus/ground_truth/juiceshop.manifest.toml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
# OWASP Juice Shop — curated vuln ground-truth manifest (Phase 28, Track R.1).
|
||||
#
|
||||
# Juice Shop is an intentionally-vulnerable TypeScript/Express + Angular
|
||||
# app. Its `data/static/challenges.yml` enumerates challenges but pins no
|
||||
# source file/line, so it cannot drive file-level ground truth on its own.
|
||||
# This manifest IS the authoritative source: one [[entry]] per known-
|
||||
# vulnerable server-side handler, curated from the project's own challenge
|
||||
# definitions + companion guide, each with a `note` citing the challenge.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/juiceshop.json. CI regenerates it against a fresh clone of
|
||||
# the pinned tag and asserts the parsed JSON is identical; the converter HARD-ERRORS on
|
||||
# any path that no longer exists in the corpus, so a Juice Shop bump that
|
||||
# refactors a route fails the eval job loudly instead of silently dropping
|
||||
# recall. Re-pin `pinned_ref` and re-validate the paths together.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py). `path` is relative to the Juice
|
||||
# Shop clone root, POSIX separators. Lang is inferred from the extension
|
||||
# (.ts -> typescript). All `vuln = true`: Juice Shop is all-vulnerable, so
|
||||
# there is no benign-control file to pair against. As with NodeGoat,
|
||||
# precision vs this manifest is informational (an unlabelled finding may be
|
||||
# a real uncurated vuln, not a false positive) while recall is the
|
||||
# meaningful floor. See tests/eval_corpus/budget.toml for the gate policy.
|
||||
|
||||
corpus = "juiceshop"
|
||||
upstream = "https://github.com/juice-shop/juice-shop"
|
||||
# Pinned to a stable release tag. The server-side handlers below
|
||||
# (routes/*.ts, lib/insecurity.ts) have been stable across the TypeScript
|
||||
# era of Juice Shop; re-validate if the tag is bumped.
|
||||
pinned_ref = "v15.0.0"
|
||||
|
||||
[[entry]]
|
||||
path = "routes/login.ts"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "login builds a raw `models.sequelize.query(\"... WHERE email = '\" + req.body.email + \"' ...\")` — SQL injection auth bypass (challenge: loginAdmin / loginBender)."
|
||||
|
||||
[[entry]]
|
||||
path = "routes/search.ts"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "product search concatenates the `q` criteria into a raw `models.sequelize.query` LIKE clause — UNION-based SQL injection (challenge: unionSqlInjection / dbSchema)."
|
||||
|
||||
[[entry]]
|
||||
path = "routes/fileServer.ts"
|
||||
cap = "path_traversal"
|
||||
vuln = true
|
||||
note = "serveKeyFiles / file download resolves a user-controlled filename under the ftp dir without containment — path traversal (challenge: accessLogDisclosure / forgottenDevBackup)."
|
||||
|
||||
[[entry]]
|
||||
path = "routes/redirect.ts"
|
||||
cap = "redirect"
|
||||
vuln = true
|
||||
note = "redirect endpoint forwards to the `to` query param via an allow-list that is bypassable by substring — open redirect (challenge: redirectCryptoCurrency / redirect)."
|
||||
|
||||
[[entry]]
|
||||
path = "routes/profileImageUrlUpload.ts"
|
||||
cap = "ssrf"
|
||||
vuln = true
|
||||
note = "profile image upload fetches an arbitrary user-supplied imageUrl server-side — SSRF (challenge: ssrf)."
|
||||
|
||||
[[entry]]
|
||||
path = "lib/insecurity.ts"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "hardcoded HMAC/JWT key material and weak hashing (md5-based `hash`) — broken cryptography / hardcoded secret (challenge: weakCryptography / jwt*)."
|
||||
32
tests/eval_corpus/ground_truth/nodegoat.json
Normal file
32
tests/eval_corpus/ground_truth/nodegoat.json
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"path": "app/routes/allocations.js",
|
||||
"line": 0,
|
||||
"cap": "unauthorized_id",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/routes/contributions.js",
|
||||
"line": 0,
|
||||
"cap": "cmdi",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/routes/memos.js",
|
||||
"line": 0,
|
||||
"cap": "xss",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/routes/profile.js",
|
||||
"line": 0,
|
||||
"cap": "xss",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "config/env/all.js",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
62
tests/eval_corpus/ground_truth/nodegoat.manifest.toml
Normal file
62
tests/eval_corpus/ground_truth/nodegoat.manifest.toml
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# OWASP NodeGoat — curated vuln ground-truth manifest (Phase 28, Track R.1).
|
||||
#
|
||||
# NodeGoat is an intentionally-vulnerable Express/Node app that maps the
|
||||
# OWASP Top 10 to concrete handlers. It ships no machine-readable per-file
|
||||
# vuln labels (unlike OWASP Benchmark's expectedresults CSV), so this
|
||||
# manifest IS the authoritative source: one [[entry]] per known-vulnerable
|
||||
# location, each curated from the project's own tutorial + the canonical
|
||||
# vuln walk-through, with a `note` citing why.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/nodegoat.json. CI regenerates it against a fresh clone of
|
||||
# the pinned ref and asserts the parsed JSON is identical, and the converter HARD-ERRORS
|
||||
# on any path that no longer exists in the corpus, so a NodeGoat bump that
|
||||
# moves a handler fails the eval job loudly rather than silently dropping
|
||||
# recall. Update `pinned_ref` + the paths together when re-pinning.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py). `path` is relative to the
|
||||
# NodeGoat clone root, POSIX separators. Lang is inferred from the
|
||||
# extension (.js -> javascript). These are all `vuln = true`: NodeGoat is
|
||||
# all-vulnerable, so there is no benign-control file to pair against (the
|
||||
# OWASP Benchmark vuln/benign pairing does not exist here). Precision vs
|
||||
# this manifest is therefore informational (an unlabelled finding is not
|
||||
# necessarily a false positive — it may be a real vuln we did not curate),
|
||||
# while recall (did nyx catch the canonical vulns) is the meaningful floor.
|
||||
# See tests/eval_corpus/budget.toml for how the gate treats these cells.
|
||||
|
||||
corpus = "nodegoat"
|
||||
upstream = "https://github.com/OWASP/NodeGoat"
|
||||
# NodeGoat publishes no semver tags; the eval job pins the default branch
|
||||
# via the CI cache key. The `app/` + `config/` layout below has been
|
||||
# stable for years; re-validate the paths if the cache key is bumped.
|
||||
pinned_ref = "master"
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/contributions.js"
|
||||
cap = "cmdi"
|
||||
vuln = true
|
||||
note = "handleContributionsUpdate eval()s the pre-tax/after-tax/roth form fields — server-side JS injection (OWASP A1 Injection); the textbook NodeGoat RCE."
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/profile.js"
|
||||
cap = "xss"
|
||||
vuln = true
|
||||
note = "profile fields (firstName/lastName/bankAcc/...) are persisted then rendered unescaped — stored XSS (OWASP A3 / A7 XSS)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/memos.js"
|
||||
cap = "xss"
|
||||
vuln = true
|
||||
note = "memo body is stored and echoed back into the memos view without output encoding — stored XSS."
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/allocations.js"
|
||||
cap = "unauthorized_id"
|
||||
vuln = true
|
||||
note = "allocations are looked up by a userId taken from the request with no ownership check — insecure direct object reference / broken access control (OWASP A4)."
|
||||
|
||||
[[entry]]
|
||||
path = "config/env/all.js"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "hardcoded cookieSecret / session secret committed in source — sensitive-data / weak-secret smell (OWASP A6)."
|
||||
195
tests/eval_corpus/manifest_gt_convert.py
Executable file
195
tests/eval_corpus/manifest_gt_convert.py
Executable file
|
|
@ -0,0 +1,195 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Convert a curated TOML vuln manifest into nyx ground-truth JSON.
|
||||
|
||||
Used for real-world apps that ship **no** machine-readable per-file vuln
|
||||
labels of their own (OWASP NodeGoat, OWASP Juice Shop). OWASP Benchmark
|
||||
ships `expectedresults-1.2beta.csv` (see owasp_gt_convert.py); NIST SARD
|
||||
ships `manifest.xml` (see sard_gt_convert.py). NodeGoat / Juice Shop are
|
||||
intentionally-vulnerable apps without an equivalent, so the authoritative
|
||||
source here is a curated manifest committed *in this repo* — one
|
||||
`[[entry]]` table per known-vulnerable location, each carrying a
|
||||
provenance `note` so a reviewer can trace why the label is what it is.
|
||||
|
||||
Manifest schema (TOML)::
|
||||
|
||||
# provenance comments at the top
|
||||
corpus = "nodegoat" # informational label
|
||||
upstream = "https://github.com/OWASP/NodeGoat"
|
||||
pinned_ref = "master@<sha>" # the ref the paths were curated against
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/contributions.js" # relative to the corpus root, POSIX
|
||||
cap = "cmdi" # a nyx cap label (tabulate.py)
|
||||
vuln = true # true = real vuln, false = benign control
|
||||
note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)"
|
||||
|
||||
Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}`
|
||||
records, sorted by `(path, cap)` for deterministic, diff-stable JSON.
|
||||
`note` is intentionally dropped — the ground-truth JSON keeps the exact
|
||||
same four-field schema OWASP/SARD produce, so tabulate.py needs no special
|
||||
casing. `line` is always 0 (the manifest pins a file, not a line;
|
||||
tabulate.py matches file+cap and treats line 0 as "any line").
|
||||
|
||||
Path validation (the no-compromise guard). When `--corpus-dir` is given,
|
||||
**every** manifest path must resolve to a real file under that root or the
|
||||
converter exits non-zero. CI runs the converter against a fresh clone of
|
||||
the pinned corpus and then asserts the committed JSON parses equal to the
|
||||
regenerated JSON, so a corpus bump that moves/renames/deletes a labelled
|
||||
file (or a typo'd path) fails the build loudly instead of silently
|
||||
degrading recall. Authoring the committed JSON offline (no corpus on
|
||||
hand) is done by omitting `--corpus-dir`: the transform is identical, only
|
||||
the existence check is skipped.
|
||||
|
||||
Usage::
|
||||
|
||||
# author / regenerate the committed JSON offline (no validation):
|
||||
tests/eval_corpus/manifest_gt_convert.py \\
|
||||
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
|
||||
--output tests/eval_corpus/ground_truth/nodegoat.json
|
||||
|
||||
# CI: validate every path against a real checkout, then diff vs committed:
|
||||
tests/eval_corpus/manifest_gt_convert.py \\
|
||||
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
|
||||
--corpus-dir ~/.cache/nyx/eval_corpus/nodegoat \\
|
||||
--output /tmp/nodegoat_regen.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
# The closed set of cap labels nyx understands (mirrors tabulate.py's
# _CAP_BIT_TABLE / _CAP_RULE_TABLE).  Anything else in a manifest is almost
# certainly a typo; rejecting it during conversion is better than shipping a
# label that can never match a finding and quietly drags recall down.
VALID_CAPS = {
    "auth",
    "cmdi",
    "crypto",
    "data_exfil",
    "deserialize",
    "fmt_string",
    "header_injection",
    "ldap_injection",
    "memory",
    "path_traversal",
    "prototype_pollution",
    "redirect",
    "sqli",
    "ssrf",
    "unauthorized_id",
    "validation",
    "xpath_injection",
    "xss",
    "xxe",
}
|
||||
|
||||
|
||||
def load_manifest(path: Path) -> dict:
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
return tomllib.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"error: manifest not found: {path}", file=sys.stderr)
|
||||
raise SystemExit(1)
|
||||
except tomllib.TOMLDecodeError as e:
|
||||
print(f"error: manifest malformed: {path}: {e}", file=sys.stderr)
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
def main() -> int:
    """Convert a curated TOML manifest into ground-truth JSON.

    Reads ``--manifest``, validates every ``[[entry]]`` table, optionally
    checks each labelled path against ``--corpus-dir``, and writes the
    resulting ``{path, line, cap, vuln}`` records to ``--output`` sorted by
    ``(path, cap)``.

    Returns:
        0 on success.
        1 when the manifest has no usable entries, an entry is malformed
        (wrong types, unknown cap, non-relative path, duplicate key), or the
        corpus directory does not exist.
        2 when ``--corpus-dir`` is set and at least one manifest path is not
        a file under it.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--manifest", required=True, help="curated TOML manifest path")
    p.add_argument("--output", required=True, help="output ground-truth JSON path")
    p.add_argument(
        "--corpus-dir",
        default="",
        help=(
            "when set, every manifest path must resolve to a real file under "
            "this root or the converter exits 2 (the CI corpus-drift guard)"
        ),
    )
    args = p.parse_args()

    manifest = load_manifest(Path(args.manifest).expanduser())
    entries = manifest.get("entry", []) or []
    if not entries:
        print(f"error: manifest has no [[entry]] tables: {args.manifest}", file=sys.stderr)
        return 1
    if not isinstance(entries, list):
        # e.g. a single `[entry]` table or `entry = "..."` instead of an
        # array of tables; iterating it would crash further down.
        print(
            f"error: manifest `entry` must be an array of [[entry]] tables: "
            f"{args.manifest}",
            file=sys.stderr,
        )
        return 1

    corpus = Path(args.corpus_dir).expanduser().resolve() if args.corpus_dir else None
    if corpus is not None and not corpus.is_dir():
        print(f"error: corpus dir not found: {args.corpus_dir}", file=sys.stderr)
        return 1

    records: list[dict] = []
    missing: list[str] = []
    seen: set[tuple[str, str]] = set()
    for i, e in enumerate(entries):
        if not isinstance(e, dict):
            print(f"error: entry #{i} is not a table: {e!r}", file=sys.stderr)
            return 1
        path = e.get("path")
        cap = e.get("cap")
        vuln = e.get("vuln")
        # Enforce the types the message promises: a non-string path would
        # crash in .replace() and an unhashable cap in the set lookup.
        if (
            not isinstance(path, str)
            or not path
            or not isinstance(cap, str)
            or not cap
            or not isinstance(vuln, bool)
        ):
            print(
                f"error: entry #{i} needs string path, string cap, bool vuln: {e!r}",
                file=sys.stderr,
            )
            return 1
        if cap not in VALID_CAPS:
            print(
                f"error: entry #{i} cap {cap!r} is not a known nyx cap "
                f"(path {path!r}); fix the manifest",
                file=sys.stderr,
            )
            return 1
        norm = path.replace("\\", "/")
        # Manifest paths are relative to the corpus root.  An absolute path
        # or a `..` segment would make `corpus / norm` point outside the
        # root, so the existence check below could pass on a file that is
        # not part of the corpus at all.
        if norm.startswith("/") or Path(norm).is_absolute() or ".." in norm.split("/"):
            print(
                f"error: entry #{i} path {path!r} must be relative to the "
                f"corpus root (no absolute paths or '..' segments)",
                file=sys.stderr,
            )
            return 1
        key = (norm, cap)
        if key in seen:
            print(
                f"error: duplicate (path, cap) entry: {norm!r} / {cap!r}",
                file=sys.stderr,
            )
            return 1
        seen.add(key)
        if corpus is not None and not (corpus / norm).is_file():
            # Collect every miss so one run reports all drifted paths.
            missing.append(norm)
        records.append({"path": norm, "line": 0, "cap": cap, "vuln": vuln})

    if missing:
        print(
            f"error: {len(missing)} manifest path(s) absent from {corpus} "
            f"(corpus drift or typo) — regenerate the manifest against the "
            f"pinned ref:",
            file=sys.stderr,
        )
        for m in missing:
            print(f" missing: {m}", file=sys.stderr)
        return 2

    # Deterministic order so the committed JSON is diff-stable and the CI
    # byte-equality guard is meaningful regardless of manifest ordering.
    records.sort(key=lambda r: (r["path"], r["cap"]))

    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    # Pin encoding and line endings: the output is compared byte-for-byte
    # against the committed file, so it must not vary with the platform's
    # default encoding or newline translation.
    with open(out, "w", encoding="utf-8", newline="\n") as f:
        json.dump(records, f, indent=2)
        f.write("\n")

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    print(f" vulns: {vuln_count}")
    print(f" non-vuln: {len(records) - vuln_count}")
    if corpus is not None:
        print(f" validated against: {corpus}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
|
@ -28,7 +28,7 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
|||
OUTPUT_DIR=""
|
||||
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
|
||||
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
|
||||
SETS="owasp,sard,inhouse"
|
||||
SETS="owasp,sard,nodegoat,juiceshop,inhouse"
|
||||
# Optional per-cell budgets and monotonic-improvement diff.
|
||||
BUDGET_FILE=""
|
||||
DIFF_FILE=""
|
||||
|
|
@ -52,6 +52,44 @@ require_cmd() { command -v "$1" >/dev/null 2>&1 || die "required command not fou
|
|||
require_cmd jq
|
||||
require_cmd python3
|
||||
|
||||
# run_jsts_corpus LABEL DIR GROUND_TRUTH
#
# Scans one real JS/TS corpus checkout (NodeGoat / Juice Shop) with nyx and
# feeds the scan output plus the committed ground-truth file to tabulate.py,
# appending to $RESULTS_JSON.  If DIR does not exist the set is skipped after
# printing the clone command needed to bootstrap it.  Always returns 0 on
# the skip and scan-failure paths, so one absent or failing corpus does not
# abort the remaining sets.
run_jsts_corpus() {
    local label="$1" dir="$2" gt="$3"
    local scan_json="/tmp/nyx_${label}.json"
    local scan_err="/tmp/nyx_${label}.stderr"
    local rc=0

    # Corpus not cloned yet: print bootstrap instructions and skip.
    if [[ ! -d "$dir" ]]; then
        info "Bootstrapping $label..."
        info " Clone the corpus into ${dir} then re-run this script:"
        case "$label" in
            nodegoat)
                info " git clone --depth 1 https://github.com/OWASP/NodeGoat ${dir}"
                ;;
            *)
                info " git clone --depth 1 --branch v15.0.0 \\"
                info " https://github.com/juice-shop/juice-shop ${dir}"
                ;;
        esac
        info "Skipping $label set (not yet downloaded)."
        return 0
    fi

    info "Running nyx scan on $label..."
    # errexit is suspended around the scan so a non-zero nyx status can be
    # inspected here instead of terminating the script.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
        > "$scan_json" 2> "$scan_err"
    rc=$?
    set -e

    # Statuses 0 and 1 are both accepted as a usable scan; any other status
    # is reported with nyx's stderr and the set is skipped.
    case "$rc" in
        0|1) ;;
        *)
            info " nyx exited $rc on $label set (stderr follows):"
            cat "$scan_err" >&2
            return 0
            ;;
    esac

    # --budget / --diff are passed only when the corresponding variable is
    # non-empty.
    if ! python3 "${SCRIPT_DIR}/tabulate.py" \
            --label "$label" \
            --scan "$scan_json" \
            --ground-truth "$gt" \
            --append "$RESULTS_JSON" \
            ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
            ${DIFF_FILE:+--diff "$DIFF_FILE"}; then
        info " tabulate.py failed on $label; ground truth file may be absent"
    fi
}
|
||||
|
||||
[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
|
||||
|
||||
mkdir -p "$CORPUS_CACHE"
|
||||
|
|
@ -95,6 +133,16 @@ if [[ "$SETS" == *owasp* ]]; then
|
|||
fi
|
||||
fi
|
||||
|
||||
# ── NodeGoat / Juice Shop (JS/TS) bootstrap — Track R.1 ───────────────────────
|
||||
if [[ "$SETS" == *nodegoat* ]]; then
|
||||
run_jsts_corpus nodegoat "${CORPUS_CACHE}/nodegoat" \
|
||||
"${SCRIPT_DIR}/ground_truth/nodegoat.json"
|
||||
fi
|
||||
if [[ "$SETS" == *juiceshop* ]]; then
|
||||
run_jsts_corpus juiceshop "${CORPUS_CACHE}/juiceshop" \
|
||||
"${SCRIPT_DIR}/ground_truth/juiceshop.json"
|
||||
fi
|
||||
|
||||
# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
|
||||
SARD_DIR="${CORPUS_CACHE}/nist_sard"
|
||||
if [[ "$SETS" == *sard* ]]; then
|
||||
|
|
|
|||
|
|
@ -2,9 +2,9 @@
|
|||
# Full eval-corpus orchestrator.
|
||||
#
|
||||
# Drives a complete pass against every corpus set the project knows about
|
||||
# (OWASP Benchmark v1.2, the NIST SARD subset, and the Nyx benchmark
|
||||
# fixtures), then emits `tests/eval_corpus/results.json` for reports,
|
||||
# diffs, and docs.
|
||||
# (OWASP Benchmark v1.2, the NIST SARD subset, OWASP NodeGoat + Juice Shop,
|
||||
# and the Nyx benchmark fixtures), then emits `tests/eval_corpus/results.json`
|
||||
# for reports, diffs, and docs.
|
||||
#
|
||||
# Usage:
|
||||
# tests/eval_corpus/run_full.sh [--nyx BIN] [--budget FILE] [--diff FILE]
|
||||
|
|
@ -70,7 +70,7 @@ set +e
|
|||
NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \
|
||||
bash "${SCRIPT_DIR}/run.sh" \
|
||||
--nyx "$NYX_BIN" \
|
||||
--sets owasp,sard,inhouse \
|
||||
--sets owasp,sard,nodegoat,juiceshop,inhouse \
|
||||
--output "$OUTPUT_DIR" \
|
||||
--budget "$BUDGET_FILE" \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"}
|
||||
|
|
|
|||
207
tests/eval_corpus/test_manifest_gt_convert.py
Normal file
207
tests/eval_corpus/test_manifest_gt_convert.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 28 (Track R.1) regression test for tests/eval_corpus/manifest_gt_convert.py.
|
||||
|
||||
Proves the manifest -> ground-truth converter is non-vacuous:
|
||||
* a well-formed manifest converts to the expected sorted JSON,
|
||||
* --corpus-dir validation passes when every labelled path exists and
|
||||
produces byte-identical output to the no-corpus transform (so the CI
|
||||
in-sync guard, which diffs committed vs a validated regen, is sound),
|
||||
* --corpus-dir validation HARD-ERRORS (exit 2) on a missing path,
|
||||
* an unknown cap / duplicate (path,cap) / malformed TOML are rejected,
|
||||
* the committed nodegoat.json / juiceshop.json are exactly what a fresh
|
||||
conversion of their manifests produces (offline half of the CI guard).
|
||||
|
||||
Run with::
|
||||
|
||||
python3 tests/eval_corpus/test_manifest_gt_convert.py
|
||||
|
||||
Exits 0 when every assertion holds, non-zero otherwise.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
CONVERT = REPO / "tests/eval_corpus/manifest_gt_convert.py"
|
||||
GT_DIR = REPO / "tests/eval_corpus/ground_truth"
|
||||
|
||||
GOOD_MANIFEST = """\
|
||||
corpus = "demo"
|
||||
upstream = "https://example.test/demo"
|
||||
pinned_ref = "v1"
|
||||
|
||||
[[entry]]
|
||||
path = "routes/login.ts"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "raw SQL string-concat in login"
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/contributions.js"
|
||||
cap = "cmdi"
|
||||
vuln = true
|
||||
note = "eval of user input"
|
||||
|
||||
[[entry]]
|
||||
path = "lib/insecurity.ts"
|
||||
cap = "crypto"
|
||||
vuln = false
|
||||
note = "benign control example"
|
||||
"""
|
||||
|
||||
|
||||
def run_convert(*args: str) -> subprocess.CompletedProcess:
    """Run the converter script in a child interpreter and return the result.

    ``args`` are forwarded verbatim as command-line arguments.  stdout and
    stderr are captured as text; a non-zero exit status does not raise, so
    callers inspect ``returncode`` themselves.
    """
    command = [sys.executable, str(CONVERT)]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True)
|
||||
|
||||
|
||||
def test_transform_is_sorted_and_schema_clean(tmp: Path) -> None:
    """A well-formed manifest converts to sorted, four-field records."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)
    json_path = tmp / "demo.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(json_path))
    assert result.returncode == 0, result.stdout + result.stderr

    records = json.loads(json_path.read_text())
    # Records come back ordered by (path, cap), not in manifest order.
    expected_paths = [
        "app/routes/contributions.js",
        "lib/insecurity.ts",
        "routes/login.ts",
    ]
    assert [rec["path"] for rec in records] == expected_paths, records
    # Exactly the ground-truth schema: `note` is gone and `line` is pinned to 0.
    for rec in records:
        assert set(rec) == {"path", "line", "cap", "vuln"}, rec
        assert rec["line"] == 0, rec
    first, second = records[0], records[1]
    assert first["cap"] == "cmdi" and first["vuln"] is True
    assert second["cap"] == "crypto" and second["vuln"] is False
|
||||
|
||||
|
||||
def test_corpus_validation_passes_and_matches_no_corpus(tmp: Path) -> None:
    """Validating against a complete corpus succeeds and changes no output."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)

    # Stub out every file the manifest labels so validation has nothing to flag.
    corpus_root = tmp / "corpus"
    labelled = ("routes/login.ts", "app/routes/contributions.js", "lib/insecurity.ts")
    for rel in labelled:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")

    plain_out = tmp / "no_corpus.json"
    checked_out = tmp / "with_corpus.json"

    plain = run_convert("--manifest", str(manifest_path), "--output", str(plain_out))
    assert plain.returncode == 0

    checked = run_convert(
        "--manifest", str(manifest_path),
        "--corpus-dir", str(corpus_root),
        "--output", str(checked_out),
    )
    assert checked.returncode == 0, checked.stdout + checked.stderr

    # The CI guard diffs the committed JSON against a validated regeneration;
    # that is only sound if validation leaves the output untouched.
    assert plain_out.read_text() == checked_out.read_text()
    assert "validated against" in checked.stdout, checked.stdout
|
||||
|
||||
|
||||
def test_missing_path_exits_2(tmp: Path) -> None:
    """A labelled path absent from the corpus makes the converter exit 2."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)

    # Deliberately leave out lib/insecurity.ts, the third labelled file.
    corpus_root = tmp / "corpus"
    present = ("routes/login.ts", "app/routes/contributions.js")
    for rel in present:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")

    json_path = tmp / "demo.json"
    result = run_convert(
        "--manifest", str(manifest_path),
        "--corpus-dir", str(corpus_root),
        "--output", str(json_path),
    )
    assert result.returncode == 2, result.stdout + result.stderr
    stderr = result.stderr
    assert "lib/insecurity.ts" in stderr and "missing" in stderr, stderr
|
||||
|
||||
|
||||
def test_unknown_cap_rejected(tmp: Path) -> None:
    """A cap label outside the known set is rejected with exit status 1."""
    manifest_path = tmp / "bad_cap.manifest.toml"
    manifest_path.write_text(
        '[[entry]]\npath = "a.js"\ncap = "not_a_cap"\nvuln = true\n'
    )
    json_path = tmp / "out.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(json_path))

    assert result.returncode == 1, result.stdout + result.stderr
    assert "not a known nyx cap" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_duplicate_path_cap_rejected(tmp: Path) -> None:
    """Two entries sharing the same (path, cap) key are rejected with exit 1."""
    manifest_path = tmp / "dup.manifest.toml"
    # The same entry twice: identical path and cap.
    manifest_path.write_text(
        '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
        '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
    )
    json_path = tmp / "out.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(json_path))

    assert result.returncode == 1, result.stdout + result.stderr
    assert "duplicate" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_malformed_manifest_exits_1(tmp: Path) -> None:
    """Syntactically invalid TOML is reported as malformed with exit 1."""
    manifest_path = tmp / "broken.toml"
    # Unclosed table header and a key with no value: not parseable TOML.
    manifest_path.write_text("[[entry]\npath = \n")
    json_path = tmp / "out.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(json_path))

    assert result.returncode == 1, result.stdout + result.stderr
    assert "malformed" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_empty_manifest_exits_1(tmp: Path) -> None:
    """Valid TOML with no [[entry]] tables is rejected with exit 1."""
    manifest_path = tmp / "empty.toml"
    # Parses fine but carries only metadata, no entries.
    manifest_path.write_text('corpus = "x"\n')
    json_path = tmp / "out.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(json_path))

    assert result.returncode == 1, result.stdout + result.stderr
    assert "no [[entry]]" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_committed_gt_matches_manifest(tmp: Path) -> None:
    """Committed ground-truth JSON equals a fresh conversion of its manifest.

    This is the offline half of the CI in-sync guard: it catches a manifest
    edit that was committed without regenerating the JSON.  The comparison
    is on parsed JSON content.
    """
    for corpus_name in ("nodegoat", "juiceshop"):
        manifest_path = GT_DIR / f"{corpus_name}.manifest.toml"
        committed_path = GT_DIR / f"{corpus_name}.json"
        assert manifest_path.exists(), f"missing manifest: {manifest_path}"
        assert committed_path.exists(), f"missing committed GT: {committed_path}"

        regen_path = tmp / f"{corpus_name}.json"
        result = run_convert(
            "--manifest", str(manifest_path), "--output", str(regen_path)
        )
        assert result.returncode == 0, result.stdout + result.stderr

        regenerated = json.loads(regen_path.read_text())
        on_disk = json.loads(committed_path.read_text())
        assert regenerated == on_disk, (
            f"{committed_path} is stale — regenerate with manifest_gt_convert.py"
        )
|
||||
|
||||
|
||||
def main() -> int:
    """Run every regression check in order, each in its own scratch directory.

    A failing assertion propagates as an exception, so the process exits
    non-zero; reaching the end returns 0.
    """
    checks = (
        test_transform_is_sorted_and_schema_clean,
        test_corpus_validation_passes_and_matches_no_corpus,
        test_missing_path_exits_2,
        test_unknown_cap_rejected,
        test_duplicate_path_cap_rejected,
        test_malformed_manifest_exits_1,
        test_empty_manifest_exits_1,
        test_committed_gt_matches_manifest,
    )
    with tempfile.TemporaryDirectory() as td:
        scratch_root = Path(td)
        for check in checks:
            # One sub-directory per check keeps their files from colliding.
            workdir = scratch_root / check.__name__
            workdir.mkdir()
            print(f"... {check.__name__}")
            check(workdir)
            print(" OK")
    print("\nAll manifest_gt_convert.py regression checks passed.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue