This commit is contained in:
Eli Peter 2026-06-05 10:16:30 -05:00 committed by GitHub
parent 55247b7fcd
commit 991c84a1eb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
1464 changed files with 225448 additions and 1985 deletions

View file

@ -0,0 +1,352 @@
# Eval corpus budget.
#
# `report.py` enforces these values when `run.sh` or `run_full.sh` pass
# `--budget`. Each (cap, lang) cell uses the default row unless a specific
# override appears below.
#
# Wall-clock cost is measured separately from this per-cell budget.
#
# Schema:
#
# [default]
# unsupported_rate = 0.20 # max(Unsupported / total) per cell
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cap
# repro_stability = 0.95 # min(stable / Confirmed) per cell
# confirmed_rate = 0.40 # min(Confirmed / total) per cell (omit to skip)
# ratchet_deadline = "..." # informational; cells already at headline
#
# [[cell]]
# cap = "..."
# lang = "..."
# <overrides as above>
#
# `cap` matches `tabulate.py`'s _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
# `lang` matches the ext_map values (`python`, `javascript`, …).
# A wildcard `"*"` matches any cell that does not have an exact entry.
#
# Each rate is enforced only when the relevant denominator is non-zero, so a
# cell with no findings (or no Confirmed findings) never trips a budget
# vacuously. `confirmed_rate` is a *minimum* (a ratchet floor); the others are
# maxima. Per-cell overrides are calibrated to the measured frontier on the
# real corpus so the gate locks in current performance and catches regressions
# (see the OWASP cells below).
[default]
unsupported_rate = 0.20
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-05-15"
# ── OWASP Benchmark v1.2 (Java) — Track R.0 ratchet ──────────────────────────
#
# Calibrated against the pinned 1.2beta corpus, nyx c0501884, 2026-05-29
# (`nyx scan --verify` over all 2740 BenchmarkTest files; 5812 findings).
#
# Measured frontier at calibration:
# verdicts : Confirmed 0 | NotConfirmed 4077 | Inconclusive 1725
# (BuildFailed 952 + SpecDerivationFailed 773) | Unsupported 10
# per cell : unsupported_rate <= 1.7% (headline <= 20% -> MET)
# false_confirmed = 0% (headline <= 2% -> MET, 0 confirms)
# confirmed_rate = 0% (headline >= 40% -> NOT met)
#
# The verifier confirms nothing on OWASP yet: every BenchmarkTest is a servlet
# whose harness lands in BuildFailed / SpecDerivationFailed (Java servlet entry
# wiring + classpath are Track L.12 / Track O.0 work). So the enforced limits
# below are the two headline maxima the verifier already satisfies
# (unsupported_rate, false_confirmed_rate). `confirmed_rate` is intentionally
# left UNSET — the headline >= 40% is the ratchet's destination, recorded here
# and in scripts/m7_ship_gate.sh (NYX_OWASP_FLOOR_CAPS), not a floor we can
# honestly assert at 0 confirms. Promote a cap into the gated set (and add its
# `confirmed_rate`) the moment it starts Confirming.
#
# Caps split two ways:
# sound-oracle (injection): cmdi, sqli, path_traversal, ldap_injection,
# xpath_injection — once their servlet harnesses build, a runtime oracle
# exists; these are the GATE6_FLOOR_CAPS candidates.
# no-sound-oracle (config/usage smell): crypto (weak rand/hash), auth
# (insecure cookie), xss/trustbound — Phase-11 will route these to
# Unsupported(SoundOracleUnavailable); they stay report-only. When that
# routing lands their unsupported_rate will rise and these cells must be
# relaxed accordingly.
[[cell]]
cap = "cmdi"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "sqli"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "path_traversal"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "ldap_injection"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "xpath_injection"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "xss"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "crypto"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "auth"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# ── NodeGoat / Juice Shop (JS/TS) — Track R.1 ratchet ────────────────────────
#
# Phase 28 wires two intentionally-vulnerable JS/TS apps into the same
# acceptance machinery as OWASP Benchmark: OWASP NodeGoat (Express, .js)
# and OWASP Juice Shop (TypeScript, .ts). Unlike OWASP Benchmark, neither
# app ships vuln/benign *pairs* — every labelled file is `vuln = true` (see
# ground_truth/{nodegoat,juiceshop}.manifest.toml). Two consequences for
# these cells:
#
# * false_confirmed_rate (<= 2%) is the headline maximum the verifier
# already satisfies and is HARD-enforced: it only trips when a Confirmed
# finding lands on a file with no ground-truth positive, i.e. an
# over-confirm. With the verifier confirming little on real corpora yet
# it is satisfied, and it ratchets precision as confirms grow.
# * unsupported_rate (<= 20%) is HARD-enforced too. `Unsupported` counts
# only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable —
# a narrow bucket that Tracks J + M shrank — *not* BuildFailed /
# SpecDerivationFailed (those are Inconclusive), so it stays low.
#
# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the
# Phase 28 acceptance DESTINATIONS. They are intentionally left UNSET here
# and published report-only by Gate 7 (NYX_JSTS_FLOOR_CAPS empty by default,
# mirroring NYX_OWASP_FLOOR_CAPS) because (a) the verifier does not yet
# Confirm these corpora end to end and (b) the manifest labels canonical
# vulns only, so precision vs partial ground truth is informational until
# the labels are completed. Promote a cap into the floor set the moment it
# starts Confirming, exactly as for OWASP.
# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml.
[[cell]]
cap = "cmdi"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "xss"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "unauthorized_id"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "crypto"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml.
[[cell]]
cap = "sqli"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "path_traversal"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "redirect"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "ssrf"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "crypto"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ──────────────
#
# Phase 29 wires five more intentionally-vulnerable real corpora, one per
# remaining language family, into the same acceptance machinery as OWASP /
# NodeGoat / Juice Shop:
#
# * railsgoat — OWASP RailsGoat (Rails, .rb)
# * dvwa — Damn Vulnerable Web Application (PHP); ships graded
# source variants, so low.php = vuln and impossible.php =
# benign control — real vuln/benign PAIRS like OWASP.
# * dvpwa — Damn Vulnerable Python Web App (aiohttp, .py); its
# parameterized DAO siblings are benign controls for the
# one `%`-formatted SQL sink.
# * gosec — the Go SAST tool's own repo; the scannable, `// want`-
# annotated sample under goanalysis/testdata is the curated
# ground truth (its embedded-string rule samples are not
# scannable, so they are unlabelled).
# * rustsec — RustSec advisory-db: a NEGATIVE CONTROL. It ships
# advisory metadata, not vulnerable .rs source, so its
# ground truth is empty by construction; the row asserts the
# Rust scan/verify path runs at scale within wall-clock and
# Confirms NOTHING (any Confirmed Rust finding there is a
# false confirm and trips the default false_confirmed_rate).
#
# Each row is gated with the SAME policy as Gates 6/7 (scripts/m7_ship_gate.sh
# Gate 8): wall-clock + the per-(cap,lang) budget below are HARD-enforced;
# per-cap confirmed-rate / precision / recall are published report-only
# (NYX_POLYGLOT_FLOOR_CAPS empty by default). Because each corpus targets a
# single language, Gate 8 scopes tabulation to that language (tabulate.py
# --lang), so the vendored third-party JavaScript these Ruby/Python apps
# bundle (bootstrap-colorpicker, materialize, …) — which nyx confirms as
# prototype_pollution — does not pollute the corpus's per-cap metrics. Those
# JS findings are still emitted; they are simply out of scope for a Ruby /
# Python corpus.
#
# Calibrated against the pinned corpora (nyx HEAD of the Phase 29 branch,
# 2026-05-31) with `nyx scan --verify --index off`. Measured frontier
# (target-language scope): every curated cell sits at <= the headline maxima
# below EXCEPT cmdi, where the findings (all of them for Ruby and Go, ~69%
# for PHP) carry a SHELL_ESCAPE sanitizer cap and are therefore routed to
# Unsupported(SoundOracleUnavailable) — the same
# no-sound-oracle treatment OWASP's crypto/auth cells get. RailsGoat's
# deserialize (Marshal.load) and redirect (open redirect) cells Confirm end to
# end with zero false confirms — the first real polyglot confirms.
# railsgoat (ruby): caps with a ground-truth label in railsgoat.manifest.toml.
[[cell]]
cap = "auth"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "crypto"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "deserialize"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "redirect"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "path_traversal"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# cmdi/ruby is incidental (RailsGoat's `self.try(params[:graph])` reflection
# sink); the lone finding carries a SHELL_ESCAPE sanitizer cap and routes to
# Unsupported(SoundOracleUnavailable), so unsupported_rate is locked at the
# measured frontier (1/1). The false-confirm guard stays at the headline 2%.
[[cell]]
cap = "cmdi"
lang = "ruby"
unsupported_rate = 1.00
false_confirmed_rate = 0.02
# dvwa (php): caps with a ground-truth label in dvwa.manifest.toml.
[[cell]]
cap = "sqli"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "redirect"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "header_injection"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# cmdi/php: DVWA's ping handlers reach shell_exec through a SHELL_ESCAPE
# sanitizer cap, so ~69% of the cell's findings route to
# Unsupported(SoundOracleUnavailable). unsupported_rate is locked to that
# frontier with margin (a regression above 75% fails); false-confirm at 2%.
[[cell]]
cap = "cmdi"
lang = "php"
unsupported_rate = 0.75
false_confirmed_rate = 0.02
# dvpwa (python): caps with a ground-truth label in dvpwa.manifest.toml.
[[cell]]
cap = "sqli"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "crypto"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
[[cell]]
cap = "auth"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# gosec (go): caps with a ground-truth label in gosec.manifest.toml.
[[cell]]
cap = "crypto"
lang = "go"
unsupported_rate = 0.20
false_confirmed_rate = 0.02
# cmdi/go: the goanalysis/testdata exec.Command sample reaches the sink
# through a SHELL_ESCAPE sanitizer cap, so every cmdi/go finding routes to
# Unsupported(SoundOracleUnavailable). unsupported_rate locked to the
# measured frontier (3/3); false-confirm at the headline 2%.
[[cell]]
cap = "cmdi"
lang = "go"
unsupported_rate = 1.00
false_confirmed_rate = 0.02

View file

@ -0,0 +1,173 @@
#!/usr/bin/env bash
# Phase 31 acceptance walker: assert `nyx surface` produces a usable
# map on every downloaded eval-corpus fixture root.
#
# Walks the project trees under $NYX_EVAL_CORPUS_DIR and, when
# `--also-inhouse` is passed, the in-house `tests/benchmark/corpus` and
# `tests/dynamic_fixtures` trees, runs
# `nyx surface --build --format json <root>` against each, and asserts
# the resulting JSON contains at least one EntryPoint plus at least
# one DataStore / ExternalService / DangerousLocal node.
#
# `--build` forces the inline pass-1 + call-graph path so the walker
# does not depend on a prior `nyx index build` or `nyx scan`.
#
# Usage:
# tests/eval_corpus/check_surface.sh [--nyx BIN] [--corpus-dir DIR]
# [--also-inhouse]
# [--report FILE]
#
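# Environment:
# NYX_BIN — path to the nyx binary (default:
# <repo>/target/release/nyx); `--nyx` overrides it.
# NYX_EVAL_CORPUS_DIR — path to pre-downloaded corpus roots
# (default: ~/.cache/nyx/eval_corpus);
# `--corpus-dir` overrides it. When the directory is
# missing or empty the walker still scans the
# in-house trees if `--also-inhouse` was passed,
# and exits 0 when there is nothing to walk, so CI
# without a corpus mirror does not block on Phase 31.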
#
# Exit codes:
# 0 every walked project produced a usable SurfaceMap (or no
# projects were available — see corpus-missing note above).
# 1 setup / I/O / missing-binary error.
# 2 one or more projects produced an empty or unusable SurfaceMap.
# Strict mode: abort on command failure, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -euo pipefail

# Resolve the repository root relative to this script so the walker can be
# launched from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Defaults. NYX_BIN and NYX_EVAL_CORPUS_DIR may come from the environment;
# the --nyx / --corpus-dir flags parsed below take precedence over both.
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
ALSO_INHOUSE="false"
REPORT_FILE=""

while [[ $# -gt 0 ]]; do
    case "$1" in
        --nyx|--corpus-dir|--report)
            # Value-taking flags: fail with a readable message rather than
            # the bare "unbound variable" abort `set -u` raises when $2 is
            # missing.
            if [[ $# -lt 2 ]]; then
                echo "error: $1 requires a value" >&2
                exit 1
            fi
            case "$1" in
                --nyx)        NYX_BIN="$2" ;;
                --corpus-dir) CORPUS_CACHE="$2" ;;
                --report)     REPORT_FILE="$2" ;;
            esac
            shift 2
            ;;
        --also-inhouse)
            ALSO_INHOUSE="true"
            shift
            ;;
        -h|--help)
            # Print only the leading comment header (minus the shebang and
            # the "# " prefix). Stopping at the first non-comment line keeps
            # the help text correct however long the header grows, instead
            # of relying on a fixed line window that can spill into code.
            awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
            exit 0
            ;;
        *)
            echo "unknown flag: $1" >&2
            exit 1
            ;;
    esac
done
# Logging helpers. `info` writes a tagged line to stdout; `warn` writes a
# tagged warning to stderr; `die` writes an error to stderr and terminates
# the script with status 1.
info() {
    printf '%s\n' "[surface-check] $*"
}

warn() {
    printf '%s\n' "[surface-check] WARN: $*" >&2
}

die() {
    printf '%s\n' "error: $*" >&2
    exit 1
}
# Preconditions: a runnable nyx binary and jq available on PATH.
if [[ ! -x "$NYX_BIN" ]]; then
    die "nyx binary not found or not executable: $NYX_BIN"
fi
if ! command -v jq >/dev/null 2>&1; then
    die "required command not found: jq"
fi

# Gather the project roots to walk. Every immediate subdirectory of the
# corpus cache counts as one project. With --also-inhouse, each in-house
# tree that exists is appended as a single additional root.
PROJECTS=()
if [[ ! -d "$CORPUS_CACHE" ]]; then
    warn "corpus directory missing: $CORPUS_CACHE (run tests/eval_corpus/run.sh to bootstrap)"
else
    # An empty cache leaves the glob unexpanded; the -d test filters out
    # that literal pattern along with any plain files.
    for entry in "$CORPUS_CACHE"/*; do
        if [[ -d "$entry" ]]; then
            PROJECTS+=("$entry")
        fi
    done
fi

if [[ "$ALSO_INHOUSE" == "true" ]]; then
    for dir in \
        "${REPO_ROOT}/tests/benchmark/corpus" \
        "${REPO_ROOT}/tests/dynamic_fixtures"
    do
        if [[ -d "$dir" ]]; then
            PROJECTS+=("$dir")
        fi
    done
fi

# Nothing to walk is not an error: exit successfully so environments
# without a corpus mirror are not blocked.
if (( ${#PROJECTS[@]} == 0 )); then
    info "no project roots to walk (eval corpus not downloaded, in-house trees absent)"
    exit 0
fi
PASS_COUNT=0
FAIL_COUNT=0
FAIL_PROJECTS=()
declare -a REPORT_ROWS=()

# Record one failed project: bump the failure counter, remember a
# human-readable reason for the final summary, and append the project's
# JSON row to the report.
#   $1  project path
#   $2  short reason shown in the summary
#   $3  JSON report row
_record_failure() {
    FAIL_COUNT=$((FAIL_COUNT + 1))
    FAIL_PROJECTS+=("$1 ($2)")
    REPORT_ROWS+=("$3")
}

# jq filter producing one TSV line with the node counts in the order
# entry_point, data_store, external_service, dangerous_local. The filter
# matches on each node's `node` discriminator. A missing or null `.nodes`
# is treated as an empty list so it is reported as "no entry-points"
# instead of raising a jq error.
NODE_COUNT_FILTER='
  (.nodes // []) as $nodes
  | ["entry_point", "data_store", "external_service", "dangerous_local"]
  | map(. as $kind | [$nodes[] | select(.node == $kind)] | length)
  | @tsv
'

for project in "${PROJECTS[@]}"; do
    info "walking: $project"

    # JSON-encode the path once; every report row for this project reuses it.
    project_json="$(jq -n --arg p "$project" '$p')"

    # `|| rc=$?` captures the exit status without letting `set -e` abort
    # the walker on a failing project.
    rc=0
    out="$("$NYX_BIN" surface --build --format json "$project" 2>/dev/null)" || rc=$?
    if [[ $rc -ne 0 ]]; then
        warn "nyx surface --build exited $rc on $project"
        _record_failure "$project" "nyx exit=$rc" \
            "$(printf '{"project":%s,"status":"nyx-error","exit":%d}' "$project_json" "$rc")"
        continue
    fi

    if [[ -z "$out" ]]; then
        warn "empty output on $project"
        _record_failure "$project" "empty output" \
            "$(printf '{"project":%s,"status":"empty-output"}' "$project_json")"
        continue
    fi

    # Parse the output once. Running jq inside the `if` condition keeps a
    # parse failure from tripping `set -e`: the project is recorded as a
    # failure (exit 2 at the end) and the walk continues, so the report and
    # summary are still produced.
    if ! counts="$(jq -r "$NODE_COUNT_FILTER" <<<"$out" 2>/dev/null)" || [[ -z "$counts" ]]; then
        warn "unparseable or malformed SurfaceMap JSON on $project"
        _record_failure "$project" "invalid json" \
            "$(printf '{"project":%s,"status":"invalid-json"}' "$project_json")"
        continue
    fi
    read -r entry_count ds_count es_count dl_count <<<"$counts"
    sink_count=$((ds_count + es_count + dl_count))

    if [[ "$entry_count" -lt 1 ]]; then
        warn "no EntryPoint nodes on $project"
        _record_failure "$project" "no entry-points" \
            "$(printf '{"project":%s,"status":"no-entry-points","entry_count":%d}' \
                "$project_json" "$entry_count")"
        continue
    fi

    if [[ "$sink_count" -lt 1 ]]; then
        warn "no DataStore / ExternalService / DangerousLocal nodes on $project"
        _record_failure "$project" "no sinks: ds=$ds_count es=$es_count dl=$dl_count" \
            "$(printf '{"project":%s,"status":"no-sinks","entry_count":%d,"ds":%d,"es":%d,"dl":%d}' \
                "$project_json" "$entry_count" "$ds_count" "$es_count" "$dl_count")"
        continue
    fi

    info " ok: ${entry_count} entry-points, ${ds_count} data stores, ${es_count} external, ${dl_count} dangerous"
    PASS_COUNT=$((PASS_COUNT + 1))
    REPORT_ROWS+=("$(printf '{"project":%s,"status":"ok","entry_count":%d,"ds":%d,"es":%d,"dl":%d}' \
        "$project_json" "$entry_count" "$ds_count" "$es_count" "$dl_count")")
done
# Optional machine-readable report: pass/fail totals plus one JSON row per
# walked project. Rows are comma-separated, with no comma after the last.
if [[ -n "$REPORT_FILE" ]]; then
    last_row=$(( ${#REPORT_ROWS[@]} - 1 ))
    {
        printf '%s\n' "{"
        printf '%s\n' " \"pass\": $PASS_COUNT,"
        printf '%s\n' " \"fail\": $FAIL_COUNT,"
        printf '%s\n' " \"projects\": ["
        for i in "${!REPORT_ROWS[@]}"; do
            if [[ $i -eq $last_row ]]; then
                printf '%s\n' " ${REPORT_ROWS[$i]}"
            else
                printf '%s\n' " ${REPORT_ROWS[$i]},"
            fi
        done
        printf '%s\n' " ]"
        printf '%s\n' "}"
    } > "$REPORT_FILE"
    info "report written: $REPORT_FILE"
fi

# Human-readable summary. Exit 0 when every project passed; otherwise list
# each failure and exit 2.
info ""
info "summary: ${PASS_COUNT} pass, ${FAIL_COUNT} fail (of $((PASS_COUNT + FAIL_COUNT)) projects)"
if (( FAIL_COUNT == 0 )); then
    exit 0
fi

for p in "${FAIL_PROJECTS[@]}"; do
    info " fail: $p"
done
exit 2

View file

@ -0,0 +1,106 @@
# Ground truth files
Place corpus ground truth JSON files here before running `tests/eval_corpus/run.sh`.
## OWASP Benchmark v1.2
File: `owasp_benchmark_v1.2.json` (checked in; complete — one record per
BenchmarkTest file, 2740 total).
Format:
```json
[
{"path": "src/main/java/org/owasp/.../BenchmarkTest00001.java", "line": 0, "cap": "sqli", "vuln": true},
...
]
```
`path` is **relative to the corpus root** (the BenchmarkJava clone), with POSIX
separators. `tabulate.py` suffix-matches it against the absolute paths nyx
emits, so the committed JSON is portable: it matches whether the corpus lives at
`~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2` on a laptop or at a CI checkout
path. `line` is `0` (the expected-results CSV does not pin a line; matching
falls back to file+cap).
Regenerate from `expectedresults-1.2beta.csv` shipped with the benchmark repo:
```sh
python3 tests/eval_corpus/owasp_gt_convert.py \
--corpus-dir ~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2 \
--output tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
```
## NIST SARD subset
File: `nist_sard.json`
Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`.
## OWASP NodeGoat / OWASP Juice Shop (JS/TS — Track R.1)
Files: `nodegoat.json` (Express, `.js`), `juiceshop.json` (TypeScript, `.ts`).
Same four-field format as above; all records are `vuln: true`.
These two apps are intentionally vulnerable end to end, so — unlike OWASP
Benchmark — they ship no machine-readable per-file vuln labels and have no
benign-control files to pair against. The authoritative source is a curated
TOML manifest committed here, one `[[entry]]` per known-vulnerable handler
with a `note` citing why:
- `nodegoat.manifest.toml`
- `juiceshop.manifest.toml`
`manifest_gt_convert.py` turns a manifest into the committed `.json`:
```sh
python3 tests/eval_corpus/manifest_gt_convert.py \
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \
--output tests/eval_corpus/ground_truth/nodegoat.json
```
Pass `--corpus-dir <clone>` to validate every labelled path against a real
checkout. The converter exits non-zero if any path is missing, so a corpus
bump that moves a handler fails loudly instead of silently dropping recall.
CI (`.github/workflows/eval.yml`, `jsts` job) regenerates each `.json`
against a fresh clone of the pinned ref and asserts it matches the committed
file.
Because the manifests label canonical vulns only, recall (did nyx catch the
known vulns) is the meaningful metric; precision vs this partial ground
truth is informational. Gate 7 publishes per-cap precision/recall/confirmed
report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP
gate.
## Polyglot real corpora (Ruby/PHP/Python/Go/Rust — Track R.2)
Phase 29 wires the remaining language families into the same machinery, one
corpus per family, each with a curated `*.manifest.toml` → committed `*.json`:
- `railsgoat.{manifest.toml,json}` — OWASP RailsGoat (Rails, `.rb`).
- `dvwa.{manifest.toml,json}` — Damn Vulnerable Web Application (PHP). DVWA
ships graded source variants (`source/{low,impossible}.php`), so this is
the one Track R corpus besides OWASP with real vuln/benign **pairs**
(`low.php` = vuln, `impossible.php` = benign control) — precision is
meaningful here, not just informational.
- `dvpwa.{manifest.toml,json}` — Damn Vulnerable Python Web App (aiohttp,
`.py`). Its parameterized DAO siblings are benign controls for the one
`%`-formatted SQL sink.
- `gosec.{manifest.toml,json}` — the gosec Go SAST tool repo; the scannable,
`// want`-annotated sample under `goanalysis/testdata` is the curated
ground truth (gosec's string-embedded rule samples are not scannable, so
they are deliberately unlabelled).
- `rustsec.{manifest.toml,json}` — RustSec advisory-db, a **negative
control**. advisory-db ships advisory metadata, not vulnerable `.rs`
source, so its committed ground truth is empty (`[]`) by construction. The
manifest sets `negative_control = true` (mutually exclusive with
`[[entry]]` tables); `manifest_gt_convert.py` emits the empty JSON and the
row asserts the Rust scan/verify path runs at scale within wall-clock and
Confirms nothing there (any Confirmed Rust finding is a false confirm).
These are converted, validated and asserted-in-sync exactly like NodeGoat /
Juice Shop (the `polyglot` job in `.github/workflows/eval.yml`). Because each
corpus targets a single language, Gate 8 scopes tabulation to that language
(`tabulate.py --lang`) so the vendored third-party JavaScript these Ruby /
Python apps bundle does not pollute their per-cap metrics. Gate 8 publishes
per-cap precision/recall/confirmed report-only by default
(`NYX_POLYGLOT_FLOOR_CAPS` empty), matching the OWASP and JS/TS gates. See
`tests/eval_corpus/budget.toml` for the per-(cap,lang) gate policy.

View file

@ -0,0 +1,38 @@
[
{
"path": "sqli/dao/course.py",
"line": 0,
"cap": "sqli",
"vuln": false
},
{
"path": "sqli/dao/mark.py",
"line": 0,
"cap": "sqli",
"vuln": false
},
{
"path": "sqli/dao/review.py",
"line": 0,
"cap": "sqli",
"vuln": false
},
{
"path": "sqli/dao/student.py",
"line": 0,
"cap": "sqli",
"vuln": true
},
{
"path": "sqli/dao/user.py",
"line": 0,
"cap": "crypto",
"vuln": true
},
{
"path": "sqli/views.py",
"line": 0,
"cap": "auth",
"vuln": true
}
]

View file

@ -0,0 +1,70 @@
# DVPWA (Damn Vulnerable Python Web Application) — curated ground-truth
# manifest (Phase 29, Track R.2).
#
# DVPWA is an intentionally-vulnerable aiohttp app whose headline flaw is
# SQL injection (the package is literally named `sqli`). It ships no
# machine-readable per-file labels, so this manifest IS the authoritative
# source. Its DAO layer is convenient: one method builds a query with
# Python `%` string-formatting (the injectable sink) while its siblings use
# proper parameterized `cur.execute(q, params)` — so the parameterized DAOs
# serve as genuine benign controls (vuln = false) for the sqli cell, making
# precision there meaningful, not just informational.
#
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
# ground_truth/dvpwa.json. CI regenerates it against a fresh clone of the
# pinned ref and asserts byte-equality; the converter HARD-ERRORS on any
# path that no longer exists, so a corpus bump that moves a DAO fails the
# job loudly rather than silently dropping recall.
#
# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies each
# sink (the request-scoped ownership lookups in views.py surface as `auth`).
# `path` is relative to the DVPWA clone root, POSIX separators. Lang is
# inferred from the extension (.py -> python). See
# tests/eval_corpus/budget.toml for the gate policy on these cells.
corpus = "dvpwa"
upstream = "https://github.com/anxolerd/dvpwa"
# DVPWA publishes no release tags; the eval job pins the default branch via
# the CI cache key (clone HEAD a1d8f89fac2e57093189853c6527c2b01fc1d9c1).
# The sqli/ package layout has been stable; re-validate if the cache key is
# bumped.
pinned_ref = "master"
# ── SQL injection (sqli) — one injectable sink + parameterized controls ──────
[[entry]]
path = "sqli/dao/student.py"
cap = "sqli"
vuln = true
note = "Student.create builds the INSERT with Python `%` formatting (\"... VALUES ('%(name)s')\" % {'name': name}) on the request-supplied student name, then cur.execute(q) — SQL injection."
[[entry]]
path = "sqli/dao/course.py"
cap = "sqli"
vuln = false
note = "benign control: every Course query uses parameterized cur.execute(q, params) / VALUES (%(title)s, %(description)s) — not injectable."
[[entry]]
path = "sqli/dao/review.py"
cap = "sqli"
vuln = false
note = "benign control: Review.create / get_for_course bind via cur.execute(q, params) with %(course_id)s / %s placeholders — parameterized."
[[entry]]
path = "sqli/dao/mark.py"
cap = "sqli"
vuln = false
note = "benign control: Mark.create / get_for_student bind via parameterized cur.execute(q, params) — not injectable."
# ── Weak crypto (crypto) ─────────────────────────────────────────────────────
[[entry]]
path = "sqli/dao/user.py"
cap = "crypto"
vuln = true
note = "User.check_password compares against md5(password).hexdigest() — unsalted MD5 for credential storage (weak cryptography)."
# ── Broken access control (auth) ─────────────────────────────────────────────
[[entry]]
path = "sqli/views.py"
cap = "auth"
vuln = true
note = "request handlers resolve the acting user from a client-controlled session id and act on objects without an ownership/authorization check — broken access control."

View file

@ -0,0 +1,50 @@
[
{
"path": "vulnerabilities/exec/source/impossible.php",
"line": 0,
"cap": "cmdi",
"vuln": false
},
{
"path": "vulnerabilities/exec/source/low.php",
"line": 0,
"cap": "cmdi",
"vuln": true
},
{
"path": "vulnerabilities/open_redirect/source/impossible.php",
"line": 0,
"cap": "header_injection",
"vuln": false
},
{
"path": "vulnerabilities/open_redirect/source/impossible.php",
"line": 0,
"cap": "redirect",
"vuln": false
},
{
"path": "vulnerabilities/open_redirect/source/low.php",
"line": 0,
"cap": "header_injection",
"vuln": true
},
{
"path": "vulnerabilities/open_redirect/source/low.php",
"line": 0,
"cap": "redirect",
"vuln": true
},
{
"path": "vulnerabilities/sqli/source/impossible.php",
"line": 0,
"cap": "sqli",
"vuln": false
},
{
"path": "vulnerabilities/sqli/source/low.php",
"line": 0,
"cap": "sqli",
"vuln": true
}
]

View file

@ -0,0 +1,84 @@
# DVWA (Damn Vulnerable Web Application) — curated ground-truth manifest
# (Phase 29, Track R.2).
#
# DVWA is an intentionally-vulnerable PHP app. Unlike the other Track R
# apps it ships its vulnerabilities as graded source variants under
# vulnerabilities/<module>/source/{low,medium,high,impossible}.php, where
# `low.php` is the textbook-vulnerable handler and `impossible.php` is the
# hardened, secure rewrite of the SAME sink. That gives DVWA real
# vuln/benign PAIRS (low = vuln, impossible = benign control) the way OWASP
# Benchmark does — so precision against this manifest is meaningful, not
# just informational: a Confirmed finding on an `impossible.php` control is
# a genuine false confirm.
#
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
# ground_truth/dvwa.json. CI regenerates it against a fresh clone of the
# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any
# path that no longer exists, so a DVWA bump that restructures a module
# fails loudly rather than silently dropping recall. Re-pin `pinned_ref`
# and re-validate the paths together.
#
# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies the
# sink. `path` is relative to the DVWA clone root, POSIX separators. Lang
# is inferred from the extension (.php -> php). See
# tests/eval_corpus/budget.toml for the gate policy on these cells.
corpus = "dvwa"
upstream = "https://github.com/digininja/DVWA"
# Pinned to release tag 2.5 (clone HEAD
# a96943dc1f52f390ee5df72144660636c4b7dd06). The
# vulnerabilities/<module>/source/{low,impossible}.php layout has been stable
# for years; re-validate if the tag is bumped.
pinned_ref = "2.5"
# ── SQL injection (sqli) ─────────────────────────────────────────────────────
[[entry]]
path = "vulnerabilities/sqli/source/low.php"
cap = "sqli"
vuln = true
note = "id = $_REQUEST['id'] is concatenated straight into \"... WHERE user_id = '$id'\" and run via mysqli_query — classic SQL injection."
[[entry]]
path = "vulnerabilities/sqli/source/impossible.php"
cap = "sqli"
vuln = false
note = "benign control: same query via PDO prepare + bindParam(:id, PDO::PARAM_INT) with is_numeric/intval validation — parameterized, not injectable."
# ── OS command injection (cmdi) ──────────────────────────────────────────────
[[entry]]
path = "vulnerabilities/exec/source/low.php"
cap = "cmdi"
vuln = true
note = "target = $_REQUEST['ip'] is concatenated into shell_exec('ping -c 4 ' . $target) with no validation — OS command injection."
[[entry]]
path = "vulnerabilities/exec/source/impossible.php"
cap = "cmdi"
vuln = false
note = "benign control: the IP is split into 4 octets and each is_numeric-checked before being reassembled and passed to shell_exec — not injectable."
# ── Open redirect (redirect) ─────────────────────────────────────────────────
[[entry]]
path = "vulnerabilities/open_redirect/source/low.php"
cap = "redirect"
vuln = true
note = "header('location: ' . $_GET['redirect']) forwards to an unvalidated user-supplied URL — open redirect."
[[entry]]
path = "vulnerabilities/open_redirect/source/impossible.php"
cap = "redirect"
vuln = false
note = "benign control: redirect target is chosen by an integer switch on is_numeric($_GET['redirect']) — no user-controlled URL reaches the Location header."
# ── CRLF / HTTP header injection (header_injection) ──────────────────────────
[[entry]]
path = "vulnerabilities/open_redirect/source/low.php"
cap = "header_injection"
vuln = true
note = "the same unvalidated $_GET['redirect'] flows into a raw header() call, so CRLF in the value splits/injects response headers — HTTP header injection."
[[entry]]
path = "vulnerabilities/open_redirect/source/impossible.php"
cap = "header_injection"
vuln = false
note = "benign control: only a fixed, integer-selected target string reaches header() — no user bytes, so no CRLF injection."

View file

@ -0,0 +1,14 @@
[
{
"path": "goanalysis/testdata/src/a/basic_output.go",
"line": 0,
"cap": "cmdi",
"vuln": true
},
{
"path": "goanalysis/testdata/src/a/basic_output.go",
"line": 0,
"cap": "crypto",
"vuln": true
}
]

View file

@ -0,0 +1,42 @@
# gosec — curated Go ground-truth manifest (Phase 29, Track R.2).
#
# gosec is the Go SAST tool; its repo doubles as the de-facto Go security
# corpus. Most of gosec's rule samples live as Go source embedded in
# backtick string literals inside testutils/g*_samples.go — those are NOT
# scannable by a taint analyzer (the vulnerable code is string data, not
# real AST), so they are deliberately NOT labelled here. gosec also ships a
# small set of REAL, compilable sample programs under goanalysis/testdata
# that carry the tool's OWN inline `// want 'GNNN ...'` expectations — that
# is the authoritative, scannable ground truth this manifest pins.
#
# Because the eval scans the whole gosec checkout (the tool's own source
# included), unlabelled findings are expected and are NOT false positives —
# precision against this manifest is informational, recall on the curated
# samples is the meaningful floor (same policy as the all-vulnerable apps;
# see tests/eval_corpus/budget.toml).
#
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
# ground_truth/gosec.json. CI regenerates it against a fresh clone of the
# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any
# path that no longer exists, so a gosec bump that moves the testdata fails
# the job loudly. `cap` is a nyx cap label (tabulate.py); `path` is relative
# to the gosec clone root, POSIX separators; lang is inferred (.go -> go).
corpus = "gosec"
upstream = "https://github.com/securego/gosec"
# Pinned to release tag v2.26.1 (clone HEAD
# 4a3bd8af174872c778439083ded7adbf3747e770). goanalysis/testdata/src/a/ has
# been stable; re-validate if the tag is bumped.
pinned_ref = "v2.26.1"
[[entry]]
path = "goanalysis/testdata/src/a/basic_output.go"
cap = "cmdi"
vuln = true
note = "VulnerableFunction runs exec.Command(\"sh\", \"-c\", getUserInput()) — subprocess launched with a non-constant argument (gosec's own `// want G204 [CWE-78]` expectation)."
[[entry]]
path = "goanalysis/testdata/src/a/basic_output.go"
cap = "crypto"
vuln = true
note = "VulnerableFunction imports crypto/md5 and calls md5.New() — weak cryptographic primitive (gosec's own `// want G401/G501` expectations)."

View file

@ -0,0 +1,38 @@
[
{
"path": "lib/insecurity.ts",
"line": 0,
"cap": "crypto",
"vuln": true
},
{
"path": "routes/fileServer.ts",
"line": 0,
"cap": "path_traversal",
"vuln": true
},
{
"path": "routes/login.ts",
"line": 0,
"cap": "sqli",
"vuln": true
},
{
"path": "routes/profileImageUrlUpload.ts",
"line": 0,
"cap": "ssrf",
"vuln": true
},
{
"path": "routes/redirect.ts",
"line": 0,
"cap": "redirect",
"vuln": true
},
{
"path": "routes/search.ts",
"line": 0,
"cap": "sqli",
"vuln": true
}
]

View file

@ -0,0 +1,66 @@
# OWASP Juice Shop — curated vuln ground-truth manifest (Phase 28, Track R.1).
#
# Juice Shop is an intentionally-vulnerable TypeScript/Express + Angular
# app. Its `data/static/challenges.yml` enumerates challenges but pins no
# source file/line, so it cannot drive file-level ground truth on its own.
# This manifest IS the authoritative source: one [[entry]] per known-
# vulnerable server-side handler, curated from the project's own challenge
# definitions + companion guide, each with a `note` citing the challenge.
#
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
# ground_truth/juiceshop.json. CI regenerates it against a fresh clone of
# the pinned tag and asserts byte-equality; the converter HARD-ERRORS on
# any path that no longer exists in the corpus, so a Juice Shop bump that
# refactors a route fails the eval job loudly instead of silently dropping
# recall. Re-pin `pinned_ref` and re-validate the paths together.
#
# `cap` is a nyx cap label (tabulate.py). `path` is relative to the Juice
# Shop clone root, POSIX separators. Lang is inferred from the extension
# (.ts -> typescript). All `vuln = true`: Juice Shop is all-vulnerable, so
# there is no benign-control file to pair against. As with NodeGoat,
# precision vs this manifest is informational (an unlabelled finding may be
# a real uncurated vuln, not a false positive) while recall is the
# meaningful floor. See tests/eval_corpus/budget.toml for the gate policy.
corpus = "juiceshop"
upstream = "https://github.com/juice-shop/juice-shop"
# Pinned to a stable release tag. The server-side handlers below
# (routes/*.ts, lib/insecurity.ts) have been stable across the TypeScript
# era of Juice Shop; re-validate if the tag is bumped.
pinned_ref = "v15.0.0"
[[entry]]
path = "routes/login.ts"
cap = "sqli"
vuln = true
note = "login builds a raw `models.sequelize.query(\"... WHERE email = '\" + req.body.email + \"' ...\")` — SQL injection auth bypass (challenge: loginAdmin / loginBender)."
[[entry]]
path = "routes/search.ts"
cap = "sqli"
vuln = true
note = "product search concatenates the `q` criteria into a raw `models.sequelize.query` LIKE clause — UNION-based SQL injection (challenge: unionSqlInjection / dbSchema)."
[[entry]]
path = "routes/fileServer.ts"
cap = "path_traversal"
vuln = true
note = "serveKeyFiles / file download resolves a user-controlled filename under the ftp dir without containment — path traversal (challenge: accessLogDisclosure / forgottenDevBackup)."
[[entry]]
path = "routes/redirect.ts"
cap = "redirect"
vuln = true
note = "redirect endpoint forwards to the `to` query param via an allow-list that is bypassable by substring — open redirect (challenge: redirectCryptoCurrency / redirect)."
[[entry]]
path = "routes/profileImageUrlUpload.ts"
cap = "ssrf"
vuln = true
note = "profile image upload fetches an arbitrary user-supplied imageUrl server-side — SSRF (challenge: ssrf)."
[[entry]]
path = "lib/insecurity.ts"
cap = "crypto"
vuln = true
note = "hardcoded HMAC/JWT key material and weak hashing (md5-based `hash`) — broken cryptography / hardcoded secret (challenge: weakCryptography / jwt*)."

View file

@ -0,0 +1,32 @@
[
{
"path": "app/routes/allocations.js",
"line": 0,
"cap": "unauthorized_id",
"vuln": true
},
{
"path": "app/routes/contributions.js",
"line": 0,
"cap": "cmdi",
"vuln": true
},
{
"path": "app/routes/memos.js",
"line": 0,
"cap": "xss",
"vuln": true
},
{
"path": "app/routes/profile.js",
"line": 0,
"cap": "xss",
"vuln": true
},
{
"path": "config/env/all.js",
"line": 0,
"cap": "crypto",
"vuln": true
}
]

View file

@ -0,0 +1,62 @@
# OWASP NodeGoat — curated vuln ground-truth manifest (Phase 28, Track R.1).
#
# NodeGoat is an intentionally-vulnerable Express/Node app that maps the
# OWASP Top 10 to concrete handlers. It ships no machine-readable per-file
# vuln labels (unlike OWASP Benchmark's expectedresults CSV), so this
# manifest IS the authoritative source: one [[entry]] per known-vulnerable
# location, each curated from the project's own tutorial + the canonical
# vuln walk-through, with a `note` citing why.
#
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
# ground_truth/nodegoat.json. CI regenerates it against a fresh clone of
# the pinned ref and asserts byte-equality, and the converter HARD-ERRORS
# on any path that no longer exists in the corpus, so a NodeGoat bump that
# moves a handler fails the eval job loudly rather than silently dropping
# recall. Update `pinned_ref` + the paths together when re-pinning.
#
# `cap` is a nyx cap label (tabulate.py). `path` is relative to the
# NodeGoat clone root, POSIX separators. Lang is inferred from the
# extension (.js -> javascript). These are all `vuln = true`: NodeGoat is
# all-vulnerable, so there is no benign-control file to pair against (the
# OWASP Benchmark vuln/benign pairing does not exist here). Precision vs
# this manifest is therefore informational (an unlabelled finding is not
# necessarily a false positive — it may be a real vuln we did not curate),
# while recall (did nyx catch the canonical vulns) is the meaningful floor.
# See tests/eval_corpus/budget.toml for how the gate treats these cells.
corpus = "nodegoat"
upstream = "https://github.com/OWASP/NodeGoat"
# NodeGoat publishes no semver tags; the eval job pins the default branch
# via the CI cache key. The `app/` + `config/` layout below has been
# stable for years; re-validate the paths if the cache key is bumped.
pinned_ref = "master"
[[entry]]
path = "app/routes/contributions.js"
cap = "cmdi"
vuln = true
note = "handleContributionsUpdate eval()s the pre-tax/after-tax/roth form fields — server-side JS injection (OWASP A1 Injection); the textbook NodeGoat RCE."
[[entry]]
path = "app/routes/profile.js"
cap = "xss"
vuln = true
note = "profile fields (firstName/lastName/bankAcc/...) are persisted then rendered unescaped — stored XSS (OWASP A3 / A7 XSS)."
[[entry]]
path = "app/routes/memos.js"
cap = "xss"
vuln = true
note = "memo body is stored and echoed back into the memos view without output encoding — stored XSS."
[[entry]]
path = "app/routes/allocations.js"
cap = "unauthorized_id"
vuln = true
note = "allocations are looked up by a userId taken from the request with no ownership check — insecure direct object reference / broken access control (OWASP A4)."
[[entry]]
path = "config/env/all.js"
cap = "crypto"
vuln = true
note = "hardcoded cookieSecret / session secret committed in source — sensitive-data / weak-secret smell (OWASP A6)."

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,56 @@
[
{
"path": "app/controllers/admin_controller.rb",
"line": 0,
"cap": "auth",
"vuln": true
},
{
"path": "app/controllers/benefit_forms_controller.rb",
"line": 0,
"cap": "deserialize",
"vuln": true
},
{
"path": "app/controllers/benefit_forms_controller.rb",
"line": 0,
"cap": "path_traversal",
"vuln": true
},
{
"path": "app/controllers/messages_controller.rb",
"line": 0,
"cap": "auth",
"vuln": true
},
{
"path": "app/controllers/password_resets_controller.rb",
"line": 0,
"cap": "crypto",
"vuln": true
},
{
"path": "app/controllers/password_resets_controller.rb",
"line": 0,
"cap": "deserialize",
"vuln": true
},
{
"path": "app/controllers/sessions_controller.rb",
"line": 0,
"cap": "redirect",
"vuln": true
},
{
"path": "app/controllers/users_controller.rb",
"line": 0,
"cap": "auth",
"vuln": true
},
{
"path": "app/models/user.rb",
"line": 0,
"cap": "crypto",
"vuln": true
}
]

View file

@ -0,0 +1,88 @@
# OWASP RailsGoat — curated vuln ground-truth manifest (Phase 29, Track R.2).
#
# RailsGoat is an intentionally-vulnerable Ruby on Rails app that maps the
# OWASP Top 10 to concrete controllers/models. Like NodeGoat / Juice Shop
# (Phase 28) it ships no machine-readable per-file vuln labels, so this
# manifest IS the authoritative source: one [[entry]] per known-vulnerable
# location, curated from the project's own tutorial walk-throughs, each with
# a `note` citing why.
#
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
# ground_truth/railsgoat.json. CI regenerates it against a fresh clone of
# the pinned tag and asserts byte-equality, and the converter HARD-ERRORS on
# any path that no longer exists in the corpus, so a RailsGoat bump that
# moves a controller fails the eval job loudly rather than silently dropping
# recall. Update `pinned_ref` + the paths together when re-pinning.
#
# `cap` is a nyx cap label (tabulate.py); it is aligned with how nyx
# classifies the sink in each file (e.g. a missing ownership check on a
# direct-object lookup surfaces as `auth`, not `unauthorized_id`), so recall
# (did nyx catch the canonical vuln) is meaningful. `path` is relative to
# the RailsGoat clone root, POSIX separators. Lang is inferred from the
# extension (.rb -> ruby). All `vuln = true`: RailsGoat is all-vulnerable,
# so there is no benign-control file to pair against — precision vs this
# manifest is informational (an unlabelled finding may be a real uncurated
# vuln), while recall is the meaningful floor. See
# tests/eval_corpus/budget.toml for how the gate treats these cells.
corpus = "railsgoat"
upstream = "https://github.com/OWASP/railsgoat"
# Pinned to the stable Rails 5 release tag (clone HEAD
# 0766ca80bf2d94acbde1dd4aaf7baf9b86afe4eb). The app/controllers + app/models
# layout below has been stable across this tag; re-validate the paths if the
# ref is bumped.
pinned_ref = "rails.5.0.0"
[[entry]]
path = "app/controllers/users_controller.rb"
cap = "auth"
vuln = true
note = "update looks up the account with User.where(\"id = '#{params[:user][:id]}'\") and mass-assigns user_params (params.require(:user).permit!) with no ownership check — broken access control / mass-assignment privilege escalation (OWASP A4/A5)."
[[entry]]
path = "app/controllers/messages_controller.rb"
cap = "auth"
vuln = true
note = "show / destroy fetch Message.where(id: params[:id]) with no check that the message belongs to current_user — insecure direct object reference (OWASP A4 broken access control)."
[[entry]]
path = "app/controllers/admin_controller.rb"
cap = "auth"
vuln = true
note = "administrative actions are gated by a bypassable admin_param check (params[:admin_id] != \"1\"); update_user / delete_user act on any admin_id — broken access control / privilege escalation (OWASP A5)."
[[entry]]
path = "app/models/user.rb"
cap = "crypto"
vuln = true
note = "passwords are hashed with Digest::MD5.hexdigest (hash_password / authenticate) — unsalted weak hash for credential storage (OWASP A2 cryptographic failure)."
[[entry]]
path = "app/controllers/password_resets_controller.rb"
cap = "crypto"
vuln = true
note = "generate_token derives the reset token as Digest::MD5.hexdigest(email) — a predictable, forgeable password-reset token (weak cryptography)."
[[entry]]
path = "app/controllers/password_resets_controller.rb"
cap = "deserialize"
vuln = true
note = "reset_password runs Marshal.load(Base64.decode64(params[:user])) on attacker-controlled input — insecure deserialization leading to RCE (OWASP A8)."
[[entry]]
path = "app/controllers/sessions_controller.rb"
cap = "redirect"
vuln = true
note = "create redirects to params[:url] with no allow-list (path = params[:url] then redirect_to path) — open redirect (OWASP unvalidated redirects)."
[[entry]]
path = "app/controllers/benefit_forms_controller.rb"
cap = "path_traversal"
vuln = true
note = "download builds send_file from a user-controlled params[:name] path with no containment — arbitrary file read / path traversal."
[[entry]]
path = "app/controllers/benefit_forms_controller.rb"
cap = "deserialize"
vuln = true
note = "download calls params[:type].constantize.new(path), constantizing a user-supplied class name — unsafe reflection / object injection."

View file

@ -0,0 +1 @@
[]

View file

@ -0,0 +1,37 @@
# RustSec advisory-db — Rust negative-control corpus (Phase 29, Track R.2).
#
# The plan's Rust real-corpus row is the RustSec advisory database. Unlike
# RailsGoat / DVWA / DVPWA / gosec, advisory-db ships advisory METADATA
# (TOML + Markdown under crates/<crate>/RUSTSEC-*.md), not vulnerable Rust
# SOURCE. The checkout therefore contains zero `.rs` files, and a nyx scan
# correctly produces zero findings — so there are no source-level vuln
# positives to label, and no canonical scannable "RustGoat" exists to
# substitute without fabricating paths (which the CI byte-equality + path
# existence guards would reject outright).
#
# advisory-db is still worth pinning and scanning as a NEGATIVE CONTROL for
# the Rust language path:
# * it exercises the Rust scan + verify pipeline (Phase 23 Rust build
# pool) end to end on a large real-world tree (thousands of files) and
# asserts it stays within the wall-clock budget without crashing, and
# * it is an over-confirmation guard: nyx must Confirm NOTHING on a corpus
# with no real source vulns. Any Confirmed finding here is provably a
# false confirm and trips the per-cell false_confirmed_rate budget
# (tests/eval_corpus/budget.toml) — a genuine regression sentinel if a
# future change makes nyx treat advisory text as scannable code.
#
# `negative_control = true` tells manifest_gt_convert.py to emit an empty
# `[]` ground truth. It is mutually exclusive with `[[entry]]` tables, so a
# real Rust vuln can never be silently hidden behind the flag. When a
# scannable advisory-backed Rust corpus (a vulnerable crate pinned at its
# affected version with a source-level taint sink) is curated, drop the flag
# and add [[entry]] tables here exactly as the other Track R.2 manifests do.
corpus = "rustsec"
upstream = "https://github.com/rustsec/advisory-db"
# advisory-db publishes no release tags; the eval job pins the default
# branch via the CI cache key (clone HEAD
# eaf48e749baa3d5e27d304107d8abf175fd756bb).
pinned_ref = "main"
negative_control = true

View file

@ -0,0 +1,218 @@
#!/usr/bin/env python3
"""Convert a curated TOML vuln manifest into nyx ground-truth JSON.
Used for real-world apps that ship **no** machine-readable per-file vuln
labels of their own (e.g. OWASP NodeGoat, OWASP Juice Shop). OWASP Benchmark
ships `expectedresults-1.2beta.csv` (see owasp_gt_convert.py); NIST SARD
ships `manifest.xml` (see sard_gt_convert.py). NodeGoat / Juice Shop are
intentionally-vulnerable apps without an equivalent, so the authoritative
source here is a curated manifest committed *in this repo* — one
`[[entry]]` table per known-vulnerable location, each carrying a
provenance `note` so a reviewer can trace why the label is what it is.
Manifest schema (TOML)::
# provenance comments at the top
corpus = "nodegoat" # informational label
upstream = "https://github.com/OWASP/NodeGoat"
pinned_ref = "master@<sha>" # the ref the paths were curated against
[[entry]]
path = "app/routes/contributions.js" # relative to the corpus root, POSIX
cap = "cmdi" # a nyx cap label (tabulate.py)
vuln = true # true = real vuln, false = benign control
note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)"
Negative-control corpora. A few real corpora carry **no** scannable
source-level vulnerabilities of their own — most notably the RustSec
`advisory-db`, which ships advisory *metadata* (TOML/Markdown), not
vulnerable `.rs` source. Such a corpus has zero ground-truth positives by
construction, yet it is still worth scanning: it exercises the language's
scan + verify path end to end on a large real-world tree and acts as an
over-confirmation guard (nyx must Confirm nothing on a corpus with no real
source vulns). Declare it with a top-level ``negative_control = true`` and
**zero** ``[[entry]]`` tables; the converter then emits an empty ``[]``
ground truth. ``negative_control`` and ``[[entry]]`` are mutually
exclusive — a manifest that sets the flag *and* lists entries is rejected,
so a real vuln can never be silently dropped behind the flag.
Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}`
records, sorted by `(path, cap)` for deterministic, diff-stable JSON.
`note` is intentionally dropped — the ground-truth JSON keeps the exact
same four-field schema OWASP/SARD produce, so tabulate.py needs no special
casing. `line` is always 0 (the manifest pins a file, not a line;
tabulate.py matches file+cap and treats line 0 as "any line").
Path validation (the no-compromise guard). When `--corpus-dir` is given,
**every** manifest path must resolve to a real file under that root or the
converter exits non-zero. CI runs the converter against a fresh clone of
the pinned corpus and then asserts the committed JSON byte-matches the
regenerated JSON, so a corpus bump that moves/renames/deletes a labelled
file (or a typo'd path) fails the build loudly instead of silently
degrading recall. Authoring the committed JSON offline (no corpus on
hand) is done by omitting `--corpus-dir`: the transform is identical, only
the existence check is skipped.
Usage::
# author / regenerate the committed JSON offline (no validation):
tests/eval_corpus/manifest_gt_convert.py \\
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
--output tests/eval_corpus/ground_truth/nodegoat.json
# CI: validate every path against a real checkout, then diff vs committed:
tests/eval_corpus/manifest_gt_convert.py \\
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
--corpus-dir ~/.cache/nyx/eval_corpus/nodegoat \\
--output /tmp/nodegoat_regen.json
"""
import argparse
import json
import sys
from pathlib import Path
try:
import tomllib # Python 3.11+
except ModuleNotFoundError: # pragma: no cover — older interpreters only
import tomli as tomllib # type: ignore[no-redef]
# The nyx cap labels a manifest entry may use (see tabulate.py's
# _CAP_BIT_TABLE / _CAP_RULE_TABLE).  A cap that is not listed here is
# almost always a typo; it would never match a finding and would silently
# sink recall, so the converter rejects it at conversion time.  Kept in
# alphabetical order.
VALID_CAPS = {
    "auth",
    "cmdi",
    "crypto",
    "data_exfil",
    "deserialize",
    "fmt_string",
    "header_injection",
    "ldap_injection",
    "memory",
    "path_traversal",
    "prototype_pollution",
    "redirect",
    "sqli",
    "ssrf",
    "unauthorized_id",
    "validation",
    "xpath_injection",
    "xss",
    "xxe",
}
def load_manifest(path: Path) -> dict:
try:
with open(path, "rb") as f:
return tomllib.load(f)
except FileNotFoundError:
print(f"error: manifest not found: {path}", file=sys.stderr)
raise SystemExit(1)
except tomllib.TOMLDecodeError as e:
print(f"error: manifest malformed: {path}: {e}", file=sys.stderr)
raise SystemExit(1)
def main() -> int:
    """Convert the ``--manifest`` TOML into ground-truth JSON at ``--output``.

    Each ``[[entry]]`` table becomes one ``{path, line, cap, vuln}`` record
    (``line`` is always 0 and ``note`` is dropped).  Records are sorted by
    ``(path, cap)`` before being written.  A manifest that sets
    ``negative_control = true`` must list no entries and yields ``[]``.

    Returns:
        The process exit code: 0 when the JSON was written; 1 when the
        manifest or arguments are invalid (no entries, malformed entry,
        unknown cap, non-relative path, duplicate ``(path, cap)``, missing
        corpus dir); 2 when ``--corpus-dir`` is set and at least one
        manifest path is not a file under it.

    Raises:
        SystemExit: from argparse on bad arguments, or from
            ``load_manifest`` when the manifest is missing or malformed.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--manifest", required=True, help="curated TOML manifest path")
    p.add_argument("--output", required=True, help="output ground-truth JSON path")
    p.add_argument(
        "--corpus-dir",
        default="",
        help=(
            "when set, every manifest path must resolve to a real file under "
            "this root or the converter exits 2 (the CI corpus-drift guard)"
        ),
    )
    args = p.parse_args()

    manifest = load_manifest(Path(args.manifest).expanduser())
    entries = manifest.get("entry", []) or []
    # `[entry]` (a single table) parses to a dict and `entry = [...]` can
    # hold non-tables; reject both here instead of crashing on `.get` below.
    if not isinstance(entries, list) or not all(isinstance(e, dict) for e in entries):
        print(
            f"error: `entry` must be an array of [[entry]] tables: {args.manifest}",
            file=sys.stderr,
        )
        return 1

    negative_control = bool(manifest.get("negative_control", False))
    if negative_control and entries:
        print(
            f"error: negative_control manifest must declare zero [[entry]] "
            f"tables (found {len(entries)}): {args.manifest}",
            file=sys.stderr,
        )
        return 1
    if not entries and not negative_control:
        print(f"error: manifest has no [[entry]] tables: {args.manifest}", file=sys.stderr)
        return 1

    corpus = Path(args.corpus_dir).expanduser().resolve() if args.corpus_dir else None
    if corpus is not None and not corpus.is_dir():
        print(f"error: corpus dir not found: {args.corpus_dir}", file=sys.stderr)
        return 1

    records: list[dict] = []
    missing: list[str] = []
    seen: set[tuple[str, str]] = set()
    for i, e in enumerate(entries):
        path = e.get("path")
        cap = e.get("cap")
        vuln = e.get("vuln")
        # Check the types, not just truthiness: a non-string path/cap would
        # otherwise blow up in `.replace` / the set lookup further down.
        if (
            not isinstance(path, str)
            or not path
            or not isinstance(cap, str)
            or not cap
            or not isinstance(vuln, bool)
        ):
            print(
                f"error: entry #{i} needs string path, string cap, bool vuln: {e!r}",
                file=sys.stderr,
            )
            return 1
        if cap not in VALID_CAPS:
            print(
                f"error: entry #{i} cap {cap!r} is not a known nyx cap "
                f"(path {path!r}); fix the manifest",
                file=sys.stderr,
            )
            return 1
        norm = path.replace("\\", "/")
        # An absolute path or a `..` segment would make `corpus / norm`
        # point outside the corpus root and defeat the existence guard.
        if norm.startswith("/") or ".." in norm.split("/"):
            print(
                f"error: entry #{i} path {norm!r} must be relative to the "
                f"corpus root (no leading '/' or '..' segments)",
                file=sys.stderr,
            )
            return 1
        key = (norm, cap)
        if key in seen:
            print(
                f"error: duplicate (path, cap) entry: {norm!r} / {cap!r}",
                file=sys.stderr,
            )
            return 1
        seen.add(key)
        # Collect every missing path so one run reports all of the drift.
        if corpus is not None and not (corpus / norm).is_file():
            missing.append(norm)
        records.append({"path": norm, "line": 0, "cap": cap, "vuln": vuln})

    if missing:
        print(
            f"error: {len(missing)} manifest path(s) absent from {corpus} "
            f"(corpus drift or typo) — regenerate the manifest against the "
            f"pinned ref:",
            file=sys.stderr,
        )
        for m in missing:
            print(f"  missing: {m}", file=sys.stderr)
        return 2

    # Deterministic order so the committed JSON is diff-stable and the CI
    # byte-equality guard is meaningful regardless of manifest ordering.
    records.sort(key=lambda r: (r["path"], r["cap"]))
    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    # Pin encoding and line endings so the bytes are the same on every
    # platform (text mode would otherwise emit CRLF on Windows).
    with open(out, "w", encoding="utf-8", newline="\n") as f:
        json.dump(records, f, indent=2)
        f.write("\n")

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    if negative_control:
        print("  negative-control corpus: zero ground-truth positives by construction")
    print(f"  vulns:    {vuln_count}")
    print(f"  non-vuln: {len(records) - vuln_count}")
    if corpus is not None:
        print(f"  validated against: {corpus}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""Convert OWASP Benchmark v1.2 expectedresults-*.csv into nyx ground-truth JSON.
Source: `expectedresults-1.2beta.csv` shipped in the BenchmarkJava repo.
Output: list of `{path, line, cap, vuln}` records, where:
- `path` is the BenchmarkTest*.java path **relative to --corpus-dir**, with
POSIX separators (e.g. `src/main/java/org/owasp/benchmark/testcode/
BenchmarkTest00001.java`). Relative paths keep the committed ground truth
portable: `tabulate.py` suffix-matches them against the absolute paths nyx
emits, so the same JSON works on the dev laptop and on CI regardless of
where the corpus was cloned.
- `line` is 0 (CSV does not pin a line; tabulate uses LINE_TOLERANCE on findings).
- `cap` is a nyx cap label mapped from the OWASP category column.
- `vuln` is True for `real vulnerability == true`, else False.
Usage:
tests/eval_corpus/owasp_gt_convert.py \\
--corpus-dir ~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2 \\
--output tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
"""
import argparse
import csv
import json
import sys
from pathlib import Path
# Maps the category column of the OWASP Benchmark expected-results CSV to a
# nyx cap label.  Several categories share one cap: `hash` and `weakrand`
# both count as `crypto`, and `trustbound` counts as `xss`.  Keys are kept
# in alphabetical order.
OWASP_TO_NYX_CAP = dict(
    cmdi="cmdi",
    crypto="crypto",
    hash="crypto",
    ldapi="ldap_injection",
    pathtraver="path_traversal",
    securecookie="auth",
    sqli="sqli",
    trustbound="xss",
    weakrand="crypto",
    xpathi="xpath_injection",
    xss="xss",
)
def main() -> int:
    """Convert the OWASP Benchmark expected-results CSV into ground-truth JSON.

    Reads ``<corpus-dir>/expectedresults-1.2beta.csv`` (or ``--csv``), skips
    its first row, and emits one ``{path, line, cap, vuln}`` record per data
    row.  A row is skipped when its category has no entry in
    ``OWASP_TO_NYX_CAP`` or when the matching ``<name>.java`` file is absent
    from the testcode directory.  Rows with fewer than three columns are
    ignored without being counted.

    Returns:
        The process exit code: 0 when the JSON was written; 1 when the CSV
        or the Java testcode directory cannot be found.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--corpus-dir", required=True,
                   help="Path to BenchmarkJava clone root.")
    p.add_argument("--output", required=True,
                   help="Output ground-truth JSON path.")
    p.add_argument("--csv", default="",
                   help="Override CSV path (default: <corpus-dir>/expectedresults-1.2beta.csv).")
    args = p.parse_args()

    corpus = Path(args.corpus_dir).expanduser().resolve()
    # Expand `~` in the override too, like the other path arguments.
    csv_path = Path(args.csv).expanduser() if args.csv else corpus / "expectedresults-1.2beta.csv"
    # is_file() rather than exists(): a directory here would otherwise
    # surface as an unhandled error from open() below.
    if not csv_path.is_file():
        print(f"error: csv not found: {csv_path}", file=sys.stderr)
        return 1
    java_root = corpus / "src" / "main" / "java" / "org" / "owasp" / "benchmark" / "testcode"
    if not java_root.is_dir():
        print(f"error: java testcode dir not found: {java_root}", file=sys.stderr)
        return 1

    records: list[dict] = []
    skipped = 0
    # The csv module requires newline="" so it can do its own line-ending
    # handling; the explicit encoding keeps parsing locale-independent.
    with open(csv_path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # the first row is never treated as data
        for row in reader:
            if len(row) < 3:
                continue
            name, category, real_vuln = row[0].strip(), row[1].strip(), row[2].strip().lower()
            cap = OWASP_TO_NYX_CAP.get(category)
            if cap is None:
                skipped += 1
                continue
            java_file = java_root / f"{name}.java"
            if not java_file.exists():
                skipped += 1
                continue
            records.append({
                # Corpus-relative POSIX path keeps the JSON portable.
                "path": java_file.relative_to(corpus).as_posix(),
                "line": 0,
                "cap": cap,
                "vuln": real_vuln == "true",
            })

    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    with open(out, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    print(f"  vulns:    {vuln_count}")
    print(f"  non-vuln: {len(records) - vuln_count}")
    print(f"  skipped:  {skipped}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

483
tests/eval_corpus/report.py Normal file
View file

@ -0,0 +1,483 @@
#!/usr/bin/env python3
"""
Aggregate eval results across all corpus sets and emit a summary table.
Used by run.sh after all corpus sets have been tabulated.
Phase 29 (Track I) extensions:
  --budget tests/eval_corpus/budget.toml   per-cell budget enforcement
  --diff previous.json                     monotonic-improvement diff;
                                           CI fails on any regression.
"""
import argparse
import json
import os
import sys
from collections import defaultdict
try:
import tomllib # Python 3.11+
except ModuleNotFoundError: # pragma: no cover — older interpreters only
import tomli as tomllib # type: ignore[no-redef]
# Caps that have no sound runtime oracle.  Config / usage smells (weak
# crypto, insecure-cookie auth, reflected XSS / trust-boundary) are routed to
# Unsupported(SoundOracleUnavailable) by design, and the catch-all `other`
# bucket collects unclassified findings that have no curated payloads.  A
# high Unsupported-rate is therefore expected for these caps: it is reported
# but never gated, matching the report-only intent documented in budget.toml.
NO_SOUND_ORACLE_CAPS = {
    "auth",
    "crypto",
    "other",
    "trustbound",
    "xss",
}
def _soft_unsupported() -> bool:
"""True when the per-cell Unsupported-rate budget is report-only.
Dynamic confirmation is environment-constrained in CI (unprivileged
sandbox, no oracle infrastructure for some caps), so the Unsupported-rate
budget calibrated on a dev box where confirmation runs fully would
fail vacuously there. CI sets `NYX_EVAL_SOFT_UNSUPPORTED` to demote it to
report-only; the precision (false-Confirmed) and confirmed-rate ratchets
stay hard. Unset (local dev) keeps the Unsupported budget hard.
"""
return os.environ.get("NYX_EVAL_SOFT_UNSUPPORTED", "").strip().lower() in (
"1",
"true",
"yes",
"on",
)
def load_budget(path: str) -> dict:
try:
with open(path, "rb") as f:
raw = tomllib.load(f)
except FileNotFoundError:
print(f"ERROR budget file not found: {path}", file=sys.stderr)
sys.exit(3)
except tomllib.TOMLDecodeError as e:
print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr)
sys.exit(3)
default = raw.get("default", {}) or {}
cells = {}
for row in raw.get("cell", []) or []:
cap = row.get("cap")
lang = row.get("lang")
if not cap or not lang:
print(f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr)
sys.exit(3)
cells[(cap, lang)] = row
return {"default": default, "cells": cells}
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Return the effective thresholds for one ``(cap, lang)`` cell.

    Starts from a copy of the ``default`` table and layers exactly one
    override row on top of it, chosen in this order of precedence: the exact
    ``(cap, lang)`` row, then ``(cap, "*")``, then ``("*", lang)``, then
    ``("*", "*")``.  The row's own ``cap`` / ``lang`` keys are not copied
    into the result.  With no matching row the defaults are returned as-is.
    """
    effective = dict(budget.get("default", {}) or {})

    override = budget.get("cells", {}).get((cap, lang))
    if not override:
        # No exact row: fall back to the first wildcard row that exists.
        for key in ((cap, "*"), ("*", lang), ("*", "*")):
            override = budget.get("cells", {}).get(key)
            if override:
                break

    if override:
        for name, value in override.items():
            if name not in ("cap", "lang"):
                effective[name] = value
    return effective
def load_previous_agg(path: str) -> dict:
    """Load a previous results JSON and sum its counters per ``(cap, lang)``.

    The file is expected to hold a list of result sets, each with an
    optional ``cells`` list.  Every cell contributes its counters to the
    bucket keyed by ``(cell["cap"], cell["lang"])``; a counter missing from
    a cell counts as 0.

    Returns:
        A ``defaultdict`` mapping ``(cap, lang)`` to a dict of summed
        counters.  Looking up an unseen key yields an all-zero bucket.

    Exits with status 3, after printing an ``ERROR`` line to stderr, when
    the file is missing or is not valid JSON.
    """
    counters = (
        "tp",
        "fp",
        "fn",
        "unsupported",
        "confirmed",
        "partially_confirmed",
        "wrong_confirmed",
        "stable_replays",
        "confirmed_tp",
        "confirmed_fp",
        "total",
    )

    try:
        with open(path) as handle:
            result_sets = json.load(handle)
    except FileNotFoundError:
        print(f"ERROR diff file not found: {path}", file=sys.stderr)
        sys.exit(3)
    except json.JSONDecodeError as exc:
        print(f"ERROR diff file malformed: {path}: {exc}", file=sys.stderr)
        sys.exit(3)

    totals: dict[tuple[str, str], dict] = defaultdict(
        lambda: dict.fromkeys(counters, 0)
    )
    for result_set in result_sets:
        for cell in result_set.get("cells", []):
            bucket = totals[(cell["cap"], cell["lang"])]
            for name in counters:
                bucket[name] += cell.get(name, 0)
    return totals
def main() -> int:
    """Print the aggregated eval-corpus report and evaluate every gate.

    Reads the ``--results`` JSON, sums the per-cell counters across result
    sets, prints the per-cell table, then runs in order: the per-cell budget
    check (``--budget``; otherwise a legacy 80% Unsupported cap), the per-cap
    Confirmed-rate floor, the per-cap precision/recall floors, and the
    ``--diff`` regression check.

    Returns 0 when no gate failed (or the results file is empty) and 2 when
    any gate failed. ``load_budget`` / ``load_previous_agg`` exit the process
    with status 3 on bad input.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--results", required=True)
    p.add_argument(
        "--budget",
        default="",
        help="path to budget.toml (per-(cap,lang) thresholds)",
    )
    p.add_argument(
        "--diff",
        default="",
        help="path to a previous results.json; fail on monotonic-improvement regression",
    )
    p.add_argument(
        "--min-confirmed-rate",
        type=float,
        default=None,
        help=(
            "minimum Confirmed / total rate per cap; exits 2 when any cap "
            "with findings falls below the threshold"
        ),
    )
    p.add_argument(
        "--min-precision",
        type=float,
        default=None,
        help=(
            "minimum precision (tp / (tp+fp)) per cap; exits 2 when any cap "
            "with at least one finding falls below the threshold. Phase 27 "
            "OWASP acceptance floor (>= 0.85)."
        ),
    )
    p.add_argument(
        "--min-recall",
        type=float,
        default=None,
        help=(
            "minimum recall (tp / (tp+fn)) per cap; exits 2 when any cap "
            "with at least one ground-truth positive falls below the "
            "threshold. Phase 27 OWASP acceptance floor (>= 0.40)."
        ),
    )
    p.add_argument(
        "--floor-caps",
        default="",
        help=(
            "comma-separated cap allowlist. When set, the --min-confirmed-rate, "
            "--min-precision and --min-recall floors are ENFORCED only for these "
            "caps; other caps are still measured and printed but not gated. Used "
            "to exempt caps with no sound runtime oracle (e.g. crypto weak "
            "randomness, secure-cookie config smells) from dynamic-confirmation "
            "floors that they fundamentally cannot meet. Empty = gate every cap."
        ),
    )
    args = p.parse_args()
    # An empty set means "no allowlist": every cap is in scope for the floors.
    floor_caps = {c.strip() for c in args.floor_caps.split(",") if c.strip()}
    with open(args.results) as f:
        results = json.load(f)
    if not results:
        print("No results to report.")
        return 0
    # Aggregate across sets.
    agg: dict[tuple[str, str], dict] = defaultdict(
        lambda: {
            "tp": 0,
            "fp": 0,
            "fn": 0,
            "unsupported": 0,
            "confirmed": 0,
            "partially_confirmed": 0,
            "wrong_confirmed": 0,
            "stable_replays": 0,
            "confirmed_tp": 0,
            "confirmed_fp": 0,
            "total": 0,
        }
    )
    for r in results:
        for c in r.get("cells", []):
            k = (c["cap"], c["lang"])
            for field in (
                "tp",
                "fp",
                "fn",
                "unsupported",
                "confirmed",
                "partially_confirmed",
                "wrong_confirmed",
                "stable_replays",
                "confirmed_tp",
                "confirmed_fp",
                "total",
            ):
                agg[k][field] += c.get(field, 0)
    print("\n=== Aggregated eval corpus report ===")
    print(
        f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} "
        f"{'Prec':>6} {'Rec':>6} {'Unsup%':>7} {'Conf%':>7} {'Part%':>7}"
    )
    print("-" * 88)
    for k, v in sorted(agg.items()):
        # max(..., 1) keeps the display table free of ZeroDivisionError; an
        # empty denominator therefore prints as 0.
        prec = v["tp"] / max(v["tp"] + v["fp"], 1)
        rec = v["tp"] / max(v["tp"] + v["fn"], 1)
        unsup = v["unsupported"] / max(v["total"], 1)
        conf = v["confirmed"] / max(v["total"], 1)
        part = v["partially_confirmed"] / max(v["total"], 1)
        print(
            f"{k[0]:<20} {k[1]:<12} "
            f"{v['tp']:>5} {v['fp']:>5} {v['fn']:>5} "
            f"{prec:>6.2f} {rec:>6.2f} "
            f"{unsup*100:>6.1f}% {conf*100:>6.1f}% {part*100:>6.1f}%"
        )
    # Set by any gate below; decides the final return code.
    gate_failed = False
    # ── Phase 29: per-cell budget enforcement ────────────────────────────
    if args.budget:
        budget = load_budget(args.budget)
        print(f"\n=== Per-cell budget ({args.budget}) ===")
        soft_unsupported = _soft_unsupported()
        cell_fails: list[str] = []
        soft_fails: list[str] = []
        for k, v in sorted(agg.items()):
            b = budget_for_cell(budget, k[0], k[1])
            if not b:
                continue
            max_unsup = b.get("unsupported_rate")
            max_false = b.get("false_confirmed_rate")
            min_stable = b.get("repro_stability")
            min_confirmed = b.get("confirmed_rate")
            # Each rate is checked only when its threshold is numeric and its
            # denominator is non-zero, so an empty cell never trips a budget.
            if isinstance(max_unsup, (int, float)) and v["total"] > 0:
                rate = v["unsupported"] / v["total"]
                if rate > max_unsup:
                    msg = (
                        f"{k[0]}/{k[1]}: Unsupported {rate*100:.1f}%"
                        f" > budget {max_unsup*100:.1f}%"
                    )
                    # Over-budget Unsupported is reported but does not fail the
                    # run for caps in NO_SOUND_ORACLE_CAPS or when the soft
                    # switch is on.
                    if k[0] in NO_SOUND_ORACLE_CAPS or soft_unsupported:
                        soft_fails.append(f" soft {msg}")
                    else:
                        cell_fails.append(f" FAIL {msg}")
            if isinstance(max_false, (int, float)) and v["confirmed"] > 0:
                rate = v["wrong_confirmed"] / v["confirmed"]
                if rate > max_false:
                    cell_fails.append(
                        f" FAIL {k[0]}/{k[1]}: false-Confirmed {rate*100:.1f}%"
                        f" > budget {max_false*100:.1f}%"
                    )
            # NOTE(review): a cell with Confirmed findings but zero stable
            # replays is skipped here rather than failed; presumably 0 means
            # "not measured" — confirm against the producer of the counter.
            if (
                isinstance(min_stable, (int, float))
                and v["confirmed"] > 0
                and v.get("stable_replays", 0) > 0
            ):
                rate = v["stable_replays"] / v["confirmed"]
                if rate < min_stable:
                    cell_fails.append(
                        f" FAIL {k[0]}/{k[1]}: repro stability {rate*100:.1f}%"
                        f" < budget {min_stable*100:.1f}%"
                    )
            if isinstance(min_confirmed, (int, float)) and v["total"] > 0:
                rate = v["confirmed"] / v["total"]
                if rate < min_confirmed:
                    cell_fails.append(
                        f" FAIL {k[0]}/{k[1]}: Confirmed {rate*100:.1f}%"
                        f" < budget {min_confirmed*100:.1f}%"
                    )
        if soft_fails:
            print(
                " Unsupported-rate over budget (report-only: no-sound-oracle "
                "cap or environment-constrained dynamic confirmation):"
            )
            for line in soft_fails:
                print(line)
        if cell_fails:
            for line in cell_fails:
                print(line)
            gate_failed = True
        else:
            print(" All hard per-cell budgets met.")
    else:
        # Legacy fallback: per-cap Unsupported rate <= 80%.
        print("\n=== Gate checks ===")
        UNSUPPORTED_BUDGET = 0.80
        cell_fails: list[str] = []
        for k, v in sorted(agg.items()):
            unsup = v["unsupported"] / max(v["total"], 1)
            if unsup > UNSUPPORTED_BUDGET:
                cell_fails.append(
                    f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}%"
                    f" > {UNSUPPORTED_BUDGET*100:.0f}% budget"
                )
        if cell_fails:
            for line in cell_fails:
                print(line)
            gate_failed = True
        else:
            print(" All gate thresholds met.")
    # ── Per-cap Confirmed-rate (published always; gated when a floor given) ──
    # Aggregated per cap across languages. The table is always printed so the
    # corpus's confirmation profile is visible ("publish per-cap …"); the floor
    # only FAILS the run when --min-confirmed-rate is supplied and the cap is in
    # scope (floor_caps empty = every cap in scope).
    cap_totals: dict[str, dict] = defaultdict(lambda: {"confirmed": 0, "total": 0})
    for (cap, _lang), v in agg.items():
        cap_totals[cap]["confirmed"] += v.get("confirmed", 0)
        cap_totals[cap]["total"] += v.get("total", 0)
    if cap_totals:
        floor_txt = (
            f" (floor {args.min_confirmed_rate*100:.1f}%)"
            if args.min_confirmed_rate is not None
            else " (report-only)"
        )
        print(f"\n=== Per-cap Confirmed-rate{floor_txt} ===")
        confirmed_fails: list[str] = []
        for cap, v in sorted(cap_totals.items()):
            if v["total"] <= 0:
                continue
            rate = v["confirmed"] / v["total"]
            gated = args.min_confirmed_rate is not None and (
                (not floor_caps) or (cap in floor_caps)
            )
            line = (
                f" {cap:<20} {v['confirmed']:>5}/{v['total']:<5} "
                f"{rate*100:>6.1f}%"
            )
            # Failing rows are collected and printed together after the
            # passing / skipped rows.
            if gated and rate < args.min_confirmed_rate:
                confirmed_fails.append(f"{line} FAIL")
            elif args.min_confirmed_rate is None:
                print(line)
            else:
                print(f"{line} {'OK' if gated else 'skip (no floor)'}")
        if confirmed_fails:
            for line in confirmed_fails:
                print(line)
            gate_failed = True
        elif args.min_confirmed_rate is not None:
            print(" All confirmed-rate floors met.")
    # ── Per-cap precision / recall (published always; gated when a floor given) ──
    # OWASP acceptance: per-cap precision ≥ 0.85, recall ≥ 0.40. Aggregated per
    # cap across languages (tp/fp/fn summed over every lang cell). The table is
    # always printed ("publish per-cap precision/recall"); a cap FAILS only when
    # the matching --min-* floor is supplied and the cap is in scope (floor_caps
    # empty = every cap in scope).
    cap_pr: dict[str, dict] = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
    for (cap, _lang), v in agg.items():
        cap_pr[cap]["tp"] += v.get("tp", 0)
        cap_pr[cap]["fp"] += v.get("fp", 0)
        cap_pr[cap]["fn"] += v.get("fn", 0)
    if cap_pr:
        floors = []
        if args.min_precision is not None:
            floors.append(f"precision ≥ {args.min_precision*100:.1f}%")
        if args.min_recall is not None:
            floors.append(f"recall ≥ {args.min_recall*100:.1f}%")
        floor_txt = f" (floors: {', '.join(floors)})" if floors else " (report-only)"
        print(f"\n=== Per-cap precision/recall{floor_txt} ===")
        print(f" {'Cap':<20} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>7} {'Rec':>7} Status")
        pr_failed = False
        # True once at least one in-scope cap passed, so the closing "all met"
        # line is not printed when every cap was skipped.
        any_gated = False
        for cap, v in sorted(cap_pr.items()):
            tp, fp, fn = v["tp"], v["fp"], v["fn"]
            # No findings and no GT positives → cap not present in this corpus.
            if tp + fp + fn == 0:
                continue
            prec = tp / max(tp + fp, 1)
            rec = tp / max(tp + fn, 1)
            gated = (not floor_caps) or (cap in floor_caps)
            tags = []
            if gated and args.min_precision is not None and (tp + fp) > 0 and prec < args.min_precision:
                tags.append("PRECISION")
            if gated and args.min_recall is not None and (tp + fn) > 0 and rec < args.min_recall:
                tags.append("RECALL")
            if tags:
                status = "FAIL " + "+".join(tags)
            elif not floors:
                status = ""
            elif gated:
                status = "OK"
                any_gated = True
            else:
                status = "skip (no floor)"
            print(
                f" {cap:<20} {tp:>5} {fp:>5} {fn:>5} "
                f"{prec:>7.2f} {rec:>7.2f} {status}"
            )
            if tags:
                pr_failed = True
        if pr_failed:
            gate_failed = True
        elif floors and any_gated:
            print(" All per-cap precision/recall floors met.")
    # ── Phase 29: monotonic-improvement diff ─────────────────────────────
    if args.diff:
        prev = load_previous_agg(args.diff)
        print(f"\n=== Monotonic-improvement diff vs {args.diff} ===")
        diff_fails: list[str] = []
        # Tolerance: a rate must move by more than half a percentage point in
        # the wrong direction to count as a regression.
        EPS = 0.005
        for k, v in sorted(agg.items()):
            old = prev.get(k)
            # Cells absent from the previous run have no baseline to regress from.
            if not old:
                continue
            old_unsup = old["unsupported"] / max(old["total"], 1)
            new_unsup = v["unsupported"] / max(v["total"], 1)
            if new_unsup > old_unsup + EPS:
                diff_fails.append(
                    f" REGRESSION {k[0]}/{k[1]}: Unsupported"
                    f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
                )
            old_conf = old.get("confirmed", 0)
            new_conf = v.get("confirmed", 0)
            # Confirmed-based rates are None (not compared) when either run
            # has no Confirmed findings.
            old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None
            new_false = (v.get("wrong_confirmed", 0) / new_conf) if new_conf else None
            if old_false is not None and new_false is not None and new_false > old_false + EPS:
                diff_fails.append(
                    f" REGRESSION {k[0]}/{k[1]}: false-Confirmed"
                    f" {old_false*100:.1f}% → {new_false*100:.1f}%"
                )
            old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None
            new_stable = (v.get("stable_replays", 0) / new_conf) if new_conf else None
            if (
                old_stable is not None
                and new_stable is not None
                and new_stable < old_stable - EPS
            ):
                diff_fails.append(
                    f" REGRESSION {k[0]}/{k[1]}: repro stability"
                    f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
                )
        if diff_fails:
            for line in diff_fails:
                print(line)
            gate_failed = True
        else:
            print(" No regressions vs previous run.")
    return 2 if gate_failed else 0
if __name__ == "__main__":
sys.exit(main())

300
tests/eval_corpus/run.sh Executable file
View file

@ -0,0 +1,300 @@
#!/usr/bin/env bash
# Eval corpus runner.
#
# Usage:
#   tests/eval_corpus/run.sh [--output DIR] [--nyx BIN] [--sets owasp,sard,inhouse]
#                            [--budget FILE] [--diff FILE]
#
# Bootstraps OWASP Benchmark v1.2, the NIST SARD subset, and Nyx benchmark
# fixtures. Runs `nyx scan --verify` on each. Emits
# per-cell (cap x language) precision/recall table and per-cap Unsupported
# rate to stdout (and --output DIR if given).
#
# Environment:
# NYX_EVAL_CORPUS_DIR - path to pre-downloaded corpus roots
# (default: ~/.cache/nyx/eval_corpus)
# NYX_BIN - path to nyx binary (default: ./target/release/nyx)
#
# Exit codes:
# 0 - all budget thresholds met
# 1 - setup or I/O error
# 2 - one or more budget thresholds exceeded (see output for details)
# 3 - budget or diff input missing/malformed (reported by report.py)
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Defaults
OUTPUT_DIR=""
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
SETS="owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse"
# Optional per-cell budgets and monotonic-improvement diff.
BUDGET_FILE=""
DIFF_FILE=""
# Parse command-line flags. Every recognised flag takes exactly one value.
# Unrecognised arguments are still skipped (as before) but are now reported on
# stderr, so a misspelled --budget/--diff no longer silently disables a gate.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --output|--nyx|--sets|--budget|--diff)
            # Fail with a readable message instead of bash's bare
            # "unbound variable" when the value is missing.
            if [[ $# -lt 2 ]]; then
                echo "error: $1 requires a value" >&2
                exit 1
            fi
            case "$1" in
                --output) OUTPUT_DIR="$2" ;;
                --nyx) NYX_BIN="$2" ;;
                --sets) SETS="$2" ;;
                --budget) BUDGET_FILE="$2" ;;
                --diff) DIFF_FILE="$2" ;;
            esac
            shift 2
            ;;
        *)
            echo "warning: ignoring unrecognized argument: $1" >&2
            shift
            ;;
    esac
done
# ── Helpers ───────────────────────────────────────────────────────────────────
# Print an error message to stderr and abort the script with exit code 1.
die() {
    echo "error: $*" >&2
    exit 1
}

# Print a progress message to stdout, tagged with the runner's prefix.
info() {
    echo "[eval] $*"
}

# Abort via die() unless the named command can be found on PATH.
require_cmd() {
    if ! command -v "$1" >/dev/null 2>&1; then
        die "required command not found: $1"
    fi
}
require_cmd jq
require_cmd python3
# Scan one ground-truth-labelled real corpus (NodeGoat / Juice Shop) and
# tabulate it against its committed ground truth. Self-skips when the
# corpus has not been cloned into the cache.
run_jsts_corpus() {
    # Arguments: $1 corpus label, $2 corpus checkout directory,
    # $3 ground-truth JSON handed to tabulate.py.
    # Always returns 0: a missing corpus, a failed scan, or a tabulate.py
    # failure is reported and skipped rather than aborting the whole run.
    local label="$1" dir="$2" gt="$3"
    if [[ ! -d "$dir" ]]; then
        info "Bootstrapping $label..."
        info " Clone the corpus into ${dir} then re-run this script:"
        if [[ "$label" == "nodegoat" ]]; then
            info " git clone --depth 1 https://github.com/OWASP/NodeGoat ${dir}"
        else
            info " git clone --depth 1 --branch v15.0.0 \\"
            info " https://github.com/juice-shop/juice-shop ${dir}"
        fi
        info "Skipping $label set (not yet downloaded)."
        return 0
    fi
    info "Running nyx scan on $label..."
    # errexit is suspended around the scan so its exit status can be inspected
    # instead of terminating the script.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
        > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
    local rc=$?
    set -e
    # Exit codes 0 and 1 are both accepted as a completed scan (presumably 1
    # signals "findings present" — confirm against the nyx CLI); anything else
    # is treated as a scan failure and the set is skipped.
    if [[ $rc -ne 0 && $rc -ne 1 ]]; then
        info " nyx exited $rc on $label set (stderr follows):"
        cat "/tmp/nyx_${label}.stderr" >&2
        return 0
    fi
    # The ${VAR:+...} expansions add --budget / --diff only when a file was given.
    python3 "${SCRIPT_DIR}/tabulate.py" \
        --label "$label" \
        --scan "/tmp/nyx_${label}.json" \
        --ground-truth "$gt" \
        --append "$RESULTS_JSON" \
        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
        || info " tabulate.py failed on $label; ground truth file may be absent"
}
# Scan one Track R.2 polyglot real corpus and tabulate it against its
# committed ground truth, SCOPED to its target language (tabulate --lang) so
# incidental other-language assets (e.g. vendored JS in a Rails / aiohttp app)
# do not pollute the corpus's per-cap metrics. Self-skips when the corpus has
# not been cloned into the cache; prints the exact clone command if so.
# $1 label $2 dir $3 ground-truth json $4 target lang $5 repo $6 ref
run_polyglot_corpus() {
    # Always returns 0: a missing corpus, a failed scan, or a tabulate.py
    # failure is reported and skipped rather than aborting the whole run.
    local label="$1" dir="$2" gt="$3" lang="$4" repo="$5" ref="$6"
    if [[ ! -d "$dir" ]]; then
        info "Bootstrapping $label..."
        info " git clone --depth 1 --branch ${ref} ${repo} ${dir}"
        info "Skipping $label set (not yet downloaded)."
        return 0
    fi
    info "Running nyx scan on $label (lang scope: ${lang})..."
    # errexit is suspended around the scan so its exit status can be inspected
    # instead of terminating the script.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
        > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
    local rc=$?
    set -e
    # Exit codes 0 and 1 are both accepted as a completed scan; anything else
    # is treated as a scan failure and the set is skipped.
    if [[ $rc -ne 0 && $rc -ne 1 ]]; then
        info " nyx exited $rc on $label set (stderr follows):"
        cat "/tmp/nyx_${label}.stderr" >&2
        return 0
    fi
    # The whole checkout is scanned; the language scoping is applied by
    # tabulate.py through --lang.
    python3 "${SCRIPT_DIR}/tabulate.py" \
        --label "$label" \
        --scan "/tmp/nyx_${label}.json" \
        --ground-truth "$gt" \
        --lang "$lang" \
        --append "$RESULTS_JSON" \
        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
        || info " tabulate.py failed on $label; ground truth file may be absent"
}
[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
mkdir -p "$CORPUS_CACHE"
[[ -n "$OUTPUT_DIR" ]] && mkdir -p "$OUTPUT_DIR"
# Timestamped results file; it lands in /tmp when no --output directory was
# given. It starts as an empty JSON list that each tabulate.py call is handed
# via --append.
RESULTS_JSON="${OUTPUT_DIR:-/tmp}/eval_results_$(date +%Y%m%d_%H%M%S).json"
echo "[]" > "$RESULTS_JSON"
# ── OWASP Benchmark v1.2 bootstrap ───────────────────────────────────────────
OWASP_DIR="${CORPUS_CACHE}/owasp_benchmark_v1.2"
# Set selection is a substring match on the comma-separated $SETS value.
if [[ "$SETS" == *owasp* ]]; then
    if [[ ! -d "$OWASP_DIR" ]]; then
        info "Bootstrapping OWASP Benchmark v1.2..."
        info " Clone from https://github.com/OWASP-Benchmark/BenchmarkJava"
        info " into ${OWASP_DIR}"
        info " then re-run this script."
        info " git clone --depth 1 --branch 1.2beta \\"
        info " https://github.com/OWASP-Benchmark/BenchmarkJava \\"
        info " ${OWASP_DIR}"
        info "Skipping OWASP set (not yet downloaded)."
    else
        info "Running nyx scan on OWASP Benchmark v1.2..."
        # errexit is suspended so the scan's exit status can be inspected.
        set +e
        "$NYX_BIN" scan --format json --verify --no-index "$OWASP_DIR" \
            > /tmp/nyx_owasp.json 2>/tmp/nyx_owasp.stderr
        NYX_EXIT=$?
        set -e
        # Exit codes 0 and 1 are accepted as a completed scan; anything else
        # skips tabulation for this set.
        if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
            info " nyx exited $NYX_EXIT on OWASP set (stderr follows):"
            cat /tmp/nyx_owasp.stderr >&2
        else
            python3 "${SCRIPT_DIR}/tabulate.py" \
                --label owasp \
                --scan /tmp/nyx_owasp.json \
                --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
                --append "$RESULTS_JSON" \
                ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
                ${DIFF_FILE:+--diff "$DIFF_FILE"} \
                || info " tabulate.py failed; ground truth file may be absent"
        fi
    fi
fi
# ── NodeGoat / Juice Shop (JS/TS) bootstrap — Track R.1 ───────────────────────
if [[ "$SETS" == *nodegoat* ]]; then
    run_jsts_corpus nodegoat "${CORPUS_CACHE}/nodegoat" \
        "${SCRIPT_DIR}/ground_truth/nodegoat.json"
fi
if [[ "$SETS" == *juiceshop* ]]; then
    run_jsts_corpus juiceshop "${CORPUS_CACHE}/juiceshop" \
        "${SCRIPT_DIR}/ground_truth/juiceshop.json"
fi
# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ───────────────
# Each call passes: label, checkout dir, ground truth, target language,
# upstream repository, and the ref printed in the clone hint.
if [[ "$SETS" == *railsgoat* ]]; then
    run_polyglot_corpus railsgoat "${CORPUS_CACHE}/railsgoat" \
        "${SCRIPT_DIR}/ground_truth/railsgoat.json" ruby \
        https://github.com/OWASP/railsgoat rails.5.0.0
fi
if [[ "$SETS" == *dvwa* ]]; then
    run_polyglot_corpus dvwa "${CORPUS_CACHE}/dvwa" \
        "${SCRIPT_DIR}/ground_truth/dvwa.json" php \
        https://github.com/digininja/DVWA 2.5
fi
if [[ "$SETS" == *dvpwa* ]]; then
    run_polyglot_corpus dvpwa "${CORPUS_CACHE}/dvpwa" \
        "${SCRIPT_DIR}/ground_truth/dvpwa.json" python \
        https://github.com/anxolerd/dvpwa master
fi
if [[ "$SETS" == *gosec* ]]; then
    run_polyglot_corpus gosec "${CORPUS_CACHE}/gosec" \
        "${SCRIPT_DIR}/ground_truth/gosec.json" go \
        https://github.com/securego/gosec v2.26.1
fi
# RustSec advisory-db is the Rust negative control (empty ground truth): the
# row asserts the Rust scan/verify path runs and Confirms nothing there.
if [[ "$SETS" == *rustsec* ]]; then
    run_polyglot_corpus rustsec "${CORPUS_CACHE}/rustsec" \
        "${SCRIPT_DIR}/ground_truth/rustsec.json" rust \
        https://github.com/rustsec/advisory-db main
fi
# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
SARD_DIR="${CORPUS_CACHE}/nist_sard"
if [[ "$SETS" == *sard* ]]; then
    if [[ ! -d "$SARD_DIR" ]]; then
        info "Bootstrapping NIST SARD subset..."
        info " Download from https://samate.nist.gov/SARD/"
        info " into ${SARD_DIR} then re-run this script."
        info "Skipping SARD set (not yet downloaded)."
    else
        info "Running nyx scan on NIST SARD subset..."
        # errexit is suspended so the scan's exit status can be inspected.
        set +e
        "$NYX_BIN" scan --format json --verify --no-index "$SARD_DIR" \
            > /tmp/nyx_sard.json 2>/tmp/nyx_sard.stderr
        NYX_EXIT=$?
        set -e
        # Exit codes 0 and 1 are accepted as a completed scan. On any other
        # status the captured stderr is shown, matching the other corpus paths,
        # so the failure can be diagnosed from the run log.
        if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
            info " nyx exited $NYX_EXIT on SARD set (stderr follows):"
            cat /tmp/nyx_sard.stderr >&2
        else
            python3 "${SCRIPT_DIR}/tabulate.py" \
                --label sard \
                --scan /tmp/nyx_sard.json \
                --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
                --append "$RESULTS_JSON" \
                ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
                ${DIFF_FILE:+--diff "$DIFF_FILE"} \
                || info " tabulate.py failed; ground truth file may be absent"
        fi
    fi
fi
# ── In-house bughunt-curated set ──────────────────────────────────────────────
if [[ "$SETS" == *inhouse* ]]; then
    INHOUSE_DIRS=(
        "${REPO_ROOT}/tests/benchmark/corpus"
        "${REPO_ROOT}/tests/dynamic_fixtures"
    )
    for dir in "${INHOUSE_DIRS[@]}"; do
        # Directories that do not exist in this checkout are skipped silently.
        [[ -d "$dir" ]] || continue
        label="inhouse_$(basename "$dir")"
        info "Running nyx scan on in-house set: $dir"
        set +e
        "$NYX_BIN" scan --format json --verify --no-index "$dir" \
            > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
        NYX_EXIT=$?
        set -e
        if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
            info " nyx exited $NYX_EXIT on $label (stderr follows):"
            cat "/tmp/nyx_${label}.stderr" >&2
            continue
        fi
        # In-house sets carry no ground-truth file; tabulate.py runs in
        # --inhouse mode instead.
        python3 "${SCRIPT_DIR}/tabulate.py" \
            --label "$label" \
            --scan "/tmp/nyx_${label}.json" \
            --inhouse \
            --append "$RESULTS_JSON" \
            ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
            ${DIFF_FILE:+--diff "$DIFF_FILE"} \
            || info " tabulate.py failed on $label"
    done
fi
# ── Emit summary table ────────────────────────────────────────────────────────
info ""
info "Results written to: $RESULTS_JSON"
# Also publish the results under a stable name when --output was given.
[[ -n "$OUTPUT_DIR" ]] && cp "$RESULTS_JSON" "${OUTPUT_DIR}/eval_results.json"
if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
    info "report.py not available; raw results at $RESULTS_JSON"
    exit 0
fi
# errexit is suspended so report.py's exit status can be mapped below.
set +e
python3 "${SCRIPT_DIR}/report.py" \
    --results "$RESULTS_JSON" \
    ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
    ${DIFF_FILE:+--diff "$DIFF_FILE"}
REPORT_RC=$?
set -e
# Propagate budget failures (exit 2) and malformed config (exit 3). Treat other
# non-zero exits as setup errors.
if [[ $REPORT_RC -eq 2 ]]; then
    exit 2
elif [[ $REPORT_RC -eq 3 ]]; then
    info "report.py: budget/diff configuration malformed; see $RESULTS_JSON"
    exit 3
elif [[ $REPORT_RC -ne 0 ]]; then
    info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
    exit 1
fi
exit 0

90
tests/eval_corpus/run_full.sh Executable file
View file

@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Full eval-corpus orchestrator.
#
# Drives a complete pass against every corpus set the project knows about
# (OWASP Benchmark v1.2, the NIST SARD subset, OWASP NodeGoat + Juice Shop,
# the Track R.2 polyglot corpora — RailsGoat / DVWA / DVPWA / gosec / RustSec —
# and the Nyx benchmark fixtures), then emits `tests/eval_corpus/results.json`
# for reports, diffs, and docs.
#
# Usage:
# tests/eval_corpus/run_full.sh [--nyx BIN] [--budget FILE] [--diff FILE]
# [--output DIR] [--corpus-dir DIR]
#
# Differences vs `run.sh`:
# * Always runs every set (no `--sets` selector).
# * Always passes `--budget tests/eval_corpus/budget.toml` so the
# configured per-cell limits are checked on every pass.
# * Copies the timestamped results file to
# `tests/eval_corpus/results.json`.
#
# Exit codes:
# 0 every downloaded set ran (missing corpora are skipped) and the merged result met the per-cell budget.
# 1 setup or I/O error.
# 2 budget exceeded OR monotonic-improvement regression.
# 3 budget/diff input malformed.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Defaults; the environment may pre-seed them and the flags below override.
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
BUDGET_FILE="${BUDGET_FILE:-${SCRIPT_DIR}/budget.toml}"
DIFF_FILE="${DIFF_FILE:-}"
OUTPUT_DIR=""
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
while [[ $# -gt 0 ]]; do
    case "$1" in
        --nyx) NYX_BIN="$2"; shift 2 ;;
        --budget) BUDGET_FILE="$2"; shift 2 ;;
        --diff) DIFF_FILE="$2"; shift 2 ;;
        --output) OUTPUT_DIR="$2"; shift 2 ;;
        --corpus-dir) CORPUS_CACHE="$2"; shift 2 ;;
        -h|--help)
            # The usage text is the header comment at the top of this file.
            sed -n '1,40p' "$0"
            exit 0
            ;;
        *)
            echo "unknown flag: $1" >&2
            exit 1
            ;;
    esac
done
# Print an error message to stderr and abort with exit code 1.
die() { echo "error: $*" >&2; exit 1; }
# Print a progress message tagged with this script's prefix.
info() { echo "[full] $*"; }
[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
[[ -f "$BUDGET_FILE" ]] || die "budget file not found: $BUDGET_FILE"
OUTPUT_DIR="${OUTPUT_DIR:-${SCRIPT_DIR}/.run-out}"
mkdir -p "$OUTPUT_DIR"
info "nyx: $NYX_BIN"
info "budget: $BUDGET_FILE"
info "diff: ${DIFF_FILE:-<none>}"
info "output: $OUTPUT_DIR"
RESULTS_SRC="${OUTPUT_DIR}/eval_results.json"
RESULTS_DST="${SCRIPT_DIR}/results.json"
# The output directory persists between passes. Remove the previous pass's
# eval_results.json first so that, if run.sh aborts before writing a new one,
# stale numbers are not republished as results.json below.
rm -f "$RESULTS_SRC"
# errexit is suspended so run.sh's exit code can be propagated after the copy.
set +e
NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \
    bash "${SCRIPT_DIR}/run.sh" \
    --nyx "$NYX_BIN" \
    --sets owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse \
    --output "$OUTPUT_DIR" \
    --budget "$BUDGET_FILE" \
    ${DIFF_FILE:+--diff "$DIFF_FILE"}
RC=$?
set -e
if [[ -f "$RESULTS_SRC" ]]; then
    cp "$RESULTS_SRC" "$RESULTS_DST"
    info "results: $RESULTS_DST"
else
    info "no eval_results.json produced; corpus may not be downloaded"
fi
exit "$RC"

View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Convert NIST SARD manifest XML into nyx ground-truth JSON.
SARD ships per-test-case `manifest.xml` files alongside source. Each
`<testcase>` lists one or more `<file path="">` entries with optional
`<flaw line="" name="CWE-XXX_…"/>` children.
Output schema (consumed by tabulate.py):
list of {"path", "line", "cap", "vuln"} records.
Usage:
tests/eval_corpus/sard_gt_convert.py \\
--corpus-dir ~/.cache/nyx/eval_corpus/nist_sard \\
--output tests/eval_corpus/ground_truth/nist_sard.json
"""
import argparse
import json
import re
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
CWE_TO_NYX_CAP = {
"20": "validation",
"22": "path_traversal",
"78": "cmdi",
"79": "xss",
"89": "sqli",
"90": "ldap_injection",
"91": "xpath_injection",
"94": "cmdi",
"113": "header_injection",
"117": "header_injection",
"190": "memory",
"200": "data_exfil",
"287": "auth",
"295": "crypto",
"311": "crypto",
"327": "crypto",
"328": "crypto",
"330": "crypto",
"352": "auth",
"434": "path_traversal",
"476": "memory",
"502": "deserialize",
"601": "redirect",
"611": "xxe",
"643": "xpath_injection",
"798": "crypto",
"918": "ssrf",
}
CWE_RE = re.compile(r"CWE[-_](\d+)", re.IGNORECASE)
def cap_for_flaw(name: str) -> str | None:
m = CWE_RE.search(name or "")
if not m:
return None
return CWE_TO_NYX_CAP.get(m.group(1))
def main() -> int:
    """Convert every ``manifest.xml`` under ``--corpus-dir`` into ground truth.

    For each ``<file>`` of each ``<testcase>`` whose source exists on disk:
    a file with no ``<flaw>``/``<mixed>`` children yields one non-vulnerable
    record (cap ``"other"``, line 0); otherwise each flaw with a mapped CWE
    yields one vulnerable record. Flaws with an unmapped CWE and files missing
    from disk are counted and skipped.

    Paths are written relative to the corpus root (POSIX separators) so the
    generated JSON does not depend on where the corpus was downloaded; a file
    that resolves outside the root keeps its absolute path.

    Returns 0 on success and 1 when the corpus directory does not exist.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--corpus-dir", required=True)
    p.add_argument("--output", required=True)
    args = p.parse_args()
    root = Path(args.corpus_dir).expanduser().resolve()
    if not root.is_dir():
        print(f"error: corpus dir not found: {root}", file=sys.stderr)
        return 1

    def portable(path: Path) -> str:
        """Return *path* relative to the corpus root when it lies inside it."""
        try:
            return path.relative_to(root).as_posix()
        except ValueError:
            return str(path)

    records: list[dict] = []
    skipped_files = 0
    skipped_caps = 0
    # Sorted so the generated file is identical across filesystems and runs.
    for manifest in sorted(root.rglob("manifest.xml")):
        try:
            tree = ET.parse(manifest)
        except ET.ParseError as e:
            print(f"warn: parse failed {manifest}: {e}", file=sys.stderr)
            continue
        for tc in tree.iter("testcase"):
            for fnode in tc.iter("file"):
                rel = fnode.get("path") or ""
                if not rel:
                    continue
                # Manifest paths are resolved against the manifest's directory.
                abs_path = (manifest.parent / rel).resolve()
                if not abs_path.exists():
                    skipped_files += 1
                    continue
                record_path = portable(abs_path)
                flaws = list(fnode.iter("flaw")) + list(fnode.iter("mixed"))
                if not flaws:
                    records.append({
                        "path": record_path,
                        "line": 0,
                        "cap": "other",
                        "vuln": False,
                    })
                    continue
                for flaw in flaws:
                    cap = cap_for_flaw(flaw.get("name", ""))
                    if cap is None:
                        skipped_caps += 1
                        continue
                    try:
                        line = int(flaw.get("line", "0") or 0)
                    except ValueError:
                        # A non-numeric line attribute degrades to "unknown".
                        line = 0
                    records.append({
                        "path": record_path,
                        "line": line,
                        "cap": cap,
                        "vuln": True,
                    })
    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    with open(out, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)
    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    print(f" vulns: {vuln_count}")
    print(f" non-vuln: {len(records) - vuln_count}")
    print(f" skipped (file): {skipped_files}")
    print(f" skipped (cap): {skipped_caps}")
    return 0
if __name__ == "__main__":
sys.exit(main())

View file

@ -0,0 +1,688 @@
#!/usr/bin/env python3
"""
Tabulate nyx scan results against a ground-truth file.
For OWASP / SARD sets: compares nyx findings against known-true/known-false
labels from the ground truth JSON.
For in-house sets (--inhouse): counts findings by cap x language; reports
Unsupported rate only (no ground truth required).
Output: appends a result record to --append FILE.
Phase 29 (Track I) extensions:
--budget tests/eval_corpus/budget.toml enforce per-cell budget thresholds
--diff previous.json compare against prior result file,
fail on monotonic-improvement
regression
Exit codes:
0 all rows pass.
2 one or more per-cell budgets exceeded OR a diff regression was found.
3 malformed budget / diff input (callers must fix configuration).
"""
import argparse
import json
import os
import sys
from collections import defaultdict
from pathlib import Path
try:
import tomllib # Python 3.11+
except ModuleNotFoundError: # pragma: no cover — older interpreters only
import tomli as tomllib # type: ignore[no-redef]
LINE_TOLERANCE = 5
# Caps with no sound runtime oracle (config / usage smells) and the catch-all
# `other` bucket route to Unsupported by design, so their Unsupported-rate is
# report-only, never gated. Mirrors report.py / the budget.toml intent.
NO_SOUND_ORACLE_CAPS = {"auth", "crypto", "xss", "trustbound", "other"}
def _soft_unsupported() -> bool:
    """Report whether the per-cell Unsupported-rate budget is report-only.

    Reads the ``NYX_EVAL_SOFT_UNSUPPORTED`` environment variable and returns
    True when its value, trimmed and lower-cased, is one of ``1``, ``true``,
    ``yes`` or ``on``. An unset or any other value keeps the budget hard.
    """
    truthy = ("1", "true", "yes", "on")
    raw = os.environ.get("NYX_EVAL_SOFT_UNSUPPORTED", "")
    return raw.strip().lower() in truthy
# Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
_CAP_BIT_TABLE = [
(1 << 5, "path_traversal"), # FILE_IO
(1 << 6, "fmt_string"),
(1 << 7, "sqli"), # SQL_QUERY
(1 << 8, "deserialize"),
(1 << 9, "ssrf"),
(1 << 10, "cmdi"), # CODE_EXEC
(1 << 11, "crypto"),
(1 << 12, "unauthorized_id"),
(1 << 13, "data_exfil"),
(1 << 14, "ldap_injection"),
(1 << 15, "xpath_injection"),
(1 << 16, "header_injection"),
(1 << 17, "redirect"), # OPEN_REDIRECT
(1 << 18, "xss"), # SSTI (template_injection); also covers XSS sinks
(1 << 19, "xxe"),
(1 << 20, "prototype_pollution"),
# HTML_ESCAPE (1<<1) is the universal reflected-XSS *sink* cap across every
# language (`grep 'Sink(Cap::HTML_ESCAPE)' src/labels/` — PHP echo, JS
# innerHTML, Java servlet writers, etc.); the same bit is the html-escape
# *sanitizer* cap, so a finding only carries it as a sink when an un-encoded
# tainted value reached an HTML output. Placed LAST so any higher-priority
# sink bit (SQL_QUERY, FILE_IO, ...) on the same finding wins; a finding
# carrying only HTML_ESCAPE is reflected XSS. Without this, every
# taint-based reflected-XSS finding mis-buckets to "other".
(1 << 1, "xss"),
]
# Static lens (see --static): SHELL_ESCAPE (1<<2) is the command-injection sink
# cap for *every* language (`grep SHELL_ESCAPE src/labels/` — all Sink uses are
# command-exec; CODE_EXEC=1<<10 is the eval/code-exec variant, also cmdi). In a
# normal `nyx scan` (no dynamic confirmation) a Java cmdi finding carries only
# SHELL_ESCAPE; the SHELL_ESCAPE→CODE_EXEC remap that buckets it as cmdi is gated
# on VerifyStatus::Confirmed (src/commands/scan.rs), so with 0 confirmations the
# default table leaves these in "other" and the cmdi cell reads 0/0/N. The
# static lens appends SHELL_ESCAPE→cmdi at the LOWEST priority (after every other
# bit) so a SHELL_ESCAPE-only finding buckets as cmdi while a finding that also
# carries a higher-priority sink bit (e.g. FILE_IO) keeps its existing bucket.
# Opt-in via --static so the default confirmed-recall bucketing is byte-identical.
_CAP_BIT_TABLE_STATIC = _CAP_BIT_TABLE + [(1 << 2, "cmdi")] # SHELL_ESCAPE
# Substring → cap lookup for rule IDs. Order matters: most specific first.
_CAP_RULE_TABLE = [
("path_traversal", "path_traversal"),
("sql", "sqli"),
("xss", "xss"),
("ssrf", "ssrf"),
("cmdi", "cmdi"),
("cmd_exec", "cmdi"),
("code_exec", "cmdi"),
("deser", "deserialize"),
("unserialize", "deserialize"),
("redirect", "redirect"),
("xxe", "xxe"),
("template", "xss"),
("auth", "auth"),
("memory", "memory"),
("crypto", "crypto"),
("data-exfil", "data_exfil"),
("data_exfil", "data_exfil"),
("header", "header_injection"),
]
def load_json(path: str) -> object:
    """Read *path* and return the parsed JSON document.

    The file is opened in binary mode so the ``json`` module picks the
    encoding (UTF-8/16/32) itself; decoding no longer depends on the locale of
    the machine running the tabulation. Errors from ``open`` and ``json.load``
    propagate to the caller.
    """
    with open(path, "rb") as f:
        return json.load(f)
def cap_of(finding: dict, static_lens: bool = False) -> str:
    """Classify a finding into a cap label.

    Resolution order:
      1. ``evidence.sink_caps`` when it is a non-zero int: the first entry of
         the bit table (the static variant when *static_lens* is set) whose
         bit is present in the mask wins.
      2. Otherwise, or when no table bit matched, the first rule-table needle
         contained in the lower-cased rule id (text before the first space).
      3. ``"other"`` when nothing matched.
    """
    evidence = finding.get("evidence", {}) or {}
    mask = evidence.get("sink_caps")
    if isinstance(mask, int) and mask:
        bit_table = _CAP_BIT_TABLE_STATIC if static_lens else _CAP_BIT_TABLE
        by_bit = next((label for bit, label in bit_table if mask & bit), None)
        if by_bit is not None:
            return by_bit
    # Fall back to the rule id (e.g. py.cmdi.os_system, java.deser.readobject).
    rule_head = (finding.get("id") or "").lower().split(" ", 1)[0]
    return next(
        (label for needle, label in _CAP_RULE_TABLE if needle in rule_head),
        "other",
    )
def lang_of(finding: dict) -> str:
    """Return the language label implied by the finding's file extension.

    The finding's ``path`` is tested against a fixed list of suffixes; the
    first suffix the path ends with decides the label. Paths matching none of
    them, and findings without a ``path``, map to ``"unknown"``.
    """
    suffix_langs = (
        (".py", "python"),
        (".js", "javascript"),
        (".ts", "typescript"),
        (".java", "java"),
        (".go", "go"),
        (".php", "php"),
        (".rb", "ruby"),
        (".rs", "rust"),
        (".c", "c"),
        (".cpp", "cpp"),
    )
    file_path = finding.get("path", "")
    return next(
        (lang for suffix, lang in suffix_langs if file_path.endswith(suffix)),
        "unknown",
    )
def _norm_path(p: str) -> str:
    """Return *p* with every backslash replaced by a forward slash."""
    return "/".join(p.split("\\"))


def path_matches(gt_path: str, finding_path: str) -> bool:
    """Decide whether a ground-truth path and a finding path name the same file.

    Both paths are compared with forward slashes. They match when they are
    equal, or when either one ends with ``"/"`` followed by the other — so a
    corpus-relative ground-truth path matches an absolute finding path (and
    the reverse). Requiring the ``/`` boundary keeps
    ``.../BenchmarkTest1.java`` from matching ``.../xBenchmarkTest1.java``.
    """
    gt = _norm_path(gt_path)
    found = _norm_path(finding_path)
    if gt == found:
        return True
    return found.endswith(f"/{gt}") or gt.endswith(f"/{found}")
# ── Budget loading ──────────────────────────────────────────────────────────
def load_budget(path: str) -> dict:
"""Parse a budget.toml file.
Returns a dict::
{
"default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02,
"repro_stability": 0.95, "ratchet_deadline": "..."},
"cells": {(cap, lang): {...overrides...}, ...},
}
Raises SystemExit(3) on a malformed file.
"""
try:
with open(path, "rb") as f:
raw = tomllib.load(f)
except FileNotFoundError:
print(f"ERROR budget file not found: {path}", file=sys.stderr)
sys.exit(3)
except tomllib.TOMLDecodeError as e:
print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr)
sys.exit(3)
default = raw.get("default", {}) or {}
cells = {}
for row in raw.get("cell", []) or []:
cap = row.get("cap")
lang = row.get("lang")
if not cap or not lang:
print(
f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr
)
sys.exit(3)
cells[(cap, lang)] = row
return {"default": default, "cells": cells}
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Return the effective budget row for one ``(cap, lang)`` cell.

    Starts from a copy of ``[default]`` and layers a single override row on
    top. The row is the first one present among: the exact ``(cap, lang)``
    entry, ``(cap, "*")``, ``("*", lang)``, ``("*", "*")``. The ``cap`` and
    ``lang`` keys of the override row are not copied into the result.
    """
    overrides = budget.get("cells", {})
    row = (
        overrides.get((cap, lang))
        or overrides.get((cap, "*"))
        or overrides.get(("*", lang))
        or overrides.get(("*", "*"))
    )
    effective = dict(budget.get("default", {}) or {})
    if row:
        effective.update(
            {key: value for key, value in row.items() if key not in ("cap", "lang")}
        )
    return effective
def enforce_budget(cells: list, budget: dict) -> list:
    """Check every tabulated cell against its merged budget row.

    Each threshold is applied only when it is numeric and its denominator is
    non-zero, so a cell without data is skipped rather than failed:

    * ``unsupported_rate``: maximum for the cell's ``unsupported_rate``
      (needs ``total > 0``);
    * ``confirmed_rate``: minimum for ``confirmed / total``;
    * ``false_confirmed_rate``: maximum for ``wrong_confirmed / confirmed``;
    * ``repro_stability``: minimum for ``stable_replays / confirmed``,
      checked only when at least one stable replay was recorded.

    Returns the failure lines in cell order; an empty list means every cell
    is within budget.
    """
    problems = []
    soft = _soft_unsupported()

    def is_number(value) -> bool:
        return isinstance(value, (int, float))

    for cell in cells:
        cap, lang = cell["cap"], cell["lang"]
        limits = budget_for_cell(budget, cap, lang)
        if not limits:
            continue

        max_unsup = limits.get("unsupported_rate")
        max_false = limits.get("false_confirmed_rate")
        min_stable = limits.get("repro_stability")
        min_confirmed = limits.get("confirmed_rate")

        if is_number(max_unsup) and cell.get("total", 0) > 0:
            observed = cell["unsupported_rate"]
            if observed > max_unsup:
                line = (
                    f" {cap}/{lang}: Unsupported {observed*100:.1f}%"
                    f" > budget {max_unsup*100:.1f}%"
                )
                # Caps in NO_SOUND_ORACLE_CAPS are report-only by design. The
                # rest become report-only when the soft-unsupported switch is
                # on (dynamic confirmation known to be environment-limited);
                # otherwise the breach is a hard failure.
                if cap not in NO_SOUND_ORACLE_CAPS and not soft:
                    problems.append(f" FAIL{line}")

        if is_number(min_confirmed) and cell.get("total", 0) > 0:
            share = cell.get("confirmed", 0) / cell["total"]
            if share < min_confirmed:
                problems.append(
                    f" FAIL {cap}/{lang}: Confirmed {share*100:.1f}%"
                    f" < budget {min_confirmed*100:.1f}%"
                )

        if is_number(max_false) and cell.get("confirmed", 0) > 0:
            share = cell.get("wrong_confirmed", 0) / cell["confirmed"]
            if share > max_false:
                problems.append(
                    f" FAIL {cap}/{lang}: false-Confirmed {share*100:.1f}%"
                    f" > budget {max_false*100:.1f}%"
                )

        # Stability is gated on at least one recorded stable replay: with
        # zero, "unstable" cannot be told apart from "never measured", and
        # the check would fire on every clean run.
        stability_measured = (
            is_number(min_stable)
            and cell.get("confirmed", 0) > 0
            and cell.get("stable_replays", 0) > 0
        )
        if stability_measured:
            share = cell["stable_replays"] / cell["confirmed"]
            if share < min_stable:
                problems.append(
                    f" FAIL {cap}/{lang}: repro stability {share*100:.1f}%"
                    f" < budget {min_stable*100:.1f}%"
                )
    return problems
# ── Diff loading ────────────────────────────────────────────────────────────
def load_previous_cells(path: str, label: str) -> dict:
    """Index a previous results file by ``(cap, lang)`` -> cell.

    The previous file has the same shape as ``--append``'s output: a list of
    run records (a single bare record is also accepted). The record whose
    ``label`` equals ``label`` is used; when none matches, the first record
    is used instead. An empty file list yields ``{}``.

    Exits with status 3, after printing an ``ERROR`` line to stderr, when
    the file is missing, cannot be read (directory, permissions, other OS
    error), or does not decode as UTF-8 JSON.
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"ERROR  diff file not found: {path}", file=sys.stderr)
        sys.exit(3)
    except OSError as e:
        # Covers IsADirectoryError / PermissionError, so an unreadable file
        # exits 3 like a missing one instead of surfacing a traceback.
        print(f"ERROR  diff file unreadable: {path}: {e}", file=sys.stderr)
        sys.exit(3)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"ERROR  diff file malformed: {path}: {e}", file=sys.stderr)
        sys.exit(3)

    records = data if isinstance(data, list) else [data]
    chosen = None
    for r in records:
        if r.get("label") == label:
            chosen = r
            break
    if chosen is None and records:
        # No record carries the requested label: fall back to the first one.
        chosen = records[0]
    if not chosen:
        return {}
    return {(c["cap"], c["lang"]): c for c in chosen.get("cells", [])}
def diff_regressions(cells: list, prev: dict) -> list:
    """Report cells whose metrics got worse compared with a previous run.

    Three monotonicity rules are checked per cell:

    * the Unsupported rate must not rise;
    * the false-Confirmed rate (``wrong_confirmed / confirmed``) must not rise;
    * repro stability (``stable_replays / confirmed``) must not fall.

    The two Confirmed-based rates are compared only when both runs have a
    non-zero ``confirmed`` count. Cells missing from ``prev`` are new and are
    skipped. A tolerance of 0.5 percentage points absorbs flake noise.
    Returns the regression lines in cell order.
    """
    EPS = 0.005

    def per_confirmed(cell: dict, numerator: str):
        # None means "no Confirmed findings, rate undefined".
        denom = cell.get("confirmed", 0)
        return (cell.get(numerator, 0) / denom) if denom else None

    regressions = []
    for current in cells:
        cap, lang = current["cap"], current["lang"]
        before = prev.get((cap, lang))
        if not before:
            continue

        # Unsupported (lower is better).
        was_unsup = before.get("unsupported_rate", 0.0)
        now_unsup = current.get("unsupported_rate", 0.0)
        if now_unsup > was_unsup + EPS:
            regressions.append(
                f" REGRESSION {cap}/{lang}: Unsupported"
                f" {was_unsup*100:.1f}% → {now_unsup*100:.1f}%"
            )

        # False-Confirmed (lower is better).
        was_false = per_confirmed(before, "wrong_confirmed")
        now_false = per_confirmed(current, "wrong_confirmed")
        if None not in (was_false, now_false) and now_false > was_false + EPS:
            regressions.append(
                f" REGRESSION {cap}/{lang}: false-Confirmed"
                f" {was_false*100:.1f}% → {now_false*100:.1f}%"
            )

        # Repro stability (higher is better).
        was_stable = per_confirmed(before, "stable_replays")
        now_stable = per_confirmed(current, "stable_replays")
        if None not in (was_stable, now_stable) and now_stable < was_stable - EPS:
            regressions.append(
                f" REGRESSION {cap}/{lang}: repro stability"
                f" {was_stable*100:.1f}% → {now_stable*100:.1f}%"
            )
    return regressions
def main() -> int:
"""Tabulate one nyx scan into per-(cap, lang) cells and gate the result.

Steps, in order: parse the CLI; load the scan and drop Reliability/Quality
findings (and, with --lang, findings outside the language allowlist);
optionally stamp `wrong: true` on Confirmed findings matched by a
--manual-triage `vuln: false` entry; tally verdict counts per cell; when a
ground-truth file is supplied (and --inhouse is not set) score TP/FP/FN and
Confirmed-TP/FP against it; append the run record to the --append file;
print a summary table; finally apply the optional --budget and --diff gates.

Returns 0 when no gate failed and 2 when the budget or diff gate reported at
least one failure. `load_budget` / `load_previous_cells` may exit with
status 3 on unreadable input.
"""
p = argparse.ArgumentParser()
p.add_argument("--label", required=True)
p.add_argument("--scan", required=True, help="nyx scan --format json output")
p.add_argument("--ground-truth", default="", help="ground truth JSON")
p.add_argument("--inhouse", action="store_true")
p.add_argument("--append", required=True, help="results accumulator JSON")
p.add_argument(
"--manual-triage",
default="",
help=(
"path to a manual-triage JSON file (list of "
"{path, line, cap, vuln: bool}). Confirmed findings matching a "
"`vuln: false` entry are stamped with `wrong: true` before "
"tabulation so the per-cell False-Confirmed budget becomes "
"non-vacuous without depending on the host's `nyx verify-feedback` "
"log. Matching uses LINE_TOLERANCE (=5) — line == 0 in the triage "
"entry matches any line."
),
)
p.add_argument(
"--budget",
default="",
help="path to budget.toml (per-(cap,lang) thresholds)",
)
p.add_argument(
"--lang",
default="",
help=(
"comma-separated language allowlist (python, javascript, php, "
"ruby, go, rust, ...). When set, only findings AND ground-truth "
"entries whose source language is in the list are tabulated; "
"everything else is dropped before tallying. Used by the Phase 29 "
"polyglot corpora (Track R.2) to scope a single-language corpus to "
"its target language so incidental third-party assets in other "
"languages — e.g. the vendored JavaScript a Rails or aiohttp app "
"bundles — do not pollute that corpus's per-cap metrics. Empty = "
"no language filter (every finding tabulated, the OWASP/JSTS "
"default)."
),
)
p.add_argument(
"--diff",
default="",
help="path to a previous results JSON; fail on monotonic-improvement regression",
)
p.add_argument(
"--static",
action="store_true",
help=(
"static lens: bucket SHELL_ESCAPE (1<<2) findings as cmdi even when "
"they are unconfirmed. Java (and other) command-exec sinks carry "
"SHELL_ESCAPE and only get remapped to CODE_EXEC on dynamic Confirm; "
"without this flag, an env with 0 confirmations reads the cmdi cell "
"as 0/0/N regardless of static quality. SHELL_ESCAPE is the "
"command-injection sink cap for every language, so this is sound "
"globally; it is opt-in only so the default confirmed-recall "
"bucketing stays byte-identical."
),
)
args = p.parse_args()
# Empty set == no language filter; blank items in the comma list are ignored.
lang_filter = {l.strip() for l in args.lang.split(",") if l.strip()}
scan_data = load_json(args.scan)
# The scan is accepted either as a bare list of findings or as an object
# carrying them under "findings".
findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
# Score only Security-category findings against the security ground truth.
# Reliability defects (resource leaks, error-handling fallthrough) and
# Quality findings are real bugs but not the injection / crypto / auth
# vulns the corpus ground truth enumerates, so counting them as security
# false-positives is a category error that wrecks precision with pure
# noise. Findings with no explicit category (legacy fixtures) default to
# Security and are kept.
findings = [
f for f in findings
if f.get("category", "Security") not in ("Reliability", "Quality")
]
if lang_filter:
findings = [f for f in findings if lang_of(f) in lang_filter]
# ── Manual-triage stamping (Phase 31 follow-up) ───────────────────────
# Cross-reference Confirmed rows against a manual-triage file before
# tabulation. Each `vuln: false` entry whose `(path, cap)` matches a
# Confirmed finding (with LINE_TOLERANCE, or any line when triage
# entry's `line == 0`) stamps `wrong: true` on the finding's
# `dynamic_verdict`, which the existing wrong_confirmed counter picks
# up below. Decouples the False-Confirmed budget from the host-local
# `nyx verify-feedback` log so CI on a fresh eval corpus can still
# gate the headline target.
# A --manual-triage path that does not exist is skipped without an error.
if args.manual_triage and Path(args.manual_triage).exists():
triage = load_json(args.manual_triage)
not_vuln: list[dict] = []
for entry in triage if isinstance(triage, list) else []:
if entry.get("vuln") is False:
not_vuln.append({
"path": entry.get("path", ""),
"line": entry.get("line", 0),
"cap": entry.get("cap", ""),
})
# Indices of triage entries already consumed: each entry stamps at most
# one finding.
used: set[int] = set()
for f in findings:
ev = f.get("evidence") or {}
dv = ev.get("dynamic_verdict") or {}
if dv.get("status") != "Confirmed":
continue
f_path = f.get("path", "")
f_line = f.get("line", 0)
f_cap = cap_of(f, static_lens=args.static)
for idx, entry in enumerate(not_vuln):
if idx in used:
continue
if (path_matches(entry["path"], f_path)
and entry["cap"] == f_cap
and (entry["line"] == 0
or abs(entry["line"] - f_line) <= LINE_TOLERANCE)):
used.add(idx)
dv["wrong"] = True
ev["dynamic_verdict"] = dv
f["evidence"] = ev
break
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
# partially_confirmed, wrong_confirmed, stable_replays, total}}
cells: dict[tuple[str, str], dict] = defaultdict(
lambda: {
"tp": 0,
"fp": 0,
"fn": 0,
"unsupported": 0,
"confirmed": 0,
"partially_confirmed": 0,
"wrong_confirmed": 0,
"stable_replays": 0,
# Confirmed-verdict precision/recall accounting, ground-truth-derived
# (only populated when --ground-truth is supplied): confirmed_tp =
# Confirmed findings that match a GT positive; confirmed_fp =
# Confirmed findings that match no GT positive (false confirms).
"confirmed_tp": 0,
"confirmed_fp": 0,
"total": 0,
}
)
for f in findings:
cap = cap_of(f, static_lens=args.static)
lang = lang_of(f)
key = (cap, lang)
ev = f.get("evidence", {}) or {}
dv = ev.get("dynamic_verdict") if ev else None
cells[key]["total"] += 1
if dv:
status = dv.get("status")
if status == "Unsupported":
cells[key]["unsupported"] += 1
elif status == "PartiallyConfirmed":
cells[key]["partially_confirmed"] += 1
elif status == "Confirmed":
cells[key]["confirmed"] += 1
# Repro-stability and false-Confirmed counts are optional
# fields tabulate.py reads off the verdict when callers have
# stamped them.
if dv.get("wrong") is True:
cells[key]["wrong_confirmed"] += 1
if dv.get("replay_stable") is True:
cells[key]["stable_replays"] += 1
# Ground-truth scoring is skipped for --inhouse runs, when no --ground-truth
# path was given, or when that path does not exist.
if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
gt = load_json(args.ground_truth)
# Ground truth format: list of {"path": ..., "line": ..., "cap": ..., "vuln": bool}
gt_true: list[dict] = []
for entry in gt if isinstance(gt, list) else []:
# Honour the same language scope as the findings filter so recall
# is measured only over the corpus's target language.
if lang_filter and lang_of(entry) not in lang_filter:
continue
if entry.get("vuln"):
gt_true.append({
"path": entry.get("path", ""),
"line": entry.get("line", 0),
"cap": entry.get("cap", ""),
})
# Track which GT entries were matched (by index) to avoid double-counting.
matched_gt: set[int] = set()
# Track (path, cap) pairs that had at least one finding match.
# NOTE(review): found_path_caps is only ever added to within this function
# and never read — candidate for removal.
found_path_caps: set[tuple[str, str]] = set()
for f in findings:
f_path = f.get("path", "")
f_line = f.get("line", 0)
f_cap = cap_of(f, static_lens=args.static)
cap = f_cap
lang = lang_of(f)
cell_key = (cap, lang)
dv = (f.get("evidence") or {}).get("dynamic_verdict") or {}
is_confirmed = dv.get("status") == "Confirmed"
# Greedy match: the first not-yet-matched GT positive with the same file
# and cap, and a line within LINE_TOLERANCE (GT line 0 matches any line).
matched_idx = None
for idx, gt_entry in enumerate(gt_true):
if (path_matches(gt_entry["path"], f_path)
and gt_entry["cap"] == f_cap
and idx not in matched_gt
and (gt_entry["line"] == 0
or abs(gt_entry["line"] - f_line) <= LINE_TOLERANCE)):
matched_idx = idx
break
if matched_idx is not None:
matched_gt.add(matched_idx)
found_path_caps.add((f_path, f_cap))
cells[cell_key]["tp"] += 1
if is_confirmed:
cells[cell_key]["confirmed_tp"] += 1
else:
cells[cell_key]["fp"] += 1
if is_confirmed:
cells[cell_key]["confirmed_fp"] += 1
for idx, gt_entry in enumerate(gt_true):
if idx not in matched_gt:
cap = gt_entry["cap"]
# Land the FN in the cell its source language implies (from the
# GT path extension) so per-(cap,lang) recall is meaningful and
# OWASP misses show up in the java cell, not a stray "unknown".
cells[(cap, lang_of(gt_entry))]["fn"] += 1
# Ground-truth-derived false-confirm accounting. When a corpus ships a
# vuln/benign label per file (OWASP, SARD), a Confirmed finding that
# matches no GT positive is a false confirm — authoritative, so it
# overrides any manual-triage stamping for these labelled sets. This is
# what makes the per-cell `false_confirmed_rate` budget non-vacuous on a
# fresh eval corpus without a host-local verify-feedback log.
for v in cells.values():
if v["confirmed_tp"] or v["confirmed_fp"]:
v["wrong_confirmed"] = v["confirmed_fp"]
# Run record: raw tallies plus derived rates. `max(..., 1)` keeps the
# divisions defined for empty cells (the rate is then 0.0).
result = {
"label": args.label,
"total_findings": len(findings),
"cells": [
{
"cap": k[0],
"lang": k[1],
**v,
"precision": v["tp"] / max(v["tp"] + v["fp"], 1),
"recall": v["tp"] / max(v["tp"] + v["fn"], 1),
"unsupported_rate": v["unsupported"] / max(v["total"], 1),
}
for k, v in sorted(cells.items())
],
}
# Append this run to the accumulator; a missing accumulator file starts a
# new list.
existing = load_json(args.append) if Path(args.append).exists() else []
existing.append(result)
with open(args.append, "w") as f:
json.dump(existing, f, indent=2)
# Print summary
print(f"\n=== {args.label} ===")
print(f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>6} {'Rec':>6} {'Unsup%':>7}")
print("-" * 72)
for c in result["cells"]:
print(
f"{c['cap']:<20} {c['lang']:<12} "
f"{c['tp']:>5} {c['fp']:>5} {c['fn']:>5} "
f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
f"{c['unsupported_rate']*100:>6.1f}%"
)
# Stays 0 unless one of the gates below reports failures (then 2).
exit_rc = 0
# ── Phase 29: per-cell budget enforcement ─────────────────────────────
if args.budget:
budget = load_budget(args.budget)
failures = enforce_budget(result["cells"], budget)
if failures:
print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
for line in failures:
print(line)
exit_rc = 2
else:
print(f"\nPer-cell budget ({args.budget}): OK")
# ── Phase 29: diff against previous run ───────────────────────────────
if args.diff:
prev = load_previous_cells(args.diff, args.label)
failures = diff_regressions(result["cells"], prev)
if failures:
print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
for line in failures:
print(line)
exit_rc = 2
else:
print(f"\nDiff vs {args.diff}: no regressions")
return exit_rc
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
sys.exit(main())

View file

@ -0,0 +1,251 @@
#!/usr/bin/env python3
"""
Phase 28 (Track R.1) regression test for tests/eval_corpus/manifest_gt_convert.py.
Proves the manifest -> ground-truth converter is non-vacuous:
* a well-formed manifest converts to the expected sorted JSON,
* --corpus-dir validation passes when every labelled path exists and
produces byte-identical output to the no-corpus transform (so the CI
in-sync guard, which diffs committed vs a validated regen, is sound),
* --corpus-dir validation HARD-ERRORS (exit 2) on a missing path,
* an unknown cap / duplicate (path,cap) / malformed TOML are rejected,
* the committed nodegoat.json / juiceshop.json are exactly what a fresh
conversion of their manifests produces (offline half of the CI guard).
Run with::
python3 tests/eval_corpus/test_manifest_gt_convert.py
Exits 0 when every assertion holds, non-zero otherwise.
"""
from __future__ import annotations
import json
import subprocess
import sys
import tempfile
from pathlib import Path
# Repository root: the third ancestor (parents[2]) of this file's resolved path.
REPO = Path(__file__).resolve().parents[2]
# Converter script under test; run as a subprocess by run_convert().
CONVERT = REPO / "tests/eval_corpus/manifest_gt_convert.py"
# Directory holding the committed *.manifest.toml / *.json ground-truth pairs.
GT_DIR = REPO / "tests/eval_corpus/ground_truth"
# Well-formed three-entry manifest shared by the positive tests. The entries
# are deliberately listed out of path order (so sorting is observable) and
# each carries a `note` field, which the tests expect the converter to drop.
GOOD_MANIFEST = """\
corpus = "demo"
upstream = "https://example.test/demo"
pinned_ref = "v1"
[[entry]]
path = "routes/login.ts"
cap = "sqli"
vuln = true
note = "raw SQL string-concat in login"
[[entry]]
path = "app/routes/contributions.js"
cap = "cmdi"
vuln = true
note = "eval of user input"
[[entry]]
path = "lib/insecurity.ts"
cap = "crypto"
vuln = false
note = "benign control example"
"""
def run_convert(*args: str) -> subprocess.CompletedProcess:
    """Run the converter script with ``args`` under the current interpreter.

    stdout and stderr are captured as text; the exit status is not checked
    here, so callers assert on ``returncode`` themselves.
    """
    argv = [sys.executable, str(CONVERT)]
    argv.extend(args)
    return subprocess.run(argv, capture_output=True, text=True)
def test_transform_is_sorted_and_schema_clean(tmp: Path) -> None:
    """A good manifest converts to path-sorted records with exactly the GT fields."""
    manifest = tmp / "demo.manifest.toml"
    manifest.write_text(GOOD_MANIFEST)
    target = tmp / "demo.json"

    proc = run_convert("--manifest", str(manifest), "--output", str(target))
    assert proc.returncode == 0, proc.stdout + proc.stderr

    records = json.loads(target.read_text())
    # Output order differs from manifest order: records come back sorted.
    expected_paths = [
        "app/routes/contributions.js",
        "lib/insecurity.ts",
        "routes/login.ts",
    ]
    assert [rec["path"] for rec in records] == expected_paths, records

    # Only the four ground-truth fields survive (`note` is dropped), and a
    # manifest entry without a line converts to line 0.
    for rec in records:
        assert set(rec) == {"path", "line", "cap", "vuln"}, rec
        assert rec["line"] == 0, rec

    first, second = records[0], records[1]
    assert first["cap"] == "cmdi" and first["vuln"] is True
    assert second["cap"] == "crypto" and second["vuln"] is False
def test_corpus_validation_passes_and_matches_no_corpus(tmp: Path) -> None:
    """With every labelled file present, --corpus-dir passes and changes nothing."""
    manifest = tmp / "demo.manifest.toml"
    manifest.write_text(GOOD_MANIFEST)

    # Materialise a corpus tree holding a stub for each labelled path.
    corpus_root = tmp / "corpus"
    labelled = ("routes/login.ts", "app/routes/contributions.js", "lib/insecurity.ts")
    for rel in labelled:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")

    plain_out = tmp / "no_corpus.json"
    validated_out = tmp / "with_corpus.json"

    plain = run_convert("--manifest", str(manifest), "--output", str(plain_out))
    assert plain.returncode == 0

    proc = run_convert(
        "--manifest", str(manifest),
        "--corpus-dir", str(corpus_root),
        "--output", str(validated_out),
    )
    assert proc.returncode == 0, proc.stdout + proc.stderr

    # The validated output must equal the plain one; otherwise the CI guard
    # (diff committed vs validated regen) would compare unlike things.
    assert plain_out.read_text() == validated_out.read_text()
    assert "validated against" in proc.stdout, proc.stdout
def test_missing_path_exits_2(tmp: Path) -> None:
    """A labelled path absent from --corpus-dir makes the converter exit 2."""
    manifest = tmp / "demo.manifest.toml"
    manifest.write_text(GOOD_MANIFEST)

    # Create two of the three labelled files; lib/insecurity.ts stays absent.
    corpus_root = tmp / "corpus"
    present = ("routes/login.ts", "app/routes/contributions.js")
    for rel in present:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")

    target = tmp / "demo.json"
    proc = run_convert(
        "--manifest", str(manifest),
        "--corpus-dir", str(corpus_root),
        "--output", str(target),
    )
    assert proc.returncode == 2, proc.stdout + proc.stderr
    assert "lib/insecurity.ts" in proc.stderr and "missing" in proc.stderr, proc.stderr
def test_unknown_cap_rejected(tmp: Path) -> None:
    """An entry whose cap is not a known nyx cap is rejected with exit 1."""
    manifest = tmp / "bad_cap.manifest.toml"
    body = '[[entry]]\npath = "a.js"\ncap = "not_a_cap"\nvuln = true\n'
    manifest.write_text(body)
    target = tmp / "out.json"

    proc = run_convert("--manifest", str(manifest), "--output", str(target))
    assert proc.returncode == 1, proc.stdout + proc.stderr
    assert "not a known nyx cap" in proc.stderr, proc.stderr
def test_duplicate_path_cap_rejected(tmp: Path) -> None:
    """Two entries sharing the same (path, cap) are rejected with exit 1."""
    manifest = tmp / "dup.manifest.toml"
    body = (
        '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
        '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
    )
    manifest.write_text(body)
    target = tmp / "out.json"

    proc = run_convert("--manifest", str(manifest), "--output", str(target))
    assert proc.returncode == 1, proc.stdout + proc.stderr
    assert "duplicate" in proc.stderr, proc.stderr
def test_malformed_manifest_exits_1(tmp: Path) -> None:
    """Syntactically invalid TOML is reported as malformed with exit 1."""
    manifest = tmp / "broken.toml"
    # Unclosed table header and a key with no value: invalid TOML.
    manifest.write_text("[[entry]\npath = \n")
    target = tmp / "out.json"

    proc = run_convert("--manifest", str(manifest), "--output", str(target))
    assert proc.returncode == 1, proc.stdout + proc.stderr
    assert "malformed" in proc.stderr, proc.stderr
def test_empty_manifest_exits_1(tmp: Path) -> None:
    """A manifest with metadata but no [[entry]] tables is rejected with exit 1."""
    manifest = tmp / "empty.toml"
    manifest.write_text('corpus = "x"\n')
    target = tmp / "out.json"

    proc = run_convert("--manifest", str(manifest), "--output", str(target))
    assert proc.returncode == 1, proc.stdout + proc.stderr
    assert "no [[entry]]" in proc.stderr, proc.stderr
def test_committed_gt_matches_manifest(tmp: Path) -> None:
    """Each committed ground-truth JSON equals a fresh conversion of its manifest.

    Offline half of the CI in-sync guard: it catches a manifest edit that was
    not followed by a regenerate. The comparison is on parsed JSON values.
    """
    corpora = (
        "nodegoat",
        "juiceshop",
        # Track R.2 polyglot corpora (Phase 29).
        "railsgoat",
        "dvwa",
        "dvpwa",
        "gosec",
        "rustsec",
    )
    for name in corpora:
        manifest = GT_DIR / f"{name}.manifest.toml"
        committed = GT_DIR / f"{name}.json"
        assert manifest.exists(), f"missing manifest: {manifest}"
        assert committed.exists(), f"missing committed GT: {committed}"

        fresh = tmp / f"{name}.json"
        proc = run_convert("--manifest", str(manifest), "--output", str(fresh))
        assert proc.returncode == 0, proc.stdout + proc.stderr

        regenerated = json.loads(fresh.read_text())
        on_disk = json.loads(committed.read_text())
        assert regenerated == on_disk, (
            f"{committed} is stale — regenerate with manifest_gt_convert.py"
        )
def test_negative_control_emits_empty(tmp: Path) -> None:
    """A negative-control manifest converts to an empty ground-truth list.

    Such a manifest (e.g. RustSec advisory-db, which has no scannable source
    vulns) sets ``negative_control = true`` and declares no [[entry]] tables.
    """
    manifest = tmp / "neg.manifest.toml"
    body = (
        'corpus = "rustsec"\n'
        'upstream = "https://example.test/advisory-db"\n'
        'pinned_ref = "main"\n'
        "negative_control = true\n"
    )
    manifest.write_text(body)
    target = tmp / "neg.json"

    proc = run_convert("--manifest", str(manifest), "--output", str(target))
    assert proc.returncode == 0, proc.stdout + proc.stderr
    assert json.loads(target.read_text()) == [], target.read_text()
    assert "negative-control corpus" in proc.stdout, proc.stdout
def test_negative_control_with_entries_rejected(tmp: Path) -> None:
    """``negative_control = true`` together with an [[entry]] is rejected.

    The flag and entries are mutually exclusive, so a real positive can never
    be hidden behind the flag.
    """
    manifest = tmp / "neg_bad.manifest.toml"
    body = (
        "negative_control = true\n"
        '[[entry]]\npath = "a.rs"\ncap = "cmdi"\nvuln = true\n'
    )
    manifest.write_text(body)
    target = tmp / "neg_bad.json"

    proc = run_convert("--manifest", str(manifest), "--output", str(target))
    assert proc.returncode == 1, proc.stdout + proc.stderr
    assert "negative_control" in proc.stderr and "zero" in proc.stderr, proc.stderr
def main() -> int:
    """Run every regression check in order, each in its own scratch directory.

    The first failing assertion propagates and aborts the run; 0 is returned
    only when every check passed.
    """
    checks = (
        test_transform_is_sorted_and_schema_clean,
        test_corpus_validation_passes_and_matches_no_corpus,
        test_missing_path_exits_2,
        test_unknown_cap_rejected,
        test_duplicate_path_cap_rejected,
        test_malformed_manifest_exits_1,
        test_empty_manifest_exits_1,
        test_committed_gt_matches_manifest,
        test_negative_control_emits_empty,
        test_negative_control_with_entries_rejected,
    )
    with tempfile.TemporaryDirectory() as scratch:
        root = Path(scratch)
        for check in checks:
            # A subdirectory named after the check keeps fixtures from colliding.
            workdir = root / check.__name__
            workdir.mkdir()
            print(f"... {check.__name__}")
            check(workdir)
            print(" OK")
    print("\nAll manifest_gt_convert.py regression checks passed.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,771 @@
#!/usr/bin/env python3
"""
Phase 29 (Track I) regression test for tests/eval_corpus/tabulate.py.
Exercises --budget and --diff against hand-crafted scan + ground-truth
fixtures so the per-cell budget gate and monotonic-improvement diff are
demonstrably non-vacuous.
Run with::
python3 tests/eval_corpus/test_tabulate_regression.py
Exits 0 when every assertion holds, non-zero otherwise. The asserts are
plain `assert` statements so the file works both as a stand-alone script
and under unittest discovery.
"""
from __future__ import annotations
import json
import subprocess
import sys
import tempfile
from pathlib import Path
# Repository root: the third ancestor (parents[2]) of this file's resolved path.
REPO = Path(__file__).resolve().parents[2]
# Script under test; run as a subprocess by run_tabulate().
TABULATE = REPO / "tests/eval_corpus/tabulate.py"
# Companion report script; run as a subprocess by run_report().
REPORT = REPO / "tests/eval_corpus/report.py"
# Checked-in budget file handed to `--budget` by the budget tests.
BUDGET = REPO / "tests/eval_corpus/budget.toml"
def run_tabulate(*args: str) -> subprocess.CompletedProcess:
    """Run tabulate.py with ``args`` under the current interpreter.

    stdout and stderr are captured as text; the exit status is left for the
    caller to assert on.
    """
    return subprocess.run(
        [sys.executable, str(TABULATE), *args],
        capture_output=True,
        text=True,
    )
def run_report(*args: str) -> subprocess.CompletedProcess:
    """Run report.py with ``args`` under the current interpreter.

    stdout and stderr are captured as text; the exit status is left for the
    caller to assert on.
    """
    return subprocess.run(
        [sys.executable, str(REPORT), *args],
        capture_output=True,
        text=True,
    )
def write_json(path: Path, data: object) -> None:
    """Serialise ``data`` as 2-space-indented JSON and write it to ``path``."""
    serialized = json.dumps(data, indent=2)
    path.write_text(serialized)
# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs.
# Each constant is a single-bit mask; the tests pass one as `cap_bit` to
# python_finding(), which stores it in the finding's `evidence.sink_caps`.
SINK_BIT_SQL = 1 << 7 # SQL_QUERY
SINK_BIT_CMDI = 1 << 10 # CODE_EXEC
SINK_BIT_SHELL = 1 << 2 # SHELL_ESCAPE (Java/other command-exec sink)
SINK_BIT_FILE = 1 << 5 # FILE_IO (path_traversal)
def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
finding = {
"path": path,
"line": line,
"col": 0,
"id": "py.sqli.cursor_execute",
"evidence": {"sink_caps": cap_bit},
}
if status:
finding["evidence"]["dynamic_verdict"] = {"status": status}
return finding
def test_budget_passes_on_clean_scan(tmp: Path) -> None:
    """A scan with no Unsupported verdicts must clear the per-cell budget gate."""
    verdict_by_line = {10: "Confirmed", 20: "Confirmed", 30: "NotConfirmed"}
    scan_path = tmp / "scan_clean.json"
    write_json(
        scan_path,
        {
            "findings": [
                python_finding(SINK_BIT_SQL, "app.py", line, verdict)
                for line, verdict in verdict_by_line.items()
            ]
        },
    )
    results_path = tmp / "results_clean.json"
    write_json(results_path, [])

    proc = run_tabulate(
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(BUDGET),
    )
    assert proc.returncode == 0, f"clean scan must pass budget, got rc={proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
    assert "Per-cell budget" in proc.stdout and "OK" in proc.stdout, proc.stdout
def test_budget_fails_when_unsupported_exceeds(tmp: Path) -> None:
    """A sqli/python cell that is 100% Unsupported must trip the budget gate.

    Every finding is Unsupported, which is expected to exceed the
    sqli/python Unsupported budget configured in budget.toml.
    """
    scan_path = tmp / "scan_unsup.json"
    all_unsupported = [
        python_finding(SINK_BIT_SQL, "app.py", line, "Unsupported")
        for line in (10, 20, 30, 40, 50)
    ]
    write_json(scan_path, {"findings": all_unsupported})
    results_path = tmp / "results_unsup.json"
    write_json(results_path, [])

    proc = run_tabulate(
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(BUDGET),
    )
    assert proc.returncode == 2, (
        f"budget breach must exit 2, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "FAIL" in proc.stdout and "sqli/python" in proc.stdout, proc.stdout
def test_diff_fails_on_regression(tmp: Path) -> None:
    """A jump from 25% to 75% Unsupported must be flagged by --diff.

    The previous run has 1 of 4 findings Unsupported, the current run 3 of 4.
    No --budget is passed, so only the monotonic-improvement diff can catch
    the +50pp regression.
    """
    def findings_for(statuses):
        return [
            python_finding(SINK_BIT_CMDI, "x.unknown", line, status)
            for line, status in enumerate(statuses, start=1)
        ]

    # Seed the "previous" results file with the better run.
    prev_scan = tmp / "prev_scan.json"
    write_json(
        prev_scan,
        {"findings": findings_for(("Confirmed", "Confirmed", "Confirmed", "Unsupported"))},
    )
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    rc_prev = run_tabulate(
        "--label", "diff-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    ).returncode
    assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}"

    # Current run: the worse one, diffed against the seed.
    cur_scan = tmp / "cur_scan.json"
    write_json(
        cur_scan,
        {"findings": findings_for(("Unsupported", "Unsupported", "Unsupported", "Confirmed"))},
    )
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    proc = run_tabulate(
        "--label", "diff-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )
    assert proc.returncode == 2, (
        f"regression diff must exit 2, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "REGRESSION" in proc.stdout and "Unsupported" in proc.stdout, proc.stdout
def test_diff_passes_on_improvement(tmp: Path) -> None:
    """An Unsupported drop from 3/4 to 1/4 must not be flagged by --diff.

    The seed run's exit status is asserted. Without that check a failed seed
    leaves the previous-results file as ``[]``, the diff has nothing to
    compare against, and the test would pass vacuously.
    """
    prev_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"),
    ]
    prev_scan = tmp / "prev_scan.json"
    write_json(prev_scan, {"findings": prev_findings})
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    seed = run_tabulate(
        "--label", "improve-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    )
    # Same guard the regression test applies to its seed run.
    assert seed.returncode == 0, (
        f"prev seed run must succeed, got {seed.returncode}\n"
        f"stdout: {seed.stdout}\nstderr: {seed.stderr}"
    )

    cur_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"),
    ]
    cur_scan = tmp / "cur_scan.json"
    write_json(cur_scan, {"findings": cur_findings})
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    proc = run_tabulate(
        "--label", "improve-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )
    assert proc.returncode == 0, (
        f"improvement diff must exit 0, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "no regressions" in proc.stdout, proc.stdout
def test_manual_triage_stamps_wrong_confirmed(tmp: Path) -> None:
    """--manual-triage marks a Confirmed finding wrong when a vuln:false entry matches.

    Phase 31 follow-up. The Confirmed finding at line 10 matches the triage
    entry at line 12 (within LINE_TOLERANCE=5). The one at line 100 matches
    nothing, so the cell ends with wrong_confirmed == 1 of 2 Confirmed,
    without relying on the host's verify-feedback log.
    """
    scan_path = tmp / "scan.json"
    confirmed_lines = (10, 100)
    write_json(
        scan_path,
        {
            "findings": [
                python_finding(SINK_BIT_SQL, "app.py", line, "Confirmed")
                for line in confirmed_lines
            ]
        },
    )
    triage_path = tmp / "triage.json"
    write_json(
        triage_path,
        [{"path": "app.py", "line": 12, "cap": "sqli", "vuln": False}],
    )
    results_path = tmp / "results.json"
    write_json(results_path, [])

    proc = run_tabulate(
        "--label", "triage-test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--manual-triage", str(triage_path),
    )
    assert proc.returncode == 0, (
        f"manual-triage run must succeed without budget, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )

    latest = json.loads(results_path.read_text())[-1]
    cells = {(c["cap"], c["lang"]): c for c in latest["cells"]}
    sqli_py = cells.get(("sqli", "python"))
    assert sqli_py is not None, f"expected sqli/python cell, got {list(cells)}"
    assert sqli_py["confirmed"] == 2, sqli_py
    assert sqli_py["wrong_confirmed"] == 1, (
        "exactly one Confirmed finding must be stamped wrong via the triage match; "
        f"got {sqli_py}"
    )
def test_manual_triage_ignores_vuln_true_entries(tmp: Path) -> None:
    # Triage entries with `vuln: true` are ground-truth-positive markers,
    # not False-Confirmed evidence. --manual-triage must leave them alone
    # so a real Confirmed-on-vuln-true row does not get downgraded.
    #
    # Fixture: one Confirmed finding at app.py:10 and a vuln:true triage row
    # on the very same line. Expected outcome: 1 Confirmed, 0 wrong.
    scan = tmp / "scan.json"
    write_json(
        scan,
        {
            "findings": [
                python_finding(SINK_BIT_SQL, "app.py", 10, "Confirmed"),
            ]
        },
    )
    triage = tmp / "triage.json"
    write_json(
        triage,
        [
            {"path": "app.py", "line": 10, "cap": "sqli", "vuln": True},
        ],
    )
    append = tmp / "results.json"
    write_json(append, [])
    proc = run_tabulate(
        "--label", "triage-true-test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--manual-triage", str(triage),
    )
    # Surface the subprocess output on failure, as the other checks do, so a
    # broken run is diagnosable from the assertion message alone.
    assert proc.returncode == 0, (
        f"manual-triage run must succeed without budget, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    results = json.loads(append.read_text())
    cells = {(c["cap"], c["lang"]): c for c in results[-1]["cells"]}
    # Use .get() + assert rather than indexing so a missing cell reports which
    # cells were actually produced instead of raising a bare KeyError.
    sqli_py = cells.get(("sqli", "python"))
    assert sqli_py is not None, f"expected sqli/python cell, got {list(cells)}"
    assert sqli_py["confirmed"] == 1, sqli_py
    assert sqli_py["wrong_confirmed"] == 0, (
        f"vuln:true triage rows must not stamp wrong; got {sqli_py}"
    )
def test_lang_filter_scopes_findings_and_gt(tmp: Path) -> None:
    # Phase 29 (Track R.2). `--lang` restricts a single-language corpus to its
    # target language, so incidental assets in other languages (e.g. the
    # vendored JavaScript a Rails app bundles, which nyx flags as
    # prototype_pollution) stay out of the corpus's per-cap metrics. Both
    # findings AND ground-truth entries outside the scope must be dropped.
    gt_path = tmp / "gt.json"
    gt_entries = [
        {"path": "app/models/user.rb", "line": 0, "cap": "sqli", "vuln": True},
        {"path": "app/assets/lib.js", "line": 0, "cap": "sqli", "vuln": True},
    ]
    write_json(gt_path, gt_entries)

    scan_path = tmp / "scan.json"
    ruby_finding = python_finding(SINK_BIT_SQL, "/x/app/models/user.rb", 10, "NotConfirmed")
    # A vendored-JS finding nyx would otherwise Confirm; `--lang ruby` must
    # exclude it entirely.
    js_finding = python_finding(SINK_BIT_SQL, "/x/app/assets/lib.js", 10, "Confirmed")
    write_json(scan_path, {"findings": [ruby_finding, js_finding]})

    # Arguments shared by the unscoped and the scoped invocation.
    shared_args = [
        "--label", "railsgoat",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
    ]

    # Without --lang, a cell shows up for each language.
    unscoped_path = tmp / "unscoped.json"
    write_json(unscoped_path, [])
    proc = run_tabulate(*shared_args, "--append", str(unscoped_path))
    assert proc.returncode == 0, proc.stdout + proc.stderr
    unscoped_keys = set()
    for cell in json.loads(unscoped_path.read_text())[-1]["cells"]:
        unscoped_keys.add((cell["cap"], cell["lang"]))
    assert ("sqli", "ruby") in unscoped_keys and ("sqli", "javascript") in unscoped_keys, unscoped_keys

    # With --lang ruby, the JS finding and the JS ground-truth positive both
    # disappear.
    scoped_path = tmp / "scoped.json"
    write_json(scoped_path, [])
    proc = run_tabulate(*shared_args, "--lang", "ruby", "--append", str(scoped_path))
    assert proc.returncode == 0, proc.stdout + proc.stderr
    scoped_cells = {}
    for cell in json.loads(scoped_path.read_text())[-1]["cells"]:
        scoped_cells[(cell["cap"], cell["lang"])] = cell
    assert ("sqli", "javascript") not in scoped_cells, f"JS must be filtered out: {list(scoped_cells)}"
    ruby = scoped_cells[("sqli", "ruby")]
    assert ruby["tp"] == 1 and ruby["fn"] == 0, ruby
    # The dropped JS positive must NOT come back as a phantom FN in any cell.
    assert not any(lang == "javascript" for _cap, lang in scoped_cells), scoped_cells
def test_static_lens_buckets_shell_escape_as_cmdi(tmp: Path) -> None:
    # Caveat-1 fix. In an environment with 0 dynamic confirmations, a Java
    # command-exec finding carries only SHELL_ESCAPE (1<<2). The default bit
    # table leaves that in "other", so the cmdi cell reads 0 TP / N FN no
    # matter how good the static analysis is. `--static` appends
    # SHELL_ESCAPE→cmdi so static recall becomes measurable without dynamic
    # confirmation.
    def latest_cells(results_path):
        # Index the most recent run's cells by (cap, lang).
        run = json.loads(results_path.read_text())[-1]
        return {(c["cap"], c["lang"]): c for c in run["cells"]}

    gt_path = tmp / "gt.json"
    cmdi_positive = {"path": "testcode/Cmd.java", "line": 0, "cap": "cmdi", "vuln": True}
    write_json(gt_path, [cmdi_positive])

    # Real Java taint findings carry id "taint-unsanitised-flow" (no cap
    # substring), so the rule-id fallback yields "other", unlike the
    # sqli/cmdi that a hand-crafted python_finding id would imply.
    verdict = {"status": "NotConfirmed"}
    java_cmdi = {
        "path": "/x/testcode/Cmd.java",
        "line": 10,
        "col": 0,
        "id": "taint-unsanitised-flow",
        "evidence": {"sink_caps": SINK_BIT_SHELL, "dynamic_verdict": verdict},
    }
    scan_path = tmp / "scan.json"
    write_json(scan_path, {"findings": [java_cmdi]})

    shared_args = [
        "--label", "owasp",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
    ]

    # Default lens: the finding lands in "other", so the GT positive shows up
    # in cmdi as a pure FN (recall 0). This is the measurement gap.
    default_path = tmp / "default.json"
    write_json(default_path, [])
    proc = run_tabulate(*shared_args, "--append", str(default_path))
    assert proc.returncode == 0, proc.stdout + proc.stderr
    cells = latest_cells(default_path)
    assert ("cmdi", "java") in cells and cells[("cmdi", "java")]["tp"] == 0, cells
    assert cells[("cmdi", "java")]["fn"] == 1, cells[("cmdi", "java")]
    assert ("other", "java") in cells, f"SHELL_ESCAPE must bucket as other by default: {list(cells)}"

    # Static lens: the finding lands in cmdi, so recall is measurable
    # (TP=1, FN=0).
    static_path = tmp / "static.json"
    write_json(static_path, [])
    proc = run_tabulate(*shared_args, "--static", "--append", str(static_path))
    assert proc.returncode == 0, proc.stdout + proc.stderr
    cells = latest_cells(static_path)
    cmdi = cells[("cmdi", "java")]
    assert cmdi["tp"] == 1 and cmdi["fn"] == 0, cmdi
    assert ("other", "java") not in cells, f"static lens must reclaim the other-bucketed finding: {list(cells)}"
def test_static_lens_preserves_higher_priority_bits(tmp: Path) -> None:
    # A finding with BOTH FILE_IO and SHELL_ESCAPE set has to keep landing in
    # path_traversal under the static lens (SHELL_ESCAPE is appended at the
    # lowest priority), so `--static` never pulls a finding out of a non-cmdi
    # cell.
    scan_path = tmp / "scan.json"
    mixed_caps = SINK_BIT_FILE | SINK_BIT_SHELL
    mixed_finding = python_finding(mixed_caps, "B.java", 10, "NotConfirmed")
    write_json(scan_path, {"findings": [mixed_finding]})

    # Same scan, once with the default lens and once with --static.
    for extra_args in ([], ["--static"]):
        # out0.json for the default run, out1.json for the --static run.
        out_path = tmp / f"out{len(extra_args)}.json"
        write_json(out_path, [])
        cli_args = [
            "--label", "x",
            "--scan", str(scan_path),
            "--inhouse",
            "--append", str(out_path),
        ]
        cli_args.extend(extra_args)
        proc = run_tabulate(*cli_args)
        assert proc.returncode == 0, proc.stdout + proc.stderr
        latest_run = json.loads(out_path.read_text())[-1]
        caps = {cell["cap"] for cell in latest_run["cells"]}
        assert caps == {"path_traversal"}, f"flag={extra_args}: {caps}"
def test_budget_malformed_exits_3(tmp: Path) -> None:
    # A budget file whose value is not valid TOML (a bare word where a number
    # is expected) must make the run exit with status 3.
    bad_budget = tmp / "bad.toml"
    bad_budget.write_text("[default]\nunsupported_rate = not_a_number\n")

    # The scan has no findings; only the budget file is at fault.
    scan_path = tmp / "scan.json"
    write_json(scan_path, {"findings": []})
    results_path = tmp / "results.json"
    write_json(results_path, [])

    cli_args = [
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(bad_budget),
    ]
    proc = run_tabulate(*cli_args)
    exit_code = proc.returncode
    assert exit_code == 3, (
        f"malformed budget must exit 3, got {proc.returncode}\nstderr: {proc.stderr}"
    )
def test_relative_gt_path_suffix_matches_absolute_finding(tmp: Path) -> None:
    # Phase 27. Ground truth stores corpus-relative paths while nyx emits
    # absolute ones. A relative GT path has to suffix-match the absolute
    # finding path, which keeps the committed JSON portable across machines
    # and CI checkouts.
    testcode_dir = "src/main/java/org/owasp/benchmark/testcode"
    checkout_root = "/home/ci/work/owasp"
    vuln_rel = f"{testcode_dir}/BenchmarkTest1.java"

    gt_path = tmp / "gt.json"
    gt_row = {
        "path": vuln_rel,
        "line": 0,
        "cap": "sqli",
        "vuln": True,
    }
    write_json(gt_path, [gt_row])

    # Absolute path ending in the GT relative path, so it counts as a TP.
    matching = python_finding(
        SINK_BIT_SQL,
        f"{checkout_root}/{vuln_rel}",
        10,
        "Confirmed",
    )
    # A different file in the same corpus has no GT positive, so it is an FP.
    unmatched = python_finding(
        SINK_BIT_SQL,
        f"{checkout_root}/{testcode_dir}/BenchmarkTest2.java",
        10,
        "NotConfirmed",
    )
    scan_path = tmp / "scan.json"
    write_json(scan_path, {"findings": [matching, unmatched]})

    results_path = tmp / "results.json"
    write_json(results_path, [])
    cli_args = [
        "--label", "owasp",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
        "--append", str(results_path),
    ]
    proc = run_tabulate(*cli_args)
    assert proc.returncode == 0, proc.stdout + proc.stderr

    latest_run = json.loads(results_path.read_text())[-1]
    cells = {}
    for cell in latest_run["cells"]:
        cells[(cell["cap"], cell["lang"])] = cell
    sqli_java = cells[("sqli", "java")]
    assert sqli_java["tp"] == 1, f"relative GT path must suffix-match absolute finding: {sqli_java}"
    assert sqli_java["fp"] == 1, f"benign-file finding must count as FP: {sqli_java}"
    assert sqli_java["fn"] == 0, sqli_java
def test_unmatched_gt_positive_lands_in_lang_cell(tmp: Path) -> None:
    # Phase 27. A ground-truth positive that no finding matches is an FN. It
    # has to be counted in the cell implied by its file extension (java)
    # rather than in a stray "unknown" lang cell; otherwise per-cap recall
    # aggregation is meaningless.
    gt_path = tmp / "gt.json"
    missed_positive = {
        "path": "src/main/java/org/owasp/benchmark/testcode/BenchmarkTest9.java",
        "line": 0,
        "cap": "sqli",
        "vuln": True,
    }
    write_json(gt_path, [missed_positive])

    # No findings at all, so the positive above cannot be matched.
    scan_path = tmp / "scan.json"
    write_json(scan_path, {"findings": []})
    results_path = tmp / "results.json"
    write_json(results_path, [])

    cli_args = [
        "--label", "owasp",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
        "--append", str(results_path),
    ]
    proc = run_tabulate(*cli_args)
    assert proc.returncode == 0, proc.stdout + proc.stderr

    latest_run = json.loads(results_path.read_text())[-1]
    cells = {}
    for cell in latest_run["cells"]:
        cells[(cell["cap"], cell["lang"])] = cell
    java_key = ("sqli", "java")
    assert java_key in cells, f"FN must land in the java cell: {list(cells)}"
    assert cells[java_key]["fn"] == 1, cells[java_key]
    assert ("sqli", "unknown") not in cells, f"no stray unknown-lang cell: {list(cells)}"
def test_gt_grounded_false_confirm(tmp: Path) -> None:
    # Phase 27. Given full ground truth, a Confirmed finding that matches no
    # GT positive is a false confirm, derived from GT alone with no
    # manual-triage file. Vuln file → confirmed_tp; benign/other file →
    # confirmed_fp → wrong_confirmed. This makes false_confirmed_rate
    # non-vacuous on a fresh corpus.
    gt_path = tmp / "gt.json"
    gt_rows = [
        {"path": "testcode/Vuln.java", "line": 0, "cap": "sqli", "vuln": True},
        {"path": "testcode/Benign.java", "line": 0, "cap": "sqli", "vuln": False},
    ]
    write_json(gt_path, gt_rows)

    # Correct confirm: the finding sits in the vulnerable file.
    true_confirm = python_finding(SINK_BIT_SQL, "/x/testcode/Vuln.java", 10, "Confirmed")
    # False confirm: the finding sits in the benign file (no GT positive).
    false_confirm = python_finding(SINK_BIT_SQL, "/x/testcode/Benign.java", 10, "Confirmed")
    scan_path = tmp / "scan.json"
    write_json(scan_path, {"findings": [true_confirm, false_confirm]})

    results_path = tmp / "results.json"
    write_json(results_path, [])
    cli_args = [
        "--label", "owasp",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
        "--append", str(results_path),
    ]
    proc = run_tabulate(*cli_args)
    assert proc.returncode == 0, proc.stdout + proc.stderr

    latest_run = json.loads(results_path.read_text())[-1]
    cells = {}
    for cell in latest_run["cells"]:
        cells[(cell["cap"], cell["lang"])] = cell
    sqli_java = cells[("sqli", "java")]
    assert sqli_java["confirmed_tp"] == 1, sqli_java
    assert sqli_java["confirmed_fp"] == 1, sqli_java
    assert sqli_java["wrong_confirmed"] == 1, (
        f"benign-file Confirmed must be a GT-derived false confirm: {sqli_java}"
    )
def test_budget_confirmed_rate_floor(tmp: Path) -> None:
    # Phase 27. budget.toml may set a per-cell `confirmed_rate` minimum; here
    # the sqli/java cell gets a 40% floor.
    budget_lines = [
        "[default]",
        "[[cell]]",
        'cap = "sqli"',
        'lang = "java"',
        "confirmed_rate = 0.40",
    ]
    budget = tmp / "budget.toml"
    budget.write_text("\n".join(budget_lines) + "\n")

    def sqli_findings(confirmed_count):
        # Five B.java findings at lines 10..50; the first `confirmed_count`
        # are Confirmed and the remainder NotConfirmed.
        built = []
        for position, line_no in enumerate((10, 20, 30, 40, 50)):
            status = "Confirmed" if position < confirmed_count else "NotConfirmed"
            built.append(python_finding(SINK_BIT_SQL, "B.java", line_no, status))
        return built

    # 1 Confirmed of 5 (20%) must trip the 40% floor.
    scan_fail = tmp / "scan_fail.json"
    write_json(scan_fail, {"findings": sqli_findings(1)})
    results_fail = tmp / "results_fail.json"
    write_json(results_fail, [])
    proc = run_tabulate(
        "--label", "owasp",
        "--scan", str(scan_fail),
        "--inhouse",
        "--append", str(results_fail),
        "--budget", str(budget),
    )
    assert proc.returncode == 2, proc.stdout + proc.stderr
    report_text = proc.stdout
    assert "Confirmed" in report_text and "sqli/java" in report_text, report_text

    # 3 Confirmed of 5 (60%) clears the floor.
    scan_ok = tmp / "scan_ok.json"
    write_json(scan_ok, {"findings": sqli_findings(3)})
    results_ok = tmp / "results_ok.json"
    write_json(results_ok, [])
    proc = run_tabulate(
        "--label", "owasp",
        "--scan", str(scan_ok),
        "--inhouse",
        "--append", str(results_ok),
        "--budget", str(budget),
    )
    assert proc.returncode == 0, proc.stdout + proc.stderr
def test_report_precision_recall_floors(tmp: Path) -> None:
    # Phase 27. report.py's --min-precision / --min-recall enforce per-cap
    # floors aggregated across langs. cmdi precision 0.20 trips 0.85; ldap
    # recall 0.10 trips 0.40; sqli (prec 1.0, rec 0.90) clears both.
    def cell(cap, lang, tp, fp, fn):
        # Build one result cell; every counter other than tp/fp/fn is zero and
        # `total` is the sum of the three.
        row = {"cap": cap, "lang": lang, "tp": tp, "fp": fp, "fn": fn}
        for zeroed in (
            "unsupported",
            "confirmed",
            "partially_confirmed",
            "wrong_confirmed",
            "stable_replays",
        ):
            row[zeroed] = 0
        row["total"] = tp + fp + fn
        return row

    floor_args = ["--min-precision", "0.85", "--min-recall", "0.40"]

    failing_cells = [
        cell("sqli", "java", 9, 0, 1),  # prec 1.00, rec 0.90 → OK
        cell("cmdi", "java", 1, 4, 0),  # prec 0.20 → FAIL precision
        cell("ldap_injection", "java", 1, 0, 9),  # rec 0.10 → FAIL recall
    ]
    results = tmp / "results.json"
    write_json(
        results,
        [{"label": "owasp", "total_findings": 0, "cells": failing_cells}],
    )
    proc = run_report("--results", str(results), *floor_args)
    assert proc.returncode == 2, proc.stdout + proc.stderr
    report_text = proc.stdout
    assert "PRECISION" in report_text and "cmdi" in report_text, report_text
    assert "RECALL" in report_text and "ldap_injection" in report_text, report_text

    # Clean run: only the passing sqli cap is present.
    clean = tmp / "clean.json"
    passing_cells = [cell("sqli", "java", 9, 0, 1)]
    write_json(
        clean,
        [{"label": "owasp", "total_findings": 0, "cells": passing_cells}],
    )
    proc = run_report("--results", str(clean), *floor_args)
    assert proc.returncode == 0, proc.stdout + proc.stderr
    assert "All per-cap precision/recall floors met" in proc.stdout, proc.stdout
def test_report_confirmed_rate_floor(tmp: Path) -> None:
    # One sqli/java cell with 2 Confirmed out of 5 total (0.40). The report is
    # expected to pass a 0.40 --min-confirmed-rate floor and to fail a 0.50
    # one with exit status 2.
    sqli_cell = {
        "cap": "sqli",
        "lang": "java",
        "tp": 0,
        "fp": 0,
        "fn": 0,
        "unsupported": 0,
        "confirmed": 2,
        "wrong_confirmed": 0,
        "stable_replays": 0,
        "total": 5,
    }
    run_entry = {"label": "owasp", "total_findings": 5, "cells": [sqli_cell]}
    results = tmp / "results.json"
    write_json(results, [run_entry])
    results_arg = str(results)

    # Floor equal to the measured rate: passes.
    proc = run_report("--results", results_arg, "--min-confirmed-rate", "0.40")
    assert proc.returncode == 0, proc.stdout + proc.stderr
    assert "All confirmed-rate floors met" in proc.stdout, proc.stdout

    # Floor above the measured rate: fails and names the cap.
    proc = run_report("--results", results_arg, "--min-confirmed-rate", "0.50")
    assert proc.returncode == 2, proc.stdout + proc.stderr
    report_text = proc.stdout
    assert "FAIL" in report_text and "sqli" in report_text, report_text
def main() -> int:
    # Run every regression check in order, each in its own fresh subdirectory
    # of a single temporary directory. Returns 0 once all checks have run;
    # nothing here catches exceptions, so a failing assertion propagates out
    # of main instead of producing a return value.
    checks = (
        test_budget_passes_on_clean_scan,
        test_budget_fails_when_unsupported_exceeds,
        test_diff_fails_on_regression,
        test_diff_passes_on_improvement,
        test_manual_triage_stamps_wrong_confirmed,
        test_manual_triage_ignores_vuln_true_entries,
        test_lang_filter_scopes_findings_and_gt,
        test_static_lens_buckets_shell_escape_as_cmdi,
        test_static_lens_preserves_higher_priority_bits,
        test_budget_malformed_exits_3,
        test_relative_gt_path_suffix_matches_absolute_finding,
        test_unmatched_gt_positive_lands_in_lang_cell,
        test_gt_grounded_false_confirm,
        test_budget_confirmed_rate_floor,
        test_report_precision_recall_floors,
        test_report_confirmed_rate_floor,
    )
    with tempfile.TemporaryDirectory() as td:
        tmp = Path(td)
        for fn in checks:
            # Per-check directory named after the function, so fixture files
            # with the same name in different checks cannot collide.
            sub = tmp / fn.__name__
            sub.mkdir()
            print(f"... {fn.__name__}")
            fn(sub)
            # Plain literal: the original used an f-string with no
            # placeholders here.
            print(" OK")
    print("\nAll tabulate.py regression checks passed.")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())