mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-27 20:29:39 +02:00
Dynamic (#77)
This commit is contained in:
parent
55247b7fcd
commit
991c84a1eb
1464 changed files with 225448 additions and 1985 deletions
352
tests/eval_corpus/budget.toml
Normal file
352
tests/eval_corpus/budget.toml
Normal file
|
|
@ -0,0 +1,352 @@
|
|||
# Eval corpus budget.
|
||||
#
|
||||
# `report.py` enforces these values when `run.sh` or `run_full.sh` pass
|
||||
# `--budget`. Each (cap, lang) cell uses the default row unless a specific
|
||||
# override appears below.
|
||||
#
|
||||
# Wall-clock cost is measured separately from this per-cell budget.
|
||||
#
|
||||
# Schema:
|
||||
#
|
||||
# [default]
|
||||
# unsupported_rate = 0.20 # max(Unsupported / total) per cell
|
||||
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cap
|
||||
# repro_stability = 0.95 # min(stable / Confirmed) per cell
|
||||
# confirmed_rate = 0.40 # min(Confirmed / total) per cell (omit to skip)
|
||||
# ratchet_deadline = "..." # informational; cells already at headline
|
||||
#
|
||||
# [[cell]]
|
||||
# cap = "..."
|
||||
# lang = "..."
|
||||
# <overrides as above>
|
||||
#
|
||||
# `cap` matches `tabulate.py`'s _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
|
||||
# `lang` matches the ext_map values (`python`, `javascript`, …).
|
||||
# A wildcard `"*"` matches any cell that does not have an exact entry.
|
||||
#
|
||||
# Each rate is enforced only when the relevant denominator is non-zero, so a
|
||||
# cell with no findings (or no Confirmed findings) never trips a budget
|
||||
# vacuously. `confirmed_rate` is a *minimum* (a ratchet floor); the others are
|
||||
# maxima. Per-cell overrides are calibrated to the measured frontier on the
|
||||
# real corpus so the gate locks in current performance and catches regressions
|
||||
# (see the OWASP cells below).
|
||||
|
||||
[default]
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-05-15"
|
||||
|
||||
# ── OWASP Benchmark v1.2 (Java) — Track R.0 ratchet ──────────────────────────
|
||||
#
|
||||
# Calibrated against the pinned 1.2beta corpus, nyx c0501884, 2026-05-29
|
||||
# (`nyx scan --verify` over all 2740 BenchmarkTest files; 5812 findings).
|
||||
#
|
||||
# Measured frontier at calibration:
|
||||
# verdicts : Confirmed 0 | NotConfirmed 4077 | Inconclusive 1725
|
||||
# (BuildFailed 952 + SpecDerivationFailed 773) | Unsupported 10
|
||||
# per cell : unsupported_rate <= 1.7% (headline <= 20% -> MET)
|
||||
# false_confirmed = 0% (headline <= 2% -> MET, 0 confirms)
|
||||
# confirmed_rate = 0% (headline >= 40% -> NOT met)
|
||||
#
|
||||
# The verifier confirms nothing on OWASP yet: every BenchmarkTest is a servlet
|
||||
# whose harness lands in BuildFailed / SpecDerivationFailed (Java servlet entry
|
||||
# wiring + classpath are Track L.12 / Track O.0 work). So the enforced limits
|
||||
# below are the two headline maxima the verifier already satisfies
|
||||
# (unsupported_rate, false_confirmed_rate). `confirmed_rate` is intentionally
|
||||
# left UNSET — the headline >= 40% is the ratchet's destination, recorded here
|
||||
# and in scripts/m7_ship_gate.sh (NYX_OWASP_FLOOR_CAPS), not a floor we can
|
||||
# honestly assert at 0 confirms. Promote a cap into the gated set (and add its
|
||||
# `confirmed_rate`) the moment it starts Confirming.
|
||||
#
|
||||
# Caps split two ways:
|
||||
# sound-oracle (injection): cmdi, sqli, path_traversal, ldap_injection,
|
||||
# xpath_injection — once their servlet harnesses build, a runtime oracle
|
||||
# exists; these are the GATE6_FLOOR_CAPS candidates.
|
||||
# no-sound-oracle (config/usage smell): crypto (weak rand/hash), auth
|
||||
# (insecure cookie), xss/trustbound — Phase-11 routes these to
|
||||
# Unsupported(SoundOracleUnavailable); they stay report-only. When that
|
||||
# routing lands their unsupported_rate will rise and these cells must be
|
||||
# relaxed accordingly.
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "path_traversal"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "ldap_injection"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "xpath_injection"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "xss"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "auth"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# ── NodeGoat / Juice Shop (JS/TS) — Track R.1 ratchet ────────────────────────
|
||||
#
|
||||
# Phase 28 wires two intentionally-vulnerable JS/TS apps into the same
|
||||
# acceptance machinery as OWASP Benchmark: OWASP NodeGoat (Express, .js)
|
||||
# and OWASP Juice Shop (TypeScript, .ts). Unlike OWASP Benchmark, neither
|
||||
# app ships vuln/benign *pairs* — every labelled file is `vuln = true` (see
|
||||
# ground_truth/{nodegoat,juiceshop}.manifest.toml). Two consequences for
|
||||
# these cells:
|
||||
#
|
||||
# * false_confirmed_rate (<= 2%) is the headline maximum the verifier
|
||||
# already satisfies and is HARD-enforced: it only trips when a Confirmed
|
||||
# finding lands on a file with no ground-truth positive, i.e. an
|
||||
# over-confirm. With the verifier confirming little on real corpora yet
|
||||
# it is satisfied, and it ratchets precision as confirms grow.
|
||||
# * unsupported_rate (<= 20%) is HARD-enforced too. `Unsupported` counts
|
||||
# only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable —
|
||||
# a narrow bucket that Tracks J + M shrank — *not* BuildFailed /
|
||||
# SpecDerivationFailed (those are Inconclusive), so it stays low.
|
||||
#
|
||||
# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the
|
||||
# Phase 28 acceptance DESTINATIONS. They are intentionally left UNSET here
|
||||
# and published report-only by Gate 7 (NYX_JSTS_FLOOR_CAPS empty by default,
|
||||
# mirroring NYX_OWASP_FLOOR_CAPS) because (a) the verifier does not yet
|
||||
# Confirm these corpora end to end and (b) the manifest labels canonical
|
||||
# vulns only, so precision vs partial ground truth is informational until
|
||||
# the labels are completed. Promote a cap into the floor set the moment it
|
||||
# starts Confirming, exactly as for OWASP.
|
||||
|
||||
# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "xss"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "unauthorized_id"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "path_traversal"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "redirect"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "ssrf"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ──────────────
|
||||
#
|
||||
# Phase 29 wires five more intentionally-vulnerable real corpora, one per
|
||||
# remaining language family, into the same acceptance machinery as OWASP /
|
||||
# NodeGoat / Juice Shop:
|
||||
#
|
||||
# * railsgoat — OWASP RailsGoat (Rails, .rb)
|
||||
# * dvwa — Damn Vulnerable Web Application (PHP); ships graded
|
||||
# source variants, so low.php = vuln and impossible.php =
|
||||
# benign control — real vuln/benign PAIRS like OWASP.
|
||||
# * dvpwa — Damn Vulnerable Python Web App (aiohttp, .py); its
|
||||
# parameterized DAO siblings are benign controls for the
|
||||
# one `%`-formatted SQL sink.
|
||||
# * gosec — the Go SAST tool's own repo; the scannable, `// want`-
|
||||
# annotated sample under goanalysis/testdata is the curated
|
||||
# ground truth (its embedded-string rule samples are not
|
||||
# scannable, so they are unlabelled).
|
||||
# * rustsec — RustSec advisory-db: a NEGATIVE CONTROL. It ships
|
||||
# advisory metadata, not vulnerable .rs source, so its
|
||||
# ground truth is empty by construction; the row asserts the
|
||||
# Rust scan/verify path runs at scale within wall-clock and
|
||||
# Confirms NOTHING (any Confirmed Rust finding there is a
|
||||
# false confirm and trips the default false_confirmed_rate).
|
||||
#
|
||||
# Each row is gated with the SAME policy as Gates 6/7 (scripts/m7_ship_gate.sh
|
||||
# Gate 8): wall-clock + the per-(cap,lang) budget below are HARD-enforced;
|
||||
# per-cap confirmed-rate / precision / recall are published report-only
|
||||
# (NYX_POLYGLOT_FLOOR_CAPS empty by default). Because each corpus targets a
|
||||
# single language, Gate 8 scopes tabulation to that language (tabulate.py
|
||||
# --lang), so the vendored third-party JavaScript these Ruby/Python apps
|
||||
# bundle (bootstrap-colorpicker, materialize, …) — which nyx confirms as
|
||||
# prototype_pollution — does not pollute the corpus's per-cap metrics. Those
|
||||
# JS findings are still emitted; they are simply out of scope for a Ruby /
|
||||
# Python corpus.
|
||||
#
|
||||
# Calibrated against the pinned corpora (nyx HEAD of the Phase 29 branch,
|
||||
# 2026-05-31) with `nyx scan --verify --index off`. Measured frontier
|
||||
# (target-language scope): every curated cell sits at <= the headline maxima
|
||||
# below EXCEPT cmdi, where the typical finding (every one for ruby/go, ~69% for php) carries a SHELL_ESCAPE sanitizer cap
|
||||
# and is therefore routed to Unsupported(SoundOracleUnavailable) — the same
|
||||
# no-sound-oracle treatment OWASP's crypto/auth cells get. RailsGoat's
|
||||
# deserialize (Marshal.load) and redirect (open redirect) cells Confirm end to
|
||||
# end with zero false confirms — the first real polyglot confirms.
|
||||
|
||||
# railsgoat (ruby): caps with a ground-truth label in railsgoat.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "auth"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "redirect"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "path_traversal"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# cmdi/ruby is incidental (RailsGoat's `self.try(params[:graph])` reflection
|
||||
# sink); the lone finding carries a SHELL_ESCAPE sanitizer cap and routes to
|
||||
# Unsupported(SoundOracleUnavailable), so unsupported_rate is locked at the
|
||||
# measured frontier (1/1). The false-confirm guard stays at the headline 2%.
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 1.00
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# dvwa (php): caps with a ground-truth label in dvwa.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "redirect"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "header_injection"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# cmdi/php: DVWA's ping handlers reach shell_exec through a SHELL_ESCAPE
|
||||
# sanitizer cap, so ~69% of the cell's findings route to
|
||||
# Unsupported(SoundOracleUnavailable). unsupported_rate is locked to that
|
||||
# frontier with margin (a regression above 75% fails); false-confirm at 2%.
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# dvpwa (python): caps with a ground-truth label in dvpwa.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "auth"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# gosec (go): caps with a ground-truth label in gosec.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "go"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# cmdi/go: the goanalysis/testdata exec.Command sample reaches the sink
|
||||
# through a SHELL_ESCAPE sanitizer cap, so every cmdi/go finding routes to
|
||||
# Unsupported(SoundOracleUnavailable). unsupported_rate locked to the
|
||||
# measured frontier (3/3); false-confirm at the headline 2%.
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "go"
|
||||
unsupported_rate = 1.00
|
||||
false_confirmed_rate = 0.02
|
||||
173
tests/eval_corpus/check_surface.sh
Executable file
173
tests/eval_corpus/check_surface.sh
Executable file
|
|
@ -0,0 +1,173 @@
|
|||
#!/usr/bin/env bash
|
||||
# Phase 31 acceptance walker: assert `nyx surface` produces a usable
|
||||
# map on every downloaded eval-corpus fixture root.
|
||||
#
|
||||
# Walks the project trees under $NYX_EVAL_CORPUS_DIR plus (with --also-inhouse) the in-house
|
||||
# `tests/benchmark/corpus` and `tests/dynamic_fixtures` trees, runs
|
||||
# `nyx surface --build --format json <root>` against each, and asserts
|
||||
# the resulting JSON contains at least one EntryPoint plus at least
|
||||
# one DataStore / ExternalService / DangerousLocal node.
|
||||
#
|
||||
# `--build` forces the inline pass-1 + call-graph path so the walker
|
||||
# does not depend on a prior `nyx index build` or `nyx scan`.
|
||||
#
|
||||
# Usage:
|
||||
# tests/eval_corpus/check_surface.sh [--nyx BIN] [--corpus-dir DIR]
|
||||
# [--also-inhouse]
|
||||
# [--report FILE]
|
||||
#
|
||||
# Environment:
|
||||
# NYX_EVAL_CORPUS_DIR — path to pre-downloaded corpus roots
|
||||
# (default: ~/.cache/nyx/eval_corpus). When
|
||||
# missing or empty the walker still scans the
|
||||
# in-house corpus (only when --also-inhouse is given) and exits 0 so CI without a
|
||||
# corpus mirror does not block on Phase 31.
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 every walked project produced a usable SurfaceMap (or no
|
||||
# projects were available — see corpus-missing note above).
|
||||
# 1 setup / I/O / missing-binary error.
|
||||
# 2 one or more projects produced an empty or unusable SurfaceMap.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
|
||||
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
|
||||
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
|
||||
ALSO_INHOUSE="false"
|
||||
REPORT_FILE=""
|
||||
|
||||
# Parse command-line flags.
#
# Flags that take a value (--nyx, --corpus-dir, --report) fail with a clear
# message and exit 1 when the value is missing, instead of tripping `set -u`
# on an unbound "$2". Unknown flags also exit 1.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --nyx|--corpus-dir|--report)
            if [[ $# -lt 2 ]]; then
                echo "error: $1 requires a value" >&2
                exit 1
            fi
            case "$1" in
                --nyx)        NYX_BIN="$2" ;;
                --corpus-dir) CORPUS_CACHE="$2" ;;
                --report)     REPORT_FILE="$2" ;;
            esac
            shift 2
            ;;
        --also-inhouse)
            ALSO_INHOUSE="true"
            shift
            ;;
        -h|--help)
            # Print only the leading comment header: skip the shebang, stop at
            # the first line that is not a comment. A fixed line range would
            # drift and leak script code into the help text.
            awk 'NR == 1 { next } !/^#/ { exit } { print }' "$0"
            exit 0
            ;;
        *)
            echo "unknown flag: $1" >&2
            exit 1
            ;;
    esac
done
|
||||
|
||||
# Logging helpers.
#   die  — print "error: <msg>" on stderr and exit 1 (setup / I/O errors).
#   info — print a "[surface-check]"-tagged progress line on stdout.
#   warn — print a "[surface-check] WARN:"-tagged line on stderr.
die() {
    printf '%s\n' "error: $*" >&2
    exit 1
}

info() {
    printf '%s\n' "[surface-check] $*"
}

warn() {
    printf '%s\n' "[surface-check] WARN: $*" >&2
}
|
||||
|
||||
# Prerequisites: an executable nyx binary and jq on PATH. Either one missing
# is a setup error (exit 1 via die).
if [[ ! -x "$NYX_BIN" ]]; then
    die "nyx binary not found or not executable: $NYX_BIN"
fi
if ! command -v jq >/dev/null 2>&1; then
    die "required command not found: jq"
fi
|
||||
|
||||
# Gather the project roots to walk. Every immediate subdirectory of the corpus
# cache counts as one project. With --also-inhouse the two in-house trees are
# appended too, each only if it exists on disk.
PROJECTS=()
if [[ ! -d "$CORPUS_CACHE" ]]; then
    warn "corpus directory missing: $CORPUS_CACHE (run tests/eval_corpus/run.sh to bootstrap)"
else
    for candidate in "$CORPUS_CACHE"/*; do
        if [[ -d "$candidate" ]]; then
            PROJECTS+=("$candidate")
        fi
    done
fi

if [[ "$ALSO_INHOUSE" == "true" ]]; then
    inhouse_roots=(
        "${REPO_ROOT}/tests/benchmark/corpus"
        "${REPO_ROOT}/tests/dynamic_fixtures"
    )
    for candidate in "${inhouse_roots[@]}"; do
        if [[ -d "$candidate" ]]; then
            PROJECTS+=("$candidate")
        fi
    done
fi

# Nothing to walk is not an error: exit 0 so CI without a corpus mirror passes.
if (( ${#PROJECTS[@]} == 0 )); then
    info "no project roots to walk (eval corpus not downloaded, in-house trees absent)"
    exit 0
fi
|
||||
|
||||
PASS_COUNT=0
FAIL_COUNT=0
FAIL_PROJECTS=()
declare -a REPORT_ROWS=()

# Record one failed project.
#   $1  human-readable "<project> (<reason>)" line for the final summary
#   $2  pre-built JSON object appended to the report rows
_record_failure() {
    FAIL_COUNT=$((FAIL_COUNT + 1))
    FAIL_PROJECTS+=("$1")
    REPORT_ROWS+=("$2")
}

# Walk every project root: run `nyx surface`, then require at least one
# EntryPoint node plus at least one DataStore / ExternalService /
# DangerousLocal node. Every failure mode is recorded per project so the walk
# always reaches the report and summary steps.
for project in "${PROJECTS[@]}"; do
    info "walking: $project"

    # JSON-encode the path once; every report row for this project reuses it.
    project_json="$(jq -n --arg p "$project" '$p')"

    # `|| rc=$?` keeps errexit from aborting the walk on a failing project.
    rc=0
    out="$("$NYX_BIN" surface --build --format json "$project" 2>/dev/null)" || rc=$?
    if [[ $rc -ne 0 ]]; then
        warn "nyx surface --build exited $rc on $project"
        _record_failure "$project (nyx exit=$rc)" \
            "$(printf '{"project":%s,"status":"nyx-error","exit":%d}' "$project_json" "$rc")"
        continue
    fi
    if [[ -z "$out" ]]; then
        warn "empty output on $project"
        _record_failure "$project (empty output)" \
            "$(printf '{"project":%s,"status":"empty-output"}' "$project_json")"
        continue
    fi

    # Count nodes by kind in a single jq pass. SurfaceMap serialises each node
    # as a flat object with a `node` discriminator: `entry_point`,
    # `data_store`, `external_service`, `dangerous_local`.
    # The `if !` guard matters: under `set -e` an unguarded jq failure
    # (malformed JSON, missing `.nodes`) would abort the whole script before
    # the report and summary are written.
    if ! counts="$(jq -r '
            [.nodes[].node] as $kinds
            | ["entry_point", "data_store", "external_service", "dangerous_local"]
            | map(. as $k | [$kinds[] | select(. == $k)] | length)
            | @tsv' <<<"$out" 2>/dev/null)"; then
        warn "unparseable SurfaceMap JSON on $project"
        _record_failure "$project (invalid JSON)" \
            "$(printf '{"project":%s,"status":"invalid-json"}' "$project_json")"
        continue
    fi
    read -r entry_count ds_count es_count dl_count <<<"$counts"
    sink_count=$((ds_count + es_count + dl_count))

    if (( entry_count < 1 )); then
        warn "no EntryPoint nodes on $project"
        _record_failure "$project (no entry-points)" \
            "$(printf '{"project":%s,"status":"no-entry-points","entry_count":%d}' \
                "$project_json" "$entry_count")"
        continue
    fi
    if (( sink_count < 1 )); then
        warn "no DataStore / ExternalService / DangerousLocal nodes on $project"
        _record_failure "$project (no sinks: ds=$ds_count es=$es_count dl=$dl_count)" \
            "$(printf '{"project":%s,"status":"no-sinks","entry_count":%d,"ds":%d,"es":%d,"dl":%d}' \
                "$project_json" "$entry_count" "$ds_count" "$es_count" "$dl_count")"
        continue
    fi

    info " ok: ${entry_count} entry-points, ${ds_count} data stores, ${es_count} external, ${dl_count} dangerous"
    PASS_COUNT=$((PASS_COUNT + 1))
    REPORT_ROWS+=("$(printf '{"project":%s,"status":"ok","entry_count":%d,"ds":%d,"es":%d,"dl":%d}' \
        "$project_json" "$entry_count" "$ds_count" "$es_count" "$dl_count")")
done
|
||||
|
||||
# Optional machine-readable report: pass/fail totals plus one JSON object per
# walked project. Rows are already valid JSON objects; the comma is omitted
# after the last row only.
if [[ -n "$REPORT_FILE" ]]; then
    last_idx=$(( ${#REPORT_ROWS[@]} - 1 ))
    {
        printf '%s\n' "{"
        printf '%s\n' " \"pass\": $PASS_COUNT,"
        printf '%s\n' " \"fail\": $FAIL_COUNT,"
        printf '%s\n' " \"projects\": ["
        for idx in "${!REPORT_ROWS[@]}"; do
            if (( idx == last_idx )); then
                printf '%s\n' " ${REPORT_ROWS[$idx]}"
            else
                printf '%s\n' " ${REPORT_ROWS[$idx]},"
            fi
        done
        printf '%s\n' " ]"
        printf '%s\n' "}"
    } > "$REPORT_FILE"
    info "report written: $REPORT_FILE"
fi
|
||||
|
||||
# Final summary. Exit 0 when every walked project passed; otherwise list each
# failure with its reason and exit 2 (unusable SurfaceMap).
total_count=$((PASS_COUNT + FAIL_COUNT))
info ""
info "summary: ${PASS_COUNT} pass, ${FAIL_COUNT} fail (of ${total_count} projects)"
if (( FAIL_COUNT == 0 )); then
    exit 0
fi
for failed in "${FAIL_PROJECTS[@]}"; do
    info " fail: $failed"
done
exit 2
|
||||
106
tests/eval_corpus/ground_truth/README.md
Normal file
106
tests/eval_corpus/ground_truth/README.md
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
# Ground truth files
|
||||
|
||||
Place corpus ground truth JSON files here before running `tests/eval_corpus/run.sh`.
|
||||
|
||||
## OWASP Benchmark v1.2
|
||||
|
||||
File: `owasp_benchmark_v1.2.json` (checked in; complete — one record per
|
||||
BenchmarkTest file, 2740 total).
|
||||
|
||||
Format:
|
||||
```json
|
||||
[
|
||||
{"path": "src/main/java/org/owasp/.../BenchmarkTest00001.java", "line": 0, "cap": "sqli", "vuln": true},
|
||||
...
|
||||
]
|
||||
```
|
||||
|
||||
`path` is **relative to the corpus root** (the BenchmarkJava clone), with POSIX
|
||||
separators. `tabulate.py` suffix-matches it against the absolute paths nyx
|
||||
emits, so the committed JSON is portable: it matches whether the corpus lives at
|
||||
`~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2` on a laptop or at a CI checkout
|
||||
path. `line` is `0` (the expected-results CSV does not pin a line; matching
|
||||
falls back to file+cap).
|
||||
|
||||
Regenerate from `expectedresults-1.2beta.csv` shipped with the benchmark repo:
|
||||
```sh
|
||||
python3 tests/eval_corpus/owasp_gt_convert.py \
|
||||
--corpus-dir ~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2 \
|
||||
--output tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
|
||||
```
|
||||
|
||||
## NIST SARD subset
|
||||
|
||||
File: `nist_sard.json`
|
||||
|
||||
Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`.
|
||||
|
||||
## OWASP NodeGoat / OWASP Juice Shop (JS/TS — Track R.1)
|
||||
|
||||
Files: `nodegoat.json` (Express, `.js`), `juiceshop.json` (TypeScript, `.ts`).
|
||||
Same four-field format as above; all records are `vuln: true`.
|
||||
|
||||
These two apps are intentionally vulnerable end to end, so — unlike OWASP
|
||||
Benchmark — they ship no machine-readable per-file vuln labels and have no
|
||||
benign-control files to pair against. The authoritative source is a curated
|
||||
TOML manifest committed here, one `[[entry]]` per known-vulnerable handler
|
||||
with a `note` citing why:
|
||||
|
||||
- `nodegoat.manifest.toml`
|
||||
- `juiceshop.manifest.toml`
|
||||
|
||||
`manifest_gt_convert.py` turns a manifest into the committed `.json`:
|
||||
|
||||
```sh
|
||||
python3 tests/eval_corpus/manifest_gt_convert.py \
|
||||
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \
|
||||
--output tests/eval_corpus/ground_truth/nodegoat.json
|
||||
```
|
||||
|
||||
Pass `--corpus-dir <clone>` to validate every labelled path against a real
|
||||
checkout. The converter exits non-zero if any path is missing, so a corpus
|
||||
bump that moves a handler fails loudly instead of silently dropping recall.
|
||||
CI (`.github/workflows/eval.yml`, `jsts` job) regenerates each `.json`
|
||||
against a fresh clone of the pinned ref and asserts it matches the committed
|
||||
file.
|
||||
|
||||
Because the manifests label canonical vulns only, recall (did nyx catch the
|
||||
known vulns) is the meaningful metric; precision vs this partial ground
|
||||
truth is informational. Gate 7 publishes per-cap precision/recall/confirmed
|
||||
report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP
|
||||
gate.
|
||||
|
||||
## Polyglot real corpora (Ruby/PHP/Python/Go/Rust — Track R.2)
|
||||
|
||||
Phase 29 wires the remaining language families into the same machinery, one
|
||||
corpus per family, each with a curated `*.manifest.toml` → committed `*.json`:
|
||||
|
||||
- `railsgoat.{manifest.toml,json}` — OWASP RailsGoat (Rails, `.rb`).
|
||||
- `dvwa.{manifest.toml,json}` — Damn Vulnerable Web Application (PHP). DVWA
|
||||
ships graded source variants (`source/{low,impossible}.php`), so this is
|
||||
the one Track R corpus besides OWASP with real vuln/benign **pairs**
|
||||
(`low.php` = vuln, `impossible.php` = benign control) — precision is
|
||||
meaningful here, not just informational.
|
||||
- `dvpwa.{manifest.toml,json}` — Damn Vulnerable Python Web App (aiohttp,
|
||||
`.py`). Its parameterized DAO siblings are benign controls for the one
|
||||
`%`-formatted SQL sink.
|
||||
- `gosec.{manifest.toml,json}` — the gosec Go SAST tool repo; the scannable,
|
||||
`// want`-annotated sample under `goanalysis/testdata` is the curated
|
||||
ground truth (gosec's string-embedded rule samples are not scannable, so
|
||||
they are deliberately unlabelled).
|
||||
- `rustsec.{manifest.toml,json}` — RustSec advisory-db, a **negative
|
||||
control**. advisory-db ships advisory metadata, not vulnerable `.rs`
|
||||
source, so its committed ground truth is empty (`[]`) by construction. The
|
||||
manifest sets `negative_control = true` (mutually exclusive with
|
||||
`[[entry]]` tables); `manifest_gt_convert.py` emits the empty JSON and the
|
||||
row asserts the Rust scan/verify path runs at scale within wall-clock and
|
||||
Confirms nothing there (any Confirmed Rust finding is a false confirm).
|
||||
|
||||
These are converted, validated and asserted-in-sync exactly like NodeGoat /
|
||||
Juice Shop (the `polyglot` job in `.github/workflows/eval.yml`). Because each
|
||||
corpus targets a single language, Gate 8 scopes tabulation to that language
|
||||
(`tabulate.py --lang`) so the vendored third-party JavaScript these Ruby /
|
||||
Python apps bundle does not pollute their per-cap metrics. Gate 8 publishes
|
||||
per-cap precision/recall/confirmed report-only by default
|
||||
(`NYX_POLYGLOT_FLOOR_CAPS` empty), matching the OWASP and JS/TS gates. See
|
||||
`tests/eval_corpus/budget.toml` for the per-(cap,lang) gate policy.
|
||||
38
tests/eval_corpus/ground_truth/dvpwa.json
Normal file
38
tests/eval_corpus/ground_truth/dvpwa.json
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
[
|
||||
{
|
||||
"path": "sqli/dao/course.py",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "sqli/dao/mark.py",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "sqli/dao/review.py",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "sqli/dao/student.py",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "sqli/dao/user.py",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "sqli/views.py",
|
||||
"line": 0,
|
||||
"cap": "auth",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
70
tests/eval_corpus/ground_truth/dvpwa.manifest.toml
Normal file
70
tests/eval_corpus/ground_truth/dvpwa.manifest.toml
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
# DVPWA (Damn Vulnerable Python Web Application) — curated ground-truth
|
||||
# manifest (Phase 29, Track R.2).
|
||||
#
|
||||
# DVPWA is an intentionally-vulnerable aiohttp app whose headline flaw is
|
||||
# SQL injection (the package is literally named `sqli`). It ships no
|
||||
# machine-readable per-file labels, so this manifest IS the authoritative
|
||||
# source. Its DAO layer is convenient: one method builds a query with
|
||||
# Python `%` string-formatting (the injectable sink) while its siblings use
|
||||
# proper parameterized `cur.execute(q, params)` — so the parameterized DAOs
|
||||
# serve as genuine benign controls (vuln = false) for the sqli cell, making
|
||||
# precision there meaningful, not just informational.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/dvpwa.json. CI regenerates it against a fresh clone of the
|
||||
# pinned ref and asserts byte-equality; the converter HARD-ERRORS on any
|
||||
# path that no longer exists, so a corpus bump that moves a DAO fails the
|
||||
# job loudly rather than silently dropping recall.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies each
|
||||
# sink (the request-scoped ownership lookups in views.py surface as `auth`).
|
||||
# `path` is relative to the DVPWA clone root, POSIX separators. Lang is
|
||||
# inferred from the extension (.py -> python). See
|
||||
# tests/eval_corpus/budget.toml for the gate policy on these cells.
|
||||
|
||||
corpus = "dvpwa"
|
||||
upstream = "https://github.com/anxolerd/dvpwa"
|
||||
# DVPWA publishes no release tags; the eval job pins the default branch via
|
||||
# the CI cache key (clone HEAD a1d8f89fac2e57093189853c6527c2b01fc1d9c1).
|
||||
# The sqli/ package layout has been stable; re-validate if the cache key is
|
||||
# bumped.
|
||||
pinned_ref = "master"
|
||||
|
||||
# ── SQL injection (sqli) — one injectable sink + parameterized controls ──────
|
||||
[[entry]]
|
||||
path = "sqli/dao/student.py"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "Student.create builds the INSERT with Python `%` formatting (\"... VALUES ('%(name)s')\" % {'name': name}) on the request-supplied student name, then cur.execute(q) — SQL injection."
|
||||
|
||||
[[entry]]
|
||||
path = "sqli/dao/course.py"
|
||||
cap = "sqli"
|
||||
vuln = false
|
||||
note = "benign control: every Course query uses parameterized cur.execute(q, params) / VALUES (%(title)s, %(description)s) — not injectable."
|
||||
|
||||
[[entry]]
|
||||
path = "sqli/dao/review.py"
|
||||
cap = "sqli"
|
||||
vuln = false
|
||||
note = "benign control: Review.create / get_for_course bind via cur.execute(q, params) with %(course_id)s / %s placeholders — parameterized."
|
||||
|
||||
[[entry]]
|
||||
path = "sqli/dao/mark.py"
|
||||
cap = "sqli"
|
||||
vuln = false
|
||||
note = "benign control: Mark.create / get_for_student bind via parameterized cur.execute(q, params) — not injectable."
|
||||
|
||||
# ── Weak crypto (crypto) ─────────────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "sqli/dao/user.py"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "User.check_password compares against md5(password).hexdigest() — unsalted MD5 for credential storage (weak cryptography)."
|
||||
|
||||
# ── Broken access control (auth) ─────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "sqli/views.py"
|
||||
cap = "auth"
|
||||
vuln = true
|
||||
note = "request handlers resolve the acting user from a client-controlled session id and act on objects without an ownership/authorization check — broken access control."
|
||||
50
tests/eval_corpus/ground_truth/dvwa.json
Normal file
50
tests/eval_corpus/ground_truth/dvwa.json
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
[
|
||||
{
|
||||
"path": "vulnerabilities/exec/source/impossible.php",
|
||||
"line": 0,
|
||||
"cap": "cmdi",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/exec/source/low.php",
|
||||
"line": 0,
|
||||
"cap": "cmdi",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/open_redirect/source/impossible.php",
|
||||
"line": 0,
|
||||
"cap": "header_injection",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/open_redirect/source/impossible.php",
|
||||
"line": 0,
|
||||
"cap": "redirect",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/open_redirect/source/low.php",
|
||||
"line": 0,
|
||||
"cap": "header_injection",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/open_redirect/source/low.php",
|
||||
"line": 0,
|
||||
"cap": "redirect",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/sqli/source/impossible.php",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/sqli/source/low.php",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
84
tests/eval_corpus/ground_truth/dvwa.manifest.toml
Normal file
84
tests/eval_corpus/ground_truth/dvwa.manifest.toml
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
# DVWA (Damn Vulnerable Web Application) — curated ground-truth manifest
|
||||
# (Phase 29, Track R.2).
|
||||
#
|
||||
# DVWA is an intentionally-vulnerable PHP app. Unlike the other Track R
|
||||
# apps it ships its vulnerabilities as graded source variants under
|
||||
# vulnerabilities/<module>/source/{low,medium,high,impossible}.php, where
|
||||
# `low.php` is the textbook-vulnerable handler and `impossible.php` is the
|
||||
# hardened, secure rewrite of the SAME sink. That gives DVWA real
|
||||
# vuln/benign PAIRS (low = vuln, impossible = benign control) the way OWASP
|
||||
# Benchmark does — so precision against this manifest is meaningful, not
|
||||
# just informational: a Confirmed finding on an `impossible.php` control is
|
||||
# a genuine false confirm.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/dvwa.json. CI regenerates it against a fresh clone of the
|
||||
# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any
|
||||
# path that no longer exists, so a DVWA bump that restructures a module
|
||||
# fails loudly rather than silently dropping recall. Re-pin `pinned_ref`
|
||||
# and re-validate the paths together.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies the
|
||||
# sink. `path` is relative to the DVWA clone root, POSIX separators. Lang
|
||||
# is inferred from the extension (.php -> php). See
|
||||
# tests/eval_corpus/budget.toml for the gate policy on these cells.
|
||||
|
||||
corpus = "dvwa"
|
||||
upstream = "https://github.com/digininja/DVWA"
|
||||
# Pinned to release tag 2.5 (clone HEAD
|
||||
# a96943dc1f52f390ee5df72144660636c4b7dd06). The
|
||||
# vulnerabilities/<module>/source/{low,impossible}.php layout has been stable
|
||||
# for years; re-validate if the tag is bumped.
|
||||
pinned_ref = "2.5"
|
||||
|
||||
# ── SQL injection (sqli) ─────────────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "vulnerabilities/sqli/source/low.php"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "id = $_REQUEST['id'] is concatenated straight into \"... WHERE user_id = '$id'\" and run via mysqli_query — classic SQL injection."
|
||||
|
||||
[[entry]]
|
||||
path = "vulnerabilities/sqli/source/impossible.php"
|
||||
cap = "sqli"
|
||||
vuln = false
|
||||
note = "benign control: same query via PDO prepare + bindParam(:id, PDO::PARAM_INT) with is_numeric/intval validation — parameterized, not injectable."
|
||||
|
||||
# ── OS command injection (cmdi) ──────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "vulnerabilities/exec/source/low.php"
|
||||
cap = "cmdi"
|
||||
vuln = true
|
||||
note = "target = $_REQUEST['ip'] is concatenated into shell_exec('ping -c 4 ' . $target) with no validation — OS command injection."
|
||||
|
||||
[[entry]]
|
||||
path = "vulnerabilities/exec/source/impossible.php"
|
||||
cap = "cmdi"
|
||||
vuln = false
|
||||
note = "benign control: the IP is split into 4 octets and each is_numeric-checked before being reassembled and passed to shell_exec — not injectable."
|
||||
|
||||
# ── Open redirect (redirect) ─────────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "vulnerabilities/open_redirect/source/low.php"
|
||||
cap = "redirect"
|
||||
vuln = true
|
||||
note = "header('location: ' . $_GET['redirect']) forwards to an unvalidated user-supplied URL — open redirect."
|
||||
|
||||
[[entry]]
|
||||
path = "vulnerabilities/open_redirect/source/impossible.php"
|
||||
cap = "redirect"
|
||||
vuln = false
|
||||
note = "benign control: redirect target is chosen by an integer switch on is_numeric($_GET['redirect']) — no user-controlled URL reaches the Location header."
|
||||
|
||||
# ── CRLF / HTTP header injection (header_injection) ──────────────────────────
|
||||
[[entry]]
|
||||
path = "vulnerabilities/open_redirect/source/low.php"
|
||||
cap = "header_injection"
|
||||
vuln = true
|
||||
note = "the same unvalidated $_GET['redirect'] flows into a raw header() call, so CRLF in the value splits/injects response headers — HTTP header injection."
|
||||
|
||||
[[entry]]
|
||||
path = "vulnerabilities/open_redirect/source/impossible.php"
|
||||
cap = "header_injection"
|
||||
vuln = false
|
||||
note = "benign control: only a fixed, integer-selected target string reaches header() — no user bytes, so no CRLF injection."
|
||||
14
tests/eval_corpus/ground_truth/gosec.json
Normal file
14
tests/eval_corpus/ground_truth/gosec.json
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"path": "goanalysis/testdata/src/a/basic_output.go",
|
||||
"line": 0,
|
||||
"cap": "cmdi",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "goanalysis/testdata/src/a/basic_output.go",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
42
tests/eval_corpus/ground_truth/gosec.manifest.toml
Normal file
42
tests/eval_corpus/ground_truth/gosec.manifest.toml
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
# gosec — curated Go ground-truth manifest (Phase 29, Track R.2).
|
||||
#
|
||||
# gosec is the Go SAST tool; its repo doubles as the de-facto Go security
|
||||
# corpus. Most of gosec's rule samples live as Go source embedded in
|
||||
# backtick string literals inside testutils/g*_samples.go — those are NOT
|
||||
# scannable by a taint analyzer (the vulnerable code is string data, not
|
||||
# real AST), so they are deliberately NOT labelled here. gosec also ships a
|
||||
# small set of REAL, compilable sample programs under goanalysis/testdata
|
||||
# that carry the tool's OWN inline `// want 'GNNN ...'` expectations — that
|
||||
# is the authoritative, scannable ground truth this manifest pins.
|
||||
#
|
||||
# Because the eval scans the whole gosec checkout (the tool's own source
|
||||
# included), unlabelled findings are expected and are NOT false positives —
|
||||
# precision against this manifest is informational, recall on the curated
|
||||
# samples is the meaningful floor (same policy as the all-vulnerable apps;
|
||||
# see tests/eval_corpus/budget.toml).
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/gosec.json. CI regenerates it against a fresh clone of the
|
||||
# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any
|
||||
# path that no longer exists, so a gosec bump that moves the testdata fails
|
||||
# the job loudly. `cap` is a nyx cap label (tabulate.py); `path` is relative
|
||||
# to the gosec clone root, POSIX separators; lang is inferred (.go -> go).
|
||||
|
||||
corpus = "gosec"
|
||||
upstream = "https://github.com/securego/gosec"
|
||||
# Pinned to release tag v2.26.1 (clone HEAD
|
||||
# 4a3bd8af174872c778439083ded7adbf3747e770). goanalysis/testdata/src/a/ has
|
||||
# been stable; re-validate if the tag is bumped.
|
||||
pinned_ref = "v2.26.1"
|
||||
|
||||
[[entry]]
|
||||
path = "goanalysis/testdata/src/a/basic_output.go"
|
||||
cap = "cmdi"
|
||||
vuln = true
|
||||
note = "VulnerableFunction runs exec.Command(\"sh\", \"-c\", getUserInput()) — subprocess launched with a non-constant argument (gosec's own `// want G204 [CWE-78]` expectation)."
|
||||
|
||||
[[entry]]
|
||||
path = "goanalysis/testdata/src/a/basic_output.go"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "VulnerableFunction imports crypto/md5 and calls md5.New() — weak cryptographic primitive (gosec's own `// want G401/G501` expectations)."
|
||||
38
tests/eval_corpus/ground_truth/juiceshop.json
Normal file
38
tests/eval_corpus/ground_truth/juiceshop.json
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
[
|
||||
{
|
||||
"path": "lib/insecurity.ts",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/fileServer.ts",
|
||||
"line": 0,
|
||||
"cap": "path_traversal",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/login.ts",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/profileImageUrlUpload.ts",
|
||||
"line": 0,
|
||||
"cap": "ssrf",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/redirect.ts",
|
||||
"line": 0,
|
||||
"cap": "redirect",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "routes/search.ts",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
66
tests/eval_corpus/ground_truth/juiceshop.manifest.toml
Normal file
66
tests/eval_corpus/ground_truth/juiceshop.manifest.toml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
# OWASP Juice Shop — curated vuln ground-truth manifest (Phase 28, Track R.1).
|
||||
#
|
||||
# Juice Shop is an intentionally-vulnerable TypeScript/Express + Angular
|
||||
# app. Its `data/static/challenges.yml` enumerates challenges but pins no
|
||||
# source file/line, so it cannot drive file-level ground truth on its own.
|
||||
# This manifest IS the authoritative source: one [[entry]] per known-
|
||||
# vulnerable server-side handler, curated from the project's own challenge
|
||||
# definitions + companion guide, each with a `note` citing the challenge.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/juiceshop.json. CI regenerates it against a fresh clone of
|
||||
# the pinned tag and asserts byte-equality; the converter HARD-ERRORS on
|
||||
# any path that no longer exists in the corpus, so a Juice Shop bump that
|
||||
# refactors a route fails the eval job loudly instead of silently dropping
|
||||
# recall. Re-pin `pinned_ref` and re-validate the paths together.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py). `path` is relative to the Juice
|
||||
# Shop clone root, POSIX separators. Lang is inferred from the extension
|
||||
# (.ts -> typescript). All `vuln = true`: Juice Shop is all-vulnerable, so
|
||||
# there is no benign-control file to pair against. As with NodeGoat,
|
||||
# precision vs this manifest is informational (an unlabelled finding may be
|
||||
# a real uncurated vuln, not a false positive) while recall is the
|
||||
# meaningful floor. See tests/eval_corpus/budget.toml for the gate policy.
|
||||
|
||||
corpus = "juiceshop"
|
||||
upstream = "https://github.com/juice-shop/juice-shop"
|
||||
# Pinned to a stable release tag. The server-side handlers below
|
||||
# (routes/*.ts, lib/insecurity.ts) have been stable across the TypeScript
|
||||
# era of Juice Shop; re-validate if the tag is bumped.
|
||||
pinned_ref = "v15.0.0"
|
||||
|
||||
[[entry]]
|
||||
path = "routes/login.ts"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "login builds a raw `models.sequelize.query(\"... WHERE email = '\" + req.body.email + \"' ...\")` — SQL injection auth bypass (challenge: loginAdmin / loginBender)."
|
||||
|
||||
[[entry]]
|
||||
path = "routes/search.ts"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "product search concatenates the `q` criteria into a raw `models.sequelize.query` LIKE clause — UNION-based SQL injection (challenge: unionSqlInjection / dbSchema)."
|
||||
|
||||
[[entry]]
|
||||
path = "routes/fileServer.ts"
|
||||
cap = "path_traversal"
|
||||
vuln = true
|
||||
note = "serveKeyFiles / file download resolves a user-controlled filename under the ftp dir without containment — path traversal (challenge: accessLogDisclosure / forgottenDevBackup)."
|
||||
|
||||
[[entry]]
|
||||
path = "routes/redirect.ts"
|
||||
cap = "redirect"
|
||||
vuln = true
|
||||
note = "redirect endpoint forwards to the `to` query param via an allow-list that is bypassable by substring — open redirect (challenge: redirectCryptoCurrency / redirect)."
|
||||
|
||||
[[entry]]
|
||||
path = "routes/profileImageUrlUpload.ts"
|
||||
cap = "ssrf"
|
||||
vuln = true
|
||||
note = "profile image upload fetches an arbitrary user-supplied imageUrl server-side — SSRF (challenge: ssrf)."
|
||||
|
||||
[[entry]]
|
||||
path = "lib/insecurity.ts"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "hardcoded HMAC/JWT key material and weak hashing (md5-based `hash`) — broken cryptography / hardcoded secret (challenge: weakCryptography / jwt*)."
|
||||
32
tests/eval_corpus/ground_truth/nodegoat.json
Normal file
32
tests/eval_corpus/ground_truth/nodegoat.json
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"path": "app/routes/allocations.js",
|
||||
"line": 0,
|
||||
"cap": "unauthorized_id",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/routes/contributions.js",
|
||||
"line": 0,
|
||||
"cap": "cmdi",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/routes/memos.js",
|
||||
"line": 0,
|
||||
"cap": "xss",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/routes/profile.js",
|
||||
"line": 0,
|
||||
"cap": "xss",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "config/env/all.js",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
62
tests/eval_corpus/ground_truth/nodegoat.manifest.toml
Normal file
62
tests/eval_corpus/ground_truth/nodegoat.manifest.toml
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# OWASP NodeGoat — curated vuln ground-truth manifest (Phase 28, Track R.1).
|
||||
#
|
||||
# NodeGoat is an intentionally-vulnerable Express/Node app that maps the
|
||||
# OWASP Top 10 to concrete handlers. It ships no machine-readable per-file
|
||||
# vuln labels (unlike OWASP Benchmark's expectedresults CSV), so this
|
||||
# manifest IS the authoritative source: one [[entry]] per known-vulnerable
|
||||
# location, each curated from the project's own tutorial + the canonical
|
||||
# vuln walk-through, with a `note` citing why.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/nodegoat.json. CI regenerates it against a fresh clone of
|
||||
# the pinned ref and asserts byte-equality, and the converter HARD-ERRORS
|
||||
# on any path that no longer exists in the corpus, so a NodeGoat bump that
|
||||
# moves a handler fails the eval job loudly rather than silently dropping
|
||||
# recall. Update `pinned_ref` + the paths together when re-pinning.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py). `path` is relative to the
|
||||
# NodeGoat clone root, POSIX separators. Lang is inferred from the
|
||||
# extension (.js -> javascript). These are all `vuln = true`: NodeGoat is
|
||||
# all-vulnerable, so there is no benign-control file to pair against (the
|
||||
# OWASP Benchmark vuln/benign pairing does not exist here). Precision vs
|
||||
# this manifest is therefore informational (an unlabelled finding is not
|
||||
# necessarily a false positive — it may be a real vuln we did not curate),
|
||||
# while recall (did nyx catch the canonical vulns) is the meaningful floor.
|
||||
# See tests/eval_corpus/budget.toml for how the gate treats these cells.
|
||||
|
||||
corpus = "nodegoat"
|
||||
upstream = "https://github.com/OWASP/NodeGoat"
|
||||
# NodeGoat publishes no semver tags; the eval job pins the default branch
|
||||
# via the CI cache key. The `app/` + `config/` layout below has been
|
||||
# stable for years; re-validate the paths if the cache key is bumped.
|
||||
pinned_ref = "master"
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/contributions.js"
|
||||
cap = "cmdi"
|
||||
vuln = true
|
||||
note = "handleContributionsUpdate eval()s the pre-tax/after-tax/roth form fields — server-side JS injection (OWASP A1 Injection); the textbook NodeGoat RCE."
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/profile.js"
|
||||
cap = "xss"
|
||||
vuln = true
|
||||
note = "profile fields (firstName/lastName/bankAcc/...) are persisted then rendered unescaped — stored XSS (OWASP A3 / A7 XSS)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/memos.js"
|
||||
cap = "xss"
|
||||
vuln = true
|
||||
note = "memo body is stored and echoed back into the memos view without output encoding — stored XSS."
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/allocations.js"
|
||||
cap = "unauthorized_id"
|
||||
vuln = true
|
||||
note = "allocations are looked up by a userId taken from the request with no ownership check — insecure direct object reference / broken access control (OWASP A4)."
|
||||
|
||||
[[entry]]
|
||||
path = "config/env/all.js"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "hardcoded cookieSecret / session secret committed in source — sensitive-data / weak-secret smell (OWASP A6)."
|
||||
16442
tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
Normal file
16442
tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
Normal file
File diff suppressed because it is too large
Load diff
56
tests/eval_corpus/ground_truth/railsgoat.json
Normal file
56
tests/eval_corpus/ground_truth/railsgoat.json
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
[
|
||||
{
|
||||
"path": "app/controllers/admin_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "auth",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/benefit_forms_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "deserialize",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/benefit_forms_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "path_traversal",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/messages_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "auth",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/password_resets_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/password_resets_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "deserialize",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/sessions_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "redirect",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/users_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "auth",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/models/user.rb",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
88
tests/eval_corpus/ground_truth/railsgoat.manifest.toml
Normal file
88
tests/eval_corpus/ground_truth/railsgoat.manifest.toml
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
# OWASP RailsGoat — curated vuln ground-truth manifest (Phase 29, Track R.2).
|
||||
#
|
||||
# RailsGoat is an intentionally-vulnerable Ruby on Rails app that maps the
|
||||
# OWASP Top 10 to concrete controllers/models. Like NodeGoat / Juice Shop
|
||||
# (Phase 28) it ships no machine-readable per-file vuln labels, so this
|
||||
# manifest IS the authoritative source: one [[entry]] per known-vulnerable
|
||||
# location, curated from the project's own tutorial walk-throughs, each with
|
||||
# a `note` citing why.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/railsgoat.json. CI regenerates it against a fresh clone of
|
||||
# the pinned tag and asserts byte-equality, and the converter HARD-ERRORS on
|
||||
# any path that no longer exists in the corpus, so a RailsGoat bump that
|
||||
# moves a controller fails the eval job loudly rather than silently dropping
|
||||
# recall. Update `pinned_ref` + the paths together when re-pinning.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py); it is aligned with how nyx
|
||||
# classifies the sink in each file (e.g. a missing ownership check on a
|
||||
# direct-object lookup surfaces as `auth`, not `unauthorized_id`), so recall
|
||||
# (did nyx catch the canonical vuln) is meaningful. `path` is relative to
|
||||
# the RailsGoat clone root, POSIX separators. Lang is inferred from the
|
||||
# extension (.rb -> ruby). All `vuln = true`: RailsGoat is all-vulnerable,
|
||||
# so there is no benign-control file to pair against — precision vs this
|
||||
# manifest is informational (an unlabelled finding may be a real uncurated
|
||||
# vuln), while recall is the meaningful floor. See
|
||||
# tests/eval_corpus/budget.toml for how the gate treats these cells.
|
||||
|
||||
corpus = "railsgoat"
|
||||
upstream = "https://github.com/OWASP/railsgoat"
|
||||
# Pinned to the stable Rails 5 release tag (clone HEAD
|
||||
# 0766ca80bf2d94acbde1dd4aaf7baf9b86afe4eb). The app/controllers + app/models
|
||||
# layout below was validated against this tag; re-validate the paths if the
|
||||
# ref is bumped.
|
||||
pinned_ref = "rails.5.0.0"
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/users_controller.rb"
|
||||
cap = "auth"
|
||||
vuln = true
|
||||
note = "update looks up the account with User.where(\"id = '#{params[:user][:id]}'\") and mass-assigns user_params (params.require(:user).permit!) with no ownership check — broken access control / mass-assignment privilege escalation (OWASP A4/A5)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/messages_controller.rb"
|
||||
cap = "auth"
|
||||
vuln = true
|
||||
note = "show / destroy fetch Message.where(id: params[:id]) with no check that the message belongs to current_user — insecure direct object reference (OWASP A4 broken access control)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/admin_controller.rb"
|
||||
cap = "auth"
|
||||
vuln = true
|
||||
note = "administrative actions are gated by a bypassable admin_param check (params[:admin_id] != \"1\"); update_user / delete_user act on any admin_id — broken access control / privilege escalation (OWASP A5)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/models/user.rb"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "passwords are hashed with Digest::MD5.hexdigest (hash_password / authenticate) — unsalted weak hash for credential storage (OWASP A2 cryptographic failure)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/password_resets_controller.rb"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "generate_token derives the reset token as Digest::MD5.hexdigest(email) — a predictable, forgeable password-reset token (weak cryptography)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/password_resets_controller.rb"
|
||||
cap = "deserialize"
|
||||
vuln = true
|
||||
note = "reset_password runs Marshal.load(Base64.decode64(params[:user])) on attacker-controlled input — insecure deserialization leading to RCE (OWASP A8)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/sessions_controller.rb"
|
||||
cap = "redirect"
|
||||
vuln = true
|
||||
note = "create redirects to params[:url] with no allow-list (path = params[:url] then redirect_to path) — open redirect (OWASP unvalidated redirects)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/benefit_forms_controller.rb"
|
||||
cap = "path_traversal"
|
||||
vuln = true
|
||||
note = "download builds send_file from a user-controlled params[:name] path with no containment — arbitrary file read / path traversal."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/benefit_forms_controller.rb"
|
||||
cap = "deserialize"
|
||||
vuln = true
|
||||
note = "download calls params[:type].constantize.new(path), constantizing a user-supplied class name — unsafe reflection / object injection."
|
||||
1
tests/eval_corpus/ground_truth/rustsec.json
Normal file
1
tests/eval_corpus/ground_truth/rustsec.json
Normal file
|
|
@ -0,0 +1 @@
|
|||
[]
|
||||
37
tests/eval_corpus/ground_truth/rustsec.manifest.toml
Normal file
37
tests/eval_corpus/ground_truth/rustsec.manifest.toml
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
# RustSec advisory-db — Rust negative-control corpus (Phase 29, Track R.2).
|
||||
#
|
||||
# The plan's Rust real-corpus row is the RustSec advisory database. Unlike
|
||||
# RailsGoat / DVWA / DVPWA / gosec, advisory-db ships advisory METADATA
|
||||
# (TOML + Markdown under crates/<crate>/RUSTSEC-*.md), not vulnerable Rust
|
||||
# SOURCE. A checkout of it therefore contains zero `.rs` files and nyx
|
||||
# correctly produces zero findings — so there are no source-level vuln
|
||||
# positives to label, and no canonical scannable "RustGoat" exists to
|
||||
# substitute without fabricating paths (which the CI byte-equality + path
|
||||
# existence guards would reject outright).
|
||||
#
|
||||
# advisory-db is still worth pinning and scanning as a NEGATIVE CONTROL for
|
||||
# the Rust language path:
|
||||
# * it exercises the Rust scan + verify pipeline (Phase 23 Rust build
|
||||
# pool) end to end on a large real-world tree (thousands of files) and
|
||||
# asserts it stays within the wall-clock budget without crashing, and
|
||||
# * it is an over-confirmation guard: nyx must Confirm NOTHING on a corpus
|
||||
# with no real source vulns. Any Confirmed finding here is provably a
|
||||
# false confirm and trips the per-cell false_confirmed_rate budget
|
||||
# (tests/eval_corpus/budget.toml) — a genuine regression sentinel if a
|
||||
# future change makes nyx treat advisory text as scannable code.
|
||||
#
|
||||
# `negative_control = true` tells manifest_gt_convert.py to emit an empty
|
||||
# `[]` ground truth. It is mutually exclusive with `[[entry]]` tables, so a
|
||||
# real Rust vuln can never be silently hidden behind the flag. When a
|
||||
# scannable advisory-backed Rust corpus (a vulnerable crate pinned at its
|
||||
# affected version with a source-level taint sink) is curated, drop the flag
|
||||
# and add [[entry]] tables here exactly as the other Track R.2 manifests do.
|
||||
|
||||
corpus = "rustsec"
|
||||
upstream = "https://github.com/rustsec/advisory-db"
|
||||
# advisory-db publishes no release tags; the eval job pins the default
|
||||
# branch via the CI cache key (clone HEAD
|
||||
# eaf48e749baa3d5e27d304107d8abf175fd756bb).
|
||||
pinned_ref = "main"
|
||||
|
||||
negative_control = true
|
||||
218
tests/eval_corpus/manifest_gt_convert.py
Executable file
218
tests/eval_corpus/manifest_gt_convert.py
Executable file
|
|
@ -0,0 +1,218 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Convert a curated TOML vuln manifest into nyx ground-truth JSON.
|
||||
|
||||
Used for real-world apps that ship **no** machine-readable per-file vuln
|
||||
labels of their own (e.g. OWASP NodeGoat, OWASP Juice Shop). OWASP Benchmark
|
||||
ships `expectedresults-1.2beta.csv` (see owasp_gt_convert.py); NIST SARD
|
||||
ships `manifest.xml` (see sard_gt_convert.py). NodeGoat / Juice Shop are
|
||||
intentionally-vulnerable apps without an equivalent, so the authoritative
|
||||
source here is a curated manifest committed *in this repo* — one
|
||||
`[[entry]]` table per known-vulnerable location, each carrying a
|
||||
provenance `note` so a reviewer can trace why the label is what it is.
|
||||
|
||||
Manifest schema (TOML)::
|
||||
|
||||
# provenance comments at the top
|
||||
corpus = "nodegoat" # informational label
|
||||
upstream = "https://github.com/OWASP/NodeGoat"
|
||||
pinned_ref = "master@<sha>" # the ref the paths were curated against
|
||||
|
||||
[[entry]]
|
||||
path = "app/routes/contributions.js" # relative to the corpus root, POSIX
|
||||
cap = "cmdi" # a nyx cap label (tabulate.py)
|
||||
vuln = true # true = real vuln, false = benign control
|
||||
note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)"
|
||||
|
||||
Negative-control corpora. A few real corpora carry **no** scannable
|
||||
source-level vulnerabilities of their own — most notably the RustSec
|
||||
`advisory-db`, which ships advisory *metadata* (TOML/Markdown), not
|
||||
vulnerable `.rs` source. Such a corpus has zero ground-truth positives by
|
||||
construction, yet it is still worth scanning: it exercises the language's
|
||||
scan + verify path end to end on a large real-world tree and acts as an
|
||||
over-confirmation guard (nyx must Confirm nothing on a corpus with no real
|
||||
source vulns). Declare it with a top-level ``negative_control = true`` and
|
||||
**zero** ``[[entry]]`` tables; the converter then emits an empty ``[]``
|
||||
ground truth. ``negative_control`` and ``[[entry]]`` are mutually
|
||||
exclusive — a manifest that sets the flag *and* lists entries is rejected,
|
||||
so a real vuln can never be silently dropped behind the flag.
|
||||
|
||||
Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}`
|
||||
records, sorted by `(path, cap)` for deterministic, diff-stable JSON.
|
||||
`note` is intentionally dropped — the ground-truth JSON keeps the exact
|
||||
same four-field schema OWASP/SARD produce, so tabulate.py needs no special
|
||||
casing. `line` is always 0 (the manifest pins a file, not a line;
|
||||
tabulate.py matches file+cap and treats line 0 as "any line").
|
||||
|
||||
Path validation (the no-compromise guard). When `--corpus-dir` is given,
|
||||
**every** manifest path must resolve to a real file under that root or the
|
||||
converter exits non-zero. CI runs the converter against a fresh clone of
|
||||
the pinned corpus and then asserts the committed JSON byte-matches the
|
||||
regenerated JSON, so a corpus bump that moves/renames/deletes a labelled
|
||||
file (or a typo'd path) fails the build loudly instead of silently
|
||||
degrading recall. Authoring the committed JSON offline (no corpus on
|
||||
hand) is done by omitting `--corpus-dir`: the transform is identical, only
|
||||
the existence check is skipped.
|
||||
|
||||
Usage::
|
||||
|
||||
# author / regenerate the committed JSON offline (no validation):
|
||||
tests/eval_corpus/manifest_gt_convert.py \\
|
||||
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
|
||||
--output tests/eval_corpus/ground_truth/nodegoat.json
|
||||
|
||||
# CI: validate every path against a real checkout, then diff vs committed:
|
||||
tests/eval_corpus/manifest_gt_convert.py \\
|
||||
--manifest tests/eval_corpus/ground_truth/nodegoat.manifest.toml \\
|
||||
--corpus-dir ~/.cache/nyx/eval_corpus/nodegoat \\
|
||||
--output /tmp/nodegoat_regen.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
# nyx cap labels (see tabulate.py _CAP_BIT_TABLE / _CAP_RULE_TABLE). A
|
||||
# manifest cap outside this set is almost always a typo, so reject it at
|
||||
# conversion time rather than letting a never-matching cap silently sink
|
||||
# recall.
|
||||
# Every cap label a manifest entry may use, kept in alphabetical order so
# additions are easy to spot in a diff.
VALID_CAPS = {
    "auth",
    "cmdi",
    "crypto",
    "data_exfil",
    "deserialize",
    "fmt_string",
    "header_injection",
    "ldap_injection",
    "memory",
    "path_traversal",
    "prototype_pollution",
    "redirect",
    "sqli",
    "ssrf",
    "unauthorized_id",
    "validation",
    "xpath_injection",
    "xss",
    "xxe",
}
|
||||
|
||||
|
||||
def load_manifest(path: Path) -> dict:
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
return tomllib.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"error: manifest not found: {path}", file=sys.stderr)
|
||||
raise SystemExit(1)
|
||||
except tomllib.TOMLDecodeError as e:
|
||||
print(f"error: manifest malformed: {path}: {e}", file=sys.stderr)
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
def main() -> int:
    """Convert a curated TOML manifest into a nyx ground-truth JSON file.

    Command-line flags: ``--manifest`` (input TOML), ``--output`` (JSON file
    to write) and the optional ``--corpus-dir``.  Every ``[[entry]]`` table
    must carry a non-empty string ``path``, a string ``cap`` that is a member
    of ``VALID_CAPS`` and a boolean ``vuln``; duplicate ``(path, cap)`` pairs
    are rejected.  A manifest with ``negative_control = true`` must declare no
    entries, and any other manifest must declare at least one.

    Returns:
        0 on success; 1 for an invalid manifest or a ``--corpus-dir`` that is
        not a directory; 2 when ``--corpus-dir`` is set and one or more
        manifest paths are not files beneath it.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--manifest", required=True, help="curated TOML manifest path")
    p.add_argument("--output", required=True, help="output ground-truth JSON path")
    p.add_argument(
        "--corpus-dir",
        default="",
        help=(
            "when set, every manifest path must resolve to a real file under "
            "this root or the converter exits 2 (the CI corpus-drift guard)"
        ),
    )
    args = p.parse_args()

    manifest = load_manifest(Path(args.manifest).expanduser())
    entries = manifest.get("entry", []) or []
    if not isinstance(entries, list):
        # A single `[entry]` table (or a scalar) instead of `[[entry]]` would
        # otherwise be iterated element-by-element and crash further down.
        print(
            f"error: manifest `entry` must be an array of [[entry]] tables: {args.manifest}",
            file=sys.stderr,
        )
        return 1
    negative_control = bool(manifest.get("negative_control", False))
    if negative_control and entries:
        print(
            f"error: negative_control manifest must declare zero [[entry]] "
            f"tables (found {len(entries)}): {args.manifest}",
            file=sys.stderr,
        )
        return 1
    if not entries and not negative_control:
        print(f"error: manifest has no [[entry]] tables: {args.manifest}", file=sys.stderr)
        return 1

    corpus = Path(args.corpus_dir).expanduser().resolve() if args.corpus_dir else None
    if corpus is not None and not corpus.is_dir():
        print(f"error: corpus dir not found: {args.corpus_dir}", file=sys.stderr)
        return 1

    records: list[dict] = []
    missing: list[str] = []
    seen: set[tuple[str, str]] = set()
    for i, e in enumerate(entries):
        fields = e if isinstance(e, dict) else {}
        path = fields.get("path")
        cap = fields.get("cap")
        vuln = fields.get("vuln")
        # Enforce the types the error message promises: a non-string path
        # would crash in .replace() and an unhashable cap in the set lookup.
        if (
            not isinstance(path, str)
            or not path
            or not isinstance(cap, str)
            or not cap
            or not isinstance(vuln, bool)
        ):
            print(
                f"error: entry #{i} needs string path, string cap, bool vuln: {e!r}",
                file=sys.stderr,
            )
            return 1
        if cap not in VALID_CAPS:
            print(
                f"error: entry #{i} cap {cap!r} is not a known nyx cap "
                f"(path {path!r}); fix the manifest",
                file=sys.stderr,
            )
            return 1
        # Store POSIX separators so the JSON is identical on every platform.
        norm = path.replace("\\", "/")
        key = (norm, cap)
        if key in seen:
            print(
                f"error: duplicate (path, cap) entry: {norm!r} / {cap!r}",
                file=sys.stderr,
            )
            return 1
        seen.add(key)
        # Collect every missing path instead of stopping at the first one so
        # a single run reports the full extent of the drift.
        if corpus is not None and not (corpus / norm).is_file():
            missing.append(norm)
        records.append({"path": norm, "line": 0, "cap": cap, "vuln": vuln})

    if missing:
        print(
            f"error: {len(missing)} manifest path(s) absent from {corpus} "
            f"(corpus drift or typo) — regenerate the manifest against the "
            f"pinned ref:",
            file=sys.stderr,
        )
        for m in missing:
            print(f" missing: {m}", file=sys.stderr)
        return 2

    # Deterministic order so the committed JSON is diff-stable and the CI
    # byte-equality guard is meaningful regardless of manifest ordering.
    records.sort(key=lambda r: (r["path"], r["cap"]))

    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    # Pin encoding and line endings: the output is compared byte-for-byte in
    # CI, so it must not vary with the platform's locale or newline style.
    with open(out, "w", encoding="utf-8", newline="\n") as f:
        json.dump(records, f, indent=2)
        f.write("\n")

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    if negative_control:
        print(" negative-control corpus: zero ground-truth positives by construction")
    print(f" vulns: {vuln_count}")
    print(f" non-vuln: {len(records) - vuln_count}")
    if corpus is not None:
        print(f" validated against: {corpus}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
102
tests/eval_corpus/owasp_gt_convert.py
Normal file
102
tests/eval_corpus/owasp_gt_convert.py
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Convert OWASP Benchmark v1.2 expectedresults-*.csv into nyx ground-truth JSON.
|
||||
|
||||
Source: `expectedresults-1.2beta.csv` shipped in the BenchmarkJava repo.
|
||||
Output: list of `{path, line, cap, vuln}` records, where:
|
||||
- `path` is the BenchmarkTest*.java path **relative to --corpus-dir**, with
|
||||
POSIX separators (e.g. `src/main/java/org/owasp/benchmark/testcode/
|
||||
BenchmarkTest00001.java`). Relative paths keep the committed ground truth
|
||||
portable: `tabulate.py` suffix-matches them against the absolute paths nyx
|
||||
emits, so the same JSON works on the dev laptop and on CI regardless of
|
||||
where the corpus was cloned.
|
||||
- `line` is 0 (CSV does not pin a line; tabulate uses LINE_TOLERANCE on findings).
|
||||
- `cap` is a nyx cap label mapped from the OWASP category column.
|
||||
- `vuln` is True for `real vulnerability == true`, else False.
|
||||
|
||||
Usage:
|
||||
tests/eval_corpus/owasp_gt_convert.py \\
|
||||
--corpus-dir ~/.cache/nyx/eval_corpus/owasp_benchmark_v1.2 \\
|
||||
--output tests/eval_corpus/ground_truth/owasp_benchmark_v1.2.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Maps a value from the OWASP Benchmark CSV category column to the nyx cap
# label written into the ground-truth records. Several OWASP categories share
# one nyx cap (`crypto`, `hash` and `weakrand` all map to `crypto`;
# `trustbound` and `xss` both map to `xss`). A CSV row whose category is not
# a key here is counted as skipped by main() rather than emitted.
OWASP_TO_NYX_CAP = {
    "cmdi": "cmdi",
    "crypto": "crypto",
    "hash": "crypto",
    "ldapi": "ldap_injection",
    "pathtraver": "path_traversal",
    "securecookie": "auth",
    "sqli": "sqli",
    "trustbound": "xss",
    "weakrand": "crypto",
    "xpathi": "xpath_injection",
    "xss": "xss",
}
|
||||
|
||||
|
||||
def main() -> int:
    """Convert the OWASP Benchmark expected-results CSV into ground-truth JSON.

    Reads the CSV (``--csv``, or ``expectedresults-1.2beta.csv`` under
    ``--corpus-dir``), skips its first row, and emits one
    ``{path, line, cap, vuln}`` record per row whose category is a key of
    ``OWASP_TO_NYX_CAP`` and whose ``<name>.java`` file exists in the
    benchmark testcode directory.  Rows with fewer than three columns are
    ignored; rows with an unmapped category or a missing Java file are
    counted as skipped.

    Returns:
        0 on success, 1 when the CSV or the Java testcode directory is absent.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--corpus-dir", required=True,
                   help="Path to BenchmarkJava clone root.")
    p.add_argument("--output", required=True,
                   help="Output ground-truth JSON path.")
    p.add_argument("--csv", default="",
                   help="Override CSV path (default: <corpus-dir>/expectedresults-1.2beta.csv).")
    args = p.parse_args()

    corpus = Path(args.corpus_dir).expanduser().resolve()
    # Expand `~` in --csv too, matching how --corpus-dir and --output are handled.
    csv_path = (
        Path(args.csv).expanduser() if args.csv else corpus / "expectedresults-1.2beta.csv"
    )
    # is_file() rather than exists(): a directory would pass exists() and then
    # crash inside open().
    if not csv_path.is_file():
        print(f"error: csv not found: {csv_path}", file=sys.stderr)
        return 1

    java_root = corpus / "src" / "main" / "java" / "org" / "owasp" / "benchmark" / "testcode"
    if not java_root.is_dir():
        print(f"error: java testcode dir not found: {java_root}", file=sys.stderr)
        return 1

    records: list[dict] = []
    skipped = 0
    # The csv module requires newline="" so that it, not the text layer,
    # interprets line endings; the encoding is pinned so parsing does not
    # depend on the host locale.
    with open(csv_path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # discard the first (header) row
        for row in reader:
            if len(row) < 3:
                continue
            name, category, real_vuln = row[0].strip(), row[1].strip(), row[2].strip().lower()
            cap = OWASP_TO_NYX_CAP.get(category)
            if cap is None:
                skipped += 1
                continue
            java_file = java_root / f"{name}.java"
            if not java_file.exists():
                skipped += 1
                continue
            records.append({
                # Corpus-relative POSIX path keeps the JSON portable across checkouts.
                "path": java_file.relative_to(corpus).as_posix(),
                "line": 0,
                "cap": cap,
                "vuln": real_vuln == "true",
            })

    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    # Pin encoding and line endings so the generated file is byte-identical
    # on every platform.
    with open(out, "w", encoding="utf-8", newline="\n") as f:
        json.dump(records, f, indent=2)

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    print(f" vulns: {vuln_count}")
    print(f" non-vuln: {len(records) - vuln_count}")
    print(f" skipped: {skipped}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
483
tests/eval_corpus/report.py
Normal file
483
tests/eval_corpus/report.py
Normal file
|
|
@ -0,0 +1,483 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Aggregate eval results across all corpus sets and emit a summary table.
|
||||
Used by run.sh after all corpus sets have been tabulated.
|
||||
|
||||
Phase 29 (Track I) extensions:
|
||||
--budget tests/eval_corpus/budget.toml per-cell budget enforcement
|
||||
--diff previous.json monotonic-improvement diff;
|
||||
CI fails on any regression.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
# Caps with no sound runtime oracle: config / usage smells (weak crypto,
|
||||
# insecure-cookie auth, reflected XSS / trust-boundary) route to
|
||||
# Unsupported(SoundOracleUnavailable) by design, and the catch-all `other`
|
||||
# bucket holds unclassified findings with no curated payloads. Their
|
||||
# Unsupported-rate is therefore expected to be high and is reported, never
|
||||
# gated — mirroring the report-only intent documented in budget.toml.
|
||||
NO_SOUND_ORACLE_CAPS = {"auth", "crypto", "xss", "trustbound", "other"}
|
||||
|
||||
|
||||
def _soft_unsupported() -> bool:
    """Report whether the Unsupported-rate budget is demoted to report-only.

    The decision comes from the ``NYX_EVAL_SOFT_UNSUPPORTED`` environment
    variable: after trimming whitespace and lower-casing, the values ``1``,
    ``true``, ``yes`` and ``on`` enable soft mode; anything else, including
    an unset variable, keeps the budget hard.

    CI sets the variable because dynamic confirmation is constrained there
    (unprivileged sandbox, no oracle infrastructure for some caps), so a
    budget calibrated on a dev box would fail vacuously.  The false-Confirmed
    and confirmed-rate ratchets are not affected by this switch.
    """
    raw_flag = os.environ.get("NYX_EVAL_SOFT_UNSUPPORTED", "")
    normalized = raw_flag.strip().lower()
    return normalized in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def load_budget(path: str) -> dict:
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
raw = tomllib.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"ERROR budget file not found: {path}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
except tomllib.TOMLDecodeError as e:
|
||||
print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
default = raw.get("default", {}) or {}
|
||||
cells = {}
|
||||
for row in raw.get("cell", []) or []:
|
||||
cap = row.get("cap")
|
||||
lang = row.get("lang")
|
||||
if not cap or not lang:
|
||||
print(f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
cells[(cap, lang)] = row
|
||||
return {"default": default, "cells": cells}
|
||||
|
||||
|
||||
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Return the effective thresholds for one ``(cap, lang)`` cell.

    Starts from a copy of the default row, then overlays a single override
    row: the exact ``(cap, lang)`` entry if there is one, otherwise the first
    wildcard entry found in the order ``(cap, "*")``, ``("*", lang)``,
    ``("*", "*")``.  The ``cap`` and ``lang`` keys of the override row are
    not copied into the result.
    """
    cells = budget.get("cells", {})
    effective = dict(budget.get("default", {}) or {})

    chosen = cells.get((cap, lang))
    if not chosen:
        chosen = (
            cells.get((cap, "*"))
            or cells.get(("*", lang))
            or cells.get(("*", "*"))
        )
    if chosen:
        for key, value in chosen.items():
            if key not in ("cap", "lang"):
                effective[key] = value
    return effective
|
||||
|
||||
|
||||
def load_previous_agg(path: str) -> dict:
    """Read a previous results JSON file and sum its counters per cell.

    The file is expected to hold a list of result objects, each with an
    optional ``cells`` list.  Counters from every cell are added into one
    bucket per ``(cap, lang)`` pair; a counter missing from a cell counts as
    zero.  The returned mapping is a ``defaultdict``, so use ``.get()`` to
    probe for a cell without creating it.

    Exits the process with status 3 when the file is missing or is not valid
    JSON.
    """
    try:
        with open(path) as handle:
            previous = json.load(handle)
    except FileNotFoundError:
        print(f"ERROR diff file not found: {path}", file=sys.stderr)
        sys.exit(3)
    except json.JSONDecodeError as exc:
        print(f"ERROR diff file malformed: {path}: {exc}", file=sys.stderr)
        sys.exit(3)

    counters = (
        "tp",
        "fp",
        "fn",
        "unsupported",
        "confirmed",
        "partially_confirmed",
        "wrong_confirmed",
        "stable_replays",
        "confirmed_tp",
        "confirmed_fp",
        "total",
    )
    totals: dict[tuple[str, str], dict] = defaultdict(
        lambda: dict.fromkeys(counters, 0)
    )
    for result in previous:
        for cell in result.get("cells", []):
            bucket = totals[(cell["cap"], cell["lang"])]
            for name in counters:
                bucket[name] += cell.get(name, 0)
    return totals
|
||||
|
||||
|
||||
def main() -> int:
    """Aggregate tabulated eval results, print the report and apply the gates.

    Sums the per-cell counters in ``--results`` across corpus sets, prints a
    per-(cap, lang) table, then runs these checks:

    * per-cell budgets from ``--budget`` (or, without it, a fixed 80%
      Unsupported ceiling per cell);
    * per-cap Confirmed-rate, precision and recall tables, which are always
      printed and fail the run only when the matching ``--min-*`` floor is
      supplied and the cap is in scope of ``--floor-caps``;
    * a regression diff against ``--diff``, if given.

    Returns:
        0 when every enforced gate passes (or the results list is empty),
        2 when any gate fails, 3 when the results file is missing or is not
        valid JSON.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--results", required=True)
    p.add_argument(
        "--budget",
        default="",
        help="path to budget.toml (per-(cap,lang) thresholds)",
    )
    p.add_argument(
        "--diff",
        default="",
        help="path to a previous results.json; fail on monotonic-improvement regression",
    )
    p.add_argument(
        "--min-confirmed-rate",
        type=float,
        default=None,
        help=(
            "minimum Confirmed / total rate per cap; exits 2 when any cap "
            "with findings falls below the threshold"
        ),
    )
    p.add_argument(
        "--min-precision",
        type=float,
        default=None,
        help=(
            "minimum precision (tp / (tp+fp)) per cap; exits 2 when any cap "
            "with at least one finding falls below the threshold. Phase 27 "
            "OWASP acceptance floor (>= 0.85)."
        ),
    )
    p.add_argument(
        "--min-recall",
        type=float,
        default=None,
        help=(
            "minimum recall (tp / (tp+fn)) per cap; exits 2 when any cap "
            "with at least one ground-truth positive falls below the "
            "threshold. Phase 27 OWASP acceptance floor (>= 0.40)."
        ),
    )
    p.add_argument(
        "--floor-caps",
        default="",
        help=(
            "comma-separated cap allowlist. When set, the --min-confirmed-rate, "
            "--min-precision and --min-recall floors are ENFORCED only for these "
            "caps; other caps are still measured and printed but not gated. Used "
            "to exempt caps with no sound runtime oracle (e.g. crypto weak "
            "randomness, secure-cookie config smells) from dynamic-confirmation "
            "floors that they fundamentally cannot meet. Empty = gate every cap."
        ),
    )
    args = p.parse_args()
    floor_caps = {c.strip() for c in args.floor_caps.split(",") if c.strip()}

    # Fail with a message and exit status 3 on an unreadable results file,
    # matching how the budget and diff files are handled.
    try:
        with open(args.results) as f:
            results = json.load(f)
    except FileNotFoundError:
        print(f"ERROR results file not found: {args.results}", file=sys.stderr)
        return 3
    except json.JSONDecodeError as e:
        print(f"ERROR results file malformed: {args.results}: {e}", file=sys.stderr)
        return 3

    if not results:
        print("No results to report.")
        return 0

    # Aggregate across sets. One tuple drives both the zeroed default bucket
    # and the summing loop so the two can never drift apart.
    counter_fields = (
        "tp",
        "fp",
        "fn",
        "unsupported",
        "confirmed",
        "partially_confirmed",
        "wrong_confirmed",
        "stable_replays",
        "confirmed_tp",
        "confirmed_fp",
        "total",
    )
    agg: dict[tuple[str, str], dict] = defaultdict(
        lambda: dict.fromkeys(counter_fields, 0)
    )
    for r in results:
        for c in r.get("cells", []):
            k = (c["cap"], c["lang"])
            for field in counter_fields:
                agg[k][field] += c.get(field, 0)

    print("\n=== Aggregated eval corpus report ===")
    print(
        f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} "
        f"{'Prec':>6} {'Rec':>6} {'Unsup%':>7} {'Conf%':>7} {'Part%':>7}"
    )
    print("-" * 88)
    for k, v in sorted(agg.items()):
        # max(..., 1) keeps empty denominators from dividing by zero; the
        # rate then prints as 0.
        prec = v["tp"] / max(v["tp"] + v["fp"], 1)
        rec = v["tp"] / max(v["tp"] + v["fn"], 1)
        unsup = v["unsupported"] / max(v["total"], 1)
        conf = v["confirmed"] / max(v["total"], 1)
        part = v["partially_confirmed"] / max(v["total"], 1)
        print(
            f"{k[0]:<20} {k[1]:<12} "
            f"{v['tp']:>5} {v['fp']:>5} {v['fn']:>5} "
            f"{prec:>6.2f} {rec:>6.2f} "
            f"{unsup*100:>6.1f}% {conf*100:>6.1f}% {part*100:>6.1f}%"
        )

    gate_failed = False

    # ── Phase 29: per-cell budget enforcement ────────────────────────────
    if args.budget:
        budget = load_budget(args.budget)
        print(f"\n=== Per-cell budget ({args.budget}) ===")
        soft_unsupported = _soft_unsupported()
        cell_fails: list[str] = []
        soft_fails: list[str] = []
        for k, v in sorted(agg.items()):
            b = budget_for_cell(budget, k[0], k[1])
            if not b:
                continue
            max_unsup = b.get("unsupported_rate")
            max_false = b.get("false_confirmed_rate")
            min_stable = b.get("repro_stability")
            min_confirmed = b.get("confirmed_rate")

            # Each rate is checked only when its denominator is non-zero.
            if isinstance(max_unsup, (int, float)) and v["total"] > 0:
                rate = v["unsupported"] / v["total"]
                if rate > max_unsup:
                    msg = (
                        f"{k[0]}/{k[1]}: Unsupported {rate*100:.1f}%"
                        f" > budget {max_unsup*100:.1f}%"
                    )
                    # Report-only for caps without a sound oracle, and for
                    # every cap when the environment asks for soft mode.
                    if k[0] in NO_SOUND_ORACLE_CAPS or soft_unsupported:
                        soft_fails.append(f" soft {msg}")
                    else:
                        cell_fails.append(f" FAIL {msg}")
            if isinstance(max_false, (int, float)) and v["confirmed"] > 0:
                rate = v["wrong_confirmed"] / v["confirmed"]
                if rate > max_false:
                    cell_fails.append(
                        f" FAIL {k[0]}/{k[1]}: false-Confirmed {rate*100:.1f}%"
                        f" > budget {max_false*100:.1f}%"
                    )
            # NOTE(review): because of the stable_replays > 0 guard, a cell
            # whose Confirmed findings have zero stable replays is never
            # flagged here, while the --diff check further down would report
            # a drop to zero as a regression. Presumably zero means "replay
            # not measured" — confirm.
            if (
                isinstance(min_stable, (int, float))
                and v["confirmed"] > 0
                and v.get("stable_replays", 0) > 0
            ):
                rate = v["stable_replays"] / v["confirmed"]
                if rate < min_stable:
                    cell_fails.append(
                        f" FAIL {k[0]}/{k[1]}: repro stability {rate*100:.1f}%"
                        f" < budget {min_stable*100:.1f}%"
                    )
            if isinstance(min_confirmed, (int, float)) and v["total"] > 0:
                rate = v["confirmed"] / v["total"]
                if rate < min_confirmed:
                    cell_fails.append(
                        f" FAIL {k[0]}/{k[1]}: Confirmed {rate*100:.1f}%"
                        f" < budget {min_confirmed*100:.1f}%"
                    )
        if soft_fails:
            print(
                " Unsupported-rate over budget (report-only: no-sound-oracle "
                "cap or environment-constrained dynamic confirmation):"
            )
            for line in soft_fails:
                print(line)
        if cell_fails:
            for line in cell_fails:
                print(line)
            gate_failed = True
        else:
            print(" All hard per-cell budgets met.")
    else:
        # Legacy fallback: per-cap Unsupported rate <= 80%.
        print("\n=== Gate checks ===")
        UNSUPPORTED_BUDGET = 0.80
        cell_fails: list[str] = []
        for k, v in sorted(agg.items()):
            unsup = v["unsupported"] / max(v["total"], 1)
            if unsup > UNSUPPORTED_BUDGET:
                cell_fails.append(
                    f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}%"
                    f" > {UNSUPPORTED_BUDGET*100:.0f}% budget"
                )
        if cell_fails:
            for line in cell_fails:
                print(line)
            gate_failed = True
        else:
            print(" All gate thresholds met.")

    # ── Per-cap Confirmed-rate (published always; gated when a floor given) ──
    # Aggregated per cap across languages. The table is always printed so the
    # corpus's confirmation profile is visible ("publish per-cap …"); the floor
    # only FAILS the run when --min-confirmed-rate is supplied and the cap is in
    # scope (floor_caps empty = every cap in scope).
    cap_totals: dict[str, dict] = defaultdict(lambda: {"confirmed": 0, "total": 0})
    for (cap, _lang), v in agg.items():
        cap_totals[cap]["confirmed"] += v.get("confirmed", 0)
        cap_totals[cap]["total"] += v.get("total", 0)
    if cap_totals:
        floor_txt = (
            f" (floor {args.min_confirmed_rate*100:.1f}%)"
            if args.min_confirmed_rate is not None
            else " (report-only)"
        )
        print(f"\n=== Per-cap Confirmed-rate{floor_txt} ===")
        confirmed_fails: list[str] = []
        for cap, v in sorted(cap_totals.items()):
            if v["total"] <= 0:
                continue
            rate = v["confirmed"] / v["total"]
            gated = args.min_confirmed_rate is not None and (
                (not floor_caps) or (cap in floor_caps)
            )
            line = (
                f" {cap:<20} {v['confirmed']:>5}/{v['total']:<5} "
                f"{rate*100:>6.1f}%"
            )
            # Failing rows are held back and printed after the passing ones.
            if gated and rate < args.min_confirmed_rate:
                confirmed_fails.append(f"{line} FAIL")
            elif args.min_confirmed_rate is None:
                print(line)
            else:
                print(f"{line} {'OK' if gated else 'skip (no floor)'}")
        if confirmed_fails:
            for line in confirmed_fails:
                print(line)
            gate_failed = True
        elif args.min_confirmed_rate is not None:
            print(" All confirmed-rate floors met.")

    # ── Per-cap precision / recall (published always; gated when a floor given) ──
    # OWASP acceptance: per-cap precision ≥ 0.85, recall ≥ 0.40. Aggregated per
    # cap across languages (tp/fp/fn summed over every lang cell). The table is
    # always printed ("publish per-cap precision/recall"); a cap FAILS only when
    # the matching --min-* floor is supplied and the cap is in scope (floor_caps
    # empty = every cap in scope).
    cap_pr: dict[str, dict] = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
    for (cap, _lang), v in agg.items():
        cap_pr[cap]["tp"] += v.get("tp", 0)
        cap_pr[cap]["fp"] += v.get("fp", 0)
        cap_pr[cap]["fn"] += v.get("fn", 0)
    if cap_pr:
        floors = []
        if args.min_precision is not None:
            floors.append(f"precision ≥ {args.min_precision*100:.1f}%")
        if args.min_recall is not None:
            floors.append(f"recall ≥ {args.min_recall*100:.1f}%")
        floor_txt = f" (floors: {', '.join(floors)})" if floors else " (report-only)"
        print(f"\n=== Per-cap precision/recall{floor_txt} ===")
        print(f" {'Cap':<20} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>7} {'Rec':>7} Status")
        pr_failed = False
        any_gated = False
        for cap, v in sorted(cap_pr.items()):
            tp, fp, fn = v["tp"], v["fp"], v["fn"]
            # No findings and no GT positives → cap not present in this corpus.
            if tp + fp + fn == 0:
                continue
            prec = tp / max(tp + fp, 1)
            rec = tp / max(tp + fn, 1)
            gated = (not floor_caps) or (cap in floor_caps)
            tags = []
            if gated and args.min_precision is not None and (tp + fp) > 0 and prec < args.min_precision:
                tags.append("PRECISION")
            if gated and args.min_recall is not None and (tp + fn) > 0 and rec < args.min_recall:
                tags.append("RECALL")
            if tags:
                status = "FAIL " + "+".join(tags)
            elif not floors:
                status = "—"
            elif gated:
                status = "OK"
                any_gated = True
            else:
                status = "skip (no floor)"
            print(
                f" {cap:<20} {tp:>5} {fp:>5} {fn:>5} "
                f"{prec:>7.2f} {rec:>7.2f} {status}"
            )
            if tags:
                pr_failed = True
        if pr_failed:
            gate_failed = True
        elif floors and any_gated:
            print(" All per-cap precision/recall floors met.")

    # ── Phase 29: monotonic-improvement diff ─────────────────────────────
    if args.diff:
        prev = load_previous_agg(args.diff)
        print(f"\n=== Monotonic-improvement diff vs {args.diff} ===")
        diff_fails: list[str] = []
        # Tolerance: a rate must move by more than 0.5 percentage points in
        # the wrong direction to count as a regression.
        EPS = 0.005
        for k, v in sorted(agg.items()):
            old = prev.get(k)
            if not old:
                # Cell absent from the previous run: nothing to compare with.
                continue
            old_unsup = old["unsupported"] / max(old["total"], 1)
            new_unsup = v["unsupported"] / max(v["total"], 1)
            if new_unsup > old_unsup + EPS:
                diff_fails.append(
                    f" REGRESSION {k[0]}/{k[1]}: Unsupported"
                    f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
                )
            old_conf = old.get("confirmed", 0)
            new_conf = v.get("confirmed", 0)
            # Confirmed-relative rates are compared only when both runs have
            # Confirmed findings; None marks "no denominator".
            old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None
            new_false = (v.get("wrong_confirmed", 0) / new_conf) if new_conf else None
            if old_false is not None and new_false is not None and new_false > old_false + EPS:
                diff_fails.append(
                    f" REGRESSION {k[0]}/{k[1]}: false-Confirmed"
                    f" {old_false*100:.1f}% → {new_false*100:.1f}%"
                )
            old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None
            new_stable = (v.get("stable_replays", 0) / new_conf) if new_conf else None
            if (
                old_stable is not None
                and new_stable is not None
                and new_stable < old_stable - EPS
            ):
                diff_fails.append(
                    f" REGRESSION {k[0]}/{k[1]}: repro stability"
                    f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
                )
        if diff_fails:
            for line in diff_fails:
                print(line)
            gate_failed = True
        else:
            print(" No regressions vs previous run.")

    return 2 if gate_failed else 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
300
tests/eval_corpus/run.sh
Executable file
300
tests/eval_corpus/run.sh
Executable file
|
|
@ -0,0 +1,300 @@
|
|||
#!/usr/bin/env bash
|
||||
# Eval corpus runner.
|
||||
#
|
||||
# Usage:
|
||||
# tests/eval_corpus/run.sh [--output DIR] [--nyx BIN] [--sets owasp,sard,inhouse] [--budget FILE] [--diff FILE]
|
||||
#
|
||||
# Bootstraps OWASP Benchmark v1.2, the NIST SARD subset, and Nyx benchmark
|
||||
# fixtures. Runs `nyx scan --verify` on each. Emits
|
||||
# per-cell (cap x language) precision/recall table and per-cap Unsupported
|
||||
# rate to stdout (and --output DIR if given).
|
||||
#
|
||||
# Environment:
|
||||
# NYX_EVAL_CORPUS_DIR - path to pre-downloaded corpus roots
|
||||
# (default: ~/.cache/nyx/eval_corpus)
|
||||
# NYX_BIN - path to nyx binary (default: ./target/release/nyx)
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 - all budget thresholds met
|
||||
# 1 - setup or I/O error
|
||||
# 2 - one or more budget thresholds exceeded (see output for details)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
|
||||
# Defaults
|
||||
OUTPUT_DIR=""
|
||||
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
|
||||
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
|
||||
SETS="owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse"
|
||||
# Optional per-cell budgets and monotonic-improvement diff.
|
||||
BUDGET_FILE=""
|
||||
DIFF_FILE=""
|
||||
|
||||
# Parse command-line flags. Every recognised flag takes exactly one value.
# A flag given without its value is a hard error (exit 1) instead of an
# opaque "unbound variable" abort from `set -u`. An unrecognised argument is
# still skipped, as before, but now with a warning on stderr so a typo'd
# flag (e.g. `--budgt`) cannot silently disable a gate.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --output|--nyx|--sets|--budget|--diff)
      if [[ $# -lt 2 ]]; then
        echo "error: $1 requires a value" >&2
        exit 1
      fi
      case "$1" in
        --output) OUTPUT_DIR="$2" ;;
        --nyx)    NYX_BIN="$2" ;;
        --sets)   SETS="$2" ;;
        --budget) BUDGET_FILE="$2" ;;
        --diff)   DIFF_FILE="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "warning: ignoring unrecognised argument: $1" >&2
      shift
      ;;
  esac
done
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
die() { echo "error: $*" >&2; exit 1; }
|
||||
info() { echo "[eval] $*"; }
|
||||
|
||||
require_cmd() { command -v "$1" >/dev/null 2>&1 || die "required command not found: $1"; }
|
||||
require_cmd jq
|
||||
require_cmd python3
|
||||
|
||||
# Scan one ground-truth-labelled real corpus (NodeGoat / Juice Shop) and
|
||||
# tabulate it against its committed ground truth. Self-skips when the
|
||||
# corpus has not been cloned into the cache.
|
||||
# Arguments:
#   $1 label  corpus name; also used in the /tmp/nyx_<label>.{json,stderr}
#             scratch file names
#   $2 dir    corpus checkout directory to scan
#   $3 gt     ground-truth JSON passed to tabulate.py
# Reads the globals NYX_BIN, SCRIPT_DIR, RESULTS_JSON, BUDGET_FILE and
# DIFF_FILE. Always returns 0: a missing corpus, a failed scan and a failed
# tabulation are each reported through `info` and do not abort the run.
run_jsts_corpus() {
  local label="$1" dir="$2" gt="$3"
  if [[ ! -d "$dir" ]]; then
    # Corpus not cloned: print the clone command and skip this set.
    info "Bootstrapping $label..."
    info " Clone the corpus into ${dir} then re-run this script:"
    if [[ "$label" == "nodegoat" ]]; then
      info " git clone --depth 1 https://github.com/OWASP/NodeGoat ${dir}"
    else
      # Any label other than nodegoat gets the Juice Shop instructions.
      info " git clone --depth 1 --branch v15.0.0 \\"
      info " https://github.com/juice-shop/juice-shop ${dir}"
    fi
    info "Skipping $label set (not yet downloaded)."
    return 0
  fi
  info "Running nyx scan on $label..."
  # Suspend errexit around the scan so its exit status can be inspected
  # instead of terminating the script.
  set +e
  "$NYX_BIN" scan --format json --verify --no-index "$dir" \
    > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
  local rc=$?
  set -e
  # Exit statuses 0 and 1 are both accepted; anything else is treated as a
  # scan failure and the set is skipped after dumping nyx's stderr.
  if [[ $rc -ne 0 && $rc -ne 1 ]]; then
    info " nyx exited $rc on $label set (stderr follows):"
    cat "/tmp/nyx_${label}.stderr" >&2
    return 0
  fi
  # ${VAR:+...} adds the optional flag only when VAR is non-empty.
  python3 "${SCRIPT_DIR}/tabulate.py" \
    --label "$label" \
    --scan "/tmp/nyx_${label}.json" \
    --ground-truth "$gt" \
    --append "$RESULTS_JSON" \
    ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
    ${DIFF_FILE:+--diff "$DIFF_FILE"} \
    || info " tabulate.py failed on $label; ground truth file may be absent"
}
|
||||
|
||||
# Scan one Track R.2 polyglot real corpus and tabulate it against its
|
||||
# committed ground truth, SCOPED to its target language (tabulate --lang) so
|
||||
# incidental other-language assets (e.g. vendored JS in a Rails / aiohttp app)
|
||||
# do not pollute the corpus's per-cap metrics. Self-skips when the corpus has
|
||||
# not been cloned into the cache; prints the exact clone command if so.
|
||||
# $1 label $2 dir $3 ground-truth json $4 target lang $5 repo $6 ref
|
||||
# Reads the globals NYX_BIN, SCRIPT_DIR, RESULTS_JSON, BUDGET_FILE and
# DIFF_FILE. Always returns 0: a missing corpus, a failed scan and a failed
# tabulation are each reported through `info` and do not abort the run.
run_polyglot_corpus() {
  local label="$1" dir="$2" gt="$3" lang="$4" repo="$5" ref="$6"
  if [[ ! -d "$dir" ]]; then
    # Corpus not cloned: print the exact clone command and skip this set.
    info "Bootstrapping $label..."
    info " git clone --depth 1 --branch ${ref} ${repo} ${dir}"
    info "Skipping $label set (not yet downloaded)."
    return 0
  fi
  info "Running nyx scan on $label (lang scope: ${lang})..."
  # Suspend errexit around the scan so its exit status can be inspected
  # instead of terminating the script.
  set +e
  "$NYX_BIN" scan --format json --verify --no-index "$dir" \
    > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
  local rc=$?
  set -e
  # Exit statuses 0 and 1 are both accepted; anything else is treated as a
  # scan failure and the set is skipped after dumping nyx's stderr.
  if [[ $rc -ne 0 && $rc -ne 1 ]]; then
    info " nyx exited $rc on $label set (stderr follows):"
    cat "/tmp/nyx_${label}.stderr" >&2
    return 0
  fi
  # The whole directory is scanned; --lang restricts the tabulation to the
  # corpus's target language. ${VAR:+...} adds an optional flag only when
  # VAR is non-empty.
  python3 "${SCRIPT_DIR}/tabulate.py" \
    --label "$label" \
    --scan "/tmp/nyx_${label}.json" \
    --ground-truth "$gt" \
    --lang "$lang" \
    --append "$RESULTS_JSON" \
    ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
    ${DIFF_FILE:+--diff "$DIFF_FILE"} \
    || info " tabulate.py failed on $label; ground truth file may be absent"
}
|
||||
|
||||
[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
|
||||
|
||||
mkdir -p "$CORPUS_CACHE"
|
||||
[[ -n "$OUTPUT_DIR" ]] && mkdir -p "$OUTPUT_DIR"
|
||||
|
||||
RESULTS_JSON="${OUTPUT_DIR:-/tmp}/eval_results_$(date +%Y%m%d_%H%M%S).json"
|
||||
echo "[]" > "$RESULTS_JSON"
|
||||
|
||||
# ── OWASP Benchmark v1.2 bootstrap ───────────────────────────────────────────
# Runs only when "owasp" appears in $SETS (substring match).
OWASP_DIR="${CORPUS_CACHE}/owasp_benchmark_v1.2"
if [[ "$SETS" == *owasp* ]]; then
  if [[ ! -d "$OWASP_DIR" ]]; then
    # The corpus is not fetched here: print the clone instructions and skip
    # the set so the rest of the run can continue.
    info "Bootstrapping OWASP Benchmark v1.2..."
    info " Clone from https://github.com/OWASP-Benchmark/BenchmarkJava"
    info " into ${OWASP_DIR}"
    info " then re-run this script."
    info " git clone --depth 1 --branch 1.2beta \\"
    info " https://github.com/OWASP-Benchmark/BenchmarkJava \\"
    info " ${OWASP_DIR}"
    info "Skipping OWASP set (not yet downloaded)."
  else
    info "Running nyx scan on OWASP Benchmark v1.2..."
    # errexit is switched off around the scan so the exit status can be
    # inspected instead of aborting the whole script.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$OWASP_DIR" \
      > /tmp/nyx_owasp.json 2>/tmp/nyx_owasp.stderr
    NYX_EXIT=$?
    set -e
    # Exit codes 0 and 1 are both accepted and tabulated; anything else is
    # reported together with the captured stderr and the set is not tabulated.
    # NOTE(review): exit 1 presumably means "findings present" — confirm
    # against the nyx CLI.
    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
      info " nyx exited $NYX_EXIT on OWASP set (stderr follows):"
      cat /tmp/nyx_owasp.stderr >&2
    else
      # --budget / --diff are forwarded only when the corresponding variable
      # is non-empty. A tabulate.py failure is logged, not fatal.
      python3 "${SCRIPT_DIR}/tabulate.py" \
        --label owasp \
        --scan /tmp/nyx_owasp.json \
        --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
        --append "$RESULTS_JSON" \
        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
        || info " tabulate.py failed; ground truth file may be absent"
    fi
  fi
fi
|
||||
|
||||
# ── NodeGoat / Juice Shop (JS/TS) bootstrap — Track R.1 ───────────────────────
# Each set is selected by a substring match on $SETS and delegated to a helper
# defined earlier in this script.
# NOTE(review): run_jsts_corpus appears to take <label> <checkout dir>
# <ground-truth json> — confirm against the helper definition.
if [[ "$SETS" == *nodegoat* ]]; then
  run_jsts_corpus nodegoat "${CORPUS_CACHE}/nodegoat" \
    "${SCRIPT_DIR}/ground_truth/nodegoat.json"
fi
if [[ "$SETS" == *juiceshop* ]]; then
  run_jsts_corpus juiceshop "${CORPUS_CACHE}/juiceshop" \
    "${SCRIPT_DIR}/ground_truth/juiceshop.json"
fi

# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ───────────────
# NOTE(review): run_polyglot_corpus appears to take <label> <checkout dir>
# <ground-truth json> <language> <repo URL> <git ref> — confirm against the
# helper definition.
if [[ "$SETS" == *railsgoat* ]]; then
  run_polyglot_corpus railsgoat "${CORPUS_CACHE}/railsgoat" \
    "${SCRIPT_DIR}/ground_truth/railsgoat.json" ruby \
    https://github.com/OWASP/railsgoat rails.5.0.0
fi
if [[ "$SETS" == *dvwa* ]]; then
  run_polyglot_corpus dvwa "${CORPUS_CACHE}/dvwa" \
    "${SCRIPT_DIR}/ground_truth/dvwa.json" php \
    https://github.com/digininja/DVWA 2.5
fi
if [[ "$SETS" == *dvpwa* ]]; then
  run_polyglot_corpus dvpwa "${CORPUS_CACHE}/dvpwa" \
    "${SCRIPT_DIR}/ground_truth/dvpwa.json" python \
    https://github.com/anxolerd/dvpwa master
fi
if [[ "$SETS" == *gosec* ]]; then
  run_polyglot_corpus gosec "${CORPUS_CACHE}/gosec" \
    "${SCRIPT_DIR}/ground_truth/gosec.json" go \
    https://github.com/securego/gosec v2.26.1
fi
# RustSec advisory-db is the Rust negative control (empty ground truth): the
# row asserts the Rust scan/verify path runs and Confirms nothing there.
if [[ "$SETS" == *rustsec* ]]; then
  run_polyglot_corpus rustsec "${CORPUS_CACHE}/rustsec" \
    "${SCRIPT_DIR}/ground_truth/rustsec.json" rust \
    https://github.com/rustsec/advisory-db main
fi
|
||||
|
||||
# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
# Runs only when "sard" appears in $SETS (substring match).
SARD_DIR="${CORPUS_CACHE}/nist_sard"
if [[ "$SETS" == *sard* ]]; then
  if [[ ! -d "$SARD_DIR" ]]; then
    # The corpus is not fetched here: print where to get it and skip the set.
    info "Bootstrapping NIST SARD subset..."
    info " Download from https://samate.nist.gov/SARD/"
    info " into ${SARD_DIR} then re-run this script."
    info "Skipping SARD set (not yet downloaded)."
  else
    info "Running nyx scan on NIST SARD subset..."
    # errexit is switched off around the scan so the exit status can be
    # inspected instead of aborting the whole script.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$SARD_DIR" \
      > /tmp/nyx_sard.json 2>/tmp/nyx_sard.stderr
    NYX_EXIT=$?
    set -e
    # Exit codes 0 and 1 are both accepted and tabulated. Anything else is a
    # failed scan: surface the captured stderr (as the OWASP branch does) so
    # the failure is diagnosable from the run log alone.
    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
      info " nyx exited $NYX_EXIT on SARD set (stderr follows):"
      cat /tmp/nyx_sard.stderr >&2
    else
      # --budget / --diff are forwarded only when the corresponding variable
      # is non-empty. A tabulate.py failure is logged, not fatal.
      python3 "${SCRIPT_DIR}/tabulate.py" \
        --label sard \
        --scan /tmp/nyx_sard.json \
        --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
        --append "$RESULTS_JSON" \
        ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
        ${DIFF_FILE:+--diff "$DIFF_FILE"} \
        || info " tabulate.py failed; ground truth file may be absent"
    fi
  fi
fi
|
||||
|
||||
# ── In-house bughunt-curated set ──────────────────────────────────────────────
# Runs only when "inhouse" appears in $SETS (substring match). Each existing
# directory is scanned separately and tabulated with --inhouse (no ground
# truth file is passed).
if [[ "$SETS" == *inhouse* ]]; then
  INHOUSE_DIRS=(
    "${REPO_ROOT}/tests/benchmark/corpus"
    "${REPO_ROOT}/tests/dynamic_fixtures"
  )
  for dir in "${INHOUSE_DIRS[@]}"; do
    # Missing fixture directories are skipped silently.
    [[ -d "$dir" ]] || continue
    label="inhouse_$(basename "$dir")"
    info "Running nyx scan on in-house set: $dir"
    # errexit is switched off around the scan so the exit status can be
    # inspected instead of aborting the whole script.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
      > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
    NYX_EXIT=$?
    set -e
    # Exit codes 0 and 1 are both accepted and tabulated. Anything else is a
    # failed scan: surface the captured stderr (as the OWASP branch does)
    # before moving on to the next directory.
    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
      info " nyx exited $NYX_EXIT on $label (stderr follows):"
      cat "/tmp/nyx_${label}.stderr" >&2
      continue
    fi
    python3 "${SCRIPT_DIR}/tabulate.py" \
      --label "$label" \
      --scan "/tmp/nyx_${label}.json" \
      --inhouse \
      --append "$RESULTS_JSON" \
      ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
      ${DIFF_FILE:+--diff "$DIFF_FILE"} \
      || info " tabulate.py failed on $label"
  done
fi
|
||||
|
||||
# ── Emit summary table ────────────────────────────────────────────────────────
info ""
info "Results written to: $RESULTS_JSON"

# When an output directory was given, also keep a copy under the fixed name
# eval_results.json next to the timestamped file.
[[ -n "$OUTPUT_DIR" ]] && cp "$RESULTS_JSON" "${OUTPUT_DIR}/eval_results.json"

# Without report.py there is nothing to render or gate on: point at the raw
# results and exit successfully.
if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
  info "report.py not available; raw results at $RESULTS_JSON"
  exit 0
fi

# errexit is switched off so report.py's exit status can be mapped below.
# --budget / --diff are forwarded only when the corresponding variable is
# non-empty.
set +e
python3 "${SCRIPT_DIR}/report.py" \
  --results "$RESULTS_JSON" \
  ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
  ${DIFF_FILE:+--diff "$DIFF_FILE"}
REPORT_RC=$?
set -e
# Propagate budget failures (exit 2) and malformed config (exit 3). Treat other
# non-zero exits as setup errors.
if [[ $REPORT_RC -eq 2 ]]; then
  exit 2
elif [[ $REPORT_RC -eq 3 ]]; then
  info "report.py: budget/diff configuration malformed; see $RESULTS_JSON"
  exit 3
elif [[ $REPORT_RC -ne 0 ]]; then
  # Any other failure is collapsed to exit 1.
  info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
  exit 1
fi
exit 0
|
||||
90
tests/eval_corpus/run_full.sh
Executable file
90
tests/eval_corpus/run_full.sh
Executable file
|
|
@ -0,0 +1,90 @@
|
|||
#!/usr/bin/env bash
# Full eval-corpus orchestrator.
#
# Drives a complete pass against every corpus set the project knows about
# (OWASP Benchmark v1.2, the NIST SARD subset, OWASP NodeGoat + Juice Shop,
# the Track R.2 polyglot corpora — RailsGoat / DVWA / DVPWA / gosec / RustSec —
# and the Nyx benchmark fixtures), then emits `tests/eval_corpus/results.json`
# for reports, diffs, and docs.
#
# Usage:
#   tests/eval_corpus/run_full.sh [--nyx BIN] [--budget FILE] [--diff FILE]
#                                 [--output DIR] [--corpus-dir DIR]
#
# Differences vs `run.sh`:
#   * Always runs every set (no `--sets` selector).
#   * Always passes `--budget tests/eval_corpus/budget.toml` so the
#     configured per-cell limits are checked on every pass.
#   * Copies the timestamped results file to
#     `tests/eval_corpus/results.json`.
#
# Exit codes:
#   0  every set ran and the merged result met the per-cell budget.
#   1  setup or I/O error.
#   2  budget exceeded OR monotonic-improvement regression.
#   3  budget/diff input malformed.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Defaults; each can be overridden by the environment and then by a flag.
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
BUDGET_FILE="${BUDGET_FILE:-${SCRIPT_DIR}/budget.toml}"
DIFF_FILE="${DIFF_FILE:-}"
OUTPUT_DIR=""
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"

die() { echo "error: $*" >&2; exit 1; }
info() { echo "[full] $*"; }

# Print the header comment of this script (everything after the shebang up to
# the first non-comment line) with the leading "# " stripped. Tracking the
# comment block itself, rather than a fixed line count, keeps script code out
# of the help text when the header grows or shrinks.
usage() {
  awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
}

# need_value FLAG ARGC — fail with a readable message when a value-taking flag
# is the last argument, instead of tripping `set -u` on an unbound "$2".
need_value() {
  [[ "$2" -ge 2 ]] || die "flag $1 requires a value"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --nyx)        need_value "$1" "$#"; NYX_BIN="$2"; shift 2 ;;
    --budget)     need_value "$1" "$#"; BUDGET_FILE="$2"; shift 2 ;;
    --diff)       need_value "$1" "$#"; DIFF_FILE="$2"; shift 2 ;;
    --output)     need_value "$1" "$#"; OUTPUT_DIR="$2"; shift 2 ;;
    --corpus-dir) need_value "$1" "$#"; CORPUS_CACHE="$2"; shift 2 ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "unknown flag: $1" >&2
      exit 1
      ;;
  esac
done

[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
[[ -f "$BUDGET_FILE" ]] || die "budget file not found: $BUDGET_FILE"

OUTPUT_DIR="${OUTPUT_DIR:-${SCRIPT_DIR}/.run-out}"
mkdir -p "$OUTPUT_DIR"

info "nyx: $NYX_BIN"
info "budget: $BUDGET_FILE"
info "diff: ${DIFF_FILE:-<none>}"
info "output: $OUTPUT_DIR"

RESULTS_SRC="${OUTPUT_DIR}/eval_results.json"
RESULTS_DST="${SCRIPT_DIR}/results.json"

# Drop any eval_results.json left behind by an earlier run in the same output
# directory. Otherwise a run.sh that fails before writing a fresh file would
# have the stale results copied to results.json below as if they were new.
rm -f "$RESULTS_SRC"

# errexit is switched off so run.sh's exit status (0/1/2/3) can be propagated
# after the results copy instead of aborting here.
set +e
NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \
  bash "${SCRIPT_DIR}/run.sh" \
  --nyx "$NYX_BIN" \
  --sets owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse \
  --output "$OUTPUT_DIR" \
  --budget "$BUDGET_FILE" \
  ${DIFF_FILE:+--diff "$DIFF_FILE"}
RC=$?
set -e

if [[ -f "$RESULTS_SRC" ]]; then
  cp "$RESULTS_SRC" "$RESULTS_DST"
  info "results: $RESULTS_DST"
else
  info "no eval_results.json produced; corpus may not be downloaded"
fi

exit "$RC"
|
||||
134
tests/eval_corpus/sard_gt_convert.py
Normal file
134
tests/eval_corpus/sard_gt_convert.py
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Convert NIST SARD manifest XML into nyx ground-truth JSON.
|
||||
|
||||
SARD ships per-test-case `manifest.xml` files alongside source. Each
|
||||
`<testcase>` lists one or more `<file path="…">` entries with optional
|
||||
`<flaw line="…" name="CWE-XXX_…"/>` children.
|
||||
|
||||
Output schema (consumed by tabulate.py):
|
||||
list of {"path", "line", "cap", "vuln"} records.
|
||||
|
||||
Usage:
|
||||
tests/eval_corpus/sard_gt_convert.py \\
|
||||
--corpus-dir ~/.cache/nyx/eval_corpus/nist_sard \\
|
||||
--output tests/eval_corpus/ground_truth/nist_sard.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
|
||||
CWE_TO_NYX_CAP = {
|
||||
"20": "validation",
|
||||
"22": "path_traversal",
|
||||
"78": "cmdi",
|
||||
"79": "xss",
|
||||
"89": "sqli",
|
||||
"90": "ldap_injection",
|
||||
"91": "xpath_injection",
|
||||
"94": "cmdi",
|
||||
"113": "header_injection",
|
||||
"117": "header_injection",
|
||||
"190": "memory",
|
||||
"200": "data_exfil",
|
||||
"287": "auth",
|
||||
"295": "crypto",
|
||||
"311": "crypto",
|
||||
"327": "crypto",
|
||||
"328": "crypto",
|
||||
"330": "crypto",
|
||||
"352": "auth",
|
||||
"434": "path_traversal",
|
||||
"476": "memory",
|
||||
"502": "deserialize",
|
||||
"601": "redirect",
|
||||
"611": "xxe",
|
||||
"643": "xpath_injection",
|
||||
"798": "crypto",
|
||||
"918": "ssrf",
|
||||
}
|
||||
|
||||
CWE_RE = re.compile(r"CWE[-_](\d+)", re.IGNORECASE)
|
||||
|
||||
|
||||
def cap_for_flaw(name: str) -> str | None:
|
||||
m = CWE_RE.search(name or "")
|
||||
if not m:
|
||||
return None
|
||||
return CWE_TO_NYX_CAP.get(m.group(1))
|
||||
|
||||
|
||||
def main() -> int:
    """Walk a SARD checkout and write the ground-truth JSON for tabulate.py.

    Every ``manifest.xml`` under ``--corpus-dir`` is parsed. For each ``file``
    entry of each ``testcase`` whose target exists on disk:

    * no ``flaw`` / ``mixed`` children → one non-vulnerable record
      (``line`` 0, ``cap`` "other", ``vuln`` False);
    * otherwise one vulnerable record per flaw whose CWE maps to a cap; flaws
      with an unmapped CWE are counted as skipped.

    Record paths are written relative to the corpus root with forward slashes
    so the generated file does not embed the local checkout location (the
    matcher in tabulate.py documents ground-truth paths as corpus-relative).
    A file that resolves outside the corpus root keeps its absolute path.

    Returns:
        0 on success, 1 when the corpus directory does not exist.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--corpus-dir", required=True)
    p.add_argument("--output", required=True)
    args = p.parse_args()

    root = Path(args.corpus_dir).expanduser().resolve()
    if not root.is_dir():
        print(f"error: corpus dir not found: {root}", file=sys.stderr)
        return 1

    records: list[dict] = []
    skipped_files = 0
    skipped_caps = 0

    # Sorted so the record order, and therefore the generated JSON, does not
    # depend on filesystem enumeration order.
    for manifest in sorted(root.rglob("manifest.xml")):
        try:
            tree = ET.parse(manifest)
        except ET.ParseError as e:
            # A single broken manifest must not abort the whole conversion.
            print(f"warn: parse failed {manifest}: {e}", file=sys.stderr)
            continue
        for tc in tree.iter("testcase"):
            for fnode in tc.iter("file"):
                rel = fnode.get("path") or ""
                if not rel:
                    continue
                # Manifest paths are taken relative to the manifest's directory.
                abs_path = (manifest.parent / rel).resolve()
                if not abs_path.exists():
                    skipped_files += 1
                    continue
                try:
                    gt_path = abs_path.relative_to(root).as_posix()
                except ValueError:
                    # Target lies outside the corpus root (e.g. via "..").
                    gt_path = str(abs_path)
                flaws = list(fnode.iter("flaw")) + list(fnode.iter("mixed"))
                if not flaws:
                    records.append({
                        "path": gt_path,
                        "line": 0,
                        "cap": "other",
                        "vuln": False,
                    })
                    continue
                for flaw in flaws:
                    cap = cap_for_flaw(flaw.get("name", ""))
                    if cap is None:
                        skipped_caps += 1
                        continue
                    try:
                        line = int(flaw.get("line", "0") or 0)
                    except ValueError:
                        # Non-numeric line attribute: fall back to line 0.
                        line = 0
                    records.append({
                        "path": gt_path,
                        "line": line,
                        "cap": cap,
                        "vuln": True,
                    })

    out = Path(args.output).expanduser().resolve()
    out.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 so the file is identical regardless of the host locale.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    vuln_count = sum(1 for r in records if r["vuln"])
    print(f"wrote {len(records)} records to {out}")
    print(f" vulns: {vuln_count}")
    print(f" non-vuln: {len(records) - vuln_count}")
    print(f" skipped (file): {skipped_files}")
    print(f" skipped (cap): {skipped_caps}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
688
tests/eval_corpus/tabulate.py
Normal file
688
tests/eval_corpus/tabulate.py
Normal file
|
|
@ -0,0 +1,688 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tabulate nyx scan results against a ground-truth file.
|
||||
|
||||
For OWASP / SARD sets: compares nyx findings against known-true/known-false
|
||||
labels from the ground truth JSON.
|
||||
|
||||
For in-house sets (--inhouse): counts findings by cap x language; reports
|
||||
Unsupported rate only (no ground truth required).
|
||||
|
||||
Output: appends a result record to --append FILE.
|
||||
|
||||
Phase 29 (Track I) extensions:
|
||||
--budget tests/eval_corpus/budget.toml enforce per-cell budget thresholds
|
||||
--diff previous.json compare against prior result file,
|
||||
fail on monotonic-improvement
|
||||
regression
|
||||
|
||||
Exit codes:
|
||||
0 all rows pass.
|
||||
2 one or more per-cell budgets exceeded OR a diff regression was found.
|
||||
3 malformed budget / diff input (callers must fix configuration).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
LINE_TOLERANCE = 5
|
||||
|
||||
# Caps with no sound runtime oracle (config / usage smells) and the catch-all
|
||||
# `other` bucket route to Unsupported by design, so their Unsupported-rate is
|
||||
# report-only, never gated. Mirrors report.py / the budget.toml intent.
|
||||
NO_SOUND_ORACLE_CAPS = {"auth", "crypto", "xss", "trustbound", "other"}
|
||||
|
||||
|
||||
def _soft_unsupported() -> bool:
    """Tell whether the per-cell Unsupported-rate budget is report-only.

    The switch is the `NYX_EVAL_SOFT_UNSUPPORTED` environment variable. CI sets
    it because dynamic confirmation is environment-constrained there (the
    budget is calibrated on a dev box where confirmation runs fully); the
    precision / confirmed-rate ratchets stay hard either way. When the
    variable is unset (local dev) the Unsupported budget stays hard.

    Returns:
        True when the variable, stripped and lower-cased, is one of
        "1", "true", "yes" or "on"; False otherwise.
    """
    truthy = ("1", "true", "yes", "on")
    raw_flag = os.environ.get("NYX_EVAL_SOFT_UNSUPPORTED", "")
    normalised = raw_flag.strip().lower()
    return normalised in truthy
|
||||
|
||||
# Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
# cap_of() walks this list top to bottom and returns the label of the first
# bit that is set in a finding's sink_caps mask, so list order is the priority
# order.
_CAP_BIT_TABLE = [
    (1 << 5, "path_traversal"),  # FILE_IO
    (1 << 6, "fmt_string"),
    (1 << 7, "sqli"),  # SQL_QUERY
    (1 << 8, "deserialize"),
    (1 << 9, "ssrf"),
    (1 << 10, "cmdi"),  # CODE_EXEC
    (1 << 11, "crypto"),
    (1 << 12, "unauthorized_id"),
    (1 << 13, "data_exfil"),
    (1 << 14, "ldap_injection"),
    (1 << 15, "xpath_injection"),
    (1 << 16, "header_injection"),
    (1 << 17, "redirect"),  # OPEN_REDIRECT
    (1 << 18, "xss"),  # SSTI (template_injection); also covers XSS sinks
    (1 << 19, "xxe"),
    (1 << 20, "prototype_pollution"),
    # HTML_ESCAPE (1<<1) is the universal reflected-XSS *sink* cap across every
    # language (`grep 'Sink(Cap::HTML_ESCAPE)' src/labels/` — PHP echo, JS
    # innerHTML, Java servlet writers, etc.); the same bit is the html-escape
    # *sanitizer* cap, so a finding only carries it as a sink when an un-encoded
    # tainted value reached an HTML output. Placed LAST so any higher-priority
    # sink bit (SQL_QUERY, FILE_IO, ...) on the same finding wins; a finding
    # carrying only HTML_ESCAPE is reflected XSS. Without this, every
    # taint-based reflected-XSS finding mis-buckets to "other".
    (1 << 1, "xss"),
]

# Static lens (see --static): SHELL_ESCAPE (1<<2) is the command-injection sink
# cap for *every* language (`grep SHELL_ESCAPE src/labels/` — all Sink uses are
# command-exec; CODE_EXEC=1<<10 is the eval/code-exec variant, also cmdi). In a
# normal `nyx scan` (no dynamic confirmation) a Java cmdi finding carries only
# SHELL_ESCAPE; the SHELL_ESCAPE→CODE_EXEC remap that buckets it as cmdi is gated
# on VerifyStatus::Confirmed (src/commands/scan.rs), so with 0 confirmations the
# default table leaves these in "other" and the cmdi cell reads 0/0/N. The
# static lens appends SHELL_ESCAPE→cmdi at the LOWEST priority (after every other
# bit) so a SHELL_ESCAPE-only finding buckets as cmdi while a finding that also
# carries a higher-priority sink bit (e.g. FILE_IO) keeps its existing bucket.
# Opt-in via --static so the default confirmed-recall bucketing is byte-identical.
# This is a new list (the default table plus one trailing entry); the default
# table itself is left untouched.
_CAP_BIT_TABLE_STATIC = _CAP_BIT_TABLE + [(1 << 2, "cmdi")]  # SHELL_ESCAPE

# Substring → cap lookup for rule IDs. Order matters: most specific first.
# Used by cap_of() only when the sink_caps bitmask yields no label; the needle
# is matched against the lower-cased rule id up to its first space, and the
# first entry whose needle occurs in it wins.
_CAP_RULE_TABLE = [
    ("path_traversal", "path_traversal"),
    ("sql", "sqli"),
    ("xss", "xss"),
    ("ssrf", "ssrf"),
    ("cmdi", "cmdi"),
    ("cmd_exec", "cmdi"),
    ("code_exec", "cmdi"),
    ("deser", "deserialize"),
    ("unserialize", "deserialize"),
    ("redirect", "redirect"),
    ("xxe", "xxe"),
    ("template", "xss"),
    ("auth", "auth"),
    ("memory", "memory"),
    ("crypto", "crypto"),
    ("data-exfil", "data_exfil"),
    ("data_exfil", "data_exfil"),
    ("header", "header_injection"),
]
|
||||
|
||||
|
||||
def load_json(path: str) -> object:
    """Read and parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly rather than with the locale's
    preferred encoding, so non-ASCII content (e.g. file paths in scan output)
    parses the same on every host.

    Raises:
        OSError: the file cannot be opened.
        json.JSONDecodeError: the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
def cap_of(finding: dict, static_lens: bool = False) -> str:
    """Classify a finding into a cap label.

    Two sources are consulted in order:

    1. ``evidence.sink_caps`` — the engine's own bitmask. The bit table
       (the static-lens variant when ``static_lens`` is true) is scanned in
       order and the first entry whose bit is set decides the label.
    2. The rule id (e.g. ``py.cmdi.os_system``, ``java.deser.readobject``):
       lower-cased, cut at the first space, then matched by substring against
       the rule table in order.

    Returns:
        The cap label, or "other" when neither source yields one.
    """
    evidence = finding.get("evidence", {}) or {}
    mask = evidence.get("sink_caps")
    if isinstance(mask, int) and mask:
        bit_table = _CAP_BIT_TABLE_STATIC if static_lens else _CAP_BIT_TABLE
        from_bits = next((label for bit, label in bit_table if mask & bit), None)
        if from_bits is not None:
            return from_bits

    # No usable bitmask (absent, zero, or only bits the table does not know):
    # fall back to the rule id.
    rule_head = (finding.get("id") or "").lower().split(" ", 1)[0]
    return next(
        (label for needle, label in _CAP_RULE_TABLE if needle in rule_head),
        "other",
    )
|
||||
|
||||
|
||||
def lang_of(finding: dict) -> str:
    """Derive a finding's source language from its file extension.

    Args:
        finding: a finding record; only its ``path`` entry is read. A missing
            or ``null`` path is treated as an empty string.

    Returns:
        The language name for the first extension in the table that the path
        ends with, or "unknown" when none matches. Matching is case-sensitive.
    """
    # `or ""` rather than a .get() default: a record carrying "path": null
    # must classify as "unknown" instead of crashing on None.endswith.
    path = finding.get("path") or ""
    ext_map = {
        ".py": "python", ".js": "javascript", ".ts": "typescript",
        ".java": "java", ".go": "go", ".php": "php", ".rb": "ruby",
        ".rs": "rust", ".c": "c", ".cpp": "cpp",
    }
    for ext, lang in ext_map.items():
        if path.endswith(ext):
            return lang
    return "unknown"
|
||||
|
||||
|
||||
def _norm_path(p: str) -> str:
    """Return ``p`` with every backslash turned into a forward slash."""
    return "/".join(p.split("\\"))


def path_matches(gt_path: str, finding_path: str) -> bool:
    """Decide whether a ground-truth path and a finding path name the same file.

    Ground-truth paths are stored *relative to the corpus root* so the
    checked-in JSON stays portable, while nyx emits absolute paths rooted at
    wherever the corpus was cloned. After normalising separators, the two
    match when they are equal, or when either one is a suffix of the other
    that begins right after a `/` (the reverse direction keeps a legacy
    absolute ground-truth file working). Requiring the `/` boundary stops
    `.../BenchmarkTest1.java` from matching `.../xBenchmarkTest1.java`.
    """
    truth = _norm_path(gt_path)
    found = _norm_path(finding_path)
    # Fast path: identical after normalisation.
    if truth == found:
        return True
    # Relative ground truth against an absolute finding path.
    if found.endswith(f"/{truth}"):
        return True
    # Absolute (legacy) ground truth against a shorter finding path.
    return truth.endswith(f"/{found}")
|
||||
|
||||
|
||||
# ── Budget loading ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def load_budget(path: str) -> dict:
|
||||
"""Parse a budget.toml file.
|
||||
|
||||
Returns a dict::
|
||||
|
||||
{
|
||||
"default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02,
|
||||
"repro_stability": 0.95, "ratchet_deadline": "..."},
|
||||
"cells": {(cap, lang): {...overrides...}, ...},
|
||||
}
|
||||
|
||||
Raises SystemExit(3) on a malformed file.
|
||||
"""
|
||||
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
raw = tomllib.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"ERROR budget file not found: {path}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
except tomllib.TOMLDecodeError as e:
|
||||
print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
|
||||
default = raw.get("default", {}) or {}
|
||||
cells = {}
|
||||
for row in raw.get("cell", []) or []:
|
||||
cap = row.get("cap")
|
||||
lang = row.get("lang")
|
||||
if not cap or not lang:
|
||||
print(
|
||||
f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr
|
||||
)
|
||||
sys.exit(3)
|
||||
cells[(cap, lang)] = row
|
||||
|
||||
return {"default": default, "cells": cells}
|
||||
|
||||
|
||||
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Resolve the effective budget row for one (cap, lang) cell.

    Starts from a copy of ``[default]`` and layers a single override row on
    top. The override is the exact ``(cap, lang)`` entry when there is one;
    otherwise the first wildcard entry found in the order ``(cap, "*")``,
    ``("*", lang)``, ``("*", "*")``. The ``cap`` / ``lang`` keys of the
    override row are not copied into the result.
    """
    overrides = budget.get("cells", {})
    effective = dict(budget.get("default", {}) or {})

    row = overrides.get((cap, lang))
    if not row:
        # No exact entry: fall back to a wildcard override if present.
        row = (
            overrides.get((cap, "*"))
            or overrides.get(("*", lang))
            or overrides.get(("*", "*"))
        )
    if row:
        for key, value in row.items():
            if key not in ("cap", "lang"):
                effective[key] = value
    return effective
|
||||
|
||||
|
||||
def enforce_budget(cells: list, budget: dict) -> list:
    """Check every cell against its budget row and collect the violations.

    For each cell the merged budget row is looked up and up to four limits are
    applied: Unsupported rate (maximum), Confirmed rate (minimum),
    false-Confirmed rate (maximum) and repro stability (minimum). A limit is
    only applied when it is numeric and its denominator is non-zero, so a
    missing measurement (e.g. no Confirmed findings) is "no data", never a
    failure.

    Returns:
        Human-readable failure strings, in cell order; empty when all pass.
    """
    problems = []
    unsupported_is_soft = _soft_unsupported()
    number = (int, float)

    for cell in cells:
        limits = budget_for_cell(budget, cell["cap"], cell["lang"])
        if not limits:
            continue
        cap, lang = cell["cap"], cell["lang"]
        unsup_ceiling = limits.get("unsupported_rate")
        false_ceiling = limits.get("false_confirmed_rate")
        stable_floor = limits.get("repro_stability")
        confirmed_floor = limits.get("confirmed_rate")

        # Unsupported rate (maximum).
        if isinstance(unsup_ceiling, number) and cell.get("total", 0) > 0:
            if cell["unsupported_rate"] > unsup_ceiling:
                # No-sound-oracle caps (and `other`) are report-only by design;
                # the rest are report-only when dynamic confirmation is known
                # to be environment-constrained (NYX_EVAL_SOFT_UNSUPPORTED, set
                # by CI). Hard otherwise so local dev still ratchets coverage.
                detail = (
                    f" {cap}/{lang}: Unsupported {cell['unsupported_rate']*100:.1f}%"
                    f" > budget {unsup_ceiling*100:.1f}%"
                )
                if cap not in NO_SOUND_ORACLE_CAPS and not unsupported_is_soft:
                    problems.append(f" FAIL{detail}")

        # Confirmed rate (minimum, the ratchet floor).
        if isinstance(confirmed_floor, number) and cell.get("total", 0) > 0:
            confirmed_share = cell.get("confirmed", 0) / cell["total"]
            if confirmed_share < confirmed_floor:
                problems.append(
                    f" FAIL {cap}/{lang}: Confirmed {confirmed_share*100:.1f}%"
                    f" < budget {confirmed_floor*100:.1f}%"
                )

        # False-Confirmed rate (maximum).
        if isinstance(false_ceiling, number) and cell.get("confirmed", 0) > 0:
            false_share = cell.get("wrong_confirmed", 0) / cell["confirmed"]
            if false_share > false_ceiling:
                problems.append(
                    f" FAIL {cap}/{lang}: false-Confirmed {false_share*100:.1f}%"
                    f" > budget {false_ceiling*100:.1f}%"
                )

        # Repro stability (minimum). Only enforced when callers stamped at
        # least one `replay_stable: true` flag — otherwise stable_replays == 0
        # is indistinguishable from "we did not measure stability for this
        # row" and the gate would fire vacuously on every clean run.
        stability_measured = (
            isinstance(stable_floor, number)
            and cell.get("confirmed", 0) > 0
            and cell.get("stable_replays", 0) > 0
        )
        if stability_measured:
            stable_share = cell["stable_replays"] / cell["confirmed"]
            if stable_share < stable_floor:
                problems.append(
                    f" FAIL {cap}/{lang}: repro stability {stable_share*100:.1f}%"
                    f" < budget {stable_floor*100:.1f}%"
                )
    return problems
|
||||
|
||||
|
||||
# ── Diff loading ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def load_previous_cells(path: str, label: str) -> dict:
    """Index a previous results file by (cap, lang) → cell.

    The previous file is the same shape as `--append`'s output: a list of
    records (a single bare record is accepted too). We pick the record whose
    `label` matches the current run; if no exact match, fall back to the
    first record.

    Returns:
        ``{(cap, lang): cell}`` for the chosen record; ``{}`` when the file
        holds no records or the chosen record is empty.

    Exits with status 3 when the file is missing, is not valid UTF-8 JSON, or
    is structurally malformed (a record or cell that is not an object, a
    `cells` value that is not a list, a cell without usable `cap` / `lang`).
    """

    def _reject(message: str) -> None:
        # Single exit point for configuration errors (exit code 3).
        print(message, file=sys.stderr)
        sys.exit(3)

    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        _reject(f"ERROR diff file not found: {path}")
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        _reject(f"ERROR diff file malformed: {path}: {e}")

    records = data if isinstance(data, list) else [data]
    if not all(isinstance(r, dict) for r in records):
        _reject(f"ERROR diff file malformed: {path}: every record must be an object")

    chosen = next((r for r in records if r.get("label") == label), None)
    if chosen is None and records:
        chosen = records[0]
    if not chosen:
        return {}

    cells = chosen.get("cells") or []
    if not isinstance(cells, list):
        _reject(f"ERROR diff file malformed: {path}: `cells` must be a list")

    index = {}
    for c in cells:
        if not isinstance(c, dict) or "cap" not in c or "lang" not in c:
            _reject(f"ERROR diff file malformed: {path}: cell without cap/lang: {c!r}")
        try:
            index[(c["cap"], c["lang"])] = c
        except TypeError:
            # cap / lang is an unhashable JSON value (list or object).
            _reject(f"ERROR diff file malformed: {path}: bad cap/lang in cell: {c!r}")
    return index
|
||||
|
||||
|
||||
def diff_regressions(cells: list, prev: dict) -> list:
    """Report cells that got worse relative to a previous run.

    Three monotonicity rules are checked per cell, in this order:

    * Unsupported% must not increase.
    * False-Confirmed% must not increase.
    * Repro-stability% must not decrease.

    The two Confirmed-based rules are only checked when both the previous and
    the current cell have a non-zero Confirmed count. Cells absent from
    ``prev`` are treated as new and skipped. A small epsilon (0.5 percentage
    points) absorbs flake noise.

    Returns:
        Failure strings, in cell order; empty when nothing regressed.
    """
    EPS = 0.005

    def _per_confirmed(cell: dict, numerator: str):
        # Share of Confirmed findings, or None when there are none to divide by.
        confirmed = cell.get("confirmed", 0)
        return (cell.get(numerator, 0) / confirmed) if confirmed else None

    regressions = []
    for current in cells:
        cap, lang = current["cap"], current["lang"]
        before = prev.get((cap, lang))
        if not before:
            continue

        # Unsupported (lower is better).
        unsup_before = before.get("unsupported_rate", 0.0)
        unsup_now = current.get("unsupported_rate", 0.0)
        if unsup_now > unsup_before + EPS:
            regressions.append(
                f" REGRESSION {cap}/{lang}: Unsupported"
                f" {unsup_before*100:.1f}% → {unsup_now*100:.1f}%"
            )

        # False-Confirmed (lower is better).
        false_before = _per_confirmed(before, "wrong_confirmed")
        false_now = _per_confirmed(current, "wrong_confirmed")
        if false_before is not None and false_now is not None:
            if false_now > false_before + EPS:
                regressions.append(
                    f" REGRESSION {cap}/{lang}: false-Confirmed"
                    f" {false_before*100:.1f}% → {false_now*100:.1f}%"
                )

        # Repro stability (higher is better).
        stable_before = _per_confirmed(before, "stable_replays")
        stable_now = _per_confirmed(current, "stable_replays")
        if stable_before is not None and stable_now is not None:
            if stable_now < stable_before - EPS:
                regressions.append(
                    f" REGRESSION {cap}/{lang}: repro stability"
                    f" {stable_before*100:.1f}% → {stable_now*100:.1f}%"
                )
    return regressions
|
||||
|
||||
|
||||
def main() -> int:
    """Tabulate one scan into per-(cap, lang) cells and optionally gate on them.

    Steps, in order:

    1. Parse the CLI flags and load the scan JSON named by ``--scan``.
    2. Drop Reliability/Quality findings and, when ``--lang`` is set,
       findings outside the language allowlist.
    3. With ``--manual-triage``, stamp ``wrong: true`` on Confirmed findings
       that match a ``vuln: false`` triage entry.
    4. Tally verdict counts per cell; with ``--ground-truth`` (and without
       ``--inhouse``) also score TP/FP/FN against the ground truth.
    5. Append the result to the ``--append`` accumulator file and print a
       summary table.
    6. With ``--budget`` and/or ``--diff``, run the respective gate.

    Returns:
        0 when no gate failed, 2 when the budget gate or the diff gate
        reported at least one failure.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--label", required=True)
    p.add_argument("--scan", required=True, help="nyx scan --format json output")
    p.add_argument("--ground-truth", default="", help="ground truth JSON")
    p.add_argument("--inhouse", action="store_true")
    p.add_argument("--append", required=True, help="results accumulator JSON")
    p.add_argument(
        "--manual-triage",
        default="",
        help=(
            "path to a manual-triage JSON file (list of "
            "{path, line, cap, vuln: bool}). Confirmed findings matching a "
            "`vuln: false` entry are stamped with `wrong: true` before "
            "tabulation so the per-cell False-Confirmed budget becomes "
            "non-vacuous without depending on the host's `nyx verify-feedback` "
            "log. Matching uses LINE_TOLERANCE (=5) — line == 0 in the triage "
            "entry matches any line."
        ),
    )
    p.add_argument(
        "--budget",
        default="",
        help="path to budget.toml (per-(cap,lang) thresholds)",
    )
    p.add_argument(
        "--lang",
        default="",
        help=(
            "comma-separated language allowlist (python, javascript, php, "
            "ruby, go, rust, ...). When set, only findings AND ground-truth "
            "entries whose source language is in the list are tabulated; "
            "everything else is dropped before tallying. Used by the Phase 29 "
            "polyglot corpora (Track R.2) to scope a single-language corpus to "
            "its target language so incidental third-party assets in other "
            "languages — e.g. the vendored JavaScript a Rails or aiohttp app "
            "bundles — do not pollute that corpus's per-cap metrics. Empty = "
            "no language filter (every finding tabulated, the OWASP/JSTS "
            "default)."
        ),
    )
    p.add_argument(
        "--diff",
        default="",
        help="path to a previous results JSON; fail on monotonic-improvement regression",
    )
    p.add_argument(
        "--static",
        action="store_true",
        help=(
            "static lens: bucket SHELL_ESCAPE (1<<2) findings as cmdi even when "
            "they are unconfirmed. Java (and other) command-exec sinks carry "
            "SHELL_ESCAPE and only get remapped to CODE_EXEC on dynamic Confirm; "
            "without this flag, an env with 0 confirmations reads the cmdi cell "
            "as 0/0/N regardless of static quality. SHELL_ESCAPE is the "
            "command-injection sink cap for every language, so this is sound "
            "globally; it is opt-in only so the default confirmed-recall "
            "bucketing stays byte-identical."
        ),
    )
    args = p.parse_args()
    # Empty --lang yields an empty set, which disables the language filter.
    lang_filter = {name.strip() for name in args.lang.split(",") if name.strip()}

    scan_data = load_json(args.scan)
    findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
    # Score only Security-category findings against the security ground truth.
    # Reliability defects (resource leaks, error-handling fallthrough) and
    # Quality findings are real bugs but not the injection / crypto / auth
    # vulns the corpus ground truth enumerates, so counting them as security
    # false-positives is a category error that wrecks precision with pure
    # noise. Findings with no explicit category (legacy fixtures) default to
    # Security and are kept.
    findings = [
        f for f in findings
        if f.get("category", "Security") not in ("Reliability", "Quality")
    ]
    if lang_filter:
        findings = [f for f in findings if lang_of(f) in lang_filter]

    # ── Manual-triage stamping (Phase 31 follow-up) ───────────────────────
    # Cross-reference Confirmed rows against a manual-triage file before
    # tabulation. Each `vuln: false` entry whose `(path, cap)` matches a
    # Confirmed finding (with LINE_TOLERANCE, or any line when triage
    # entry's `line == 0`) stamps `wrong: true` on the finding's
    # `dynamic_verdict`, which the existing wrong_confirmed counter picks
    # up below. Decouples the False-Confirmed budget from the host-local
    # `nyx verify-feedback` log so CI on a fresh eval corpus can still
    # gate the headline target.
    if args.manual_triage and Path(args.manual_triage).exists():
        triage = load_json(args.manual_triage)
        not_vuln: list[dict] = []
        for entry in triage if isinstance(triage, list) else []:
            if entry.get("vuln") is False:
                not_vuln.append({
                    "path": entry.get("path", ""),
                    "line": entry.get("line", 0),
                    "cap": entry.get("cap", ""),
                })
        # Each triage entry stamps at most one finding.
        used: set[int] = set()
        for f in findings:
            ev = f.get("evidence") or {}
            dv = ev.get("dynamic_verdict") or {}
            if dv.get("status") != "Confirmed":
                continue
            f_path = f.get("path", "")
            f_line = f.get("line", 0)
            f_cap = cap_of(f, static_lens=args.static)
            for idx, entry in enumerate(not_vuln):
                if idx in used:
                    continue
                if (path_matches(entry["path"], f_path)
                        and entry["cap"] == f_cap
                        and (entry["line"] == 0
                             or abs(entry["line"] - f_line) <= LINE_TOLERANCE)):
                    used.add(idx)
                    dv["wrong"] = True
                    ev["dynamic_verdict"] = dv
                    f["evidence"] = ev
                    break

    # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
    # partially_confirmed, wrong_confirmed, stable_replays, total}}
    cells: dict[tuple[str, str], dict] = defaultdict(
        lambda: {
            "tp": 0,
            "fp": 0,
            "fn": 0,
            "unsupported": 0,
            "confirmed": 0,
            "partially_confirmed": 0,
            "wrong_confirmed": 0,
            "stable_replays": 0,
            # Confirmed-verdict precision/recall accounting, ground-truth-derived
            # (only populated when --ground-truth is supplied): confirmed_tp =
            # Confirmed findings that match a GT positive; confirmed_fp =
            # Confirmed findings that match no GT positive (false confirms).
            "confirmed_tp": 0,
            "confirmed_fp": 0,
            "total": 0,
        }
    )

    for f in findings:
        cap = cap_of(f, static_lens=args.static)
        lang = lang_of(f)
        key = (cap, lang)
        ev = f.get("evidence", {}) or {}
        dv = ev.get("dynamic_verdict") if ev else None
        cells[key]["total"] += 1
        if dv:
            status = dv.get("status")
            if status == "Unsupported":
                cells[key]["unsupported"] += 1
            elif status == "PartiallyConfirmed":
                cells[key]["partially_confirmed"] += 1
            elif status == "Confirmed":
                cells[key]["confirmed"] += 1
                # Repro-stability and false-Confirmed counts are optional
                # fields tabulate.py reads off the verdict when callers have
                # stamped them.
                if dv.get("wrong") is True:
                    cells[key]["wrong_confirmed"] += 1
                if dv.get("replay_stable") is True:
                    cells[key]["stable_replays"] += 1

    if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
        gt = load_json(args.ground_truth)
        # Ground truth format: list of {"path": ..., "line": ..., "cap": ..., "vuln": bool}
        gt_true: list[dict] = []
        for entry in gt if isinstance(gt, list) else []:
            # Honour the same language scope as the findings filter so recall
            # is measured only over the corpus's target language.
            if lang_filter and lang_of(entry) not in lang_filter:
                continue
            if entry.get("vuln"):
                gt_true.append({
                    "path": entry.get("path", ""),
                    "line": entry.get("line", 0),
                    "cap": entry.get("cap", ""),
                })

        # Track which GT entries were matched (by index) to avoid double-counting.
        matched_gt: set[int] = set()

        for f in findings:
            f_path = f.get("path", "")
            f_line = f.get("line", 0)
            f_cap = cap_of(f, static_lens=args.static)
            cell_key = (f_cap, lang_of(f))
            dv = (f.get("evidence") or {}).get("dynamic_verdict") or {}
            is_confirmed = dv.get("status") == "Confirmed"
            matched_idx = None
            for idx, gt_entry in enumerate(gt_true):
                if (path_matches(gt_entry["path"], f_path)
                        and gt_entry["cap"] == f_cap
                        and idx not in matched_gt
                        and (gt_entry["line"] == 0
                             or abs(gt_entry["line"] - f_line) <= LINE_TOLERANCE)):
                    matched_idx = idx
                    break
            if matched_idx is not None:
                matched_gt.add(matched_idx)
                cells[cell_key]["tp"] += 1
                if is_confirmed:
                    cells[cell_key]["confirmed_tp"] += 1
            else:
                cells[cell_key]["fp"] += 1
                if is_confirmed:
                    cells[cell_key]["confirmed_fp"] += 1

        for idx, gt_entry in enumerate(gt_true):
            if idx not in matched_gt:
                cap = gt_entry["cap"]
                # Land the FN in the cell its source language implies (from the
                # GT path extension) so per-(cap,lang) recall is meaningful and
                # OWASP misses show up in the java cell, not a stray "unknown".
                cells[(cap, lang_of(gt_entry))]["fn"] += 1

    # Ground-truth-derived false-confirm accounting. When a corpus ships a
    # vuln/benign label per file (OWASP, SARD), a Confirmed finding that
    # matches no GT positive is a false confirm — authoritative, so it
    # overrides any manual-triage stamping for these labelled sets. This is
    # what makes the per-cell `false_confirmed_rate` budget non-vacuous on a
    # fresh eval corpus without a host-local verify-feedback log.
    # (Both counters stay 0 when no ground truth was scored, so this loop is
    # a no-op in that case.)
    for v in cells.values():
        if v["confirmed_tp"] or v["confirmed_fp"]:
            v["wrong_confirmed"] = v["confirmed_fp"]

    result = {
        "label": args.label,
        "total_findings": len(findings),
        "cells": [
            {
                "cap": k[0],
                "lang": k[1],
                **v,
                # max(..., 1) keeps empty denominators from dividing by zero.
                "precision": v["tp"] / max(v["tp"] + v["fp"], 1),
                "recall": v["tp"] / max(v["tp"] + v["fn"], 1),
                "unsupported_rate": v["unsupported"] / max(v["total"], 1),
            }
            for k, v in sorted(cells.items())
        ],
    }

    existing = load_json(args.append) if Path(args.append).exists() else []
    existing.append(result)
    with open(args.append, "w", encoding="utf-8") as out_fh:
        json.dump(existing, out_fh, indent=2)

    # Print summary
    print(f"\n=== {args.label} ===")
    print(f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>6} {'Rec':>6} {'Unsup%':>7}")
    print("-" * 72)
    for c in result["cells"]:
        print(
            f"{c['cap']:<20} {c['lang']:<12} "
            f"{c['tp']:>5} {c['fp']:>5} {c['fn']:>5} "
            f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
            f"{c['unsupported_rate']*100:>6.1f}%"
        )

    exit_rc = 0

    # ── Phase 29: per-cell budget enforcement ─────────────────────────────
    if args.budget:
        budget = load_budget(args.budget)
        failures = enforce_budget(result["cells"], budget)
        if failures:
            print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
            for line in failures:
                print(line)
            exit_rc = 2
        else:
            print(f"\nPer-cell budget ({args.budget}): OK")

    # ── Phase 29: diff against previous run ───────────────────────────────
    if args.diff:
        prev = load_previous_cells(args.diff, args.label)
        failures = diff_regressions(result["cells"], prev)
        if failures:
            print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
            for line in failures:
                print(line)
            exit_rc = 2
        else:
            print(f"\nDiff vs {args.diff}: no regressions")

    return exit_rc
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
251
tests/eval_corpus/test_manifest_gt_convert.py
Normal file
251
tests/eval_corpus/test_manifest_gt_convert.py
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 28 (Track R.1) regression test for tests/eval_corpus/manifest_gt_convert.py.
|
||||
|
||||
Proves the manifest -> ground-truth converter is non-vacuous:
|
||||
* a well-formed manifest converts to the expected sorted JSON,
|
||||
* --corpus-dir validation passes when every labelled path exists and
|
||||
produces byte-identical output to the no-corpus transform (so the CI
|
||||
in-sync guard, which diffs committed vs a validated regen, is sound),
|
||||
* --corpus-dir validation HARD-ERRORS (exit 2) on a missing path,
|
||||
* an unknown cap / duplicate (path,cap) / malformed TOML are rejected,
|
||||
* the committed nodegoat.json / juiceshop.json are exactly what a fresh
|
||||
conversion of their manifests produces (offline half of the CI guard).
|
||||
|
||||
Run with::
|
||||
|
||||
python3 tests/eval_corpus/test_manifest_gt_convert.py
|
||||
|
||||
Exits 0 when every assertion holds, non-zero otherwise.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Paths are anchored at parents[2] of this file's resolved location, which the
# "tests/eval_corpus/..." joins below treat as the repository root.
REPO = Path(__file__).resolve().parents[2]
# Converter script under test; every test runs it as a subprocess.
CONVERT = REPO / "tests/eval_corpus/manifest_gt_convert.py"
# Directory holding the `<name>.manifest.toml` files and committed `<name>.json`.
GT_DIR = REPO / "tests/eval_corpus/ground_truth"

# Well-formed demo manifest shared by the happy-path tests: two `vuln = true`
# entries and one `vuln = false` entry, deliberately listed out of path order.
GOOD_MANIFEST = """\
corpus = "demo"
upstream = "https://example.test/demo"
pinned_ref = "v1"

[[entry]]
path = "routes/login.ts"
cap = "sqli"
vuln = true
note = "raw SQL string-concat in login"

[[entry]]
path = "app/routes/contributions.js"
cap = "cmdi"
vuln = true
note = "eval of user input"

[[entry]]
path = "lib/insecurity.ts"
cap = "crypto"
vuln = false
note = "benign control example"
"""
|
||||
|
||||
|
||||
def run_convert(*args: str) -> subprocess.CompletedProcess:
    """Run the converter script with ``args`` in a child interpreter.

    stdout and stderr are captured as text; the completed process is returned
    without checking its exit status.
    """
    command = [sys.executable, str(CONVERT)]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True)
|
||||
|
||||
|
||||
def test_transform_is_sorted_and_schema_clean(tmp: Path) -> None:
    """A well-formed manifest converts to sorted records with only the GT fields."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)
    output_path = tmp / "demo.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 0, result.stdout + result.stderr

    records = json.loads(output_path.read_text())
    # Output order is by (path, cap), not manifest order.
    expected_paths = [
        "app/routes/contributions.js",
        "lib/insecurity.ts",
        "routes/login.ts",
    ]
    assert [rec["path"] for rec in records] == expected_paths, records
    # Exactly the four ground-truth fields survive; `note` is dropped.
    for rec in records:
        assert set(rec) == {"path", "line", "cap", "vuln"}, rec
        assert rec["line"] == 0, rec
    first, second = records[0], records[1]
    assert first["cap"] == "cmdi" and first["vuln"] is True
    assert second["cap"] == "crypto" and second["vuln"] is False
|
||||
|
||||
|
||||
def test_corpus_validation_passes_and_matches_no_corpus(tmp: Path) -> None:
    """--corpus-dir validation succeeds and leaves the output unchanged."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)

    # Materialise a stub file for every path the manifest labels.
    corpus_root = tmp / "corpus"
    labelled = ("routes/login.ts", "app/routes/contributions.js", "lib/insecurity.ts")
    for rel in labelled:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")

    plain_out = tmp / "no_corpus.json"
    validated_out = tmp / "with_corpus.json"

    plain = run_convert("--manifest", str(manifest_path), "--output", str(plain_out))
    assert plain.returncode == 0

    validated = run_convert(
        "--manifest", str(manifest_path),
        "--corpus-dir", str(corpus_root),
        "--output", str(validated_out),
    )
    assert validated.returncode == 0, validated.stdout + validated.stderr
    # The CI guard diffs the committed file against a validated regen, so
    # validation itself must not alter the emitted bytes.
    assert plain_out.read_text() == validated_out.read_text()
    assert "validated against" in validated.stdout, validated.stdout
|
||||
|
||||
|
||||
def test_missing_path_exits_2(tmp: Path) -> None:
    """A labelled path absent from --corpus-dir makes the converter exit 2."""
    manifest_path = tmp / "demo.manifest.toml"
    manifest_path.write_text(GOOD_MANIFEST)

    # Create only two of the three labelled files; lib/insecurity.ts is absent.
    corpus_root = tmp / "corpus"
    present = ("routes/login.ts", "app/routes/contributions.js")
    for rel in present:
        stub = corpus_root / rel
        stub.parent.mkdir(parents=True, exist_ok=True)
        stub.write_text("// stub\n")

    output_path = tmp / "demo.json"
    result = run_convert(
        "--manifest", str(manifest_path),
        "--corpus-dir", str(corpus_root),
        "--output", str(output_path),
    )
    assert result.returncode == 2, result.stdout + result.stderr
    assert "lib/insecurity.ts" in result.stderr and "missing" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_unknown_cap_rejected(tmp: Path) -> None:
    """An entry whose cap is not recognised makes the converter exit 1."""
    manifest_path = tmp / "bad_cap.manifest.toml"
    manifest_text = '[[entry]]\npath = "a.js"\ncap = "not_a_cap"\nvuln = true\n'
    manifest_path.write_text(manifest_text)
    output_path = tmp / "out.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "not a known nyx cap" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_duplicate_path_cap_rejected(tmp: Path) -> None:
    """Two entries with the same (path, cap) make the converter exit 1."""
    manifest_path = tmp / "dup.manifest.toml"
    manifest_text = (
        '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
        '[[entry]]\npath = "a.js"\ncap = "xss"\nvuln = true\n'
    )
    manifest_path.write_text(manifest_text)
    output_path = tmp / "out.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "duplicate" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_malformed_manifest_exits_1(tmp: Path) -> None:
    """Syntactically invalid TOML makes the converter exit 1."""
    manifest_path = tmp / "broken.toml"
    # Unclosed table header plus a key with no value: not valid TOML.
    invalid_toml = "[[entry]\npath = \n"
    manifest_path.write_text(invalid_toml)
    output_path = tmp / "out.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "malformed" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_empty_manifest_exits_1(tmp: Path) -> None:
    """A manifest with no [[entry]] tables makes the converter exit 1."""
    manifest_path = tmp / "empty.toml"
    # Valid TOML, but it declares no entries at all.
    header_only = 'corpus = "x"\n'
    manifest_path.write_text(header_only)
    output_path = tmp / "out.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "no [[entry]]" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def test_committed_gt_matches_manifest(tmp: Path) -> None:
    """Each committed ground-truth JSON equals a fresh conversion of its manifest.

    This is the offline half of the CI in-sync guard: it catches a manifest
    edit that was not followed by regenerating the JSON.
    """
    corpora = (
        "nodegoat",
        "juiceshop",
        # Track R.2 polyglot corpora (Phase 29).
        "railsgoat",
        "dvwa",
        "dvpwa",
        "gosec",
        "rustsec",
    )
    for name in corpora:
        manifest_path = GT_DIR / f"{name}.manifest.toml"
        committed = GT_DIR / f"{name}.json"
        assert manifest_path.exists(), f"missing manifest: {manifest_path}"
        assert committed.exists(), f"missing committed GT: {committed}"

        regen = tmp / f"{name}.json"
        result = run_convert("--manifest", str(manifest_path), "--output", str(regen))
        assert result.returncode == 0, result.stdout + result.stderr

        # Compare parsed JSON rather than raw text.
        fresh = json.loads(regen.read_text())
        on_disk = json.loads(committed.read_text())
        assert fresh == on_disk, (
            f"{committed} is stale — regenerate with manifest_gt_convert.py"
        )
|
||||
|
||||
|
||||
def test_negative_control_emits_empty(tmp: Path) -> None:
    """A `negative_control = true` manifest with no entries converts to `[]`."""
    # Negative-control corpora (e.g. RustSec advisory-db) carry no scannable
    # source vulns, so they declare the flag and list zero [[entry]] tables.
    manifest_path = tmp / "neg.manifest.toml"
    manifest_lines = (
        'corpus = "rustsec"\n'
        'upstream = "https://example.test/advisory-db"\n'
        'pinned_ref = "main"\n'
        "negative_control = true\n"
    )
    manifest_path.write_text(manifest_lines)
    output_path = tmp / "neg.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 0, result.stdout + result.stderr
    assert json.loads(output_path.read_text()) == [], output_path.read_text()
    assert "negative-control corpus" in result.stdout, result.stdout
|
||||
|
||||
|
||||
def test_negative_control_with_entries_rejected(tmp: Path) -> None:
    """Setting `negative_control` while also listing an entry exits 1."""
    # The flag and [[entry]] tables are mutually exclusive, so a real positive
    # can never be silently hidden behind negative_control.
    manifest_path = tmp / "neg_bad.manifest.toml"
    manifest_text = (
        "negative_control = true\n"
        '[[entry]]\npath = "a.rs"\ncap = "cmdi"\nvuln = true\n'
    )
    manifest_path.write_text(manifest_text)
    output_path = tmp / "neg_bad.json"

    result = run_convert("--manifest", str(manifest_path), "--output", str(output_path))
    assert result.returncode == 1, result.stdout + result.stderr
    assert "negative_control" in result.stderr and "zero" in result.stderr, result.stderr
|
||||
|
||||
|
||||
def main() -> int:
    """Run every regression check in order, each in its own scratch directory.

    A failing assertion propagates out of this function; 0 is returned only
    after all checks have run.
    """
    checks = (
        test_transform_is_sorted_and_schema_clean,
        test_corpus_validation_passes_and_matches_no_corpus,
        test_missing_path_exits_2,
        test_unknown_cap_rejected,
        test_duplicate_path_cap_rejected,
        test_malformed_manifest_exits_1,
        test_empty_manifest_exits_1,
        test_committed_gt_matches_manifest,
        test_negative_control_emits_empty,
        test_negative_control_with_entries_rejected,
    )
    with tempfile.TemporaryDirectory() as scratch:
        root = Path(scratch)
        for check in checks:
            # One sub-directory per check, named after it, so fixtures never collide.
            workdir = root / check.__name__
            workdir.mkdir()
            print(f"... {check.__name__}")
            check(workdir)
            print(" OK")
    print("\nAll manifest_gt_convert.py regression checks passed.")
    return 0
|
||||
|
||||
|
||||
# Stand-alone entry point: main()'s return value becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
771
tests/eval_corpus/test_tabulate_regression.py
Normal file
771
tests/eval_corpus/test_tabulate_regression.py
Normal file
|
|
@ -0,0 +1,771 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 29 (Track I) regression test for tests/eval_corpus/tabulate.py.
|
||||
|
||||
Exercises --budget and --diff against hand-crafted scan + ground-truth
|
||||
fixtures so the per-cell budget gate and monotonic-improvement diff are
|
||||
demonstrably non-vacuous.
|
||||
|
||||
Run with::
|
||||
|
||||
python3 tests/eval_corpus/test_tabulate_regression.py
|
||||
|
||||
Exits 0 when every assertion holds, non-zero otherwise. The asserts are
|
||||
plain `assert` statements so the file works both as a stand-alone script
|
||||
and under unittest discovery.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Paths are anchored at parents[2] of this file's resolved location, which the
# "tests/eval_corpus/..." joins below treat as the repository root.
REPO = Path(__file__).resolve().parents[2]
# Scripts driven as subprocesses by the helpers below.
TABULATE = REPO / "tests/eval_corpus/tabulate.py"
REPORT = REPO / "tests/eval_corpus/report.py"
# Budget file handed to `--budget` by the budget tests.
BUDGET = REPO / "tests/eval_corpus/budget.toml"
|
||||
|
||||
|
||||
def run_tabulate(*args: str) -> subprocess.CompletedProcess:
    """Run tabulate.py with ``args`` in a child interpreter, capturing text output."""
    return subprocess.run(
        [sys.executable, str(TABULATE), *args],
        capture_output=True,
        text=True,
    )
|
||||
|
||||
|
||||
def run_report(*args: str) -> subprocess.CompletedProcess:
    """Run report.py with ``args`` in a child interpreter, capturing text output."""
    return subprocess.run(
        [sys.executable, str(REPORT), *args],
        capture_output=True,
        text=True,
    )
|
||||
|
||||
|
||||
def write_json(path: Path, data: object) -> None:
    """Serialise ``data`` as 2-space-indented JSON and write it to ``path``."""
    serialized = json.dumps(data, indent=2)
    path.write_text(serialized)
|
||||
|
||||
|
||||
# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs.
# Each constant is a single-bit mask used as a fixture's `sink_caps` value.
SINK_BIT_SQL = 1 << 7  # SQL_QUERY
SINK_BIT_CMDI = 1 << 10  # CODE_EXEC
SINK_BIT_SHELL = 1 << 2  # SHELL_ESCAPE (Java/other command-exec sink)
SINK_BIT_FILE = 1 << 5  # FILE_IO (path_traversal)
|
||||
|
||||
|
||||
def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
    """Build a minimal scan-finding fixture.

    ``cap_bit`` is stored as the evidence's ``sink_caps``. A truthy ``status``
    additionally attaches a ``dynamic_verdict`` carrying that status; a falsy
    one (``None`` or ``""``) leaves the finding without any verdict.
    """
    evidence: dict = {"sink_caps": cap_bit}
    if status:
        evidence["dynamic_verdict"] = {"status": status}
    return {
        "path": path,
        "line": line,
        "col": 0,
        "id": "py.sqli.cursor_execute",
        "evidence": evidence,
    }
|
||||
|
||||
|
||||
def test_budget_passes_on_clean_scan(tmp: Path) -> None:
    """A scan with no Unsupported verdicts passes the budget gate with exit 0."""
    statuses_by_line = ((10, "Confirmed"), (20, "Confirmed"), (30, "NotConfirmed"))
    scan_path = tmp / "scan_clean.json"
    write_json(
        scan_path,
        {
            "findings": [
                python_finding(SINK_BIT_SQL, "app.py", line_no, status)
                for line_no, status in statuses_by_line
            ]
        },
    )
    results_path = tmp / "results_clean.json"
    write_json(results_path, [])

    proc = run_tabulate(
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(BUDGET),
    )
    assert proc.returncode == 0, f"clean scan must pass budget, got rc={proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
    assert "Per-cell budget" in proc.stdout and "OK" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_budget_fails_when_unsupported_exceeds(tmp: Path) -> None:
    """A cell that is 100% Unsupported trips the budget gate with exit 2."""
    # Every finding in the sqli/python cell is Unsupported, so the cell sits
    # at 100% and must exceed whatever Unsupported ceiling budget.toml sets
    # for it (the original note cites 40%; not verifiable from this file).
    unsupported = []
    for line_no in (10, 20, 30, 40, 50):
        unsupported.append(python_finding(SINK_BIT_SQL, "app.py", line_no, "Unsupported"))
    scan_path = tmp / "scan_unsup.json"
    write_json(scan_path, {"findings": unsupported})
    results_path = tmp / "results_unsup.json"
    write_json(results_path, [])

    proc = run_tabulate(
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(BUDGET),
    )
    assert proc.returncode == 2, (
        f"budget breach must exit 2, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "FAIL" in proc.stdout and "sqli/python" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_diff_fails_on_regression(tmp: Path) -> None:
    """--diff exits 2 when a cell's Unsupported rate jumps between runs."""
    # Baseline: 1 of 4 findings Unsupported (25%). Current: 3 of 4 (75%).
    # Neither run passes --budget, so only the monotonic-improvement diff
    # can flag the +50pp jump.

    def cmdi_findings(statuses):
        # One CODE_EXEC finding per status, on consecutive lines from 1.
        return [
            python_finding(SINK_BIT_CMDI, "x.unknown", line_no, status)
            for line_no, status in enumerate(statuses, start=1)
        ]

    baseline_scan = tmp / "prev_scan.json"
    write_json(
        baseline_scan,
        {"findings": cmdi_findings(("Confirmed", "Confirmed", "Confirmed", "Unsupported"))},
    )
    baseline_results = tmp / "prev_results.json"
    write_json(baseline_results, [])
    seed = run_tabulate(
        "--label", "diff-test",
        "--scan", str(baseline_scan),
        "--inhouse",
        "--append", str(baseline_results),
    )
    rc_prev = seed.returncode
    assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}"

    current_scan = tmp / "cur_scan.json"
    write_json(
        current_scan,
        {"findings": cmdi_findings(("Unsupported", "Unsupported", "Unsupported", "Confirmed"))},
    )
    current_results = tmp / "cur_results.json"
    write_json(current_results, [])
    proc = run_tabulate(
        "--label", "diff-test",
        "--scan", str(current_scan),
        "--inhouse",
        "--append", str(current_results),
        "--diff", str(baseline_results),
    )
    assert proc.returncode == 2, (
        f"regression diff must exit 2, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "REGRESSION" in proc.stdout and "Unsupported" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_diff_passes_on_improvement(tmp: Path) -> None:
    """--diff exits 0 when a cell's Unsupported rate drops between runs."""
    # Previous: 3/4 Unsupported. Current: 1/4. Monotonic improvement
    # must not flag any regression.
    prev_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"),
    ]
    prev_scan = tmp / "prev_scan.json"
    write_json(prev_scan, {"findings": prev_findings})
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    rc_prev = run_tabulate(
        "--label", "improve-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    ).returncode
    # Guard against a vacuous pass: if the seed run failed, prev_results
    # would still hold the empty list written above, leaving the diff with
    # no baseline cells to compare and letting "no regressions" succeed
    # for the wrong reason.
    assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}"

    cur_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"),
    ]
    cur_scan = tmp / "cur_scan.json"
    write_json(cur_scan, {"findings": cur_findings})
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    proc = run_tabulate(
        "--label", "improve-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )
    assert proc.returncode == 0, (
        f"improvement diff must exit 0, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "no regressions" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_manual_triage_stamps_wrong_confirmed(tmp: Path) -> None:
    """--manual-triage marks a Confirmed finding wrong when a vuln:false entry matches."""
    # Phase 31 follow-up: a triage list of {path, line, cap, vuln: false}
    # entries feeds the per-cell wrong_confirmed counter without the host's
    # verify-feedback log.
    #
    # The Confirmed finding at line 10 is 2 lines from the triage entry at
    # line 12, inside the LINE_TOLERANCE=5 window, so it is stamped. The one
    # at line 100 matches nothing, giving wrong_confirmed == 1 of 2 Confirmed.
    scan_path = tmp / "scan.json"
    confirmed_pair = [
        python_finding(SINK_BIT_SQL, "app.py", 10, "Confirmed"),
        python_finding(SINK_BIT_SQL, "app.py", 100, "Confirmed"),
    ]
    write_json(scan_path, {"findings": confirmed_pair})

    triage_path = tmp / "triage.json"
    benign_entry = {"path": "app.py", "line": 12, "cap": "sqli", "vuln": False}
    write_json(triage_path, [benign_entry])

    results_path = tmp / "results.json"
    write_json(results_path, [])

    proc = run_tabulate(
        "--label", "triage-test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--manual-triage", str(triage_path),
    )
    assert proc.returncode == 0, (
        f"manual-triage run must succeed without budget, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )

    latest = json.loads(results_path.read_text())[-1]
    by_cell = {(cell["cap"], cell["lang"]): cell for cell in latest["cells"]}
    sqli_py = by_cell.get(("sqli", "python"))
    assert sqli_py is not None, f"expected sqli/python cell, got {list(by_cell)}"
    assert sqli_py["confirmed"] == 2, sqli_py
    assert sqli_py["wrong_confirmed"] == 1, (
        "exactly one Confirmed finding must be stamped wrong via the triage match; "
        f"got {sqli_py}"
    )
|
||||
|
||||
|
||||
def test_manual_triage_ignores_vuln_true_entries(tmp: Path) -> None:
    # Triage entries with `vuln: true` are ground-truth-positive markers,
    # not False-Confirmed evidence. --manual-triage must leave them alone
    # so a real Confirmed-on-vuln-true row does not get downgraded.
    scan = tmp / "scan.json"
    write_json(
        scan,
        {
            "findings": [
                python_finding(SINK_BIT_SQL, "app.py", 10, "Confirmed"),
            ]
        },
    )
    triage = tmp / "triage.json"
    write_json(
        triage,
        [
            {"path": "app.py", "line": 10, "cap": "sqli", "vuln": True},
        ],
    )
    append = tmp / "results.json"
    write_json(append, [])
    proc = run_tabulate(
        "--label", "triage-true-test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--manual-triage", str(triage),
    )
    # Surface the subprocess output on failure, as the sibling triage test
    # does; a bare return-code assertion gives nothing to debug from.
    assert proc.returncode == 0, (
        f"manual-triage run must succeed, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    results = json.loads(append.read_text())
    cells = {(c["cap"], c["lang"]): c for c in results[-1]["cells"]}
    # Look the cell up defensively so a missing cell reports which cells were
    # produced instead of raising an opaque KeyError.
    sqli_py = cells.get(("sqli", "python"))
    assert sqli_py is not None, f"expected sqli/python cell, got {list(cells)}"
    assert sqli_py["confirmed"] == 1, sqli_py
    assert sqli_py["wrong_confirmed"] == 0, (
        f"vuln:true triage rows must not stamp wrong; got {sqli_py}"
    )
|
||||
|
||||
|
||||
def test_lang_filter_scopes_findings_and_gt(tmp: Path) -> None:
    # Phase 29 (Track R.2). `--lang` restricts a single-language corpus to its
    # target language, so incidental assets in other languages (for example
    # the vendored JavaScript bundled by a Rails app, which nyx flags as
    # prototype_pollution) stay out of the corpus's per-cap metrics. Both the
    # findings and the ground-truth entries outside the scope must be dropped.
    gt_path = tmp / "gt.json"
    gt_rows = [
        {"path": rel_path, "line": 0, "cap": "sqli", "vuln": True}
        for rel_path in ("app/models/user.rb", "app/assets/lib.js")
    ]
    write_json(gt_path, gt_rows)

    scan_path = tmp / "scan.json"
    ruby_finding = python_finding(SINK_BIT_SQL, "/x/app/models/user.rb", 10, "NotConfirmed")
    # A vendored-JS finding nyx would otherwise Confirm; it must be excluded
    # entirely under `--lang ruby`.
    js_finding = python_finding(SINK_BIT_SQL, "/x/app/assets/lib.js", 10, "Confirmed")
    write_json(scan_path, {"findings": [ruby_finding, js_finding]})

    shared_args = (
        "--label", "railsgoat",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
    )

    # Without --lang, a cell exists for each language.
    unscoped_path = tmp / "unscoped.json"
    write_json(unscoped_path, [])
    proc = run_tabulate(*shared_args, "--append", str(unscoped_path))
    assert proc.returncode == 0, proc.stdout + proc.stderr
    unscoped_run = json.loads(unscoped_path.read_text())[-1]
    cell_keys = {(entry["cap"], entry["lang"]) for entry in unscoped_run["cells"]}
    assert {("sqli", "ruby"), ("sqli", "javascript")} <= cell_keys, cell_keys

    # With --lang ruby, the JS finding and the JS ground-truth positive both
    # disappear.
    scoped_path = tmp / "scoped.json"
    write_json(scoped_path, [])
    proc = run_tabulate(*shared_args, "--lang", "ruby", "--append", str(scoped_path))
    assert proc.returncode == 0, proc.stdout + proc.stderr
    scoped_run = json.loads(scoped_path.read_text())[-1]
    scoped_cells = {}
    for entry in scoped_run["cells"]:
        scoped_cells[(entry["cap"], entry["lang"])] = entry

    assert ("sqli", "javascript") not in scoped_cells, (
        f"JS must be filtered out: {list(scoped_cells)}"
    )
    ruby = scoped_cells[("sqli", "ruby")]
    assert ruby["tp"] == 1 and ruby["fn"] == 0, ruby
    # The dropped JS positive must not come back as a phantom FN in any cell.
    assert not any(lang == "javascript" for _cap, lang in scoped_cells), scoped_cells
|
||||
|
||||
|
||||
def test_static_lens_buckets_shell_escape_as_cmdi(tmp: Path) -> None:
    # Caveat-1 fix. In an environment with 0 dynamic confirmations, a Java
    # command-exec finding carries only SHELL_ESCAPE (1<<2), which the default
    # bit table leaves in "other", so the cmdi cell reads 0 TP / N FN no
    # matter how good the static analysis is. --static appends
    # SHELL_ESCAPE→cmdi so static recall can be measured without dynamic
    # confirmation.
    gt_path = tmp / "gt.json"
    gt_row = {"path": "testcode/Cmd.java", "line": 0, "cap": "cmdi", "vuln": True}
    write_json(gt_path, [gt_row])

    # Real Java taint findings use the id "taint-unsanitised-flow", which has
    # no cap substring, so the rule-id fallback yields "other" rather than the
    # sqli/cmdi that the hand-crafted python_finding id would imply.
    evidence = {
        "sink_caps": SINK_BIT_SHELL,
        "dynamic_verdict": {"status": "NotConfirmed"},
    }
    shell_finding = {
        "path": "/x/testcode/Cmd.java",
        "line": 10,
        "col": 0,
        "id": "taint-unsanitised-flow",
        "evidence": evidence,
    }
    scan_path = tmp / "scan.json"
    write_json(scan_path, {"findings": [shell_finding]})

    def tabulate_cells(out_name, *lens_flags):
        # Run tabulate into a fresh results file and return the latest run's
        # cells keyed by (cap, lang).
        out_path = tmp / out_name
        write_json(out_path, [])
        proc = run_tabulate(
            "--label", "owasp",
            "--scan", str(scan_path),
            "--ground-truth", str(gt_path),
            *lens_flags,
            "--append", str(out_path),
        )
        assert proc.returncode == 0, proc.stdout + proc.stderr
        latest_run = json.loads(out_path.read_text())[-1]
        return {(entry["cap"], entry["lang"]): entry for entry in latest_run["cells"]}

    # Default lens: the finding lands in "other", so the GT positive shows up
    # in cmdi as a pure FN (recall 0). This is the measurement gap.
    cells = tabulate_cells("default.json")
    cmdi_default = cells.get(("cmdi", "java"))
    assert cmdi_default is not None and cmdi_default["tp"] == 0, cells
    assert cmdi_default["fn"] == 1, cmdi_default
    assert ("other", "java") in cells, (
        f"SHELL_ESCAPE must bucket as other by default: {list(cells)}"
    )

    # Static lens: the finding lands in cmdi, so recall is measurable
    # (TP=1, FN=0).
    cells = tabulate_cells("static.json", "--static")
    cmdi = cells[("cmdi", "java")]
    assert cmdi["tp"] == 1 and cmdi["fn"] == 0, cmdi
    assert ("other", "java") not in cells, (
        f"static lens must reclaim the other-bucketed finding: {list(cells)}"
    )
|
||||
|
||||
|
||||
def test_static_lens_preserves_higher_priority_bits(tmp: Path) -> None:
    # When a finding carries FILE_IO together with SHELL_ESCAPE it must still
    # bucket as path_traversal under the static lens: SHELL_ESCAPE is appended
    # at the lowest priority, so the static lens never takes a finding away
    # from a non-cmdi cell.
    scan_path = tmp / "scan.json"
    both_bits = SINK_BIT_FILE | SINK_BIT_SHELL
    finding = python_finding(both_bits, "B.java", 10, "NotConfirmed")
    write_json(scan_path, {"findings": [finding]})

    # Run once with the default lens and once with --static; the two runs
    # write to out0.json and out1.json respectively.
    for lens_flags in ([], ["--static"]):
        out_path = tmp / f"out{len(lens_flags)}.json"
        write_json(out_path, [])
        argv = [
            "--label", "x",
            "--scan", str(scan_path),
            "--inhouse",
            "--append", str(out_path),
        ]
        argv.extend(lens_flags)
        proc = run_tabulate(*argv)
        assert proc.returncode == 0, proc.stdout + proc.stderr

        latest_run = json.loads(out_path.read_text())[-1]
        seen_caps = set()
        for entry in latest_run["cells"]:
            seen_caps.add(entry["cap"])
        assert seen_caps == {"path_traversal"}, f"flag={lens_flags}: {seen_caps}"
|
||||
|
||||
|
||||
def test_budget_malformed_exits_3(tmp: Path) -> None:
    # A malformed budget file (here a bare `not_a_number` value) must make the
    # tabulate run exit with status 3.
    budget_path = tmp / "bad.toml"
    scan_path = tmp / "scan.json"
    results_path = tmp / "results.json"

    budget_text = "\n".join(["[default]", "unsupported_rate = not_a_number", ""])
    budget_path.write_text(budget_text)
    write_json(scan_path, {"findings": []})
    write_json(results_path, [])

    argv = [
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(budget_path),
    ]
    proc = run_tabulate(*argv)
    assert proc.returncode == 3, (
        f"malformed budget must exit 3, got {proc.returncode}\nstderr: {proc.stderr}"
    )
|
||||
|
||||
|
||||
def test_relative_gt_path_suffix_matches_absolute_finding(tmp: Path) -> None:
    # Phase 27. Ground truth records corpus-relative paths while nyx emits
    # absolute ones. A relative GT path has to suffix-match the absolute
    # finding path, which keeps the committed JSON portable across machines
    # and CI checkouts.
    testcode_dir = "src/main/java/org/owasp/benchmark/testcode"
    checkout_root = "/home/ci/work/owasp"

    gt_path = tmp / "gt.json"
    gt_row = {
        "path": f"{testcode_dir}/BenchmarkTest1.java",
        "line": 0,
        "cap": "sqli",
        "vuln": True,
    }
    write_json(gt_path, [gt_row])

    scan_path = tmp / "scan.json"
    # Absolute path that ends with the GT relative path → TP.
    matching = python_finding(
        SINK_BIT_SQL,
        f"{checkout_root}/{testcode_dir}/BenchmarkTest1.java",
        10,
        "Confirmed",
    )
    # Another file in the same corpus with no GT positive → FP.
    unmatched = python_finding(
        SINK_BIT_SQL,
        f"{checkout_root}/{testcode_dir}/BenchmarkTest2.java",
        10,
        "NotConfirmed",
    )
    write_json(scan_path, {"findings": [matching, unmatched]})

    results_path = tmp / "results.json"
    write_json(results_path, [])
    argv = [
        "--label", "owasp",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
        "--append", str(results_path),
    ]
    proc = run_tabulate(*argv)
    assert proc.returncode == 0, proc.stdout + proc.stderr

    latest_run = json.loads(results_path.read_text())[-1]
    cells = {}
    for entry in latest_run["cells"]:
        cells[(entry["cap"], entry["lang"])] = entry

    sqli_java = cells[("sqli", "java")]
    assert sqli_java["tp"] == 1, (
        f"relative GT path must suffix-match absolute finding: {sqli_java}"
    )
    assert sqli_java["fp"] == 1, f"benign-file finding must count as FP: {sqli_java}"
    assert sqli_java["fn"] == 0, sqli_java
|
||||
|
||||
|
||||
def test_unmatched_gt_positive_lands_in_lang_cell(tmp: Path) -> None:
    # Phase 27. A ground-truth positive that no finding matches is a FN. It
    # has to be counted in the cell implied by its file extension (java)
    # rather than in a stray "unknown" lang cell, so that per-cap recall
    # aggregation is meaningful.
    gt_path = tmp / "gt.json"
    scan_path = tmp / "scan.json"
    results_path = tmp / "results.json"

    missed_positive = {
        "path": "src/main/java/org/owasp/benchmark/testcode/BenchmarkTest9.java",
        "line": 0,
        "cap": "sqli",
        "vuln": True,
    }
    write_json(gt_path, [missed_positive])
    write_json(scan_path, {"findings": []})
    write_json(results_path, [])

    argv = [
        "--label", "owasp",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
        "--append", str(results_path),
    ]
    proc = run_tabulate(*argv)
    assert proc.returncode == 0, proc.stdout + proc.stderr

    latest_run = json.loads(results_path.read_text())[-1]
    cells = {}
    for entry in latest_run["cells"]:
        cells[(entry["cap"], entry["lang"])] = entry

    java_key = ("sqli", "java")
    assert java_key in cells, f"FN must land in the java cell: {list(cells)}"
    assert cells[java_key]["fn"] == 1, cells[java_key]
    assert ("sqli", "unknown") not in cells, f"no stray unknown-lang cell: {list(cells)}"
|
||||
|
||||
|
||||
def test_gt_grounded_false_confirm(tmp: Path) -> None:
    # Phase 27. Given full ground truth, a Confirmed finding that matches no
    # GT positive is a false confirm, derived from GT alone with no
    # manual-triage file. A Confirmed on the vuln file counts as confirmed_tp;
    # one on a benign/other file counts as confirmed_fp and therefore
    # wrong_confirmed. This makes false_confirmed_rate non-vacuous on a fresh
    # corpus.
    gt_path = tmp / "gt.json"
    gt_rows = [
        {"path": f"testcode/{file_name}", "line": 0, "cap": "sqli", "vuln": is_vuln}
        for file_name, is_vuln in (("Vuln.java", True), ("Benign.java", False))
    ]
    write_json(gt_path, gt_rows)

    scan_path = tmp / "scan.json"
    # Correct confirm: lands on the vuln file.
    true_confirm = python_finding(SINK_BIT_SQL, "/x/testcode/Vuln.java", 10, "Confirmed")
    # False confirm: lands on the benign file, where GT has no positive.
    false_confirm = python_finding(SINK_BIT_SQL, "/x/testcode/Benign.java", 10, "Confirmed")
    write_json(scan_path, {"findings": [true_confirm, false_confirm]})

    results_path = tmp / "results.json"
    write_json(results_path, [])
    argv = [
        "--label", "owasp",
        "--scan", str(scan_path),
        "--ground-truth", str(gt_path),
        "--append", str(results_path),
    ]
    proc = run_tabulate(*argv)
    assert proc.returncode == 0, proc.stdout + proc.stderr

    latest_run = json.loads(results_path.read_text())[-1]
    cells = {}
    for entry in latest_run["cells"]:
        cells[(entry["cap"], entry["lang"])] = entry

    sqli_java = cells[("sqli", "java")]
    assert sqli_java["confirmed_tp"] == 1, sqli_java
    assert sqli_java["confirmed_fp"] == 1, sqli_java
    assert sqli_java["wrong_confirmed"] == 1, (
        f"benign-file Confirmed must be a GT-derived false confirm: {sqli_java}"
    )
|
||||
|
||||
|
||||
def test_budget_confirmed_rate_floor(tmp: Path) -> None:
    # Phase 27. budget.toml can carry a per-cell `confirmed_rate` minimum.
    # With a 40% floor on sqli/java, 1 Confirmed of 5 (20%) must trip it and
    # 3 Confirmed of 5 (60%) must clear it.
    budget_path = tmp / "budget.toml"
    budget_lines = [
        "[default]",
        "[[cell]]",
        'cap = "sqli"',
        'lang = "java"',
        "confirmed_rate = 0.40",
    ]
    budget_path.write_text("".join(line + "\n" for line in budget_lines))

    def run_with_confirmed(confirmed_count, scan_name, results_name):
        # Scan five sqli findings in B.java (lines 10..50) whose first
        # `confirmed_count` entries are Confirmed, then tabulate with the
        # budget applied.
        statuses = ["Confirmed"] * confirmed_count
        statuses += ["NotConfirmed"] * (5 - confirmed_count)
        findings = [
            python_finding(SINK_BIT_SQL, "B.java", line_no, status)
            for line_no, status in zip(range(10, 60, 10), statuses)
        ]
        scan_path = tmp / scan_name
        write_json(scan_path, {"findings": findings})
        results_path = tmp / results_name
        write_json(results_path, [])
        return run_tabulate(
            "--label", "owasp",
            "--scan", str(scan_path),
            "--inhouse",
            "--append", str(results_path),
            "--budget", str(budget_path),
        )

    # 1 of 5 Confirmed: below the floor.
    proc = run_with_confirmed(1, "scan_fail.json", "results_fail.json")
    assert proc.returncode == 2, proc.stdout + proc.stderr
    assert "Confirmed" in proc.stdout and "sqli/java" in proc.stdout, proc.stdout

    # 3 of 5 Confirmed: above the floor.
    proc = run_with_confirmed(3, "scan_ok.json", "results_ok.json")
    assert proc.returncode == 0, proc.stdout + proc.stderr
|
||||
|
||||
|
||||
def test_report_precision_recall_floors(tmp: Path) -> None:
    # Phase 27. report.py --min-precision / --min-recall enforce per-cap
    # floors aggregated across langs. Here cmdi precision 0.20 trips the 0.85
    # floor and ldap recall 0.10 trips the 0.40 floor, while sqli
    # (prec 1.0, rec 0.90) clears both.
    results_path = tmp / "results.json"

    def make_cell(cap, lang, tp, fp, fn):
        # Build one results cell with the given TP/FP/FN and every other
        # counter zeroed; key order matches the serialized layout.
        row = {"cap": cap, "lang": lang, "tp": tp, "fp": fp, "fn": fn}
        for zeroed_key in (
            "unsupported",
            "confirmed",
            "partially_confirmed",
            "wrong_confirmed",
            "stable_replays",
        ):
            row[zeroed_key] = 0
        row["total"] = tp + fp + fn
        return row

    floor_args = ("--min-precision", "0.85", "--min-recall", "0.40")

    mixed_cells = [
        make_cell("sqli", "java", 9, 0, 1),  # prec 1.00, rec 0.90 → OK
        make_cell("cmdi", "java", 1, 4, 0),  # prec 0.20 → FAIL precision
        make_cell("ldap_injection", "java", 1, 0, 9),  # rec 0.10 → FAIL recall
    ]
    write_json(
        results_path,
        [{"label": "owasp", "total_findings": 0, "cells": mixed_cells}],
    )
    proc = run_report("--results", str(results_path), *floor_args)
    assert proc.returncode == 2, proc.stdout + proc.stderr
    assert "PRECISION" in proc.stdout and "cmdi" in proc.stdout, proc.stdout
    assert "RECALL" in proc.stdout and "ldap_injection" in proc.stdout, proc.stdout

    # Clean run: only the passing sqli cap is present.
    clean_path = tmp / "clean.json"
    passing_cells = [make_cell("sqli", "java", 9, 0, 1)]
    write_json(
        clean_path,
        [{"label": "owasp", "total_findings": 0, "cells": passing_cells}],
    )
    proc = run_report("--results", str(clean_path), *floor_args)
    assert proc.returncode == 0, proc.stdout + proc.stderr
    assert "All per-cap precision/recall floors met" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_report_confirmed_rate_floor(tmp: Path) -> None:
    # report.py --min-confirmed-rate: a sqli/java cell with 2 Confirmed out of
    # a total of 5 must pass a 0.40 floor (exit 0) and fail a 0.50 floor
    # (exit 2).
    results_path = tmp / "results.json"
    sqli_cell = {
        "cap": "sqli",
        "lang": "java",
        "tp": 0,
        "fp": 0,
        "fn": 0,
        "unsupported": 0,
        "confirmed": 2,
        "wrong_confirmed": 0,
        "stable_replays": 0,
        "total": 5,
    }
    run_entry = {"label": "owasp", "total_findings": 5, "cells": [sqli_cell]}
    write_json(results_path, [run_entry])

    base_args = ("--results", str(results_path), "--min-confirmed-rate")

    # Floor at 0.40: met.
    proc = run_report(*base_args, "0.40")
    assert proc.returncode == 0, proc.stdout + proc.stderr
    assert "All confirmed-rate floors met" in proc.stdout, proc.stdout

    # Floor at 0.50: tripped.
    proc = run_report(*base_args, "0.50")
    assert proc.returncode == 2, proc.stdout + proc.stderr
    assert "FAIL" in proc.stdout and "sqli" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def main() -> int:
    # Run every regression check in order, giving each its own scratch
    # sub-directory (named after the check) inside one temporary directory.
    # Returns 0 once all checks have run; a failing check is not caught here,
    # so its exception propagates to the caller.
    with tempfile.TemporaryDirectory() as td:
        tmp = Path(td)
        for fn in (
            test_budget_passes_on_clean_scan,
            test_budget_fails_when_unsupported_exceeds,
            test_diff_fails_on_regression,
            test_diff_passes_on_improvement,
            test_manual_triage_stamps_wrong_confirmed,
            test_manual_triage_ignores_vuln_true_entries,
            test_lang_filter_scopes_findings_and_gt,
            test_static_lens_buckets_shell_escape_as_cmdi,
            test_static_lens_preserves_higher_priority_bits,
            test_budget_malformed_exits_3,
            test_relative_gt_path_suffix_matches_absolute_finding,
            test_unmatched_gt_positive_lands_in_lang_cell,
            test_gt_grounded_false_confirm,
            test_budget_confirmed_rate_floor,
            test_report_precision_recall_floors,
            test_report_confirmed_rate_floor,
        ):
            sub = tmp / fn.__name__
            sub.mkdir()
            print(f"... {fn.__name__}")
            fn(sub)
            # Plain literal: the original f-string had no placeholders.
            print(" OK")
        print("\nAll tabulate.py regression checks passed.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue