diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 3f7db77b..e5dc496d 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -1,9 +1,12 @@ # Real-corpus acceptance (Track R). # -# * owasp (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava +# * owasp (Phase 27 / Track R.0): Gate 6 vs a real OWASP BenchmarkJava # checkout (Java). -# * jsts (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js) +# * jsts (Phase 28 / Track R.1): Gate 7 vs OWASP NodeGoat (Express, .js) # and OWASP Juice Shop (TypeScript, .ts), one matrix row per corpus. +# * polyglot (Phase 29 / Track R.2): Gate 8 vs OWASP RailsGoat (Rails, .rb), +# DVWA (PHP), DVPWA (aiohttp, .py), gosec (Go) and the RustSec advisory-db +# (Rust negative control), one matrix row per corpus. # # Runs on every PR that touches the dynamic verifier (src/dynamic/), the # eval-corpus harness (tests/eval_corpus/), or the gate script itself. @@ -201,3 +204,141 @@ jobs: run: | export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}" scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }} + + polyglot: + name: eval / ${{ matrix.corpus.name }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + corpus: + - name: railsgoat + repo: https://github.com/OWASP/railsgoat + ref: rails.5.0.0 + lang: ruby + env: NYX_RAILSGOAT_CORPUS + manifest: railsgoat.manifest.toml + ground_truth: railsgoat.json + - name: dvwa + repo: https://github.com/digininja/DVWA + ref: "2.5" + lang: php + env: NYX_DVWA_CORPUS + manifest: dvwa.manifest.toml + ground_truth: dvwa.json + - name: dvpwa + repo: https://github.com/anxolerd/dvpwa + # DVPWA ships no release tags; pin the default branch and let the + # cache key hold it stable. 
+ ref: master + lang: python + env: NYX_DVPWA_CORPUS + manifest: dvpwa.manifest.toml + ground_truth: dvpwa.json + - name: gosec + repo: https://github.com/securego/gosec + ref: v2.26.1 + lang: go + env: NYX_GOSEC_CORPUS + manifest: gosec.manifest.toml + ground_truth: gosec.json + - name: rustsec + repo: https://github.com/rustsec/advisory-db + # advisory-db ships no release tags; pin the default branch. This + # is the Rust NEGATIVE CONTROL (advisory metadata, no scannable + # source) — its committed ground truth is empty by construction. + ref: main + lang: rust + env: NYX_RUSTSEC_CORPUS + manifest: rustsec.manifest.toml + ground_truth: rustsec.json + env: + # CI wall-clock budget: 15 min. Override locally to tighten. + NYX_POLYGLOT_WALLCLOCK_BUDGET_SECONDS: "900" + steps: + - uses: actions/checkout@v6 + + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: stable + cache: true + + - uses: taiki-e/install-action@nextest + + # The dynamic verifier's per-language build pool (Phase 22/23) compiles + # its harnesses with a real toolchain. Each matrix row sets up only the + # toolchain for its corpus's target language; the Rust row needs no extra + # step (the rust toolchain above covers it, and advisory-db has no + # buildable source anyway). 
+ - name: Set up Ruby + if: matrix.corpus.lang == 'ruby' + uses: ruby/setup-ruby@v1 + with: + ruby-version: "3.3" + + - name: Set up PHP + if: matrix.corpus.lang == 'php' + uses: shivammathur/setup-php@v2 + with: + php-version: "8.3" + + - name: Set up Python + if: matrix.corpus.lang == 'python' + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Set up Go + if: matrix.corpus.lang == 'go' + uses: actions/setup-go@v5 + with: + go-version: "1.22" + + - name: Cache ${{ matrix.corpus.name }} + id: cache-corpus + uses: actions/cache@v4 + with: + path: .eval-corpus/${{ matrix.corpus.name }} + key: polyglot-${{ matrix.corpus.name }}-${{ matrix.corpus.ref }} + + - name: Clone ${{ matrix.corpus.name }} (${{ matrix.corpus.ref }}) + if: steps.cache-corpus.outputs.cache-hit != 'true' + run: | + git clone --depth 1 --branch ${{ matrix.corpus.ref }} \ + ${{ matrix.corpus.repo }} \ + .eval-corpus/${{ matrix.corpus.name }} + + # No-compromise guard: the committed ground truth must be exactly what a + # fresh conversion of the curated manifest produces *against this corpus*. + # manifest_gt_convert.py hard-errors on any labelled path that no longer + # exists in the clone (corpus drift / typo); the diff below catches a + # stale committed JSON. For the RustSec negative control the manifest + # carries `negative_control = true` and zero entries, so the converter + # emits an empty `[]` — still validated against the real clone. 
+ - name: Verify ground truth is in sync with the pinned corpus + run: | + python3 tests/eval_corpus/manifest_gt_convert.py \ + --manifest tests/eval_corpus/ground_truth/${{ matrix.corpus.manifest }} \ + --corpus-dir .eval-corpus/${{ matrix.corpus.name }} \ + --output /tmp/${{ matrix.corpus.name }}_gt_regen.json + python3 - <<'PY' + import json, sys + name = "${{ matrix.corpus.ground_truth }}" + committed = json.load(open(f"tests/eval_corpus/ground_truth/{name}")) + regen = json.load(open("/tmp/${{ matrix.corpus.name }}_gt_regen.json")) + if committed != regen: + sys.exit("committed ground truth diverges from a fresh conversion of " + "the manifest against the pinned corpus; regenerate with " + "manifest_gt_convert.py") + print(f"ground truth in sync: {len(committed)} records") + PY + + - name: eval-corpus harness regression tests + run: | + python3 tests/eval_corpus/test_tabulate_regression.py + python3 tests/eval_corpus/test_manifest_gt_convert.py + + - name: Gate 8 — ${{ matrix.corpus.name }} acceptance + run: | + export ${{ matrix.corpus.env }}="${{ github.workspace }}/.eval-corpus/${{ matrix.corpus.name }}" + scripts/m7_ship_gate.sh --sets ${{ matrix.corpus.name }} diff --git a/scripts/m7_ship_gate.sh b/scripts/m7_ship_gate.sh index 7b8a0c28..b41ee493 100755 --- a/scripts/m7_ship_gate.sh +++ b/scripts/m7_ship_gate.sh @@ -8,6 +8,8 @@ # scripts/m7_ship_gate.sh --sets owasp # Java OWASP corpus only # scripts/m7_ship_gate.sh --sets jsts # NodeGoat + Juice Shop only # scripts/m7_ship_gate.sh --sets nodegoat # one JS/TS corpus only +# scripts/m7_ship_gate.sh --sets polyglot # RailsGoat+DVWA+DVPWA+gosec+RustSec +# scripts/m7_ship_gate.sh --sets railsgoat # one polyglot corpus only # # Gate map (kept in sync with .pitboss/play/plan.md track M.7): # Gate 1: Static-only scan is green on `tests/benchmark/corpus`. @@ -37,13 +39,21 @@ # (NYX_JSTS_FLOOR_CAPS empty by default). 
Each corpus row # self-skips unless its NYX_NODEGOAT_CORPUS / NYX_JUICESHOP_CORPUS # points at a real checkout. +# Gate 8: Polyglot real-corpus acceptance (Track R.2 / Phase 29). OWASP +# RailsGoat (Rails, .rb), DVWA (PHP), DVPWA (aiohttp, .py), gosec +# (Go) and the RustSec advisory-db (Rust negative control), one +# row per corpus. Same shape as Gate 7: wall-clock budget + the +# per-(cap,lang) budget hard-enforced; per-cap confirmed/precision/ +# recall report-only (NYX_POLYGLOT_FLOOR_CAPS empty by default). +# Each row self-skips unless its NYX_*_CORPUS env var points at a real +# checkout. set -euo pipefail REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" cd "${REPO_ROOT}" -GATES="1,2,3,4,5,6,7" +GATES="1,2,3,4,5,6,7,8" SETS="" while [[ $# -gt 0 ]]; do @@ -71,9 +81,10 @@ done # `jsts` (both JS/TS corpora) / `nodegoat` / `juiceshop` -> Gate 7, with the # corpus name passed through so Gate 7 runs only the requested row. case "${SETS}" in - owasp) GATES="6" ;; - jsts|nodegoat|juiceshop) GATES="7" ;; - "") ;; # no --sets: run the requested --gates + owasp) GATES="6" ;; + jsts|nodegoat|juiceshop) GATES="7" ;; + polyglot|railsgoat|dvwa|dvpwa|gosec|rustsec) GATES="8" ;; + "") ;; # no --sets: run the requested --gates *) echo "unknown --sets: ${SETS}" >&2; exit 2 ;; esac @@ -308,34 +319,31 @@ PY echo " PASS" } -# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ────────────── - -# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora. Same -# wall-clock split (10 min dev reference / 15 min CI) and the same -# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the -# per-cap confirmed-rate / precision / recall numbers are published but gate -# nothing, while the per-(cap,lang) budget (unsupported_rate, -# false_confirmed_rate) is hard-enforced. Promote a cap into the floor set -# once it starts Confirming end to end. 
-GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}" -GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}" -GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}" -GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}" -GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}" -GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}" +# ── Shared real-corpus acceptance runner (Gates 7 + 8) ──────────────────────── # Run one real-corpus `--verify` row: scan under a wall-clock guard, # tabulate against the committed ground truth, enforce the per-cell budget, -# publish (or, when floor caps are set, enforce) the per-cap floors. -# $1 label $2 corpus dir $3 ground-truth json +# publish (or, when floor caps are set, enforce) the per-cap floors. Every +# random source nyx uses is seeded from spec_hash, so reruns are +# deterministic. Generic across gates — all gate-specific knobs are passed +# in so Gate 7 (JS/TS) and Gate 8 (polyglot) share one code path. +# $1 label $2 corpus dir $3 ground-truth json +# $4 wallclock(s) $5 budget.toml $6 floor caps (may be empty) +# $7 confirmed target $8 precision target $9 recall target +# $10 floor-unset hint (e.g. "NYX_POLYGLOT_FLOOR_CAPS unset") +# $11 lang filter (may be empty) — scope tabulation to one language so +# incidental other-language assets (vendored JS in a Rails/aiohttp app) +# do not pollute the corpus's per-cap metrics # Returns 0 on pass, 1 on fail. Caller decides skip. 
-_gate7_run_corpus() { - local label="$1" corpus="$2" gt="$3" - local scan_report="/tmp/m7_gate7_${label}_scan.json" - local results_report="/tmp/m7_gate7_${label}_results.json" - local wallclock_report="/tmp/m7_gate7_${label}_wallclock.txt" - local gate_home="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_home" - local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_gate7_${label}_build_pool" +_run_corpus_acceptance() { + local label="$1" corpus="$2" gt="$3" wallclock_budget="$4" budget_file="$5" + local floor_caps="$6" confirmed_target="$7" precision_target="$8" + local recall_target="$9" floor_hint="${10}" lang_filter="${11:-}" + local scan_report="/tmp/m7_corpus_${label}_scan.json" + local results_report="/tmp/m7_corpus_${label}_results.json" + local wallclock_report="/tmp/m7_corpus_${label}_wallclock.txt" + local gate_home="${TMPDIR:-/tmp}/nyx_m7_corpus_${label}_home" + local gate_build_pool="${TMPDIR:-/tmp}/nyx_m7_corpus_${label}_build_pool" local wallclock mkdir -p "${gate_home}" "${gate_build_pool}" @@ -344,7 +352,7 @@ _gate7_run_corpus() { set +e HOME="${gate_home}" \ NYX_BUILD_POOL_DIR="${gate_build_pool}" \ - python3 - "${GATE7_WALLCLOCK_BUDGET}" "${scan_report}" "${wallclock_report}" \ + python3 - "${wallclock_budget}" "${scan_report}" "${wallclock_report}" \ "${REPO_ROOT}/target/release/nyx" scan \ --verify \ --index off \ @@ -375,9 +383,9 @@ sys.exit(rc) PY local nyx_exit=$? 
set -e - wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${GATE7_WALLCLOCK_BUDGET}")" + wallclock="$(cat "${wallclock_report}" 2>/dev/null || printf "%s" "${wallclock_budget}")" - echo " ${label} verify wall-clock: ${wallclock}s (budget ${GATE7_WALLCLOCK_BUDGET}s)" + echo " ${label} verify wall-clock: ${wallclock}s (budget ${wallclock_budget}s)" if [[ ${nyx_exit} -eq 124 ]]; then echo " FAIL: ${label} scan exceeded wall-clock budget" @@ -391,38 +399,60 @@ PY echo " FAIL: ${label} scan produced no JSON report" return 1 fi - awk -v w="${wallclock}" -v b="${GATE7_WALLCLOCK_BUDGET}" \ + awk -v w="${wallclock}" -v b="${wallclock_budget}" \ 'BEGIN { if (w+0 > b+0) exit 1 }' \ || { echo " FAIL: ${label} wall-clock exceeds budget"; return 1; } echo "[]" > "${results_report}" - python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" \ - --label "${label}" \ - --scan "${scan_report}" \ - --ground-truth "${gt}" \ - --append "${results_report}" \ + local -a tabulate_args=( + --label "${label}" + --scan "${scan_report}" + --ground-truth "${gt}" + --append "${results_report}" + ) + if [[ -n "${lang_filter}" ]]; then + tabulate_args+=(--lang "${lang_filter}") + echo " scoping tabulation to language(s): ${lang_filter}" + fi + python3 "${REPO_ROOT}/tests/eval_corpus/tabulate.py" "${tabulate_args[@]}" \ || { echo " FAIL: ${label} result tabulation failed"; return 1; } local -a report_args=( --results "${results_report}" - --budget "${GATE7_BUDGET}" + --budget "${budget_file}" ) - if [[ -n "${GATE7_FLOOR_CAPS}" ]]; then + if [[ -n "${floor_caps}" ]]; then report_args+=( - --floor-caps "${GATE7_FLOOR_CAPS}" - --min-confirmed-rate "${GATE7_CONFIRMED_RATE_TARGET}" - --min-precision "${GATE7_PRECISION_TARGET}" - --min-recall "${GATE7_RECALL_TARGET}" + --floor-caps "${floor_caps}" + --min-confirmed-rate "${confirmed_target}" + --min-precision "${precision_target}" + --min-recall "${recall_target}" ) - echo " enforcing per-cap floors (confirmed >= 
${GATE7_CONFIRMED_RATE_TARGET}, precision >= ${GATE7_PRECISION_TARGET}, recall >= ${GATE7_RECALL_TARGET}) on: ${GATE7_FLOOR_CAPS}" + echo " enforcing per-cap floors (confirmed >= ${confirmed_target}, precision >= ${precision_target}, recall >= ${recall_target}) on: ${floor_caps}" else - echo " per-cap confirmed/precision/recall: report-only (NYX_JSTS_FLOOR_CAPS unset)" + echo " per-cap confirmed/precision/recall: report-only (${floor_hint})" fi python3 "${REPO_ROOT}/tests/eval_corpus/report.py" "${report_args[@]}" \ || { echo " FAIL: ${label} per-cell budget exceeded or a gated per-cap floor missed"; return 1; } return 0 } +# ── Gate 7: JS/TS real-corpus acceptance (NodeGoat + Juice Shop) ────────────── + +# Phase 28 (Track R.1) mirror of Gate 6 for the JS/TS corpora. Same +# wall-clock split (10 min dev reference / 15 min CI) and the same +# report-only-by-default floor policy: NYX_JSTS_FLOOR_CAPS is empty, so the +# per-cap confirmed-rate / precision / recall numbers are published but gate +# nothing, while the per-(cap,lang) budget (unsupported_rate, +# false_confirmed_rate) is hard-enforced. Promote a cap into the floor set +# once it starts Confirming end to end. 
+GATE7_WALLCLOCK_BUDGET="${NYX_JSTS_WALLCLOCK_BUDGET_SECONDS:-900}" +GATE7_CONFIRMED_RATE_TARGET="${NYX_JSTS_CONFIRMED_RATE_TARGET:-0.40}" +GATE7_PRECISION_TARGET="${NYX_JSTS_PRECISION_TARGET:-0.85}" +GATE7_RECALL_TARGET="${NYX_JSTS_RECALL_TARGET:-0.40}" +GATE7_FLOOR_CAPS="${NYX_JSTS_FLOOR_CAPS:-}" +GATE7_BUDGET="${NYX_JSTS_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}" + gate_7_jsts_scale() { echo "── Gate 7: JS/TS real-corpus (NodeGoat + Juice Shop) verify acceptance ──" cargo build --release --quiet --features dynamic @@ -447,8 +477,13 @@ gate_7_jsts_scale() { fi any_ran=1 echo " ── ${name} (${corpus}) ──" - if _gate7_run_corpus "${name}" "${corpus}" \ - "${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}"; then + # No --lang scope: NodeGoat/Juice Shop are single-language (js/ts), so + # there is no cross-language asset noise to filter (unchanged Gate 7). + if _run_corpus_acceptance "${name}" "${corpus}" \ + "${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}" \ + "${GATE7_WALLCLOCK_BUDGET}" "${GATE7_BUDGET}" "${GATE7_FLOOR_CAPS}" \ + "${GATE7_CONFIRMED_RATE_TARGET}" "${GATE7_PRECISION_TARGET}" \ + "${GATE7_RECALL_TARGET}" "NYX_JSTS_FLOOR_CAPS unset" ""; then echo " PASS ${name}" else any_failed=1 @@ -464,6 +499,76 @@ gate_7_jsts_scale() { echo " PASS" } +# ── Gate 8: Polyglot real-corpus acceptance (Track R.2 / Phase 29) ──────────── + +# RailsGoat (Rails, .rb) + DVWA (PHP) + DVPWA (aiohttp, .py) + gosec (Go) + +# the RustSec advisory-db (Rust negative control). Same wall-clock split and +# the same report-only-by-default floor policy as Gates 6/7: the per-(cap,lang) +# budget in tests/eval_corpus/budget.toml is hard-enforced, while per-cap +# confirmed-rate / precision / recall are published but gate nothing until +# NYX_POLYGLOT_FLOOR_CAPS names a cap. Each row self-skips unless its +# corpus env var points at a real checkout. 
The RustSec row is a NEGATIVE +# CONTROL: advisory-db ships advisory metadata, not vulnerable source, so its +# ground truth is empty by construction and the row asserts nyx Confirms +# nothing there (false_confirmed_rate guard). +GATE8_WALLCLOCK_BUDGET="${NYX_POLYGLOT_WALLCLOCK_BUDGET_SECONDS:-900}" +GATE8_CONFIRMED_RATE_TARGET="${NYX_POLYGLOT_CONFIRMED_RATE_TARGET:-0.40}" +GATE8_PRECISION_TARGET="${NYX_POLYGLOT_PRECISION_TARGET:-0.85}" +GATE8_RECALL_TARGET="${NYX_POLYGLOT_RECALL_TARGET:-0.40}" +GATE8_FLOOR_CAPS="${NYX_POLYGLOT_FLOOR_CAPS:-}" +GATE8_BUDGET="${NYX_POLYGLOT_BUDGET:-${REPO_ROOT}/tests/eval_corpus/budget.toml}" + +gate_8_polyglot_scale() { + echo "── Gate 8: polyglot real-corpus (RailsGoat/DVWA/DVPWA/gosec/RustSec) verify acceptance ──" + cargo build --release --quiet --features dynamic + + # name : env var holding the corpus dir : committed ground-truth file : + # target language (tabulation is scoped to it so incidental other-language + # assets — e.g. vendored JS in the Rails / aiohttp apps — do not pollute + # the corpus's per-cap metrics). + local rows=( + "railsgoat:NYX_RAILSGOAT_CORPUS:railsgoat.json:ruby" + "dvwa:NYX_DVWA_CORPUS:dvwa.json:php" + "dvpwa:NYX_DVPWA_CORPUS:dvpwa.json:python" + "gosec:NYX_GOSEC_CORPUS:gosec.json:go" + "rustsec:NYX_RUSTSEC_CORPUS:rustsec.json:rust" + ) + local any_ran=0 any_failed=0 + for row in "${rows[@]}"; do + local name envvar gtfile lang + IFS=: read -r name envvar gtfile lang <<<"${row}" + # When --sets names a single corpus, only run that row. + if [[ -n "${SETS}" && "${SETS}" != "polyglot" && "${SETS}" != "${name}" ]]; then + continue + fi + local corpus="${!envvar:-}" + if [[ -z "${corpus}" || ! -d "${corpus}" ]]; then + echo " SKIP ${name}: set ${envvar} to a checkout to run this row." 
+ continue + fi + any_ran=1 + echo " ── ${name} (${corpus}) ──" + if _run_corpus_acceptance "${name}" "${corpus}" \ + "${REPO_ROOT}/tests/eval_corpus/ground_truth/${gtfile}" \ + "${GATE8_WALLCLOCK_BUDGET}" "${GATE8_BUDGET}" "${GATE8_FLOOR_CAPS}" \ + "${GATE8_CONFIRMED_RATE_TARGET}" "${GATE8_PRECISION_TARGET}" \ + "${GATE8_RECALL_TARGET}" "NYX_POLYGLOT_FLOOR_CAPS unset" "${lang}"; then + echo " PASS ${name}" + else + any_failed=1 + fi + done + + if [[ ${any_ran} -eq 0 ]]; then + echo " SKIP: no polyglot corpus configured (set NYX_RAILSGOAT_CORPUS /" + echo " NYX_DVWA_CORPUS / NYX_DVPWA_CORPUS / NYX_GOSEC_CORPUS / NYX_RUSTSEC_CORPUS)." + echo " (Gate 8 is Phase 29's headline acceptance for the polyglot real corpora.)" + return 0 + fi + [[ ${any_failed} -eq 0 ]] || return 1 + echo " PASS" +} + # ── Driver ──────────────────────────────────────────────────────────────────── declare -a FAILED=() @@ -483,6 +588,7 @@ run_gate 4 sarif_schema run_gate 5 layering run_gate 6 owasp_scale run_gate 7 jsts_scale +run_gate 8 polyglot_scale if [[ ${#FAILED[@]} -gt 0 ]]; then echo diff --git a/tests/eval_corpus/budget.toml b/tests/eval_corpus/budget.toml index 340da270..6a213134 100644 --- a/tests/eval_corpus/budget.toml +++ b/tests/eval_corpus/budget.toml @@ -200,3 +200,153 @@ cap = "crypto" lang = "typescript" unsupported_rate = 0.20 false_confirmed_rate = 0.02 + +# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ────────────── +# +# Phase 29 wires five more intentionally-vulnerable real corpora, one per +# remaining language family, into the same acceptance machinery as OWASP / +# NodeGoat / Juice Shop: +# +# * railsgoat — OWASP RailsGoat (Rails, .rb) +# * dvwa — Damn Vulnerable Web Application (PHP); ships graded +# source variants, so low.php = vuln and impossible.php = +# benign control — real vuln/benign PAIRS like OWASP. 
+# * dvpwa — Damn Vulnerable Python Web App (aiohttp, .py); its +# parameterized DAO siblings are benign controls for the +# one `%`-formatted SQL sink. +# * gosec — the Go SAST tool's own repo; the scannable, `// want`- +# annotated sample under goanalysis/testdata is the curated +# ground truth (its embedded-string rule samples are not +# scannable, so they are unlabelled). +# * rustsec — RustSec advisory-db: a NEGATIVE CONTROL. It ships +# advisory metadata, not vulnerable .rs source, so its +# ground truth is empty by construction; the row asserts the +# Rust scan/verify path runs at scale within wall-clock and +# Confirms NOTHING (any Confirmed Rust finding there is a +# false confirm and trips the default false_confirmed_rate). +# +# Each row is gated with the SAME policy as Gates 6/7 (scripts/m7_ship_gate.sh +# Gate 8): wall-clock + the per-(cap,lang) budget below are HARD-enforced; +# per-cap confirmed-rate / precision / recall are published report-only +# (NYX_POLYGLOT_FLOOR_CAPS empty by default). Because each corpus targets a +# single language, Gate 8 scopes tabulation to that language (tabulate.py +# --lang), so the vendored third-party JavaScript these Ruby/Python apps +# bundle (bootstrap-colorpicker, materialize, …) — which nyx confirms as +# prototype_pollution — does not pollute the corpus's per-cap metrics. Those +# JS findings are still emitted; they are simply out of scope for a Ruby / +# Python corpus. +# +# Calibrated against the pinned corpora (nyx HEAD of the Phase 29 branch, +# 2026-05-31) with `nyx scan --verify --index off`. Measured frontier +# (target-language scope): every curated cell sits at <= the headline maxima +# below EXCEPT cmdi, where findings that carry a SHELL_ESCAPE sanitizer cap +# are routed to Unsupported(SoundOracleUnavailable) (every ruby and go +# finding, ~69% of php findings) — the same +# no-sound-oracle treatment OWASP's crypto/auth cells get. 
RailsGoat's +# deserialize (Marshal.load) and redirect (open redirect) cells Confirm end to +# end with zero false confirms — the first real polyglot confirms. + +# railsgoat (ruby): caps with a ground-truth label in railsgoat.manifest.toml. +[[cell]] +cap = "auth" +lang = "ruby" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "crypto" +lang = "ruby" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "deserialize" +lang = "ruby" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "redirect" +lang = "ruby" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "path_traversal" +lang = "ruby" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +# cmdi/ruby is incidental (RailsGoat's `self.try(params[:graph])` reflection +# sink); the lone finding carries a SHELL_ESCAPE sanitizer cap and routes to +# Unsupported(SoundOracleUnavailable), so unsupported_rate is locked at the +# measured frontier (1/1). The false-confirm guard stays at the headline 2%. +[[cell]] +cap = "cmdi" +lang = "ruby" +unsupported_rate = 1.00 +false_confirmed_rate = 0.02 + +# dvwa (php): caps with a ground-truth label in dvwa.manifest.toml. +[[cell]] +cap = "sqli" +lang = "php" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "redirect" +lang = "php" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "header_injection" +lang = "php" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +# cmdi/php: DVWA's ping handlers reach shell_exec through a SHELL_ESCAPE +# sanitizer cap, so ~69% of the cell's findings route to +# Unsupported(SoundOracleUnavailable). unsupported_rate is locked to that +# frontier with margin (a regression above 75% fails); false-confirm at 2%. +[[cell]] +cap = "cmdi" +lang = "php" +unsupported_rate = 0.75 +false_confirmed_rate = 0.02 + +# dvpwa (python): caps with a ground-truth label in dvpwa.manifest.toml. 
+[[cell]] +cap = "sqli" +lang = "python" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "crypto" +lang = "python" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +[[cell]] +cap = "auth" +lang = "python" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +# gosec (go): caps with a ground-truth label in gosec.manifest.toml. +[[cell]] +cap = "crypto" +lang = "go" +unsupported_rate = 0.20 +false_confirmed_rate = 0.02 + +# cmdi/go: the goanalysis/testdata exec.Command sample reaches the sink +# through a SHELL_ESCAPE sanitizer cap, so every cmdi/go finding routes to +# Unsupported(SoundOracleUnavailable). unsupported_rate locked to the +# measured frontier (3/3); false-confirm at the headline 2%. +[[cell]] +cap = "cmdi" +lang = "go" +unsupported_rate = 1.00 +false_confirmed_rate = 0.02 diff --git a/tests/eval_corpus/ground_truth/README.md b/tests/eval_corpus/ground_truth/README.md index 663a0be6..3e09583a 100644 --- a/tests/eval_corpus/ground_truth/README.md +++ b/tests/eval_corpus/ground_truth/README.md @@ -69,3 +69,38 @@ known vulns) is the meaningful metric; precision vs this partial ground truth is informational. Gate 7 publishes per-cap precision/recall/confirmed report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP gate. + +## Polyglot real corpora (Ruby/PHP/Python/Go/Rust — Track R.2) + +Phase 29 wires the remaining language families into the same machinery, one +corpus per family, each with a curated `*.manifest.toml` → committed `*.json`: + +- `railsgoat.{manifest.toml,json}` — OWASP RailsGoat (Rails, `.rb`). +- `dvwa.{manifest.toml,json}` — Damn Vulnerable Web Application (PHP). DVWA + ships graded source variants (`source/{low,impossible}.php`), so this is + the one Track R corpus besides OWASP with real vuln/benign **pairs** + (`low.php` = vuln, `impossible.php` = benign control) — precision is + meaningful here, not just informational. 
+- `dvpwa.{manifest.toml,json}` — Damn Vulnerable Python Web App (aiohttp, + `.py`). Its parameterized DAO siblings are benign controls for the one + `%`-formatted SQL sink. +- `gosec.{manifest.toml,json}` — the gosec Go SAST tool repo; the scannable, + `// want`-annotated sample under `goanalysis/testdata` is the curated + ground truth (gosec's string-embedded rule samples are not scannable, so + they are deliberately unlabelled). +- `rustsec.{manifest.toml,json}` — RustSec advisory-db, a **negative + control**. advisory-db ships advisory metadata, not vulnerable `.rs` + source, so its committed ground truth is empty (`[]`) by construction. The + manifest sets `negative_control = true` (mutually exclusive with + `[[entry]]` tables); `manifest_gt_convert.py` emits the empty JSON and the + row asserts the Rust scan/verify path runs at scale within wall-clock and + Confirms nothing there (any Confirmed Rust finding is a false confirm). + +These are converted, validated and asserted-in-sync exactly like NodeGoat / +Juice Shop (the `polyglot` job in `.github/workflows/eval.yml`). Because each +corpus targets a single language, Gate 8 scopes tabulation to that language +(`tabulate.py --lang`) so the vendored third-party JavaScript these Ruby / +Python apps bundle does not pollute their per-cap metrics. Gate 8 publishes +per-cap precision/recall/confirmed report-only by default +(`NYX_POLYGLOT_FLOOR_CAPS` empty), matching the OWASP and JS/TS gates. See +`tests/eval_corpus/budget.toml` for the per-(cap,lang) gate policy. 
diff --git a/tests/eval_corpus/ground_truth/dvpwa.json b/tests/eval_corpus/ground_truth/dvpwa.json new file mode 100644 index 00000000..f1f764eb --- /dev/null +++ b/tests/eval_corpus/ground_truth/dvpwa.json @@ -0,0 +1,38 @@ +[ + { + "path": "sqli/dao/course.py", + "line": 0, + "cap": "sqli", + "vuln": false + }, + { + "path": "sqli/dao/mark.py", + "line": 0, + "cap": "sqli", + "vuln": false + }, + { + "path": "sqli/dao/review.py", + "line": 0, + "cap": "sqli", + "vuln": false + }, + { + "path": "sqli/dao/student.py", + "line": 0, + "cap": "sqli", + "vuln": true + }, + { + "path": "sqli/dao/user.py", + "line": 0, + "cap": "crypto", + "vuln": true + }, + { + "path": "sqli/views.py", + "line": 0, + "cap": "auth", + "vuln": true + } +] diff --git a/tests/eval_corpus/ground_truth/dvpwa.manifest.toml b/tests/eval_corpus/ground_truth/dvpwa.manifest.toml new file mode 100644 index 00000000..af005801 --- /dev/null +++ b/tests/eval_corpus/ground_truth/dvpwa.manifest.toml @@ -0,0 +1,70 @@ +# DVPWA (Damn Vulnerable Python Web Application) — curated ground-truth +# manifest (Phase 29, Track R.2). +# +# DVPWA is an intentionally-vulnerable aiohttp app whose headline flaw is +# SQL injection (the package is literally named `sqli`). It ships no +# machine-readable per-file labels, so this manifest IS the authoritative +# source. Its DAO layer is convenient: one method builds a query with +# Python `%` string-formatting (the injectable sink) while its siblings use +# proper parameterized `cur.execute(q, params)` — so the parameterized DAOs +# serve as genuine benign controls (vuln = false) for the sqli cell, making +# precision there meaningful, not just informational. +# +# tests/eval_corpus/manifest_gt_convert.py turns this into the committed +# ground_truth/dvpwa.json. 
CI regenerates it against a fresh clone of the +# pinned ref and asserts the parsed JSON equals the committed file; the converter HARD-ERRORS on any +# path that no longer exists, so a corpus bump that moves a DAO fails the +# job loudly rather than silently dropping recall. +# +# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies each +# sink (the request-scoped ownership lookups in views.py surface as `auth`). +# `path` is relative to the DVPWA clone root, POSIX separators. Lang is +# inferred from the extension (.py -> python). See +# tests/eval_corpus/budget.toml for the gate policy on these cells. + +corpus = "dvpwa" +upstream = "https://github.com/anxolerd/dvpwa" +# DVPWA publishes no release tags; the eval job pins the default branch via +# the CI cache key (clone HEAD a1d8f89fac2e57093189853c6527c2b01fc1d9c1). +# The sqli/ package layout has been stable; re-validate if the cache key is +# bumped. +pinned_ref = "master" + +# ── SQL injection (sqli) — one injectable sink + parameterized controls ────── +[[entry]] +path = "sqli/dao/student.py" +cap = "sqli" +vuln = true +note = "Student.create builds the INSERT with Python `%` formatting (\"... VALUES ('%(name)s')\" % {'name': name}) on the request-supplied student name, then cur.execute(q) — SQL injection." + +[[entry]] +path = "sqli/dao/course.py" +cap = "sqli" +vuln = false +note = "benign control: every Course query uses parameterized cur.execute(q, params) / VALUES (%(title)s, %(description)s) — not injectable." + +[[entry]] +path = "sqli/dao/review.py" +cap = "sqli" +vuln = false +note = "benign control: Review.create / get_for_course bind via cur.execute(q, params) with %(course_id)s / %s placeholders — parameterized." + +[[entry]] +path = "sqli/dao/mark.py" +cap = "sqli" +vuln = false +note = "benign control: Mark.create / get_for_student bind via parameterized cur.execute(q, params) — not injectable." 
+ +# ── Weak crypto (crypto) ───────────────────────────────────────────────────── +[[entry]] +path = "sqli/dao/user.py" +cap = "crypto" +vuln = true +note = "User.check_password compares against md5(password).hexdigest() — unsalted MD5 for credential storage (weak cryptography)." + +# ── Broken access control (auth) ───────────────────────────────────────────── +[[entry]] +path = "sqli/views.py" +cap = "auth" +vuln = true +note = "request handlers resolve the acting user from a client-controlled session id and act on objects without an ownership/authorization check — broken access control." diff --git a/tests/eval_corpus/ground_truth/dvwa.json b/tests/eval_corpus/ground_truth/dvwa.json new file mode 100644 index 00000000..3431ff5a --- /dev/null +++ b/tests/eval_corpus/ground_truth/dvwa.json @@ -0,0 +1,50 @@ +[ + { + "path": "vulnerabilities/exec/source/impossible.php", + "line": 0, + "cap": "cmdi", + "vuln": false + }, + { + "path": "vulnerabilities/exec/source/low.php", + "line": 0, + "cap": "cmdi", + "vuln": true + }, + { + "path": "vulnerabilities/open_redirect/source/impossible.php", + "line": 0, + "cap": "header_injection", + "vuln": false + }, + { + "path": "vulnerabilities/open_redirect/source/impossible.php", + "line": 0, + "cap": "redirect", + "vuln": false + }, + { + "path": "vulnerabilities/open_redirect/source/low.php", + "line": 0, + "cap": "header_injection", + "vuln": true + }, + { + "path": "vulnerabilities/open_redirect/source/low.php", + "line": 0, + "cap": "redirect", + "vuln": true + }, + { + "path": "vulnerabilities/sqli/source/impossible.php", + "line": 0, + "cap": "sqli", + "vuln": false + }, + { + "path": "vulnerabilities/sqli/source/low.php", + "line": 0, + "cap": "sqli", + "vuln": true + } +] diff --git a/tests/eval_corpus/ground_truth/dvwa.manifest.toml b/tests/eval_corpus/ground_truth/dvwa.manifest.toml new file mode 100644 index 00000000..9cab6759 --- /dev/null +++ b/tests/eval_corpus/ground_truth/dvwa.manifest.toml @@ -0,0 +1,84 @@ +# 
DVWA (Damn Vulnerable Web Application) — curated ground-truth manifest +# (Phase 29, Track R.2). +# +# DVWA is an intentionally-vulnerable PHP app. Unlike the other Track R +# apps it ships its vulnerabilities as graded source variants under +# vulnerabilities/<module>/source/{low,medium,high,impossible}.php, where +# `low.php` is the textbook-vulnerable handler and `impossible.php` is the +# hardened, secure rewrite of the SAME sink. That gives DVWA real +# vuln/benign PAIRS (low = vuln, impossible = benign control) the way OWASP +# Benchmark does — so precision against this manifest is meaningful, not +# just informational: a Confirmed finding on an `impossible.php` control is +# a genuine false confirm. +# +# tests/eval_corpus/manifest_gt_convert.py turns this into the committed +# ground_truth/dvwa.json. CI regenerates it against a fresh clone of the +# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any +# path that no longer exists, so a DVWA bump that restructures a module +# fails loudly rather than silently dropping recall. Re-pin `pinned_ref` +# and re-validate the paths together. +# +# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies the +# sink. `path` is relative to the DVWA clone root, POSIX separators. Lang +# is inferred from the extension (.php -> php). See +# tests/eval_corpus/budget.toml for the gate policy on these cells. + +corpus = "dvwa" +upstream = "https://github.com/digininja/DVWA" +# Pinned to release tag 2.5 (clone HEAD +# a96943dc1f52f390ee5df72144660636c4b7dd06). The +# vulnerabilities/<module>/source/{low,impossible}.php layout has been stable +# for years; re-validate if the tag is bumped. +pinned_ref = "2.5" + +# ── SQL injection (sqli) ───────────────────────────────────────────────────── +[[entry]] +path = "vulnerabilities/sqli/source/low.php" +cap = "sqli" +vuln = true +note = "id = $_REQUEST['id'] is concatenated straight into \"... 
WHERE user_id = '$id'\" and run via mysqli_query — classic SQL injection." + +[[entry]] +path = "vulnerabilities/sqli/source/impossible.php" +cap = "sqli" +vuln = false +note = "benign control: same query via PDO prepare + bindParam(:id, PDO::PARAM_INT) with is_numeric/intval validation — parameterized, not injectable." + +# ── OS command injection (cmdi) ────────────────────────────────────────────── +[[entry]] +path = "vulnerabilities/exec/source/low.php" +cap = "cmdi" +vuln = true +note = "target = $_REQUEST['ip'] is concatenated into shell_exec('ping -c 4 ' . $target) with no validation — OS command injection." + +[[entry]] +path = "vulnerabilities/exec/source/impossible.php" +cap = "cmdi" +vuln = false +note = "benign control: the IP is split into 4 octets and each is_numeric-checked before being reassembled and passed to shell_exec — not injectable." + +# ── Open redirect (redirect) ───────────────────────────────────────────────── +[[entry]] +path = "vulnerabilities/open_redirect/source/low.php" +cap = "redirect" +vuln = true +note = "header('location: ' . $_GET['redirect']) forwards to an unvalidated user-supplied URL — open redirect." + +[[entry]] +path = "vulnerabilities/open_redirect/source/impossible.php" +cap = "redirect" +vuln = false +note = "benign control: redirect target is chosen by an integer switch on is_numeric($_GET['redirect']) — no user-controlled URL reaches the Location header." + +# ── CRLF / HTTP header injection (header_injection) ────────────────────────── +[[entry]] +path = "vulnerabilities/open_redirect/source/low.php" +cap = "header_injection" +vuln = true +note = "the same unvalidated $_GET['redirect'] flows into a raw header() call, so CRLF in the value splits/injects response headers — HTTP header injection." 
+ +[[entry]] +path = "vulnerabilities/open_redirect/source/impossible.php" +cap = "header_injection" +vuln = false +note = "benign control: only a fixed, integer-selected target string reaches header() — no user bytes, so no CRLF injection." diff --git a/tests/eval_corpus/ground_truth/gosec.json b/tests/eval_corpus/ground_truth/gosec.json new file mode 100644 index 00000000..4467831b --- /dev/null +++ b/tests/eval_corpus/ground_truth/gosec.json @@ -0,0 +1,14 @@ +[ + { + "path": "goanalysis/testdata/src/a/basic_output.go", + "line": 0, + "cap": "cmdi", + "vuln": true + }, + { + "path": "goanalysis/testdata/src/a/basic_output.go", + "line": 0, + "cap": "crypto", + "vuln": true + } +] diff --git a/tests/eval_corpus/ground_truth/gosec.manifest.toml b/tests/eval_corpus/ground_truth/gosec.manifest.toml new file mode 100644 index 00000000..a335d58c --- /dev/null +++ b/tests/eval_corpus/ground_truth/gosec.manifest.toml @@ -0,0 +1,42 @@ +# gosec — curated Go ground-truth manifest (Phase 29, Track R.2). +# +# gosec is the Go SAST tool; its repo doubles as the de-facto Go security +# corpus. Most of gosec's rule samples live as Go source embedded in +# backtick string literals inside testutils/g*_samples.go — those are NOT +# scannable by a taint analyzer (the vulnerable code is string data, not +# real AST), so they are deliberately NOT labelled here. gosec also ships a +# small set of REAL, compilable sample programs under goanalysis/testdata +# that carry the tool's OWN inline `// want 'GNNN ...'` expectations — that +# is the authoritative, scannable ground truth this manifest pins. +# +# Because the eval scans the whole gosec checkout (the tool's own source +# included), unlabelled findings are expected and are NOT false positives — +# precision against this manifest is informational, recall on the curated +# samples is the meaningful floor (same policy as the all-vulnerable apps; +# see tests/eval_corpus/budget.toml). 
+# +# tests/eval_corpus/manifest_gt_convert.py turns this into the committed +# ground_truth/gosec.json. CI regenerates it against a fresh clone of the +# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any +# path that no longer exists, so a gosec bump that moves the testdata fails +# the job loudly. `cap` is a nyx cap label (tabulate.py); `path` is relative +# to the gosec clone root, POSIX separators; lang is inferred (.go -> go). + +corpus = "gosec" +upstream = "https://github.com/securego/gosec" +# Pinned to release tag v2.26.1 (clone HEAD +# 4a3bd8af174872c778439083ded7adbf3747e770). goanalysis/testdata/src/a/ has +# been stable; re-validate if the tag is bumped. +pinned_ref = "v2.26.1" + +[[entry]] +path = "goanalysis/testdata/src/a/basic_output.go" +cap = "cmdi" +vuln = true +note = "VulnerableFunction runs exec.Command(\"sh\", \"-c\", getUserInput()) — subprocess launched with a non-constant argument (gosec's own `// want G204 [CWE-78]` expectation)." + +[[entry]] +path = "goanalysis/testdata/src/a/basic_output.go" +cap = "crypto" +vuln = true +note = "VulnerableFunction imports crypto/md5 and calls md5.New() — weak cryptographic primitive (gosec's own `// want G401/G501` expectations)." 
diff --git a/tests/eval_corpus/ground_truth/railsgoat.json b/tests/eval_corpus/ground_truth/railsgoat.json new file mode 100644 index 00000000..e3bcc5d3 --- /dev/null +++ b/tests/eval_corpus/ground_truth/railsgoat.json @@ -0,0 +1,56 @@ +[ + { + "path": "app/controllers/admin_controller.rb", + "line": 0, + "cap": "auth", + "vuln": true + }, + { + "path": "app/controllers/benefit_forms_controller.rb", + "line": 0, + "cap": "deserialize", + "vuln": true + }, + { + "path": "app/controllers/benefit_forms_controller.rb", + "line": 0, + "cap": "path_traversal", + "vuln": true + }, + { + "path": "app/controllers/messages_controller.rb", + "line": 0, + "cap": "auth", + "vuln": true + }, + { + "path": "app/controllers/password_resets_controller.rb", + "line": 0, + "cap": "crypto", + "vuln": true + }, + { + "path": "app/controllers/password_resets_controller.rb", + "line": 0, + "cap": "deserialize", + "vuln": true + }, + { + "path": "app/controllers/sessions_controller.rb", + "line": 0, + "cap": "redirect", + "vuln": true + }, + { + "path": "app/controllers/users_controller.rb", + "line": 0, + "cap": "auth", + "vuln": true + }, + { + "path": "app/models/user.rb", + "line": 0, + "cap": "crypto", + "vuln": true + } +] diff --git a/tests/eval_corpus/ground_truth/railsgoat.manifest.toml b/tests/eval_corpus/ground_truth/railsgoat.manifest.toml new file mode 100644 index 00000000..0f2609d8 --- /dev/null +++ b/tests/eval_corpus/ground_truth/railsgoat.manifest.toml @@ -0,0 +1,88 @@ +# OWASP RailsGoat — curated vuln ground-truth manifest (Phase 29, Track R.2). +# +# RailsGoat is an intentionally-vulnerable Ruby on Rails app that maps the +# OWASP Top 10 to concrete controllers/models. Like NodeGoat / Juice Shop +# (Phase 28) it ships no machine-readable per-file vuln labels, so this +# manifest IS the authoritative source: one [[entry]] per known-vulnerable +# location, curated from the project's own tutorial walk-throughs, each with +# a `note` citing why. 
+# +# tests/eval_corpus/manifest_gt_convert.py turns this into the committed +# ground_truth/railsgoat.json. CI regenerates it against a fresh clone of +# the pinned tag and asserts byte-equality, and the converter HARD-ERRORS on +# any path that no longer exists in the corpus, so a RailsGoat bump that +# moves a controller fails the eval job loudly rather than silently dropping +# recall. Update `pinned_ref` + the paths together when re-pinning. +# +# `cap` is a nyx cap label (tabulate.py); it is aligned with how nyx +# classifies the sink in each file (e.g. a missing ownership check on a +# direct-object lookup surfaces as `auth`, not `unauthorized_id`), so recall +# (did nyx catch the canonical vuln) is meaningful. `path` is relative to +# the RailsGoat clone root, POSIX separators. Lang is inferred from the +# extension (.rb -> ruby). All `vuln = true`: RailsGoat is all-vulnerable, +# so there is no benign-control file to pair against — precision vs this +# manifest is informational (an unlabelled finding may be a real uncurated +# vuln), while recall is the meaningful floor. See +# tests/eval_corpus/budget.toml for how the gate treats these cells. + +corpus = "railsgoat" +upstream = "https://github.com/OWASP/railsgoat" +# Pinned to the stable Rails 5 release tag (clone HEAD +# 0766ca80bf2d94acbde1dd4aaf7baf9b86afe4eb). The app/controllers + app/models +# layout below has been stable across this tag; re-validate the paths if the +# ref is bumped. +pinned_ref = "rails.5.0.0" + +[[entry]] +path = "app/controllers/users_controller.rb" +cap = "auth" +vuln = true +note = "update looks up the account with User.where(\"id = '#{params[:user][:id]}'\") and mass-assigns user_params (params.require(:user).permit!) with no ownership check — broken access control / mass-assignment privilege escalation (OWASP A4/A5)." 
+ +[[entry]] +path = "app/controllers/messages_controller.rb" +cap = "auth" +vuln = true +note = "show / destroy fetch Message.where(id: params[:id]) with no check that the message belongs to current_user — insecure direct object reference (OWASP A4 broken access control)." + +[[entry]] +path = "app/controllers/admin_controller.rb" +cap = "auth" +vuln = true +note = "administrative actions are gated by a bypassable admin_param check (params[:admin_id] != \"1\"); update_user / delete_user act on any admin_id — broken access control / privilege escalation (OWASP A5)." + +[[entry]] +path = "app/models/user.rb" +cap = "crypto" +vuln = true +note = "passwords are hashed with Digest::MD5.hexdigest (hash_password / authenticate) — unsalted weak hash for credential storage (OWASP A2 cryptographic failure)." + +[[entry]] +path = "app/controllers/password_resets_controller.rb" +cap = "crypto" +vuln = true +note = "generate_token derives the reset token as Digest::MD5.hexdigest(email) — a predictable, forgeable password-reset token (weak cryptography)." + +[[entry]] +path = "app/controllers/password_resets_controller.rb" +cap = "deserialize" +vuln = true +note = "reset_password runs Marshal.load(Base64.decode64(params[:user])) on attacker-controlled input — insecure deserialization leading to RCE (OWASP A8)." + +[[entry]] +path = "app/controllers/sessions_controller.rb" +cap = "redirect" +vuln = true +note = "create redirects to params[:url] with no allow-list (path = params[:url] then redirect_to path) — open redirect (OWASP unvalidated redirects)." + +[[entry]] +path = "app/controllers/benefit_forms_controller.rb" +cap = "path_traversal" +vuln = true +note = "download builds send_file from a user-controlled params[:name] path with no containment — arbitrary file read / path traversal." 
+ +[[entry]] +path = "app/controllers/benefit_forms_controller.rb" +cap = "deserialize" +vuln = true +note = "download calls params[:type].constantize.new(path), constantizing a user-supplied class name — unsafe reflection / object injection." diff --git a/tests/eval_corpus/ground_truth/rustsec.json b/tests/eval_corpus/ground_truth/rustsec.json new file mode 100644 index 00000000..fe51488c --- /dev/null +++ b/tests/eval_corpus/ground_truth/rustsec.json @@ -0,0 +1 @@ +[] diff --git a/tests/eval_corpus/ground_truth/rustsec.manifest.toml b/tests/eval_corpus/ground_truth/rustsec.manifest.toml new file mode 100644 index 00000000..8b429dc2 --- /dev/null +++ b/tests/eval_corpus/ground_truth/rustsec.manifest.toml @@ -0,0 +1,37 @@ +# RustSec advisory-db — Rust negative-control corpus (Phase 29, Track R.2). +# +# The plan's Rust real-corpus row is the RustSec advisory database. Unlike +# RailsGoat / DVWA / DVPWA / gosec, advisory-db ships advisory METADATA +# (TOML + Markdown under crates/<crate>/RUSTSEC-*.md), not vulnerable Rust +# SOURCE. A static scan of it therefore encounters zero `.rs` files and nyx +# correctly produces zero findings — so there are no source-level vuln +# positives to label, and no canonical scannable "RustGoat" exists to +# substitute without fabricating paths (which the CI byte-equality + path +# existence guards would reject outright). +# +# advisory-db is still worth pinning and scanning as a NEGATIVE CONTROL for +# the Rust language path: +# * it exercises the Rust scan + verify pipeline (Phase 23 Rust build +# pool) end to end on a large real-world tree (thousands of files) and +# asserts it stays within the wall-clock budget without crashing, and +# * it is an over-confirmation guard: nyx must Confirm NOTHING on a corpus +# with no real source vulns. 
Any Confirmed finding here is provably a +# false confirm and trips the per-cell false_confirmed_rate budget +# (tests/eval_corpus/budget.toml) — a genuine regression sentinel if a +# future change makes nyx treat advisory text as scannable code. +# +# `negative_control = true` tells manifest_gt_convert.py to emit an empty +# `[]` ground truth. It is mutually exclusive with `[[entry]]` tables, so a +# real Rust vuln can never be silently hidden behind the flag. When a +# scannable advisory-backed Rust corpus (a vulnerable crate pinned at its +# affected version with a source-level taint sink) is curated, drop the flag +# and add [[entry]] tables here exactly as the other Track R.2 manifests do. + +corpus = "rustsec" +upstream = "https://github.com/rustsec/advisory-db" +# advisory-db publishes no release tags; the eval job pins the default +# branch via the CI cache key (clone HEAD +# eaf48e749baa3d5e27d304107d8abf175fd756bb). +pinned_ref = "main" + +negative_control = true diff --git a/tests/eval_corpus/manifest_gt_convert.py b/tests/eval_corpus/manifest_gt_convert.py index 792338ad..0ddfefe6 100755 --- a/tests/eval_corpus/manifest_gt_convert.py +++ b/tests/eval_corpus/manifest_gt_convert.py @@ -23,6 +23,19 @@ Manifest schema (TOML):: vuln = true # true = real vuln, false = benign control note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)" +Negative-control corpora. A few real corpora carry **no** scannable +source-level vulnerabilities of their own — most notably the RustSec +`advisory-db`, which ships advisory *metadata* (TOML/Markdown), not +vulnerable `.rs` source. Such a corpus has zero ground-truth positives by +construction, yet it is still worth scanning: it exercises the language's +scan + verify path end to end on a large real-world tree and acts as an +over-confirmation guard (nyx must Confirm nothing on a corpus with no real +source vulns). 
Declare it with a top-level ``negative_control = true`` and +**zero** ``[[entry]]`` tables; the converter then emits an empty ``[]`` +ground truth. ``negative_control`` and ``[[entry]]`` are mutually +exclusive — a manifest that sets the flag *and* lists entries is rejected, +so a real vuln can never be silently dropped behind the flag. + Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}` records, sorted by `(path, cap)` for deterministic, diff-stable JSON. `note` is intentionally dropped — the ground-truth JSON keeps the exact @@ -119,7 +132,15 @@ def main() -> int: manifest = load_manifest(Path(args.manifest).expanduser()) entries = manifest.get("entry", []) or [] - if not entries: + negative_control = bool(manifest.get("negative_control", False)) + if negative_control and entries: + print( + f"error: negative_control manifest must declare zero [[entry]] " + f"tables (found {len(entries)}): {args.manifest}", + file=sys.stderr, + ) + return 1 + if not entries and not negative_control: print(f"error: manifest has no [[entry]] tables: {args.manifest}", file=sys.stderr) return 1 @@ -184,6 +205,8 @@ def main() -> int: vuln_count = sum(1 for r in records if r["vuln"]) print(f"wrote {len(records)} records to {out}") + if negative_control: + print(" negative-control corpus: zero ground-truth positives by construction") print(f" vulns: {vuln_count}") print(f" non-vuln: {len(records) - vuln_count}") if corpus is not None: diff --git a/tests/eval_corpus/run.sh b/tests/eval_corpus/run.sh index 5ff19001..e0dd40fe 100755 --- a/tests/eval_corpus/run.sh +++ b/tests/eval_corpus/run.sh @@ -28,7 +28,7 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" OUTPUT_DIR="" NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}" CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}" -SETS="owasp,sard,nodegoat,juiceshop,inhouse" +SETS="owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse" # Optional per-cell budgets and monotonic-improvement diff. BUDGET_FILE="" DIFF_FILE="" @@ -90,6 +90,42 @@ run_jsts_corpus() { || info " tabulate.py failed on $label; ground truth file may be absent" } +# Scan one Track R.2 polyglot real corpus and tabulate it against its +# committed ground truth, SCOPED to its target language (tabulate --lang) so +# incidental other-language assets (e.g. vendored JS in a Rails / aiohttp app) +# do not pollute the corpus's per-cap metrics. Self-skips when the corpus has +# not been cloned into the cache; prints the exact clone command if so. +# $1 label $2 dir $3 ground-truth json $4 target lang $5 repo $6 ref +run_polyglot_corpus() { + local label="$1" dir="$2" gt="$3" lang="$4" repo="$5" ref="$6" + if [[ ! -d "$dir" ]]; then + info "Bootstrapping $label..." + info " git clone --depth 1 --branch ${ref} ${repo} ${dir}" + info "Skipping $label set (not yet downloaded)." + return 0 + fi + info "Running nyx scan on $label (lang scope: ${lang})..." + set +e + "$NYX_BIN" scan --format json --verify --no-index "$dir" \ + > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr" + local rc=$? 
+ set -e + if [[ $rc -ne 0 && $rc -ne 1 ]]; then + info " nyx exited $rc on $label set (stderr follows):" + cat "/tmp/nyx_${label}.stderr" >&2 + return 0 + fi + python3 "${SCRIPT_DIR}/tabulate.py" \ + --label "$label" \ + --scan "/tmp/nyx_${label}.json" \ + --ground-truth "$gt" \ + --lang "$lang" \ + --append "$RESULTS_JSON" \ + ${BUDGET_FILE:+--budget "$BUDGET_FILE"} \ + ${DIFF_FILE:+--diff "$DIFF_FILE"} \ + || info " tabulate.py failed on $label; ground truth file may be absent" +} + [[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN" mkdir -p "$CORPUS_CACHE" @@ -143,6 +179,35 @@ if [[ "$SETS" == *juiceshop* ]]; then "${SCRIPT_DIR}/ground_truth/juiceshop.json" fi +# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ─────────────── +if [[ "$SETS" == *railsgoat* ]]; then + run_polyglot_corpus railsgoat "${CORPUS_CACHE}/railsgoat" \ + "${SCRIPT_DIR}/ground_truth/railsgoat.json" ruby \ + https://github.com/OWASP/railsgoat rails.5.0.0 +fi +if [[ "$SETS" == *dvwa* ]]; then + run_polyglot_corpus dvwa "${CORPUS_CACHE}/dvwa" \ + "${SCRIPT_DIR}/ground_truth/dvwa.json" php \ + https://github.com/digininja/DVWA 2.5 +fi +if [[ "$SETS" == *dvpwa* ]]; then + run_polyglot_corpus dvpwa "${CORPUS_CACHE}/dvpwa" \ + "${SCRIPT_DIR}/ground_truth/dvpwa.json" python \ + https://github.com/anxolerd/dvpwa master +fi +if [[ "$SETS" == *gosec* ]]; then + run_polyglot_corpus gosec "${CORPUS_CACHE}/gosec" \ + "${SCRIPT_DIR}/ground_truth/gosec.json" go \ + https://github.com/securego/gosec v2.26.1 +fi +# RustSec advisory-db is the Rust negative control (empty ground truth): the +# row asserts the Rust scan/verify path runs and Confirms nothing there. 
+if [[ "$SETS" == *rustsec* ]]; then + run_polyglot_corpus rustsec "${CORPUS_CACHE}/rustsec" \ + "${SCRIPT_DIR}/ground_truth/rustsec.json" rust \ + https://github.com/rustsec/advisory-db main +fi + # ── NIST SARD subset bootstrap ──────────────────────────────────────────────── SARD_DIR="${CORPUS_CACHE}/nist_sard" if [[ "$SETS" == *sard* ]]; then diff --git a/tests/eval_corpus/run_full.sh b/tests/eval_corpus/run_full.sh index 948d1642..ecd142e7 100755 --- a/tests/eval_corpus/run_full.sh +++ b/tests/eval_corpus/run_full.sh @@ -3,6 +3,7 @@ # # Drives a complete pass against every corpus set the project knows about # (OWASP Benchmark v1.2, the NIST SARD subset, OWASP NodeGoat + Juice Shop, +# the Track R.2 polyglot corpora — RailsGoat / DVWA / DVPWA / gosec / RustSec — # and the Nyx benchmark fixtures), then emits `tests/eval_corpus/results.json` # for reports, diffs, and docs. # @@ -70,7 +71,7 @@ set +e NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \ bash "${SCRIPT_DIR}/run.sh" \ --nyx "$NYX_BIN" \ - --sets owasp,sard,nodegoat,juiceshop,inhouse \ + --sets owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse \ --output "$OUTPUT_DIR" \ --budget "$BUDGET_FILE" \ ${DIFF_FILE:+--diff "$DIFF_FILE"} diff --git a/tests/eval_corpus/tabulate.py b/tests/eval_corpus/tabulate.py index 09b19621..e537dc9f 100644 --- a/tests/eval_corpus/tabulate.py +++ b/tests/eval_corpus/tabulate.py @@ -362,15 +362,34 @@ def main() -> int: default="", help="path to budget.toml (per-(cap,lang) thresholds)", ) + p.add_argument( + "--lang", + default="", + help=( + "comma-separated language allowlist (python, javascript, php, " + "ruby, go, rust, ...). When set, only findings AND ground-truth " + "entries whose source language is in the list are tabulated; " + "everything else is dropped before tallying. Used by the Phase 29 " + "polyglot corpora (Track R.2) to scope a single-language corpus to " + "its target language so incidental third-party assets in other " + "languages — e.g. 
the vendored JavaScript a Rails or aiohttp app " + "bundles — do not pollute that corpus's per-cap metrics. Empty = " + "no language filter (every finding tabulated, the OWASP/JSTS " + "default)." + ), + ) p.add_argument( "--diff", default="", help="path to a previous results JSON; fail on monotonic-improvement regression", ) args = p.parse_args() + lang_filter = {l.strip() for l in args.lang.split(",") if l.strip()} scan_data = load_json(args.scan) findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", []) + if lang_filter: + findings = [f for f in findings if lang_of(f) in lang_filter] # ── Manual-triage stamping (Phase 31 follow-up) ─────────────────────── # Cross-reference Confirmed rows against a manual-triage file before @@ -463,6 +482,10 @@ def main() -> int: # Ground truth format: list of {"path": ..., "line": ..., "cap": ..., "vuln": bool} gt_true: list[dict] = [] for entry in gt if isinstance(gt, list) else []: + # Honour the same language scope as the findings filter so recall + # is measured only over the corpus's target language. + if lang_filter and lang_of(entry) not in lang_filter: + continue if entry.get("vuln"): gt_true.append({ "path": entry.get("path", ""), diff --git a/tests/eval_corpus/test_manifest_gt_convert.py b/tests/eval_corpus/test_manifest_gt_convert.py index 729adde2..f7826022 100644 --- a/tests/eval_corpus/test_manifest_gt_convert.py +++ b/tests/eval_corpus/test_manifest_gt_convert.py @@ -168,7 +168,16 @@ def test_committed_gt_matches_manifest(tmp: Path) -> None: # Offline half of the CI in-sync guard: the committed ground-truth JSON # must be exactly what a fresh conversion of its manifest produces. This # catches a manifest edit that was not followed by a regenerate. - for name in ("nodegoat", "juiceshop"): + for name in ( + "nodegoat", + "juiceshop", + # Track R.2 polyglot corpora (Phase 29). 
+ "railsgoat", + "dvwa", + "dvpwa", + "gosec", + "rustsec", + ): man = GT_DIR / f"{name}.manifest.toml" committed = GT_DIR / f"{name}.json" assert man.exists(), f"missing manifest: {man}" @@ -181,6 +190,39 @@ def test_committed_gt_matches_manifest(tmp: Path) -> None: ) +def test_negative_control_emits_empty(tmp: Path) -> None: + # A negative-control manifest (no scannable source vulns, e.g. RustSec + # advisory-db) declares `negative_control = true` and zero [[entry]] + # tables; the converter emits an empty `[]` ground truth. + man = tmp / "neg.manifest.toml" + man.write_text( + 'corpus = "rustsec"\n' + 'upstream = "https://example.test/advisory-db"\n' + 'pinned_ref = "main"\n' + "negative_control = true\n" + ) + out = tmp / "neg.json" + proc = run_convert("--manifest", str(man), "--output", str(out)) + assert proc.returncode == 0, proc.stdout + proc.stderr + assert json.loads(out.read_text()) == [], out.read_text() + assert "negative-control corpus" in proc.stdout, proc.stdout + + +def test_negative_control_with_entries_rejected(tmp: Path) -> None: + # negative_control and [[entry]] are mutually exclusive: a manifest that + # sets the flag yet lists a vuln must be rejected so a real positive can + # never be silently hidden behind the flag. 
+ man = tmp / "neg_bad.manifest.toml" + man.write_text( + "negative_control = true\n" + '[[entry]]\npath = "a.rs"\ncap = "cmdi"\nvuln = true\n' + ) + out = tmp / "neg_bad.json" + proc = run_convert("--manifest", str(man), "--output", str(out)) + assert proc.returncode == 1, proc.stdout + proc.stderr + assert "negative_control" in proc.stderr and "zero" in proc.stderr, proc.stderr + + def main() -> int: with tempfile.TemporaryDirectory() as td: tmp = Path(td) @@ -193,6 +235,8 @@ def main() -> int: test_malformed_manifest_exits_1, test_empty_manifest_exits_1, test_committed_gt_matches_manifest, + test_negative_control_emits_empty, + test_negative_control_with_entries_rejected, ): sub = tmp / fn.__name__ sub.mkdir() diff --git a/tests/eval_corpus/test_tabulate_regression.py b/tests/eval_corpus/test_tabulate_regression.py index 8bba6758..7398b978 100644 --- a/tests/eval_corpus/test_tabulate_regression.py +++ b/tests/eval_corpus/test_tabulate_regression.py @@ -294,6 +294,65 @@ def test_manual_triage_ignores_vuln_true_entries(tmp: Path) -> None: ) +def test_lang_filter_scopes_findings_and_gt(tmp: Path) -> None: + # Phase 29 (Track R.2): --lang scopes a single-language corpus to its + # target language so incidental other-language assets (e.g. the vendored + # JavaScript a Rails app bundles, which nyx flags as prototype_pollution) + # do not pollute the corpus's per-cap metrics. The filter must drop both + # findings AND ground-truth entries outside the scope. + gt = tmp / "gt.json" + write_json( + gt, + [ + {"path": "app/models/user.rb", "line": 0, "cap": "sqli", "vuln": True}, + {"path": "app/assets/lib.js", "line": 0, "cap": "sqli", "vuln": True}, + ], + ) + scan = tmp / "scan.json" + write_json( + scan, + { + "findings": [ + python_finding(SINK_BIT_SQL, "/x/app/models/user.rb", 10, "NotConfirmed"), + # A vendored-JS finding nyx would otherwise Confirm — must be + # excluded entirely under `--lang ruby`. 
+ python_finding(SINK_BIT_SQL, "/x/app/assets/lib.js", 10, "Confirmed"), + ] + }, + ) + + # Unscoped: both language cells appear. + unscoped = tmp / "unscoped.json" + write_json(unscoped, []) + proc = run_tabulate( + "--label", "railsgoat", + "--scan", str(scan), + "--ground-truth", str(gt), + "--append", str(unscoped), + ) + assert proc.returncode == 0, proc.stdout + proc.stderr + cells = {(c["cap"], c["lang"]) for c in json.loads(unscoped.read_text())[-1]["cells"]} + assert ("sqli", "ruby") in cells and ("sqli", "javascript") in cells, cells + + # Scoped to ruby: the JS finding AND the JS ground-truth positive vanish. + scoped = tmp / "scoped.json" + write_json(scoped, []) + proc = run_tabulate( + "--label", "railsgoat", + "--scan", str(scan), + "--ground-truth", str(gt), + "--lang", "ruby", + "--append", str(scoped), + ) + assert proc.returncode == 0, proc.stdout + proc.stderr + cells = {(c["cap"], c["lang"]): c for c in json.loads(scoped.read_text())[-1]["cells"]} + assert ("sqli", "javascript") not in cells, f"JS must be filtered out: {list(cells)}" + ruby = cells[("sqli", "ruby")] + assert ruby["tp"] == 1 and ruby["fn"] == 0, ruby + # The dropped JS positive must NOT resurface as a phantom FN in any cell. + assert all(lang != "javascript" for _cap, lang in cells), cells + + def test_budget_malformed_exits_3(tmp: Path) -> None: bad = tmp / "bad.toml" bad.write_text("[default]\nunsupported_rate = not_a_number\n") @@ -601,6 +660,7 @@ def main() -> int: test_diff_passes_on_improvement, test_manual_triage_stamps_wrong_confirmed, test_manual_triage_ignores_vuln_true_entries, + test_lang_filter_scopes_findings_and_gt, test_budget_malformed_exits_3, test_relative_gt_path_suffix_matches_absolute_finding, test_unmatched_gt_positive_lands_in_lang_cell,