mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-12 19:55:14 +02:00
feat(eval-corpus): add Track R.2 polyglot corpora (RailsGoat, DVWA, DVPWA, gosec, RustSec) with curated manifests, negative controls, and CI validation
This commit is contained in:
parent
2a4d49b68b
commit
e0833537e4
20 changed files with 1181 additions and 53 deletions
|
|
@ -200,3 +200,153 @@ cap = "crypto"
|
|||
lang = "typescript"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ──────────────
|
||||
#
|
||||
# Phase 29 wires five more real corpora (mostly intentionally-vulnerable apps), one per
|
||||
# remaining language family, into the same acceptance machinery as OWASP /
|
||||
# NodeGoat / Juice Shop:
|
||||
#
|
||||
# * railsgoat — OWASP RailsGoat (Rails, .rb)
|
||||
# * dvwa — Damn Vulnerable Web Application (PHP); ships graded
|
||||
# source variants, so low.php = vuln and impossible.php =
|
||||
# benign control — real vuln/benign PAIRS like OWASP.
|
||||
# * dvpwa — Damn Vulnerable Python Web App (aiohttp, .py); its
|
||||
# parameterized DAO siblings are benign controls for the
|
||||
# one `%`-formatted SQL sink.
|
||||
# * gosec — the Go SAST tool's own repo; the scannable, `// want`-
|
||||
# annotated sample under goanalysis/testdata is the curated
|
||||
# ground truth (its embedded-string rule samples are not
|
||||
# scannable, so they are unlabelled).
|
||||
# * rustsec — RustSec advisory-db: a NEGATIVE CONTROL. It ships
|
||||
# advisory metadata, not vulnerable .rs source, so its
|
||||
# ground truth is empty by construction; the row asserts the
|
||||
# Rust scan/verify path runs at scale within wall-clock and
|
||||
# Confirms NOTHING (any Confirmed Rust finding there is a
|
||||
# false confirm and trips the default false_confirmed_rate).
|
||||
#
|
||||
# Each row is gated by Gate 8 (scripts/m7_ship_gate.sh) with the SAME policy as
|
||||
# Gates 6/7: wall-clock + the per-(cap,lang) budget below are HARD-enforced;
|
||||
# per-cap confirmed-rate / precision / recall are published report-only
|
||||
# (NYX_POLYGLOT_FLOOR_CAPS empty by default). Because each corpus targets a
|
||||
# single language, Gate 8 scopes tabulation to that language (tabulate.py
|
||||
# --lang), so the vendored third-party JavaScript these Ruby/Python apps
|
||||
# bundle (bootstrap-colorpicker, materialize, …) — which nyx confirms as
|
||||
# prototype_pollution — does not pollute the corpus's per-cap metrics. Those
|
||||
# JS findings are still emitted; they are simply out of scope for a Ruby /
|
||||
# Python corpus.
|
||||
#
|
||||
# Calibrated against the pinned corpora (nyx HEAD of the Phase 29 branch,
|
||||
# 2026-05-31) with `nyx scan --verify --index off`. Measured frontier
|
||||
# (target-language scope): every curated cell sits at <= the headline maxima
|
||||
# below EXCEPT cmdi, where most or all findings carry a SHELL_ESCAPE sanitizer cap
|
||||
# and are therefore routed to Unsupported(SoundOracleUnavailable) — the same
|
||||
# no-sound-oracle treatment OWASP's crypto/auth cells get. RailsGoat's
|
||||
# deserialize (Marshal.load) and redirect (open redirect) cells Confirm end to
|
||||
# end with zero false confirms — the first real polyglot confirms.
|
||||
|
||||
# railsgoat (ruby): caps with a ground-truth label in railsgoat.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "auth"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "redirect"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "path_traversal"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# cmdi/ruby is incidental (RailsGoat's `self.try(params[:graph])` reflection
|
||||
# sink); the lone finding carries a SHELL_ESCAPE sanitizer cap and routes to
|
||||
# Unsupported(SoundOracleUnavailable), so unsupported_rate is locked at the
|
||||
# measured frontier (1/1). The false-confirm guard stays at the headline 2%.
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 1.00
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# dvwa (php): caps with a ground-truth label in dvwa.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "redirect"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "header_injection"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# cmdi/php: DVWA's ping handlers reach shell_exec through a SHELL_ESCAPE
|
||||
# sanitizer cap, so ~69% of the cell's findings route to
|
||||
# Unsupported(SoundOracleUnavailable). unsupported_rate is locked to that
|
||||
# frontier with margin (a regression above 75% fails); false-confirm at 2%.
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# dvpwa (python): caps with a ground-truth label in dvpwa.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
[[cell]]
|
||||
cap = "auth"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# gosec (go): caps with a ground-truth label in gosec.manifest.toml.
|
||||
[[cell]]
|
||||
cap = "crypto"
|
||||
lang = "go"
|
||||
unsupported_rate = 0.20
|
||||
false_confirmed_rate = 0.02
|
||||
|
||||
# cmdi/go: the goanalysis/testdata exec.Command sample reaches the sink
|
||||
# through a SHELL_ESCAPE sanitizer cap, so every cmdi/go finding routes to
|
||||
# Unsupported(SoundOracleUnavailable). unsupported_rate locked to the
|
||||
# measured frontier (3/3); false-confirm at the headline 2%.
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "go"
|
||||
unsupported_rate = 1.00
|
||||
false_confirmed_rate = 0.02
|
||||
|
|
|
|||
|
|
@ -69,3 +69,38 @@ known vulns) is the meaningful metric; precision vs this partial ground
|
|||
truth is informational. Gate 7 publishes per-cap precision/recall/confirmed
|
||||
report-only by default (`NYX_JSTS_FLOOR_CAPS` empty), matching the OWASP
|
||||
gate.
|
||||
|
||||
## Polyglot real corpora (Ruby/PHP/Python/Go/Rust — Track R.2)
|
||||
|
||||
Phase 29 wires the remaining language families into the same machinery, one
|
||||
corpus per family, each with a curated `*.manifest.toml` → committed `*.json`:
|
||||
|
||||
- `railsgoat.{manifest.toml,json}` — OWASP RailsGoat (Rails, `.rb`).
|
||||
- `dvwa.{manifest.toml,json}` — Damn Vulnerable Web Application (PHP). DVWA
|
||||
ships graded source variants (`source/{low,impossible}.php`), so this is
|
||||
the one Track R corpus besides OWASP with real vuln/benign **pairs**
|
||||
(`low.php` = vuln, `impossible.php` = benign control) — precision is
|
||||
meaningful here, not just informational.
|
||||
- `dvpwa.{manifest.toml,json}` — Damn Vulnerable Python Web App (aiohttp,
|
||||
`.py`). Its parameterized DAO siblings are benign controls for the one
|
||||
`%`-formatted SQL sink.
|
||||
- `gosec.{manifest.toml,json}` — the gosec Go SAST tool repo; the scannable,
|
||||
`// want`-annotated sample under `goanalysis/testdata` is the curated
|
||||
ground truth (gosec's string-embedded rule samples are not scannable, so
|
||||
they are deliberately unlabelled).
|
||||
- `rustsec.{manifest.toml,json}` — RustSec advisory-db, a **negative
|
||||
control**. advisory-db ships advisory metadata, not vulnerable `.rs`
|
||||
source, so its committed ground truth is empty (`[]`) by construction. The
|
||||
manifest sets `negative_control = true` (mutually exclusive with
|
||||
`[[entry]]` tables); `manifest_gt_convert.py` emits the empty JSON and the
|
||||
row asserts the Rust scan/verify path runs at scale within wall-clock and
|
||||
Confirms nothing there (any Confirmed Rust finding is a false confirm).
|
||||
|
||||
These are converted, validated and asserted-in-sync exactly like NodeGoat /
|
||||
Juice Shop (the `polyglot` job in `.github/workflows/eval.yml`). Because each
|
||||
corpus targets a single language, Gate 8 scopes tabulation to that language
|
||||
(`tabulate.py --lang`) so the vendored third-party JavaScript these Ruby /
|
||||
Python apps bundle does not pollute their per-cap metrics. Gate 8 publishes
|
||||
per-cap precision/recall/confirmed report-only by default
|
||||
(`NYX_POLYGLOT_FLOOR_CAPS` empty), matching the OWASP and JS/TS gates. See
|
||||
`tests/eval_corpus/budget.toml` for the per-(cap,lang) gate policy.
|
||||
|
|
|
|||
38
tests/eval_corpus/ground_truth/dvpwa.json
Normal file
38
tests/eval_corpus/ground_truth/dvpwa.json
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
[
|
||||
{
|
||||
"path": "sqli/dao/course.py",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "sqli/dao/mark.py",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "sqli/dao/review.py",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "sqli/dao/student.py",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "sqli/dao/user.py",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "sqli/views.py",
|
||||
"line": 0,
|
||||
"cap": "auth",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
70
tests/eval_corpus/ground_truth/dvpwa.manifest.toml
Normal file
70
tests/eval_corpus/ground_truth/dvpwa.manifest.toml
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
# DVPWA (Damn Vulnerable Python Web Application) — curated ground-truth
|
||||
# manifest (Phase 29, Track R.2).
|
||||
#
|
||||
# DVPWA is an intentionally-vulnerable aiohttp app whose headline flaw is
|
||||
# SQL injection (the package is literally named `sqli`). It ships no
|
||||
# machine-readable per-file labels, so this manifest IS the authoritative
|
||||
# source. Its DAO layer is convenient: one method builds a query with
|
||||
# Python `%` string-formatting (the injectable sink) while its siblings use
|
||||
# proper parameterized `cur.execute(q, params)` — so the parameterized DAOs
|
||||
# serve as genuine benign controls (vuln = false) for the sqli cell, making
|
||||
# precision there meaningful, not just informational.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/dvpwa.json. CI regenerates it against a fresh clone of the
|
||||
# pinned ref and asserts byte-equality; the converter HARD-ERRORS on any
|
||||
# path that no longer exists, so a corpus bump that moves a DAO fails the
|
||||
# job loudly rather than silently dropping recall.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies each
|
||||
# sink (the request-scoped ownership lookups in views.py surface as `auth`).
|
||||
# `path` is relative to the DVPWA clone root, POSIX separators. Lang is
|
||||
# inferred from the extension (.py -> python). See
|
||||
# tests/eval_corpus/budget.toml for the gate policy on these cells.
|
||||
|
||||
corpus = "dvpwa"
|
||||
upstream = "https://github.com/anxolerd/dvpwa"
|
||||
# DVPWA publishes no release tags; the eval job pins the default branch via
|
||||
# the CI cache key (clone HEAD a1d8f89fac2e57093189853c6527c2b01fc1d9c1).
|
||||
# The sqli/ package layout has been stable; re-validate if the cache key is
|
||||
# bumped.
|
||||
pinned_ref = "master"
|
||||
|
||||
# ── SQL injection (sqli) — one injectable sink + parameterized controls ──────
|
||||
[[entry]]
|
||||
path = "sqli/dao/student.py"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "Student.create builds the INSERT with Python `%` formatting (\"... VALUES ('%(name)s')\" % {'name': name}) on the request-supplied student name, then cur.execute(q) — SQL injection."
|
||||
|
||||
[[entry]]
|
||||
path = "sqli/dao/course.py"
|
||||
cap = "sqli"
|
||||
vuln = false
|
||||
note = "benign control: every Course query uses parameterized cur.execute(q, params) / VALUES (%(title)s, %(description)s) — not injectable."
|
||||
|
||||
[[entry]]
|
||||
path = "sqli/dao/review.py"
|
||||
cap = "sqli"
|
||||
vuln = false
|
||||
note = "benign control: Review.create / get_for_course bind via cur.execute(q, params) with %(course_id)s / %s placeholders — parameterized."
|
||||
|
||||
[[entry]]
|
||||
path = "sqli/dao/mark.py"
|
||||
cap = "sqli"
|
||||
vuln = false
|
||||
note = "benign control: Mark.create / get_for_student bind via parameterized cur.execute(q, params) — not injectable."
|
||||
|
||||
# ── Weak crypto (crypto) ─────────────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "sqli/dao/user.py"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "User.check_password compares against md5(password).hexdigest() — unsalted MD5 for credential storage (weak cryptography)."
|
||||
|
||||
# ── Broken access control (auth) ─────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "sqli/views.py"
|
||||
cap = "auth"
|
||||
vuln = true
|
||||
note = "request handlers resolve the acting user from a client-controlled session id and act on objects without an ownership/authorization check — broken access control."
|
||||
50
tests/eval_corpus/ground_truth/dvwa.json
Normal file
50
tests/eval_corpus/ground_truth/dvwa.json
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
[
|
||||
{
|
||||
"path": "vulnerabilities/exec/source/impossible.php",
|
||||
"line": 0,
|
||||
"cap": "cmdi",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/exec/source/low.php",
|
||||
"line": 0,
|
||||
"cap": "cmdi",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/open_redirect/source/impossible.php",
|
||||
"line": 0,
|
||||
"cap": "header_injection",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/open_redirect/source/impossible.php",
|
||||
"line": 0,
|
||||
"cap": "redirect",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/open_redirect/source/low.php",
|
||||
"line": 0,
|
||||
"cap": "header_injection",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/open_redirect/source/low.php",
|
||||
"line": 0,
|
||||
"cap": "redirect",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/sqli/source/impossible.php",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": false
|
||||
},
|
||||
{
|
||||
"path": "vulnerabilities/sqli/source/low.php",
|
||||
"line": 0,
|
||||
"cap": "sqli",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
84
tests/eval_corpus/ground_truth/dvwa.manifest.toml
Normal file
84
tests/eval_corpus/ground_truth/dvwa.manifest.toml
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
# DVWA (Damn Vulnerable Web Application) — curated ground-truth manifest
|
||||
# (Phase 29, Track R.2).
|
||||
#
|
||||
# DVWA is an intentionally-vulnerable PHP app. Unlike the other Track R
|
||||
# apps it ships its vulnerabilities as graded source variants under
|
||||
# vulnerabilities/<module>/source/{low,medium,high,impossible}.php, where
|
||||
# `low.php` is the textbook-vulnerable handler and `impossible.php` is the
|
||||
# hardened, secure rewrite of the SAME sink. That gives DVWA real
|
||||
# vuln/benign PAIRS (low = vuln, impossible = benign control) the way OWASP
|
||||
# Benchmark does — so precision against this manifest is meaningful, not
|
||||
# just informational: a Confirmed finding on an `impossible.php` control is
|
||||
# a genuine false confirm.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/dvwa.json. CI regenerates it against a fresh clone of the
|
||||
# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any
|
||||
# path that no longer exists, so a DVWA bump that restructures a module
|
||||
# fails loudly rather than silently dropping recall. Re-pin `pinned_ref`
|
||||
# and re-validate the paths together.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py), aligned to how nyx classifies the
|
||||
# sink. `path` is relative to the DVWA clone root, POSIX separators. Lang
|
||||
# is inferred from the extension (.php -> php). See
|
||||
# tests/eval_corpus/budget.toml for the gate policy on these cells.
|
||||
|
||||
corpus = "dvwa"
|
||||
upstream = "https://github.com/digininja/DVWA"
|
||||
# Pinned to release tag 2.5 (clone HEAD
|
||||
# a96943dc1f52f390ee5df72144660636c4b7dd06). The
|
||||
# vulnerabilities/<module>/source/{low,impossible}.php layout has been stable
|
||||
# for years; re-validate if the tag is bumped.
|
||||
pinned_ref = "2.5"
|
||||
|
||||
# ── SQL injection (sqli) ─────────────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "vulnerabilities/sqli/source/low.php"
|
||||
cap = "sqli"
|
||||
vuln = true
|
||||
note = "id = $_REQUEST['id'] is concatenated straight into \"... WHERE user_id = '$id'\" and run via mysqli_query — classic SQL injection."
|
||||
|
||||
[[entry]]
|
||||
path = "vulnerabilities/sqli/source/impossible.php"
|
||||
cap = "sqli"
|
||||
vuln = false
|
||||
note = "benign control: same query via PDO prepare + bindParam(:id, PDO::PARAM_INT) with is_numeric/intval validation — parameterized, not injectable."
|
||||
|
||||
# ── OS command injection (cmdi) ──────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "vulnerabilities/exec/source/low.php"
|
||||
cap = "cmdi"
|
||||
vuln = true
|
||||
note = "target = $_REQUEST['ip'] is concatenated into shell_exec('ping -c 4 ' . $target) with no validation — OS command injection."
|
||||
|
||||
[[entry]]
|
||||
path = "vulnerabilities/exec/source/impossible.php"
|
||||
cap = "cmdi"
|
||||
vuln = false
|
||||
note = "benign control: the IP is split into 4 octets and each is_numeric-checked before being reassembled and passed to shell_exec — not injectable."
|
||||
|
||||
# ── Open redirect (redirect) ─────────────────────────────────────────────────
|
||||
[[entry]]
|
||||
path = "vulnerabilities/open_redirect/source/low.php"
|
||||
cap = "redirect"
|
||||
vuln = true
|
||||
note = "header('location: ' . $_GET['redirect']) forwards to an unvalidated user-supplied URL — open redirect."
|
||||
|
||||
[[entry]]
|
||||
path = "vulnerabilities/open_redirect/source/impossible.php"
|
||||
cap = "redirect"
|
||||
vuln = false
|
||||
note = "benign control: redirect target is chosen by an integer switch on is_numeric($_GET['redirect']) — no user-controlled URL reaches the Location header."
|
||||
|
||||
# ── CRLF / HTTP header injection (header_injection) ──────────────────────────
|
||||
[[entry]]
|
||||
path = "vulnerabilities/open_redirect/source/low.php"
|
||||
cap = "header_injection"
|
||||
vuln = true
|
||||
note = "the same unvalidated $_GET['redirect'] flows into a raw header() call, so CRLF in the value splits/injects response headers — HTTP header injection."
|
||||
|
||||
[[entry]]
|
||||
path = "vulnerabilities/open_redirect/source/impossible.php"
|
||||
cap = "header_injection"
|
||||
vuln = false
|
||||
note = "benign control: only a fixed, integer-selected target string reaches header() — no user bytes, so no CRLF injection."
|
||||
14
tests/eval_corpus/ground_truth/gosec.json
Normal file
14
tests/eval_corpus/ground_truth/gosec.json
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"path": "goanalysis/testdata/src/a/basic_output.go",
|
||||
"line": 0,
|
||||
"cap": "cmdi",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "goanalysis/testdata/src/a/basic_output.go",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
42
tests/eval_corpus/ground_truth/gosec.manifest.toml
Normal file
42
tests/eval_corpus/ground_truth/gosec.manifest.toml
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
# gosec — curated Go ground-truth manifest (Phase 29, Track R.2).
|
||||
#
|
||||
# gosec is the Go SAST tool; its repo doubles as the de-facto Go security
|
||||
# corpus. Most of gosec's rule samples live as Go source embedded in
|
||||
# backtick string literals inside testutils/g*_samples.go — those are NOT
|
||||
# scannable by a taint analyzer (the vulnerable code is string data, not
|
||||
# real AST), so they are deliberately NOT labelled here. gosec also ships a
|
||||
# small set of REAL, compilable sample programs under goanalysis/testdata
|
||||
# that carry the tool's OWN inline `// want 'GNNN ...'` expectations — that
|
||||
# is the authoritative, scannable ground truth this manifest pins.
|
||||
#
|
||||
# Because the eval scans the whole gosec checkout (the tool's own source
|
||||
# included), unlabelled findings are expected and are NOT false positives —
|
||||
# precision against this manifest is informational, while recall on the curated
|
||||
# samples is the meaningful floor (same policy as the all-vulnerable apps;
|
||||
# see tests/eval_corpus/budget.toml).
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/gosec.json. CI regenerates it against a fresh clone of the
|
||||
# pinned tag and asserts byte-equality; the converter HARD-ERRORS on any
|
||||
# path that no longer exists, so a gosec bump that moves the testdata fails
|
||||
# the job loudly. `cap` is a nyx cap label (tabulate.py); `path` is relative
|
||||
# to the gosec clone root, POSIX separators; lang is inferred (.go -> go).
|
||||
|
||||
corpus = "gosec"
|
||||
upstream = "https://github.com/securego/gosec"
|
||||
# Pinned to release tag v2.26.1 (clone HEAD
|
||||
# 4a3bd8af174872c778439083ded7adbf3747e770). goanalysis/testdata/src/a/ has
|
||||
# been stable; re-validate if the tag is bumped.
|
||||
pinned_ref = "v2.26.1"
|
||||
|
||||
[[entry]]
|
||||
path = "goanalysis/testdata/src/a/basic_output.go"
|
||||
cap = "cmdi"
|
||||
vuln = true
|
||||
note = "VulnerableFunction runs exec.Command(\"sh\", \"-c\", getUserInput()) — subprocess launched with a non-constant argument (gosec's own `// want G204 [CWE-78]` expectation)."
|
||||
|
||||
[[entry]]
|
||||
path = "goanalysis/testdata/src/a/basic_output.go"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "VulnerableFunction imports crypto/md5 and calls md5.New() — weak cryptographic primitive (gosec's own `// want G401/G501` expectations)."
|
||||
56
tests/eval_corpus/ground_truth/railsgoat.json
Normal file
56
tests/eval_corpus/ground_truth/railsgoat.json
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
[
|
||||
{
|
||||
"path": "app/controllers/admin_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "auth",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/benefit_forms_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "deserialize",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/benefit_forms_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "path_traversal",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/messages_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "auth",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/password_resets_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/password_resets_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "deserialize",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/sessions_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "redirect",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/controllers/users_controller.rb",
|
||||
"line": 0,
|
||||
"cap": "auth",
|
||||
"vuln": true
|
||||
},
|
||||
{
|
||||
"path": "app/models/user.rb",
|
||||
"line": 0,
|
||||
"cap": "crypto",
|
||||
"vuln": true
|
||||
}
|
||||
]
|
||||
88
tests/eval_corpus/ground_truth/railsgoat.manifest.toml
Normal file
88
tests/eval_corpus/ground_truth/railsgoat.manifest.toml
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
# OWASP RailsGoat — curated vuln ground-truth manifest (Phase 29, Track R.2).
|
||||
#
|
||||
# RailsGoat is an intentionally-vulnerable Ruby on Rails app that maps the
|
||||
# OWASP Top 10 to concrete controllers/models. Like NodeGoat / Juice Shop
|
||||
# (Phase 28) it ships no machine-readable per-file vuln labels, so this
|
||||
# manifest IS the authoritative source: one [[entry]] per known-vulnerable
|
||||
# location, curated from the project's own tutorial walk-throughs, each with
|
||||
# a `note` citing why.
|
||||
#
|
||||
# tests/eval_corpus/manifest_gt_convert.py turns this into the committed
|
||||
# ground_truth/railsgoat.json. CI regenerates it against a fresh clone of
|
||||
# the pinned tag and asserts byte-equality, and the converter HARD-ERRORS on
|
||||
# any path that no longer exists in the corpus, so a RailsGoat bump that
|
||||
# moves a controller fails the eval job loudly rather than silently dropping
|
||||
# recall. Update `pinned_ref` + the paths together when re-pinning.
|
||||
#
|
||||
# `cap` is a nyx cap label (tabulate.py); it is aligned with how nyx
|
||||
# classifies the sink in each file (e.g. a missing ownership check on a
|
||||
# direct-object lookup surfaces as `auth`, not `unauthorized_id`), so recall
|
||||
# (did nyx catch the canonical vuln) is meaningful. `path` is relative to
|
||||
# the RailsGoat clone root, POSIX separators. Lang is inferred from the
|
||||
# extension (.rb -> ruby). All `vuln = true`: RailsGoat is all-vulnerable,
|
||||
# so there is no benign-control file to pair against — precision vs this
|
||||
# manifest is informational (an unlabelled finding may be a real uncurated
|
||||
# vuln), while recall is the meaningful floor. See
|
||||
# tests/eval_corpus/budget.toml for how the gate treats these cells.
|
||||
|
||||
corpus = "railsgoat"
|
||||
upstream = "https://github.com/OWASP/railsgoat"
|
||||
# Pinned to the stable Rails 5 release tag (clone HEAD
|
||||
# 0766ca80bf2d94acbde1dd4aaf7baf9b86afe4eb). The app/controllers + app/models
|
||||
# layout below has been stable across this tag; re-validate the paths if the
|
||||
# ref is bumped.
|
||||
pinned_ref = "rails.5.0.0"
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/users_controller.rb"
|
||||
cap = "auth"
|
||||
vuln = true
|
||||
note = "update looks up the account with User.where(\"id = '#{params[:user][:id]}'\") and mass-assigns user_params (params.require(:user).permit!) with no ownership check — broken access control / mass-assignment privilege escalation (OWASP A4/A5)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/messages_controller.rb"
|
||||
cap = "auth"
|
||||
vuln = true
|
||||
note = "show / destroy fetch Message.where(id: params[:id]) with no check that the message belongs to current_user — insecure direct object reference (OWASP A4 broken access control)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/admin_controller.rb"
|
||||
cap = "auth"
|
||||
vuln = true
|
||||
note = "administrative actions are gated by a bypassable admin_param check (params[:admin_id] != \"1\"); update_user / delete_user act on any admin_id — broken access control / privilege escalation (OWASP A5)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/models/user.rb"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "passwords are hashed with Digest::MD5.hexdigest (hash_password / authenticate) — unsalted weak hash for credential storage (OWASP A2 cryptographic failure)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/password_resets_controller.rb"
|
||||
cap = "crypto"
|
||||
vuln = true
|
||||
note = "generate_token derives the reset token as Digest::MD5.hexdigest(email) — a predictable, forgeable password-reset token (weak cryptography)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/password_resets_controller.rb"
|
||||
cap = "deserialize"
|
||||
vuln = true
|
||||
note = "reset_password runs Marshal.load(Base64.decode64(params[:user])) on attacker-controlled input — insecure deserialization leading to RCE (OWASP A8)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/sessions_controller.rb"
|
||||
cap = "redirect"
|
||||
vuln = true
|
||||
note = "create redirects to params[:url] with no allow-list (path = params[:url] then redirect_to path) — open redirect (OWASP unvalidated redirects)."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/benefit_forms_controller.rb"
|
||||
cap = "path_traversal"
|
||||
vuln = true
|
||||
note = "download builds send_file from a user-controlled params[:name] path with no containment — arbitrary file read / path traversal."
|
||||
|
||||
[[entry]]
|
||||
path = "app/controllers/benefit_forms_controller.rb"
|
||||
cap = "deserialize"
|
||||
vuln = true
|
||||
note = "download calls params[:type].constantize.new(path), constantizing a user-supplied class name — unsafe reflection / object injection."
|
||||
1
tests/eval_corpus/ground_truth/rustsec.json
Normal file
1
tests/eval_corpus/ground_truth/rustsec.json
Normal file
|
|
@ -0,0 +1 @@
|
|||
[]
|
||||
37
tests/eval_corpus/ground_truth/rustsec.manifest.toml
Normal file
37
tests/eval_corpus/ground_truth/rustsec.manifest.toml
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
# RustSec advisory-db — Rust negative-control corpus (Phase 29, Track R.2).
|
||||
#
|
||||
# The plan's Rust real-corpus row is the RustSec advisory database. Unlike
|
||||
# RailsGoat / DVWA / DVPWA / gosec, advisory-db ships advisory METADATA
|
||||
# (TOML + Markdown under crates/<crate>/RUSTSEC-*.md), not vulnerable Rust
|
||||
# SOURCE. The scanned tree therefore contains zero `.rs` files and nyx
|
||||
# correctly produces zero findings — so there are no source-level vuln
|
||||
# positives to label, and no canonical scannable "RustGoat" exists to
|
||||
# substitute without fabricating paths (which the CI byte-equality + path
|
||||
# existence guards would reject outright).
|
||||
#
|
||||
# advisory-db is still worth pinning and scanning as a NEGATIVE CONTROL for
|
||||
# the Rust language path:
|
||||
# * it exercises the Rust scan + verify pipeline (Phase 23 Rust build
|
||||
# pool) end to end on a large real-world tree (thousands of files) and
|
||||
# asserts it stays within the wall-clock budget without crashing, and
|
||||
# * it is an over-confirmation guard: nyx must Confirm NOTHING on a corpus
|
||||
# with no real source vulns. Any Confirmed finding here is provably a
|
||||
# false confirm and trips the default false_confirmed_rate budget
|
||||
# (tests/eval_corpus/budget.toml) — a genuine regression sentinel if a
|
||||
# future change makes nyx treat advisory text as scannable code.
|
||||
#
|
||||
# `negative_control = true` tells manifest_gt_convert.py to emit an empty
|
||||
# `[]` ground truth. It is mutually exclusive with `[[entry]]` tables, so a
|
||||
# real Rust vuln can never be silently hidden behind the flag. When a
|
||||
# scannable advisory-backed Rust corpus (a vulnerable crate pinned at its
|
||||
# affected version with a source-level taint sink) is curated, drop the flag
|
||||
# and add [[entry]] tables here exactly as the other Track R.2 manifests do.
|
||||
|
||||
corpus = "rustsec"
|
||||
upstream = "https://github.com/rustsec/advisory-db"
|
||||
# advisory-db publishes no release tags; the eval job pins the default
|
||||
# branch via the CI cache key (clone HEAD
|
||||
# eaf48e749baa3d5e27d304107d8abf175fd756bb).
|
||||
pinned_ref = "main"
|
||||
|
||||
negative_control = true
|
||||
|
|
@ -23,6 +23,19 @@ Manifest schema (TOML)::
|
|||
vuln = true # true = real vuln, false = benign control
|
||||
note = "eval() of user-supplied pre/after-tax fields (NodeGoat A1)"
|
||||
|
||||
Negative-control corpora. A few real corpora carry **no** scannable
|
||||
source-level vulnerabilities of their own — most notably the RustSec
|
||||
`advisory-db`, which ships advisory *metadata* (TOML/Markdown), not
|
||||
vulnerable `.rs` source. Such a corpus has zero ground-truth positives by
|
||||
construction, yet it is still worth scanning: it exercises the language's
|
||||
scan + verify path end to end on a large real-world tree and acts as an
|
||||
over-confirmation guard (nyx must Confirm nothing on a corpus with no real
|
||||
source vulns). Declare it with a top-level ``negative_control = true`` and
|
||||
**zero** ``[[entry]]`` tables; the converter then emits an empty ``[]``
|
||||
ground truth. ``negative_control`` and ``[[entry]]`` are mutually
|
||||
exclusive — a manifest that sets the flag *and* lists entries is rejected,
|
||||
so a real vuln can never be silently dropped behind the flag.
|
||||
|
||||
Output (consumed by tabulate.py): a list of `{path, line, cap, vuln}`
|
||||
records, sorted by `(path, cap)` for deterministic, diff-stable JSON.
|
||||
`note` is intentionally dropped — the ground-truth JSON keeps the exact
|
||||
|
|
@ -119,7 +132,15 @@ def main() -> int:
|
|||
|
||||
manifest = load_manifest(Path(args.manifest).expanduser())
|
||||
entries = manifest.get("entry", []) or []
|
||||
if not entries:
|
||||
negative_control = bool(manifest.get("negative_control", False))
|
||||
if negative_control and entries:
|
||||
print(
|
||||
f"error: negative_control manifest must declare zero [[entry]] "
|
||||
f"tables (found {len(entries)}): {args.manifest}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
if not entries and not negative_control:
|
||||
print(f"error: manifest has no [[entry]] tables: {args.manifest}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
|
@ -184,6 +205,8 @@ def main() -> int:
|
|||
|
||||
vuln_count = sum(1 for r in records if r["vuln"])
|
||||
print(f"wrote {len(records)} records to {out}")
|
||||
if negative_control:
|
||||
print(" negative-control corpus: zero ground-truth positives by construction")
|
||||
print(f" vulns: {vuln_count}")
|
||||
print(f" non-vuln: {len(records) - vuln_count}")
|
||||
if corpus is not None:
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
|||
OUTPUT_DIR=""
|
||||
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
|
||||
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
|
||||
SETS="owasp,sard,nodegoat,juiceshop,inhouse"
|
||||
SETS="owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse"
|
||||
# Optional per-cell budgets and monotonic-improvement diff.
|
||||
BUDGET_FILE=""
|
||||
DIFF_FILE=""
|
||||
|
|
@ -90,6 +90,42 @@ run_jsts_corpus() {
|
|||
|| info " tabulate.py failed on $label; ground truth file may be absent"
|
||||
}
|
||||
|
||||
# Run one Track R.2 polyglot real corpus through `nyx scan --verify` and score
# the result against its committed ground truth with tabulate.py.
#
# The tabulation is scoped to the corpus's target language (tabulate --lang),
# so incidental assets in other languages (e.g. vendored JS in a Rails /
# aiohttp app) stay out of the corpus's per-cap metrics.
#
# If the corpus directory does not exist, the function prints the clone
# command that would fetch it and returns 0 without scanning. A nyx exit
# status other than 0 or 1 is reported together with nyx's stderr and the
# corpus is skipped, again returning 0.
# NOTE(review): exit status 1 presumably means "findings reported" — confirm.
#
# Arguments:
#   $1  label            (also used in the /tmp scratch file names)
#   $2  corpus directory
#   $3  ground-truth JSON path
#   $4  target language passed to tabulate.py --lang
#   $5  upstream repository URL (only shown in the clone hint)
#   $6  git ref            (only shown in the clone hint)
run_polyglot_corpus() {
    local label="$1" corpus_dir="$2" ground_truth="$3"
    local target_lang="$4" upstream="$5" git_ref="$6"
    local scan_json="/tmp/nyx_${label}.json"
    local scan_err="/tmp/nyx_${label}.stderr"

    if [[ ! -d "$corpus_dir" ]]; then
        info "Bootstrapping $label..."
        info " git clone --depth 1 --branch ${git_ref} ${upstream} ${corpus_dir}"
        info "Skipping $label set (not yet downloaded)."
        return 0
    fi

    info "Running nyx scan on $label (lang scope: ${target_lang})..."

    # errexit is suspended around the scan so the exit status can be inspected
    # instead of aborting the whole run.
    local scan_rc=0
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$corpus_dir" \
        >"$scan_json" 2>"$scan_err"
    scan_rc=$?
    set -e

    case "$scan_rc" in
        0 | 1) ;;
        *)
            info " nyx exited $scan_rc on $label set (stderr follows):"
            cat "$scan_err" >&2
            return 0
            ;;
    esac

    # Mandatory tabulate.py arguments first; --budget / --diff are appended
    # only when the corresponding file variable is non-empty.
    local -a tabulate_args=(
        --label "$label"
        --scan "$scan_json"
        --ground-truth "$ground_truth"
        --lang "$target_lang"
        --append "$RESULTS_JSON"
    )
    if [[ -n "${BUDGET_FILE:-}" ]]; then
        tabulate_args+=(--budget "$BUDGET_FILE")
    fi
    if [[ -n "${DIFF_FILE:-}" ]]; then
        tabulate_args+=(--diff "$DIFF_FILE")
    fi

    python3 "${SCRIPT_DIR}/tabulate.py" "${tabulate_args[@]}" \
        || info " tabulate.py failed on $label; ground truth file may be absent"
}
|
||||
|
||||
[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
|
||||
|
||||
mkdir -p "$CORPUS_CACHE"
|
||||
|
|
@ -143,6 +179,35 @@ if [[ "$SETS" == *juiceshop* ]]; then
|
|||
"${SCRIPT_DIR}/ground_truth/juiceshop.json"
|
||||
fi
|
||||
|
||||
# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ───────────────
# One row per corpus: "<set name> <target lang> <upstream repo> <ref>".
# The set name doubles as the cache sub-directory under $CORPUS_CACHE and as
# the stem of the ground-truth JSON file. A corpus runs only when its name
# occurs in $SETS.
#
# RustSec advisory-db is the Rust negative control (empty ground truth): the
# row asserts the Rust scan/verify path runs and Confirms nothing there.
polyglot_rows=(
    "railsgoat ruby https://github.com/OWASP/railsgoat rails.5.0.0"
    "dvwa php https://github.com/digininja/DVWA 2.5"
    "dvpwa python https://github.com/anxolerd/dvpwa master"
    "gosec go https://github.com/securego/gosec v2.26.1"
    "rustsec rust https://github.com/rustsec/advisory-db main"
)
for polyglot_row in "${polyglot_rows[@]}"; do
    read -r pg_name pg_lang pg_repo pg_ref <<<"$polyglot_row"
    if [[ "$SETS" == *"$pg_name"* ]]; then
        run_polyglot_corpus "$pg_name" "${CORPUS_CACHE}/${pg_name}" \
            "${SCRIPT_DIR}/ground_truth/${pg_name}.json" "$pg_lang" \
            "$pg_repo" "$pg_ref"
    fi
done
unset polyglot_rows polyglot_row pg_name pg_lang pg_repo pg_ref
|
||||
|
||||
# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
|
||||
SARD_DIR="${CORPUS_CACHE}/nist_sard"
|
||||
if [[ "$SETS" == *sard* ]]; then
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#
|
||||
# Drives a complete pass against every corpus set the project knows about
|
||||
# (OWASP Benchmark v1.2, the NIST SARD subset, OWASP NodeGoat + Juice Shop,
|
||||
# the Track R.2 polyglot corpora — RailsGoat / DVWA / DVPWA / gosec / RustSec —
|
||||
# and the Nyx benchmark fixtures), then emits `tests/eval_corpus/results.json`
|
||||
# for reports, diffs, and docs.
|
||||
#
|
||||
|
|
@ -70,7 +71,7 @@ set +e
|
|||
NYX_EVAL_CORPUS_DIR="$CORPUS_CACHE" \
|
||||
bash "${SCRIPT_DIR}/run.sh" \
|
||||
--nyx "$NYX_BIN" \
|
||||
--sets owasp,sard,nodegoat,juiceshop,inhouse \
|
||||
--sets owasp,sard,nodegoat,juiceshop,railsgoat,dvwa,dvpwa,gosec,rustsec,inhouse \
|
||||
--output "$OUTPUT_DIR" \
|
||||
--budget "$BUDGET_FILE" \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"}
|
||||
|
|
|
|||
|
|
@ -362,15 +362,34 @@ def main() -> int:
|
|||
default="",
|
||||
help="path to budget.toml (per-(cap,lang) thresholds)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--lang",
|
||||
default="",
|
||||
help=(
|
||||
"comma-separated language allowlist (python, javascript, php, "
|
||||
"ruby, go, rust, ...). When set, only findings AND ground-truth "
|
||||
"entries whose source language is in the list are tabulated; "
|
||||
"everything else is dropped before tallying. Used by the Phase 29 "
|
||||
"polyglot corpora (Track R.2) to scope a single-language corpus to "
|
||||
"its target language so incidental third-party assets in other "
|
||||
"languages — e.g. the vendored JavaScript a Rails or aiohttp app "
|
||||
"bundles — do not pollute that corpus's per-cap metrics. Empty = "
|
||||
"no language filter (every finding tabulated, the OWASP/JSTS "
|
||||
"default)."
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"--diff",
|
||||
default="",
|
||||
help="path to a previous results JSON; fail on monotonic-improvement regression",
|
||||
)
|
||||
args = p.parse_args()
|
||||
lang_filter = {l.strip() for l in args.lang.split(",") if l.strip()}
|
||||
|
||||
scan_data = load_json(args.scan)
|
||||
findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
|
||||
if lang_filter:
|
||||
findings = [f for f in findings if lang_of(f) in lang_filter]
|
||||
|
||||
# ── Manual-triage stamping (Phase 31 follow-up) ───────────────────────
|
||||
# Cross-reference Confirmed rows against a manual-triage file before
|
||||
|
|
@ -463,6 +482,10 @@ def main() -> int:
|
|||
# Ground truth format: list of {"path": ..., "line": ..., "cap": ..., "vuln": bool}
|
||||
gt_true: list[dict] = []
|
||||
for entry in gt if isinstance(gt, list) else []:
|
||||
# Honour the same language scope as the findings filter so recall
|
||||
# is measured only over the corpus's target language.
|
||||
if lang_filter and lang_of(entry) not in lang_filter:
|
||||
continue
|
||||
if entry.get("vuln"):
|
||||
gt_true.append({
|
||||
"path": entry.get("path", ""),
|
||||
|
|
|
|||
|
|
@ -168,7 +168,16 @@ def test_committed_gt_matches_manifest(tmp: Path) -> None:
|
|||
# Offline half of the CI in-sync guard: the committed ground-truth JSON
|
||||
# must be exactly what a fresh conversion of its manifest produces. This
|
||||
# catches a manifest edit that was not followed by a regenerate.
|
||||
for name in ("nodegoat", "juiceshop"):
|
||||
for name in (
|
||||
"nodegoat",
|
||||
"juiceshop",
|
||||
# Track R.2 polyglot corpora (Phase 29).
|
||||
"railsgoat",
|
||||
"dvwa",
|
||||
"dvpwa",
|
||||
"gosec",
|
||||
"rustsec",
|
||||
):
|
||||
man = GT_DIR / f"{name}.manifest.toml"
|
||||
committed = GT_DIR / f"{name}.json"
|
||||
assert man.exists(), f"missing manifest: {man}"
|
||||
|
|
@ -181,6 +190,39 @@ def test_committed_gt_matches_manifest(tmp: Path) -> None:
|
|||
)
|
||||
|
||||
|
||||
def test_negative_control_emits_empty(tmp: Path) -> None:
    # A manifest that sets `negative_control = true` and declares no
    # [[entry]] tables (the RustSec advisory-db shape) must convert cleanly:
    # exit 0, an empty `[]` ground truth on disk, and a stdout note naming
    # the corpus as a negative control.
    manifest_path = tmp / "neg.manifest.toml"
    manifest_lines = [
        'corpus = "rustsec"',
        'upstream = "https://example.test/advisory-db"',
        'pinned_ref = "main"',
        "negative_control = true",
    ]
    manifest_path.write_text("\n".join(manifest_lines) + "\n")

    ground_truth_path = tmp / "neg.json"
    result = run_convert(
        "--manifest", str(manifest_path),
        "--output", str(ground_truth_path),
    )

    assert result.returncode == 0, result.stdout + result.stderr
    emitted = ground_truth_path.read_text()
    assert json.loads(emitted) == [], emitted
    assert "negative-control corpus" in result.stdout, result.stdout
|
||||
|
||||
|
||||
def test_negative_control_with_entries_rejected(tmp: Path) -> None:
    # The flag and [[entry]] tables are mutually exclusive. A manifest that
    # sets `negative_control = true` while also listing a vuln entry must
    # make the converter exit 1 with an error that mentions both the flag
    # and the "zero" entries requirement, so a real positive cannot be
    # silently hidden behind the flag.
    manifest_path = tmp / "neg_bad.manifest.toml"
    manifest_path.write_text(
        "negative_control = true\n"
        "[[entry]]\n"
        'path = "a.rs"\n'
        'cap = "cmdi"\n'
        "vuln = true\n"
    )

    ground_truth_path = tmp / "neg_bad.json"
    result = run_convert(
        "--manifest", str(manifest_path),
        "--output", str(ground_truth_path),
    )

    assert result.returncode == 1, result.stdout + result.stderr
    err = result.stderr
    assert "negative_control" in err and "zero" in err, err
|
||||
|
||||
|
||||
def main() -> int:
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
tmp = Path(td)
|
||||
|
|
@ -193,6 +235,8 @@ def main() -> int:
|
|||
test_malformed_manifest_exits_1,
|
||||
test_empty_manifest_exits_1,
|
||||
test_committed_gt_matches_manifest,
|
||||
test_negative_control_emits_empty,
|
||||
test_negative_control_with_entries_rejected,
|
||||
):
|
||||
sub = tmp / fn.__name__
|
||||
sub.mkdir()
|
||||
|
|
|
|||
|
|
@ -294,6 +294,65 @@ def test_manual_triage_ignores_vuln_true_entries(tmp: Path) -> None:
|
|||
)
|
||||
|
||||
|
||||
def test_lang_filter_scopes_findings_and_gt(tmp: Path) -> None:
    # Phase 29 (Track R.2): `--lang` restricts a single-language corpus to
    # its target language, so assets in other languages (e.g. the vendored
    # JavaScript a Rails app bundles) cannot pollute its per-cap metrics.
    # The scope has to apply to findings AND to ground-truth entries.
    gt = tmp / "gt.json"
    ground_truth_rows = [
        {"path": "app/models/user.rb", "line": 0, "cap": "sqli", "vuln": True},
        {"path": "app/assets/lib.js", "line": 0, "cap": "sqli", "vuln": True},
    ]
    write_json(gt, ground_truth_rows)

    scan = tmp / "scan.json"
    ruby_finding = python_finding(SINK_BIT_SQL, "/x/app/models/user.rb", 10, "NotConfirmed")
    # A vendored-JS finding nyx would otherwise Confirm; under `--lang ruby`
    # it must be excluded entirely.
    js_finding = python_finding(SINK_BIT_SQL, "/x/app/assets/lib.js", 10, "Confirmed")
    write_json(scan, {"findings": [ruby_finding, js_finding]})

    def tabulated_cells(results_name: str, *scope_args: str) -> list:
        # Start from an empty results file, run tabulate with the optional
        # scope arguments, and return the cells of the last appended row.
        results = tmp / results_name
        write_json(results, [])
        proc = run_tabulate(
            "--label", "railsgoat",
            "--scan", str(scan),
            "--ground-truth", str(gt),
            *scope_args,
            "--append", str(results),
        )
        assert proc.returncode == 0, proc.stdout + proc.stderr
        return json.loads(results.read_text())[-1]["cells"]

    # Without a scope, both language cells are present.
    unscoped_keys = {(c["cap"], c["lang"]) for c in tabulated_cells("unscoped.json")}
    assert ("sqli", "ruby") in unscoped_keys and ("sqli", "javascript") in unscoped_keys, unscoped_keys

    # Scoped to ruby, the JS finding and the JS ground-truth positive vanish.
    scoped = {
        (c["cap"], c["lang"]): c
        for c in tabulated_cells("scoped.json", "--lang", "ruby")
    }
    assert ("sqli", "javascript") not in scoped, f"JS must be filtered out: {list(scoped)}"
    ruby_cell = scoped[("sqli", "ruby")]
    assert ruby_cell["tp"] == 1 and ruby_cell["fn"] == 0, ruby_cell
    # The dropped JS positive must not come back as a phantom FN in any cell.
    assert not any(lang == "javascript" for _cap, lang in scoped), scoped
|
||||
|
||||
|
||||
def test_budget_malformed_exits_3(tmp: Path) -> None:
|
||||
bad = tmp / "bad.toml"
|
||||
bad.write_text("[default]\nunsupported_rate = not_a_number\n")
|
||||
|
|
@ -601,6 +660,7 @@ def main() -> int:
|
|||
test_diff_passes_on_improvement,
|
||||
test_manual_triage_stamps_wrong_confirmed,
|
||||
test_manual_triage_ignores_vuln_true_entries,
|
||||
test_lang_filter_scopes_findings_and_gt,
|
||||
test_budget_malformed_exits_3,
|
||||
test_relative_gt_path_suffix_matches_absolute_finding,
|
||||
test_unmatched_gt_positive_lands_in_lang_cell,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue