# Mirror of https://github.com/elicpeter/nyx.git
# (synced 2026-06-09 19:45:13 +02:00; 352 lines, 12 KiB, TOML).

# Eval corpus budget.
#
# `report.py` enforces these values when `run.sh` or `run_full.sh` passes
# `--budget`. Each (cap, lang) cell uses the default row unless a specific
# override appears below.
#
# Wall-clock cost is measured separately from this per-cell budget.
#
# Schema:
#
#   [default]
#   unsupported_rate     = 0.20   # max(Unsupported / total) per cell
#   false_confirmed_rate = 0.02   # max(wrong / Confirmed) per cap
#   repro_stability      = 0.95   # min(stable / Confirmed) per cell
#   confirmed_rate       = 0.40   # min(Confirmed / total) per cell (omit to skip)
#   ratchet_deadline     = "..."  # informational; cells already at headline
#
#   [[cell]]
#   cap  = "..."
#   lang = "..."
#   <overrides as above>
#
# `cap` matches `tabulate.py`'s _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
# `lang` matches the ext_map values (`python`, `javascript`, …).
# A wildcard `"*"` matches any cell that does not have an exact entry.
#
# Each rate is enforced only when the relevant denominator is non-zero, so a
# cell with no findings (or no Confirmed findings) never trips a budget
# vacuously. `confirmed_rate` (a ratchet floor) and `repro_stability` are
# *minima*; `unsupported_rate` and `false_confirmed_rate` are maxima. Per-cell
# overrides are calibrated to the measured frontier on the real corpus so the
# gate locks in current performance and catches regressions (see the OWASP
# cells below).

# Fallback budget for every (cap, lang) cell without a matching [[cell]] entry.
[default]
# Maximum share of a cell's findings that may be Unsupported.
unsupported_rate = 0.20
# Maximum share of Confirmed findings that may be wrong.
false_confirmed_rate = 0.02
# Minimum share of Confirmed findings that must be stable across reruns.
repro_stability = 0.95
# Informational only. `confirmed_rate` is omitted here, so that floor is
# skipped unless a [[cell]] sets it.
ratchet_deadline = "2026-05-15"

# ── OWASP Benchmark v1.2 (Java) — Track R.0 ratchet ──────────────────────────
#
# Calibrated against the pinned 1.2beta corpus, nyx c0501884, 2026-05-29
# (`nyx scan --verify` over all 2740 BenchmarkTest files; 5812 findings).
#
# Measured frontier at calibration:
#   verdicts : Confirmed 0 | NotConfirmed 4077 | Inconclusive 1725
#              (BuildFailed 952 + SpecDerivationFailed 773) | Unsupported 10
#   per cell : unsupported_rate     <= 1.7%  (headline <= 20% -> MET)
#              false_confirmed_rate  = 0%    (headline <= 2%  -> MET, 0 confirms)
#              confirmed_rate        = 0%    (headline >= 40% -> NOT met)
#
# The verifier confirms nothing on OWASP yet: every BenchmarkTest is a servlet
# whose harness lands in BuildFailed / SpecDerivationFailed (Java servlet entry
# wiring + classpath are Track L.12 / Track O.0 work). So the enforced limits
# below are the two headline maxima the verifier already satisfies
# (unsupported_rate, false_confirmed_rate). `confirmed_rate` is intentionally
# left UNSET — the headline >= 40% is the ratchet's destination, recorded here
# and in scripts/m7_ship_gate.sh (NYX_OWASP_FLOOR_CAPS), not a floor we can
# honestly assert at 0 confirms. Promote a cap into the gated set (and add its
# `confirmed_rate`) the moment it starts Confirming.
#
# Caps split two ways:
#   sound-oracle (injection): cmdi, sqli, path_traversal, ldap_injection,
#     xpath_injection — once their servlet harnesses build, a runtime oracle
#     exists; these are the GATE6_FLOOR_CAPS candidates.
#   no-sound-oracle (config/usage smell): crypto (weak rand/hash), auth
#     (insecure cookie), xss/trustbound — Phase-11 routes these to
#     Unsupported(SoundOracleUnavailable); they stay report-only. When that
#     routing lands, their unsupported_rate will rise and these cells must be
#     relaxed accordingly.

# OWASP Benchmark (java): sound-oracle (injection) caps.
[[cell]]
cap = "cmdi"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "sqli"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "path_traversal"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "ldap_injection"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "xpath_injection"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# OWASP Benchmark (java): no-sound-oracle (config/usage smell) caps.
[[cell]]
cap = "xss"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "auth"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# ── NodeGoat / Juice Shop (JS/TS) — Track R.1 ratchet ────────────────────────
#
# Phase 28 wires two intentionally-vulnerable JS/TS apps into the same
# acceptance machinery as OWASP Benchmark: OWASP NodeGoat (Express, .js)
# and OWASP Juice Shop (TypeScript, .ts). Unlike OWASP Benchmark, neither
# app ships vuln/benign *pairs* — every labelled file is `vuln = true` (see
# ground_truth/{nodegoat,juiceshop}.manifest.toml). Two consequences for
# these cells:
#
#   * false_confirmed_rate (<= 2%) is the headline maximum the verifier
#     already satisfies and is HARD-enforced: it only trips when a Confirmed
#     finding lands on a file with no ground-truth positive, i.e. an
#     over-confirm. With the verifier confirming little on real corpora yet,
#     it is satisfied, and it ratchets precision as confirms grow.
#   * unsupported_rate (<= 20%) is HARD-enforced too. `Unsupported` counts
#     only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable —
#     a narrow bucket that Tracks J + M shrank — *not* BuildFailed /
#     SpecDerivationFailed (those are Inconclusive), so it stays low.
#
# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the
# Phase 28 acceptance DESTINATIONS. They are intentionally left UNSET here
# and published report-only by Gate 7 (NYX_JSTS_FLOOR_CAPS empty by default,
# mirroring NYX_OWASP_FLOOR_CAPS) because (a) the verifier does not yet
# Confirm these corpora end to end and (b) the manifest labels canonical
# vulns only, so precision vs partial ground truth is informational until
# the labels are completed. Promote a cap into the floor set the moment it
# starts Confirming, exactly as for OWASP.

# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml.
[[cell]]
cap = "cmdi"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "xss"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "unauthorized_id"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml.
[[cell]]
cap = "sqli"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "path_traversal"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "redirect"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "ssrf"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ──────────────
#
# Phase 29 wires five more intentionally-vulnerable real corpora, one per
# remaining language family, into the same acceptance machinery as OWASP /
# NodeGoat / Juice Shop:
#
#   * railsgoat — OWASP RailsGoat (Rails, .rb)
#   * dvwa      — Damn Vulnerable Web Application (PHP); ships graded
#                 source variants, so low.php = vuln and impossible.php =
#                 benign control — real vuln/benign PAIRS like OWASP.
#   * dvpwa     — Damn Vulnerable Python Web App (aiohttp, .py); its
#                 parameterized DAO siblings are benign controls for the
#                 one `%`-formatted SQL sink.
#   * gosec     — the Go SAST tool's own repo; the scannable, `// want`-
#                 annotated sample under goanalysis/testdata is the curated
#                 ground truth (its embedded-string rule samples are not
#                 scannable, so they are unlabelled).
#   * rustsec   — RustSec advisory-db: a NEGATIVE CONTROL. It ships
#                 advisory metadata, not vulnerable .rs source, so its
#                 ground truth is empty by construction; the row asserts the
#                 Rust scan/verify path runs at scale within wall-clock and
#                 Confirms NOTHING (any Confirmed Rust finding there is a
#                 false confirm and trips the default false_confirmed_rate).
#
# Each row is gated with the SAME policy as Gates 6/7 (scripts/m7_ship_gate.sh
# Gate 8): wall-clock + the per-(cap,lang) budget below are HARD-enforced;
# per-cap confirmed-rate / precision / recall are published report-only
# (NYX_POLYGLOT_FLOOR_CAPS empty by default). Because each corpus targets a
# single language, Gate 8 scopes tabulation to that language (tabulate.py
# --lang), so the vendored third-party JavaScript these Ruby/Python apps
# bundle (bootstrap-colorpicker, materialize, …) — which nyx confirms as
# prototype_pollution — does not pollute the corpus's per-cap metrics. Those
# JS findings are still emitted; they are simply out of scope for a Ruby /
# Python corpus.
#
# Calibrated against the pinned corpora (nyx HEAD of the Phase 29 branch,
# 2026-05-31) with `nyx scan --verify --index off`. Measured frontier
# (target-language scope): every curated cell sits at <= the headline maxima
# below EXCEPT cmdi, where findings that carry a SHELL_ESCAPE sanitizer cap
# (all of them for ruby and go, ~69% for php) are routed to
# Unsupported(SoundOracleUnavailable) — the same no-sound-oracle treatment
# OWASP's crypto/auth cells get. RailsGoat's deserialize (Marshal.load) and
# redirect (open redirect) cells Confirm end to end with zero false confirms —
# the first real polyglot confirms.

# railsgoat (ruby): caps with a ground-truth label in railsgoat.manifest.toml.
[[cell]]
cap = "auth"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "deserialize"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "redirect"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "path_traversal"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# cmdi/ruby is incidental (RailsGoat's `self.try(params[:graph])` reflection
# sink); the lone finding carries a SHELL_ESCAPE sanitizer cap and routes to
# Unsupported(SoundOracleUnavailable), so unsupported_rate is locked at the
# measured frontier (1/1). The false-confirm guard stays at the headline 2%.
[[cell]]
cap = "cmdi"
lang = "ruby"
unsupported_rate = 1.00
false_confirmed_rate = 0.02

# dvwa (php): caps with a ground-truth label in dvwa.manifest.toml.
[[cell]]
cap = "sqli"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "redirect"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "header_injection"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# cmdi/php: DVWA's ping handlers reach shell_exec through a SHELL_ESCAPE
# sanitizer cap, so ~69% of the cell's findings route to
# Unsupported(SoundOracleUnavailable). unsupported_rate is locked to that
# frontier with margin (a regression above 75% fails); false-confirm at 2%.
[[cell]]
cap = "cmdi"
lang = "php"
unsupported_rate = 0.75
false_confirmed_rate = 0.02

# dvpwa (python): caps with a ground-truth label in dvpwa.manifest.toml.
[[cell]]
cap = "sqli"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "auth"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# gosec (go): caps with a ground-truth label in gosec.manifest.toml.
[[cell]]
cap = "crypto"
lang = "go"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# cmdi/go: the goanalysis/testdata exec.Command sample reaches the sink
# through a SHELL_ESCAPE sanitizer cap, so every cmdi/go finding routes to
# Unsupported(SoundOracleUnavailable). unsupported_rate locked to the
# measured frontier (3/3); false-confirm at the headline 2%.
[[cell]]
cap = "cmdi"
lang = "go"
unsupported_rate = 1.00
false_confirmed_rate = 0.02