# Eval corpus budget.
#
# `report.py` enforces these values when `run.sh` or `run_full.sh` pass
# `--budget`. Each (cap, lang) cell uses the default row unless a specific
# override appears below.
#
# Wall-clock cost is measured separately from this per-cell budget.
#
# Schema:
#
#   [default]
#   unsupported_rate     = 0.20   # max(Unsupported / total)  per cell
#   false_confirmed_rate = 0.02   # max(wrong / Confirmed)    per cap
#   repro_stability      = 0.95   # min(stable / Confirmed)   per cell
#   confirmed_rate       = 0.40   # min(Confirmed / total)    per cell (omit to skip)
#   ratchet_deadline     = "..."  # informational; cells already at headline
#
#   [[cell]]
#   cap  = "..."
#   lang = "..."
#
# `cap` matches `tabulate.py`'s _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
# `lang` matches the ext_map values (`python`, `javascript`, …).
# A wildcard `"*"` matches any cell that does not have an exact entry.
#
# Each rate is enforced only when the relevant denominator is non-zero, so a
# cell with no findings (or no Confirmed findings) never trips a budget
# vacuously. `confirmed_rate` is a *minimum* (a ratchet floor) and
# `repro_stability` is likewise a minimum; `unsupported_rate` and
# `false_confirmed_rate` are maxima. Per-cell overrides are calibrated to the
# measured frontier on the real corpus so the gate locks in current
# performance and catches regressions (see the OWASP cells below).

# Fallback budget for every (cap, lang) cell without an exact [[cell]] entry.
# `confirmed_rate` is deliberately omitted here, which skips that check.
[default]
unsupported_rate = 0.20
false_confirmed_rate = 0.02
repro_stability = 0.95
ratchet_deadline = "2026-05-15"

# ── OWASP Benchmark v1.2 (Java) — Track R.0 ratchet ──────────────────────────
#
# Calibrated against the pinned 1.2beta corpus, nyx c0501884, 2026-05-29
# (`nyx scan --verify` over all 2740 BenchmarkTest files; 5812 findings).
#
# Measured frontier at calibration:
#   verdicts : Confirmed 0 | NotConfirmed 4077 | Inconclusive 1725
#              (BuildFailed 952 + SpecDerivationFailed 773) | Unsupported 10
#   per cell : unsupported_rate <= 1.7%  (headline <= 20% -> MET)
#              false_confirmed   = 0%    (headline <= 2%  -> MET, 0 confirms)
#              confirmed_rate    = 0%    (headline >= 40% -> NOT met)
#
# The verifier confirms nothing on OWASP yet: every BenchmarkTest is a servlet
# whose harness lands in BuildFailed / SpecDerivationFailed (Java servlet entry
# wiring + classpath are Track L.12 / Track O.0 work). So the budgets enforced
# below are the two headline maxima the verifier already satisfies
# (unsupported_rate, false_confirmed_rate). `confirmed_rate` is intentionally
# left UNSET — the headline >= 40% is the ratchet's destination, recorded here
# and in scripts/m7_ship_gate.sh (NYX_OWASP_FLOOR_CAPS), not a floor we can
# honestly assert at 0 confirms. Promote a cap into the gated set (and add its
# `confirmed_rate`) the moment it starts Confirming.
#
# Caps split two ways:
#   sound-oracle (injection): cmdi, sqli, path_traversal, ldap_injection,
#     xpath_injection — once their servlet harnesses build, a runtime oracle
#     exists; these are the GATE6_FLOOR_CAPS candidates.
#   no-sound-oracle (config/usage smell): crypto (weak rand/hash), auth
#     (insecure cookie), xss/trustbound — Phase-11 routes these to
#     Unsupported(SoundOracleUnavailable); they stay report-only. When that
#     routing lands their unsupported_rate will rise and these cells must be
#     relaxed accordingly.
# OWASP Benchmark (java) cells. Each pins the two headline maxima explicitly;
# `confirmed_rate` is left unset on purpose until a cap starts Confirming.

[[cell]]
cap = "cmdi"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "sqli"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "path_traversal"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "ldap_injection"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "xpath_injection"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "xss"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "auth"
lang = "java"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# ── NodeGoat / Juice Shop (JS/TS) — Track R.1 ratchet ────────────────────────
#
# Phase 28 wires two intentionally-vulnerable JS/TS apps into the same
# acceptance machinery as OWASP Benchmark: OWASP NodeGoat (Express, .js)
# and OWASP Juice Shop (TypeScript, .ts). Unlike OWASP Benchmark, neither
# app ships vuln/benign *pairs* — every labelled file is `vuln = true` (see
# ground_truth/{nodegoat,juiceshop}.manifest.toml). Two consequences for
# these cells:
#
#   * false_confirmed_rate (<= 2%) is the headline maximum the verifier
#     already satisfies and is HARD-enforced: it only trips when a Confirmed
#     finding lands on a file with no ground-truth positive, i.e. an
#     over-confirm. With the verifier confirming little on real corpora yet
#     it is satisfied, and it ratchets precision as confirms grow.
#   * unsupported_rate (<= 20%) is HARD-enforced too. `Unsupported` counts
#     only NoPayloadsForCap / EntryKindUnsupported / SoundOracleUnavailable —
#     a narrow bucket that Tracks J + M shrank — *not* BuildFailed /
#     SpecDerivationFailed (those are Inconclusive), so it stays low.
#
# confirmed_rate (>= 40%), precision (>= 0.85) and recall (>= 0.40) are the
# Phase 28 acceptance DESTINATIONS.
# They are intentionally left UNSET here and published report-only by Gate 7
# (NYX_JSTS_FLOOR_CAPS empty by default, mirroring NYX_OWASP_FLOOR_CAPS)
# because (a) the verifier does not yet Confirm these corpora end to end and
# (b) the manifest labels canonical vulns only, so precision vs partial ground
# truth is informational until the labels are completed. Promote a cap into
# the floor set the moment it starts Confirming, exactly as for OWASP.

# NodeGoat (javascript): caps with a ground-truth label in nodegoat.manifest.toml.

[[cell]]
cap = "cmdi"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "xss"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "unauthorized_id"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "javascript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# Juice Shop (typescript): caps with a ground-truth label in juiceshop.manifest.toml.

[[cell]]
cap = "sqli"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "path_traversal"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "redirect"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "ssrf"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "typescript"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# ── Polyglot real corpora (Ruby/PHP/Python/Go/Rust) — Track R.2 ──────────────
#
# Phase 29 wires five more intentionally-vulnerable real corpora, one per
# remaining language family, into the same acceptance machinery as OWASP /
# NodeGoat / Juice Shop:
#
#   * railsgoat — OWASP RailsGoat (Rails, .rb)
#   * dvwa      — Damn Vulnerable Web Application (PHP); ships graded
#                 source variants, so low.php = vuln and impossible.php =
#                 benign control — real vuln/benign PAIRS like OWASP.
#   * dvpwa     — Damn Vulnerable Python Web App (aiohttp, .py); its
#                 parameterized DAO siblings are benign controls for the
#                 one `%`-formatted SQL sink.
#   * gosec     — the Go SAST tool's own repo; the scannable, `// want`-
#                 annotated sample under goanalysis/testdata is the curated
#                 ground truth (its embedded-string rule samples are not
#                 scannable, so they are unlabelled).
#   * rustsec   — RustSec advisory-db: a NEGATIVE CONTROL. It ships
#                 advisory metadata, not vulnerable .rs source, so its
#                 ground truth is empty by construction; the row asserts the
#                 Rust scan/verify path runs at scale within wall-clock and
#                 Confirms NOTHING (any Confirmed Rust finding there is a
#                 false confirm and trips the default false_confirmed_rate).
#
# Each row is gated with the SAME policy as Gates 6/7 (scripts/m7_ship_gate.sh
# Gate 8): wall-clock + the per-(cap,lang) budget below are HARD-enforced;
# per-cap confirmed-rate / precision / recall are published report-only
# (NYX_POLYGLOT_FLOOR_CAPS empty by default). Because each corpus targets a
# single language, Gate 8 scopes tabulation to that language (tabulate.py
# --lang), so the vendored third-party JavaScript these Ruby/Python apps
# bundle (bootstrap-colorpicker, materialize, …) — which nyx confirms as
# prototype_pollution — does not pollute the corpus's per-cap metrics. Those
# JS findings are still emitted; they are simply out of scope for a Ruby /
# Python corpus.
#
# Calibrated against the pinned corpora (nyx HEAD of the Phase 29 branch,
# 2026-05-31) with `nyx scan --verify --index off`. Measured frontier
# (target-language scope): every curated cell sits at <= the headline maxima
# below EXCEPT cmdi, where findings that carry a SHELL_ESCAPE sanitizer cap
# (all of them for ruby and go, ~69% for php — see the per-cell notes below)
# are routed to Unsupported(SoundOracleUnavailable) — the same
# no-sound-oracle treatment OWASP's crypto/auth cells get.
# RailsGoat's deserialize (Marshal.load) and redirect (open redirect) cells
# Confirm end to end with zero false confirms — the first real polyglot
# confirms.

# railsgoat (ruby): caps with a ground-truth label in railsgoat.manifest.toml.

[[cell]]
cap = "auth"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "deserialize"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "redirect"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "path_traversal"
lang = "ruby"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# cmdi/ruby is incidental (RailsGoat's `self.try(params[:graph])` reflection
# sink); the lone finding carries a SHELL_ESCAPE sanitizer cap and routes to
# Unsupported(SoundOracleUnavailable), so unsupported_rate is locked at the
# measured frontier (1/1). The false-confirm guard stays at the headline 2%.
[[cell]]
cap = "cmdi"
lang = "ruby"
unsupported_rate = 1.00
false_confirmed_rate = 0.02

# dvwa (php): caps with a ground-truth label in dvwa.manifest.toml.

[[cell]]
cap = "sqli"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "redirect"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "header_injection"
lang = "php"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# cmdi/php: DVWA's ping handlers reach shell_exec through a SHELL_ESCAPE
# sanitizer cap, so ~69% of the cell's findings route to
# Unsupported(SoundOracleUnavailable). unsupported_rate is locked to that
# frontier with margin (a regression above 75% fails); false-confirm at 2%.
[[cell]]
cap = "cmdi"
lang = "php"
unsupported_rate = 0.75
false_confirmed_rate = 0.02

# dvpwa (python): caps with a ground-truth label in dvpwa.manifest.toml.
# Python cells (the dvpwa corpus): each pins the two headline maxima.

[[cell]]
cap = "sqli"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "crypto"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

[[cell]]
cap = "auth"
lang = "python"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# gosec (go): caps with a ground-truth label in gosec.manifest.toml.

[[cell]]
cap = "crypto"
lang = "go"
unsupported_rate = 0.20
false_confirmed_rate = 0.02

# cmdi/go: the goanalysis/testdata exec.Command sample reaches the sink
# through a SHELL_ESCAPE sanitizer cap, so every cmdi/go finding routes to
# Unsupported(SoundOracleUnavailable). unsupported_rate locked to the
# measured frontier (3/3); false-confirm at the headline 2%.
[[cell]]
cap = "cmdi"
lang = "go"
unsupported_rate = 1.00
false_confirmed_rate = 0.02