mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
[pitboss] phase 29: Track I — Per-cell budgets, --diff, fixture prerequisites, CI matrix expansion
This commit is contained in:
parent
760bc1beb2
commit
dd607fb4b3
10 changed files with 1325 additions and 32 deletions
152
.github/workflows/dynamic.yml
vendored
Normal file
152
.github/workflows/dynamic.yml
vendored
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
# Phase 29 (Track I): dedicated dynamic-verification matrix.
|
||||
#
|
||||
# Three rows exercise the dynamic harness pipeline (`cargo nextest run
|
||||
# --features dynamic`) under the host configurations the Phase 17–28
|
||||
# tracks documented as supported:
|
||||
#
|
||||
# linux-process-only — Ubuntu host, no docker daemon. Forces the
|
||||
# process backend and exercises the Phase 17
|
||||
# Linux hardening primitives (chroot, seccomp,
|
||||
# unshare, no_new_privs). `libc6-dev` is
|
||||
# installed so the hardening probe + escape
|
||||
# suite can `cc -static`; without it the
|
||||
# chroot-leg of the escape suite skips silently
|
||||
# (Phase 20 follow-up #4 in deferred.md).
|
||||
#
|
||||
# linux-with-docker — Ubuntu host with docker-in-docker. Exercises
|
||||
# the docker backend (Phase 19) and the
|
||||
# differential-confirmation parity tests.
|
||||
#
|
||||
# macos — macOS-latest, no docker. Exercises the
|
||||
# Phase-18 `sandbox-exec` primitives plus the
|
||||
# process backend on Darwin. Track-I acceptance
|
||||
# literal: "cargo nextest run --features dynamic
|
||||
# is green on macOS without docker."
|
||||
|
||||
name: dynamic
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ["master"]
|
||||
pull_request:
|
||||
branches: ["master"]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
linux-process-only:
|
||||
name: dynamic / linux-process-only
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
# Force the process backend even when callers default to Auto so
|
||||
# docker-unavailable paths cannot accidentally hide a regression.
|
||||
NYX_SANDBOX_BACKEND: process
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
cache: true
|
||||
|
||||
- uses: taiki-e/install-action@nextest
|
||||
|
||||
# Phase 17 / Phase 20 follow-up: the hardening probe + escape
|
||||
# suite chroot leg need static glibc. Without these packages the
|
||||
# `cc -static probe.c` step in tests/sandbox_hardening_linux.rs +
|
||||
# tests/sandbox_escape_suite.rs falls back to dynamic linking and
|
||||
# the chroot leg silently skips.
|
||||
- name: Install fixture prerequisites (static libc)
|
||||
run: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y --no-install-recommends libc6-dev libc-dev-bin
|
||||
|
||||
- name: Smoke-test interpreter availability
|
||||
run: |
|
||||
python3 --version
|
||||
node --version || sudo apt-get install -y --no-install-recommends nodejs
|
||||
ruby --version || true
|
||||
php --version || true
|
||||
|
||||
- name: Dynamic suite (process backend only)
|
||||
run: cargo nextest run --features dynamic
|
||||
|
||||
linux-with-docker:
|
||||
name: dynamic / linux-with-docker
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
docker:
|
||||
image: docker:dind
|
||||
options: --privileged
|
||||
env:
|
||||
DOCKER_TLS_CERTDIR: ""
|
||||
DOCKER_HOST: tcp://docker:2375
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
cache: true
|
||||
|
||||
- uses: taiki-e/install-action@nextest
|
||||
|
||||
- name: Install fixture prerequisites (static libc)
|
||||
run: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y --no-install-recommends libc6-dev libc-dev-bin
|
||||
|
||||
- name: Pull language images for sandbox tests
|
||||
run: |
|
||||
docker pull python:3-slim
|
||||
docker pull node:20-slim
|
||||
docker pull eclipse-temurin:21-jre-jammy
|
||||
docker pull php:8-cli
|
||||
|
||||
- name: Smoke-test docker interpreter availability
|
||||
run: |
|
||||
docker run --rm python:3-slim python3 --version
|
||||
docker run --rm node:20-slim node --version
|
||||
docker run --rm eclipse-temurin:21-jre-jammy java -version
|
||||
docker run --rm php:8-cli php --version
|
||||
|
||||
- name: Dynamic suite (process + docker backends)
|
||||
run: cargo nextest run --features dynamic
|
||||
|
||||
macos:
|
||||
name: dynamic / macos
|
||||
runs-on: macos-latest
|
||||
env:
|
||||
# macOS runners ship without docker; force process backend so the
|
||||
# `Auto` resolver in src/dynamic/sandbox.rs cannot accidentally
|
||||
# pick up a stray Lima/Colima daemon and confuse the matrix.
|
||||
NYX_SANDBOX_BACKEND: process
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
cache: true
|
||||
|
||||
- uses: taiki-e/install-action@nextest
|
||||
|
||||
- name: Smoke-test sandbox-exec availability
|
||||
run: |
|
||||
/usr/bin/sandbox-exec -p '(version 1)(allow default)' /bin/echo ok
|
||||
|
||||
- name: Smoke-test interpreter availability
|
||||
run: |
|
||||
python3 --version
|
||||
node --version
|
||||
ruby --version
|
||||
|
||||
# Phase 29 acceptance literal: "cargo nextest run --features
|
||||
# dynamic is green on macOS without docker (process-only row)."
|
||||
- name: Dynamic suite (macOS, process backend)
|
||||
run: cargo nextest run --features dynamic
|
||||
|
|
@ -6,6 +6,7 @@
|
|||
#
|
||||
# Usage:
|
||||
# scripts/m7_ship_gate.sh [--nyx BIN] [--corpus-dir DIR] [--skip GATE,...]
|
||||
# [--budget FILE] [--diff FILE]
|
||||
#
|
||||
# Gates:
|
||||
# 1. unsupported-rate — per-cell (cap × lang) Unsupported% within budget
|
||||
|
|
@ -13,6 +14,11 @@
|
|||
# 3. wall-clock — default scan ≤ 2× static-only on bench suite
|
||||
# 4. sandbox-escape — sandbox escape suite green for all langs
|
||||
# 5. repro-stability — repro artifact regenerates identical verdict ≥ 95%
|
||||
#
|
||||
# Phase 29 (Track I): Gate 1 consumes per-cell budgets from
|
||||
# `tests/eval_corpus/budget.toml` and, when `--diff PREV.json` is
|
||||
# supplied, fails on any monotonic-improvement regression vs the
|
||||
# previous run.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
|
|
@ -23,12 +29,17 @@ CORPUS_DIR="${CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
|
|||
SKIP_GATES=""
|
||||
GATE_ERRORS=0
|
||||
GATE_LOG="${REPO_ROOT}/target/m7_gate.log"
|
||||
# Phase 29 (Track I): per-cell budgets + monotonic diff.
|
||||
BUDGET_FILE="${BUDGET_FILE:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
|
||||
DIFF_FILE="${DIFF_FILE:-}"
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--nyx) NYX_BIN="$2"; shift 2 ;;
|
||||
--corpus-dir) CORPUS_DIR="$2"; shift 2 ;;
|
||||
--skip) SKIP_GATES="$2"; shift 2 ;;
|
||||
--budget) BUDGET_FILE="$2"; shift 2 ;;
|
||||
--diff) DIFF_FILE="$2"; shift 2 ;;
|
||||
*) shift ;;
|
||||
esac
|
||||
done
|
||||
|
|
@ -45,28 +56,46 @@ mkdir -p "$(dirname "$GATE_LOG")"
|
|||
echo "# M7 ship gate — $(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$GATE_LOG"
|
||||
info "nyx: $NYX_BIN"
|
||||
info "corpus: $CORPUS_DIR"
|
||||
info "budget: $BUDGET_FILE"
|
||||
info "diff: ${DIFF_FILE:-<none>}"
|
||||
info ""
|
||||
|
||||
# ── Gate 1: Unsupported-rate budget ─────────────────────────────────────────
|
||||
# ── Gate 1: Per-cell budget + monotonic-improvement diff ───────────────────
|
||||
#
|
||||
# Phase 29 (Track I): the single global Unsupported threshold is replaced
|
||||
# by per-cell (cap × lang) budgets in tests/eval_corpus/budget.toml.
|
||||
# `tests/eval_corpus/run.sh` invokes `tabulate.py` per set and `report.py`
|
||||
# at the end with `--budget` (and `--diff` when DIFF_FILE is set), so
|
||||
# any per-cell failure (or any regression vs the prior run) propagates
|
||||
# back as exit 2.
|
||||
if skip unsupported-rate; then
|
||||
info "Gate 1 (unsupported-rate): SKIPPED"
|
||||
else
|
||||
info "Gate 1: per-cell Unsupported rate within budget..."
|
||||
info "Gate 1: per-cell budget within tolerance + no monotonic regressions..."
|
||||
EVAL_RESULTS="${REPO_ROOT}/target/eval_results.json"
|
||||
echo "[]" > "$EVAL_RESULTS"
|
||||
|
||||
# Run eval corpus runner (in-house set always present).
|
||||
if bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \
|
||||
if [[ ! -f "$BUDGET_FILE" ]]; then
|
||||
die "Gate 1: budget file not found at $BUDGET_FILE"
|
||||
else
|
||||
# Run eval corpus runner (in-house set always present).
|
||||
set +e
|
||||
bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \
|
||||
--nyx "$NYX_BIN" \
|
||||
--sets inhouse \
|
||||
--output "$(dirname "$EVAL_RESULTS")" 2>>"$GATE_LOG"; then
|
||||
# Copy result to our location.
|
||||
cp "$(dirname "$EVAL_RESULTS")/eval_results.json" "$EVAL_RESULTS" 2>/dev/null || true
|
||||
pass "Gate 1: unsupported-rate check passed"
|
||||
else
|
||||
--output "$(dirname "$EVAL_RESULTS")" \
|
||||
--budget "$BUDGET_FILE" \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|
||||
>>"$GATE_LOG" 2>>"$GATE_LOG"
|
||||
RC=$?
|
||||
if [[ $RC -eq 2 ]]; then
|
||||
die "Gate 1: Unsupported rate exceeds budget for one or more (cap, lang) cells"
|
||||
set -e
|
||||
cp "$(dirname "$EVAL_RESULTS")/eval_results.json" "$EVAL_RESULTS" 2>/dev/null || true
|
||||
if [[ $RC -eq 0 ]]; then
|
||||
pass "Gate 1: per-cell budget + diff check passed"
|
||||
elif [[ $RC -eq 2 ]]; then
|
||||
die "Gate 1: per-cell budget exceeded OR monotonic-improvement regression (see $GATE_LOG)"
|
||||
elif [[ $RC -eq 3 ]]; then
|
||||
die "Gate 1: budget/diff configuration is malformed (see $GATE_LOG)"
|
||||
else
|
||||
info "Gate 1: eval runner returned $RC (corpus may not be downloaded; treating as SKIP)"
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -48,6 +48,131 @@ pub enum CopyStrategy {
|
|||
RustEntry,
|
||||
}
|
||||
|
||||
/// Phase 29 (Track I): host-environment prerequisite a fixture needs in
/// order to run. The harness consults the list before staging the
/// fixture; any unsatisfied prerequisite triggers a structured skip
/// rather than a panic, so non-applicable matrix rows (process-only
/// macOS, dockerless CI, missing static libc) still see green ticks.
///
/// Each variant is evaluated by [`check_prerequisites`], which reports an
/// unmet requirement as the corresponding [`SkipReason`].
#[derive(Debug, Clone, PartialEq, Eq)]
// NOTE(review): presumably kept because not every test binary that pulls in
// this module constructs every variant — confirm before removing.
#[allow(dead_code)]
pub enum Prerequisite {
    /// A binary must resolve on `PATH` and respond to `--version` with
    /// exit code 0 (e.g. `python3`, `node`, `go`, `cargo`).
    ///
    /// The payload is the program name exactly as it would be typed on
    /// the command line.
    CommandAvailable(&'static str),
    /// A specific env var must be set (used to gate feature-flagged
    /// suites — e.g. `NYX_ENABLE_FLAKY_FIXTURES=1`).
    ///
    /// Only the presence of the variable is checked; its value is never
    /// compared against anything, so `VAR=0` and `VAR=` also satisfy it.
    EnvVar(&'static str),
    /// The docker daemon must be reachable. Equivalent to
    /// `docker info` returning exit 0.
    DockerAvailable,
    /// A static C library must be linkable. Used by the Phase-17/20
    /// hardening probe fixtures.
    ///
    /// The payload is the *bare linker name* that is appended to `-l`
    /// when probing with `cc -static -l<name>` — e.g. `"c"` to require
    /// `libc.a`. Passing the archive file name itself (`"libc.a"`) would
    /// produce `-llibc.a` and never link.
    StaticLib(&'static str),
}
|
||||
|
||||
/// Phase 29 (Track I): the cause attached to a skipped fixture. Every
/// skip carries one of these so callers can tell "host did not have
/// python3" apart from "host has docker but daemon refused" and from an
/// "intentional env-var gate".
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(dead_code)]
pub enum SkipReason {
    /// A required command could not be used; carries the command name.
    MissingCommand(&'static str),
    /// A required environment variable was not set; carries its name.
    MissingEnvVar(&'static str),
    /// The docker daemon could not be reached.
    DockerUnavailable,
    /// A required static library could not be linked; carries its name.
    MissingStaticLib(&'static str),
}

/// Human-readable rendering used in the harness's `SKIP …` log lines.
impl std::fmt::Display for SkipReason {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // The only payload-free variant: emit the fixed text directly.
            Self::DockerUnavailable => f.write_str("docker daemon unavailable"),
            Self::MissingCommand(command) => {
                write!(f, "missing command on PATH: {command}")
            }
            Self::MissingEnvVar(variable) => write!(f, "env var not set: {variable}"),
            Self::MissingStaticLib(library) => {
                write!(f, "static lib not linkable: {library}")
            }
        }
    }
}
|
||||
|
||||
/// Returns the first unsatisfied prerequisite, or `Ok(())` when every
|
||||
/// requirement holds. Exposed for tests that want to gate their own
|
||||
/// per-shape helpers without going through `FixtureSpec`.
|
||||
#[allow(dead_code)]
|
||||
pub fn check_prerequisites(reqs: &[Prerequisite]) -> Result<(), SkipReason> {
|
||||
for req in reqs {
|
||||
match req {
|
||||
Prerequisite::CommandAvailable(cmd) => {
|
||||
let ok = std::process::Command::new(cmd)
|
||||
.arg("--version")
|
||||
.output()
|
||||
.map(|o| o.status.success())
|
||||
.unwrap_or(false);
|
||||
if !ok {
|
||||
return Err(SkipReason::MissingCommand(cmd));
|
||||
}
|
||||
}
|
||||
Prerequisite::EnvVar(var) => {
|
||||
if std::env::var(var).is_err() {
|
||||
return Err(SkipReason::MissingEnvVar(var));
|
||||
}
|
||||
}
|
||||
Prerequisite::DockerAvailable => {
|
||||
let ok = std::process::Command::new("docker")
|
||||
.arg("info")
|
||||
.output()
|
||||
.map(|o| o.status.success())
|
||||
.unwrap_or(false);
|
||||
if !ok {
|
||||
return Err(SkipReason::DockerUnavailable);
|
||||
}
|
||||
}
|
||||
Prerequisite::StaticLib(lib) => {
|
||||
// Treat the lib as linkable iff `cc -static -l<lib>` on
|
||||
// an empty TU succeeds. Slow but reliable; only called
|
||||
// by the small Phase-17 hardening suite.
|
||||
let probe = match tempfile::NamedTempFile::new() {
|
||||
Ok(f) => f,
|
||||
Err(_) => return Err(SkipReason::MissingStaticLib(lib)),
|
||||
};
|
||||
use std::io::Write;
|
||||
let mut handle = match std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.open(probe.path())
|
||||
{
|
||||
Ok(h) => h,
|
||||
Err(_) => return Err(SkipReason::MissingStaticLib(lib)),
|
||||
};
|
||||
let _ = writeln!(handle, "int main(void) {{ return 0; }}");
|
||||
drop(handle);
|
||||
let out = tempfile::Builder::new()
|
||||
.prefix("nyx-prereq-")
|
||||
.tempfile()
|
||||
.map(|f| f.path().to_path_buf())
|
||||
.ok();
|
||||
let out = match out {
|
||||
Some(p) => p,
|
||||
None => return Err(SkipReason::MissingStaticLib(lib)),
|
||||
};
|
||||
let status = std::process::Command::new("cc")
|
||||
.args([
|
||||
"-x", "c", "-static",
|
||||
probe.path().to_str().unwrap_or(""),
|
||||
"-o",
|
||||
out.to_str().unwrap_or(""),
|
||||
&format!("-l{lib}"),
|
||||
])
|
||||
.output()
|
||||
.map(|o| o.status.success())
|
||||
.unwrap_or(false);
|
||||
let _ = std::fs::remove_file(&out);
|
||||
if !status {
|
||||
return Err(SkipReason::MissingStaticLib(lib));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Per-fixture specification.
|
||||
pub struct FixtureSpec<'a> {
|
||||
/// Subdirectory under `tests/dynamic_fixtures/` (e.g. `"python"`, `"rust"`).
|
||||
|
|
@ -67,6 +192,11 @@ pub struct FixtureSpec<'a> {
|
|||
pub confidence: Confidence,
|
||||
/// File-layout strategy for the temp-dir copy.
|
||||
pub copy: CopyStrategy,
|
||||
/// Phase 29 (Track I): host-environment prerequisites. Empty means
|
||||
/// "always runs"; otherwise the harness checks each entry before
|
||||
/// staging the fixture and skips with a structured [`SkipReason`]
|
||||
/// when any prerequisite is unmet.
|
||||
pub requires: Vec<Prerequisite>,
|
||||
}
|
||||
|
||||
/// Trimmed verdict shape persisted in the `.golden.json` file.
|
||||
|
|
@ -100,6 +230,14 @@ impl From<&VerifyResult> for GoldenVerdict {
|
|||
/// stored golden or — when `NYX_UPDATE_GOLDENS=1` — overwrite the golden
|
||||
/// with the current verdict.
|
||||
pub fn run_fixture_and_compare_to_golden(spec: &FixtureSpec<'_>) {
|
||||
if let Err(reason) = check_prerequisites(&spec.requires) {
|
||||
eprintln!(
|
||||
"SKIP {}/{}: prerequisite unmet — {reason}",
|
||||
spec.lang_dir, spec.fixture
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
let _guard = FIXTURE_LOCK.lock().unwrap_or_else(|e| e.into_inner());
|
||||
|
||||
let fixture_root = fixture_dir(spec.lang_dir);
|
||||
|
|
|
|||
210
tests/eval_corpus/budget.toml
Normal file
210
tests/eval_corpus/budget.toml
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
# Per-cell (cap × lang) budgets for the dynamic-verification eval corpus.
|
||||
#
|
||||
# Phase 29 (Track I): replaces the single global Unsupported-rate gate in
|
||||
# tests/eval_corpus/report.py with per-cell targets. Each cell records the
|
||||
# largest tolerated rate today plus a deadline date for the next ratchet.
|
||||
#
|
||||
# Schema:
|
||||
#
|
||||
# [default]
|
||||
# unsupported_rate = 0.80 # max(Unsupported / total) per cell
|
||||
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cell
|
||||
# repro_stability = 0.95 # min(stable / Confirmed) per cell
|
||||
# ratchet_deadline = "2026-08-01"
|
||||
#
|
||||
# [[cell]]
|
||||
# cap = "sqli"
|
||||
# lang = "python"
|
||||
# unsupported_rate = 0.50
|
||||
# false_confirmed_rate = 0.02
|
||||
# repro_stability = 0.97
|
||||
# ratchet_deadline = "2026-07-15"
|
||||
#
|
||||
# `cap` matches tabulate.py's _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
|
||||
# `lang` matches the ext_map values (`python`, `javascript`, …).
|
||||
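# A wildcard `"*"` in `cap` and/or `lang` matches any cell that does not have
# an exact entry; fallbacks are tried in the order (cap, "*"), ("*", lang),
# ("*", "*"), and the first match wins.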
|
||||
|
||||
[default]
|
||||
# Inherited by any cell not overridden below. Aligned with the legacy
|
||||
# Gate-1 / Gate-2 / Gate-5 thresholds in scripts/m7_ship_gate.sh.
|
||||
unsupported_rate = 0.80
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
# Python verticals (Phase 12 — most mature; tightest budgets).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.40
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.97
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.40
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.97
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "path_traversal"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.50
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "ssrf"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.50
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
# JavaScript / TypeScript (Phase 13 — second-most-mature).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.55
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.55
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "ssrf"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "xss"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.70
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
# Java (Phase 14).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.65
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.70
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
# Phase 15 / 16 verticals (Go, PHP, Ruby, Rust, C, C++) — newer; broader
|
||||
# tolerance until their probe-shim splicing follow-ups land.
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "go"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "go"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "rust"
|
||||
unsupported_rate = 0.80
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "fmt_string"
|
||||
lang = "c"
|
||||
unsupported_rate = 0.85
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "memory"
|
||||
lang = "c"
|
||||
unsupported_rate = 0.90
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-10-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "memory"
|
||||
lang = "cpp"
|
||||
unsupported_rate = 0.90
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-10-01"
|
||||
|
|
@ -2,6 +2,11 @@
|
|||
"""
|
||||
Aggregate eval results across all corpus sets and emit a summary table.
|
||||
Used by run.sh after all corpus sets have been tabulated.
|
||||
|
||||
Phase 29 (Track I) extensions:
|
||||
--budget tests/eval_corpus/budget.toml per-cell budget enforcement
|
||||
--diff previous.json monotonic-improvement diff;
|
||||
CI fails on any regression.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -9,10 +14,105 @@ import json
|
|||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
|
||||
def load_budget(path: str) -> dict:
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
raw = tomllib.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"ERROR budget file not found: {path}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
except tomllib.TOMLDecodeError as e:
|
||||
print(f"ERROR budget file malformed: {path}: {e}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
default = raw.get("default", {}) or {}
|
||||
cells = {}
|
||||
for row in raw.get("cell", []) or []:
|
||||
cap = row.get("cap")
|
||||
lang = row.get("lang")
|
||||
if not cap or not lang:
|
||||
print(f"ERROR budget cell missing cap/lang: {row!r}", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
cells[(cap, lang)] = row
|
||||
return {"default": default, "cells": cells}
|
||||
|
||||
|
||||
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Resolve the effective thresholds for one ``(cap, lang)`` cell.

    Starts from a copy of ``budget["default"]`` and overlays at most one
    override row: the exact ``(cap, lang)`` entry if present, otherwise the
    first wildcard entry found in the order ``(cap, "*")``, ``("*", lang)``,
    ``("*", "*")``. The ``cap``/``lang`` keys of the override row are not
    copied into the result.
    """
    effective = dict(budget.get("default", {}) or {})
    table = budget.get("cells", {})

    override = table.get((cap, lang))
    if not override:
        # No exact entry: walk the wildcard fallbacks, most specific first.
        for fallback_key in ((cap, "*"), ("*", lang), ("*", "*")):
            override = table.get(fallback_key)
            if override:
                break

    if override:
        for field, value in override.items():
            if field in ("cap", "lang"):
                continue
            effective[field] = value
    return effective
|
||||
|
||||
|
||||
def load_previous_agg(path: str) -> dict:
    """Aggregate a previous results file the same way main() does.

    Args:
        path: JSON file holding a list of per-set result objects, each with
            an optional ``"cells"`` list whose entries carry ``cap``,
            ``lang`` and integer counters.

    Returns:
        A ``defaultdict`` mapping ``(cap, lang)`` to the summed counters
        (``tp``, ``fp``, ``fn``, ``unsupported``, ``confirmed``,
        ``wrong_confirmed``, ``stable_replays``, ``total``); counters absent
        from a cell count as 0. Use ``.get(key)`` to probe for a cell
        without creating an all-zero entry.

    Exits:
        Terminates the process with status 3 when the file is missing, is
        not valid JSON, or is valid JSON of the wrong shape. Shape errors
        must not escape as an uncaught exception (exit 1), because the
        calling scripts only treat exit 3 as "diff configuration malformed".
    """
    counters = (
        "tp",
        "fp",
        "fn",
        "unsupported",
        "confirmed",
        "wrong_confirmed",
        "stable_replays",
        "total",
    )

    def _malformed(detail) -> None:
        print(f"ERROR diff file malformed: {path}: {detail}", file=sys.stderr)
        sys.exit(3)

    try:
        with open(path) as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"ERROR diff file not found: {path}", file=sys.stderr)
        sys.exit(3)
    except json.JSONDecodeError as e:
        _malformed(e)

    if not isinstance(data, list):
        _malformed("top level must be a list of per-set results")

    agg: dict[tuple[str, str], dict] = defaultdict(lambda: dict.fromkeys(counters, 0))
    for r in data:
        if not isinstance(r, dict):
            _malformed(f"result entry is not an object: {r!r}")
        cells = r.get("cells", [])
        if not isinstance(cells, list):
            _malformed(f"`cells` is not a list: {cells!r}")
        for c in cells:
            if not isinstance(c, dict):
                _malformed(f"cell is not an object: {c!r}")
            cap = c.get("cap")
            lang = c.get("lang")
            if not isinstance(cap, str) or not isinstance(lang, str):
                _malformed(f"cell missing cap/lang: {c!r}")
            k = (cap, lang)
            for field in counters:
                value = c.get(field, 0)
                if not isinstance(value, (int, float)):
                    _malformed(f"cell {cap}/{lang}: `{field}` is not a number")
                agg[k][field] += value
    return agg
|
||||
|
||||
|
||||
def main() -> int:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--results", required=True)
|
||||
p.add_argument(
|
||||
"--budget",
|
||||
default="",
|
||||
help="path to budget.toml (per-(cap,lang) thresholds)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--diff",
|
||||
default="",
|
||||
help="path to a previous results.json; fail on monotonic-improvement regression",
|
||||
)
|
||||
args = p.parse_args()
|
||||
|
||||
with open(args.results) as f:
|
||||
|
|
@ -24,12 +124,30 @@ def main() -> int:
|
|||
|
||||
# Aggregate across sets.
|
||||
agg: dict[tuple[str, str], dict] = defaultdict(
|
||||
lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
|
||||
lambda: {
|
||||
"tp": 0,
|
||||
"fp": 0,
|
||||
"fn": 0,
|
||||
"unsupported": 0,
|
||||
"confirmed": 0,
|
||||
"wrong_confirmed": 0,
|
||||
"stable_replays": 0,
|
||||
"total": 0,
|
||||
}
|
||||
)
|
||||
for r in results:
|
||||
for c in r.get("cells", []):
|
||||
k = (c["cap"], c["lang"])
|
||||
for field in ("tp", "fp", "fn", "unsupported", "total"):
|
||||
for field in (
|
||||
"tp",
|
||||
"fp",
|
||||
"fn",
|
||||
"unsupported",
|
||||
"confirmed",
|
||||
"wrong_confirmed",
|
||||
"stable_replays",
|
||||
"total",
|
||||
):
|
||||
agg[k][field] += c.get(field, 0)
|
||||
|
||||
print("\n=== Aggregated eval corpus report ===")
|
||||
|
|
@ -46,18 +164,114 @@ def main() -> int:
|
|||
f"{unsup*100:>6.1f}%"
|
||||
)
|
||||
|
||||
# Gate check: per-cap Unsupported rate <= 80%
|
||||
gate_failed = False
|
||||
print("\n=== Gate checks ===")
|
||||
UNSUPPORTED_BUDGET = 0.80
|
||||
for k, v in sorted(agg.items()):
|
||||
unsup = v["unsupported"] / max(v["total"], 1)
|
||||
if unsup > UNSUPPORTED_BUDGET:
|
||||
print(f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}% > {UNSUPPORTED_BUDGET*100:.0f}% budget")
|
||||
gate_failed = True
|
||||
|
||||
if not gate_failed:
|
||||
print(" All gate thresholds met.")
|
||||
# ── Phase 29: per-cell budget enforcement ────────────────────────────
|
||||
if args.budget:
|
||||
budget = load_budget(args.budget)
|
||||
print(f"\n=== Per-cell budget ({args.budget}) ===")
|
||||
cell_fails: list[str] = []
|
||||
for k, v in sorted(agg.items()):
|
||||
b = budget_for_cell(budget, k[0], k[1])
|
||||
if not b:
|
||||
continue
|
||||
max_unsup = b.get("unsupported_rate")
|
||||
max_false = b.get("false_confirmed_rate")
|
||||
min_stable = b.get("repro_stability")
|
||||
|
||||
if isinstance(max_unsup, (int, float)) and v["total"] > 0:
|
||||
rate = v["unsupported"] / v["total"]
|
||||
if rate > max_unsup:
|
||||
cell_fails.append(
|
||||
f" FAIL {k[0]}/{k[1]}: Unsupported {rate*100:.1f}%"
|
||||
f" > budget {max_unsup*100:.1f}%"
|
||||
)
|
||||
if isinstance(max_false, (int, float)) and v["confirmed"] > 0:
|
||||
rate = v["wrong_confirmed"] / v["confirmed"]
|
||||
if rate > max_false:
|
||||
cell_fails.append(
|
||||
f" FAIL {k[0]}/{k[1]}: false-Confirmed {rate*100:.1f}%"
|
||||
f" > budget {max_false*100:.1f}%"
|
||||
)
|
||||
if (
|
||||
isinstance(min_stable, (int, float))
|
||||
and v["confirmed"] > 0
|
||||
and v.get("stable_replays", 0) > 0
|
||||
):
|
||||
rate = v["stable_replays"] / v["confirmed"]
|
||||
if rate < min_stable:
|
||||
cell_fails.append(
|
||||
f" FAIL {k[0]}/{k[1]}: repro stability {rate*100:.1f}%"
|
||||
f" < budget {min_stable*100:.1f}%"
|
||||
)
|
||||
if cell_fails:
|
||||
for line in cell_fails:
|
||||
print(line)
|
||||
gate_failed = True
|
||||
else:
|
||||
print(" All per-cell budgets met.")
|
||||
else:
|
||||
# Legacy fallback: per-cap Unsupported rate <= 80%.
|
||||
print("\n=== Gate checks ===")
|
||||
UNSUPPORTED_BUDGET = 0.80
|
||||
cell_fails: list[str] = []
|
||||
for k, v in sorted(agg.items()):
|
||||
unsup = v["unsupported"] / max(v["total"], 1)
|
||||
if unsup > UNSUPPORTED_BUDGET:
|
||||
cell_fails.append(
|
||||
f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}%"
|
||||
f" > {UNSUPPORTED_BUDGET*100:.0f}% budget"
|
||||
)
|
||||
if cell_fails:
|
||||
for line in cell_fails:
|
||||
print(line)
|
||||
gate_failed = True
|
||||
else:
|
||||
print(" All gate thresholds met.")
|
||||
|
||||
# ── Phase 29: monotonic-improvement diff ─────────────────────────────
|
||||
if args.diff:
|
||||
prev = load_previous_agg(args.diff)
|
||||
print(f"\n=== Monotonic-improvement diff vs {args.diff} ===")
|
||||
diff_fails: list[str] = []
|
||||
EPS = 0.005
|
||||
for k, v in sorted(agg.items()):
|
||||
old = prev.get(k)
|
||||
if not old:
|
||||
continue
|
||||
old_unsup = old["unsupported"] / max(old["total"], 1)
|
||||
new_unsup = v["unsupported"] / max(v["total"], 1)
|
||||
if new_unsup > old_unsup + EPS:
|
||||
diff_fails.append(
|
||||
f" REGRESSION {k[0]}/{k[1]}: Unsupported"
|
||||
f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
|
||||
)
|
||||
old_conf = old.get("confirmed", 0)
|
||||
new_conf = v.get("confirmed", 0)
|
||||
old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None
|
||||
new_false = (v.get("wrong_confirmed", 0) / new_conf) if new_conf else None
|
||||
if old_false is not None and new_false is not None and new_false > old_false + EPS:
|
||||
diff_fails.append(
|
||||
f" REGRESSION {k[0]}/{k[1]}: false-Confirmed"
|
||||
f" {old_false*100:.1f}% → {new_false*100:.1f}%"
|
||||
)
|
||||
old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None
|
||||
new_stable = (v.get("stable_replays", 0) / new_conf) if new_conf else None
|
||||
if (
|
||||
old_stable is not None
|
||||
and new_stable is not None
|
||||
and new_stable < old_stable - EPS
|
||||
):
|
||||
diff_fails.append(
|
||||
f" REGRESSION {k[0]}/{k[1]}: repro stability"
|
||||
f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
|
||||
)
|
||||
if diff_fails:
|
||||
for line in diff_fails:
|
||||
print(line)
|
||||
gate_failed = True
|
||||
else:
|
||||
print(" No regressions vs previous run.")
|
||||
|
||||
return 2 if gate_failed else 0
|
||||
|
||||
|
|
|
|||
|
|
@ -29,12 +29,17 @@ OUTPUT_DIR=""
|
|||
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
|
||||
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
|
||||
SETS="owasp,sard,inhouse"
|
||||
# Phase 29 (Track I): per-cell budgets + monotonic-improvement diff.
|
||||
BUDGET_FILE=""
|
||||
DIFF_FILE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--output) OUTPUT_DIR="$2"; shift 2 ;;
|
||||
--nyx) NYX_BIN="$2"; shift 2 ;;
|
||||
--sets) SETS="$2"; shift 2 ;;
|
||||
--budget) BUDGET_FILE="$2"; shift 2 ;;
|
||||
--diff) DIFF_FILE="$2"; shift 2 ;;
|
||||
*) shift ;;
|
||||
esac
|
||||
done
|
||||
|
|
@ -83,6 +88,8 @@ if [[ "$SETS" == *owasp* ]]; then
|
|||
--scan /tmp/nyx_owasp.json \
|
||||
--ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
|
||||
--append "$RESULTS_JSON" \
|
||||
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|
||||
|| info " tabulate.py failed; ground truth file may be absent"
|
||||
fi
|
||||
fi
|
||||
|
|
@ -111,6 +118,8 @@ if [[ "$SETS" == *sard* ]]; then
|
|||
--scan /tmp/nyx_sard.json \
|
||||
--ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
|
||||
--append "$RESULTS_JSON" \
|
||||
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|
||||
|| info " tabulate.py failed; ground truth file may be absent"
|
||||
fi
|
||||
fi
|
||||
|
|
@ -140,6 +149,8 @@ if [[ "$SETS" == *inhouse* ]]; then
|
|||
--scan "/tmp/nyx_${label}.json" \
|
||||
--inhouse \
|
||||
--append "$RESULTS_JSON" \
|
||||
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|
||||
|| info " tabulate.py failed on $label"
|
||||
done
|
||||
fi
|
||||
|
|
@ -156,12 +167,20 @@ if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
|
|||
fi
|
||||
|
||||
set +e
|
||||
python3 "${SCRIPT_DIR}/report.py" --results "$RESULTS_JSON"
|
||||
python3 "${SCRIPT_DIR}/report.py" \
|
||||
--results "$RESULTS_JSON" \
|
||||
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"}
|
||||
REPORT_RC=$?
|
||||
set -e
|
||||
# Propagate gate-fail (exit 2). Treat other non-zero as setup error (exit 1).
|
||||
# Propagate gate-fail (exit 2) and malformed-config (exit 3) so the
|
||||
# m7_ship_gate.sh Gate-1 dispatch can tell them apart. Treat other
|
||||
# non-zero as setup error (exit 1).
|
||||
if [[ $REPORT_RC -eq 2 ]]; then
|
||||
exit 2
|
||||
elif [[ $REPORT_RC -eq 3 ]]; then
|
||||
info "report.py: budget/diff configuration malformed; see $RESULTS_JSON"
|
||||
exit 3
|
||||
elif [[ $REPORT_RC -ne 0 ]]; then
|
||||
info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
|
||||
exit 1
|
||||
|
|
|
|||
|
|
@ -9,6 +9,17 @@ For in-house sets (--inhouse): counts findings by cap x language; reports
|
|||
Unsupported rate only (no ground truth required).
|
||||
|
||||
Output: appends a result record to --append FILE.
|
||||
|
||||
Phase 29 (Track I) extensions:
|
||||
--budget tests/eval_corpus/budget.toml enforce per-cell budget thresholds
|
||||
--diff previous.json compare against prior result file,
|
||||
fail on monotonic-improvement
|
||||
regression
|
||||
|
||||
Exit codes:
|
||||
0 all rows pass.
|
||||
2 one or more per-cell budgets exceeded OR a diff regression was found.
|
||||
3 malformed budget / diff input (callers must fix configuration).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -17,6 +28,11 @@ import sys
|
|||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
LINE_TOLERANCE = 5
|
||||
|
||||
# Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
|
||||
|
|
@ -97,6 +113,203 @@ def lang_of(finding: dict) -> str:
|
|||
return "unknown"
|
||||
|
||||
|
||||
# ── Budget loading ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Threshold keys that enforce_budget compares numerically.
_BUDGET_RATE_KEYS = ("unsupported_rate", "false_confirmed_rate", "repro_stability")


def _budget_config_error(message: str):
    """Print a budget-configuration error to stderr and exit with status 3."""
    print(f"ERROR budget {message}", file=sys.stderr)
    sys.exit(3)


def _check_budget_thresholds(section: dict, where: str) -> None:
    """Exit 3 if any rate threshold present in `section` is not a plain number.

    enforce_budget skips thresholds that are not int/float, so a quoted or
    boolean value would otherwise switch the gate off without any warning.
    """
    for key in _BUDGET_RATE_KEYS:
        value = section.get(key)
        if value is None:
            continue
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            _budget_config_error(f"{where}: {key} must be a number, got {value!r}")


def load_budget(path: str) -> dict:
    """Parse a budget.toml file.

    Returns a dict::

        {
            "default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02,
                        "repro_stability": 0.95, "ratchet_deadline": "..."},
            "cells": {(cap, lang): {...overrides...}, ...},
        }

    Exits with status 3 (SystemExit) when the file is missing, unreadable,
    not valid UTF-8 TOML, or structurally malformed: `[default]` not a table,
    `cell` not an array of tables, a cell without string `cap`/`lang`, or a
    non-numeric rate threshold.
    """

    try:
        with open(path, "rb") as f:
            raw = tomllib.load(f)
    except FileNotFoundError:
        _budget_config_error(f"file not found: {path}")
    except OSError as e:
        # Directory, permission denied, ...: still a configuration problem.
        _budget_config_error(f"file unreadable: {path}: {e}")
    except (tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
        _budget_config_error(f"file malformed: {path}: {e}")

    default = raw.get("default", {}) or {}
    if not isinstance(default, dict):
        _budget_config_error(f"file malformed: {path}: [default] must be a table")
    _check_budget_thresholds(default, "[default]")

    rows = raw.get("cell", []) or []
    if not isinstance(rows, list):
        # Typical slip: `[cell]` (a single table) instead of `[[cell]]`.
        _budget_config_error(
            f"file malformed: {path}: `cell` must be an array of tables ([[cell]])"
        )

    cells = {}
    for row in rows:
        if not isinstance(row, dict):
            _budget_config_error(f"file malformed: {path}: cell entry is not a table: {row!r}")
        cap = row.get("cap")
        lang = row.get("lang")
        if not (isinstance(cap, str) and cap and isinstance(lang, str) and lang):
            _budget_config_error(f"cell missing cap/lang: {row!r}")
        _check_budget_thresholds(row, f"cell {cap}/{lang}")
        cells[(cap, lang)] = row

    return {"default": default, "cells": cells}
|
||||
|
||||
|
||||
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Resolve the effective budget row for one (cap, lang) cell.

    Starts from a copy of `[default]` and layers exactly one override on
    top: the exact (cap, lang) cell if present, otherwise the first
    wildcard found in the order (cap, "*"), ("*", lang), ("*", "*").
    The identifying `cap` / `lang` keys never leak into the result.
    """
    effective = dict(budget.get("default", {}) or {})
    table = budget.get("cells", {})

    override = table.get((cap, lang))
    if not override:
        # No exact cell: probe wildcards from most to least specific.
        for probe in ((cap, "*"), ("*", lang), ("*", "*")):
            override = table.get(probe)
            if override:
                break

    if override:
        for name, value in override.items():
            if name not in ("cap", "lang"):
                effective[name] = value
    return effective
|
||||
|
||||
|
||||
def enforce_budget(cells: list, budget: dict) -> list:
    """Check every cell against its merged budget row.

    Returns the human-readable failure lines (empty list when all cells
    pass). Three measurements are compared: the Unsupported rate and the
    false-Confirmed rate must not exceed their budget, and the
    repro-stability rate must not fall below it. A measurement with no data
    behind it (no findings, or no Confirmed findings) is skipped rather
    than reported as a failure, as is any threshold that is absent or not
    numeric.
    """

    def numeric(value) -> bool:
        return isinstance(value, (int, float))

    problems = []
    for cell in cells:
        limits = budget_for_cell(budget, cell["cap"], cell["lang"])
        if not limits:
            continue

        cap = cell["cap"]
        lang = cell["lang"]
        unsup_limit = limits.get("unsupported_rate")
        false_limit = limits.get("false_confirmed_rate")
        stable_floor = limits.get("repro_stability")

        if numeric(unsup_limit) and cell.get("total", 0) > 0:
            measured = cell["unsupported_rate"]
            if measured > unsup_limit:
                problems.append(
                    f" FAIL {cap}/{lang}: Unsupported {measured*100:.1f}%"
                    f" > budget {unsup_limit*100:.1f}%"
                )

        if numeric(false_limit) and cell.get("confirmed", 0) > 0:
            false_rate = cell.get("wrong_confirmed", 0) / cell["confirmed"]
            if false_rate > false_limit:
                problems.append(
                    f" FAIL {cap}/{lang}: false-Confirmed {false_rate*100:.1f}%"
                    f" > budget {false_limit*100:.1f}%"
                )

        # A stable_replays count of zero cannot be told apart from "stability
        # was never measured for this row", so the floor only applies once at
        # least one `replay_stable: true` flag has been counted; otherwise the
        # gate would trip on every run that did not stamp the flag.
        if (
            numeric(stable_floor)
            and cell.get("confirmed", 0) > 0
            and cell.get("stable_replays", 0) > 0
        ):
            stable_rate = cell["stable_replays"] / cell["confirmed"]
            if stable_rate < stable_floor:
                problems.append(
                    f" FAIL {cap}/{lang}: repro stability {stable_rate*100:.1f}%"
                    f" < budget {stable_floor*100:.1f}%"
                )
    return problems
|
||||
|
||||
|
||||
# ── Diff loading ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _diff_config_error(message: str):
    """Print a diff-input error to stderr and exit with status 3."""
    print(f"ERROR diff file {message}", file=sys.stderr)
    sys.exit(3)


def load_previous_cells(path: str, label: str) -> dict:
    """Index a previous results file by (cap, lang) → cell.

    The previous file is the same shape as `--append`'s output: one record
    object or a list of them, each with a `label` and a `cells` list. We pick
    the record whose `label` matches the current run; if no exact match, fall
    back to the first record. An empty record list yields an empty dict.

    Exits with status 3 (SystemExit) when the file is missing, unreadable,
    not valid JSON, or not shaped like a results file (non-object records,
    a non-list `cells`, or a cell without `cap` / `lang`).
    """

    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        _diff_config_error(f"not found: {path}")
    except OSError as e:
        # Directory, permission denied, ...: report as configuration, not a crash.
        _diff_config_error(f"unreadable: {path}: {e}")
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        _diff_config_error(f"malformed: {path}: {e}")

    records = data if isinstance(data, list) else [data]
    if not all(isinstance(r, dict) for r in records):
        _diff_config_error(f"malformed: {path}: expected result record objects")

    chosen = next((r for r in records if r.get("label") == label), None)
    if chosen is None and records:
        chosen = records[0]
    if not chosen:
        return {}

    prev_cells = chosen.get("cells", [])
    if not isinstance(prev_cells, list):
        _diff_config_error(f"malformed: {path}: `cells` must be a list")

    indexed = {}
    for c in prev_cells:
        if not isinstance(c, dict) or "cap" not in c or "lang" not in c:
            _diff_config_error(f"malformed: {path}: cell missing cap/lang: {c!r}")
        indexed[(c["cap"], c["lang"])] = c
    return indexed
|
||||
|
||||
|
||||
def diff_regressions(cells: list, prev: dict) -> list:
    """Compare current cells against previous. Returns failure strings.

    Three monotonicity rules:
      * Unsupported% must not increase.
      * False-Confirmed% must not increase.
      * Repro-stability% must not decrease.

    Cells absent from `prev` are treated as new (skipped). A rate whose
    denominator is zero on either side (no Confirmed findings) is skipped.
    A small epsilon (0.5 percentage points) absorbs flake noise.

    Repro stability follows the same "was it measured?" rule as the budget
    gate: `stable_replays == 0` on the current run is indistinguishable from
    a run that never stamped `replay_stable`, so it is treated as "no data"
    instead of being reported as a drop to 0%.
    """
    EPS = 0.005
    failures = []
    for c in cells:
        key = (c["cap"], c["lang"])
        old = prev.get(key)
        if not old:
            continue

        # Unsupported (lower is better).
        old_unsup = old.get("unsupported_rate", 0.0)
        new_unsup = c.get("unsupported_rate", 0.0)
        if new_unsup > old_unsup + EPS:
            failures.append(
                f" REGRESSION {key[0]}/{key[1]}: Unsupported"
                f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
            )

        old_conf = old.get("confirmed", 0)
        new_conf = c.get("confirmed", 0)

        # False-Confirmed (lower is better); needs Confirmed findings on both sides.
        old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None
        new_false = (c.get("wrong_confirmed", 0) / new_conf) if new_conf else None
        if old_false is not None and new_false is not None and new_false > old_false + EPS:
            failures.append(
                f" REGRESSION {key[0]}/{key[1]}: false-Confirmed"
                f" {old_false*100:.1f}% → {new_false*100:.1f}%"
            )

        # Repro stability (higher is better). Only comparable when the
        # current run actually counted at least one stable replay.
        new_stable_count = c.get("stable_replays", 0)
        old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None
        new_stable = (
            (new_stable_count / new_conf) if new_conf and new_stable_count > 0 else None
        )
        if (
            old_stable is not None
            and new_stable is not None
            and new_stable < old_stable - EPS
        ):
            failures.append(
                f" REGRESSION {key[0]}/{key[1]}: repro stability"
                f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
            )
    return failures
|
||||
|
||||
|
||||
def main() -> int:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--label", required=True)
|
||||
|
|
@ -104,14 +317,34 @@ def main() -> int:
|
|||
p.add_argument("--ground-truth", default="", help="ground truth JSON")
|
||||
p.add_argument("--inhouse", action="store_true")
|
||||
p.add_argument("--append", required=True, help="results accumulator JSON")
|
||||
p.add_argument(
|
||||
"--budget",
|
||||
default="",
|
||||
help="path to budget.toml (per-(cap,lang) thresholds)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--diff",
|
||||
default="",
|
||||
help="path to a previous results JSON; fail on monotonic-improvement regression",
|
||||
)
|
||||
args = p.parse_args()
|
||||
|
||||
scan_data = load_json(args.scan)
|
||||
findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
|
||||
|
||||
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported}}
|
||||
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
|
||||
# wrong_confirmed, stable_replays, total}}
|
||||
cells: dict[tuple[str, str], dict] = defaultdict(
|
||||
lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
|
||||
lambda: {
|
||||
"tp": 0,
|
||||
"fp": 0,
|
||||
"fn": 0,
|
||||
"unsupported": 0,
|
||||
"confirmed": 0,
|
||||
"wrong_confirmed": 0,
|
||||
"stable_replays": 0,
|
||||
"total": 0,
|
||||
}
|
||||
)
|
||||
|
||||
for f in findings:
|
||||
|
|
@ -121,8 +354,19 @@ def main() -> int:
|
|||
ev = f.get("evidence", {}) or {}
|
||||
dv = ev.get("dynamic_verdict") if ev else None
|
||||
cells[key]["total"] += 1
|
||||
if dv and dv.get("status") == "Unsupported":
|
||||
cells[key]["unsupported"] += 1
|
||||
if dv:
|
||||
status = dv.get("status")
|
||||
if status == "Unsupported":
|
||||
cells[key]["unsupported"] += 1
|
||||
elif status == "Confirmed":
|
||||
cells[key]["confirmed"] += 1
|
||||
# Repro-stability and false-Confirmed counts are optional
|
||||
# fields tabulate.py reads off the verdict when callers
|
||||
# (m7_ship_gate.sh / corpus_promote.yml) have stamped them.
|
||||
if dv.get("wrong") is True:
|
||||
cells[key]["wrong_confirmed"] += 1
|
||||
if dv.get("replay_stable") is True:
|
||||
cells[key]["stable_replays"] += 1
|
||||
|
||||
if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
|
||||
gt = load_json(args.ground_truth)
|
||||
|
|
@ -201,7 +445,34 @@ def main() -> int:
|
|||
f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
|
||||
f"{c['unsupported_rate']*100:>6.1f}%"
|
||||
)
|
||||
return 0
|
||||
|
||||
exit_rc = 0
|
||||
|
||||
# ── Phase 29: per-cell budget enforcement ─────────────────────────────
|
||||
if args.budget:
|
||||
budget = load_budget(args.budget)
|
||||
failures = enforce_budget(result["cells"], budget)
|
||||
if failures:
|
||||
print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
|
||||
for line in failures:
|
||||
print(line)
|
||||
exit_rc = 2
|
||||
else:
|
||||
print(f"\nPer-cell budget ({args.budget}): OK")
|
||||
|
||||
# ── Phase 29: diff against previous run ───────────────────────────────
|
||||
if args.diff:
|
||||
prev = load_previous_cells(args.diff, args.label)
|
||||
failures = diff_regressions(result["cells"], prev)
|
||||
if failures:
|
||||
print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
|
||||
for line in failures:
|
||||
print(line)
|
||||
exit_rc = 2
|
||||
else:
|
||||
print(f"\nDiff vs {args.diff}: no regressions")
|
||||
|
||||
return exit_rc
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
241
tests/eval_corpus/test_tabulate_regression.py
Normal file
241
tests/eval_corpus/test_tabulate_regression.py
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 29 (Track I) regression test for tests/eval_corpus/tabulate.py.
|
||||
|
||||
Exercises --budget and --diff against hand-crafted scan + ground-truth
|
||||
fixtures so the per-cell budget gate and monotonic-improvement diff are
|
||||
demonstrably non-vacuous.
|
||||
|
||||
Run with::
|
||||
|
||||
python3 tests/eval_corpus/test_tabulate_regression.py
|
||||
|
||||
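Exits 0 when every assertion holds, non-zero otherwise. The checks are
plain `assert` statements driven by `main()`, which hands each test its own
scratch directory through the `tmp` argument; run the file as a stand-alone
script, since unittest/pytest discovery cannot supply that argument.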
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
TABULATE = REPO / "tests/eval_corpus/tabulate.py"
|
||||
BUDGET = REPO / "tests/eval_corpus/budget.toml"
|
||||
|
||||
|
||||
def run_tabulate(*args: str) -> subprocess.CompletedProcess:
    """Run tabulate.py under the current interpreter with `args` appended.

    stdout and stderr are captured as text; the exit status is left for the
    caller to inspect (no exception is raised on a non-zero return code).
    """
    argv = [sys.executable, str(TABULATE)]
    argv.extend(args)
    return subprocess.run(argv, text=True, capture_output=True)
|
||||
|
||||
|
||||
def write_json(path: Path, data: object) -> None:
    """Serialize `data` as 2-space-indented JSON and write it to `path`."""
    serialized = json.dumps(data, indent=2)
    path.write_text(serialized)
|
||||
|
||||
|
||||
# Cap bit positions cribbed from tabulate.py / src/labels/mod.rs.
|
||||
SINK_BIT_SQL = 1 << 7 # SQL_QUERY
|
||||
SINK_BIT_CMDI = 1 << 10 # CODE_EXEC
|
||||
|
||||
|
||||
def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
|
||||
finding = {
|
||||
"path": path,
|
||||
"line": line,
|
||||
"col": 0,
|
||||
"id": "py.sqli.cursor_execute",
|
||||
"evidence": {"sink_caps": cap_bit},
|
||||
}
|
||||
if status:
|
||||
finding["evidence"]["dynamic_verdict"] = {"status": status}
|
||||
return finding
|
||||
|
||||
|
||||
def test_budget_passes_on_clean_scan(tmp: Path) -> None:
    """A scan with no Unsupported verdicts must clear the budget gate (exit 0)."""
    verdicts = ((10, "Confirmed"), (20, "Confirmed"), (30, "NotConfirmed"))
    scan_path = tmp / "scan_clean.json"
    write_json(
        scan_path,
        {
            "findings": [
                python_finding(SINK_BIT_SQL, "app.py", line_no, verdict)
                for line_no, verdict in verdicts
            ]
        },
    )
    results_path = tmp / "results_clean.json"
    write_json(results_path, [])

    cli = [
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(BUDGET),
    ]
    outcome = run_tabulate(*cli)

    assert outcome.returncode == 0, f"clean scan must pass budget, got rc={outcome.returncode}\nstdout: {outcome.stdout}\nstderr: {outcome.stderr}"
    assert "Per-cell budget" in outcome.stdout and "OK" in outcome.stdout, outcome.stdout
|
||||
|
||||
|
||||
def test_budget_fails_when_unsupported_exceeds(tmp: Path) -> None:
    """An all-Unsupported SQL/python cell must trip the budget gate (exit 2)."""
    # The SQL_QUERY/python budget allows 40% Unsupported; every finding in
    # this hand-made scan is Unsupported (100%), so the gate has to fire.
    unsupported = []
    for line_no in (10, 20, 30, 40, 50):
        unsupported.append(python_finding(SINK_BIT_SQL, "app.py", line_no, "Unsupported"))

    scan_path = tmp / "scan_unsup.json"
    write_json(scan_path, {"findings": unsupported})
    results_path = tmp / "results_unsup.json"
    write_json(results_path, [])

    cli = [
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(BUDGET),
    ]
    outcome = run_tabulate(*cli)

    assert outcome.returncode == 2, (
        f"budget breach must exit 2, got {outcome.returncode}\n"
        f"stdout: {outcome.stdout}\nstderr: {outcome.stderr}"
    )
    assert "FAIL" in outcome.stdout and "sqli/python" in outcome.stdout, outcome.stdout
|
||||
|
||||
|
||||
def test_diff_fails_on_regression(tmp: Path) -> None:
    """A jump in Unsupported% between two runs must fail the --diff gate (exit 2)."""
    # Seed run: 1 of 4 findings Unsupported (25%). Current run: 3 of 4 (75%).
    # No --budget is passed, so only the monotonic-improvement diff can
    # object to the +50pp change.
    def findings_for(statuses):
        return [
            python_finding(SINK_BIT_CMDI, "x.unknown", line_no, status)
            for line_no, status in enumerate(statuses, start=1)
        ]

    prev_scan = tmp / "prev_scan.json"
    write_json(
        prev_scan,
        {"findings": findings_for(["Confirmed", "Confirmed", "Confirmed", "Unsupported"])},
    )
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    seed = run_tabulate(
        "--label", "diff-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    )
    rc_prev = seed.returncode
    assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}"

    cur_scan = tmp / "cur_scan.json"
    write_json(
        cur_scan,
        {"findings": findings_for(["Unsupported", "Unsupported", "Unsupported", "Confirmed"])},
    )
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    outcome = run_tabulate(
        "--label", "diff-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )

    assert outcome.returncode == 2, (
        f"regression diff must exit 2, got {outcome.returncode}\n"
        f"stdout: {outcome.stdout}\nstderr: {outcome.stderr}"
    )
    assert "REGRESSION" in outcome.stdout and "Unsupported" in outcome.stdout, outcome.stdout
|
||||
|
||||
|
||||
def test_diff_passes_on_improvement(tmp: Path) -> None:
    """An improving run must clear the --diff gate (exit 0, "no regressions")."""
    # Previous: 3/4 Unsupported. Current: 1/4. Monotonic improvement
    # must not flag any regression.
    prev_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"),
    ]
    prev_scan = tmp / "prev_scan.json"
    write_json(prev_scan, {"findings": prev_findings})
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    rc_prev = run_tabulate(
        "--label", "improve-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    ).returncode
    # If seeding fails, prev_results stays `[]`, the diff has nothing to
    # compare against, and the assertions below would pass vacuously.
    assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}"

    cur_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"),
    ]
    cur_scan = tmp / "cur_scan.json"
    write_json(cur_scan, {"findings": cur_findings})
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    proc = run_tabulate(
        "--label", "improve-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )
    assert proc.returncode == 0, (
        f"improvement diff must exit 0, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "no regressions" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_budget_malformed_exits_3(tmp: Path) -> None:
    """A budget file that is not valid TOML must make tabulate.py exit 3."""
    broken_budget = tmp / "bad.toml"
    # A bare word is not a TOML value, so parsing this file fails.
    broken_budget.write_text("[default]\nunsupported_rate = not_a_number\n")

    scan_path = tmp / "scan.json"
    results_path = tmp / "results.json"
    write_json(scan_path, {"findings": []})
    write_json(results_path, [])

    cli = [
        "--label", "test",
        "--scan", str(scan_path),
        "--inhouse",
        "--append", str(results_path),
        "--budget", str(broken_budget),
    ]
    outcome = run_tabulate(*cli)

    assert outcome.returncode == 3, (
        f"malformed budget must exit 3, got {outcome.returncode}\nstderr: {outcome.stderr}"
    )
|
||||
|
||||
|
||||
def main() -> int:
    """Run every regression check in order, each in its own scratch directory.

    Returns 0 once all checks have passed; a failing `assert` propagates as
    an exception and aborts the run.
    """
    checks = (
        test_budget_passes_on_clean_scan,
        test_budget_fails_when_unsupported_exceeds,
        test_diff_fails_on_regression,
        test_diff_passes_on_improvement,
        test_budget_malformed_exits_3,
    )
    with tempfile.TemporaryDirectory() as scratch:
        root = Path(scratch)
        for check in checks:
            # One sub-directory per check, named after it, so fixture files
            # with the same name in different checks never collide.
            workdir = root / check.__name__
            workdir.mkdir()
            print(f"... {check.__name__}")
            check(workdir)
            print(" OK")
        print("\nAll tabulate.py regression checks passed.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
|
@ -15,7 +15,7 @@ mod common;
|
|||
mod python_fixture_tests {
|
||||
use crate::common::fixture_harness::{
|
||||
run_fixture_and_compare_to_golden, run_harness_snapshot, run_shape_fixture,
|
||||
CopyStrategy, FixtureSpec,
|
||||
CopyStrategy, FixtureSpec, Prerequisite,
|
||||
};
|
||||
use nyx_scanner::commands::scan::Diag;
|
||||
use nyx_scanner::dynamic::spec::PayloadSlot;
|
||||
|
|
@ -48,6 +48,12 @@ mod python_fixture_tests {
|
|||
sink_line,
|
||||
confidence: Confidence::High,
|
||||
copy: CopyStrategy::PreserveName,
|
||||
// Phase 29 (Track I): the Python harness emitter shells out
|
||||
// to `python3` during verify, so the host must have it.
|
||||
// The harness short-circuits with a structured skip when
|
||||
// missing; CI rows that intentionally omit Python still go
|
||||
// green.
|
||||
requires: vec![Prerequisite::CommandAvailable("python3")],
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -65,6 +71,10 @@ mod python_fixture_tests {
|
|||
sink_line,
|
||||
confidence: Confidence::Low,
|
||||
copy: CopyStrategy::PreserveName,
|
||||
// Low-confidence rows short-circuit to
|
||||
// `Unsupported(ConfidenceTooLow)` before the harness ever
|
||||
// shells out to python3, so no prerequisite is needed.
|
||||
requires: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ mod common;
|
|||
#[cfg(feature = "dynamic")]
|
||||
mod rust_fixture_tests {
|
||||
use crate::common::fixture_harness::{
|
||||
run_fixture_and_compare_to_golden, CopyStrategy, FixtureSpec,
|
||||
run_fixture_and_compare_to_golden, CopyStrategy, FixtureSpec, Prerequisite,
|
||||
};
|
||||
use nyx_scanner::commands::scan::Diag;
|
||||
use nyx_scanner::dynamic::verify::{verify_finding, VerifyOptions};
|
||||
|
|
@ -32,6 +32,11 @@ mod rust_fixture_tests {
|
|||
sink_line,
|
||||
confidence: Confidence::High,
|
||||
copy: CopyStrategy::RustEntry,
|
||||
// Phase 29 (Track I): the Rust harness emitter shells out
|
||||
// to `cargo` during verify, so the host must have a Rust
|
||||
// toolchain on PATH. Missing cargo triggers a structured
|
||||
// skip rather than a panic.
|
||||
requires: vec![Prerequisite::CommandAvailable("cargo")],
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -49,6 +54,10 @@ mod rust_fixture_tests {
|
|||
sink_line,
|
||||
confidence: Confidence::Low,
|
||||
copy: CopyStrategy::RustEntry,
|
||||
// Low-confidence rows short-circuit to
|
||||
// `Unsupported(ConfidenceTooLow)` before the harness ever
|
||||
// shells out to cargo.
|
||||
requires: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue