mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-21 20:18:06 +02:00
[pitboss] phase 29: Track I — Per-cell budgets, --diff, fixture prerequisites, CI matrix expansion
This commit is contained in:
parent
760bc1beb2
commit
dd607fb4b3
10 changed files with 1325 additions and 32 deletions
210
tests/eval_corpus/budget.toml
Normal file
210
tests/eval_corpus/budget.toml
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
# Per-cell (cap × lang) budgets for the dynamic-verification eval corpus.
|
||||
#
|
||||
# Phase 29 (Track I): replaces the single global Unsupported-rate gate in
|
||||
# tests/eval_corpus/report.py with per-cell targets. Each cell records the
|
||||
# largest tolerated rate today plus a deadline date for the next ratchet.
|
||||
#
|
||||
# Schema:
|
||||
#
|
||||
# [default]
|
||||
# unsupported_rate = 0.80 # max(Unsupported / total) per cell
|
||||
# false_confirmed_rate = 0.02 # max(wrong / Confirmed) per cell
|
||||
# repro_stability = 0.95 # min(stable / Confirmed) per cell
|
||||
# ratchet_deadline = "2026-08-01"
|
||||
#
|
||||
# [[cell]]
|
||||
# cap = "sqli"
|
||||
# lang = "python"
|
||||
# unsupported_rate = 0.50
|
||||
# false_confirmed_rate = 0.02
|
||||
# repro_stability = 0.97
|
||||
# ratchet_deadline = "2026-07-15"
|
||||
#
|
||||
# `cap` matches tabulate.py's _CAP_BIT_TABLE / _CAP_RULE_TABLE labels.
|
||||
# `lang` matches the ext_map values (`python`, `javascript`, …).
|
||||
# A wildcard `"*"` in `cap` and/or `lang` applies to cells without an exact
# entry; lookup order is (cap, "*"), then ("*", lang), then ("*", "*").
|
||||
|
||||
[default]
|
||||
# Inherited by any cell not overridden below. Aligned with the legacy
|
||||
# Gate-1 / Gate-2 / Gate-5 thresholds in scripts/m7_ship_gate.sh.
|
||||
unsupported_rate = 0.80
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
# Python verticals (Phase 12 — most mature; tightest budgets).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.40
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.97
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.40
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.97
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "path_traversal"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.50
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "ssrf"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.50
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-07-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "python"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
# JavaScript / TypeScript (Phase 13 — second-most-mature).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.55
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.55
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "ssrf"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "xss"
|
||||
lang = "javascript"
|
||||
unsupported_rate = 0.70
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "typescript"
|
||||
unsupported_rate = 0.60
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
# Java (Phase 14).
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.65
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-08-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "java"
|
||||
unsupported_rate = 0.70
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
# Phase 15 / 16 verticals (Go, PHP, Ruby, Rust, C, C++) — newer; broader
|
||||
# tolerance until their probe-shim splicing follow-ups land.
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "go"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "go"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "deserialize"
|
||||
lang = "php"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "cmdi"
|
||||
lang = "ruby"
|
||||
unsupported_rate = 0.75
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "sqli"
|
||||
lang = "rust"
|
||||
unsupported_rate = 0.80
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "fmt_string"
|
||||
lang = "c"
|
||||
unsupported_rate = 0.85
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-09-15"
|
||||
|
||||
[[cell]]
|
||||
cap = "memory"
|
||||
lang = "c"
|
||||
unsupported_rate = 0.90
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-10-01"
|
||||
|
||||
[[cell]]
|
||||
cap = "memory"
|
||||
lang = "cpp"
|
||||
unsupported_rate = 0.90
|
||||
false_confirmed_rate = 0.02
|
||||
repro_stability = 0.95
|
||||
ratchet_deadline = "2026-10-01"
|
||||
|
|
@ -2,6 +2,11 @@
|
|||
"""
|
||||
Aggregate eval results across all corpus sets and emit a summary table.
|
||||
Used by run.sh after all corpus sets have been tabulated.
|
||||
|
||||
Phase 29 (Track I) extensions:
|
||||
--budget tests/eval_corpus/budget.toml per-cell budget enforcement
|
||||
--diff previous.json monotonic-improvement diff;
|
||||
CI fails on any regression.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -9,10 +14,105 @@ import json
|
|||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
|
||||
def load_budget(path: str) -> dict:
    """Parse a budget.toml file into ``{"default": {...}, "cells": {...}}``.

    ``default`` is the ``[default]`` table (empty when absent) and ``cells``
    maps ``(cap, lang)`` to the raw ``[[cell]]`` row.

    Every configuration problem is reported on stderr and ends the process
    with exit status 3: a missing or unreadable file, invalid TOML, a
    ``[default]`` that is not a table, ``cell`` written as a single table
    (``[cell]``) instead of an array of tables (``[[cell]]``), a row without
    ``cap``/``lang``, or a threshold that is not a number.
    """

    def _fail(msg: str) -> None:
        print(f"ERROR {msg}", file=sys.stderr)
        sys.exit(3)

    try:
        with open(path, "rb") as f:
            raw = tomllib.load(f)
    except FileNotFoundError:
        _fail(f"budget file not found: {path}")
    except OSError as e:
        _fail(f"budget file unreadable: {path}: {e}")
    except (tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
        _fail(f"budget file malformed: {path}: {e}")

    default = raw.get("default", {}) or {}
    if not isinstance(default, dict):
        _fail(f"budget [default] must be a table: {default!r}")

    rows = raw.get("cell", []) or []
    if not isinstance(rows, list):
        # `[cell]` parses to a dict; iterating it would yield key strings.
        _fail("budget `cell` must be an array of tables ([[cell]])")

    cells = {}
    for row in rows:
        if not isinstance(row, dict):
            _fail(f"budget cell must be a table: {row!r}")
        cap = row.get("cap")
        lang = row.get("lang")
        if not cap or not lang:
            _fail(f"budget cell missing cap/lang: {row!r}")
        cells[(cap, lang)] = row

    # The enforcement code skips any threshold that is not int/float, so a
    # quoted number would silently switch the gate off.  Reject it up front.
    rate_keys = ("unsupported_rate", "false_confirmed_rate", "repro_stability")
    tables = [("[default]", default)]
    tables.extend((f"cell {c}/{l}", row) for (c, l), row in cells.items())
    for where, table in tables:
        for key in rate_keys:
            if key not in table:
                continue
            value = table[key]
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                _fail(f"budget {where}: {key} must be a number, got {value!r}")

    return {"default": default, "cells": cells}
|
||||
|
||||
|
||||
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Return the effective thresholds for one (cap, lang) cell.

    Starts from a copy of ``budget["default"]`` and layers a single override
    row on top: the exact ``(cap, lang)`` row when present, otherwise the
    first wildcard row found in the order ``(cap, "*")``, ``("*", lang)``,
    ``("*", "*")``.  The ``cap`` / ``lang`` keys of the row are not copied.
    """
    effective = dict(budget.get("default", {}) or {})
    table = budget.get("cells", {})

    override = table.get((cap, lang))
    if not override:
        for key in ((cap, "*"), ("*", lang), ("*", "*")):
            override = table.get(key)
            if override:
                break

    if override:
        for name, value in override.items():
            if name != "cap" and name != "lang":
                effective[name] = value
    return effective
|
||||
|
||||
|
||||
def load_previous_agg(path: str) -> dict:
    """Aggregate a previous results file the same way main() does.

    The file may hold a list of result records or a single record object.
    Counts from every record's ``cells`` are summed per ``(cap, lang)``;
    counters missing from a cell count as 0.

    Returns a mapping ``(cap, lang) -> counters`` (a ``defaultdict``, so
    use ``.get`` to probe for a cell without creating it).

    A missing, unreadable or structurally invalid file is reported on
    stderr and ends the process with exit status 3.
    """

    def _fail(msg: str) -> None:
        print(f"ERROR {msg}", file=sys.stderr)
        sys.exit(3)

    try:
        with open(path) as f:
            data = json.load(f)
    except FileNotFoundError:
        _fail(f"diff file not found: {path}")
    except OSError as e:
        _fail(f"diff file unreadable: {path}: {e}")
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        _fail(f"diff file malformed: {path}: {e}")

    fields = (
        "tp",
        "fp",
        "fn",
        "unsupported",
        "confirmed",
        "wrong_confirmed",
        "stable_replays",
        "total",
    )
    agg: dict[tuple[str, str], dict] = defaultdict(lambda: dict.fromkeys(fields, 0))

    # A lone record object is accepted as a one-element list.
    records = data if isinstance(data, list) else [data]
    for r in records:
        if not isinstance(r, dict):
            _fail(f"diff file malformed: {path}: record is not an object: {r!r}")
        for c in r.get("cells", []) or []:
            try:
                k = (c["cap"], c["lang"])
            except (KeyError, TypeError):
                _fail(f"diff file malformed: {path}: cell missing cap/lang: {c!r}")
            for field in fields:
                agg[k][field] += c.get(field, 0)
    return agg
|
||||
|
||||
|
||||
def main() -> int:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--results", required=True)
|
||||
p.add_argument(
|
||||
"--budget",
|
||||
default="",
|
||||
help="path to budget.toml (per-(cap,lang) thresholds)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--diff",
|
||||
default="",
|
||||
help="path to a previous results.json; fail on monotonic-improvement regression",
|
||||
)
|
||||
args = p.parse_args()
|
||||
|
||||
with open(args.results) as f:
|
||||
|
|
@ -24,12 +124,30 @@ def main() -> int:
|
|||
|
||||
# Aggregate across sets.
|
||||
agg: dict[tuple[str, str], dict] = defaultdict(
|
||||
lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
|
||||
lambda: {
|
||||
"tp": 0,
|
||||
"fp": 0,
|
||||
"fn": 0,
|
||||
"unsupported": 0,
|
||||
"confirmed": 0,
|
||||
"wrong_confirmed": 0,
|
||||
"stable_replays": 0,
|
||||
"total": 0,
|
||||
}
|
||||
)
|
||||
for r in results:
|
||||
for c in r.get("cells", []):
|
||||
k = (c["cap"], c["lang"])
|
||||
for field in ("tp", "fp", "fn", "unsupported", "total"):
|
||||
for field in (
|
||||
"tp",
|
||||
"fp",
|
||||
"fn",
|
||||
"unsupported",
|
||||
"confirmed",
|
||||
"wrong_confirmed",
|
||||
"stable_replays",
|
||||
"total",
|
||||
):
|
||||
agg[k][field] += c.get(field, 0)
|
||||
|
||||
print("\n=== Aggregated eval corpus report ===")
|
||||
|
|
@ -46,18 +164,114 @@ def main() -> int:
|
|||
f"{unsup*100:>6.1f}%"
|
||||
)
|
||||
|
||||
# Gate check: per-cap Unsupported rate <= 80%
|
||||
gate_failed = False
|
||||
print("\n=== Gate checks ===")
|
||||
UNSUPPORTED_BUDGET = 0.80
|
||||
for k, v in sorted(agg.items()):
|
||||
unsup = v["unsupported"] / max(v["total"], 1)
|
||||
if unsup > UNSUPPORTED_BUDGET:
|
||||
print(f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}% > {UNSUPPORTED_BUDGET*100:.0f}% budget")
|
||||
gate_failed = True
|
||||
|
||||
if not gate_failed:
|
||||
print(" All gate thresholds met.")
|
||||
# ── Phase 29: per-cell budget enforcement ────────────────────────────
|
||||
if args.budget:
|
||||
budget = load_budget(args.budget)
|
||||
print(f"\n=== Per-cell budget ({args.budget}) ===")
|
||||
cell_fails: list[str] = []
|
||||
for k, v in sorted(agg.items()):
|
||||
b = budget_for_cell(budget, k[0], k[1])
|
||||
if not b:
|
||||
continue
|
||||
max_unsup = b.get("unsupported_rate")
|
||||
max_false = b.get("false_confirmed_rate")
|
||||
min_stable = b.get("repro_stability")
|
||||
|
||||
if isinstance(max_unsup, (int, float)) and v["total"] > 0:
|
||||
rate = v["unsupported"] / v["total"]
|
||||
if rate > max_unsup:
|
||||
cell_fails.append(
|
||||
f" FAIL {k[0]}/{k[1]}: Unsupported {rate*100:.1f}%"
|
||||
f" > budget {max_unsup*100:.1f}%"
|
||||
)
|
||||
if isinstance(max_false, (int, float)) and v["confirmed"] > 0:
|
||||
rate = v["wrong_confirmed"] / v["confirmed"]
|
||||
if rate > max_false:
|
||||
cell_fails.append(
|
||||
f" FAIL {k[0]}/{k[1]}: false-Confirmed {rate*100:.1f}%"
|
||||
f" > budget {max_false*100:.1f}%"
|
||||
)
|
||||
if (
|
||||
isinstance(min_stable, (int, float))
|
||||
and v["confirmed"] > 0
|
||||
and v.get("stable_replays", 0) > 0
|
||||
):
|
||||
rate = v["stable_replays"] / v["confirmed"]
|
||||
if rate < min_stable:
|
||||
cell_fails.append(
|
||||
f" FAIL {k[0]}/{k[1]}: repro stability {rate*100:.1f}%"
|
||||
f" < budget {min_stable*100:.1f}%"
|
||||
)
|
||||
if cell_fails:
|
||||
for line in cell_fails:
|
||||
print(line)
|
||||
gate_failed = True
|
||||
else:
|
||||
print(" All per-cell budgets met.")
|
||||
else:
|
||||
# Legacy fallback: per-cap Unsupported rate <= 80%.
|
||||
print("\n=== Gate checks ===")
|
||||
UNSUPPORTED_BUDGET = 0.80
|
||||
cell_fails: list[str] = []
|
||||
for k, v in sorted(agg.items()):
|
||||
unsup = v["unsupported"] / max(v["total"], 1)
|
||||
if unsup > UNSUPPORTED_BUDGET:
|
||||
cell_fails.append(
|
||||
f" FAIL {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}%"
|
||||
f" > {UNSUPPORTED_BUDGET*100:.0f}% budget"
|
||||
)
|
||||
if cell_fails:
|
||||
for line in cell_fails:
|
||||
print(line)
|
||||
gate_failed = True
|
||||
else:
|
||||
print(" All gate thresholds met.")
|
||||
|
||||
# ── Phase 29: monotonic-improvement diff ─────────────────────────────
|
||||
if args.diff:
|
||||
prev = load_previous_agg(args.diff)
|
||||
print(f"\n=== Monotonic-improvement diff vs {args.diff} ===")
|
||||
diff_fails: list[str] = []
|
||||
EPS = 0.005
|
||||
for k, v in sorted(agg.items()):
|
||||
old = prev.get(k)
|
||||
if not old:
|
||||
continue
|
||||
old_unsup = old["unsupported"] / max(old["total"], 1)
|
||||
new_unsup = v["unsupported"] / max(v["total"], 1)
|
||||
if new_unsup > old_unsup + EPS:
|
||||
diff_fails.append(
|
||||
f" REGRESSION {k[0]}/{k[1]}: Unsupported"
|
||||
f" {old_unsup*100:.1f}% → {new_unsup*100:.1f}%"
|
||||
)
|
||||
old_conf = old.get("confirmed", 0)
|
||||
new_conf = v.get("confirmed", 0)
|
||||
old_false = (old.get("wrong_confirmed", 0) / old_conf) if old_conf else None
|
||||
new_false = (v.get("wrong_confirmed", 0) / new_conf) if new_conf else None
|
||||
if old_false is not None and new_false is not None and new_false > old_false + EPS:
|
||||
diff_fails.append(
|
||||
f" REGRESSION {k[0]}/{k[1]}: false-Confirmed"
|
||||
f" {old_false*100:.1f}% → {new_false*100:.1f}%"
|
||||
)
|
||||
old_stable = (old.get("stable_replays", 0) / old_conf) if old_conf else None
|
||||
new_stable = (v.get("stable_replays", 0) / new_conf) if new_conf else None
|
||||
if (
|
||||
old_stable is not None
|
||||
and new_stable is not None
|
||||
and new_stable < old_stable - EPS
|
||||
):
|
||||
diff_fails.append(
|
||||
f" REGRESSION {k[0]}/{k[1]}: repro stability"
|
||||
f" {old_stable*100:.1f}% → {new_stable*100:.1f}%"
|
||||
)
|
||||
if diff_fails:
|
||||
for line in diff_fails:
|
||||
print(line)
|
||||
gate_failed = True
|
||||
else:
|
||||
print(" No regressions vs previous run.")
|
||||
|
||||
return 2 if gate_failed else 0
|
||||
|
||||
|
|
|
|||
|
|
@ -29,12 +29,17 @@ OUTPUT_DIR=""
|
|||
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
|
||||
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
|
||||
SETS="owasp,sard,inhouse"
|
||||
# Phase 29 (Track I): per-cell budgets + monotonic-improvement diff.
|
||||
BUDGET_FILE=""
|
||||
DIFF_FILE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--output) OUTPUT_DIR="$2"; shift 2 ;;
|
||||
--nyx) NYX_BIN="$2"; shift 2 ;;
|
||||
--sets) SETS="$2"; shift 2 ;;
|
||||
--budget) BUDGET_FILE="$2"; shift 2 ;;
|
||||
--diff) DIFF_FILE="$2"; shift 2 ;;
|
||||
*) shift ;;
|
||||
esac
|
||||
done
|
||||
|
|
@ -83,6 +88,8 @@ if [[ "$SETS" == *owasp* ]]; then
|
|||
--scan /tmp/nyx_owasp.json \
|
||||
--ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
|
||||
--append "$RESULTS_JSON" \
|
||||
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|
||||
|| info " tabulate.py failed; ground truth file may be absent"
|
||||
fi
|
||||
fi
|
||||
|
|
@ -111,6 +118,8 @@ if [[ "$SETS" == *sard* ]]; then
|
|||
--scan /tmp/nyx_sard.json \
|
||||
--ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
|
||||
--append "$RESULTS_JSON" \
|
||||
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|
||||
|| info " tabulate.py failed; ground truth file may be absent"
|
||||
fi
|
||||
fi
|
||||
|
|
@ -140,6 +149,8 @@ if [[ "$SETS" == *inhouse* ]]; then
|
|||
--scan "/tmp/nyx_${label}.json" \
|
||||
--inhouse \
|
||||
--append "$RESULTS_JSON" \
|
||||
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"} \
|
||||
|| info " tabulate.py failed on $label"
|
||||
done
|
||||
fi
|
||||
|
|
@ -156,12 +167,20 @@ if [[ ! -f "${SCRIPT_DIR}/report.py" ]]; then
|
|||
fi
|
||||
|
||||
set +e
|
||||
python3 "${SCRIPT_DIR}/report.py" --results "$RESULTS_JSON"
|
||||
python3 "${SCRIPT_DIR}/report.py" \
|
||||
--results "$RESULTS_JSON" \
|
||||
${BUDGET_FILE:+--budget "$BUDGET_FILE"} \
|
||||
${DIFF_FILE:+--diff "$DIFF_FILE"}
|
||||
REPORT_RC=$?
|
||||
set -e
|
||||
# Propagate gate-fail (exit 2). Treat other non-zero as setup error (exit 1).
|
||||
# Propagate gate-fail (exit 2) and malformed-config (exit 3) so the
|
||||
# m7_ship_gate.sh Gate-1 dispatch can tell them apart. Treat other
|
||||
# non-zero as setup error (exit 1).
|
||||
if [[ $REPORT_RC -eq 2 ]]; then
|
||||
exit 2
|
||||
elif [[ $REPORT_RC -eq 3 ]]; then
|
||||
info "report.py: budget/diff configuration malformed; see $RESULTS_JSON"
|
||||
exit 3
|
||||
elif [[ $REPORT_RC -ne 0 ]]; then
|
||||
info "report.py crashed (exit $REPORT_RC); raw results at $RESULTS_JSON"
|
||||
exit 1
|
||||
|
|
|
|||
|
|
@ -9,6 +9,17 @@ For in-house sets (--inhouse): counts findings by cap x language; reports
|
|||
Unsupported rate only (no ground truth required).
|
||||
|
||||
Output: appends a result record to --append FILE.
|
||||
|
||||
Phase 29 (Track I) extensions:
|
||||
--budget tests/eval_corpus/budget.toml enforce per-cell budget thresholds
|
||||
--diff previous.json compare against prior result file,
|
||||
fail on monotonic-improvement
|
||||
regression
|
||||
|
||||
Exit codes:
|
||||
0 all rows pass.
|
||||
2 one or more per-cell budgets exceeded OR a diff regression was found.
|
||||
3 malformed budget / diff input (callers must fix configuration).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -17,6 +28,11 @@ import sys
|
|||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tomllib # Python 3.11+
|
||||
except ModuleNotFoundError: # pragma: no cover — older interpreters only
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
LINE_TOLERANCE = 5
|
||||
|
||||
# Bitflag positions for Cap (src/labels/mod.rs). Sink bits map to a cap label.
|
||||
|
|
@ -97,6 +113,203 @@ def lang_of(finding: dict) -> str:
|
|||
return "unknown"
|
||||
|
||||
|
||||
# ── Budget loading ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def load_budget(path: str) -> dict:
    """Parse a budget.toml file.

    Returns a dict::

        {
            "default": {"unsupported_rate": 0.8, "false_confirmed_rate": 0.02,
                        "repro_stability": 0.95, "ratchet_deadline": "..."},
            "cells": {(cap, lang): {...overrides...}, ...},
        }

    Raises SystemExit(3), after printing the reason to stderr, when the
    file is missing, unreadable, not valid TOML, or structurally wrong:
    ``[default]`` is not a table, ``cell`` is a single table (``[cell]``)
    rather than an array of tables (``[[cell]]``), a row lacks
    ``cap``/``lang``, or a rate threshold is not a number.
    """

    def _fail(msg: str) -> None:
        print(f"ERROR {msg}", file=sys.stderr)
        sys.exit(3)

    def _check_rates(where: str, table: dict) -> None:
        # enforce_budget() ignores thresholds that are not int/float, so a
        # mistyped value would quietly disable the gate for that cell.
        for key in ("unsupported_rate", "false_confirmed_rate", "repro_stability"):
            if key not in table:
                continue
            value = table[key]
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                _fail(f"budget {where}: {key} must be a number, got {value!r}")

    try:
        with open(path, "rb") as f:
            raw = tomllib.load(f)
    except FileNotFoundError:
        _fail(f"budget file not found: {path}")
    except OSError as e:
        _fail(f"budget file unreadable: {path}: {e}")
    except (tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
        _fail(f"budget file malformed: {path}: {e}")

    default = raw.get("default", {}) or {}
    if not isinstance(default, dict):
        _fail(f"budget [default] must be a table: {default!r}")
    _check_rates("[default]", default)

    rows = raw.get("cell", []) or []
    if not isinstance(rows, list):
        # `[cell]` parses to a dict; iterating it would yield key strings.
        _fail("budget `cell` must be an array of tables ([[cell]])")

    cells = {}
    for row in rows:
        if not isinstance(row, dict):
            _fail(f"budget cell must be a table: {row!r}")
        cap = row.get("cap")
        lang = row.get("lang")
        if not cap or not lang:
            _fail(f"budget cell missing cap/lang: {row!r}")
        _check_rates(f"cell {cap}/{lang}", row)
        cells[(cap, lang)] = row

    return {"default": default, "cells": cells}
|
||||
|
||||
|
||||
def budget_for_cell(budget: dict, cap: str, lang: str) -> dict:
    """Resolve the thresholds that apply to the (cap, lang) cell.

    The result is a fresh copy of ``[default]`` with one override row merged
    over it.  An exact ``(cap, lang)`` row wins; without one, the first
    wildcard row among ``(cap, "*")``, ``("*", lang)``, ``("*", "*")`` is
    used.  The row's own ``cap`` / ``lang`` keys are left out of the result.
    """
    resolved = dict(budget.get("default", {}) or {})
    rows = budget.get("cells", {})

    candidates = [(cap, lang), (cap, "*"), ("*", lang), ("*", "*")]
    chosen = None
    for key in candidates:
        chosen = rows.get(key)
        if chosen:
            break

    if chosen:
        resolved.update(
            (name, value)
            for name, value in chosen.items()
            if name not in ("cap", "lang")
        )
    return resolved
|
||||
|
||||
|
||||
def enforce_budget(cells: list, budget: dict) -> list:
    """Check every cell against its budget and collect failure messages.

    For each cell the merged budget row from ``budget_for_cell`` supplies up
    to three thresholds: a ceiling on the Unsupported rate, a ceiling on the
    false-Confirmed rate, and a floor on repro stability.  A threshold that
    is not numeric, or a measurement with a zero denominator (no findings,
    no Confirmed findings), counts as "no data" and is skipped rather than
    reported as a failure.

    Returns the human-readable failure strings; an empty list means every
    cell is within budget.
    """
    numeric = (int, float)
    problems = []

    for cell in cells:
        cap = cell["cap"]
        lang = cell["lang"]
        limits = budget_for_cell(budget, cap, lang)
        if not limits:
            continue

        unsup_ceiling = limits.get("unsupported_rate")
        false_ceiling = limits.get("false_confirmed_rate")
        stable_floor = limits.get("repro_stability")

        if isinstance(unsup_ceiling, numeric) and cell.get("total", 0) > 0:
            if cell["unsupported_rate"] > unsup_ceiling:
                problems.append(
                    f" FAIL {cap}/{lang}: Unsupported {cell['unsupported_rate']*100:.1f}%"
                    f" > budget {unsup_ceiling*100:.1f}%"
                )

        if isinstance(false_ceiling, numeric) and cell.get("confirmed", 0) > 0:
            observed = cell.get("wrong_confirmed", 0) / cell["confirmed"]
            if observed > false_ceiling:
                problems.append(
                    f" FAIL {cap}/{lang}: false-Confirmed {observed*100:.1f}%"
                    f" > budget {false_ceiling*100:.1f}%"
                )

        # A zero stable_replays count cannot be told apart from "stability
        # was never measured for this row", so the floor is only applied
        # once at least one replay has been stamped stable.
        stability_measured = (
            isinstance(stable_floor, numeric)
            and cell.get("confirmed", 0) > 0
            and cell.get("stable_replays", 0) > 0
        )
        if stability_measured:
            observed = cell["stable_replays"] / cell["confirmed"]
            if observed < stable_floor:
                problems.append(
                    f" FAIL {cap}/{lang}: repro stability {observed*100:.1f}%"
                    f" < budget {stable_floor*100:.1f}%"
                )

    return problems
|
||||
|
||||
|
||||
# ── Diff loading ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def load_previous_cells(path: str, label: str) -> dict:
    """Index a previous results file by (cap, lang) -> cell.

    The previous file is the same shape as `--append`'s output: a list of
    result records (a single record object is also accepted).  The record
    whose `label` equals *label* is used; if none matches, the first record
    is used instead.  An empty file (no records, or an empty record) yields
    an empty dict.

    A missing, unreadable or structurally invalid file is reported on
    stderr and raises SystemExit(3).
    """

    def _fail(msg: str) -> None:
        print(f"ERROR {msg}", file=sys.stderr)
        sys.exit(3)

    try:
        with open(path) as f:
            data = json.load(f)
    except FileNotFoundError:
        _fail(f"diff file not found: {path}")
    except OSError as e:
        _fail(f"diff file unreadable: {path}: {e}")
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        _fail(f"diff file malformed: {path}: {e}")

    records = data if isinstance(data, list) else [data]
    for r in records:
        if not isinstance(r, dict):
            _fail(f"diff file malformed: {path}: record is not an object: {r!r}")

    chosen = next((r for r in records if r.get("label") == label), None)
    if chosen is None and records:
        # No record carries this label: fall back to the first one.
        chosen = records[0]
    if not chosen:
        return {}

    indexed = {}
    for c in chosen.get("cells", []) or []:
        try:
            key = (c["cap"], c["lang"])
        except (KeyError, TypeError):
            _fail(f"diff file malformed: {path}: cell missing cap/lang: {c!r}")
        indexed[key] = c
    return indexed
|
||||
|
||||
|
||||
def diff_regressions(cells: list, prev: dict) -> list:
    """Report cells whose metrics got worse than in the previous run.

    A cell regresses when, compared with its entry in *prev*:

    * its Unsupported rate went up,
    * its false-Confirmed rate (wrong_confirmed / confirmed) went up, or
    * its repro stability (stable_replays / confirmed) went down.

    The two Confirmed-based ratios are only compared when both runs have a
    non-zero ``confirmed`` count.  Cells with no (or an empty) entry in
    *prev* are new and are skipped.  Movements of at most 0.005 (half a
    percentage point) are tolerated as noise.

    Returns the list of regression messages, empty when nothing regressed.
    """
    tolerance = 0.005

    def share(row: dict, field: str, denom):
        # None marks "no Confirmed findings", i.e. nothing to compare.
        return (row.get(field, 0) / denom) if denom else None

    regressions = []
    for current in cells:
        cap, lang = current["cap"], current["lang"]
        before = prev.get((cap, lang))
        if not before:
            continue

        was_unsup = before.get("unsupported_rate", 0.0)
        now_unsup = current.get("unsupported_rate", 0.0)
        if now_unsup > was_unsup + tolerance:
            regressions.append(
                f" REGRESSION {cap}/{lang}: Unsupported"
                f" {was_unsup*100:.1f}% → {now_unsup*100:.1f}%"
            )

        was_confirmed = before.get("confirmed", 0)
        now_confirmed = current.get("confirmed", 0)

        was_false = share(before, "wrong_confirmed", was_confirmed)
        now_false = share(current, "wrong_confirmed", now_confirmed)
        if was_false is not None and now_false is not None:
            if now_false > was_false + tolerance:
                regressions.append(
                    f" REGRESSION {cap}/{lang}: false-Confirmed"
                    f" {was_false*100:.1f}% → {now_false*100:.1f}%"
                )

        # Stability is the one metric where a lower value is the regression.
        was_stable = share(before, "stable_replays", was_confirmed)
        now_stable = share(current, "stable_replays", now_confirmed)
        if was_stable is not None and now_stable is not None:
            if now_stable < was_stable - tolerance:
                regressions.append(
                    f" REGRESSION {cap}/{lang}: repro stability"
                    f" {was_stable*100:.1f}% → {now_stable*100:.1f}%"
                )

    return regressions
|
||||
|
||||
|
||||
def main() -> int:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--label", required=True)
|
||||
|
|
@ -104,14 +317,34 @@ def main() -> int:
|
|||
p.add_argument("--ground-truth", default="", help="ground truth JSON")
|
||||
p.add_argument("--inhouse", action="store_true")
|
||||
p.add_argument("--append", required=True, help="results accumulator JSON")
|
||||
p.add_argument(
|
||||
"--budget",
|
||||
default="",
|
||||
help="path to budget.toml (per-(cap,lang) thresholds)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--diff",
|
||||
default="",
|
||||
help="path to a previous results JSON; fail on monotonic-improvement regression",
|
||||
)
|
||||
args = p.parse_args()
|
||||
|
||||
scan_data = load_json(args.scan)
|
||||
findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
|
||||
|
||||
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported}}
|
||||
# Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, confirmed,
|
||||
# wrong_confirmed, stable_replays, total}}
|
||||
cells: dict[tuple[str, str], dict] = defaultdict(
|
||||
lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
|
||||
lambda: {
|
||||
"tp": 0,
|
||||
"fp": 0,
|
||||
"fn": 0,
|
||||
"unsupported": 0,
|
||||
"confirmed": 0,
|
||||
"wrong_confirmed": 0,
|
||||
"stable_replays": 0,
|
||||
"total": 0,
|
||||
}
|
||||
)
|
||||
|
||||
for f in findings:
|
||||
|
|
@ -121,8 +354,19 @@ def main() -> int:
|
|||
ev = f.get("evidence", {}) or {}
|
||||
dv = ev.get("dynamic_verdict") if ev else None
|
||||
cells[key]["total"] += 1
|
||||
if dv and dv.get("status") == "Unsupported":
|
||||
cells[key]["unsupported"] += 1
|
||||
if dv:
|
||||
status = dv.get("status")
|
||||
if status == "Unsupported":
|
||||
cells[key]["unsupported"] += 1
|
||||
elif status == "Confirmed":
|
||||
cells[key]["confirmed"] += 1
|
||||
# Repro-stability and false-Confirmed counts are optional
|
||||
# fields tabulate.py reads off the verdict when callers
|
||||
# (m7_ship_gate.sh / corpus_promote.yml) have stamped them.
|
||||
if dv.get("wrong") is True:
|
||||
cells[key]["wrong_confirmed"] += 1
|
||||
if dv.get("replay_stable") is True:
|
||||
cells[key]["stable_replays"] += 1
|
||||
|
||||
if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
|
||||
gt = load_json(args.ground_truth)
|
||||
|
|
@ -201,7 +445,34 @@ def main() -> int:
|
|||
f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
|
||||
f"{c['unsupported_rate']*100:>6.1f}%"
|
||||
)
|
||||
return 0
|
||||
|
||||
exit_rc = 0
|
||||
|
||||
# ── Phase 29: per-cell budget enforcement ─────────────────────────────
|
||||
if args.budget:
|
||||
budget = load_budget(args.budget)
|
||||
failures = enforce_budget(result["cells"], budget)
|
||||
if failures:
|
||||
print(f"\n=== Per-cell budget regressions ({args.budget}) ===")
|
||||
for line in failures:
|
||||
print(line)
|
||||
exit_rc = 2
|
||||
else:
|
||||
print(f"\nPer-cell budget ({args.budget}): OK")
|
||||
|
||||
# ── Phase 29: diff against previous run ───────────────────────────────
|
||||
if args.diff:
|
||||
prev = load_previous_cells(args.diff, args.label)
|
||||
failures = diff_regressions(result["cells"], prev)
|
||||
if failures:
|
||||
print(f"\n=== Monotonic-improvement regressions vs {args.diff} ===")
|
||||
for line in failures:
|
||||
print(line)
|
||||
exit_rc = 2
|
||||
else:
|
||||
print(f"\nDiff vs {args.diff}: no regressions")
|
||||
|
||||
return exit_rc
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
241
tests/eval_corpus/test_tabulate_regression.py
Normal file
241
tests/eval_corpus/test_tabulate_regression.py
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 29 (Track I) regression test for tests/eval_corpus/tabulate.py.
|
||||
|
||||
Exercises --budget and --diff against hand-crafted scan + ground-truth
|
||||
fixtures so the per-cell budget gate and monotonic-improvement diff are
|
||||
demonstrably non-vacuous.
|
||||
|
||||
Run with::
|
||||
|
||||
python3 tests/eval_corpus/test_tabulate_regression.py
|
||||
|
||||
Exits 0 when every assertion holds, non-zero otherwise. The asserts are
|
||||
plain `assert` statements so the file works both as a stand-alone script
|
||||
and under unittest discovery.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root, found by walking up from this file (which sits in
# tests/eval_corpus/), followed by the script and budget file under test.
REPO = Path(__file__).resolve().parents[2]
TABULATE = REPO.joinpath("tests/eval_corpus/tabulate.py")
BUDGET = REPO.joinpath("tests/eval_corpus/budget.toml")
|
||||
|
||||
|
||||
def run_tabulate(*args: str) -> subprocess.CompletedProcess:
    """Run tabulate.py with *args* under the current interpreter.

    stdout and stderr are captured as text; the exit status is available on
    the returned ``CompletedProcess`` and is not checked here.
    """
    argv = [sys.executable, str(TABULATE)]
    argv.extend(args)
    return subprocess.run(argv, capture_output=True, text=True)
|
||||
|
||||
|
||||
def write_json(path: Path, data: object) -> None:
    """Serialize *data* as 2-space-indented JSON and write it to *path*."""
    payload = json.dumps(data, indent=2)
    path.write_text(payload)
|
||||
|
||||
|
||||
# Sink capability bitflags used to build fixture findings.  The bit
# positions are copied from tabulate.py / src/labels/mod.rs.
SINK_BIT_SQL = 0b1000_0000  # bit 7: SQL_QUERY
SINK_BIT_CMDI = 0b100_0000_0000  # bit 10: CODE_EXEC
|
||||
|
||||
|
||||
def python_finding(cap_bit: int, path: str, line: int, status: str | None) -> dict:
    """Build one scan finding for the fixtures.

    *cap_bit* is stored as the finding's ``evidence.sink_caps``.  When
    *status* is non-empty it is recorded as the dynamic verdict status;
    otherwise the finding carries no ``dynamic_verdict`` at all.
    """
    evidence: dict = {"sink_caps": cap_bit}
    if status:
        evidence["dynamic_verdict"] = {"status": status}
    return {
        "path": path,
        "line": line,
        "col": 0,
        "id": "py.sqli.cursor_execute",
        "evidence": evidence,
    }
|
||||
|
||||
|
||||
def test_budget_passes_on_clean_scan(tmp: Path) -> None:
    """A scan with no Unsupported verdicts must pass the --budget gate.

    Writes three SQL findings for ``app.py`` (two Confirmed, one
    NotConfirmed) plus an empty results file into ``tmp``, runs tabulate.py
    against the checked-in budget, and expects exit code 0 with the
    per-cell budget section reporting OK.
    """
    verdicts = ((10, "Confirmed"), (20, "Confirmed"), (30, "NotConfirmed"))
    findings = [
        python_finding(SINK_BIT_SQL, "app.py", line_no, verdict)
        for line_no, verdict in verdicts
    ]
    scan = tmp / "scan_clean.json"
    write_json(scan, {"findings": findings})

    append = tmp / "results_clean.json"
    write_json(append, [])

    cli = [
        "--label", "test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--budget", str(BUDGET),
    ]
    proc = run_tabulate(*cli)

    assert proc.returncode == 0, f"clean scan must pass budget, got rc={proc.returncode}\nstdout: {proc.stdout}\nstderr: {proc.stderr}"
    assert "Per-cell budget" in proc.stdout, proc.stdout
    assert "OK" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_budget_fails_when_unsupported_exceeds(tmp: Path) -> None:
    """An all-Unsupported cell must trip the --budget gate with exit code 2.

    The SQL_QUERY/python budget is 40% Unsupported. This fixture puts five
    findings in that cell, every one of them Unsupported (100%), so the
    gate has to fail and name the ``sqli/python`` cell in its output.
    """
    findings = []
    for line_no in range(10, 51, 10):  # lines 10, 20, 30, 40, 50
        findings.append(
            python_finding(SINK_BIT_SQL, "app.py", line_no, "Unsupported")
        )
    scan = tmp / "scan_unsup.json"
    write_json(scan, {"findings": findings})

    append = tmp / "results_unsup.json"
    write_json(append, [])

    cli = [
        "--label", "test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--budget", str(BUDGET),
    ]
    proc = run_tabulate(*cli)

    assert proc.returncode == 2, (
        f"budget breach must exit 2, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "FAIL" in proc.stdout, proc.stdout
    assert "sqli/python" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_diff_fails_on_regression(tmp: Path) -> None:
    """--diff must exit 2 when a cell's Unsupported rate gets worse.

    Previous run: 1/4 Unsupported = 25%. Current run: 3/4 = 75%. The
    default cell budget tolerates 80%, but the monotonic-improvement diff
    must still flag the +50pp regression. No --budget is passed, so only
    the diff can produce the failure.
    """
    # Seed the baseline: record the previous run into prev_results.json.
    prev_statuses = ("Confirmed", "Confirmed", "Confirmed", "Unsupported")
    prev_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", line_no, verdict)
        for line_no, verdict in enumerate(prev_statuses, start=1)
    ]
    prev_scan = tmp / "prev_scan.json"
    write_json(prev_scan, {"findings": prev_findings})
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    seed = run_tabulate(
        "--label", "diff-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    )
    rc_prev = seed.returncode
    assert rc_prev == 0, f"prev seed run must succeed, got {rc_prev}"

    # Current run: same label and lines, but three of four now Unsupported.
    cur_statuses = ("Unsupported", "Unsupported", "Unsupported", "Confirmed")
    cur_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", line_no, verdict)
        for line_no, verdict in enumerate(cur_statuses, start=1)
    ]
    cur_scan = tmp / "cur_scan.json"
    write_json(cur_scan, {"findings": cur_findings})
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    proc = run_tabulate(
        "--label", "diff-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )

    assert proc.returncode == 2, (
        f"regression diff must exit 2, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "REGRESSION" in proc.stdout, proc.stdout
    assert "Unsupported" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_diff_passes_on_improvement(tmp: Path) -> None:
    """--diff must exit 0 when a cell's Unsupported rate improves.

    Previous: 3/4 Unsupported. Current: 1/4. Monotonic improvement must
    not flag any regression, and the output must say "no regressions".

    Args:
        tmp: Empty scratch directory that receives the scan and results
            fixtures for both runs.
    """
    prev_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Unsupported"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Confirmed"),
    ]
    prev_scan = tmp / "prev_scan.json"
    write_json(prev_scan, {"findings": prev_findings})
    prev_results = tmp / "prev_results.json"
    write_json(prev_results, [])
    seed = run_tabulate(
        "--label", "improve-test",
        "--scan", str(prev_scan),
        "--inhouse",
        "--append", str(prev_results),
    )
    # The seed run records the baseline that --diff compares against. If it
    # failed, the "no regressions" check below may hold only because there
    # is nothing to compare with, so require it to succeed (as the
    # regression test does for its own seed run).
    assert seed.returncode == 0, (
        f"prev seed run must succeed, got {seed.returncode}\n"
        f"stdout: {seed.stdout}\nstderr: {seed.stderr}"
    )

    cur_findings = [
        python_finding(SINK_BIT_CMDI, "x.unknown", 1, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 2, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 3, "Confirmed"),
        python_finding(SINK_BIT_CMDI, "x.unknown", 4, "Unsupported"),
    ]
    cur_scan = tmp / "cur_scan.json"
    write_json(cur_scan, {"findings": cur_findings})
    cur_results = tmp / "cur_results.json"
    write_json(cur_results, [])
    proc = run_tabulate(
        "--label", "improve-test",
        "--scan", str(cur_scan),
        "--inhouse",
        "--append", str(cur_results),
        "--diff", str(prev_results),
    )
    assert proc.returncode == 0, (
        f"improvement diff must exit 0, got {proc.returncode}\n"
        f"stdout: {proc.stdout}\nstderr: {proc.stderr}"
    )
    assert "no regressions" in proc.stdout, proc.stdout
|
||||
|
||||
|
||||
def test_budget_malformed_exits_3(tmp: Path) -> None:
    """A budget file that cannot be parsed must make tabulate.py exit 3.

    The budget written here assigns the bare word ``not_a_number`` to
    ``unsupported_rate``. The scan has no findings, so the exit status
    cannot come from a budget breach in the scan data.
    """
    bad = tmp / "bad.toml"
    bad.write_text("[default]\nunsupported_rate = not_a_number\n")

    scan = tmp / "scan.json"
    append = tmp / "results.json"
    write_json(scan, {"findings": []})
    write_json(append, [])

    cli = [
        "--label", "test",
        "--scan", str(scan),
        "--inhouse",
        "--append", str(append),
        "--budget", str(bad),
    ]
    proc = run_tabulate(*cli)

    assert proc.returncode == 3, (
        f"malformed budget must exit 3, got {proc.returncode}\nstderr: {proc.stderr}"
    )
|
||||
|
||||
|
||||
def main() -> int:
    """Run every regression check in order and return the exit status.

    Each check gets its own fresh sub-directory (named after the check)
    inside one temporary directory. A failing assertion propagates out of
    this function, so the success message is printed and 0 returned only
    when every check has passed.
    """
    checks = (
        test_budget_passes_on_clean_scan,
        test_budget_fails_when_unsupported_exceeds,
        test_diff_fails_on_regression,
        test_diff_passes_on_improvement,
        test_budget_malformed_exits_3,
    )
    with tempfile.TemporaryDirectory() as td:
        root = Path(td)
        for check in checks:
            workdir = root / check.__name__
            workdir.mkdir()
            print(f"... {check.__name__}")
            check(workdir)
            print(" OK")
    print("\nAll tabulate.py regression checks passed.")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue