#!/usr/bin/env bash
# M7 pre-flip ship gate.
#
# Executes the five gates that must be green before the default-on merge
# is allowed to land. The script has to finish with exit 0 on the branch
# that is being merged.
#
# Usage:
#   scripts/m7_ship_gate.sh [--nyx BIN] [--corpus-dir DIR] [--skip GATE,...]
#                           [--budget FILE] [--diff FILE]
#
# Gates:
#   1. unsupported-rate — per-cell (cap × lang) Unsupported% within budget
#   2. false-confirmed  — false-Confirmed rate from telemetry ≤ 2% per cap
#   3. wall-clock       — default scan ≤ 2× static-only on bench suite
#   4. sandbox-escape   — sandbox escape suite green for all langs
#   5. repro-stability  — repro artifact regenerates identical verdict ≥ 95%
#
# Phase 29 (Track I): Gate 1 reads its per-cell budgets from
# `tests/eval_corpus/budget.toml`; when `--diff PREV.json` is given it
# also fails on any monotonic-improvement regression relative to the
# previous run.

# Strict mode: abort on unhandled errors, unset variables and failed
# pipeline stages.
set -o errexit -o nounset -o pipefail

# Directory holding this script, and the repository root one level above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Environment-overridable inputs: keep a non-empty inherited value,
# otherwise fall back to the default location.
: "${NYX_BIN:=${REPO_ROOT}/target/release/nyx}"
: "${CORPUS_DIR:=${HOME}/.cache/nyx/eval_corpus}"

# Run state: comma-separated names of gates to skip, number of failed
# gates so far, and the log file every gate message is appended to.
SKIP_GATES=''
GATE_ERRORS=0
GATE_LOG="${REPO_ROOT}/target/m7_gate.log"

# Phase 29 (Track I): per-cell budgets + monotonic diff.
BUDGET_FILE="${BUDGET_FILE:-${REPO_ROOT}/tests/eval_corpus/budget.toml}"
DIFF_FILE="${DIFF_FILE:-}"

# ── Command-line parsing ─────────────────────────────────────────────────────
#
# Every recognised option takes exactly one value. A missing value is
# reported explicitly (previously it surfaced as a bare "unbound variable"
# abort under `set -u`). Unrecognised arguments are still ignored for
# backward compatibility, but are now reported on stderr so a typo such as
# `--skpi` does not go unnoticed.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --nyx | --corpus-dir | --skip | --budget | --diff)
      if [[ $# -lt 2 ]]; then
        echo "m7_ship_gate: option $1 requires a value" >&2
        exit 1
      fi
      case "$1" in
        --nyx) NYX_BIN="$2" ;;
        --corpus-dir) CORPUS_DIR="$2" ;;
        --skip) SKIP_GATES="$2" ;;
        --budget) BUDGET_FILE="$2" ;;
        --diff) DIFF_FILE="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "m7_ship_gate: ignoring unrecognised argument: $1" >&2
      shift
      ;;
  esac
done

# ── Helpers ──────────────────────────────────────────────────────────────────

# skip GATE — succeeds when GATE appears in the comma-separated SKIP_GATES.
skip() { [[ ",$SKIP_GATES," == *",$1,"* ]]; }

# die MSG... — records a gate failure: prints to stderr, appends to the gate
# log and bumps GATE_ERRORS. Despite the name it does NOT exit; the script
# keeps running so every gate is reported in a single pass.
die() {
  echo "GATE FAIL: $*" | tee -a "$GATE_LOG" >&2
  GATE_ERRORS=$((GATE_ERRORS + 1))
}

# pass MSG... — records a gate success on stdout and in the gate log.
pass() { echo "GATE PASS: $*" | tee -a "$GATE_LOG"; }

# info MSG... — informational line on stdout and in the gate log.
info() { echo "[gate] $*" | tee -a "$GATE_LOG"; }

# ── Pre-flight ───────────────────────────────────────────────────────────────

[[ -x "$NYX_BIN" ]] || {
  echo "nyx binary not found: $NYX_BIN" >&2
  exit 1
}

mkdir -p "$(dirname "$GATE_LOG")"
# Truncate the log: each run starts from a fresh, timestamped file.
echo "# M7 ship gate — $(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$GATE_LOG"
info "nyx: $NYX_BIN"
info "corpus: $CORPUS_DIR"
info "budget: $BUDGET_FILE"
info "diff: ${DIFF_FILE:-}"
info ""

# ── Gate 1: Per-cell budget + monotonic-improvement diff ───────────────────
#
# Phase 29 (Track I): the single global Unsupported threshold is replaced
# by per-cell (cap × lang) budgets in tests/eval_corpus/budget.toml.
# `tests/eval_corpus/run.sh` invokes `tabulate.py` per set and `report.py`
# at the end with `--budget` (and `--diff` when DIFF_FILE is set), so
# any per-cell failure (or any regression vs the prior run) propagates
# back as exit 2.
if skip unsupported-rate; then
  info "Gate 1 (unsupported-rate): SKIPPED"
else
  info "Gate 1: per-cell budget within tolerance + no monotonic regressions..."
  EVAL_RESULTS="${REPO_ROOT}/target/eval_results.json"
  # Seed an empty result set before the runner is invoked.
  echo "[]" > "$EVAL_RESULTS"

  if [[ ! -f "$BUDGET_FILE" ]]; then
    die "Gate 1: budget file not found at $BUDGET_FILE"
  else
    # Run eval corpus runner (in-house set always present).
    # `${DIFF_FILE:+...}` is intentionally unquoted: it expands to nothing
    # when DIFF_FILE is empty and to two words (`--diff`, the path) otherwise;
    # the inner quotes keep a path with spaces intact.
    set +e
    bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \
      --nyx "$NYX_BIN" \
      --sets inhouse \
      --output "$(dirname "$EVAL_RESULTS")" \
      --budget "$BUDGET_FILE" \
      ${DIFF_FILE:+--diff "$DIFF_FILE"} \
      >>"$GATE_LOG" 2>&1
    RC=$?
    set -e

    # The runner's --output directory is the directory of $EVAL_RESULTS, so
    # the former `cp <dir>/eval_results.json "$EVAL_RESULTS"` copied the file
    # onto itself (an error hidden by `2>/dev/null || true`). It was removed.

    if [[ $RC -eq 0 ]]; then
      pass "Gate 1: per-cell budget + diff check passed"
    elif [[ $RC -eq 2 ]]; then
      die "Gate 1: per-cell budget exceeded OR monotonic-improvement regression (see $GATE_LOG)"
    elif [[ $RC -eq 3 ]]; then
      die "Gate 1: budget/diff configuration is malformed (see $GATE_LOG)"
    else
      # NOTE(review): any other exit code (including a runner crash) is
      # treated as a skip, i.e. fail-open. Kept as-is because it looks
      # deliberate; confirm this is still wanted for a ship gate.
      info "Gate 1: eval runner returned $RC (corpus may not be downloaded; treating as SKIP)"
    fi
  fi
fi

# ── Gate 2: False-Confirmed rate ─────────────────────────────────────────────
#
# Phase 27 (Track H.1): the telemetry log is schema-versioned. Gate 2 reads
# `EXPECTED_SCHEMA_VERSION` against every record's `schema_version` field and
# fails loudly with exit 3 when a mismatch is found — silently treating a
# v0 (pre-Phase-27) log as "no data" would mask incompatible releases mixing
# their records.
#
# Evaluator exit codes: 0 = within threshold, 2 = threshold exceeded,
# 3 = unusable log (malformed JSON, non-object record, missing or mismatched
# schema_version). Anything else means the evaluator itself crashed.
EXPECTED_SCHEMA_VERSION=1

if skip false-confirmed; then
  info "Gate 2 (false-confirmed): SKIPPED"
else
  info "Gate 2: false-Confirmed rate from telemetry ≤ 2% per cap..."
  EVENTS="${HOME}/.cache/nyx/dynamic/events.jsonl"

  if [[ ! -f "$EVENTS" ]]; then
    info "Gate 2: telemetry log not found at $EVENTS; skipping (no data)"
  else
    set +e
    python3 - "$EVENTS" "$EXPECTED_SCHEMA_VERSION" <<'PYEOF'
import json, sys, collections

path = sys.argv[1]
expected_schema = int(sys.argv[2])
# Per-cap tallies: Confirmed verdicts and feedback records flagged wrong.
cap_counts = collections.defaultdict(lambda: {"confirmed": 0, "wrong": 0})

with open(path, encoding="utf-8") as f:
    for line_no, raw in enumerate(f, start=1):
        if not raw.strip():
            continue
        try:
            ev = json.loads(raw)
        except json.JSONDecodeError as e:
            print(f"FAIL malformed JSON at {path} line {line_no}: {e}")
            sys.exit(3)
        # A JSON scalar or array is not a telemetry record; reject it here
        # instead of crashing with a TypeError further down.
        if not isinstance(ev, dict):
            print(f"FAIL non-object record at {path} line {line_no}")
            sys.exit(3)
        if "schema_version" not in ev:
            print(f"FAIL missing schema_version at {path} line {line_no}")
            sys.exit(3)
        if ev["schema_version"] != expected_schema:
            print(
                f"FAIL schema mismatch at {path} line {line_no}: "
                f"expected {expected_schema}, found {ev['schema_version']}"
            )
            sys.exit(3)
        kind = ev.get("kind", "")
        if kind == "feedback" and ev.get("wrong"):
            cap = ev.get("cap", "unknown")
            cap_counts[cap]["wrong"] += 1
        elif kind == "verdict" and ev.get("status") == "Confirmed":
            cap = ev.get("cap", "unknown")
            cap_counts[cap]["confirmed"] += 1

THRESHOLD = 0.02
failed = False
for cap, counts in sorted(cap_counts.items()):
    total = counts["confirmed"]
    wrong = counts["wrong"]
    # Caps without any Confirmed verdict have no defined rate; skip them.
    if total == 0:
        continue
    rate = wrong / total
    if rate > THRESHOLD:
        print(f"FAIL cap={cap}: false-Confirmed rate {rate:.1%} > {THRESHOLD:.0%} (wrong={wrong}, confirmed={total})")
        failed = True
    else:
        print(f"OK   cap={cap}: false-Confirmed rate {rate:.1%} (wrong={wrong}, confirmed={total})")

sys.exit(2 if failed else 0)
PYEOF
    RC=$?
    set -e

    if [[ $RC -eq 0 ]]; then
      pass "Gate 2: false-Confirmed rate within threshold"
    elif [[ $RC -eq 3 ]]; then
      die "Gate 2: telemetry schema mismatch (expected v$EXPECTED_SCHEMA_VERSION) — refusing to silently skip"
    elif [[ $RC -eq 2 ]]; then
      die "Gate 2: false-Confirmed rate exceeds 2% for one or more caps"
    else
      # Evaluator crash (traceback, missing python3, ...): still a failure,
      # but must not be reported as a measured rate violation.
      die "Gate 2: telemetry evaluator exited with $RC (see output above)"
    fi
  fi
fi

# ── Gate 3: Wall-clock cost ≤ 2× static-only ────────────────────────────────
if skip wall-clock; then
  info "Gate 3 (wall-clock): SKIPPED"
else
  info "Gate 3: wall-clock ≤ 2× static-only on bench suite..."
  BENCH_DIR="${REPO_ROOT}/benches/fixtures"

  if [[ ! -d "$BENCH_DIR" ]]; then
    info "Gate 3: benches/fixtures not found; skipping"
  else
    # Portable epoch-millis. BSD date (macOS) lacks %3N; GNU date has it.
    ms_now() { python3 -c 'import time; print(int(time.time()*1000))'; }

    # Static-only baseline. The exit status is captured rather than
    # discarded so an aborted scan is visible in the log.
    STATIC_RC=0
    T_STATIC_START=$(ms_now)
    "$NYX_BIN" scan --no-verify --format json --no-index "$BENCH_DIR" \
      >/dev/null 2>&1 || STATIC_RC=$?
    T_STATIC_END=$(ms_now)
    T_STATIC=$((T_STATIC_END - T_STATIC_START))

    # Default (with verify).
    VERIFY_RC=0
    T_VERIFY_START=$(ms_now)
    "$NYX_BIN" scan --format json --no-index "$BENCH_DIR" \
      >/dev/null 2>&1 || VERIFY_RC=$?
    T_VERIFY_END=$(ms_now)
    T_VERIFY=$((T_VERIFY_END - T_VERIFY_START))

    # Gate 4 accepts exit 0 and 1 as completed scans; anything else here
    # suggests the scan aborted, which makes the timing unreliable. The
    # gate outcome is unchanged — this only leaves a trace in the log.
    if [[ $STATIC_RC -ne 0 && $STATIC_RC -ne 1 ]]; then
      info "  warning: static-only scan exited with $STATIC_RC; timing may be unreliable"
    fi
    if [[ $VERIFY_RC -ne 0 && $VERIFY_RC -ne 1 ]]; then
      info "  warning: default scan exited with $VERIFY_RC; timing may be unreliable"
    fi

    info " static-only: ${T_STATIC}ms with-verify: ${T_VERIFY}ms"

    # Allow 2× overhead.
    LIMIT=$((T_STATIC * 2))
    if [[ $T_VERIFY -le $LIMIT ]]; then
      pass "Gate 3: wall-clock ${T_VERIFY}ms ≤ 2× ${T_STATIC}ms baseline (limit ${LIMIT}ms)"
    else
      die "Gate 3: wall-clock ${T_VERIFY}ms > 2× ${T_STATIC}ms baseline (limit ${LIMIT}ms)"
    fi
  fi
fi

# ── Gate 4: Sandbox-escape suite ─────────────────────────────────────────────
if skip sandbox-escape; then
  info "Gate 4 (sandbox-escape): SKIPPED"
else
  info "Gate 4: sandbox escape suite green..."
  ESCAPE_DIR="${REPO_ROOT}/tests/dynamic_fixtures/escape"

  if [[ ! -d "$ESCAPE_DIR" ]]; then
    info "Gate 4: tests/dynamic_fixtures/escape not found; skipping"
  else
    # Run each escape fixture under both Process and Docker backends.
    # A Confirmed verdict on an escape fixture is a gate failure.
    # Additionally, check that no sentinel files were written to the host.
    ESCAPE_SENTINEL_PATHS=(
      "/tmp/nyx_escape_file_write_marker"
    )
    # Clear stale sentinels so only this run can trip the check below.
    for _s in "${ESCAPE_SENTINEL_PATHS[@]}"; do
      rm -f "$_s"
    done

    ESCAPE_FAILED=0
    for fixture in "$ESCAPE_DIR"/*/; do
      # Guards the literal, unmatched glob when the directory is empty.
      [[ -d "$fixture" ]] || continue
      for backend in process docker; do
        set +e
        RESULT=$("$NYX_BIN" scan --format json --verify \
          --backend "$backend" --no-index "$fixture" 2>/dev/null)
        RC=$?
        set -e
        if [[ $RC -ne 0 && $RC -ne 1 ]]; then
          info "  $backend/$fixture: nyx returned $RC (likely docker unavailable — skip)"
          continue
        fi

        # Count findings whose dynamic verdict is Confirmed. The report is
        # either a bare list of findings or an object with a `findings` key.
        if ! CONFIRMED=$(printf '%s\n' "$RESULT" | python3 -c '
import json, sys
data = json.load(sys.stdin)
findings = data if isinstance(data, list) else data.get("findings", [])
confirmed = [
    f for f in findings
    if ((f.get("evidence") or {}).get("dynamic_verdict") or {}).get("status") == "Confirmed"
]
print(len(confirmed))
' 2>/dev/null); then
          # Unparseable output is still counted as zero Confirmed (unchanged
          # outcome), but it is no longer silent.
          info "  $backend/$fixture: scan output was not parseable JSON; counting 0 Confirmed"
          CONFIRMED=0
        fi

        if [[ "$CONFIRMED" -gt 0 ]]; then
          die "Gate 4: escape fixture confirmed in $backend backend: $fixture"
          ESCAPE_FAILED=1
        fi
      done
    done

    for _s in "${ESCAPE_SENTINEL_PATHS[@]}"; do
      if [[ -f "$_s" ]]; then
        die "Gate 4: escape sentinel written to host: $_s"
        ESCAPE_FAILED=1
      fi
    done

    if [[ $ESCAPE_FAILED -eq 0 ]]; then
      pass "Gate 4: sandbox escape suite green"
    fi

    # Leave no sentinel behind for the next run.
    for _s in "${ESCAPE_SENTINEL_PATHS[@]}"; do
      rm -f "$_s"
    done
  fi
fi

# ── Gate 5: Repro stability ≥ 95% ────────────────────────────────────────────
#
# Phase 28 (Track H.4): inversion of the legacy "conservative — treat
# unexpected errors as stable" rule. Old behaviour silently counted any
# subprocess error (timeout, missing toolchain, broken pipe) as stable,
# which let the gate pass while bundles were structurally unreplayable.
# Phase 28 flips that: known exit codes (0 = pass, 1 = sink mismatch,
# 2 = docker unavailable, 3 = toolchain mismatch) are classified
# normally, but any other failure (timeout, ENOENT on `sh`, non-zero
# code outside the documented set) is flagged as instability so the
# gate fails loudly instead of masking the problem.
#
# Checker exit codes: 0 = stability ≥ 95% (or no Confirmed bundles),
# 2 = stability below 95%. Anything else means the checker itself crashed.
if skip repro-stability; then
  info "Gate 5 (repro-stability): SKIPPED"
else
  info "Gate 5: repro artifact stability ≥ 95% of Confirmed..."
  # Repro bundles live under dynamic/repro/ (written by repro.rs).
  REPRO_DIR="${HOME}/.cache/nyx/dynamic/repro"

  if [[ ! -d "$REPRO_DIR" ]] || [[ -z "$(ls -A "$REPRO_DIR" 2>/dev/null)" ]]; then
    info "Gate 5: no repro artifacts found at $REPRO_DIR; skipping"
  else
    # errexit must be off while the checker runs: under `set -e` a non-zero
    # exit would abort the whole script right here, before the failure is
    # recorded via die() and before the summary is printed.
    set +e
    python3 - "$REPRO_DIR" "$NYX_BIN" <<'PYEOF'
import subprocess, sys, json, pathlib

# Phase 28 documented reproduce.sh exit codes.
EXIT_PASS = 0                # sink_hit matches expected/outcome.json
EXIT_MISMATCH = 1            # sink_hit diverged from recorded outcome
EXIT_DOCKER_UNAVAIL = 2      # --docker requested but unavailable
EXIT_TOOLCHAIN_MISMATCH = 3  # host toolchain mismatch in process mode

repro_root = pathlib.Path(sys.argv[1])
total = 0
stable = 0
unstable = 0

# Each bundle has expected/verdict.json (written by repro.rs).
for verdict_file in repro_root.rglob("expected/verdict.json"):
    bundle_dir = verdict_file.parent.parent  # parent of expected/
    try:
        with open(verdict_file) as f:
            orig = json.load(f)
        orig_status = orig.get("status", "")
    except Exception as e:
        # Bundle is malformed. Phase 28 inversion: this is no longer
        # silently "stable"; it is a broken bundle and counts against
        # the stability rate.
        unstable += 1
        total += 1
        print(f"UNSTABLE: {bundle_dir.name} — verdict.json unreadable ({e})")
        continue

    # Only bundles originally recorded as Confirmed are replayed.
    if orig_status != "Confirmed":
        continue

    total += 1
    reproduce_sh = bundle_dir / "reproduce.sh"
    if not reproduce_sh.exists():
        # Legacy bundles without reproduce.sh used to be counted as
        # stable; Phase 28 treats them as instability because the
        # repro bundle layout has shipped reproduce.sh since the
        # first cut of the dynamic feature.
        unstable += 1
        print(f"UNSTABLE: {bundle_dir.name} — reproduce.sh missing")
        continue

    try:
        result = subprocess.run(
            ["sh", str(reproduce_sh)],
            capture_output=True,
            timeout=30,
        )
        rc = result.returncode
        if rc == EXIT_PASS:
            stable += 1
        elif rc == EXIT_MISMATCH:
            unstable += 1
            print(f"UNSTABLE: {bundle_dir.name} — sink_hit mismatch (exit 1)")
        elif rc in (EXIT_DOCKER_UNAVAIL, EXIT_TOOLCHAIN_MISMATCH):
            # Documented environmental skip codes — neither pass nor
            # fail. Exclude from the stability ratio so an offline
            # CI row does not pollute the score.
            total -= 1
            print(f"SKIP: {bundle_dir.name} — environment exit {rc}")
        else:
            # Phase 28 inversion: any other non-zero code is unexpected.
            unstable += 1
            print(f"UNSTABLE: {bundle_dir.name} — unexpected exit {rc}")
    except subprocess.TimeoutExpired:
        unstable += 1
        print(f"UNSTABLE: {bundle_dir.name} — reproduce.sh exceeded 30s")
    except Exception as e:
        # Phase 28 inversion: subprocess error is no longer silent
        # success. Anything that prevents the script from completing
        # cleanly counts against stability.
        unstable += 1
        print(f"UNSTABLE: {bundle_dir.name} — invocation error ({e})")

if total == 0:
    print("No Confirmed repro artifacts found; skipping stability check.")
    sys.exit(0)

rate = stable / total
print(f"Repro stability: {stable}/{total} = {rate:.1%} (unstable={unstable})")
if rate < 0.95:
    print(f"FAIL: stability {rate:.1%} < 95%")
    sys.exit(2)
PYEOF
    RC=$?
    set -e

    if [[ $RC -eq 0 ]]; then
      pass "Gate 5: repro stability ≥ 95%"
    elif [[ $RC -eq 2 ]]; then
      die "Gate 5: repro stability < 95%"
    else
      # Checker crash (traceback, missing python3, ...): still a failure,
      # but must not be reported as a measured stability rate.
      die "Gate 5: repro stability checker exited with $RC (see output above)"
    fi
  fi
fi

# ── Summary ──────────────────────────────────────────────────────────────────
#
# Exit 2 when at least one gate recorded a failure via die(), 0 otherwise.
echo ""
info "Gate log: $GATE_LOG"
if [[ $GATE_ERRORS -gt 0 ]]; then
  echo ""
  echo "M7 SHIP GATE FAILED: $GATE_ERRORS gate(s) did not pass."
  echo "Fix failures before merging the default-on flip."
  exit 2
else
  echo ""
  echo "M7 SHIP GATE PASSED: all active gates green."
  exit 0
fi