mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
372 lines
14 KiB
Bash
Executable file
372 lines
14 KiB
Bash
Executable file
#!/usr/bin/env bash
|
||
# M7 pre-flip ship gate.
#
# Runs all five gates required before the default-on merge can land.
# Must pass with exit 0 on the branch being merged.
#
# Usage:
#   scripts/m7_ship_gate.sh [--nyx BIN] [--corpus-dir DIR] [--skip GATE,...]
#
# Gates:
#   1. unsupported-rate  — per-cell (cap × lang) Unsupported% within budget
#   2. false-confirmed   — false-Confirmed rate from telemetry ≤ 2% per cap
#   3. wall-clock        — default scan ≤ 2× static-only on bench suite
#   4. sandbox-escape    — sandbox escape suite green for all langs
#   5. repro-stability   — repro artifact regenerates identical verdict ≥ 95%
|
||
|
||
# Strict mode: stop on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -euo pipefail

# Absolute paths of this script's directory and of its parent, which is
# treated as the repository root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Defaults that the caller's environment may pre-set; an unset or empty
# value falls back to the path shown here.
: "${NYX_BIN:=${REPO_ROOT}/target/release/nyx}"
: "${CORPUS_DIR:=${HOME}/.cache/nyx/eval_corpus}"

# Per-run state: the log file, the failure counter incremented by die(),
# and the comma-separated list of gates to skip.
GATE_LOG="${REPO_ROOT}/target/m7_gate.log"
GATE_ERRORS=0
SKIP_GATES=""
|
||
|
||
# Parse command-line options into NYX_BIN, CORPUS_DIR and SKIP_GATES.
#
#   --nyx BIN          path of the nyx binary to exercise
#   --corpus-dir DIR   evaluation corpus directory
#   --skip GATE,...    comma-separated gate names to skip
#
# An option given without its value exits 1 with an explicit message;
# without the check, `set -u` would abort with an opaque "unbound variable"
# error instead. Unrecognized arguments are still ignored, but a warning is
# printed to stderr so that a mistyped option does not go unnoticed.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --nyx | --corpus-dir | --skip)
                if [[ $# -lt 2 ]]; then
                    echo "error: option $1 requires a value" >&2
                    exit 1
                fi
                case "$1" in
                    --nyx)        NYX_BIN="$2" ;;
                    --corpus-dir) CORPUS_DIR="$2" ;;
                    --skip)       SKIP_GATES="$2" ;;
                esac
                shift 2
                ;;
            *)
                echo "warning: ignoring unrecognized argument: $1" >&2
                shift
                ;;
        esac
    done
}

parse_args "$@"
|
||
|
||
# skip GATE — succeed (status 0) when GATE appears as a whole entry in the
# comma-separated SKIP_GATES list, fail (status 1) otherwise.
skip() {
    case ",${SKIP_GATES}," in
        *",$1,"*) return 0 ;;
        *)        return 1 ;;
    esac
}
|
||
|
||
# die MESSAGE... — record a gate failure: write "GATE FAIL: MESSAGE" to
# stderr, append the same line to GATE_LOG, and bump GATE_ERRORS.
# Despite the name it does not exit; the remaining gates keep running.
die() {
    local message="GATE FAIL: $*"
    printf '%s\n' "$message" | tee -a "$GATE_LOG" >&2
    GATE_ERRORS=$(( GATE_ERRORS + 1 ))
}
|
||
# pass MESSAGE... — report a passed gate: print "GATE PASS: MESSAGE" to
# stdout and append the same line to GATE_LOG.
pass() {
    local message="GATE PASS: $*"
    printf '%s\n' "$message" | tee -a "$GATE_LOG"
}
|
||
# info MESSAGE... — print a "[gate] "-prefixed progress line to stdout and
# append the same line to GATE_LOG.
info() {
    local message="[gate] $*"
    printf '%s\n' "$message" | tee -a "$GATE_LOG"
}
|
||
|
||
# Preflight: every gate needs an executable nyx binary, so bail out early
# (exit 1) when it is missing or not executable.
if [[ ! -x "$NYX_BIN" ]]; then
    echo "nyx binary not found: $NYX_BIN" >&2
    exit 1
fi

# Start a fresh gate log (truncating any previous one) headed by a UTC
# timestamp, then record the effective configuration.
gate_log_dir="$(dirname "$GATE_LOG")"
mkdir -p "$gate_log_dir"
printf '# M7 ship gate — %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$GATE_LOG"
info "nyx: $NYX_BIN"
info "corpus: $CORPUS_DIR"
info ""
|
||
|
||
# ── Gate 1: Unsupported-rate budget ─────────────────────────────────────────
#
# Runs the eval-corpus runner on the in-house set and maps its exit status:
#   0     → gate passes
#   2     → gate fails (Unsupported rate over budget)
#   other → logged and treated as a skip (e.g. corpus not downloaded)
if skip unsupported-rate; then
    info "Gate 1 (unsupported-rate): SKIPPED"
else
    info "Gate 1: per-cell Unsupported rate within budget..."
    EVAL_RESULTS="${REPO_ROOT}/target/eval_results.json"
    # Seed an empty result set before invoking the runner.
    echo "[]" > "$EVAL_RESULTS"

    # Run eval corpus runner (in-house set always present). The runner is
    # pointed at the directory that holds $EVAL_RESULTS, so its
    # eval_results.json is already at the expected path: the former
    # `cp <dir>/eval_results.json "$EVAL_RESULTS"` copied the file onto
    # itself, always failed, and had its error suppressed. It is dropped.
    RC=0
    bash "${REPO_ROOT}/tests/eval_corpus/run.sh" \
        --nyx "$NYX_BIN" \
        --sets inhouse \
        --output "$(dirname "$EVAL_RESULTS")" 2>>"$GATE_LOG" || RC=$?

    case "$RC" in
        0)
            pass "Gate 1: unsupported-rate check passed"
            ;;
        2)
            die "Gate 1: Unsupported rate exceeds budget for one or more (cap, lang) cells"
            ;;
        *)
            info "Gate 1: eval runner returned $RC (corpus may not be downloaded; treating as SKIP)"
            ;;
    esac
fi
|
||
|
||
# ── Gate 2: False-Confirmed rate ─────────────────────────────────────────────
#
# Phase 27 (Track H.1): the telemetry log is schema-versioned. Gate 2 checks
# every record's `schema_version` field against `EXPECTED_SCHEMA_VERSION` and
# fails loudly with exit 3 when a mismatch is found — silently treating a
# v0 (pre-Phase-27) log as "no data" would mask incompatible releases mixing
# their records.
#
# Exit codes of the embedded analysis:
#   0 → every cap is within the 2% threshold
#   2 → at least one cap exceeds the threshold
#   3 → malformed JSON, non-object record, or missing/mismatched schema_version
# Any other status (interpreter crash, python3 missing) is reported as an
# unexpected analysis failure rather than being mislabelled as "rate exceeds 2%".
EXPECTED_SCHEMA_VERSION=1

if skip false-confirmed; then
    info "Gate 2 (false-confirmed): SKIPPED"
else
    info "Gate 2: false-Confirmed rate from telemetry ≤ 2% per cap..."
    EVENTS="${HOME}/.cache/nyx/dynamic/events.jsonl"
    if [[ ! -f "$EVENTS" ]]; then
        info "Gate 2: telemetry log not found at $EVENTS; skipping (no data)"
    else
        # `|| RC=$?` captures the status without tripping `set -e`.
        RC=0
        python3 - "$EVENTS" "$EXPECTED_SCHEMA_VERSION" <<'PYEOF' || RC=$?
import collections
import json
import sys

path = sys.argv[1]
expected_schema = int(sys.argv[2])

# Per-cap tallies: Confirmed verdicts, and feedback events flagged wrong.
cap_counts = collections.defaultdict(lambda: {"confirmed": 0, "wrong": 0})

with open(path, encoding="utf-8") as f:
    for line_no, raw in enumerate(f, start=1):
        if not raw.strip():
            continue
        try:
            ev = json.loads(raw)
        except json.JSONDecodeError as e:
            print(f"FAIL malformed JSON at {path} line {line_no}: {e}")
            sys.exit(3)
        # A record that is valid JSON but not an object cannot carry a
        # schema_version; reject it instead of crashing on the lookups below.
        if not isinstance(ev, dict):
            print(f"FAIL non-object record at {path} line {line_no}")
            sys.exit(3)
        if "schema_version" not in ev:
            print(f"FAIL missing schema_version at {path} line {line_no}")
            sys.exit(3)
        if ev["schema_version"] != expected_schema:
            print(
                f"FAIL schema mismatch at {path} line {line_no}: "
                f"expected {expected_schema}, found {ev['schema_version']}"
            )
            sys.exit(3)
        kind = ev.get("kind", "")
        if kind == "feedback" and ev.get("wrong"):
            cap_counts[ev.get("cap", "unknown")]["wrong"] += 1
        elif kind == "verdict" and ev.get("status") == "Confirmed":
            cap_counts[ev.get("cap", "unknown")]["confirmed"] += 1

THRESHOLD = 0.02
failed = False
for cap, counts in sorted(cap_counts.items()):
    total = counts["confirmed"]
    wrong = counts["wrong"]
    if total == 0:
        # No Confirmed verdicts for this cap: no rate can be computed.
        continue
    rate = wrong / total
    if rate > THRESHOLD:
        print(f"FAIL cap={cap}: false-Confirmed rate {rate:.1%} > {THRESHOLD:.0%} (wrong={wrong}, confirmed={total})")
        failed = True
    else:
        print(f"OK cap={cap}: false-Confirmed rate {rate:.1%} (wrong={wrong}, confirmed={total})")
sys.exit(2 if failed else 0)
PYEOF
        case "$RC" in
            0)
                pass "Gate 2: false-Confirmed rate within threshold"
                ;;
            2)
                die "Gate 2: false-Confirmed rate exceeds 2% for one or more caps"
                ;;
            3)
                die "Gate 2: telemetry schema mismatch (expected v$EXPECTED_SCHEMA_VERSION) — refusing to silently skip"
                ;;
            *)
                die "Gate 2: telemetry analysis failed unexpectedly (exit $RC)"
                ;;
        esac
    fi
fi
|
||
|
||
# ── Gate 3: Wall-clock cost ≤ 2× static-only ────────────────────────────────
#
# Times one scan of the bench fixtures with --no-verify and one without it,
# and requires the second to take at most twice as long as the first.
if skip wall-clock; then
    info "Gate 3 (wall-clock): SKIPPED"
else
    info "Gate 3: wall-clock ≤ 2× static-only on bench suite..."
    BENCH_DIR="${REPO_ROOT}/benches/fixtures"
    if [[ -d "$BENCH_DIR" ]]; then
        # Portable epoch-millis. BSD date (macOS) lacks %3N; GNU date has it.
        ms_now() { python3 -c 'import time; print(int(time.time()*1000))'; }

        # Static-only baseline. The scan's exit status is deliberately
        # ignored; only the elapsed time matters here.
        T_STATIC_START=$(ms_now)
        "$NYX_BIN" scan --no-verify --format json --no-index "$BENCH_DIR" > /dev/null 2>&1 || true
        T_STATIC_END=$(ms_now)
        T_STATIC=$(( T_STATIC_END - T_STATIC_START ))

        # Default (with verify), timed the same way.
        T_VERIFY_START=$(ms_now)
        "$NYX_BIN" scan --format json --no-index "$BENCH_DIR" > /dev/null 2>&1 || true
        T_VERIFY_END=$(ms_now)
        T_VERIFY=$(( T_VERIFY_END - T_VERIFY_START ))

        info " static-only: ${T_STATIC}ms with-verify: ${T_VERIFY}ms"

        # Allow 2× overhead.
        LIMIT=$(( 2 * T_STATIC ))
        if (( T_VERIFY <= LIMIT )); then
            pass "Gate 3: wall-clock ${T_VERIFY}ms ≤ 2× ${T_STATIC}ms baseline (limit ${LIMIT}ms)"
        else
            die "Gate 3: wall-clock ${T_VERIFY}ms > 2× ${T_STATIC}ms baseline (limit ${LIMIT}ms)"
        fi
    else
        info "Gate 3: benches/fixtures not found; skipping"
    fi
fi
|
||
|
||
# ── Gate 4: Sandbox-escape suite ─────────────────────────────────────────────
#
# Scans every escape fixture under both the process and docker backends.
# The gate fails when any finding carries a Confirmed dynamic verdict, or
# when a sentinel file shows up on the host after the runs.
if skip sandbox-escape; then
    info "Gate 4 (sandbox-escape): SKIPPED"
else
    info "Gate 4: sandbox escape suite green..."
    ESCAPE_DIR="${REPO_ROOT}/tests/dynamic_fixtures/escape"
    if [[ ! -d "$ESCAPE_DIR" ]]; then
        info "Gate 4: tests/dynamic_fixtures/escape not found; skipping"
    else
        # Run each escape fixture under both Process and Docker backends.
        # A Confirmed verdict on an escape fixture is a gate failure.
        # Additionally, check that no sentinel files were written to the host.
        ESCAPE_SENTINEL_PATHS=(
            "/tmp/nyx_escape_file_write_marker"
        )
        # Clear stale sentinels so that only this run can create them.
        for _s in "${ESCAPE_SENTINEL_PATHS[@]}"; do
            rm -f "$_s"
        done

        ESCAPE_FAILED=0
        for fixture in "$ESCAPE_DIR"/*/; do
            [[ -d "$fixture" ]] || continue
            for backend in process docker; do
                # `|| RC=$?` captures the scan status without tripping `set -e`.
                RC=0
                RESULT=$("$NYX_BIN" scan --format json --verify \
                    --backend "$backend" --no-index "$fixture" 2>/dev/null) || RC=$?
                # Only exit codes 0 and 1 are treated as a completed scan.
                if [[ $RC -ne 0 && $RC -ne 1 ]]; then
                    info " $backend/$fixture: nyx returned $RC (likely docker unavailable — skip)"
                    continue
                fi

                # Count findings whose dynamic verdict is Confirmed. Output
                # that cannot be parsed is still counted as zero, as before,
                # but is now logged so the fail-open case is visible.
                if ! CONFIRMED=$(printf '%s\n' "$RESULT" | python3 -c "
import json, sys
data = json.load(sys.stdin)
findings = data if isinstance(data, list) else data.get('findings', [])
confirmed = [
    f for f in findings
    if ((f.get('evidence') or {}).get('dynamic_verdict') or {}).get('status') == 'Confirmed'
]
print(len(confirmed))
" 2>/dev/null); then
                    info " $backend/$fixture: scan output could not be parsed as findings JSON; counting 0 Confirmed"
                    CONFIRMED=0
                fi

                if [[ "$CONFIRMED" -gt 0 ]]; then
                    die "Gate 4: escape fixture confirmed in $backend backend: $fixture"
                    ESCAPE_FAILED=1
                fi
            done
        done

        # A sentinel present on the host means a fixture escaped the sandbox.
        for _s in "${ESCAPE_SENTINEL_PATHS[@]}"; do
            if [[ -f "$_s" ]]; then
                die "Gate 4: escape sentinel written to host: $_s"
                ESCAPE_FAILED=1
            fi
        done

        if [[ $ESCAPE_FAILED -eq 0 ]]; then
            pass "Gate 4: sandbox escape suite green"
        fi

        # Leave no sentinel behind for the next run.
        for _s in "${ESCAPE_SENTINEL_PATHS[@]}"; do
            rm -f "$_s"
        done
    fi
fi
|
||
|
||
# ── Gate 5: Repro stability ≥ 95% ────────────────────────────────────────────
#
# Phase 28 (Track H.4): inversion of the legacy "conservative — treat
# unexpected errors as stable" rule. Old behaviour silently counted any
# subprocess error (timeout, missing toolchain, broken pipe) as stable,
# which let the gate pass while bundles were structurally unreplayable.
# Phase 28 flips that: known exit codes (0 = pass, 1 = sink mismatch,
# 2 = docker unavailable, 3 = toolchain mismatch) are classified
# normally, but any other failure (timeout, ENOENT on `sh`, non-zero
# code outside the documented set) is flagged as instability so the
# gate fails loudly instead of masking the problem.
#
# Exit codes of the embedded analysis: 0 = stability ≥ 95% (or nothing to
# check), 2 = stability < 95%. Any other status is reported as an
# unexpected failure of the check itself.
if skip repro-stability; then
    info "Gate 5 (repro-stability): SKIPPED"
else
    info "Gate 5: repro artifact stability ≥ 95% of Confirmed..."
    # Repro bundles live under dynamic/repro/ (written by repro.rs).
    REPRO_DIR="${HOME}/.cache/nyx/dynamic/repro"
    if [[ ! -d "$REPRO_DIR" ]] || [[ -z "$(ls -A "$REPRO_DIR" 2>/dev/null)" ]]; then
        info "Gate 5: no repro artifacts found at $REPRO_DIR; skipping"
    else
        # The status must be captured with `|| RC=$?`: under `set -e` an
        # unguarded non-zero exit from python3 would abort the whole script
        # before the failure could be recorded or the summary printed.
        RC=0
        python3 - "$REPRO_DIR" "$NYX_BIN" <<'PYEOF' || RC=$?
import json
import pathlib
import subprocess
import sys

# Phase 28 documented reproduce.sh exit codes.
EXIT_PASS = 0                # sink_hit matches expected/outcome.json
EXIT_MISMATCH = 1            # sink_hit diverged from recorded outcome
EXIT_DOCKER_UNAVAIL = 2      # --docker requested but unavailable
EXIT_TOOLCHAIN_MISMATCH = 3  # host toolchain mismatch in process mode

repro_root = pathlib.Path(sys.argv[1])
total = 0
stable = 0
unstable = 0

# Each bundle has expected/verdict.json (written by repro.rs). Sorting only
# makes the order of the report lines deterministic.
for verdict_file in sorted(repro_root.rglob("expected/verdict.json")):
    bundle_dir = verdict_file.parent.parent  # parent of expected/
    try:
        with open(verdict_file, encoding="utf-8") as f:
            orig = json.load(f)
        orig_status = orig.get("status", "")
    except Exception as e:
        # Bundle is malformed. Phase 28 inversion: this is no longer
        # silently "stable"; it is a broken bundle and counts against
        # the stability rate.
        unstable += 1
        total += 1
        print(f"UNSTABLE: {bundle_dir.name} — verdict.json unreadable ({e})")
        continue
    if orig_status != "Confirmed":
        continue
    total += 1
    reproduce_sh = bundle_dir / "reproduce.sh"
    if not reproduce_sh.exists():
        # Legacy bundles without reproduce.sh used to be counted as
        # stable; Phase 28 treats them as instability because the
        # repro bundle layout has shipped reproduce.sh since the
        # first cut of the dynamic feature.
        unstable += 1
        print(f"UNSTABLE: {bundle_dir.name} — reproduce.sh missing")
        continue
    try:
        result = subprocess.run(
            ["sh", str(reproduce_sh)],
            capture_output=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        unstable += 1
        print(f"UNSTABLE: {bundle_dir.name} — reproduce.sh exceeded 30s")
        continue
    except Exception as e:
        # Phase 28 inversion: subprocess error is no longer silent
        # success. Anything that prevents the script from completing
        # cleanly counts against stability.
        unstable += 1
        print(f"UNSTABLE: {bundle_dir.name} — invocation error ({e})")
        continue

    rc = result.returncode
    if rc == EXIT_PASS:
        stable += 1
    elif rc == EXIT_MISMATCH:
        unstable += 1
        print(f"UNSTABLE: {bundle_dir.name} — sink_hit mismatch (exit 1)")
    elif rc in (EXIT_DOCKER_UNAVAIL, EXIT_TOOLCHAIN_MISMATCH):
        # Documented environmental skip codes — neither pass nor
        # fail. Exclude from the stability ratio so an offline
        # CI row does not pollute the score.
        total -= 1
        print(f"SKIP: {bundle_dir.name} — environment exit {rc}")
    else:
        # Phase 28 inversion: any other non-zero code is unexpected.
        unstable += 1
        print(f"UNSTABLE: {bundle_dir.name} — unexpected exit {rc}")

if total == 0:
    print("No Confirmed repro artifacts found; skipping stability check.")
    sys.exit(0)

rate = stable / total
print(f"Repro stability: {stable}/{total} = {rate:.1%} (unstable={unstable})")
if rate < 0.95:
    print(f"FAIL: stability {rate:.1%} < 95%")
    sys.exit(2)
PYEOF
        case "$RC" in
            0)
                pass "Gate 5: repro stability ≥ 95%"
                ;;
            2)
                die "Gate 5: repro stability < 95%"
                ;;
            *)
                die "Gate 5: repro stability check failed unexpectedly (exit $RC)"
                ;;
        esac
    fi
fi
|
||
|
||
# ── Summary ──────────────────────────────────────────────────────────────────
#
# Point at the gate log, then exit 0 when no gate recorded a failure and
# 2 otherwise.
echo ""
info "Gate log: $GATE_LOG"
echo ""

if [[ $GATE_ERRORS -eq 0 ]]; then
    echo "M7 SHIP GATE PASSED: all active gates green."
    exit 0
fi

echo "M7 SHIP GATE FAILED: $GATE_ERRORS gate(s) did not pass."
echo "Fix failures before merging the default-on flip."
exit 2
|