[pitboss] phase 09: M7 — Default-on flip + real-corpus calibration

2026-06-21 20:18:06 +02:00 · 2026-05-12 14:33:40 -04:00 · 2026-05-12 14:33:40 -04:00 · 996bff5983
commit 996bff5983
parent 118cafa535
19 changed files with 1094 additions and 51 deletions
--- a/tests/eval_corpus/ground_truth/README.md
+++ b/tests/eval_corpus/ground_truth/README.md
@ -0,0 +1,24 @@
+# Ground truth files
+
+Place corpus ground truth JSON files here before running `tests/eval_corpus/run.sh`.
+
+## OWASP Benchmark v1.2
+
+File: `owasp_benchmark_v1.2.json`
+
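+Format (a JSON array; only entries with `"vuln": true` are used for scoring, and `path`, `line` and `cap` must match the nyx finding exactly):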
+```json
+[
+  {"path": "src/main/java/org/owasp/.../BenchmarkTest00001.java", "line": 42, "cap": "sqli", "vuln": true},
+  ...
+]
+```
+
+Source: generate from `expectedresults-1.2.csv` shipped with the benchmark repo using
+`python3 tests/eval_corpus/owasp_gt_convert.py`.
+
+## NIST SARD subset
+
+File: `nist_sard.json`
+
+Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`.
--- a/tests/eval_corpus/report.py
+++ b/tests/eval_corpus/report.py
@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+"""
+Aggregate eval results across all corpus sets and emit a summary table.
+Used by run.sh after all corpus sets have been tabulated.
+Exits 2 if any (cap, lang) cell exceeds the Unsupported-rate budget, else 0.
+"""
+
+import argparse
+import json
+import sys
+from collections import defaultdict
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--results", required=True)
+    args = p.parse_args()
+
+    with open(args.results) as f:
+        results = json.load(f)
+
+    if not results:
+        print("No results to report.")
+        return 0
+
+    # Aggregate across sets.
+    agg: dict[tuple[str, str], dict] = defaultdict(
+        lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
+    )
+    for r in results:
+        for c in r.get("cells", []):
+            k = (c["cap"], c["lang"])
+            for field in ("tp", "fp", "fn", "unsupported", "total"):
+                agg[k][field] += c.get(field, 0)
+
+    print("\n=== Aggregated eval corpus report ===")
+    print(f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>6} {'Rec':>6} {'Unsup%':>7}")
+    print("-" * 72)
+    for k, v in sorted(agg.items()):
+        prec = v["tp"] / max(v["tp"] + v["fp"], 1)
+        rec = v["tp"] / max(v["tp"] + v["fn"], 1)
+        unsup = v["unsupported"] / max(v["total"], 1)
+        print(
+            f"{k[0]:<20} {k[1]:<12} "
+            f"{v['tp']:>5} {v['fp']:>5} {v['fn']:>5} "
+            f"{prec:>6.2f} {rec:>6.2f} "
+            f"{unsup*100:>6.1f}%"
+        )
+
+    # Gate check: Unsupported rate of every (cap, lang) cell must be <= 80%
+    gate_failed = False
+    print("\n=== Gate checks ===")
+    UNSUPPORTED_BUDGET = 0.80
+    for k, v in sorted(agg.items()):
+        unsup = v["unsupported"] / max(v["total"], 1)
+        if unsup > UNSUPPORTED_BUDGET:
+            print(f"  FAIL  {k[0]}/{k[1]}: Unsupported {unsup*100:.1f}% > {UNSUPPORTED_BUDGET*100:.0f}% budget")
+            gate_failed = True
+
+    if not gate_failed:
+        print("  All gate thresholds met.")
+
+    return 2 if gate_failed else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/eval_corpus/run.sh
+++ b/tests/eval_corpus/run.sh
@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+# Eval corpus runner for M7 pre-flip gate calibration.
+#
+# Usage:
+#   tests/eval_corpus/run.sh [--output DIR] [--nyx BIN] [--sets owasp,sard,inhouse]
+#
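+# Scans the OWASP Benchmark v1.2, the NIST SARD subset, and the in-house
+# bughunt-curated fixtures with `nyx scan --format json --verify --no-index`.
+# OWASP and SARD are not downloaded automatically: if a corpus directory is
+# missing, download instructions are printed and that set is skipped.
+# Emits a per-cell (cap x language) precision/recall table and Unsupported
+# rate to stdout; the results JSON is written to --output DIR if given,
+# otherwise to /tmp.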
+#
+# Environment:
+#   NYX_EVAL_CORPUS_DIR  — path to pre-downloaded corpus roots
+#                          (default: ~/.cache/nyx/eval_corpus)
+#   NYX_BIN              — path to nyx binary (default: <repo root>/target/release/nyx)
+#
+# Exit codes:
+#   0 — all gate thresholds met
+#   1 — setup or I/O error
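+#   2 — one or more gate thresholds exceeded (see output for details)
+#
+# NOTE(review): as written, the exit codes above are not what the script
+# returns. The final `report.py ... || { ...; exit 0; }` fallback also catches
+# report.py's gate-failure status 2 and turns it into exit 0, and when --output
+# is not given the trailing `[[ -n "$OUTPUT_DIR" ]] && cp ...` leaves the
+# script's exit status at 1 even though every gate passed.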
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# ── Defaults ──────────────────────────────────────────────────────────────────
+OUTPUT_DIR=""
+NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
+CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
+SETS="owasp,sard,inhouse"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --output) OUTPUT_DIR="$2"; shift 2 ;;
+    --nyx)    NYX_BIN="$2"; shift 2 ;;
+    --sets)   SETS="$2"; shift 2 ;;
+    *)        shift ;;
+  esac
+done
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+die()  { echo "error: $*" >&2; exit 1; }
+info() { echo "[eval] $*"; }
+
+require_cmd() { command -v "$1" >/dev/null 2>&1 || die "required command not found: $1"; }
+require_cmd jq
+require_cmd python3
+
+[[ -x "$NYX_BIN" ]] || die "nyx binary not found or not executable: $NYX_BIN"
+
+mkdir -p "$CORPUS_CACHE"
+[[ -n "$OUTPUT_DIR" ]] && mkdir -p "$OUTPUT_DIR"
+
+RESULTS_JSON="${OUTPUT_DIR:-/tmp}/eval_results_$(date +%Y%m%d_%H%M%S).json"
+echo "[]" > "$RESULTS_JSON"
+
+# ── OWASP Benchmark v1.2 bootstrap ───────────────────────────────────────────
+OWASP_DIR="${CORPUS_CACHE}/owasp_benchmark_v1.2"
+if [[ "$SETS" == *owasp* ]]; then
+  if [[ ! -d "$OWASP_DIR" ]]; then
+    info "Bootstrapping OWASP Benchmark v1.2..."
+    info "  Clone from https://github.com/OWASP-Benchmark/BenchmarkJava"
+    info "  into ${OWASP_DIR}"
+    info "  then re-run this script."
+    info "  git clone --depth 1 --branch v1.2 \\"
+    info "    https://github.com/OWASP-Benchmark/BenchmarkJava \\"
+    info "    ${OWASP_DIR}"
+    info "Skipping OWASP set (not yet downloaded)."
+  else
+    info "Running nyx scan on OWASP Benchmark v1.2..."
+    set +e
+    "$NYX_BIN" scan --format json --verify --no-index "$OWASP_DIR" \
+      > /tmp/nyx_owasp.json 2>/tmp/nyx_owasp.stderr
+    NYX_EXIT=$?
+    set -e
+    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
+      info "  nyx exited $NYX_EXIT on OWASP set (stderr follows):"
+      cat /tmp/nyx_owasp.stderr >&2
+    else
+      python3 "${SCRIPT_DIR}/tabulate.py" \
+        --label owasp \
+        --scan /tmp/nyx_owasp.json \
+        --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
+        --append "$RESULTS_JSON" \
+        || info "  tabulate.py failed; ground truth file may be absent"
+    fi
+  fi
+fi
+
+# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
+SARD_DIR="${CORPUS_CACHE}/nist_sard"
+if [[ "$SETS" == *sard* ]]; then
+  if [[ ! -d "$SARD_DIR" ]]; then
+    info "Bootstrapping NIST SARD subset..."
+    info "  Download from https://samate.nist.gov/SARD/"
+    info "  into ${SARD_DIR} then re-run this script."
+    info "Skipping SARD set (not yet downloaded)."
+  else
+    info "Running nyx scan on NIST SARD subset..."
+    set +e
+    "$NYX_BIN" scan --format json --verify --no-index "$SARD_DIR" \
+      > /tmp/nyx_sard.json 2>/tmp/nyx_sard.stderr
+    NYX_EXIT=$?
+    set -e
+    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
+      info "  nyx exited $NYX_EXIT on SARD set"
+    else
+      python3 "${SCRIPT_DIR}/tabulate.py" \
+        --label sard \
+        --scan /tmp/nyx_sard.json \
+        --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
+        --append "$RESULTS_JSON" \
+        || info "  tabulate.py failed; ground truth file may be absent"
+    fi
+  fi
+fi
+
+# ── In-house bughunt-curated set ──────────────────────────────────────────────
+if [[ "$SETS" == *inhouse* ]]; then
+  INHOUSE_DIRS=(
+    "${REPO_ROOT}/tests/benchmark/corpus"
+    "${REPO_ROOT}/tests/dynamic_fixtures"
+  )
+  for dir in "${INHOUSE_DIRS[@]}"; do
+    [[ -d "$dir" ]] || continue
+    label="inhouse_$(basename "$dir")"
+    info "Running nyx scan on in-house set: $dir"
+    set +e
+    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
+      > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
+    NYX_EXIT=$?
+    set -e
+    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
+      info "  nyx exited $NYX_EXIT on $label"
+      continue
+    fi
+    python3 "${SCRIPT_DIR}/tabulate.py" \
+      --label "$label" \
+      --scan "/tmp/nyx_${label}.json" \
+      --inhouse \
+      --append "$RESULTS_JSON" \
+      || info "  tabulate.py failed on $label"
+  done
+fi
+
+# ── Emit summary table ────────────────────────────────────────────────────────
+info ""
+info "Results written to: $RESULTS_JSON"
+python3 "${SCRIPT_DIR}/report.py" --results "$RESULTS_JSON" \
+  || { info "report.py not available; raw results at $RESULTS_JSON"; exit 0; }
+
+[[ -n "$OUTPUT_DIR" ]] && cp "$RESULTS_JSON" "${OUTPUT_DIR}/eval_results.json"
--- a/tests/eval_corpus/tabulate.py
+++ b/tests/eval_corpus/tabulate.py
@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""
+Tabulate nyx scan results against a ground-truth file.
+
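+For OWASP / SARD sets: compares nyx findings against the known-true
+(`"vuln": true`) entries of the ground truth JSON, keyed by (path, line, cap).
+A finding with no matching entry counts as a false positive; an unmatched
+entry counts as a false negative and is tallied under lang "unknown".
+If the ground truth file does not exist, TP/FP/FN are left at zero.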
+
+For in-house sets (--inhouse): counts findings by cap x language; reports
+Unsupported rate only (no ground truth required).
+
+Output: appends a result record to --append FILE.
+"""
+
+import argparse
+import json
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+
+def load_json(path: str) -> object:
+    with open(path) as f:
+        return json.load(f)
+
+
+def cap_of(finding: dict) -> str:
+    rule = finding.get("rule_id", "")
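+    # Map rule_id to a cap name: the first cap (in list order) that occurs
+    # anywhere in the lowercased rule_id wins; no match yields "other".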
+    for cap in ["sqli", "xss", "cmdi", "ssrf", "deserialize", "path_traversal",
+                "redirect", "xxe", "taint", "auth"]:
+        if cap in rule.lower():
+            return cap
+    return "other"
+
+
+def lang_of(finding: dict) -> str:
+    path = finding.get("path", "")
+    ext_map = {
+        ".py": "python", ".js": "javascript", ".ts": "typescript",
+        ".java": "java", ".go": "go", ".php": "php", ".rb": "ruby",
+        ".rs": "rust", ".c": "c", ".cpp": "cpp",
+    }
+    for ext, lang in ext_map.items():
+        if path.endswith(ext):
+            return lang
+    return "unknown"
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--label", required=True)
+    p.add_argument("--scan", required=True, help="nyx scan --format json output")
+    p.add_argument("--ground-truth", default="", help="ground truth JSON")
+    p.add_argument("--inhouse", action="store_true")
+    p.add_argument("--append", required=True, help="results accumulator JSON")
+    args = p.parse_args()
+
+    scan_data = load_json(args.scan)
+    findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])
+
+    # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, total}}
+    cells: dict[tuple[str, str], dict] = defaultdict(
+        lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
+    )
+
+    for f in findings:
+        cap = cap_of(f)
+        lang = lang_of(f)
+        key = (cap, lang)
+        ev = f.get("evidence", {}) or {}
+        dv = ev.get("dynamic_verdict") if ev else None
+        cells[key]["total"] += 1
+        if dv and dv.get("status") == "Unsupported":
+            cells[key]["unsupported"] += 1
+
+    if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
+        gt = load_json(args.ground_truth)
+        # Ground truth format: list of {"path": ..., "line": ..., "cap": ..., "vuln": bool}
+        gt_true: set[tuple[str, int, str]] = set()
+        for entry in gt if isinstance(gt, list) else []:
+            if entry.get("vuln"):
+                gt_true.add((entry.get("path", ""), entry.get("line", 0), entry.get("cap", "")))
+
+        found_keys: set[tuple[str, int, str]] = set()
+        for f in findings:
+            key_gt = (f.get("path", ""), f.get("line", 0), cap_of(f))
+            found_keys.add(key_gt)
+            cap = cap_of(f)
+            lang = lang_of(f)
+            cell_key = (cap, lang)
+            if key_gt in gt_true:
+                cells[cell_key]["tp"] += 1
+            else:
+                cells[cell_key]["fp"] += 1
+
+        for gt_key in gt_true:
+            if gt_key not in found_keys:
+                cap = gt_key[2]
+                cells[(cap, "unknown")]["fn"] += 1
+
+    result = {
+        "label": args.label,
+        "total_findings": len(findings),
+        "cells": [
+            {
+                "cap": k[0],
+                "lang": k[1],
+                **v,
+                "precision": v["tp"] / max(v["tp"] + v["fp"], 1),
+                "recall": v["tp"] / max(v["tp"] + v["fn"], 1),
+                "unsupported_rate": v["unsupported"] / max(v["total"], 1),
+            }
+            for k, v in sorted(cells.items())
+        ],
+    }
+
+    existing = load_json(args.append) if Path(args.append).exists() else []
+    existing.append(result)
+    with open(args.append, "w") as f:
+        json.dump(existing, f, indent=2)
+
+    # Print summary
+    print(f"\n=== {args.label} ===")
+    print(f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>6} {'Rec':>6} {'Unsup%':>7}")
+    print("-" * 72)
+    for c in result["cells"]:
+        print(
+            f"{c['cap']:<20} {c['lang']:<12} "
+            f"{c['tp']:>5} {c['fp']:>5} {c['fn']:>5} "
+            f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
+            f"{c['unsupported_rate']*100:>6.1f}%"
+        )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())