mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-21 20:18:06 +02:00
[pitboss] phase 09: M7 — Default-on flip + real-corpus calibration
This commit is contained in:
parent
118cafa535
commit
996bff5983
19 changed files with 1094 additions and 51 deletions
24
tests/eval_corpus/ground_truth/README.md
Normal file
24
tests/eval_corpus/ground_truth/README.md
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
# Ground truth files
|
||||
|
||||
Place corpus ground truth JSON files here before running `tests/eval_corpus/run.sh`.
|
||||
|
||||
## OWASP Benchmark v1.2
|
||||
|
||||
File: `owasp_benchmark_v1.2.json`
|
||||
|
||||
Format:
|
||||
```json
|
||||
[
|
||||
{"path": "src/main/java/org/owasp/.../BenchmarkTest00001.java", "line": 42, "cap": "sqli", "vuln": true},
|
||||
...
|
||||
]
|
||||
```
|
||||
|
||||
Source: generate from `expectedresults-1.2.csv` shipped with the benchmark repo using
|
||||
`python3 tests/eval_corpus/owasp_gt_convert.py`.
|
||||
|
||||
## NIST SARD subset
|
||||
|
||||
File: `nist_sard.json`
|
||||
|
||||
Same format. Source: SARD manifest XML converted with `python3 tests/eval_corpus/sard_gt_convert.py`.
|
||||
66
tests/eval_corpus/report.py
Normal file
66
tests/eval_corpus/report.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Aggregate eval results across all corpus sets and emit a summary table.
|
||||
Used by run.sh after all corpus sets have been tabulated.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# Counter fields carried by every result cell and summed during aggregation.
_COUNT_FIELDS = ("tp", "fp", "fn", "unsupported", "total")

# Maximum tolerated share of findings whose dynamic verdict is Unsupported.
_UNSUPPORTED_BUDGET = 0.80


def _ratio(numerator: int, denominator: int) -> float:
    """Return numerator / denominator, treating a zero denominator as 1."""
    return numerator / max(denominator, 1)


def _aggregate(results: list) -> dict:
    """Sum the per-cell counters of every result record, keyed by (cap, lang)."""
    agg: dict[tuple[str, str], dict] = defaultdict(
        lambda: dict.fromkeys(_COUNT_FIELDS, 0)
    )
    for record in results:
        # `or []` also covers a record whose "cells" value is JSON null.
        for cell in record.get("cells") or []:
            totals = agg[(cell["cap"], cell["lang"])]
            for field in _COUNT_FIELDS:
                totals[field] += cell.get(field, 0)
    return agg


def main() -> int:
    """Print the aggregated eval table and evaluate the Unsupported-rate gate.

    Reads the JSON list given by ``--results``, sums the counters of all
    records per (cap, lang) cell, prints one table row per cell and then the
    gate verdicts.

    Returns:
        0 when there are no results or every cell is within the Unsupported
        budget, 2 when at least one cell exceeds it.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--results", required=True)
    args = p.parse_args()

    # JSON interchange is UTF-8; do not depend on the locale's default encoding.
    with open(args.results, encoding="utf-8") as f:
        results = json.load(f)

    if not results:
        print("No results to report.")
        return 0

    rows = sorted(_aggregate(results).items())

    print("\n=== Aggregated eval corpus report ===")
    print(f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>6} {'Rec':>6} {'Unsup%':>7}")
    print("-" * 72)

    # The rate is computed once per cell so the table and the gate cannot drift.
    over_budget: list[tuple[str, str, float]] = []
    for (cap, lang), v in rows:
        prec = _ratio(v["tp"], v["tp"] + v["fp"])
        rec = _ratio(v["tp"], v["tp"] + v["fn"])
        unsup = _ratio(v["unsupported"], v["total"])
        print(
            f"{cap:<20} {lang:<12} "
            f"{v['tp']:>5} {v['fp']:>5} {v['fn']:>5} "
            f"{prec:>6.2f} {rec:>6.2f} "
            f"{unsup*100:>6.1f}%"
        )
        if unsup > _UNSUPPORTED_BUDGET:
            over_budget.append((cap, lang, unsup))

    # Gate check: Unsupported rate of every (cap, lang) cell <= budget.
    print("\n=== Gate checks ===")
    for cap, lang, unsup in over_budget:
        print(f" FAIL {cap}/{lang}: Unsupported {unsup*100:.1f}% > {_UNSUPPORTED_BUDGET*100:.0f}% budget")

    if not over_budget:
        print(" All gate thresholds met.")

    return 2 if over_budget else 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
153
tests/eval_corpus/run.sh
Executable file
153
tests/eval_corpus/run.sh
Executable file
|
|
@ -0,0 +1,153 @@
|
|||
#!/usr/bin/env bash
|
||||
# Eval corpus runner for M7 pre-flip gate calibration.
|
||||
#
|
||||
# Usage:
|
||||
# tests/eval_corpus/run.sh [--output DIR] [--nyx BIN] [--sets owasp,sard,inhouse]
|
||||
#
|
||||
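# Evaluates three corpus sets: OWASP Benchmark v1.2, a NIST SARD subset, and
# the in-house bughunt-curated fixtures. External corpora are not downloaded
# automatically: when one is missing from the cache, download instructions
# are printed and that set is skipped. Runs `nyx scan --verify` on each
# available set and prints a per-cell (cap x language) precision/recall table
# with the Unsupported rate to stdout. The raw results JSON is written under
# --output DIR if given (otherwise under /tmp).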
|
||||
#
|
||||
# Environment:
|
||||
# NYX_EVAL_CORPUS_DIR — path to pre-downloaded corpus roots
|
||||
# (default: ~/.cache/nyx/eval_corpus)
|
||||
# NYX_BIN — path to nyx binary (default: ./target/release/nyx)
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 — all gate thresholds met
|
||||
# 1 — setup or I/O error
|
||||
# 2 — one or more gate thresholds exceeded (see output for details)
|
||||
|
||||
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# ── Defaults ──────────────────────────────────────────────────────────────────
OUTPUT_DIR=""
NYX_BIN="${NYX_BIN:-${REPO_ROOT}/target/release/nyx}"
CORPUS_CACHE="${NYX_EVAL_CORPUS_DIR:-${HOME}/.cache/nyx/eval_corpus}"
SETS="owasp,sard,inhouse"

# ── Argument parsing ──────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
  case "$1" in
    --output|--nyx|--sets)
      # Fail with a readable message instead of the "unbound variable" abort
      # that `set -u` would otherwise raise on "$2".
      if [[ $# -lt 2 ]]; then
        echo "error: option $1 requires a value" >&2
        exit 1
      fi
      case "$1" in
        --output) OUTPUT_DIR="$2" ;;
        --nyx)    NYX_BIN="$2" ;;
        --sets)   SETS="$2" ;;
      esac
      shift 2
      ;;
    *)
      # Unknown arguments are still skipped, but no longer silently: a typo
      # such as `--set owasp` would otherwise quietly run every corpus set.
      echo "warning: ignoring unknown argument: $1" >&2
      shift
      ;;
  esac
done
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
# Report a fatal problem on stderr and stop with exit code 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Emit a progress line on stdout, tagged with the runner prefix.
info() {
  printf '[eval] %s\n' "$*"
}

# Abort unless the named command can be found on PATH.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "required command not found: $1"
  fi
}

require_cmd jq
require_cmd python3

if [[ ! -x "$NYX_BIN" ]]; then
  die "nyx binary not found or not executable: $NYX_BIN"
fi

mkdir -p "$CORPUS_CACHE"
if [[ -n "$OUTPUT_DIR" ]]; then
  mkdir -p "$OUTPUT_DIR"
fi

# Timestamped accumulator file; starts as an empty JSON list that each
# tabulate.py invocation appends one record to.
RESULTS_JSON="${OUTPUT_DIR:-/tmp}/eval_results_$(date +%Y%m%d_%H%M%S).json"
printf '[]\n' > "$RESULTS_JSON"
|
||||
|
||||
# ── OWASP Benchmark v1.2 bootstrap ───────────────────────────────────────────
|
||||
OWASP_DIR="${CORPUS_CACHE}/owasp_benchmark_v1.2"
if [[ "$SETS" == *owasp* ]]; then
  if [[ -d "$OWASP_DIR" ]]; then
    info "Running nyx scan on OWASP Benchmark v1.2..."
    # Capture the exit status without tripping errexit; the tabulation below
    # accepts statuses 0 and 1 and treats anything else as a scanner failure.
    NYX_EXIT=0
    "$NYX_BIN" scan --format json --verify --no-index "$OWASP_DIR" \
      > /tmp/nyx_owasp.json 2>/tmp/nyx_owasp.stderr || NYX_EXIT=$?
    case "$NYX_EXIT" in
      0|1)
        python3 "${SCRIPT_DIR}/tabulate.py" \
          --label owasp \
          --scan /tmp/nyx_owasp.json \
          --ground-truth "${SCRIPT_DIR}/ground_truth/owasp_benchmark_v1.2.json" \
          --append "$RESULTS_JSON" \
          || info " tabulate.py failed; ground truth file may be absent"
        ;;
      *)
        info " nyx exited $NYX_EXIT on OWASP set (stderr follows):"
        cat /tmp/nyx_owasp.stderr >&2
        ;;
    esac
  else
    # Corpus not in the cache: print how to fetch it and move on.
    info "Bootstrapping OWASP Benchmark v1.2..."
    info " Clone from https://github.com/OWASP-Benchmark/BenchmarkJava"
    info " into ${OWASP_DIR}"
    info " then re-run this script."
    info " git clone --depth 1 --branch v1.2 \\"
    info " https://github.com/OWASP-Benchmark/BenchmarkJava \\"
    info " ${OWASP_DIR}"
    info "Skipping OWASP set (not yet downloaded)."
  fi
fi
|
||||
|
||||
# ── NIST SARD subset bootstrap ────────────────────────────────────────────────
|
||||
SARD_DIR="${CORPUS_CACHE}/nist_sard"
if [[ "$SETS" == *sard* ]]; then
  if [[ ! -d "$SARD_DIR" ]]; then
    # Corpus not in the cache: print where to get it and skip the set.
    info "Bootstrapping NIST SARD subset..."
    info " Download from https://samate.nist.gov/SARD/"
    info " into ${SARD_DIR} then re-run this script."
    info "Skipping SARD set (not yet downloaded)."
  else
    info "Running nyx scan on NIST SARD subset..."
    # errexit is suspended so the scanner's exit status can be inspected.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$SARD_DIR" \
      > /tmp/nyx_sard.json 2>/tmp/nyx_sard.stderr
    NYX_EXIT=$?
    set -e
    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
      # Show the captured stderr, as the OWASP section does; otherwise the
      # reason for the failure stays hidden in the temp file.
      info " nyx exited $NYX_EXIT on SARD set (stderr follows):"
      cat /tmp/nyx_sard.stderr >&2
    else
      python3 "${SCRIPT_DIR}/tabulate.py" \
        --label sard \
        --scan /tmp/nyx_sard.json \
        --ground-truth "${SCRIPT_DIR}/ground_truth/nist_sard.json" \
        --append "$RESULTS_JSON" \
        || info " tabulate.py failed; ground truth file may be absent"
    fi
  fi
fi
|
||||
|
||||
# ── In-house bughunt-curated set ──────────────────────────────────────────────
|
||||
if [[ "$SETS" == *inhouse* ]]; then
  INHOUSE_DIRS=(
    "${REPO_ROOT}/tests/benchmark/corpus"
    "${REPO_ROOT}/tests/dynamic_fixtures"
  )
  for dir in "${INHOUSE_DIRS[@]}"; do
    # Fixture directories are optional; silently skip the ones not present.
    [[ -d "$dir" ]] || continue
    label="inhouse_$(basename "$dir")"
    info "Running nyx scan on in-house set: $dir"
    # errexit is suspended so the scanner's exit status can be inspected.
    set +e
    "$NYX_BIN" scan --format json --verify --no-index "$dir" \
      > "/tmp/nyx_${label}.json" 2>"/tmp/nyx_${label}.stderr"
    NYX_EXIT=$?
    set -e
    if [[ $NYX_EXIT -ne 0 && $NYX_EXIT -ne 1 ]]; then
      # Show the captured stderr, as the OWASP section does; otherwise the
      # reason for the failure stays hidden in the temp file.
      info " nyx exited $NYX_EXIT on $label (stderr follows):"
      cat "/tmp/nyx_${label}.stderr" >&2
      continue
    fi
    # --inhouse: no ground truth, only the Unsupported rate is tabulated.
    python3 "${SCRIPT_DIR}/tabulate.py" \
      --label "$label" \
      --scan "/tmp/nyx_${label}.json" \
      --inhouse \
      --append "$RESULTS_JSON" \
      || info " tabulate.py failed on $label"
  done
fi
|
||||
|
||||
# ── Emit summary table ────────────────────────────────────────────────────────
|
||||
info ""
info "Results written to: $RESULTS_JSON"

REPORT_PY="${SCRIPT_DIR}/report.py"
if [[ ! -f "$REPORT_PY" ]]; then
  # Best effort: without the reporter there is no gate to evaluate.
  info "report.py not available; raw results at $RESULTS_JSON"
  exit 0
fi

# report.py returns 0 (gates met) or 2 (gate exceeded). Capture the status
# instead of letting `||` swallow it, so a gate failure reaches the caller.
REPORT_EXIT=0
python3 "$REPORT_PY" --results "$RESULTS_JSON" || REPORT_EXIT=$?

# Publish the stable-named copy whatever the gate verdict is. Written as an
# `if` so that an empty OUTPUT_DIR does not leave a non-zero status as the
# script's final exit code.
if [[ -n "$OUTPUT_DIR" ]]; then
  cp "$RESULTS_JSON" "${OUTPUT_DIR}/eval_results.json"
fi

case "$REPORT_EXIT" in
  0) exit 0 ;;
  2) exit 2 ;;
  *)
    echo "error: report.py failed with exit code $REPORT_EXIT; raw results at $RESULTS_JSON" >&2
    exit 1
    ;;
esac
|
||||
137
tests/eval_corpus/tabulate.py
Normal file
137
tests/eval_corpus/tabulate.py
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tabulate nyx scan results against a ground-truth file.
|
||||
|
||||
For OWASP / SARD sets: compares nyx findings against known-true/known-false
|
||||
labels from the ground truth JSON.
|
||||
|
||||
For in-house sets (--inhouse): counts findings by cap x language; reports
|
||||
Unsupported rate only (no ground truth required).
|
||||
|
||||
Output: appends a result record to --append FILE.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_json(path: str) -> object:
    """Parse the JSON document stored at ``path`` and return the decoded value.

    The file is read as UTF-8, the JSON interchange encoding, rather than the
    locale's default, so non-ASCII content decodes the same on every platform.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
# Cap names in match priority order: the first one found inside the rule id wins.
_CAP_NAMES = (
    "sqli", "xss", "cmdi", "ssrf", "deserialize", "path_traversal",
    "redirect", "xxe", "taint", "auth",
)


def cap_of(finding: dict) -> str:
    """Return the cap name for a finding, derived from its ``rule_id``.

    The match is a case-insensitive substring search (not a prefix match), so
    a rule id that contains several cap names resolves to the one listed first
    in ``_CAP_NAMES``. A missing, null or unmatched rule id yields ``"other"``.
    """
    # `or ""` covers a rule_id that is present but JSON null.
    rule = str(finding.get("rule_id") or "").lower()
    return next((cap for cap in _CAP_NAMES if cap in rule), "other")
|
||||
|
||||
|
||||
# File extension -> language label used in the (cap, lang) cell keys.
_EXT_TO_LANG = {
    ".py": "python", ".js": "javascript", ".ts": "typescript",
    ".java": "java", ".go": "go", ".php": "php", ".rb": "ruby",
    ".rs": "rust", ".c": "c", ".cpp": "cpp",
}


def lang_of(finding: dict) -> str:
    """Return the language label for a finding, based on its ``path`` suffix.

    The comparison is case-sensitive. A missing or null path, or a path with
    an extension that is not in the table, yields ``"unknown"``.
    """
    # `or ""` covers a path that is present but JSON null.
    path = finding.get("path") or ""
    for ext, lang in _EXT_TO_LANG.items():
        if path.endswith(ext):
            return lang
    return "unknown"
|
||||
|
||||
|
||||
def main() -> int:
    """Tabulate one nyx scan against optional ground truth and record the result.

    Every finding is counted in its (cap, lang) cell, along with whether its
    dynamic verdict status is ``"Unsupported"``. Unless ``--inhouse`` is given,
    and when the ``--ground-truth`` file exists, findings are also classified
    as true/false positives by exact ``(path, line, cap)`` match against the
    vulnerable ground-truth entries; unmatched entries count as false
    negatives. The record is appended to the ``--append`` accumulator file and
    a per-cell table is printed.

    Returns:
        Always 0; I/O and JSON errors propagate as exceptions.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--label", required=True)
    p.add_argument("--scan", required=True, help="nyx scan --format json output")
    p.add_argument("--ground-truth", default="", help="ground truth JSON")
    p.add_argument("--inhouse", action="store_true")
    p.add_argument("--append", required=True, help="results accumulator JSON")
    args = p.parse_args()

    # The scan output is either a bare list of findings or an object that
    # carries them under "findings".
    scan_data = load_json(args.scan)
    findings = scan_data if isinstance(scan_data, list) else scan_data.get("findings", [])

    # Per-cell tallies: {(cap, lang): {tp, fp, fn, unsupported, total}}
    cells: dict[tuple[str, str], dict] = defaultdict(
        lambda: {"tp": 0, "fp": 0, "fn": 0, "unsupported": 0, "total": 0}
    )

    for finding in findings:
        key = (cap_of(finding), lang_of(finding))
        # "evidence" may be absent or null; both mean no dynamic verdict.
        dv = (finding.get("evidence") or {}).get("dynamic_verdict")
        cells[key]["total"] += 1
        if dv and dv.get("status") == "Unsupported":
            cells[key]["unsupported"] += 1

    if not args.inhouse and args.ground_truth and Path(args.ground_truth).exists():
        gt = load_json(args.ground_truth)
        # Ground truth format: list of {"path": ..., "line": ..., "cap": ..., "vuln": bool}
        gt_true: set[tuple[str, int, str]] = {
            (entry.get("path", ""), entry.get("line", 0), entry.get("cap", ""))
            for entry in (gt if isinstance(gt, list) else [])
            if entry.get("vuln")
        }

        found_keys: set[tuple[str, int, str]] = set()
        for finding in findings:
            cap = cap_of(finding)
            key_gt = (finding.get("path", ""), finding.get("line", 0), cap)
            found_keys.add(key_gt)
            outcome = "tp" if key_gt in gt_true else "fp"
            cells[(cap, lang_of(finding))][outcome] += 1

        for gt_path, _gt_line, gt_cap in gt_true - found_keys:
            # Book the miss under the language of the ground-truth file so it
            # lands in the same cell as that file's true positives. Filing it
            # under "unknown" would leave every real cell with zero false
            # negatives and make its recall meaningless.
            cells[(gt_cap, lang_of({"path": gt_path}))]["fn"] += 1

    result = {
        "label": args.label,
        "total_findings": len(findings),
        "cells": [
            {
                "cap": k[0],
                "lang": k[1],
                **v,
                "precision": v["tp"] / max(v["tp"] + v["fp"], 1),
                "recall": v["tp"] / max(v["tp"] + v["fn"], 1),
                "unsupported_rate": v["unsupported"] / max(v["total"], 1),
            }
            for k, v in sorted(cells.items())
        ],
    }

    existing = load_json(args.append) if Path(args.append).exists() else []
    existing.append(result)
    # Written as UTF-8 so the file round-trips on any locale.
    with open(args.append, "w", encoding="utf-8") as out:
        json.dump(existing, out, indent=2)

    # Print summary
    print(f"\n=== {args.label} ===")
    print(f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>6} {'Rec':>6} {'Unsup%':>7}")
    print("-" * 72)
    for c in result["cells"]:
        print(
            f"{c['cap']:<20} {c['lang']:<12} "
            f"{c['tp']:>5} {c['fp']:>5} {c['fn']:>5} "
            f"{c['precision']:>6.2f} {c['recall']:>6.2f} "
            f"{c['unsupported_rate']*100:>6.1f}%"
        )
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue