nyx/tests/eval_corpus/report.py

#!/usr/bin/env python3
"""
Aggregate eval results across all corpus sets and emit a summary table.
Used by run.sh after all corpus sets have been tabulated.
"""

import argparse
import json
import sys
from collections import defaultdict


def main(argv: "list[str] | None" = None) -> int:
    """Print an aggregated eval report and run the Unsupported-rate gate.

    Reads the JSON file named by ``--results``, sums the ``tp``, ``fp``,
    ``fn``, ``unsupported`` and ``total`` counters of every entry's ``cells``
    list per ``(cap, lang)`` pair, prints one table row per pair, and then
    reports every pair whose Unsupported rate exceeds the budget.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse fall back to ``sys.argv[1:]``, so existing ``main()``
            callers are unaffected. The annotation is quoted so it is never
            evaluated at definition time.

    Returns:
        ``0`` when the results file is empty or every pair is within budget,
        ``2`` when at least one pair exceeds the Unsupported budget.
    """
    unsupported_budget = 0.80
    counters = ("tp", "fp", "fn", "unsupported", "total")

    parser = argparse.ArgumentParser()
    parser.add_argument("--results", required=True)
    args = parser.parse_args(argv)

    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which differs between platforms.
    with open(args.results, encoding="utf-8") as f:
        results = json.load(f)

    if not results:
        print("No results to report.")
        return 0

    # Aggregate across sets: one counter bucket per (cap, lang) pair.
    agg: dict[tuple[str, str], dict] = defaultdict(
        lambda: dict.fromkeys(counters, 0)
    )
    for r in results:
        for c in r.get("cells", []):
            bucket = agg[(c["cap"], c["lang"])]
            for field in counters:
                bucket[field] += c.get(field, 0)

    print("\n=== Aggregated eval corpus report ===")
    print(f"{'Cap':<20} {'Lang':<12} {'TP':>5} {'FP':>5} {'FN':>5} {'Prec':>6} {'Rec':>6} {'Unsup%':>7}")
    print("-" * 72)

    # Gate failures are collected while the table is printed so each rate is
    # computed once; they are emitted afterwards, in the same sorted order.
    gate_failures = []
    for (cap, lang), v in sorted(agg.items()):
        # max(..., 1) guards the divisions when a denominator is zero.
        prec = v["tp"] / max(v["tp"] + v["fp"], 1)
        rec = v["tp"] / max(v["tp"] + v["fn"], 1)
        unsup = v["unsupported"] / max(v["total"], 1)
        print(
            f"{cap:<20} {lang:<12} "
            f"{v['tp']:>5} {v['fp']:>5} {v['fn']:>5} "
            f"{prec:>6.2f} {rec:>6.2f} "
            f"{unsup*100:>6.1f}%"
        )
        if unsup > unsupported_budget:
            gate_failures.append(
                f"  FAIL  {cap}/{lang}: Unsupported {unsup*100:.1f}% > {unsupported_budget*100:.0f}% budget"
            )

    # Gate check: each (cap, lang) pair's Unsupported rate must be <= 80%.
    print("\n=== Gate checks ===")
    for line in gate_failures:
        print(line)
    if not gate_failures:
        print("  All gate thresholds met.")

    return 2 if gate_failures else 0


if __name__ == "__main__":
    # Run the report when executed as a script; main()'s return value
    # becomes the process exit status.
    raise SystemExit(main())