"""Compute the deeper statistics the blog needs: McNemar pairwise tests, per-PDF heterogeneity, latency/token distribution percentiles. Reads the merged post-retry artifact: data/multimodal_doc/runs//parser_compare/raw_post_retry.jsonl Outputs to stdout: 1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max). 2) Per-arm input/output token distribution (mean, p50, p95, max). 3) McNemar pairwise table: for every (arm_i, arm_j) ordered pair on the same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial two-sided p-value. We include both raw (using the original raw.jsonl) and post-retry results. 4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max. Pure stdlib — no scipy/numpy. """ from __future__ import annotations import argparse import json import math import statistics from pathlib import Path REPO = Path(__file__).resolve().parents[1] # --------------------------------------------------------------------------- # I/O # --------------------------------------------------------------------------- def _read_jsonl(path: Path) -> list[dict]: out: list[dict] = [] with path.open("r", encoding="utf-8") as fh: for line in fh: line = line.strip() if not line: continue out.append(json.loads(line)) return out # --------------------------------------------------------------------------- # Distribution helpers # --------------------------------------------------------------------------- def _percentile(values: list[float], p: float) -> float: """Linear-interpolation percentile (p in [0, 100]).""" if not values: return 0.0 s = sorted(values) if len(s) == 1: return float(s[0]) k = (len(s) - 1) * (p / 100.0) lo, hi = math.floor(k), math.ceil(k) if lo == hi: return float(s[int(k)]) return float(s[lo] + (s[hi] - s[lo]) * (k - lo)) # --------------------------------------------------------------------------- # McNemar exact-binomial p-value # --------------------------------------------------------------------------- def _binom_coef(n: int, k: int) -> int: if k < 0 or k > n: return 0 return math.comb(n, k) def _mcnemar_exact_pvalue(b: int, c: int) -> float: """Two-sided exact-binomial McNemar p-value. Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong) on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5). The two-sided p-value is P(X <= min(b, c)) + P(X >= max(b, c)) computed exactly (cheap because b+c <= 27 in our run). """ n = b + c if n == 0: return 1.0 k = min(b, c) # Two-sided exact: 2 * P(X <= k) clipped at 1.0 cdf = sum(_binom_coef(n, i) for i in range(k + 1)) p = 2.0 * cdf / (2 ** n) return min(1.0, p) def _mcnemar_table(rows: list[dict]) -> dict: """Group rows -> {qid: {arm: bool_correct}} and compute pairwise.""" by_qid: dict[str, dict[str, bool]] = {} arms_seen: set[str] = set() for r in rows: qid = r["qid"] arm = r["arm"] graded = r.get("graded") or {} correct = bool(graded.get("correct")) by_qid.setdefault(qid, {})[arm] = correct arms_seen.add(arm) arms = sorted(arms_seen) qids = sorted(by_qid) out: dict[str, dict] = {"arms": arms, "n_qids": len(qids), "pairs": []} for i, ai in enumerate(arms): for aj in arms[i + 1:]: b = c = both = neither = 0 for q in qids: row = by_qid[q] if ai not in row or aj not in row: continue ci, cj = row[ai], row[aj] if ci and not cj: b += 1 elif cj and not ci: c += 1 elif ci and cj: both += 1 else: neither += 1 p = _mcnemar_exact_pvalue(b, c) out["pairs"].append({ "arm_i": ai, "arm_j": aj, "b_i_only": b, "c_j_only": c, "both_correct": both, "both_wrong": neither, "p_value": p, }) return out # --------------------------------------------------------------------------- # Per-PDF heterogeneity # --------------------------------------------------------------------------- def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]: """For each arm, per-PDF accuracy = correct/total questions on that PDF.""" bucket: dict[str, dict[str, list[bool]]] = {} for r in rows: arm = r["arm"] pdf = r["doc_id"] graded = r.get("graded") or {} bucket.setdefault(arm, {}).setdefault(pdf, []).append( bool(graded.get("correct")) ) out: dict[str, dict] = {} for arm, pdfs in bucket.items(): accs = [sum(b) / len(b) for b in pdfs.values() if b] if not accs: continue out[arm] = { "n_pdfs": len(accs), "mean": statistics.mean(accs), "std": statistics.stdev(accs) if len(accs) > 1 else 0.0, "min": min(accs), "max": max(accs), "p25": _percentile(accs, 25), "p50": _percentile(accs, 50), "p75": _percentile(accs, 75), "n_pdfs_zero": sum(1 for a in accs if a == 0.0), "n_pdfs_perfect": sum(1 for a in accs if a == 1.0), } return out # --------------------------------------------------------------------------- # Latency / token distributions # --------------------------------------------------------------------------- def _per_arm_latency(rows: list[dict]) -> dict[str, dict]: by_arm: dict[str, list[float]] = {} for r in rows: lat = r.get("latency_ms") if lat is None or lat == 0: continue by_arm.setdefault(r["arm"], []).append(float(lat)) out: dict[str, dict] = {} for arm, lats in by_arm.items(): out[arm] = { "n": len(lats), "mean_s": statistics.mean(lats) / 1000, "std_s": (statistics.stdev(lats) / 1000) if len(lats) > 1 else 0.0, "p10_s": _percentile(lats, 10) / 1000, "p25_s": _percentile(lats, 25) / 1000, "p50_s": _percentile(lats, 50) / 1000, "p75_s": _percentile(lats, 75) / 1000, "p90_s": _percentile(lats, 90) / 1000, "p95_s": _percentile(lats, 95) / 1000, "p99_s": _percentile(lats, 99) / 1000, "max_s": max(lats) / 1000, # Coefficient of variation: std / mean (unitless tail-fatness). "cv": ( statistics.stdev(lats) / statistics.mean(lats) if len(lats) > 1 and statistics.mean(lats) > 0 else 0.0 ), } return out def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]: by_arm_in: dict[str, list[float]] = {} by_arm_out: dict[str, list[float]] = {} for r in rows: t_in = r.get("input_tokens") or 0 t_out = r.get("output_tokens") or 0 if t_in: by_arm_in.setdefault(r["arm"], []).append(float(t_in)) if t_out: by_arm_out.setdefault(r["arm"], []).append(float(t_out)) out: dict[str, dict] = {} for arm in sorted(set(by_arm_in) | set(by_arm_out)): in_vals = by_arm_in.get(arm, []) out_vals = by_arm_out.get(arm, []) if not in_vals and not out_vals: continue entry: dict = {} if in_vals: entry["input"] = { "n": len(in_vals), "mean": statistics.mean(in_vals), "p50": _percentile(in_vals, 50), "p95": _percentile(in_vals, 95), "max": max(in_vals), } if out_vals: entry["output"] = { "n": len(out_vals), "mean": statistics.mean(out_vals), "p50": _percentile(out_vals, 50), "p95": _percentile(out_vals, 95), "max": max(out_vals), } out[arm] = entry return out # --------------------------------------------------------------------------- # Pretty-printing # --------------------------------------------------------------------------- def _print_latency(title: str, lat: dict[str, dict]) -> None: print() print(title) print("-" * len(title)) header = (f"{'arm':<25} {'n':>4} {'mean':>7} {'std':>7} " f"{'p50':>7} {'p90':>7} {'p95':>7} {'p99':>7} {'max':>7} {'CV':>5}") print(header) print("-" * len(header)) for arm in sorted(lat, key=lambda a: lat[a]["mean_s"]): s = lat[arm] print(f"{arm:<25} {s['n']:>4} " f"{s['mean_s']:>6.1f}s {s['std_s']:>6.1f}s " f"{s['p50_s']:>6.1f}s {s['p90_s']:>6.1f}s {s['p95_s']:>6.1f}s " f"{s['p99_s']:>6.1f}s {s['max_s']:>6.1f}s {s['cv']:>5.2f}") def _print_tokens(title: str, toks: dict[str, dict]) -> None: print() print(title) print("-" * len(title)) header = (f"{'arm':<25} {'in mean':>9} {'in p50':>9} {'in p95':>9} {'in max':>9}" f" {'out mean':>9} {'out p95':>9}") print(header) print("-" * len(header)) for arm in sorted(toks): e = toks[arm] ein = e.get("input") eout = e.get("output") if not ein: continue print(f"{arm:<25} " f"{ein['mean']:>9,.0f} {ein['p50']:>9,.0f} {ein['p95']:>9,.0f} {ein['max']:>9,.0f} " f"{(eout or {}).get('mean', 0):>9,.0f} {(eout or {}).get('p95', 0):>9,.0f}") def _print_pdf_var(title: str, var: dict[str, dict]) -> None: print() print(title) print("-" * len(title)) header = (f"{'arm':<25} {'n_pdfs':>7} {'mean':>7} {'std':>7} {'min':>7} " f"{'p25':>7} {'p50':>7} {'p75':>7} {'max':>7} {'#0%':>5} {'#100%':>6}") print(header) print("-" * len(header)) for arm in sorted(var, key=lambda a: -var[a]["mean"]): s = var[arm] print(f"{arm:<25} {s['n_pdfs']:>7} " f"{s['mean']*100:>6.1f}% {s['std']*100:>6.1f}% {s['min']*100:>6.1f}% " f"{s['p25']*100:>6.1f}% {s['p50']*100:>6.1f}% {s['p75']*100:>6.1f}% " f"{s['max']*100:>6.1f}% {s['n_pdfs_zero']:>5} {s['n_pdfs_perfect']:>6}") def _print_mcnemar(title: str, table: dict) -> None: print() print(title) print("-" * len(title)) print(f"n_qids on which all arms have a graded row: {table['n_qids']}") header = (f"{'arm_i':<25} {'arm_j':<25} {'b':>4} {'c':>4} " f"{'both ok':>8} {'both wr':>8} {'p (2-sided)':>13} {'sig':>4}") print(header) print("-" * len(header)) for pair in sorted(table["pairs"], key=lambda p: p["p_value"]): sig = "" if pair["p_value"] < 0.001: sig = "***" elif pair["p_value"] < 0.01: sig = "**" elif pair["p_value"] < 0.05: sig = "*" print(f"{pair['arm_i']:<25} {pair['arm_j']:<25} " f"{pair['b_i_only']:>4} {pair['c_j_only']:>4} " f"{pair['both_correct']:>8} {pair['both_wrong']:>8} " f"{pair['p_value']:>13.4f} {sig:>4}") # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--run-id", default="2026-05-14T00-53-19Z") args = parser.parse_args() run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare" raw_path = run_dir / "raw.jsonl" post_path = run_dir / "raw_post_retry.jsonl" if not raw_path.exists() or not post_path.exists(): raise SystemExit( "Missing raw.jsonl or raw_post_retry.jsonl. " "Run scripts/compute_post_retry_accuracy.py first." ) raw_rows = _read_jsonl(raw_path) post_rows = _read_jsonl(post_path) print(f"Run: {args.run_id}") print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}") # Latency uses post-retry rows (post-retry rows include the retry's own # latency for recovered rows). For raw, recovered rows have latency=0 # because the harness recorded a failure. _print_latency("Per-arm latency (post-retry)", _per_arm_latency(post_rows)) _print_tokens("Per-arm token distribution (post-retry)", _per_arm_tokens(post_rows)) _print_pdf_var( "Per-PDF accuracy heterogeneity (post-retry)", _per_pdf_stats(post_rows), ) _print_mcnemar( "McNemar pairwise (RAW, no retries)", _mcnemar_table(raw_rows), ) _print_mcnemar( "McNemar pairwise (POST-RETRY)", _mcnemar_table(post_rows), ) return 0 if __name__ == "__main__": raise SystemExit(main())