SurfSense/surfsense_evals/scripts/compute_blog_extras.py
DESKTOP-RTLN3BA\$punk 9bcd50164d feat(evals): publish multimodal_doc parser_compare benchmark + n=171 report
Adds the full parser_compare experiment for the multimodal_doc suite:
six arms compared on 30 PDFs / 171 questions from MMLongBench-Doc with
anthropic/claude-sonnet-4.5 across the board.

Source code:
- core/parsers/{azure_di,llamacloud,pdf_pages}.py: direct parser SDK
  callers (Azure Document Intelligence prebuilt-read/layout, LlamaParse
  parse_page_with_llm/parse_page_with_agent) used by the LC arms,
  bypassing the SurfSense backend so each (basic/premium) extraction
  is a clean A/B independent of backend ETL routing.
- suites/multimodal_doc/parser_compare/{ingest,runner,prompt}.py:
  six-arm benchmark (native_pdf, azure_basic_lc, azure_premium_lc,
  llamacloud_basic_lc, llamacloud_premium_lc, surfsense_agentic) with
  byte-identical prompts per question, deterministic grader, Wilson
  CIs, and the per-page preprocessing tariff cost overlay.

Reproducibility:
- pyproject.toml + uv.lock pin pypdf, azure-ai-documentintelligence,
  llama-cloud-services as new deps.
- .env.example documents the AZURE_DI_* and LLAMA_CLOUD_API_KEY env
  vars now required for parser_compare.
- 12 analysis scripts under scripts/: retry pass with exponential
  backoff, post-retry accuracy merge, McNemar / latency / per-PDF
  stats, context-overflow hypothesis test, etc. Each produces one
  number cited by the blog report.

Citation surface:
- reports/blog/multimodal_doc_parser_compare_n171_report.md: 1219-line
  technical writeup (16 sections) covering headline accuracy, per-format
  accuracy, McNemar pairwise significance, latency / token / per-PDF
  distributions, error analysis, retry experiment, post-retry final
  accuracy, cost amortization model with closed-form derivation, threats
  to validity, and reproducibility appendix.
- data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/{raw,
  raw_retries,raw_post_retry}.jsonl + run_artifact.json + retry summary
  whitelisted via data/.gitignore as the verifiable numbers source.

Gitignore:
- ignore logs_*.txt + retry_run.log; structured artifacts cover the
  citation surface, debug logs are noise.
- data/.gitignore default-ignores everything, whitelists the n=171 run
  artifacts only (parser manifest left ignored to avoid leaking local
  Windows usernames in absolute paths; manifest is fully regenerable
  via 'ingest multimodal_doc parser_compare').
- reports/.gitignore now whitelists hand-curated reports/blog/.

Also retires the abandoned CRAG Task 3 implementation (download script,
streaming Task 3 ingest, CragTask3Benchmark + tests) and trims the
runner / ingest module APIs to match.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-14 19:54:41 -07:00

381 lines
13 KiB
Python

"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
per-PDF heterogeneity, latency/token distribution percentiles.
Reads the original and the merged post-retry artifacts:
data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
Outputs to stdout:
1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
2) Per-arm input/output token distribution (mean, p50, p95, max).
3) McNemar pairwise table: for every unordered (arm_i, arm_j) pair on the
same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and
b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial
two-sided p-value. We include both raw (using the original raw.jsonl)
and post-retry results.
4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
Pure stdlib — no scipy/numpy.
"""
from __future__ import annotations
import argparse
import json
import math
import statistics
from pathlib import Path
# Directory one level above the folder that holds this script; main()
# resolves the run's data/ directory relative to it.
REPO = Path(__file__).resolve().parents[1]
# ---------------------------------------------------------------------------
# I/O
# ---------------------------------------------------------------------------
def _read_jsonl(path: Path) -> list[dict]:
out: list[dict] = []
with path.open("r", encoding="utf-8") as fh:
for line in fh:
line = line.strip()
if not line:
continue
out.append(json.loads(line))
return out
# ---------------------------------------------------------------------------
# Distribution helpers
# ---------------------------------------------------------------------------
def _percentile(values: list[float], p: float) -> float:
"""Linear-interpolation percentile (p in [0, 100])."""
if not values:
return 0.0
s = sorted(values)
if len(s) == 1:
return float(s[0])
k = (len(s) - 1) * (p / 100.0)
lo, hi = math.floor(k), math.ceil(k)
if lo == hi:
return float(s[int(k)])
return float(s[lo] + (s[hi] - s[lo]) * (k - lo))
# ---------------------------------------------------------------------------
# McNemar exact-binomial p-value
# ---------------------------------------------------------------------------
def _binom_coef(n: int, k: int) -> int:
if k < 0 or k > n:
return 0
return math.comb(n, k)
def _mcnemar_exact_pvalue(b: int, c: int) -> float:
"""Two-sided exact-binomial McNemar p-value.
Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong)
on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5).
The two-sided p-value is
P(X <= min(b, c)) + P(X >= max(b, c))
computed exactly (cheap because b+c <= 27 in our run).
"""
n = b + c
if n == 0:
return 1.0
k = min(b, c)
# Two-sided exact: 2 * P(X <= k) clipped at 1.0
cdf = sum(_binom_coef(n, i) for i in range(k + 1))
p = 2.0 * cdf / (2 ** n)
return min(1.0, p)
def _mcnemar_table(rows: list[dict]) -> dict:
    """Index rows as {qid: {arm: correct}} and run McNemar on every arm pair.

    A row counts as correct only when its "graded" entry holds a truthy
    "correct" flag; a later row for the same (qid, arm) replaces an earlier
    one. Each pair of arms is visited once (in sorted order) and tallied only
    over the qids for which both arms have a row.

    Returns a dict with "arms" (sorted arm names), "n_qids" (number of
    distinct qids seen in any row) and "pairs" (one record per arm pair with
    the discordant/concordant counts and the exact two-sided p-value).
    """
    outcomes: dict[str, dict[str, bool]] = {}
    for rec in rows:
        qid = rec["qid"]
        arm = rec["arm"]
        verdict = rec.get("graded") or {}
        is_right = bool(verdict.get("correct"))
        outcomes.setdefault(qid, {})[arm] = is_right

    arms = sorted({arm for marks in outcomes.values() for arm in marks})
    qids = sorted(outcomes)

    pairs: list[dict] = []
    for pos, first in enumerate(arms):
        for second in arms[pos + 1:]:
            only_first = only_second = agree_right = agree_wrong = 0
            for qid in qids:
                marks = outcomes[qid]
                if not (first in marks and second in marks):
                    continue
                ok_first, ok_second = marks[first], marks[second]
                if ok_first and ok_second:
                    agree_right += 1
                elif ok_first:
                    only_first += 1
                elif ok_second:
                    only_second += 1
                else:
                    agree_wrong += 1
            pairs.append({
                "arm_i": first, "arm_j": second,
                "b_i_only": only_first, "c_j_only": only_second,
                "both_correct": agree_right, "both_wrong": agree_wrong,
                "p_value": _mcnemar_exact_pvalue(only_first, only_second),
            })
    return {"arms": arms, "n_qids": len(qids), "pairs": pairs}
# ---------------------------------------------------------------------------
# Per-PDF heterogeneity
# ---------------------------------------------------------------------------
def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
    """Summarise, per arm, the spread of per-PDF accuracy.

    Per-PDF accuracy is correct / total over the rows sharing a "doc_id".
    A row counts as correct only when its "graded" entry holds a truthy
    "correct" flag. Each arm's summary covers the number of PDFs, mean, sample
    standard deviation (0.0 for a single PDF), min, max, quartiles, and how
    many PDFs scored exactly 0% or 100%.
    """
    grouped: dict[str, dict[str, list[bool]]] = {}
    for rec in rows:
        arm_name = rec["arm"]
        doc = rec["doc_id"]
        verdict = rec.get("graded") or {}
        per_doc = grouped.setdefault(arm_name, {})
        per_doc.setdefault(doc, []).append(bool(verdict.get("correct")))

    summary: dict[str, dict] = {}
    for arm_name, per_doc in grouped.items():
        rates = [sum(flags) / len(flags) for flags in per_doc.values() if flags]
        if not rates:
            continue
        # stdev needs at least two samples; report zero spread for a lone PDF.
        spread = statistics.stdev(rates) if len(rates) > 1 else 0.0
        summary[arm_name] = {
            "n_pdfs": len(rates),
            "mean": statistics.mean(rates),
            "std": spread,
            "min": min(rates),
            "max": max(rates),
            "p25": _percentile(rates, 25),
            "p50": _percentile(rates, 50),
            "p75": _percentile(rates, 75),
            "n_pdfs_zero": rates.count(0.0),
            "n_pdfs_perfect": rates.count(1.0),
        }
    return summary
# ---------------------------------------------------------------------------
# Latency / token distributions
# ---------------------------------------------------------------------------
def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
    """Summarise per-arm latency, converting "latency_ms" values to seconds.

    Rows whose latency is missing or zero are ignored. Each arm's summary
    holds the sample count, mean, sample standard deviation, the 10/25/50/75/
    90/95/99th percentiles, the max (all in seconds), and the coefficient of
    variation.
    """
    samples: dict[str, list[float]] = {}
    for rec in rows:
        millis = rec.get("latency_ms")
        if millis is None or millis == 0:
            continue
        samples.setdefault(rec["arm"], []).append(float(millis))

    summary: dict[str, dict] = {}
    for arm_name, lats in samples.items():
        several = len(lats) > 1
        avg = statistics.mean(lats)
        # stdev is undefined for a single sample, so only compute it when valid.
        spread = statistics.stdev(lats) if several else 0.0
        stats: dict = {
            "n": len(lats),
            "mean_s": avg / 1000,
            "std_s": (spread / 1000) if several else 0.0,
        }
        for pct in (10, 25, 50, 75, 90, 95, 99):
            stats[f"p{pct}_s"] = _percentile(lats, pct) / 1000
        stats["max_s"] = max(lats) / 1000
        # Coefficient of variation: std / mean (unitless tail-fatness).
        stats["cv"] = spread / avg if several and avg > 0 else 0.0
        summary[arm_name] = stats
    return summary
def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
    """Summarise per-arm "input_tokens" / "output_tokens" distributions.

    Missing or zero token counts are ignored. For each arm (in sorted order)
    the result may carry an "input" and/or an "output" entry, each holding
    n, mean, p50, p95 and max of the counts observed for that side.
    """
    def _describe(counts: list[float]) -> dict:
        return {
            "n": len(counts),
            "mean": statistics.mean(counts),
            "p50": _percentile(counts, 50),
            "p95": _percentile(counts, 95),
            "max": max(counts),
        }

    prompt_side: dict[str, list[float]] = {}
    completion_side: dict[str, list[float]] = {}
    for rec in rows:
        n_in = rec.get("input_tokens") or 0
        n_out = rec.get("output_tokens") or 0
        if n_in:
            prompt_side.setdefault(rec["arm"], []).append(float(n_in))
        if n_out:
            completion_side.setdefault(rec["arm"], []).append(float(n_out))

    summary: dict[str, dict] = {}
    for arm_name in sorted(set(prompt_side) | set(completion_side)):
        ins = prompt_side.get(arm_name, [])
        outs = completion_side.get(arm_name, [])
        if not (ins or outs):
            continue
        entry: dict = {}
        if ins:
            entry["input"] = _describe(ins)
        if outs:
            entry["output"] = _describe(outs)
        summary[arm_name] = entry
    return summary
# ---------------------------------------------------------------------------
# Pretty-printing
# ---------------------------------------------------------------------------
def _print_latency(title: str, lat: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'n':>4} {'mean':>7} {'std':>7} "
f"{'p50':>7} {'p90':>7} {'p95':>7} {'p99':>7} {'max':>7} {'CV':>5}")
print(header)
print("-" * len(header))
for arm in sorted(lat, key=lambda a: lat[a]["mean_s"]):
s = lat[arm]
print(f"{arm:<25} {s['n']:>4} "
f"{s['mean_s']:>6.1f}s {s['std_s']:>6.1f}s "
f"{s['p50_s']:>6.1f}s {s['p90_s']:>6.1f}s {s['p95_s']:>6.1f}s "
f"{s['p99_s']:>6.1f}s {s['max_s']:>6.1f}s {s['cv']:>5.2f}")
def _print_tokens(title: str, toks: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'in mean':>9} {'in p50':>9} {'in p95':>9} {'in max':>9}"
f" {'out mean':>9} {'out p95':>9}")
print(header)
print("-" * len(header))
for arm in sorted(toks):
e = toks[arm]
ein = e.get("input")
eout = e.get("output")
if not ein:
continue
print(f"{arm:<25} "
f"{ein['mean']:>9,.0f} {ein['p50']:>9,.0f} {ein['p95']:>9,.0f} {ein['max']:>9,.0f} "
f"{(eout or {}).get('mean', 0):>9,.0f} {(eout or {}).get('p95', 0):>9,.0f}")
def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'n_pdfs':>7} {'mean':>7} {'std':>7} {'min':>7} "
f"{'p25':>7} {'p50':>7} {'p75':>7} {'max':>7} {'#0%':>5} {'#100%':>6}")
print(header)
print("-" * len(header))
for arm in sorted(var, key=lambda a: -var[a]["mean"]):
s = var[arm]
print(f"{arm:<25} {s['n_pdfs']:>7} "
f"{s['mean']*100:>6.1f}% {s['std']*100:>6.1f}% {s['min']*100:>6.1f}% "
f"{s['p25']*100:>6.1f}% {s['p50']*100:>6.1f}% {s['p75']*100:>6.1f}% "
f"{s['max']*100:>6.1f}% {s['n_pdfs_zero']:>5} {s['n_pdfs_perfect']:>6}")
def _print_mcnemar(title: str, table: dict) -> None:
print()
print(title)
print("-" * len(title))
print(f"n_qids on which all arms have a graded row: {table['n_qids']}")
header = (f"{'arm_i':<25} {'arm_j':<25} {'b':>4} {'c':>4} "
f"{'both ok':>8} {'both wr':>8} {'p (2-sided)':>13} {'sig':>4}")
print(header)
print("-" * len(header))
for pair in sorted(table["pairs"], key=lambda p: p["p_value"]):
sig = ""
if pair["p_value"] < 0.001:
sig = "***"
elif pair["p_value"] < 0.01:
sig = "**"
elif pair["p_value"] < 0.05:
sig = "*"
print(f"{pair['arm_i']:<25} {pair['arm_j']:<25} "
f"{pair['b_i_only']:>4} {pair['c_j_only']:>4} "
f"{pair['both_correct']:>8} {pair['both_wrong']:>8} "
f"{pair['p_value']:>13.4f} {sig:>4}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """Load one run's raw and post-retry rows and print every report table.

    The run is chosen with --run-id. Exits via SystemExit with a hint when
    either raw.jsonl or raw_post_retry.jsonl is missing from the run's
    parser_compare directory. Returns 0 on success.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
    opts = cli.parse_args()

    artifacts = REPO / "data" / "multimodal_doc" / "runs" / opts.run_id / "parser_compare"
    raw_path = artifacts / "raw.jsonl"
    post_path = artifacts / "raw_post_retry.jsonl"
    if not (raw_path.exists() and post_path.exists()):
        raise SystemExit(
            "Missing raw.jsonl or raw_post_retry.jsonl. "
            "Run scripts/compute_post_retry_accuracy.py first."
        )

    raw_rows = _read_jsonl(raw_path)
    post_rows = _read_jsonl(post_path)
    print(f"Run: {opts.run_id}")
    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")

    # Latency uses post-retry rows (post-retry rows include the retry's own
    # latency for recovered rows). For raw, recovered rows have latency=0
    # because the harness recorded a failure.
    latency = _per_arm_latency(post_rows)
    _print_latency("Per-arm latency (post-retry)", latency)

    tokens = _per_arm_tokens(post_rows)
    _print_tokens("Per-arm token distribution (post-retry)", tokens)

    heterogeneity = _per_pdf_stats(post_rows)
    _print_pdf_var("Per-PDF accuracy heterogeneity (post-retry)", heterogeneity)

    # Same pairwise test on both artifacts, raw first, then post-retry.
    for heading, dataset in (
        ("McNemar pairwise (RAW, no retries)", raw_rows),
        ("McNemar pairwise (POST-RETRY)", post_rows),
    ):
        _print_mcnemar(heading, _mcnemar_table(dataset))
    return 0
# Run the report only when executed as a script; main()'s return value is
# handed to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())