mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
Adds the full parser_compare experiment for the multimodal_doc suite:
six arms compared on 30 PDFs / 171 questions from MMLongBench-Doc with
anthropic/claude-sonnet-4.5 across the board.
Source code:
- core/parsers/{azure_di,llamacloud,pdf_pages}.py: direct parser SDK
callers (Azure Document Intelligence prebuilt-read/layout, LlamaParse
parse_page_with_llm/parse_page_with_agent) used by the LC arms,
bypassing the SurfSense backend so each (basic/premium) extraction
is a clean A/B independent of backend ETL routing.
- suites/multimodal_doc/parser_compare/{ingest,runner,prompt}.py:
six-arm benchmark (native_pdf, azure_basic_lc, azure_premium_lc,
llamacloud_basic_lc, llamacloud_premium_lc, surfsense_agentic) with
byte-identical prompts per question, deterministic grader, Wilson
CIs, and the per-page preprocessing tariff cost overlay.
Reproducibility:
- pyproject.toml + uv.lock pin pypdf, azure-ai-documentintelligence,
llama-cloud-services as new deps.
- .env.example documents the AZURE_DI_* and LLAMA_CLOUD_API_KEY env
vars now required for parser_compare.
- 12 analysis scripts under scripts/: retry pass with exponential
backoff, post-retry accuracy merge, McNemar / latency / per-PDF
stats, context-overflow hypothesis test, etc. Each produces one
number cited by the blog report.
Citation surface:
- reports/blog/multimodal_doc_parser_compare_n171_report.md: 1219-line
technical writeup (16 sections) covering headline accuracy, per-format
accuracy, McNemar pairwise significance, latency / token / per-PDF
distributions, error analysis, retry experiment, post-retry final
accuracy, cost amortization model with closed-form derivation, threats
to validity, and reproducibility appendix.
- data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/{raw,
raw_retries,raw_post_retry}.jsonl + run_artifact.json + retry summary
whitelisted via data/.gitignore as the verifiable numbers source.
Gitignore:
- ignore logs_*.txt + retry_run.log; structured artifacts cover the
citation surface, debug logs are noise.
- data/.gitignore default-ignores everything, whitelists the n=171 run
artifacts only (parser manifest left ignored to avoid leaking local
Windows usernames in absolute paths; manifest is fully regenerable
via 'ingest multimodal_doc parser_compare').
- reports/.gitignore now whitelists hand-curated reports/blog/.
Also retires the abandoned CRAG Task 3 implementation (download script,
streaming Task 3 ingest, CragTask3Benchmark + tests) and trims the
runner / ingest module APIs to match.
Co-authored-by: Cursor <cursoragent@cursor.com>
381 lines
13 KiB
Python
381 lines
13 KiB
Python
"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
|
|
per-PDF heterogeneity, latency/token distribution percentiles.
|
|
|
|
Reads the merged post-retry artifact:
|
|
|
|
data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
|
|
|
|
Outputs to stdout:
|
|
|
|
1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
|
|
2) Per-arm input/output token distribution (mean, p50, p95, max).
|
|
3) McNemar pairwise table: for every unordered pair (arm_i, arm_j) on the
   same 171 questions, count b = #(arm_i correct & arm_j wrong) and
   c = #(arm_i wrong & arm_j correct), and report the exact-binomial
   two-sided p-value. We include both raw (using the original raw.jsonl)
   and post-retry results.
|
|
4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
|
|
|
|
Pure stdlib — no scipy/numpy.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import math
|
|
import statistics
|
|
from pathlib import Path
|
|
|
|
# Directory two levels above this file (the parent of the folder that holds
# this script); main() resolves the run's data/ paths relative to it.
REPO = Path(__file__).resolve().parents[1]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# I/O
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _read_jsonl(path: Path) -> list[dict]:
|
|
out: list[dict] = []
|
|
with path.open("r", encoding="utf-8") as fh:
|
|
for line in fh:
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
out.append(json.loads(line))
|
|
return out
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Distribution helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _percentile(values: list[float], p: float) -> float:
|
|
"""Linear-interpolation percentile (p in [0, 100])."""
|
|
|
|
if not values:
|
|
return 0.0
|
|
s = sorted(values)
|
|
if len(s) == 1:
|
|
return float(s[0])
|
|
k = (len(s) - 1) * (p / 100.0)
|
|
lo, hi = math.floor(k), math.ceil(k)
|
|
if lo == hi:
|
|
return float(s[int(k)])
|
|
return float(s[lo] + (s[hi] - s[lo]) * (k - lo))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# McNemar exact-binomial p-value
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _binom_coef(n: int, k: int) -> int:
|
|
if k < 0 or k > n:
|
|
return 0
|
|
return math.comb(n, k)
|
|
|
|
|
|
def _mcnemar_exact_pvalue(b: int, c: int) -> float:
|
|
"""Two-sided exact-binomial McNemar p-value.
|
|
|
|
Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong)
|
|
on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5).
|
|
The two-sided p-value is
|
|
|
|
P(X <= min(b, c)) + P(X >= max(b, c))
|
|
|
|
computed exactly (cheap because b+c <= 27 in our run).
|
|
"""
|
|
|
|
n = b + c
|
|
if n == 0:
|
|
return 1.0
|
|
k = min(b, c)
|
|
# Two-sided exact: 2 * P(X <= k) clipped at 1.0
|
|
cdf = sum(_binom_coef(n, i) for i in range(k + 1))
|
|
p = 2.0 * cdf / (2 ** n)
|
|
return min(1.0, p)
|
|
|
|
|
|
def _mcnemar_table(rows: list[dict]) -> dict:
|
|
"""Group rows -> {qid: {arm: bool_correct}} and compute pairwise."""
|
|
|
|
by_qid: dict[str, dict[str, bool]] = {}
|
|
arms_seen: set[str] = set()
|
|
for r in rows:
|
|
qid = r["qid"]
|
|
arm = r["arm"]
|
|
graded = r.get("graded") or {}
|
|
correct = bool(graded.get("correct"))
|
|
by_qid.setdefault(qid, {})[arm] = correct
|
|
arms_seen.add(arm)
|
|
|
|
arms = sorted(arms_seen)
|
|
qids = sorted(by_qid)
|
|
out: dict[str, dict] = {"arms": arms, "n_qids": len(qids), "pairs": []}
|
|
for i, ai in enumerate(arms):
|
|
for aj in arms[i + 1:]:
|
|
b = c = both = neither = 0
|
|
for q in qids:
|
|
row = by_qid[q]
|
|
if ai not in row or aj not in row:
|
|
continue
|
|
ci, cj = row[ai], row[aj]
|
|
if ci and not cj:
|
|
b += 1
|
|
elif cj and not ci:
|
|
c += 1
|
|
elif ci and cj:
|
|
both += 1
|
|
else:
|
|
neither += 1
|
|
p = _mcnemar_exact_pvalue(b, c)
|
|
out["pairs"].append({
|
|
"arm_i": ai, "arm_j": aj,
|
|
"b_i_only": b, "c_j_only": c,
|
|
"both_correct": both, "both_wrong": neither,
|
|
"p_value": p,
|
|
})
|
|
return out
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Per-PDF heterogeneity
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
|
|
"""For each arm, per-PDF accuracy = correct/total questions on that PDF."""
|
|
|
|
bucket: dict[str, dict[str, list[bool]]] = {}
|
|
for r in rows:
|
|
arm = r["arm"]
|
|
pdf = r["doc_id"]
|
|
graded = r.get("graded") or {}
|
|
bucket.setdefault(arm, {}).setdefault(pdf, []).append(
|
|
bool(graded.get("correct"))
|
|
)
|
|
|
|
out: dict[str, dict] = {}
|
|
for arm, pdfs in bucket.items():
|
|
accs = [sum(b) / len(b) for b in pdfs.values() if b]
|
|
if not accs:
|
|
continue
|
|
out[arm] = {
|
|
"n_pdfs": len(accs),
|
|
"mean": statistics.mean(accs),
|
|
"std": statistics.stdev(accs) if len(accs) > 1 else 0.0,
|
|
"min": min(accs),
|
|
"max": max(accs),
|
|
"p25": _percentile(accs, 25),
|
|
"p50": _percentile(accs, 50),
|
|
"p75": _percentile(accs, 75),
|
|
"n_pdfs_zero": sum(1 for a in accs if a == 0.0),
|
|
"n_pdfs_perfect": sum(1 for a in accs if a == 1.0),
|
|
}
|
|
return out
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Latency / token distributions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
|
|
by_arm: dict[str, list[float]] = {}
|
|
for r in rows:
|
|
lat = r.get("latency_ms")
|
|
if lat is None or lat == 0:
|
|
continue
|
|
by_arm.setdefault(r["arm"], []).append(float(lat))
|
|
out: dict[str, dict] = {}
|
|
for arm, lats in by_arm.items():
|
|
out[arm] = {
|
|
"n": len(lats),
|
|
"mean_s": statistics.mean(lats) / 1000,
|
|
"std_s": (statistics.stdev(lats) / 1000) if len(lats) > 1 else 0.0,
|
|
"p10_s": _percentile(lats, 10) / 1000,
|
|
"p25_s": _percentile(lats, 25) / 1000,
|
|
"p50_s": _percentile(lats, 50) / 1000,
|
|
"p75_s": _percentile(lats, 75) / 1000,
|
|
"p90_s": _percentile(lats, 90) / 1000,
|
|
"p95_s": _percentile(lats, 95) / 1000,
|
|
"p99_s": _percentile(lats, 99) / 1000,
|
|
"max_s": max(lats) / 1000,
|
|
# Coefficient of variation: std / mean (unitless tail-fatness).
|
|
"cv": (
|
|
statistics.stdev(lats) / statistics.mean(lats)
|
|
if len(lats) > 1 and statistics.mean(lats) > 0 else 0.0
|
|
),
|
|
}
|
|
return out
|
|
|
|
|
|
def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
|
|
by_arm_in: dict[str, list[float]] = {}
|
|
by_arm_out: dict[str, list[float]] = {}
|
|
for r in rows:
|
|
t_in = r.get("input_tokens") or 0
|
|
t_out = r.get("output_tokens") or 0
|
|
if t_in:
|
|
by_arm_in.setdefault(r["arm"], []).append(float(t_in))
|
|
if t_out:
|
|
by_arm_out.setdefault(r["arm"], []).append(float(t_out))
|
|
out: dict[str, dict] = {}
|
|
for arm in sorted(set(by_arm_in) | set(by_arm_out)):
|
|
in_vals = by_arm_in.get(arm, [])
|
|
out_vals = by_arm_out.get(arm, [])
|
|
if not in_vals and not out_vals:
|
|
continue
|
|
entry: dict = {}
|
|
if in_vals:
|
|
entry["input"] = {
|
|
"n": len(in_vals),
|
|
"mean": statistics.mean(in_vals),
|
|
"p50": _percentile(in_vals, 50),
|
|
"p95": _percentile(in_vals, 95),
|
|
"max": max(in_vals),
|
|
}
|
|
if out_vals:
|
|
entry["output"] = {
|
|
"n": len(out_vals),
|
|
"mean": statistics.mean(out_vals),
|
|
"p50": _percentile(out_vals, 50),
|
|
"p95": _percentile(out_vals, 95),
|
|
"max": max(out_vals),
|
|
}
|
|
out[arm] = entry
|
|
return out
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pretty-printing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _print_latency(title: str, lat: dict[str, dict]) -> None:
|
|
print()
|
|
print(title)
|
|
print("-" * len(title))
|
|
header = (f"{'arm':<25} {'n':>4} {'mean':>7} {'std':>7} "
|
|
f"{'p50':>7} {'p90':>7} {'p95':>7} {'p99':>7} {'max':>7} {'CV':>5}")
|
|
print(header)
|
|
print("-" * len(header))
|
|
for arm in sorted(lat, key=lambda a: lat[a]["mean_s"]):
|
|
s = lat[arm]
|
|
print(f"{arm:<25} {s['n']:>4} "
|
|
f"{s['mean_s']:>6.1f}s {s['std_s']:>6.1f}s "
|
|
f"{s['p50_s']:>6.1f}s {s['p90_s']:>6.1f}s {s['p95_s']:>6.1f}s "
|
|
f"{s['p99_s']:>6.1f}s {s['max_s']:>6.1f}s {s['cv']:>5.2f}")
|
|
|
|
|
|
def _print_tokens(title: str, toks: dict[str, dict]) -> None:
|
|
print()
|
|
print(title)
|
|
print("-" * len(title))
|
|
header = (f"{'arm':<25} {'in mean':>9} {'in p50':>9} {'in p95':>9} {'in max':>9}"
|
|
f" {'out mean':>9} {'out p95':>9}")
|
|
print(header)
|
|
print("-" * len(header))
|
|
for arm in sorted(toks):
|
|
e = toks[arm]
|
|
ein = e.get("input")
|
|
eout = e.get("output")
|
|
if not ein:
|
|
continue
|
|
print(f"{arm:<25} "
|
|
f"{ein['mean']:>9,.0f} {ein['p50']:>9,.0f} {ein['p95']:>9,.0f} {ein['max']:>9,.0f} "
|
|
f"{(eout or {}).get('mean', 0):>9,.0f} {(eout or {}).get('p95', 0):>9,.0f}")
|
|
|
|
|
|
def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
|
|
print()
|
|
print(title)
|
|
print("-" * len(title))
|
|
header = (f"{'arm':<25} {'n_pdfs':>7} {'mean':>7} {'std':>7} {'min':>7} "
|
|
f"{'p25':>7} {'p50':>7} {'p75':>7} {'max':>7} {'#0%':>5} {'#100%':>6}")
|
|
print(header)
|
|
print("-" * len(header))
|
|
for arm in sorted(var, key=lambda a: -var[a]["mean"]):
|
|
s = var[arm]
|
|
print(f"{arm:<25} {s['n_pdfs']:>7} "
|
|
f"{s['mean']*100:>6.1f}% {s['std']*100:>6.1f}% {s['min']*100:>6.1f}% "
|
|
f"{s['p25']*100:>6.1f}% {s['p50']*100:>6.1f}% {s['p75']*100:>6.1f}% "
|
|
f"{s['max']*100:>6.1f}% {s['n_pdfs_zero']:>5} {s['n_pdfs_perfect']:>6}")
|
|
|
|
|
|
def _print_mcnemar(title: str, table: dict) -> None:
|
|
print()
|
|
print(title)
|
|
print("-" * len(title))
|
|
print(f"n_qids on which all arms have a graded row: {table['n_qids']}")
|
|
header = (f"{'arm_i':<25} {'arm_j':<25} {'b':>4} {'c':>4} "
|
|
f"{'both ok':>8} {'both wr':>8} {'p (2-sided)':>13} {'sig':>4}")
|
|
print(header)
|
|
print("-" * len(header))
|
|
for pair in sorted(table["pairs"], key=lambda p: p["p_value"]):
|
|
sig = ""
|
|
if pair["p_value"] < 0.001:
|
|
sig = "***"
|
|
elif pair["p_value"] < 0.01:
|
|
sig = "**"
|
|
elif pair["p_value"] < 0.05:
|
|
sig = "*"
|
|
print(f"{pair['arm_i']:<25} {pair['arm_j']:<25} "
|
|
f"{pair['b_i_only']:>4} {pair['c_j_only']:>4} "
|
|
f"{pair['both_correct']:>8} {pair['both_wrong']:>8} "
|
|
f"{pair['p_value']:>13.4f} {sig:>4}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: print every statistics table for one run.

    Accepts ``--run-id`` to select the run directory under
    ``data/multimodal_doc/runs``. Requires both ``raw.jsonl`` and
    ``raw_post_retry.jsonl`` in that run's ``parser_compare`` folder and
    exits with an explanatory message when either is missing.

    Returns:
        0 on success (used as the process exit status).
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
    run_id = cli.parse_args().run_id

    compare_dir = REPO / "data" / "multimodal_doc" / "runs" / run_id / "parser_compare"
    raw_path = compare_dir / "raw.jsonl"
    post_path = compare_dir / "raw_post_retry.jsonl"
    if not (raw_path.exists() and post_path.exists()):
        raise SystemExit(
            "Missing raw.jsonl or raw_post_retry.jsonl. "
            "Run scripts/compute_post_retry_accuracy.py first."
        )

    raw_rows = _read_jsonl(raw_path)
    post_rows = _read_jsonl(post_path)

    print(f"Run: {run_id}")
    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")

    # Latency is taken from the post-retry rows: those carry the retry's own
    # latency for recovered questions, whereas the raw rows hold latency=0
    # there because the harness recorded a failure.
    latency = _per_arm_latency(post_rows)
    _print_latency("Per-arm latency (post-retry)", latency)

    tokens = _per_arm_tokens(post_rows)
    _print_tokens("Per-arm token distribution (post-retry)", tokens)

    heterogeneity = _per_pdf_stats(post_rows)
    _print_pdf_var("Per-PDF accuracy heterogeneity (post-retry)", heterogeneity)

    # Significance tables: first on the raw run, then after retries.
    for heading, dataset in (
        ("McNemar pairwise (RAW, no retries)", raw_rows),
        ("McNemar pairwise (POST-RETRY)", post_rows),
    ):
        _print_mcnemar(heading, _mcnemar_table(dataset))
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the CLI and hand its return value to the interpreter as exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|