feat(evals): publish multimodal_doc parser_compare benchmark + n=171 report

Adds the full parser_compare experiment for the multimodal_doc suite: six arms compared on 30 PDFs / 171 questions from MMLongBench-Doc with anthropic/claude-sonnet-4.5 across the board. Source code: - core/parsers/{azure_di,llamacloud,pdf_pages}.py: direct parser SDK callers (Azure Document Intelligence prebuilt-read/layout, LlamaParse parse_page_with_llm/parse_page_with_agent) used by the LC arms, bypassing the SurfSense backend so each (basic/premium) extraction is a clean A/B independent of backend ETL routing. - suites/multimodal_doc/parser_compare/{ingest,runner,prompt}.py: six-arm benchmark (native_pdf, azure_basic_lc, azure_premium_lc, llamacloud_basic_lc, llamacloud_premium_lc, surfsense_agentic) with byte-identical prompts per question, deterministic grader, Wilson CIs, and the per-page preprocessing tariff cost overlay. Reproducibility: - pyproject.toml + uv.lock pin pypdf, azure-ai-documentintelligence, llama-cloud-services as new deps. - .env.example documents the AZURE_DI_* and LLAMA_CLOUD_API_KEY env vars now required for parser_compare. - 12 analysis scripts under scripts/: retry pass with exponential backoff, post-retry accuracy merge, McNemar / latency / per-PDF stats, context-overflow hypothesis test, etc. Each produces one number cited by the blog report. Citation surface: - reports/blog/multimodal_doc_parser_compare_n171_report.md: 1219-line technical writeup (16 sections) covering headline accuracy, per-format accuracy, McNemar pairwise significance, latency / token / per-PDF distributions, error analysis, retry experiment, post-retry final accuracy, cost amortization model with closed-form derivation, threats to validity, and reproducibility appendix. - data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/{raw, raw_retries,raw_post_retry}.jsonl + run_artifact.json + retry summary whitelisted via data/.gitignore as the verifiable numbers source. 
Gitignore: - ignore logs_*.txt + retry_run.log; structured artifacts cover the citation surface, debug logs are noise. - data/.gitignore default-ignores everything, whitelists the n=171 run artifacts only (parser manifest left ignored to avoid leaking local Windows usernames in absolute paths; manifest is fully regenerable via 'ingest multimodal_doc parser_compare'). - reports/.gitignore now whitelists hand-curated reports/blog/. Also retires the abandoned CRAG Task 3 implementation (download script, streaming Task 3 ingest, CragTask3Benchmark + tests) and trims the runner / ingest module APIs to match. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-07-04 22:02:16 +02:00 · 2026-05-14 19:54:41 -07:00 · 2026-05-14 19:54:41 -07:00 · 9bcd50164d
commit 9bcd50164d
parent 3737118050
40 changed files with 9303 additions and 993 deletions
--- a/surfsense_evals/scripts/analyze_failure_timing.py
+++ b/surfsense_evals/scripts/analyze_failure_timing.py
@ -0,0 +1,125 @@
+"""Were the SSL failures clustered in time (network blip) or evenly
+distributed (sustained limit)? Use each row's sequential position within
+its arm as a proxy for time and group failure indices into runs (gap <= 5).
+
+Also: for the one *real* intrinsic failure — the 30MB Anthropic limit
+on 2405.09818v1.pdf::Q007 — print the full error message + raw payload
+sizes so the blog has a clean root cause.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+PDFS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+
+
+def _timeline_cluster(row: dict) -> str:
+    """Label one row for the per-arm timeline report.
+
+    Returns one of ``ssl``, ``empty``, ``5xx``, ``size_limit``,
+    ``other_err`` or ``ok``. Checks run in that order and the first match
+    wins; ``empty`` means no error string and a blank ``raw_text``.
+    """
+    err = row.get("error") or ""
+    if "SSLError" in err:
+        return "ssl"
+    if not err and not (row.get("raw_text") or "").strip():
+        return "empty"
+    if "502" in err or "503" in err:
+        return "5xx"
+    lowered = err.lower()
+    if "exceeds" in lowered and "limit" in lowered:
+        return "size_limit"
+    return "other_err" if err else "ok"
+
+
+def _pdf_failure_kind(err: str, empty: bool) -> str:
+    """Label one failed row for the per-PDF breakdown.
+
+    Unlike the timeline labels, size limits are tested before 5xx codes and
+    ``JSONDecodeError`` gets its own ``json_decode`` bucket.
+    """
+    if "SSLError" in err:
+        return "ssl"
+    lowered = err.lower()
+    if "exceeds" in lowered and "limit" in lowered:
+        return "size_limit"
+    if "502" in err or "503" in err:
+        return "5xx"
+    if "JSONDecodeError" in err:
+        return "json_decode"
+    if empty and not err:
+        return "empty"
+    return "other"
+
+
+def _index_runs(idxs: list[int], max_gap: int = 5) -> list[list[int]]:
+    """Split a non-empty ascending index list into runs whose neighbours differ by <= max_gap."""
+    runs = [[idxs[0]]]
+    for i in idxs[1:]:
+        if i - runs[-1][-1] <= max_gap:
+            runs[-1].append(i)
+        else:
+            runs.append([i])
+    return runs
+
+
+def main() -> None:
+    """Print three failure reports for the run's raw.jsonl to stdout.
+
+    1. Per arm: the sequential indices of every non-ok row, grouped into
+       runs of nearby indices.
+    2. Details of the native_pdf row for 2405.09818v1.pdf::Q007, including
+       the on-disk PDF size when the file is present.
+    3. Per PDF: failure count, failure kinds and affected arms.
+    """
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    # 1) SSL clustering: failures by question index per arm
+    by_arm_idx: dict[str, list[tuple[int, str]]] = defaultdict(list)
+    arm_seen_count: dict[str, int] = defaultdict(int)
+    for row in rows:
+        arm = row["arm"]
+        # Position of this row among the rows already seen for the same arm.
+        idx = arm_seen_count[arm]
+        arm_seen_count[arm] += 1
+        cluster = _timeline_cluster(row)
+        if cluster != "ok":
+            by_arm_idx[arm].append((idx, cluster))
+
+    print("=" * 80)
+    print("SSL/network-error indices per arm (each arm processes 171 questions in")
+    print("order; index = sequential position within that arm). Tight clustering")
+    print("in time = transient blip, even spread = sustained limit.")
+    print("=" * 80)
+    for arm in sorted(by_arm_idx):
+        # Keys only exist once a failure was appended, so items is never empty.
+        items = by_arm_idx[arm]
+        idxs = sorted({i for i, _ in items})
+        print(f"\n{arm}: {len(items)} failures at indices {idxs}")
+        # within 5 questions = same time window
+        cluster_runs = _index_runs(idxs)
+        print(f"   clusters (gap<=5): {len(cluster_runs)}: {cluster_runs}")
+
+    # 2) The 30MB intrinsic failure — full details
+    print()
+    print("=" * 80)
+    print("Intrinsic failure: 30MB Anthropic input limit on 2405.09818v1.pdf::Q007")
+    print("=" * 80)
+    for row in rows:
+        if row["qid"] == "2405.09818v1.pdf::Q007" and row["arm"] == "native_pdf":
+            err = row.get("error") or ""
+            print(f"  qid: {row['qid']}")
+            print(f"  doc: {row['doc_id']}, pages: {row.get('pages')}")
+            pdf_path = PDFS / row["doc_id"]
+            if pdf_path.exists():
+                size_mb = pdf_path.stat().st_size / (1024 * 1024)
+                print(f"  PDF size on disk: {size_mb:.1f} MB")
+                # base64 inflates ~33%
+                est_b64 = size_mb * 1.33
+                print(f"  estimated base64 wire size: {est_b64:.1f} MB")
+            print(f"  full error: {err[:600]}")
+            # Only the first matching row is reported.
+            break
+
+    # 3) Per-PDF: which PDFs are pathological?
+    print()
+    print("=" * 80)
+    print("Per-PDF failure breakdown across all 6 arms (only PDFs with failures)")
+    print("=" * 80)
+    by_pdf: dict[str, list[dict]] = defaultdict(list)
+    for row in rows:
+        err = row.get("error") or ""
+        empty = not (row.get("raw_text") or "").strip()
+        if err or empty:
+            by_pdf[row["doc_id"]].append({
+                "arm": row["arm"],
+                "qid": row["qid"],
+                "err_kind": _pdf_failure_kind(err, empty),
+                "pages": row.get("pages"),
+            })
+    # Most failures first; ties broken by document id.
+    for doc, items in sorted(by_pdf.items(), key=lambda x: (-len(x[1]), x[0])):
+        kinds = Counter(i["err_kind"] for i in items)
+        arms = sorted({i["arm"] for i in items})
+        pages = items[0]["pages"]
+        print(f"  {doc}  pages={pages}  failures={len(items)}  arms={arms}")
+        print(f"     kinds: {dict(kinds)}")
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/analyze_failures.py
+++ b/surfsense_evals/scripts/analyze_failures.py
@ -0,0 +1,155 @@
+"""Drill into the parser_compare n=171 raw.jsonl to surface every
+failure, group by arm + PDF, and dump the underlying error strings so
+we can write up a clean failure-mode taxonomy for the blog post.
+
+Outputs (printed to stdout + written to `failures_n171.json`):
+* per-arm failure count and rate
+* per-PDF failure count across all arms (which docs are pathological?)
+* error-string clusters per arm (so we can give human-readable causes)
+* sample failure rows (one per cluster) for the appendix
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any
+
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+OUT = REPO / "scripts" / "failures_n171.json"
+
+
+def _classify(error: str | None, raw_text: str) -> str:
+    """Map a failure row onto one coarse failure-mode bucket.
+
+    Matching is case-insensitive substring search on ``error``; buckets are
+    tried in a fixed order and the first hit wins. With no error string the
+    result is ``empty_response`` when ``raw_text`` is blank and ``unknown``
+    otherwise.
+    """
+
+    blob = (error or "").lower()
+
+    def mentions(*needles: str) -> bool:
+        return any(needle in blob for needle in needles)
+
+    if not blob:
+        return "unknown" if raw_text.strip() else "empty_response"
+    if mentions("rate limit", "429"):
+        return "rate_limit"
+    if mentions("context_length", "context window", "too many tokens"):
+        return "context_overflow"
+    if mentions("could not process image", "invalid image"):
+        return "image_decode_failure"
+    # An invalid_request_error only counts as a PDF failure when it also names "pdf".
+    if mentions("could not process pdf") or (
+        mentions("invalid_request_error") and mentions("pdf")
+    ):
+        return "pdf_decode_failure"
+    if mentions("timeout", "timed out"):
+        return "timeout"
+    if mentions("5xx", "internal server error", "503", "502"):
+        return "provider_5xx"
+    if mentions("filenotfound"):
+        return "missing_extraction"
+    if mentions("badrequest"):
+        return "provider_400"
+    return "other_error"
+
+
+def main() -> None:
+    """Report every failed row of raw.jsonl on stdout and in ``failures_n171.json``.
+
+    A row counts as a failure when it has a truthy ``error`` or a blank
+    ``raw_text``. Failures are grouped per arm, per PDF and per error
+    cluster (as assigned by ``_classify``).
+    """
+
+    rows = []
+    for line in RAW.read_text(encoding="utf-8").splitlines():
+        if line.strip():
+            rows.append(json.loads(line))
+
+    by_arm_failures: dict[str, list[dict]] = defaultdict(list)
+    by_pdf_failures: dict[str, list[dict]] = defaultdict(list)
+    error_clusters: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
+    n_per_arm: dict[str, int] = defaultdict(int)
+
+    for row in rows:
+        arm = row["arm"]
+        n_per_arm[arm] += 1
+        err = row.get("error")
+        raw_text = row.get("raw_text") or ""
+        if not err and raw_text.strip():
+            # No error and a non-blank answer: not a failure.
+            continue
+        cluster = _classify(err, raw_text)
+        entry = {
+            "qid": row["qid"],
+            "doc_id": row["doc_id"],
+            "answer_format": row["answer_format"],
+            "gold": row["gold"],
+            "error": err,
+            "cluster": cluster,
+            "raw_text_len": len(raw_text),
+            "pages": row.get("pages"),
+        }
+        by_arm_failures[arm].append(entry)
+        by_pdf_failures[row["doc_id"]].append({**entry, "arm": arm})
+        error_clusters[arm][cluster].append(entry)
+
+    rule = "=" * 90
+
+    def banner(text: str) -> None:
+        print(rule)
+        print(text)
+        print(rule)
+
+    banner("Per-arm failure count & rate")
+    print(f"{'arm':<25} {'n':>4} {'fail':>5} {'rate%':>6}")
+    for arm in sorted(n_per_arm):
+        failed = len(by_arm_failures[arm])
+        total = n_per_arm[arm]
+        print(f"{arm:<25} {total:>4} {failed:>5} {failed / total * 100:>5.1f}%")
+
+    print()
+    banner("Failure clusters per arm")
+    for arm in sorted(error_clusters):
+        print(f"\n{arm}:")
+        per_cluster = error_clusters[arm]
+        for cluster in sorted(per_cluster):
+            members = per_cluster[cluster]
+            print(f"  {cluster:<22} {len(members):>3}")
+            # The first row of each cluster serves as the printed example.
+            sample = members[0]
+            err_short = (sample["error"] or "")[:200].replace("\n", " ")
+            print(f"     example: {sample['qid']}  doc={sample['doc_id']} pages={sample['pages']}")
+            print(f"     error: {err_short}")
+
+    print()
+    banner("Per-PDF failure totals (PDFs with >=2 failures)")
+    # Stable descending sort, so equal counts keep first-seen order.
+    ranked = sorted(by_pdf_failures.items(), key=lambda kv: len(kv[1]), reverse=True)
+    for pdf, failures in ranked:
+        count = len(failures)
+        if count < 2:
+            break
+        arms_failed = sorted({f["arm"] for f in failures})
+        pages = failures[0].get("pages")
+        print(f"  {pdf}  pages={pages}  failures={count}  arms={arms_failed}")
+
+    print()
+    banner("All native_pdf failures (one row per failure)")
+    for entry in by_arm_failures.get("native_pdf", []):
+        err = (entry["error"] or "(no error string)")[:240].replace("\n", " ")
+        print(f"  {entry['qid']}  doc={entry['doc_id']} pages={entry['pages']} cluster={entry['cluster']}")
+        print(f"    err: {err}")
+
+    per_arm: dict[str, Any] = {}
+    for arm in sorted(n_per_arm):
+        arm_rows = by_arm_failures[arm]
+        per_arm[arm] = {
+            "n": n_per_arm[arm],
+            "failures": len(arm_rows),
+            "rate": len(arm_rows) / n_per_arm[arm],
+            "clusters": {
+                cluster: len(members)
+                for cluster, members in error_clusters[arm].items()
+            },
+            "rows": arm_rows,
+        }
+    per_pdf = {
+        pdf: [dict(failure) for failure in failures]
+        for pdf, failures in by_pdf_failures.items()
+    }
+    summary: dict[str, Any] = {"per_arm": per_arm, "per_pdf": per_pdf}
+    OUT.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    print(f"\nWrote: {OUT}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/check_extraction_sizes.py
+++ b/surfsense_evals/scripts/check_extraction_sizes.py
@ -0,0 +1,60 @@
+"""Sanity check extraction sizes against Sonnet 4.5's context window.
+
+Sonnet 4.5 supports ~200k tokens. As a *very* rough heuristic, English
+markdown is ~4 chars/token, so anything over ~750k chars likely won't
+fit alongside the system + question + 512 max_output_tokens. Print
+warnings for any extraction that's at risk.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+MAP = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+
+CHARS_PER_TOKEN = 4
+CTX_TOKENS = 200_000
+PROMPT_OVERHEAD_TOKENS = 1_000  # system + question + format hint
+MAX_OUTPUT_TOKENS = 512
+SAFE_CHARS = (CTX_TOKENS - PROMPT_OVERHEAD_TOKENS - MAX_OUTPUT_TOKENS) * CHARS_PER_TOKEN
+
+
+def main() -> None:
+    """Print the largest extraction per arm and flag any above ``SAFE_CHARS``.
+
+    Reads the doc-map JSONL at ``MAP``; each row's ``extractions`` mapping
+    is expected to carry a ``chars`` count per arm (missing counts as 0).
+    """
+    manifest = [
+        json.loads(raw)
+        for raw in MAP.read_text(encoding="utf-8").splitlines()
+        if raw.strip()
+    ]
+
+    largest: dict[str, tuple[int, str]] = {}
+    at_risk: list[tuple[str, str, int]] = []
+    for entry in manifest:
+        extractions = entry.get("extractions") or {}
+        for arm, ext in extractions.items():
+            n_chars = int(ext.get("chars") or 0)
+            best = largest.get(arm)
+            # Strict comparison: the first document seen keeps the title on ties.
+            if best is None or best[0] < n_chars:
+                largest[arm] = (n_chars, entry["doc_id"])
+            if n_chars > SAFE_CHARS:
+                at_risk.append((entry["doc_id"], arm, n_chars))
+
+    print(f"PDFs in manifest: {len(manifest)}")
+    print(f"safe char budget: {SAFE_CHARS:,}  (~{(SAFE_CHARS // CHARS_PER_TOKEN):,} tokens)")
+    print()
+    print("largest extraction per arm:")
+    for arm in sorted(largest):
+        chars, doc_id = largest[arm]
+        print(f"  {arm:25s}  {chars:>10,} chars  ({doc_id})")
+
+    print()
+    if not at_risk:
+        print("no overflow risk — all extractions fit Sonnet 4.5's 200k context.")
+        return
+    print(f"OVERFLOW RISK ({len(at_risk)} extractions > safe budget):")
+    for doc_id, arm, chars in at_risk:
+        est_tokens = chars // CHARS_PER_TOKEN
+        print(f"  {doc_id} :: {arm} :: {chars:,} chars (~{est_tokens:,} tokens)")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/check_uploaded_status.py
+++ b/surfsense_evals/scripts/check_uploaded_status.py
@ -0,0 +1,77 @@
+"""Query SurfSense for the status of every MMLongBench PDF in scope.
+
+Uses the existing SurfSense documents client to query
+``/documents/status?document_ids=...`` for both the known-existing 5
+PDFs (doc ids 5219-5223) and the recently-uploaded mmlongbench batch
+(candidate ids 7577-7624).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+
+import httpx
+from dotenv import load_dotenv
+
+
+REPO = Path(__file__).resolve().parents[1]
+PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+
+
+async def main() -> None:
+    """Print the SurfSense status of every locally cached MMLongBench PDF.
+
+    Loads ``.env`` from the repo root, queries ``/api/v1/documents/status``
+    once for a hard-coded set of candidate document ids in search space 55,
+    then matches the returned items to local PDF file names by ``title``.
+    PDFs with no matching item are reported as ``missing``.
+
+    Raises:
+        SystemExit: if ``SURFSENSE_JWT`` is not set.
+        httpx.HTTPStatusError: if the status endpoint returns an error status.
+    """
+    load_dotenv(REPO / ".env")
+    base = os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/")
+    token = os.environ.get("SURFSENSE_JWT")
+    if not token:
+        raise SystemExit("SURFSENSE_JWT missing from .env")
+
+    pdf_names = sorted(p.name for p in PDF_DIR.glob("*.pdf"))
+    print(f"local cached PDFs: {len(pdf_names)}")
+
+    # Ids 5219-5223 plus 7577-7624 (range() upper bounds are exclusive).
+    candidate_ids = list(range(5219, 5224)) + list(range(7577, 7625))
+
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Accept": "application/json",
+    }
+    async with httpx.AsyncClient(timeout=30.0) as http:
+        r = await http.get(
+            f"{base}/api/v1/documents/status",
+            params={
+                "search_space_id": 55,
+                "document_ids": ",".join(str(d) for d in candidate_ids),
+            },
+            headers=headers,
+        )
+        r.raise_for_status()
+        items = r.json().get("items", [])
+
+    by_title: dict[str, dict] = {}
+    for it in items:
+        status = it.get("status") or {}
+        by_title[it.get("title", "")] = {
+            "id": it.get("id"),
+            "state": status.get("state"),
+            "reason": status.get("reason"),
+        }
+
+    by_state: dict[str, int] = {}
+    print()
+    for name in pdf_names:
+        info = by_title.get(name)
+        if info is None:
+            print(f"  [missing      ]              {name}")
+            by_state["missing"] = by_state.get("missing", 0) + 1
+        else:
+            # str() on both fields: a missing id (None) or a non-string state
+            # would otherwise raise TypeError on the alignment format specs.
+            tag = str(info["state"] or "?")
+            print(f"  [{tag:13s}] doc_id={str(info['id']):>5}  {name}")
+            by_state[tag] = by_state.get(tag, 0) + 1
+    print()
+    print("summary:")
+    for k, v in sorted(by_state.items()):
+        print(f"  {k}: {v}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/surfsense_evals/scripts/compute_adjusted_accuracy.py
+++ b/surfsense_evals/scripts/compute_adjusted_accuracy.py
@ -0,0 +1,112 @@
+"""Compute "intrinsic" accuracy by removing transient network errors.
+
+A failure is *transient* if it's:
+  * SSLError: SSL bad-record-mac (TLS hiccup)
+  * Cloudflare 502 / 503 (provider-side load shedding)
+  * empty_response with no error string and no other signal (likely
+    connection reset mid-stream)
+  * JSONDecodeError (parse error mid-stream)
+
+A failure is *intrinsic* if it's a hard limit:
+  * "exceeds .* limit" (size limits)
+  * context_length errors
+  * provider 400 with image / pdf decode failure
+  * malformed-input failures
+
+We re-compute accuracy with two denominators:
+  * raw acc       = correct / 171  (what the headline reports)
+  * adjusted acc  = correct / (171 - transient_failures)  (intrinsic)
+
+Outputs a table that we can drop straight into the blog.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+
+
+TRANSIENT_HINTS = (
+    "sslv3_alert_bad_record_mac",
+    "ssl_alert_bad_record_mac",
+    "ssl: ssl",
+    "cloudflare",
+    "error 502",
+    "error 503",
+    "bad gateway",
+    "service unavailable",
+    "gateway timeout",
+    "jsondecodeerror",
+)
+INTRINSIC_HINTS = (
+    "exceeds",
+    "context_length",
+    "context window",
+    "could not process pdf",
+    "could not process image",
+)
+
+
+def classify(error: str | None, raw_text: str) -> str:
+    """Bucket one row as transient, intrinsic, other error, or ok.
+
+    With no error string the row is ``transient_empty`` when ``raw_text`` is
+    blank and ``ok`` otherwise. With an error string, the lower-cased text is
+    checked against ``TRANSIENT_HINTS`` first, then ``INTRINSIC_HINTS``;
+    anything unmatched is ``other_error``.
+    """
+    lowered = (error or "").lower()
+    if not lowered:
+        return "ok" if raw_text.strip() else "transient_empty"
+    ordered_buckets = (
+        ("transient_ssl_or_5xx", TRANSIENT_HINTS),
+        ("intrinsic_limit", INTRINSIC_HINTS),
+    )
+    for label, hints in ordered_buckets:
+        if any(hint in lowered for hint in hints):
+            return label
+    return "other_error"
+
+
+def main() -> None:
+    """Print raw and transient-adjusted accuracy per arm, followed by a legend.
+
+    Adjusted accuracy divides the correct count by the rows that did not hit
+    a transient failure; it is reported as 0 when no such rows remain.
+    """
+    rows = []
+    for line in RAW.read_text(encoding="utf-8").splitlines():
+        if line.strip():
+            rows.append(json.loads(line))
+
+    counter_names = (
+        "n", "correct",
+        "transient_ssl_or_5xx", "transient_empty",
+        "intrinsic_limit", "other_error",
+    )
+    by_arm: dict[str, dict] = {}
+    for row in rows:
+        tally = by_arm.setdefault(row["arm"], dict.fromkeys(counter_names, 0))
+        tally["n"] += 1
+        if (row.get("graded") or {}).get("correct"):
+            tally["correct"] += 1
+        kind = classify(row.get("error"), row.get("raw_text") or "")
+        if kind != "ok":
+            tally[kind] += 1
+
+    print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")
+    print("-" * 88)
+    for arm in sorted(by_arm):
+        tally = by_arm[arm]
+        transient = tally["transient_ssl_or_5xx"] + tally["transient_empty"]
+        intrinsic = tally["intrinsic_limit"]
+        other = tally["other_error"]
+        raw = tally["correct"] / tally["n"] * 100
+        usable = tally["n"] - transient
+        if usable:
+            adj = tally["correct"] / usable * 100
+        else:
+            adj = 0
+        print(
+            f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"
+        )
+
+    print()
+    legend = (
+        "transient   = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would",
+        "              succeed on retry; eval harness has no built-in retry today).",
+        "intrinsic   = hard limit (e.g. >30MB Anthropic request, model context overflow).",
+        "adj acc%    = correct / (n - transient) — what the arm scores when network noise",
+        "              is removed; closest thing we have to a like-for-like quality number.",
+    )
+    for text in legend:
+        print(text)
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/compute_blog_extras.py
+++ b/surfsense_evals/scripts/compute_blog_extras.py
@ -0,0 +1,381 @@
+"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
+per-PDF heterogeneity, latency/token distribution percentiles.
+
+Reads the merged post-retry artifact:
+
+    data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
+
+Outputs to stdout:
+
+  1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
+  2) Per-arm input/output token distribution (mean, p50, p95, max).
+  3) McNemar pairwise table: for every unordered (arm_i, arm_j) pair on the
+     same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and
+     b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial
+     two-sided p-value. We include both raw (using the original raw.jsonl)
+     and post-retry results.
+  4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
+
+Pure stdlib — no scipy/numpy.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import statistics
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+
+
+# ---------------------------------------------------------------------------
+# I/O
+# ---------------------------------------------------------------------------
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    """Parse a UTF-8 JSONL file into a list, skipping blank lines."""
+    with path.open("r", encoding="utf-8") as fh:
+        stripped_lines = (raw.strip() for raw in fh)
+        return [json.loads(text) for text in stripped_lines if text]
+
+
+# ---------------------------------------------------------------------------
+# Distribution helpers
+# ---------------------------------------------------------------------------
+
+
+def _percentile(values: list[float], p: float) -> float:
+    """Return the p-th percentile (p in [0, 100]) by linear interpolation.
+
+    The input need not be sorted. An empty list yields 0.0 and a
+    single-element list yields that element.
+    """
+
+    if not values:
+        return 0.0
+    ordered = sorted(values)
+    if len(ordered) == 1:
+        return float(ordered[0])
+    # Fractional rank into the sorted sample.
+    rank = (len(ordered) - 1) * (p / 100.0)
+    below = math.floor(rank)
+    above = math.ceil(rank)
+    if below == above:
+        return float(ordered[below])
+    weight = rank - below
+    return float(ordered[below] + (ordered[above] - ordered[below]) * weight)
+
+
+# ---------------------------------------------------------------------------
+# McNemar exact-binomial p-value
+# ---------------------------------------------------------------------------
+
+
+def _binom_coef(n: int, k: int) -> int:
+    """Return C(n, k), or 0 when k lies outside 0..n."""
+    return math.comb(n, k) if 0 <= k <= n else 0
+
+
+def _mcnemar_exact_pvalue(b: int, c: int) -> float:
+    """Two-sided exact-binomial McNemar p-value from the discordant counts.
+
+    ``b`` and ``c`` are the two discordant-pair counts. Under H0 each
+    discordant pair falls on either side with probability 0.5, so the
+    smaller count X ~ Bin(b + c, 0.5). The value returned is
+
+        min(1, 2 * P(X <= min(b, c)))
+
+    and 1.0 when there are no discordant pairs at all.
+    """
+
+    discordant = b + c
+    if discordant == 0:
+        return 1.0
+    smaller = min(b, c)
+    # Lower-tail count of outcomes; dividing by 2**n turns it into a probability.
+    tail = 0
+    for i in range(smaller + 1):
+        tail += _binom_coef(discordant, i)
+    doubled = 2.0 * tail / (2 ** discordant)
+    return min(1.0, doubled)
+
+
+def _mcnemar_table(rows: list[dict]) -> dict:
+    """Build the pairwise McNemar table over every pair of arms.
+
+    Rows are regrouped as ``{qid: {arm: correct}}``. For each pair of arms
+    (taken once, in sorted order) only questions that have a row for both
+    arms are counted. ``n_qids`` is the number of distinct question ids seen.
+    """
+
+    verdicts: dict[str, dict[str, bool]] = {}
+    for r in rows:
+        qid = r["qid"]
+        arm = r["arm"]
+        outcome = bool((r.get("graded") or {}).get("correct"))
+        verdicts.setdefault(qid, {})[arm] = outcome
+
+    arms = sorted({arm for per_q in verdicts.values() for arm in per_q})
+    table: dict[str, dict] = {"arms": arms, "n_qids": len(verdicts), "pairs": []}
+    for pos, ai in enumerate(arms):
+        for aj in arms[pos + 1:]:
+            # Keyed by (arm_i correct, arm_j correct).
+            tally = {
+                (True, False): 0,
+                (False, True): 0,
+                (True, True): 0,
+                (False, False): 0,
+            }
+            for per_q in verdicts.values():
+                if ai in per_q and aj in per_q:
+                    tally[(per_q[ai], per_q[aj])] += 1
+            only_i = tally[(True, False)]
+            only_j = tally[(False, True)]
+            table["pairs"].append({
+                "arm_i": ai, "arm_j": aj,
+                "b_i_only": only_i, "c_j_only": only_j,
+                "both_correct": tally[(True, True)],
+                "both_wrong": tally[(False, False)],
+                "p_value": _mcnemar_exact_pvalue(only_i, only_j),
+            })
+    return table
+
+
+# ---------------------------------------------------------------------------
+# Per-PDF heterogeneity
+# ---------------------------------------------------------------------------
+
+
+def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
+    """Summarise, per arm, the spread of per-PDF accuracy.
+
+    Per-PDF accuracy is correct / total over that arm's rows for the PDF.
+    The summary holds the count of PDFs, mean, sample std (0.0 for a single
+    PDF), min, max, quartiles, and how many PDFs scored exactly 0 or 1.
+    """
+
+    graded_by_arm: dict[str, dict[str, list[bool]]] = {}
+    for r in rows:
+        arm = r["arm"]
+        pdf = r["doc_id"]
+        verdict = bool((r.get("graded") or {}).get("correct"))
+        graded_by_arm.setdefault(arm, {}).setdefault(pdf, []).append(verdict)
+
+    out: dict[str, dict] = {}
+    for arm, pdfs in graded_by_arm.items():
+        accs = [sum(flags) / len(flags) for flags in pdfs.values() if flags]
+        if not accs:
+            continue
+        spread = statistics.stdev(accs) if len(accs) > 1 else 0.0
+        quartiles = {f"p{q}": _percentile(accs, q) for q in (25, 50, 75)}
+        out[arm] = {
+            "n_pdfs": len(accs),
+            "mean": statistics.mean(accs),
+            "std": spread,
+            "min": min(accs),
+            "max": max(accs),
+            **quartiles,
+            "n_pdfs_zero": accs.count(0.0),
+            "n_pdfs_perfect": accs.count(1.0),
+        }
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Latency / token distributions
+# ---------------------------------------------------------------------------
+
+
+def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
+    """Summarise ``latency_ms`` per arm, reported in seconds.
+
+    Rows whose latency is missing or zero are ignored. Each summary holds
+    n, mean, sample std, p10-p99 percentiles, max, and ``cv`` (std / mean,
+    unitless; 0.0 for a single sample or a non-positive mean).
+    """
+    samples: dict[str, list[float]] = {}
+    for r in rows:
+        lat = r.get("latency_ms")
+        if lat is not None and lat != 0:
+            samples.setdefault(r["arm"], []).append(float(lat))
+
+    out: dict[str, dict] = {}
+    for arm, lats in samples.items():
+        several = len(lats) > 1
+        mean_ms = statistics.mean(lats)
+        # Sample standard deviation needs at least two points.
+        std_ms = statistics.stdev(lats) if several else 0.0
+        summary: dict = {
+            "n": len(lats),
+            "mean_s": mean_ms / 1000,
+            "std_s": std_ms / 1000,
+        }
+        for pct in (10, 25, 50, 75, 90, 95, 99):
+            summary[f"p{pct}_s"] = _percentile(lats, pct) / 1000
+        summary["max_s"] = max(lats) / 1000
+        summary["cv"] = std_ms / mean_ms if several and mean_ms > 0 else 0.0
+        out[arm] = summary
+    return out
+
+
+def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
+    """Summarise input and output token counts per arm.
+
+    Missing or zero counts are ignored. Each arm maps to a dict with an
+    ``input`` and/or ``output`` summary (n, mean, p50, p95, max); a side
+    with no usable samples is omitted.
+    """
+
+    def summarise(vals: list[float]) -> dict:
+        return {
+            "n": len(vals),
+            "mean": statistics.mean(vals),
+            "p50": _percentile(vals, 50),
+            "p95": _percentile(vals, 95),
+            "max": max(vals),
+        }
+
+    sides = ("input", "output")
+    collected: dict[str, dict[str, list[float]]] = {side: {} for side in sides}
+    for r in rows:
+        counts = [(side, r.get(f"{side}_tokens") or 0) for side in sides]
+        for side, count in counts:
+            if count:
+                collected[side].setdefault(r["arm"], []).append(float(count))
+
+    out: dict[str, dict] = {}
+    for arm in sorted(set(collected["input"]) | set(collected["output"])):
+        out[arm] = {
+            side: summarise(collected[side][arm])
+            for side in sides
+            if collected[side].get(arm)
+        }
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Pretty-printing
+# ---------------------------------------------------------------------------
+
+
+def _print_latency(title: str, lat: dict[str, dict]) -> None:
+    """Print the per-arm latency table, fastest mean first."""
+    print()
+    print(title)
+    print("-" * len(title))
+
+    seconds_columns = ("mean", "std", "p50", "p90", "p95", "p99", "max")
+    heading_cells = [f"{'arm':<25}", f"{'n':>4}"]
+    heading_cells += [f"{label:>7}" for label in seconds_columns]
+    heading_cells.append(f"{'CV':>5}")
+    header = " ".join(heading_cells)
+    print(header)
+    print("-" * len(header))
+
+    for arm, stats in sorted(lat.items(), key=lambda kv: kv[1]["mean_s"]):
+        cells = [f"{arm:<25}", f"{stats['n']:>4}"]
+        cells += [f"{stats[label + '_s']:>6.1f}s" for label in seconds_columns]
+        cells.append(f"{stats['cv']:>5.2f}")
+        print(" ".join(cells))
+
+
+def _print_tokens(title: str, toks: dict[str, dict]) -> None:
+    """Print per-arm input/output token statistics, arms in name order.
+
+    Args:
+        title: Heading printed above the table and underlined with dashes.
+        toks: Arm name -> ``{"input": {...}, "output": {...}}``. Arms without
+            an ``input`` entry are not printed; a missing ``output`` entry is
+            rendered as zeros.
+    """
+    in_keys = ("mean", "p50", "p95", "max")
+    out_keys = ("mean", "p95")
+    in_header = " ".join(f"{'in ' + key:>9}" for key in in_keys)
+    out_header = " ".join(f"{'out ' + key:>9}" for key in out_keys)
+    # Two spaces separate the input columns from the output columns.
+    header = f"{'arm':<25} {in_header}  {out_header}"
+    for line in ("", title, "-" * len(title), header, "-" * len(header)):
+        print(line)
+    for arm in sorted(toks):
+        in_stats = toks[arm].get("input")
+        if not in_stats:
+            continue
+        out_stats = toks[arm].get("output") or {}
+        in_cells = " ".join(f"{in_stats[key]:>9,.0f}" for key in in_keys)
+        out_cells = " ".join(f"{out_stats.get(key, 0):>9,.0f}" for key in out_keys)
+        print(f"{arm:<25} {in_cells}  {out_cells}")
+
+
+def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
+    """Print the per-PDF accuracy spread of every arm, highest mean first.
+
+    Args:
+        title: Heading printed above the table and underlined with dashes.
+        var: Arm name -> stats dict with ``n_pdfs``, the fractional values
+            ``mean``/``std``/``min``/``p25``/``p50``/``p75``/``max`` (shown
+            multiplied by 100 with a ``%`` suffix), ``n_pdfs_zero`` and
+            ``n_pdfs_perfect``.
+    """
+    pct_keys = ("mean", "std", "min", "p25", "p50", "p75", "max")
+    header = " ".join(
+        [f"{'arm':<25}", f"{'n_pdfs':>7}"]
+        + [f"{key:>7}" for key in pct_keys]
+        + [f"{'#0%':>5}", f"{'#100%':>6}"]
+    )
+    for line in ("", title, "-" * len(title), header, "-" * len(header)):
+        print(line)
+    for arm in sorted(var, key=lambda name: -var[name]["mean"]):
+        stats = var[arm]
+        cells = [f"{arm:<25}", f"{stats['n_pdfs']:>7}"]
+        cells.extend(f"{stats[key] * 100:>6.1f}%" for key in pct_keys)
+        cells.append(f"{stats['n_pdfs_zero']:>5}")
+        cells.append(f"{stats['n_pdfs_perfect']:>6}")
+        print(" ".join(cells))
+
+
+def _print_mcnemar(title: str, table: dict) -> None:
+    """Print the pairwise McNemar table, smallest p-value first.
+
+    Args:
+        title: Heading printed above the table and underlined with dashes.
+        table: Dict with ``n_qids`` and ``pairs``; every pair provides
+            ``arm_i``, ``arm_j``, ``b_i_only``, ``c_j_only``, ``both_correct``,
+            ``both_wrong`` and ``p_value``.
+
+    The last column flags significance: ``***`` for p < 0.001, ``**`` for
+    p < 0.01, ``*`` for p < 0.05, blank otherwise.
+    """
+    thresholds = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
+    header = " ".join([
+        f"{'arm_i':<25}", f"{'arm_j':<25}", f"{'b':>4}", f"{'c':>4}",
+        f"{'both ok':>8}", f"{'both wr':>8}", f"{'p (2-sided)':>13}", f"{'sig':>4}",
+    ])
+    print()
+    print(title)
+    print("-" * len(title))
+    print(f"n_qids on which all arms have a graded row: {table['n_qids']}")
+    print(header)
+    print("-" * len(header))
+    for pair in sorted(table["pairs"], key=lambda entry: entry["p_value"]):
+        p_value = pair["p_value"]
+        # First threshold the p-value falls under wins; none -> empty marker.
+        sig = next((mark for cutoff, mark in thresholds if p_value < cutoff), "")
+        cells = [
+            f"{pair['arm_i']:<25}", f"{pair['arm_j']:<25}",
+            f"{pair['b_i_only']:>4}", f"{pair['c_j_only']:>4}",
+            f"{pair['both_correct']:>8}", f"{pair['both_wrong']:>8}",
+            f"{p_value:>13.4f}", f"{sig:>4}",
+        ]
+        print(" ".join(cells))
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """Print latency, token, per-PDF and McNemar tables for one run.
+
+    Reads ``raw.jsonl`` and ``raw_post_retry.jsonl`` from the run's
+    ``parser_compare`` directory.
+
+    Returns:
+        ``0`` once every table has been printed.
+
+    Raises:
+        SystemExit: if either input file is missing.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
+    args = cli.parse_args()
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    post_path = run_dir / "raw_post_retry.jsonl"
+    if not (raw_path.exists() and post_path.exists()):
+        raise SystemExit(
+            "Missing raw.jsonl or raw_post_retry.jsonl. "
+            "Run scripts/compute_post_retry_accuracy.py first."
+        )
+
+    raw_rows = _read_jsonl(raw_path)
+    post_rows = _read_jsonl(post_path)
+
+    print(f"Run: {args.run_id}")
+    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")
+
+    # Latency, tokens and per-PDF stats all use the post-retry rows: those
+    # carry the retry's own latency for recovered rows, whereas in the raw
+    # rows a recovered question has latency=0 because the harness recorded
+    # a failure.
+    _print_latency("Per-arm latency (post-retry)", _per_arm_latency(post_rows))
+    _print_tokens("Per-arm token distribution (post-retry)", _per_arm_tokens(post_rows))
+    _print_pdf_var(
+        "Per-PDF accuracy heterogeneity (post-retry)",
+        _per_pdf_stats(post_rows),
+    )
+
+    # McNemar is reported for both views so the retry effect is visible.
+    mcnemar_views = (
+        ("McNemar pairwise (RAW, no retries)", raw_rows),
+        ("McNemar pairwise (POST-RETRY)", post_rows),
+    )
+    for heading, rows in mcnemar_views:
+        _print_mcnemar(heading, _mcnemar_table(rows))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/surfsense_evals/scripts/compute_post_retry_accuracy.py
+++ b/surfsense_evals/scripts/compute_post_retry_accuracy.py
@ -0,0 +1,180 @@
+"""Recompute per-arm accuracy/F1 after merging retry survivors into raw.jsonl.
+
+Reads:
+  - data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
+  - data/multimodal_doc/runs/<run_id>/parser_compare/raw_retries.jsonl
+
+For each (arm, qid) present in the retry artifact:
+  - if the retry RECOVERED, the retry row replaces the original row (same
+    grader is reused — see ``mmlongbench/grader.py``);
+  - if the retry did NOT recover, the original row stays (still a failure,
+    so ``correct=False`` and ``f1=0``).
+
+Prints two tables, one after the other (followed by a per-arm delta summary):
+  * Raw run (no retries) — matches §1 of the blog.
+  * Post-retry run        — final, "what would the headline have been if
+                              the harness had had retries from day one".
+
+It also writes ``data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl``
+so any downstream notebook / report can join straight on it.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Directory one level above this script's folder (i.e. the parent of scripts/).
+REPO = Path(__file__).resolve().parents[1]
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    """Parse a UTF-8 JSON-lines file into a list, skipping blank lines."""
+    with path.open("r", encoding="utf-8") as fh:
+        stripped = (raw.strip() for raw in fh)
+        return [json.loads(text) for text in stripped if text]
+
+
+def _row_key(row: dict) -> tuple[str, str]:
+    """Return the ``(arm, qid)`` join key of a row, both coerced to ``str``."""
+    arm, qid = row["arm"], row["qid"]
+    return str(arm), str(qid)
+
+
+def _is_failure(row: dict) -> bool:
+    """Return True when a row has an ``error`` or no non-whitespace ``raw_text``."""
+    if row.get("error"):
+        return True
+    return not (row.get("raw_text") or "").strip()
+
+
+def _summarise(rows_by_arm: dict[str, list[dict]]) -> dict[str, dict]:
+    """Aggregate accuracy, mean F1 and failure counts per arm.
+
+    Args:
+        rows_by_arm: Arm name -> list of result rows. A row's grade lives
+            under ``graded`` (keys ``correct`` and ``f1``).
+
+    Returns:
+        Arm name -> dict with ``n``, ``n_correct``, ``n_failures``,
+        ``accuracy``, ``f1_mean`` and ``failure_rate``. All ratios are 0.0
+        for an arm with no rows.
+    """
+    out: dict[str, dict] = {}
+    for arm, rows in rows_by_arm.items():
+        n = len(rows)
+        # ``graded`` may be absent or explicitly null; ``.get("graded", {})``
+        # only covers the absent case, so normalise both to an empty dict
+        # and count such rows as incorrect with f1 = 0.
+        grades = [r.get("graded") or {} for r in rows]
+        n_correct = sum(1 for g in grades if g.get("correct"))
+        f1_sum = sum(float(g.get("f1") or 0.0) for g in grades)
+        n_fail = sum(1 for r in rows if _is_failure(r))
+        out[arm] = {
+            "n": n,
+            "n_correct": n_correct,
+            "n_failures": n_fail,
+            "accuracy": (n_correct / n) if n else 0.0,
+            "f1_mean": (f1_sum / n) if n else 0.0,
+            "failure_rate": (n_fail / n) if n else 0.0,
+        }
+    return out
+
+
+def _print_table(title: str, summary: dict[str, dict]) -> None:
+    """Print one accuracy/F1/failure row per arm, highest accuracy first.
+
+    Args:
+        title: Heading printed above the table and underlined with dashes.
+        summary: Arm name -> dict with ``n``, ``n_correct``, ``accuracy``,
+            ``f1_mean``, ``n_failures`` and ``failure_rate``; the ratios are
+            shown as percentages.
+    """
+    header = " ".join([
+        f"{'arm':<25}", f"{'n':>4}", f"{'n_corr':>7}", f"{'acc':>7}",
+        f"{'F1':>7}", f"{'fails':>6}", f"{'fail%':>7}",
+    ])
+    for line in ("", title, "-" * len(title), header, "-" * len(header)):
+        print(line)
+    # sorted() is stable, so arms with equal accuracy keep insertion order.
+    for arm in sorted(summary, key=lambda name: -summary[name]["accuracy"]):
+        stats = summary[arm]
+        cells = [
+            f"{arm:<25}",
+            f"{stats['n']:>4}",
+            f"{stats['n_correct']:>7}",
+            f"{stats['accuracy'] * 100:>6.1f}%",
+            f"{stats['f1_mean'] * 100:>6.1f}%",
+            f"{stats['n_failures']:>6}",
+            f"{stats['failure_rate'] * 100:>6.1f}%",
+        ]
+        print(" ".join(cells))
+
+
+def main() -> int:
+    """Merge retry survivors into the raw rows, persist them and print tables.
+
+    For every raw row whose ``(arm, qid)`` also appears in
+    ``raw_retries.jsonl``, the retry row replaces it only when the retry is
+    flagged ``retry.recovered``; otherwise the original row is kept. The
+    merged rows are written to ``raw_post_retry.jsonl`` before the raw,
+    post-retry and delta tables are printed.
+
+    Returns:
+        ``0`` on success, ``1`` when ``raw.jsonl`` or ``raw_retries.jsonl``
+        is missing (the reason is printed to stderr).
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--run-id", default="2026-05-14T00-53-19Z")
+    args = parser.parse_args()
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    retry_path = run_dir / "raw_retries.jsonl"
+    out_path = run_dir / "raw_post_retry.jsonl"
+
+    for label, path in (("raw.jsonl", raw_path), ("raw_retries.jsonl", retry_path)):
+        if not path.exists():
+            print(f"{label} not found at {path}", file=sys.stderr)
+            return 1
+
+    raw_rows = _read_jsonl(raw_path)
+    retry_rows = _read_jsonl(retry_path)
+
+    # If a key occurs more than once in the retry artifact, the last row wins.
+    retry_by_key: dict[tuple[str, str], dict] = {
+        _row_key(r): r for r in retry_rows
+    }
+
+    merged_rows: list[dict] = []
+    n_replaced_recovered = 0
+    n_kept_still_failed = 0
+    n_unchanged = 0
+    for row in raw_rows:
+        retry = retry_by_key.get(_row_key(row))
+        if retry is None:
+            merged_rows.append(row)
+            n_unchanged += 1
+            continue
+        # The retry artifact carries a fresh result + grade in the same
+        # shape, plus a "retry" sub-object. The sub-object may be missing or
+        # null, so normalise it before reading the flag. The retry row is
+        # used only when it represents a recovery; otherwise the original
+        # row (the one the headline numbers were computed from) is kept.
+        retry_meta = retry.get("retry") or {}
+        if retry_meta.get("recovered"):
+            merged_rows.append(retry)
+            n_replaced_recovered += 1
+        else:
+            merged_rows.append(row)
+            n_kept_still_failed += 1
+
+    # Persist merged jsonl for downstream consumers
+    with out_path.open("w", encoding="utf-8") as fh:
+        for r in merged_rows:
+            fh.write(json.dumps(r) + "\n")
+
+    # Bucket per arm
+    raw_by_arm: dict[str, list[dict]] = {}
+    for r in raw_rows:
+        raw_by_arm.setdefault(r["arm"], []).append(r)
+    post_by_arm: dict[str, list[dict]] = {}
+    for r in merged_rows:
+        post_by_arm.setdefault(r["arm"], []).append(r)
+
+    raw_summary = _summarise(raw_by_arm)
+    post_summary = _summarise(post_by_arm)
+
+    print()
+    print(f"Run: {args.run_id}")
+    print(f"Replaced (retry recovered):     {n_replaced_recovered}")
+    print(f"Kept original (retry still failed): {n_kept_still_failed}")
+    print(f"Untouched rows:                 {n_unchanged}")
+    print(f"Wrote merged artifact: {out_path.relative_to(REPO)}")
+
+    _print_table("Raw run (no retries)", raw_summary)
+    _print_table("Post-retry run (final)", post_summary)
+
+    print()
+    print("Delta (post-retry minus raw):")
+    print(f"{'arm':<25} {'d_acc':>7} {'d_fails':>8}")
+    print("-" * 42)
+    for arm in sorted(set(raw_summary) | set(post_summary)):
+        before = raw_summary.get(arm)
+        after = post_summary.get(arm)
+        if not before or not after:
+            continue
+        # Accuracy delta is in percentage points; failures as a signed count.
+        d_acc = (after["accuracy"] - before["accuracy"]) * 100
+        d_fail = after["n_failures"] - before["n_failures"]
+        print(f"{arm:<25} {d_acc:>+6.1f}p {d_fail:>+7d}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/surfsense_evals/scripts/download_crag_task3.py
+++ b/surfsense_evals/scripts/download_crag_task3.py
@ -1,97 +0,0 @@
-"""Download CRAG Task 3's 4 .tar.bz2 parts in parallel.
-
-Run once before ``ingest research crag_t3`` to avoid the ingest
-synchronously blocking on a 7 GB download. Skips parts already
-present and complete on disk.
-"""
-
-from __future__ import annotations
-
-import logging
-import sys
-import time
-import urllib.request
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from pathlib import Path
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s %(levelname)s %(message)s",
-)
-log = logging.getLogger("download_task3")
-
-
-_BASE = (
-    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
-    "crag_task_3_dev_v4.tar.bz2.part"
-)
-_USER_AGENT = "SurfSense-Evals/0.1 (CRAG Task 3 fetch)"
-
-
-def _expected_size(url: str) -> int:
-    req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": _USER_AGENT})
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        return int(resp.headers.get("content-length", 0))
-
-
-def download_one(part: int, dest_dir: Path) -> Path:
-    url = f"{_BASE}{part}"
-    dest = dest_dir / f"crag_task_3_dev_v4.tar.bz2.part{part}"
-    expected = _expected_size(url)
-    if dest.exists() and dest.stat().st_size == expected:
-        log.info("part%d: cached (%d bytes)", part, expected)
-        return dest
-    log.info("part%d: downloading %d bytes ...", part, expected)
-    tmp = dest.with_suffix(dest.suffix + ".part_dl")
-    started = time.monotonic()
-    last_log = started
-    with urllib.request.urlopen(
-        urllib.request.Request(url, headers={"User-Agent": _USER_AGENT}),
-        timeout=900,
-    ) as resp, tmp.open("wb") as fh:
-        downloaded = 0
-        chunk = resp.read(1 << 20)
-        while chunk:
-            fh.write(chunk)
-            downloaded += len(chunk)
-            now = time.monotonic()
-            if now - last_log > 5.0:
-                pct = 100 * downloaded / expected if expected else 0
-                rate_mb = (downloaded / (now - started)) / (1 << 20)
-                log.info(
-                    "part%d: %5.1f%% (%.1f / %.1f MiB at %.1f MiB/s)",
-                    part, pct, downloaded / (1 << 20), expected / (1 << 20), rate_mb,
-                )
-                last_log = now
-            chunk = resp.read(1 << 20)
-    tmp.replace(dest)
-    elapsed = time.monotonic() - started
-    log.info(
-        "part%d: done in %.1fs (%.1f MiB/s avg)",
-        part, elapsed, (expected / (1 << 20)) / max(elapsed, 0.001),
-    )
-    return dest
-
-
-def main() -> int:
-    dest_dir = Path("data/research/crag_t3/.raw_cache")
-    dest_dir.mkdir(parents=True, exist_ok=True)
-
-    # 4 parts in parallel — typical residential connection saturates around
-    # 2 streams; GitHub raw serves these fine in parallel.
-    started = time.monotonic()
-    with ThreadPoolExecutor(max_workers=4) as ex:
-        futures = {ex.submit(download_one, i, dest_dir): i for i in range(1, 5)}
-        for fut in as_completed(futures):
-            part = futures[fut]
-            try:
-                fut.result()
-            except Exception as exc:  # noqa: BLE001
-                log.error("part%d failed: %s", part, exc)
-                return 1
-    log.info("All 4 parts downloaded in %.1fs", time.monotonic() - started)
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/surfsense_evals/scripts/inspect_first30.py
+++ b/surfsense_evals/scripts/inspect_first30.py
@ -0,0 +1,59 @@
+"""Inspect what the first 30 MMLongBench-Doc PDFs would look like for scoping.
+
+Run from surfsense_evals/ root via:
+    python scripts/inspect_first30.py
+
+Prints which docs are already ingested (existing 5), which are new (25 to
+upload), how many questions cover those 30 PDFs, and the answerable /
+unanswerable + format mix.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import Counter
+from pathlib import Path
+
+
+def main() -> None:
+    """Summarise the first 30 MMLongBench-Doc PDFs (alphabetical by doc_id).
+
+    Reads ``data/multimodal_doc/mmlongbench/questions.jsonl`` relative to
+    the current working directory and prints how many of those PDFs are
+    new versus already ingested, the question counts, per-PDF averages,
+    the answer-format mix, and the list of PDFs still to ingest.
+    """
+    qpath = Path("data/multimodal_doc/mmlongbench/questions.jsonl")
+    lines = qpath.read_text(encoding="utf-8").splitlines()
+    rows = [json.loads(line) for line in lines if line.strip()]
+
+    docs_by_id = sorted({r["doc_id"] for r in rows})
+    first30 = docs_by_id[:30]
+    # Built once: used for a membership test on every question row.
+    in_scope = set(first30)
+    existing5 = {
+        "05-03-18-political-release.pdf",
+        "0b85477387a9d0cc33fca0f4becaa0e5.pdf",
+        "0e94b4197b10096b1f4c699701570fbf.pdf",
+        "11-21-16-Updated-Post-Election-Release.pdf",
+        "12-15-15-ISIS-and-terrorism-release-final.pdf",
+    }
+    new25 = [d for d in first30 if d not in existing5]
+    print(
+        f"first 30 docs (alphabetical) — {len(new25)} new, "
+        f"{len(first30) - len(new25)} already in SurfSense"
+    )
+
+    qs_in_30 = [r for r in rows if r["doc_id"] in in_scope]
+    fmts = Counter((r.get("answer_format") or "").lower() for r in qs_in_30)
+    answerable = sum(v for k, v in fmts.items() if k != "none")
+    unanswerable = fmts.get("none", 0)
+
+    print(
+        f"questions covering first 30 docs: total={len(qs_in_30)}  "
+        f"answerable={answerable}  unanswerable={unanswerable}"
+    )
+    # Average over the PDFs actually in scope: the dataset may hold fewer
+    # than 30 documents. max(..., 1) keeps the empty case at 0.0.
+    n_docs = max(len(first30), 1)
+    print(
+        f"avg Qs/PDF: {len(qs_in_30) / n_docs:.1f}  "
+        f"answerable/PDF: {answerable / n_docs:.1f}"
+    )
+    print(f"format mix in scope: {dict(fmts)}")
+    print()
+    # Report the real count rather than assuming exactly 25 new PDFs.
+    print(f"{len(new25)} new PDFs to ingest:")
+    for d in new25:
+        print(f"  - {d}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
+++ b/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
@ -0,0 +1,100 @@
+"""Stub the mmlongbench manifest so parser_compare can extract in parallel.
+
+The mmlongbench SurfSense ingest writes its manifest only at the very
+end of the upload pipeline (~hours of celery work). parser_compare's
+ingest, on the other hand, just needs a list of (doc_id, pdf_path)
+tuples to know which PDFs to extract — it doesn't care about the
+SurfSense ``document_id`` (the runner does, later, after a refresh).
+
+This script extends the existing manifest with the *additional* PDFs
+that mmlongbench has already cached on disk (i.e. all 30 PDFs in
+``data/multimodal_doc/mmlongbench/pdfs/`` even though only 5 have
+SurfSense ``document_id``s yet) so parser_compare can run all four
+extractions for them in parallel with the SurfSense ingest.
+
+After mmlongbench finishes, re-run::
+
+    python -m surfsense_evals ingest multimodal_doc parser_compare \
+        --max-docs 30
+
+…to refresh ``parser_compare_doc_map.jsonl`` with the now-populated
+``document_id`` values for the 25 new PDFs. The extractions
+themselves are cached on disk so the second pass is essentially free.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+
+# Directory one level above this script's folder (i.e. the parent of scripts/).
+REPO = Path(__file__).resolve().parents[1]
+# mmlongbench manifest that main() reads and rewrites in place.
+MAP_PATH = REPO / "data" / "multimodal_doc" / "maps" / "mmlongbench_doc_map.jsonl"
+# Directory globbed for cached ``*.pdf`` files.
+PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+# Question file used to count questions per ``doc_id``.
+QUESTIONS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
+
+
+def _question_count_per_doc() -> dict[str, int]:
+    """Count the question rows in ``QUESTIONS`` for each ``doc_id``.
+
+    Blank lines are skipped; every other line must be a JSON object with
+    a ``doc_id`` key.
+    """
+    tally: dict[str, int] = {}
+    with QUESTIONS.open("r", encoding="utf-8") as fh:
+        for raw in fh:
+            text = raw.strip()
+            if text:
+                doc_id = json.loads(text)["doc_id"]
+                tally[doc_id] = tally.get(doc_id, 0) + 1
+    return tally
+
+
+def main() -> None:
+    """Add a stub manifest row for every cached PDF the manifest lacks.
+
+    Existing rows (and the ``__settings__`` header line, if present) are
+    kept as they are. Each cached PDF not yet listed gets a row with
+    ``document_id`` set to ``None``. The manifest is then rewritten with
+    the settings line first and the rows sorted by ``doc_id``.
+
+    Raises:
+        SystemExit: if the manifest file does not exist.
+    """
+    if not MAP_PATH.exists():
+        raise SystemExit(
+            f"manifest not found at {MAP_PATH} — "
+            "run `surfsense_evals ingest multimodal_doc mmlongbench` first."
+        )
+
+    settings_line = None
+    existing_rows: list[dict] = []
+    for raw in MAP_PATH.read_text(encoding="utf-8").splitlines():
+        text = raw.strip()
+        if not text:
+            continue
+        parsed = json.loads(text)
+        if "__settings__" in parsed:
+            # Keep the header verbatim so it is written back unchanged.
+            settings_line = text
+            continue
+        existing_rows.append(parsed)
+
+    by_doc_id = {entry["doc_id"]: entry for entry in existing_rows}
+    counts = _question_count_per_doc()
+
+    cached_pdfs = sorted(PDF_DIR.glob("*.pdf"))
+    print(f"existing manifest entries: {len(existing_rows)}")
+    print(f"cached PDFs on disk:       {len(cached_pdfs)}")
+
+    added = 0
+    for pdf in cached_pdfs:
+        name = pdf.name
+        if name not in by_doc_id:
+            by_doc_id[name] = {
+                "doc_id": name,
+                "document_id": None,
+                "pdf_path": str(pdf),
+                "n_questions": counts.get(name, 0),
+            }
+            added += 1
+
+    out_lines: list[str] = [settings_line] if settings_line else []
+    out_lines.extend(json.dumps(by_doc_id[doc_id]) for doc_id in sorted(by_doc_id))
+    MAP_PATH.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
+
+    print(f"added {added} stub rows; manifest now has {len(by_doc_id)} PDFs")
+    print(f"wrote: {MAP_PATH}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/peek_t3_doc_map.py
+++ b/surfsense_evals/scripts/peek_t3_doc_map.py
@ -1,40 +0,0 @@
-"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
-
-from __future__ import annotations
-
-import json
-import sys
-from pathlib import Path
-
-
-def main() -> int:
-    p = Path("data/research/maps/crag_t3_doc_map.jsonl")
-    if not p.exists():
-        print(f"Doc map missing: {p}")
-        return 1
-    rows = []
-    settings = {}
-    for line in p.read_text(encoding="utf-8").splitlines():
-        if not line.strip():
-            continue
-        row = json.loads(line)
-        if "__settings__" in row:
-            settings = row
-            continue
-        rows.append(row)
-    print(f"Settings header: {settings}")
-    print(f"Doc map rows:   {len(rows)}")
-    for r in rows:
-        print(f"  qid={r['qid']:<10} domain={r['domain']:<8} qtype={r['question_type']}")
-        print(f"    question: {r['question'][:90]}")
-        print(f"    gold:     {r['gold_answer'][:90]}")
-        print(
-            f"    pages:    {len(r['page_filenames'])} extracted, "
-            f"{len(r['document_ids'])} doc_ids, "
-            f"{len(r['missing_pages'])} missing"
-        )
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/surfsense_evals/scripts/retry_failed_questions.py
+++ b/surfsense_evals/scripts/retry_failed_questions.py
@ -0,0 +1,636 @@
+"""Retry only the failed (arm, question) pairs from a previous parser_compare run.
+
+The original parser_compare run records one row per (arm, qid) in
+``raw.jsonl``. Some of those rows came back with transient transport
+errors (SSL alerts, gateway 502s, empty SSE streams) or empty
+``raw_text``. This script re-issues *only* those calls with exponential
+backoff so we can see how many recover.
+
+Design constraints / choices:
+
+* **No re-ingest.** All cached PDFs and parser-extracted markdown stay
+  on disk. We rebuild ``ArmRequest`` objects from the existing manifest
+  + the original ``mmlongbench/questions.jsonl``.
+* **No SurfSense backend or celery required.** SurfSense had 0
+  reported failures; this script will skip any ``surfsense_agentic``
+  rows it encounters and warn rather than try to start the backend.
+* **Original ``raw.jsonl`` is never mutated.** Retries land in a
+  sibling ``raw_retries.jsonl`` so the original artifact stays
+  citeable.
+* **Idempotent.** Re-running this script re-tries the same set of
+  failed rows from ``raw.jsonl``. If you want to merge survivor rows
+  back in, do that as a separate aggregation step.
+
+Usage:
+
+    python scripts/retry_failed_questions.py \
+        --run-id 2026-05-14T00-53-19Z \
+        --max-attempts 5 \
+        --concurrency 2
+
+Outputs (written next to the original raw.jsonl):
+
+* ``raw_retries.jsonl`` — one line per retried (arm, qid). Each line
+  carries the original error, every retry attempt's timing/error,
+  and the final result (incl. grade) so you can drop it straight
+  into a notebook.
+* ``raw_retries_summary.json`` — per-arm tried/recovered/still-failed
+  counts and an aggregated retry-success rate.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import random
+import sys
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Directory one level above this script's folder (i.e. the parent of scripts/).
+REPO = Path(__file__).resolve().parents[1]
+# Put <REPO>/src on sys.path so the ``surfsense_evals`` imports below resolve
+# when the script is run directly.
+SRC = REPO / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from dotenv import load_dotenv  # noqa: E402
+
+from surfsense_evals.core.arms import (  # noqa: E402
+    ArmRequest,
+    ArmResult,
+    BareLlmArm,
+    NativePdfArm,
+)
+from surfsense_evals.core.parse.freeform_answer import (  # noqa: E402
+    extract_freeform_answer,
+)
+from surfsense_evals.core.providers.openrouter_chat import (  # noqa: E402
+    OpenRouterChatProvider,
+)
+from surfsense_evals.core.providers.openrouter_pdf import (  # noqa: E402
+    OpenRouterPdfProvider,
+    PdfEngine,
+)
+from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade  # noqa: E402
+from surfsense_evals.suites.multimodal_doc.parser_compare.prompt import (  # noqa: E402
+    build_long_context_prompt,
+    build_native_pdf_prompt,
+)
+
+# Module-level logger with a fixed name.
+logger = logging.getLogger("retry_failed_questions")
+
+# Names of the four ``*_lc`` arms (Azure / LlamaCloud, basic / premium).
+# NOTE(review): presumably "lc" = long-context and this set selects the
+# markdown-prompt request path; its use is outside this view, so confirm.
+LC_ARMS = {
+    "azure_basic_lc",
+    "azure_premium_lc",
+    "llamacloud_basic_lc",
+    "llamacloud_premium_lc",
+}
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _is_failure_row(row: dict[str, Any]) -> bool:
+    """Tell whether a raw row should be retried.
+
+    A row is a failure when it carries a truthy ``error`` or when its
+    ``raw_text`` is missing, ``None`` or whitespace-only. The empty-text
+    case is retried too: the call returned nothing usable even though no
+    exception was recorded.
+    """
+
+    has_error = bool(row.get("error"))
+    has_text = bool((row.get("raw_text") or "").strip())
+    return has_error or not has_text
+
+
+@dataclass
+class FailedRow:
+    """One failed ``(arm, qid)`` row loaded from ``raw.jsonl``.
+
+    Populated by ``_load_failed_rows``; the scalar fields are coerced
+    copies of the row's keys, and the untouched row is kept alongside.
+    """
+
+    # Arm name and question id, both coerced to ``str``.
+    arm: str
+    qid: str
+    # ``doc_id`` of the row, coerced to ``str``.
+    doc_id: str
+    # ``answer_format`` and ``gold``; empty string when missing or falsy.
+    answer_format: str
+    gold: str
+    # ``pages`` as an int; 0 when missing or falsy.
+    pages: int
+    # ``document_id`` exactly as stored in the row (may be ``None``).
+    document_id: int | None
+    # The row's ``error`` value; ``None`` when the key is absent.
+    original_error: str | None
+    # The complete parsed JSON row, unmodified.
+    original_row: dict[str, Any]
+
+
+def _load_failed_rows(raw_path: Path) -> list[FailedRow]:
+    """Collect every failed row of a ``raw.jsonl`` file, in file order.
+
+    Blank lines are skipped. A row is kept when ``_is_failure_row`` says
+    so; ``arm``, ``qid`` and ``doc_id`` are required keys, the remaining
+    fields fall back to empty/zero/``None`` values.
+    """
+    failures: list[FailedRow] = []
+    with raw_path.open("r", encoding="utf-8") as fh:
+        for raw_line in fh:
+            text = raw_line.strip()
+            if not text:
+                continue
+            record = json.loads(text)
+            if _is_failure_row(record):
+                failures.append(
+                    FailedRow(
+                        arm=str(record["arm"]),
+                        qid=str(record["qid"]),
+                        doc_id=str(record["doc_id"]),
+                        answer_format=str(record.get("answer_format") or ""),
+                        gold=str(record.get("gold") or ""),
+                        pages=int(record.get("pages") or 0),
+                        document_id=record.get("document_id"),
+                        original_error=record.get("error"),
+                        original_row=record,
+                    )
+                )
+    return failures
+
+
+def _load_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
+    """Index a JSON-lines manifest by ``doc_id`` (coerced to ``str``).
+
+    Blank lines are skipped and every remaining line must contain a
+    ``doc_id`` key. When a ``doc_id`` repeats, the last row wins.
+    """
+    with map_path.open("r", encoding="utf-8") as fh:
+        stripped = (raw.strip() for raw in fh)
+        parsed = (json.loads(text) for text in stripped if text)
+        return {str(entry["doc_id"]): entry for entry in parsed}
+
+
+def _load_question_text_index(
+    questions_jsonl: Path,
+) -> dict[tuple[str, int], dict[str, Any]]:
+    """Build a ``(doc_id, per_doc_index) -> question row`` lookup.
+
+    qids in raw.jsonl look like ``{doc_id}::Q{NNN}``, with NNN being the
+    question's position among the rows of its document. This function
+    reconstructs that numbering by walking ``questions.jsonl`` in file
+    order and giving each document its own counter starting at 0, which is
+    meant to mirror the runner's ``per_doc_idx`` logic in
+    ``_select_questions``. Blank lines and rows without a ``doc_id`` are
+    ignored and do not consume an index.
+    """
+
+    index: dict[tuple[str, int], dict[str, Any]] = {}
+    next_idx: dict[str, int] = {}
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for raw in fh:
+            text = raw.strip()
+            if not text:
+                continue
+            record = json.loads(text)
+            doc_id = str(record.get("doc_id") or "")
+            if doc_id:
+                position = next_idx.setdefault(doc_id, 0)
+                index[(doc_id, position)] = record
+                next_idx[doc_id] = position + 1
+    return index
+
+
+def _qid_index(qid: str) -> int:
+    """Return the per-doc question index encoded in a qid.
+
+    The text after the last ``::`` must be ``Q`` followed by an integer,
+    e.g. ``foo.pdf::Q007`` -> ``7``.
+
+    Raises:
+        ValueError: if that suffix does not start with ``Q`` or the rest is
+            not an integer.
+    """
+
+    suffix = qid.rpartition("::")[2]
+    if suffix[:1] != "Q":
+        raise ValueError(f"unexpected qid shape: {qid!r}")
+    return int(suffix[1:])
+
+
+# ---------------------------------------------------------------------------
+# Request building (mirrors runner.py exactly so prompts are byte-identical)
+# ---------------------------------------------------------------------------
+
+
+def _build_native_request(
+    qid: str, question: str, answer_format: str, pdf_path: Path,
+    *, max_output_tokens: int,
+) -> ArmRequest:
+    """Build the native-PDF request for one question.
+
+    The prompt comes from ``build_native_pdf_prompt``; the PDF is attached
+    as the only entry of ``pdf_paths`` and ``max_output_tokens`` is passed
+    through as the ``max_tokens`` option.
+    """
+    prompt = build_native_pdf_prompt(question, answer_format=answer_format)
+    options = {"max_tokens": max_output_tokens}
+    return ArmRequest(
+        question_id=qid,
+        prompt=prompt,
+        pdf_paths=[pdf_path],
+        options=options,
+    )
+
+
+def _build_lc_request(
+    qid: str, question: str, answer_format: str, doc_id: str, md_path: Path,
+) -> ArmRequest:
+    """Build the long-context request for one question.
+
+    The markdown at ``md_path`` is read as UTF-8 and embedded in the prompt
+    by ``build_long_context_prompt``, labelled with ``doc_id``.
+
+    Raises:
+        FileNotFoundError: if ``md_path`` does not exist.
+    """
+    if not md_path.exists():
+        message = f"Missing parser extraction at {md_path}; cannot retry LC arm."
+        raise FileNotFoundError(message)
+    prompt = build_long_context_prompt(
+        question,
+        answer_format=answer_format,
+        document_markdown=md_path.read_text(encoding="utf-8"),
+        document_label=doc_id,
+    )
+    return ArmRequest(question_id=qid, prompt=prompt)
+
+
+# ---------------------------------------------------------------------------
+# Retry driver
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class AttemptLog:
+    """Timing and outcome of a single retry attempt (see ``_retry_one``)."""
+
+    # 1-based attempt number.
+    attempt: int
+    # UTC start time, formatted ``%Y-%m-%dT%H:%M:%SZ``.
+    started_iso: str
+    # Duration of the ``answer()`` call in milliseconds (monotonic clock).
+    latency_ms: int
+    # The result's error, or a synthetic "EmptyResponse: ..." message when
+    # the call returned no error but also no text; ``None`` on success.
+    error: str | None
+    # Length of the whitespace-stripped response text.
+    raw_text_chars: int
+
+
+@dataclass
+class RetryOutcome:
+    """Result of retrying one ``(arm, qid)`` pair (see ``_retry_one``)."""
+
+    # Arm name and question id the retry was issued for.
+    arm: str
+    qid: str
+    # One entry per attempt made, in order.
+    attempts: list[AttemptLog]
+    # Result of the last attempt made (the successful one when recovered).
+    final_result: ArmResult
+    # True when some attempt returned no error and non-empty text.
+    recovered: bool
+
+
+async def _retry_one(
+    arm_obj: Any, request: ArmRequest, *,
+    arm_name: str,
+    qid: str,
+    max_attempts: int,
+    base_delay: float,
+    max_delay: float,
+) -> RetryOutcome:
+    """Call ``arm_obj.answer(request)`` until it yields usable text.
+
+    An attempt succeeds when the result has no ``error`` and its
+    ``raw_text`` is non-empty after stripping. Between failed attempts the
+    coroutine sleeps ``min(max_delay, base_delay * 2**(attempt-1))``
+    scaled by a random factor in [0.5, 1.5); because the jitter is applied
+    after the cap, a sleep can be up to 1.5x ``max_delay``.
+
+    Args:
+        arm_obj: Object exposing an awaitable ``answer(request)``.
+        request: Request passed unchanged to every attempt.
+        arm_name: Arm label, used for logging and in the outcome.
+        qid: Question id, used for logging and in the outcome.
+        max_attempts: Maximum number of calls; must be at least 1.
+        base_delay: Backoff base in seconds.
+        max_delay: Backoff cap in seconds (before jitter).
+
+    Returns:
+        A ``RetryOutcome`` holding one ``AttemptLog`` per call, the last
+        result, and whether any attempt succeeded.
+
+    Raises:
+        ValueError: if ``max_attempts`` is smaller than 1.
+    """
+    # Validate explicitly instead of relying on an assert after the loop:
+    # asserts are stripped under ``python -O``, which would let an outcome
+    # with ``final_result=None`` escape.
+    if max_attempts < 1:
+        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")
+
+    attempts: list[AttemptLog] = []
+    final: ArmResult | None = None
+    for attempt in range(1, max_attempts + 1):
+        started_iso = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+        t0 = time.monotonic()
+        # NOTE(review): an exception raised by answer() is not caught here
+        # and would end the retry loop — confirm arms report transport
+        # failures via ``result.error`` rather than raising.
+        result = await arm_obj.answer(request)
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        raw_text = (result.raw_text or "").strip()
+        attempt_error = result.error
+        if not attempt_error and not raw_text:
+            # No error was reported, yet nothing usable came back.
+            attempt_error = "EmptyResponse: stream ended with no text"
+        attempts.append(AttemptLog(
+            attempt=attempt,
+            started_iso=started_iso,
+            latency_ms=latency_ms,
+            error=attempt_error,
+            raw_text_chars=len(raw_text),
+        ))
+        final = result
+        if not attempt_error and raw_text:
+            return RetryOutcome(
+                arm=arm_name, qid=qid, attempts=attempts,
+                final_result=result, recovered=True,
+            )
+        if attempt < max_attempts:
+            delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
+            delay = delay * (0.5 + random.random())
+            logger.info(
+                "[%s::%s] attempt %d/%d failed (%s); sleeping %.1fs",
+                arm_name, qid, attempt, max_attempts, attempt_error, delay,
+            )
+            await asyncio.sleep(delay)
+    # The loop ran at least once (max_attempts >= 1), so ``final`` is set.
+    return RetryOutcome(
+        arm=arm_name, qid=qid, attempts=attempts,
+        final_result=final, recovered=False,
+    )
+
+
+async def _gather_with_limit(coros: list, *, concurrency: int) -> list[Any]:
+    """Await all ``coros`` with at most ``concurrency`` running at once.
+
+    A ``concurrency`` below 1 is treated as 1. Results are returned in the
+    order of ``coros``.
+    """
+    limiter = asyncio.Semaphore(max(1, concurrency))
+
+    async def _bounded(pending):
+        # A coroutine only starts executing once it holds a semaphore slot.
+        async with limiter:
+            return await pending
+
+    guarded = [_bounded(item) for item in coros]
+    return await asyncio.gather(*guarded)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+async def _run(args: argparse.Namespace) -> int:
+    """Retry every failed row of a parser_compare run and record the outcomes.
+
+    Loads the failed rows from the run's ``raw.jsonl``, rebuilds one request
+    per failed (arm, question) pair, pushes each through ``_retry_one`` under
+    a concurrency cap, grades the final answer of every retried row, and
+    writes ``raw_retries.jsonl`` plus ``raw_retries_summary.json`` into the
+    run directory. A per-arm recovery table is printed to stdout.
+
+    Args:
+        args: Parsed CLI namespace. Reads ``run_id``, ``include_surfsense``,
+            ``llm_model``, ``pdf_engine``, ``max_output_tokens``,
+            ``max_attempts``, ``base_delay``, ``max_delay`` and
+            ``concurrency``.
+
+    Returns:
+        ``0`` on every non-raising path, including "nothing to retry".
+
+    Raises:
+        SystemExit: If ``raw.jsonl``, the doc-map manifest or the questions
+            file is missing, or if ``OPENROUTER_API_KEY`` is unset.
+    """
+    load_dotenv(REPO / ".env")
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    if not raw_path.exists():
+        raise SystemExit(f"raw.jsonl not found at {raw_path}")
+
+    map_path = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+    questions_jsonl = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
+    if not map_path.exists():
+        raise SystemExit(f"parser_compare manifest not found at {map_path}")
+    if not questions_jsonl.exists():
+        raise SystemExit(f"mmlongbench questions not found at {questions_jsonl}")
+
+    failed = _load_failed_rows(raw_path)
+    if not failed:
+        logger.info("No failed rows in %s — nothing to retry.", raw_path)
+        return 0
+
+    # Only ``native_pdf`` and the LC arms get a request built further down;
+    # nothing in this function drives the SurfSense backend. Keep the log
+    # message truthful about what --include-surfsense actually does here.
+    surf_failed = [f for f in failed if f.arm == "surfsense_agentic"]
+    if surf_failed and args.include_surfsense:
+        logger.warning(
+            "--include-surfsense was given, but this script has no retry path for "
+            "surfsense_agentic; the %d matching rows will be reported as unhandled "
+            "and skipped.",
+            len(surf_failed),
+        )
+    elif surf_failed:
+        logger.warning(
+            "Skipping %d surfsense_agentic failures; this script doesn't drive the backend. "
+            "If you want those retried too, start backend + celery and rerun "
+            "with --include-surfsense.",
+            len(surf_failed),
+        )
+        failed = [f for f in failed if f.arm != "surfsense_agentic"]
+    else:
+        logger.info("No surfsense_agentic failures; backend/celery not needed for this retry.")
+
+    if not failed:
+        logger.info("Nothing left to retry after filtering.")
+        return 0
+
+    by_arm_count: dict[str, int] = {}
+    for f in failed:
+        by_arm_count[f.arm] = by_arm_count.get(f.arm, 0) + 1
+    logger.info(
+        "Loaded %d failed rows across %d arms: %s",
+        len(failed), len(by_arm_count),
+        ", ".join(f"{a}={n}" for a, n in sorted(by_arm_count.items())),
+    )
+
+    doc_map = _load_doc_map(map_path)
+    qtext_idx = _load_question_text_index(questions_jsonl)
+
+    api_key = os.environ.get("OPENROUTER_API_KEY")
+    if not api_key:
+        raise SystemExit("OPENROUTER_API_KEY missing from environment / .env")
+
+    native_provider = OpenRouterPdfProvider(
+        api_key=api_key,
+        base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
+        model=args.llm_model,
+        engine=PdfEngine(args.pdf_engine),
+    )
+    native_arm = NativePdfArm(
+        provider=native_provider, max_output_tokens=args.max_output_tokens,
+    )
+
+    # One chat arm per LC arm that actually has failures to retry.
+    lc_arms: dict[str, BareLlmArm] = {}
+    for arm_name in sorted({f.arm for f in failed} & LC_ARMS):
+        lc_provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
+            model=args.llm_model,
+        )
+        lc_arms[arm_name] = BareLlmArm(
+            provider=lc_provider,
+            max_output_tokens=args.max_output_tokens,
+            name=arm_name,
+        )
+
+    # ``plan`` and ``coros`` are appended in lockstep so outcomes can be
+    # zipped back onto the failed row they belong to.
+    coros: list = []
+    plan: list[tuple[FailedRow, ArmRequest, Any]] = []
+
+    for f in failed:
+        # Look up the question text from questions.jsonl
+        try:
+            q_idx = _qid_index(f.qid)
+        except Exception:
+            logger.error("Bad qid %r — skipping", f.qid)
+            continue
+        qrow = qtext_idx.get((f.doc_id, q_idx))
+        if qrow is None:
+            logger.error(
+                "Could not find question text for %s (idx %d) — skipping",
+                f.doc_id, q_idx,
+            )
+            continue
+        question_text = str(qrow.get("question") or "").strip()
+        # NOTE(review): the request uses this normalised format, while grading
+        # below uses ``f.answer_format`` from raw.jsonl — confirm they agree.
+        answer_format = str(qrow.get("answer_format") or f.answer_format or "").strip().lower()
+
+        map_row = doc_map.get(f.doc_id)
+        if map_row is None:
+            logger.error("doc_id %s not in manifest — skipping", f.doc_id)
+            continue
+
+        if f.arm == "native_pdf":
+            pdf_path = Path(map_row["pdf_path"])
+            if not pdf_path.exists():
+                logger.error("PDF missing on disk: %s — skipping", pdf_path)
+                continue
+            request = _build_native_request(
+                f.qid, question_text, answer_format, pdf_path,
+                max_output_tokens=args.max_output_tokens,
+            )
+            arm_obj = native_arm
+        elif f.arm in LC_ARMS:
+            # LC arms can only be retried when the manifest records a
+            # successful extraction with a markdown file for this document.
+            ext_blob = (map_row.get("extractions") or {}).get(f.arm) or {}
+            md_path_str = ext_blob.get("markdown_path")
+            if not md_path_str or ext_blob.get("status") != "ok":
+                logger.error(
+                    "Missing extraction for %s on %s — cannot retry; skipping",
+                    f.arm, f.doc_id,
+                )
+                continue
+            request = _build_lc_request(
+                f.qid, question_text, answer_format, f.doc_id, Path(md_path_str),
+            )
+            arm_obj = lc_arms[f.arm]
+        else:
+            logger.warning("Unhandled arm %s — skipping", f.arm)
+            continue
+
+        plan.append((f, request, arm_obj))
+        coros.append(_retry_one(
+            arm_obj, request,
+            arm_name=f.arm, qid=f.qid,
+            max_attempts=args.max_attempts,
+            base_delay=args.base_delay,
+            max_delay=args.max_delay,
+        ))
+
+    if not coros:
+        logger.warning("Nothing to retry after request building.")
+        return 0
+
+    logger.info(
+        "Retrying %d failed rows with up to %d attempts each "
+        "(base_delay=%.1fs, max_delay=%.1fs, concurrency=%d).",
+        len(coros), args.max_attempts, args.base_delay, args.max_delay,
+        args.concurrency,
+    )
+
+    started = time.monotonic()
+    outcomes: list[RetryOutcome] = await _gather_with_limit(
+        coros, concurrency=args.concurrency,
+    )
+    elapsed = time.monotonic() - started
+    logger.info("Retry pass finished in %.1fs.", elapsed)
+
+    out_path = run_dir / "raw_retries.jsonl"
+    summary_path = run_dir / "raw_retries_summary.json"
+
+    per_arm_recovered: dict[str, int] = {}
+    per_arm_total: dict[str, int] = {}
+    per_arm_attempts_dist: dict[str, list[int]] = {}
+
+    with out_path.open("w", encoding="utf-8") as fh:
+        # strict=True: a length mismatch between plan and outcomes is a bug.
+        for (f, _req, _arm_obj), outcome in zip(plan, outcomes, strict=True):
+            per_arm_total[outcome.arm] = per_arm_total.get(outcome.arm, 0) + 1
+            if outcome.recovered:
+                per_arm_recovered[outcome.arm] = (
+                    per_arm_recovered.get(outcome.arm, 0) + 1
+                )
+            per_arm_attempts_dist.setdefault(outcome.arm, []).append(
+                len(outcome.attempts)
+            )
+
+            # Every retried row is graded, recovered or not; an unrecovered
+            # row is graded on whatever text its last attempt produced.
+            g = grade(
+                pred=extract_freeform_answer(outcome.final_result.raw_text or ""),
+                gold=f.gold,
+                answer_format=f.answer_format,
+            )
+            row = {
+                "qid": f.qid,
+                "doc_id": f.doc_id,
+                "arm": f.arm,
+                "answer_format": f.answer_format,
+                "gold": f.gold,
+                "pages": f.pages,
+                "document_id": f.document_id,
+                "original_error": f.original_error,
+                "retry": {
+                    "max_attempts": args.max_attempts,
+                    "n_attempts": len(outcome.attempts),
+                    "recovered": outcome.recovered,
+                    "attempts": [
+                        {
+                            "attempt": a.attempt,
+                            "started_iso": a.started_iso,
+                            "latency_ms": a.latency_ms,
+                            "error": a.error,
+                            "raw_text_chars": a.raw_text_chars,
+                        }
+                        for a in outcome.attempts
+                    ],
+                },
+                # Keys from the result override the identically named keys
+                # above; "graded" is written last and overrides the result.
+                **outcome.final_result.to_jsonl(),
+                "graded": {
+                    "correct": g.correct,
+                    "f1": g.f1,
+                    "method": g.method,
+                    "normalised_pred": g.normalised_pred,
+                    "normalised_gold": g.normalised_gold,
+                },
+            }
+            fh.write(json.dumps(row) + "\n")
+
+    summary = {
+        "run_id": args.run_id,
+        "raw_retries_path": str(out_path.relative_to(REPO)),
+        "n_failed_rows_input": len(failed),
+        "n_retried": len(coros),
+        "elapsed_s": round(elapsed, 1),
+        "config": {
+            "max_attempts": args.max_attempts,
+            "base_delay": args.base_delay,
+            "max_delay": args.max_delay,
+            "concurrency": args.concurrency,
+            "llm_model": args.llm_model,
+            "pdf_engine": args.pdf_engine,
+            "max_output_tokens": args.max_output_tokens,
+        },
+        "per_arm": {
+            arm: {
+                "tried": per_arm_total.get(arm, 0),
+                "recovered": per_arm_recovered.get(arm, 0),
+                "still_failed": (
+                    per_arm_total.get(arm, 0) - per_arm_recovered.get(arm, 0)
+                ),
+                "recovery_rate": (
+                    per_arm_recovered.get(arm, 0) / per_arm_total[arm]
+                    if per_arm_total.get(arm) else 0.0
+                ),
+                "attempts_distribution": sorted(per_arm_attempts_dist.get(arm, [])),
+            }
+            for arm in sorted(per_arm_total)
+        },
+        "totals": {
+            "tried": sum(per_arm_total.values()),
+            "recovered": sum(per_arm_recovered.values()),
+            "still_failed": sum(per_arm_total.values()) - sum(per_arm_recovered.values()),
+        },
+    }
+    summary_path.write_text(
+        json.dumps(summary, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+
+    print()
+    print("=" * 78)
+    print("Retry pass summary")
+    print("=" * 78)
+    header = f"{'arm':<25} {'tried':>6} {'recovered':>10} {'still fail':>11} {'rate':>7}"
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(per_arm_total):
+        tried = per_arm_total[arm]
+        rec = per_arm_recovered.get(arm, 0)
+        rate = (rec / tried * 100) if tried else 0.0
+        print(f"{arm:<25} {tried:>6} {rec:>10} {tried - rec:>11} {rate:>6.1f}%")
+    total = sum(per_arm_total.values())
+    rec_total = sum(per_arm_recovered.values())
+    rate_total = (rec_total / total * 100) if total else 0.0
+    print("-" * len(header))
+    print(f"{'TOTAL':<25} {total:>6} {rec_total:>10} {total - rec_total:>11} "
+          f"{rate_total:>6.1f}%")
+    print()
+    print(f"Wrote {out_path.relative_to(REPO)}")
+    print(f"Wrote {summary_path.relative_to(REPO)}")
+    return 0
+
+
+def main() -> None:
+    """Parse the retry-pass CLI flags, run ``_run`` and exit with its status.
+
+    Raises:
+        SystemExit: Always; carries the integer returned by ``_run``, or
+            argparse's usage-error status when a flag is invalid.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--run-id", default="2026-05-14T00-53-19Z",
+        help="Run timestamp under data/multimodal_doc/runs/. Default is the "
+             "n=171 production run we wrote up in the blog.",
+    )
+    parser.add_argument("--max-attempts", type=int, default=5)
+    parser.add_argument("--base-delay", type=float, default=1.0,
+                        help="Base seconds for exponential backoff (default 1s).")
+    parser.add_argument("--max-delay", type=float, default=30.0,
+                        help="Cap on per-retry sleep (default 30s).")
+    parser.add_argument("--concurrency", type=int, default=2,
+                        help="Parallel retries in flight (default 2 — keep low "
+                             "to avoid the same transport stress that caused "
+                             "the original failures).")
+    parser.add_argument("--llm-model", default="anthropic/claude-sonnet-4.5")
+    parser.add_argument("--pdf-engine", default="native",
+                        choices=[e.value for e in PdfEngine])
+    parser.add_argument("--max-output-tokens", type=int, default=512)
+    parser.add_argument(
+        "--include-surfsense", action="store_true",
+        help="Also retry surfsense_agentic failures (requires backend + celery up). "
+             "Default is to skip them since the n=171 run had 0 SurfSense failures.",
+    )
+    args = parser.parse_args()
+    if args.max_attempts < 1:
+        # The retry helper ends with ``assert final is not None``; with no
+        # attempts there is presumably no result to return, so reject the
+        # value here with a usage error instead of an AssertionError mid-run.
+        parser.error("--max-attempts must be >= 1")
+    raise SystemExit(asyncio.run(_run(args)))
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/summarise_parser_compare_run.py
+++ b/surfsense_evals/scripts/summarise_parser_compare_run.py
@ -0,0 +1,122 @@
+"""Slice the parser_compare raw.jsonl for the n=171 run.
+
+Reports per-arm:
+  * tokens & cost stats (input/output mean, $/Q distribution)
+  * failures (status != ok or empty raw_text)
+  * answer_format breakdown (accuracy by str/int/float/list)
+
+Plus surfsense agentic breakdown so we can compare apples to apples
+even though the new_chat SSE doesn't surface per-call token counts.
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+
+# Grandparent directory of this script; every data path below hangs off it.
+REPO = Path(__file__).resolve().parents[1]
+# Output directory of the one parser_compare run this script summarises
+# (the run id is hard-coded).
+RUN_DIR = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+# One JSON object per line; main() reads "arm", "qid", "graded", "error",
+# "raw_text", "cost_usd", "usage", "latency_ms" and "answer_format" from each.
+RAW = RUN_DIR / "raw.jsonl"
+# Aggregate JSON; main() reads its ["metrics"]["per_arm"] mapping.
+ARTIFACT = RUN_DIR / "run_artifact.json"
+
+
+def main() -> None:
+    """Print per-arm summary tables for the hard-coded parser_compare run.
+
+    Reads every row of ``RAW`` and the aggregate ``ARTIFACT`` JSON, then
+    prints three sections to stdout: a per-arm table (accuracy, F1 taken from
+    the artifact, failures, cost, token and latency stats), an
+    accuracy-by-answer_format grid, and the cost figures stored in the
+    artifact.
+
+    A row counts towards the ``fail`` column when it has a truthy ``error``
+    or, failing that, a blank ``raw_text``.
+    """
+    rows = [json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines() if line.strip()]
+    print(f"raw rows: {len(rows)}")
+    print(f"unique questions: {len({row['qid'] for row in rows})}")
+
+    arm_metrics: dict[str, dict] = defaultdict(lambda: {
+        "n": 0, "n_correct": 0, "n_failed": 0, "n_empty": 0,
+        "costs": [], "in_tokens": [], "out_tokens": [], "latency_ms": [],
+        "by_format": defaultdict(lambda: {"n": 0, "correct": 0}),
+    })
+
+    for row in rows:
+        arm = row["arm"]
+        m = arm_metrics[arm]
+        m["n"] += 1
+        graded = row.get("graded") or {}
+        if graded.get("correct"):
+            m["n_correct"] += 1
+
+        # Errors and empty answers are tallied separately; a row with an
+        # error is never also counted as empty.
+        err = row.get("error")
+        raw_text = row.get("raw_text") or ""
+        if err:
+            m["n_failed"] += 1
+        elif not raw_text.strip():
+            m["n_empty"] += 1
+
+        cost = row.get("cost_usd")
+        if cost is not None:
+            m["costs"].append(float(cost))
+        # Missing or zero token counts / latencies are left out of the stats.
+        ut = row.get("usage") or {}
+        if ut.get("prompt_tokens"):
+            m["in_tokens"].append(ut["prompt_tokens"])
+        if ut.get("completion_tokens"):
+            m["out_tokens"].append(ut["completion_tokens"])
+        if row.get("latency_ms"):
+            m["latency_ms"].append(row["latency_ms"])
+
+        fmt = row.get("answer_format") or "unknown"
+        m["by_format"][fmt]["n"] += 1
+        if graded.get("correct"):
+            m["by_format"][fmt]["correct"] += 1
+
+    header = (
+        f"{'arm':<25} {'n':>4} {'acc%':>6} {'F1%':>6} {'fail':>5} {'$ mean':>10} "
+        f"{'$ median':>10} {'in tok mean':>12} {'out tok mean':>12} {'p50 ms':>8}"
+    )
+    # Size the rules from the header so they span the whole table.
+    rule = "=" * len(header)
+    print()
+    print(rule)
+    print(header)
+    print(rule)
+    art = json.loads(ARTIFACT.read_text(encoding="utf-8"))
+    per_arm_art = art["metrics"]["per_arm"]
+    for arm, m in sorted(arm_metrics.items()):
+        acc = m["n_correct"] / m["n"] * 100
+        # Failure = errored OR empty answer, as the module docstring defines.
+        fail = m["n_failed"] + m["n_empty"]
+        cost_mean = statistics.mean(m["costs"]) if m["costs"] else 0.0
+        cost_med = statistics.median(m["costs"]) if m["costs"] else 0.0
+        in_mean = statistics.mean(m["in_tokens"]) if m["in_tokens"] else 0
+        out_mean = statistics.mean(m["out_tokens"]) if m["out_tokens"] else 0
+        lat_p50 = statistics.median(m["latency_ms"]) if m["latency_ms"] else 0
+        # F1 is not recomputed here; it comes from the run artifact.
+        f1 = per_arm_art.get(arm, {}).get("f1_mean", 0.0) * 100
+        print(
+            f"{arm:<25} {m['n']:>4} {acc:>5.1f}% {f1:>5.1f}% {fail:>5} "
+            f"${cost_mean:>9.4f} ${cost_med:>9.4f} {in_mean:>12.0f} {out_mean:>12.0f} {lat_p50:>8.0f}"
+        )
+
+    print()
+    print("by answer_format (accuracy):")
+    formats = sorted({f for m in arm_metrics.values() for f in m["by_format"]})
+    # Build every cell first so all columns can share one width that fits
+    # both the format names and the widest "pct% (c/n)" cell.
+    grid: dict[str, list[str]] = {}
+    for arm, m in sorted(arm_metrics.items()):
+        cells = []
+        for fmt in formats:
+            tally = m["by_format"].get(fmt)
+            if not tally or tally["n"] == 0:
+                cells.append("-")
+            else:
+                pct = tally["correct"] / tally["n"] * 100
+                cells.append(f"{pct:>5.0f}% ({tally['correct']:>2}/{tally['n']:>2})")
+        grid[arm] = cells
+    width = max(
+        (len(text) for text in [*formats, *(c for cells in grid.values() for c in cells)]),
+        default=1,
+    )
+    fmt_header = f"{'arm':<25} " + " ".join(f"{fmt:>{width}}" for fmt in formats)
+    print(fmt_header)
+    print("-" * len(fmt_header))
+    for arm, cells in grid.items():
+        print(f"{arm:<25} " + " ".join(cell.rjust(width) for cell in cells))
+
+    print()
+    print(rule)
+    print("Aggregated cost (from run_artifact.json):")
+    for arm, row in per_arm_art.items():
+        print(
+            f"  {arm:<25}  acc={row['accuracy']*100:5.1f}% "
+            f"  $/Q LLM={row['llm_cost_per_q']:.4f}  "
+            f"  preprocess total=${row['preprocess_cost_total']:.2f}  "
+            f"  $/Q total={row['total_cost_per_q']:.4f}"
+        )
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/test_context_overflow_hypothesis.py
+++ b/surfsense_evals/scripts/test_context_overflow_hypothesis.py
@ -0,0 +1,155 @@
+"""Test the hypothesis: were the LC-arm errors actually context-window
+overflow errors disguised as SSL / network failures?
+
+If true, we'd expect:
+  (a) literal "prompt is too long" / "context_length_exceeded" / "exceeds .* tokens" strings,
+  (b) failures correlated with extraction size / input_tokens (large doc -> failure),
+  (c) failing requests near or over Sonnet 4.5's 200k input-token limit.
+
+If false (transport-layer hypothesis), we'd expect:
+  (a) only SSL / 502 / empty stream / JSONDecode strings,
+  (b) failures NOT correlated with size (uniform across PDFs by time, not by tokens),
+  (c) failing requests well below the 200k limit.
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+# Grandparent directory of this script; every data path below hangs off it.
+REPO = Path(__file__).resolve().parents[1]
+# Output directory of the one parser_compare run under analysis (run id is
+# hard-coded).
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+# One JSON object per line; main() reads "arm", "qid", "doc_id", "error",
+# "raw_text" and "graded" from each row.
+RAW = RUN / "raw.jsonl"
+# One JSON object per line; main() reads "doc_id" and the "extractions"
+# mapping (arm -> record with a "chars" count) from each.
+MANIFEST = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+
+# Substrings treated as evidence of a context-window overflow. main() tests
+# them with ``in`` against the lower-cased error text, so every entry must
+# itself be lower-case.
+CONTEXT_HINTS = (
+    "context_length",
+    "context window",
+    "prompt is too long",
+    "exceeds",
+    "maximum context",
+    "input tokens",
+    "too many tokens",
+    "over the maximum",
+    "200000",
+    "200_000",
+)
+
+
+def main() -> None:
+    """Print the four evidence sections for the context-overflow hypothesis.
+
+    Reads the run's ``RAW`` rows and the ``MANIFEST`` extraction sizes, then
+    prints: (a) rows whose error text contains any ``CONTEXT_HINTS``
+    substring, (b) extraction-size statistics for OK vs FAILED rows per
+    parser arm, (c) the largest extraction each arm handled successfully vs
+    unsuccessfully, and (d) the rows for the one document/arm pair known to
+    exceed the window.
+    """
+    records = []
+    for raw_line in RAW.read_text(encoding="utf-8").splitlines():
+        if raw_line.strip():
+            records.append(json.loads(raw_line))
+
+    # (doc_id, arm) -> extraction length in characters, 0 when unrecorded.
+    extraction_size: dict[tuple[str, str], int] = {}
+    for raw_line in MANIFEST.read_text(encoding="utf-8").splitlines():
+        if raw_line.strip():
+            entry = json.loads(raw_line)
+            for arm_name, info in (entry.get("extractions") or {}).items():
+                extraction_size[(entry["doc_id"], arm_name)] = int(info.get("chars") or 0)
+
+    banner = "=" * 80
+
+    print(banner)
+    print("(a) Literal 'context window' / 'prompt too long' error strings?")
+    print(banner)
+    hits = 0
+    for record in records:
+        message = (record.get("error") or "").lower()
+        # One report per row, no matter how many hints match.
+        if message and any(hint in message for hint in CONTEXT_HINTS):
+            print(f"  {record['arm']:<25} {record['qid']:<50}")
+            print(f"      -> {message[:240]}")
+            hits += 1
+    if hits == 0:
+        print("  none found.")
+
+    print()
+    print(banner)
+    print("(b) Extraction size for OK vs FAILED rows per arm")
+    print(banner)
+    parser_arms = (
+        "azure_basic_lc", "azure_premium_lc",
+        "llamacloud_basic_lc", "llamacloud_premium_lc",
+    )
+    arm_buckets: dict[str, dict[str, list[int]]] = {
+        name: {"ok": [], "fail": []} for name in parser_arms
+    }
+    for record in records:
+        arm_name = record["arm"]
+        if arm_name in parser_arms:
+            chars = extraction_size.get((record["doc_id"], arm_name), 0)
+            # A row is OK only with no error and a non-blank answer.
+            answered = not record.get("error") and bool((record.get("raw_text") or "").strip())
+            arm_buckets[arm_name]["ok" if answered else "fail"].append(chars)
+
+    print(f"{'arm':<25} {'bucket':<5} {'n':>4} {'mean chars':>12} {'median':>10} {'max':>10}")
+    for arm_name in parser_arms:
+        for label in ("ok", "fail"):
+            values = arm_buckets[arm_name][label]
+            if values:
+                mean_chars = statistics.mean(values)
+                median_chars = statistics.median(values)
+                largest = max(values)
+                print(
+                    f"  {arm_name:<23} {label:<5} {len(values):>4} "
+                    f"{mean_chars:>12,.0f} "
+                    f"{median_chars:>10,.0f} "
+                    f"{largest:>10,}"
+                )
+            else:
+                print(f"  {arm_name:<23} {label:<5} {0:>4}  -")
+
+    print()
+    print(banner)
+    print("(c) Largest extraction each arm processed *successfully* vs *failed*")
+    print(banner)
+    print(
+        "(Sonnet 4.5 input limit ~200k tokens ~= 800k chars. If failures were "
+        "context-overflow, max-OK would be near that cap. If max-OK is well "
+        "above max-FAIL, the model handled bigger contexts than the failed "
+        "ones, so size cannot be the cause.)"
+    )
+    print()
+    for arm_name in parser_arms:
+        succeeded = arm_buckets[arm_name]["ok"]
+        failed = arm_buckets[arm_name]["fail"]
+        # Arms with no successful row are left out of this comparison.
+        if succeeded:
+            max_ok = max(succeeded)
+            max_fail = max(failed) if failed else 0
+            print(
+                f"  {arm_name:<25} max OK = {max_ok:>10,} chars (~{max_ok / 4:>7,.0f} tokens)  "
+                f"max FAIL = {max_fail:>10,} chars (~{max_fail / 4:>7,.0f} tokens)"
+            )
+
+    print()
+    print(banner)
+    print("(d) Did the *known* overflow candidate fail?")
+    print(banner)
+    print(
+        "  3M_2018_10K x llamacloud_premium = 908,733 chars (~227k tokens) "
+        "-- this is above Sonnet 4.5's 200k window."
+    )
+    print("  If transport hypothesis is correct, this should still fail with a "
+          "real overflow error.")
+    print("  If transport hypothesis is correct AND the model truncates silently, "
+          "it might 'succeed' but be wrong.")
+    print()
+    for record in records:
+        if not (
+            record["doc_id"] == "3M_2018_10K.pdf"
+            and record["arm"] == "llamacloud_premium_lc"
+        ):
+            continue
+        message = record.get("error") or "(none)"
+        verdict = record.get("graded") or {}
+        print(
+            f"  {record['qid']:<40} correct={verdict.get('correct')!s:<5}  "
+            f"err={message[:100]}"
+        )
+
+
+if __name__ == "__main__":
+    main()