feat(evals): publish multimodal_doc parser_compare benchmark + n=171 report

Adds the full parser_compare experiment for the multimodal_doc suite:
six arms compared on 30 PDFs / 171 questions from MMLongBench-Doc with
anthropic/claude-sonnet-4.5 across the board.

Source code:
- core/parsers/{azure_di,llamacloud,pdf_pages}.py: direct parser SDK
  callers (Azure Document Intelligence prebuilt-read/layout, LlamaParse
  parse_page_with_llm/parse_page_with_agent) used by the LC arms,
  bypassing the SurfSense backend so each (basic/premium) extraction
  is a clean A/B independent of backend ETL routing.
- suites/multimodal_doc/parser_compare/{ingest,runner,prompt}.py:
  six-arm benchmark (native_pdf, azure_basic_lc, azure_premium_lc,
  llamacloud_basic_lc, llamacloud_premium_lc, surfsense_agentic) with
  byte-identical prompts per question, deterministic grader, Wilson
  CIs, and the per-page preprocessing tariff cost overlay.

Reproducibility:
- pyproject.toml + uv.lock pin pypdf, azure-ai-documentintelligence,
  llama-cloud-services as new deps.
- .env.example documents the AZURE_DI_* and LLAMA_CLOUD_API_KEY env
  vars now required for parser_compare.
- 12 analysis scripts under scripts/: retry pass with exponential
  backoff, post-retry accuracy merge, McNemar / latency / per-PDF
  stats, context-overflow hypothesis test, etc. Each produces one
  number cited by the blog report.

Citation surface:
- reports/blog/multimodal_doc_parser_compare_n171_report.md: 1219-line
  technical writeup (16 sections) covering headline accuracy, per-format
  accuracy, McNemar pairwise significance, latency / token / per-PDF
  distributions, error analysis, retry experiment, post-retry final
  accuracy, cost amortization model with closed-form derivation, threats
  to validity, and reproducibility appendix.
- data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/{raw,
  raw_retries,raw_post_retry}.jsonl + run_artifact.json + retry summary
  whitelisted via data/.gitignore as the verifiable numbers source.

Gitignore:
- ignore logs_*.txt + retry_run.log; structured artifacts cover the
  citation surface, debug logs are noise.
- data/.gitignore default-ignores everything, whitelists the n=171 run
  artifacts only (parser manifest left ignored to avoid leaking local
  Windows usernames in absolute paths; manifest is fully regenerable
  via 'ingest multimodal_doc parser_compare').
- reports/.gitignore now whitelists hand-curated reports/blog/.

Also retires the abandoned CRAG Task 3 implementation (download script,
streaming Task 3 ingest, CragTask3Benchmark + tests) and trims the
runner / ingest module APIs to match.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-05-14 19:54:41 -07:00
parent 3737118050
commit 9bcd50164d
40 changed files with 9303 additions and 993 deletions

View file

@ -0,0 +1,125 @@
"""Were the SSL failures clustered in time (network blip) or evenly
distributed (sustained limit)? Time is approximated by each row's
sequential position within its arm: failures whose positions are at most
5 questions apart are grouped into the same cluster.
Also: for the one *real* intrinsic failure -- the 30MB Anthropic limit
on 2405.09818v1.pdf::Q007 -- print the full error message + estimated
payload sizes so the blog has a clean root cause.
"""
from __future__ import annotations
import json
from collections import Counter, defaultdict
from pathlib import Path
# Two levels above this file: the parent of the directory holding this script.
REPO = Path(__file__).resolve().parents[1]
# Artifacts of the one benchmark run this script inspects.
RUN = REPO.joinpath(
    "data", "multimodal_doc", "runs", "2026-05-14T00-53-19Z", "parser_compare"
)
RAW = RUN.joinpath("raw.jsonl")
# Directory searched for the PDF whose on-disk size is reported.
PDFS = REPO.joinpath("data", "multimodal_doc", "mmlongbench", "pdfs")
def _group_runs(indices: list[int], max_gap: int = 5) -> list[list[int]]:
    """Split ascending ``indices`` into runs of near-neighbours.

    Consecutive values stay in the same run while they differ by at most
    ``max_gap``; a larger jump starts a new run. Empty input gives ``[]``.
    """
    runs: list[list[int]] = []
    for value in indices:
        if runs and value - runs[-1][-1] <= max_gap:
            runs[-1].append(value)
        else:
            runs.append([value])
    return runs


def main() -> None:
    """Print three failure reports for the run stored at ``RAW``.

    1. Per arm, the sequential positions of every non-ok row and how those
       positions group into runs (neighbours at most 5 positions apart).
    2. Details for ``2405.09818v1.pdf::Q007`` on the ``native_pdf`` arm,
       including the PDF's size on disk when the file exists under ``PDFS``.
    3. Per PDF, the arms that failed and a count of each failure kind.
    """
    rows = [
        json.loads(line)
        for line in RAW.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]

    # 1) Clustering: failures by sequential position within each arm.
    by_arm_idx: dict[str, list[tuple[int, str]]] = defaultdict(list)
    arm_seen_count: dict[str, int] = defaultdict(int)
    for row in rows:
        arm = row["arm"]
        idx = arm_seen_count[arm]
        arm_seen_count[arm] += 1
        err = row.get("error") or ""
        err_lower = err.lower()
        # First matching rule wins; the order is significant.
        if "SSLError" in err:
            cluster = "ssl"
        elif not err and not (row.get("raw_text") or "").strip():
            cluster = "empty"
        elif "502" in err or "503" in err:
            cluster = "5xx"
        elif "exceeds" in err_lower and "limit" in err_lower:
            cluster = "size_limit"
        elif err:
            cluster = "other_err"
        else:
            cluster = "ok"
        if cluster != "ok":
            by_arm_idx[arm].append((idx, cluster))

    print("=" * 80)
    print("SSL/network-error indices per arm (each arm processes 171 questions in")
    print("order; index = sequential position within that arm). Tight clustering")
    print("in time = transient blip, even spread = sustained limit.")
    print("=" * 80)
    # Keys exist only for arms with at least one failure, so no list is empty.
    for arm in sorted(by_arm_idx):
        items = by_arm_idx[arm]
        idxs = sorted({i for i, _ in items})
        print(f"\n{arm}: {len(items)} failures at indices {idxs}")
        # within 5 questions = same time window
        cluster_runs = _group_runs(idxs, max_gap=5)
        print(f" clusters (gap<=5): {len(cluster_runs)}: {cluster_runs}")

    # 2) The 30MB intrinsic failure — full details
    print()
    print("=" * 80)
    print("Intrinsic failure: 30MB Anthropic input limit on 2405.09818v1.pdf::Q007")
    print("=" * 80)
    for row in rows:
        if row["qid"] == "2405.09818v1.pdf::Q007" and row["arm"] == "native_pdf":
            err = row.get("error") or ""
            print(f" qid: {row['qid']}")
            print(f" doc: {row['doc_id']}, pages: {row.get('pages')}")
            pdf_path = PDFS / row["doc_id"]
            if pdf_path.exists():
                size_mb = pdf_path.stat().st_size / (1024 * 1024)
                print(f" PDF size on disk: {size_mb:.1f} MB")
                # base64 inflates ~33%
                est_b64 = size_mb * 1.33
                print(f" estimated base64 wire size: {est_b64:.1f} MB")
            print(f" full error: {err[:600]}")
            break  # only the first matching row is reported

    # 3) Per-PDF: which PDFs are pathological?
    print()
    print("=" * 80)
    print("Per-PDF failure breakdown across all 6 arms (only PDFs with failures)")
    print("=" * 80)
    by_pdf: dict[str, list[dict]] = defaultdict(list)
    for row in rows:
        err = row.get("error") or ""
        empty = not (row.get("raw_text") or "").strip()
        if not (err or empty):
            continue
        err_lower = err.lower()
        # Different precedence from section 1: size limit is checked before 5xx.
        if "SSLError" in err:
            err_kind = "ssl"
        elif "exceeds" in err_lower and "limit" in err_lower:
            err_kind = "size_limit"
        elif "502" in err or "503" in err:
            err_kind = "5xx"
        elif "JSONDecodeError" in err:
            err_kind = "json_decode"
        elif empty and not err:
            err_kind = "empty"
        else:
            err_kind = "other"
        by_pdf[row["doc_id"]].append({
            "arm": row["arm"],
            "qid": row["qid"],
            "err_kind": err_kind,
            "pages": row.get("pages"),
        })
    # Most failures first; ties broken by document id.
    for doc, items in sorted(by_pdf.items(), key=lambda x: (-len(x[1]), x[0])):
        kinds = Counter(i["err_kind"] for i in items)
        arms = sorted({i["arm"] for i in items})
        pages = items[0]["pages"]
        print(f" {doc} pages={pages} failures={len(items)} arms={arms}")
        print(f" kinds: {dict(kinds)}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,155 @@
"""Drill into the parser_compare n=171 raw.jsonl to surface every
failure, group by arm + PDF, and dump the underlying error strings so
we can write up a clean failure-mode taxonomy for the blog post.
Outputs (printed to stdout + written to `failures_n171.json`):
* per-arm failure count and rate
* per-PDF failure count across all arms (which docs are pathological?)
* error-string clusters per arm (so we can give human-readable causes)
* sample failure rows (one per cluster) for the appendix
"""
from __future__ import annotations
import json
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any
# Two levels above this file: the parent of the directory holding this script.
REPO = Path(__file__).resolve().parents[1]
# Input: raw per-question rows of the analysed run.
RUN = REPO.joinpath(
    "data", "multimodal_doc", "runs", "2026-05-14T00-53-19Z", "parser_compare"
)
RAW = RUN.joinpath("raw.jsonl")
# Output: JSON summary of every failure.
OUT = REPO.joinpath("scripts", "failures_n171.json")
def _classify(error: str | None, raw_text: str) -> str:
"""Coarse-grained bucket for an error message."""
blob = (error or "").lower()
if not blob and not raw_text.strip():
return "empty_response"
if "rate limit" in blob or "429" in blob:
return "rate_limit"
if "context_length" in blob or "context window" in blob or "too many tokens" in blob:
return "context_overflow"
if "could not process image" in blob or "invalid image" in blob:
return "image_decode_failure"
if "could not process pdf" in blob or "invalid_request_error" in blob and "pdf" in blob:
return "pdf_decode_failure"
if "timeout" in blob or "timed out" in blob:
return "timeout"
if "5xx" in blob or "internal server error" in blob or "503" in blob or "502" in blob:
return "provider_5xx"
if "filenotfound" in blob:
return "missing_extraction"
if "badrequest" in blob:
return "provider_400"
if blob:
return "other_error"
return "unknown"
def main() -> None:
    """Report every failed row in ``RAW`` and dump the details to ``OUT``.

    A row counts as a failure when it carries an ``error`` or its
    ``raw_text`` is blank. Prints per-arm rates, per-arm clusters with one
    example each, PDFs with at least two failures, and every ``native_pdf``
    failure; then writes the full breakdown as JSON.
    """
    records = []
    for raw_line in RAW.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            records.append(json.loads(raw_line))

    seen_per_arm: dict[str, int] = defaultdict(int)
    failed_by_arm: dict[str, list[dict]] = defaultdict(list)
    failed_by_pdf: dict[str, list[dict]] = defaultdict(list)
    clustered: dict[str, dict[str, list[dict]]] = defaultdict(
        lambda: defaultdict(list)
    )

    for record in records:
        arm = record["arm"]
        seen_per_arm[arm] += 1
        error = record.get("error")
        text = record.get("raw_text") or ""
        if not error and text.strip():
            continue  # answered without an error: not a failure
        label = _classify(error, text)
        failure = {
            "qid": record["qid"],
            "doc_id": record["doc_id"],
            "answer_format": record["answer_format"],
            "gold": record["gold"],
            "error": error,
            "cluster": label,
            "raw_text_len": len(text),
            "pages": record.get("pages"),
        }
        failed_by_arm[arm].append(failure)
        failed_by_pdf[record["doc_id"]].append({**failure, "arm": arm})
        clustered[arm][label].append(failure)

    rule = "=" * 90

    print(rule)
    print("Per-arm failure count & rate")
    print(rule)
    print(f"{'arm':<25} {'n':>4} {'fail':>5} {'rate%':>6}")
    for arm in sorted(seen_per_arm):
        n_failed = len(failed_by_arm[arm])
        n_total = seen_per_arm[arm]
        print(f"{arm:<25} {n_total:>4} {n_failed:>5} {n_failed / n_total * 100:>5.1f}%")

    print()
    print(rule)
    print("Failure clusters per arm")
    print(rule)
    for arm in sorted(clustered):
        print(f"\n{arm}:")
        for label in sorted(clustered[arm]):
            members = clustered[arm][label]
            print(f" {label:<22} {len(members):>3}")
            first = members[0]
            snippet = (first["error"] or "")[:200].replace("\n", " ")
            print(f" example: {first['qid']} doc={first['doc_id']} pages={first['pages']}")
            print(f" error: {snippet}")

    print()
    print(rule)
    print("Per-PDF failure totals (PDFs with >=2 failures)")
    print(rule)
    totals = Counter({pdf: len(entries) for pdf, entries in failed_by_pdf.items()})
    # most_common() is sorted by count, so the first count below 2 ends the list.
    for pdf, count in totals.most_common():
        if count < 2:
            break
        entries = failed_by_pdf[pdf]
        arms_failed = sorted({entry["arm"] for entry in entries})
        pages = entries[0].get("pages")
        print(f" {pdf} pages={pages} failures={count} arms={arms_failed}")

    print()
    print(rule)
    print("All native_pdf failures (one row per failure)")
    print(rule)
    for failure in failed_by_arm.get("native_pdf", []):
        shown = (failure["error"] or "(no error string)")[:240].replace("\n", " ")
        print(f" {failure['qid']} doc={failure['doc_id']} pages={failure['pages']} cluster={failure['cluster']}")
        print(f" err: {shown}")

    per_arm = {}
    for arm in sorted(seen_per_arm):
        arm_failures = failed_by_arm[arm]
        per_arm[arm] = {
            "n": seen_per_arm[arm],
            "failures": len(arm_failures),
            "rate": len(arm_failures) / seen_per_arm[arm],
            "clusters": {
                label: len(members) for label, members in clustered[arm].items()
            },
            "rows": arm_failures,
        }
    # Per-PDF entries already carry their arm; copy them as-is.
    per_pdf = {
        pdf: [dict(entry) for entry in entries]
        for pdf, entries in failed_by_pdf.items()
    }
    summary: dict[str, Any] = {"per_arm": per_arm, "per_pdf": per_pdf}
    OUT.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print(f"\nWrote: {OUT}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,60 @@
"""Sanity check extraction sizes against Sonnet 4.5's context window.
Sonnet 4.5 supports ~200k tokens. As a *very* rough heuristic, English
markdown is ~4 chars/token, so anything over ~750k chars likely won't
fit alongside the system + question + 512 max_output_tokens. Print
warnings for any extraction that's at risk.
"""
from __future__ import annotations
import json
from pathlib import Path
# Two levels above this file: the parent of the directory holding this script.
REPO = Path(__file__).resolve().parents[1]
# Manifest with one JSON row per PDF, including per-arm extraction sizes.
MAP = REPO.joinpath(
    "data", "multimodal_doc", "maps", "parser_compare_doc_map.jsonl"
)

# Rough heuristic used to convert between characters and tokens.
CHARS_PER_TOKEN = 4
CTX_TOKENS = 200_000
# system + question + format hint
PROMPT_OVERHEAD_TOKENS = 1_000
MAX_OUTPUT_TOKENS = 512
# Characters left for the extraction once prompt and output are reserved.
SAFE_CHARS = CHARS_PER_TOKEN * (
    CTX_TOKENS - PROMPT_OVERHEAD_TOKENS - MAX_OUTPUT_TOKENS
)
def main() -> None:
    """Report the largest extraction per arm and flag any above ``SAFE_CHARS``.

    Reads the manifest at ``MAP`` (one JSON object per line) and uses each
    row's ``extractions[arm]["chars"]`` value; missing values count as 0.
    """
    manifest = []
    for raw_line in MAP.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            manifest.append(json.loads(raw_line))
    total = len(manifest)

    biggest: dict[str, tuple[int, str]] = {}
    at_risk: list[tuple[str, str, int]] = []
    for entry in manifest:
        extractions = entry.get("extractions") or {}
        for arm, ext in extractions.items():
            chars = int(ext.get("chars") or 0)
            # Keep the first-seen document on ties (strictly-greater update).
            if arm not in biggest or biggest[arm][0] < chars:
                biggest[arm] = (chars, entry["doc_id"])
            if chars > SAFE_CHARS:
                at_risk.append((entry["doc_id"], arm, chars))

    print(f"PDFs in manifest: {total}")
    print(f"safe char budget: {SAFE_CHARS:,} (~{(SAFE_CHARS // CHARS_PER_TOKEN):,} tokens)")
    print()
    print("largest extraction per arm:")
    for arm, (chars, doc_id) in sorted(biggest.items()):
        print(f" {arm:25s} {chars:>10,} chars ({doc_id})")
    print()

    if not at_risk:
        print("no overflow risk — all extractions fit Sonnet 4.5's 200k context.")
        return
    print(f"OVERFLOW RISK ({len(at_risk)} extractions > safe budget):")
    for doc_id, arm, chars in at_risk:
        est_tokens = chars // CHARS_PER_TOKEN
        print(f" {doc_id} :: {arm} :: {chars:,} chars (~{est_tokens:,} tokens)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,77 @@
"""Query SurfSense for the status of every MMLongBench PDF in scope.
Uses the existing SurfSense documents client to query
``/documents/status?document_ids=...`` for both the known-existing 5
PDFs (doc ids 5219-5223) and the recently-uploaded mmlongbench batch
(doc ids 7577-7624).
"""
from __future__ import annotations
import asyncio
import os
from pathlib import Path
import httpx
from dotenv import load_dotenv
# Two levels above this file: the parent of the directory holding this script.
REPO = Path(__file__).resolve().parents[1]
# Directory whose *.pdf files are matched against document titles.
PDF_DIR = REPO.joinpath("data", "multimodal_doc", "mmlongbench", "pdfs")
async def main() -> None:
    """Query the documents-status endpoint and print the state of each local PDF.

    Loads ``REPO/.env``, then reads ``SURFSENSE_API_BASE`` (default
    ``http://localhost:8000``) and ``SURFSENSE_JWT`` from the environment.
    Returned items are matched to the ``*.pdf`` files in ``PDF_DIR`` by
    title; PDFs with no matching item are listed as missing.

    Raises:
        SystemExit: if ``SURFSENSE_JWT`` is not set.
        httpx.HTTPStatusError: if the endpoint answers with an error status
            (raised by ``raise_for_status``).
    """
    load_dotenv(REPO / ".env")
    base = os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/")
    token = os.environ.get("SURFSENSE_JWT")
    if not token:
        raise SystemExit("SURFSENSE_JWT missing from .env")

    pdf_names = sorted(p.name for p in PDF_DIR.glob("*.pdf"))
    print(f"local cached PDFs: {len(pdf_names)}")

    # Hard-coded id windows: the 5 known documents plus the uploaded batch.
    candidate_ids = list(range(5219, 5224)) + list(range(7577, 7625))
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
    }
    async with httpx.AsyncClient(timeout=30.0) as http:
        r = await http.get(
            f"{base}/api/v1/documents/status",
            params={
                "search_space_id": 55,
                "document_ids": ",".join(str(d) for d in candidate_ids),
            },
            headers=headers,
        )
        r.raise_for_status()
        items = r.json().get("items", [])

    by_title: dict[str, dict] = {}
    for it in items:
        status = it.get("status") or {}
        # A null title is stored under "" so it can never shadow a file name.
        by_title[it.get("title") or ""] = {
            "id": it.get("id"),
            "state": status.get("state"),
            "reason": status.get("reason"),
        }

    by_state: dict[str, int] = {}
    print()
    for name in pdf_names:
        info = by_title.get(name)
        if info is None:
            print(f" [missing ] {name}")
            by_state["missing"] = by_state.get("missing", 0) + 1
            continue
        tag = str(info["state"] or "?")
        # Format the id as text: applying ">5" to None raises TypeError.
        doc_id = "?" if info["id"] is None else str(info["id"])
        print(f" [{tag:13s}] doc_id={doc_id:>5} {name}")
        by_state[tag] = by_state.get(tag, 0) + 1

    print()
    print("summary:")
    for k, v in sorted(by_state.items()):
        print(f" {k}: {v}")


if __name__ == "__main__":
    asyncio.run(main())

View file

@ -0,0 +1,112 @@
"""Compute "intrinsic" accuracy by removing transient network errors.
A failure is *transient* if it's:
* SSLError: SSL bad-record-mac (TLS hiccup)
* Cloudflare 502 / 503 (provider-side load shedding)
* empty_response with no error string and no other signal (likely
connection reset mid-stream)
* JSONDecodeError (parse error mid-stream)
A failure is *intrinsic* if it's a hard limit:
* "exceeds .* limit" (size limits)
* context_length errors
* provider 400 with image / pdf decode failure
* malformed-input failures
We re-compute accuracy with two denominators:
* raw acc = correct / 171 (what the headline reports)
* adjusted acc = correct / (171 - transient_failures) (intrinsic)
Outputs a table that we can drop straight into the blog.
"""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
# Two levels above this file: the parent of the directory holding this script.
REPO = Path(__file__).resolve().parents[1]
# Raw per-question rows of the analysed run.
RUN = REPO.joinpath(
    "data", "multimodal_doc", "runs", "2026-05-14T00-53-19Z", "parser_compare"
)
RAW = RUN.joinpath("raw.jsonl")
# Lower-case substrings that mark an error as transient (network / provider).
TRANSIENT_HINTS = (
    "sslv3_alert_bad_record_mac",
    "ssl_alert_bad_record_mac",
    "ssl: ssl",
    "cloudflare",
    "error 502",
    "error 503",
    "bad gateway",
    "service unavailable",
    "gateway timeout",
    "jsondecodeerror",
)
# Lower-case substrings that mark an error as a hard, intrinsic limit.
INTRINSIC_HINTS = (
    "exceeds",
    "context_length",
    "context window",
    "could not process pdf",
    "could not process image",
)


def classify(error: str | None, raw_text: str) -> str:
    """Label one row as ok, transient, intrinsic, or other.

    With no error text the row is ``"ok"`` when ``raw_text`` is non-blank
    and ``"transient_empty"`` otherwise. With error text, transient hints
    are checked before intrinsic ones, and anything unmatched is
    ``"other_error"``. Matching is case-insensitive.
    """
    lowered = error.lower() if error else ""
    if lowered:
        ordered_rules = (
            ("transient_ssl_or_5xx", TRANSIENT_HINTS),
            ("intrinsic_limit", INTRINSIC_HINTS),
        )
        for label, hints in ordered_rules:
            for hint in hints:
                if hint in lowered:
                    return label
        return "other_error"
    return "ok" if raw_text.strip() else "transient_empty"
def main() -> None:
    """Print raw and transient-adjusted accuracy for every arm in ``RAW``.

    Raw accuracy divides correct answers by all rows of the arm; adjusted
    accuracy removes rows classified as transient from the denominator.
    """
    records = []
    for raw_line in RAW.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            records.append(json.loads(raw_line))

    def fresh_tally() -> dict:
        return {
            "n": 0, "correct": 0,
            "transient_ssl_or_5xx": 0, "transient_empty": 0,
            "intrinsic_limit": 0, "other_error": 0,
        }

    tallies: dict[str, dict] = defaultdict(fresh_tally)
    for record in records:
        tally = tallies[record["arm"]]
        tally["n"] += 1
        if (record.get("graded") or {}).get("correct"):
            tally["correct"] += 1
        kind = classify(record.get("error"), record.get("raw_text") or "")
        if kind != "ok":
            tally[kind] += 1

    print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")
    print("-" * 88)
    for arm in sorted(tallies):
        tally = tallies[arm]
        raw = tally["correct"] / tally["n"] * 100
        transient = tally["transient_ssl_or_5xx"] + tally["transient_empty"]
        intrinsic = tally["intrinsic_limit"]
        other = tally["other_error"]
        usable = tally["n"] - transient
        # Guard against an arm whose every row was transient.
        if usable:
            adj = tally["correct"] / usable * 100
        else:
            adj = 0
        print(
            f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"
        )

    print()
    print("transient = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would")
    print(" succeed on retry; eval harness has no built-in retry today).")
    print("intrinsic = hard limit (e.g. >30MB Anthropic request, model context overflow).")
    print("adj acc% = correct / (n - transient) — what the arm scores when network noise")
    print(" is removed; closest thing we have to a like-for-like quality number.")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,381 @@
"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
per-PDF heterogeneity, latency/token distribution percentiles.
Reads the merged post-retry artifact:
data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
Outputs to stdout:
1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
2) Per-arm input/output token distribution (mean, p50, p95, max).
3) McNemar pairwise table: for every (arm_i, arm_j) ordered pair on the
same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and
b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial
two-sided p-value. We include both raw (using the original raw.jsonl)
and post-retry results.
4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
Pure stdlib no scipy/numpy.
"""
from __future__ import annotations
import argparse
import json
import math
import statistics
from pathlib import Path
# Two levels above this file: the parent of the directory holding this script.
REPO = Path(__file__).resolve().parents[1]
# ---------------------------------------------------------------------------
# I/O
# ---------------------------------------------------------------------------
def _read_jsonl(path: Path) -> list[dict]:
out: list[dict] = []
with path.open("r", encoding="utf-8") as fh:
for line in fh:
line = line.strip()
if not line:
continue
out.append(json.loads(line))
return out
# ---------------------------------------------------------------------------
# Distribution helpers
# ---------------------------------------------------------------------------
def _percentile(values: list[float], p: float) -> float:
"""Linear-interpolation percentile (p in [0, 100])."""
if not values:
return 0.0
s = sorted(values)
if len(s) == 1:
return float(s[0])
k = (len(s) - 1) * (p / 100.0)
lo, hi = math.floor(k), math.ceil(k)
if lo == hi:
return float(s[int(k)])
return float(s[lo] + (s[hi] - s[lo]) * (k - lo))
# ---------------------------------------------------------------------------
# McNemar exact-binomial p-value
# ---------------------------------------------------------------------------
def _binom_coef(n: int, k: int) -> int:
if k < 0 or k > n:
return 0
return math.comb(n, k)
def _mcnemar_exact_pvalue(b: int, c: int) -> float:
"""Two-sided exact-binomial McNemar p-value.
Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong)
on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5).
The two-sided p-value is
P(X <= min(b, c)) + P(X >= max(b, c))
computed exactly (cheap because b+c <= 27 in our run).
"""
n = b + c
if n == 0:
return 1.0
k = min(b, c)
# Two-sided exact: 2 * P(X <= k) clipped at 1.0
cdf = sum(_binom_coef(n, i) for i in range(k + 1))
p = 2.0 * cdf / (2 ** n)
return min(1.0, p)
def _mcnemar_table(rows: list[dict]) -> dict:
    """Build the pairwise McNemar table for every unordered pair of arms.

    Each row contributes ``bool(graded["correct"])`` for its (qid, arm); a
    later row for the same (qid, arm) overwrites an earlier one. For each
    pair only questions answered by both arms are counted. ``n_qids`` is
    the number of distinct qids seen across all rows.
    """
    outcome: dict[str, dict[str, bool]] = {}
    arm_names: set[str] = set()
    for rec in rows:
        qid = rec["qid"]
        arm = rec["arm"]
        verdict = bool((rec.get("graded") or {}).get("correct"))
        outcome.setdefault(qid, {})[arm] = verdict
        arm_names.add(arm)

    arms = sorted(arm_names)
    qids = sorted(outcome)
    pairs: list[dict] = []
    table: dict[str, dict] = {"arms": arms, "n_qids": len(qids), "pairs": pairs}

    for pos, first in enumerate(arms):
        for second in arms[pos + 1:]:
            # Tally keyed by (first correct?, second correct?).
            tally = {
                (True, False): 0,
                (False, True): 0,
                (True, True): 0,
                (False, False): 0,
            }
            for qid in qids:
                per_arm = outcome[qid]
                if first in per_arm and second in per_arm:
                    tally[(per_arm[first], per_arm[second])] += 1
            only_first = tally[(True, False)]
            only_second = tally[(False, True)]
            pairs.append({
                "arm_i": first, "arm_j": second,
                "b_i_only": only_first, "c_j_only": only_second,
                "both_correct": tally[(True, True)],
                "both_wrong": tally[(False, False)],
                "p_value": _mcnemar_exact_pvalue(only_first, only_second),
            })
    return table
# ---------------------------------------------------------------------------
# Per-PDF heterogeneity
# ---------------------------------------------------------------------------
def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
    """Summarise, per arm, how accuracy varies from PDF to PDF.

    A PDF's accuracy is the share of its rows with a truthy
    ``graded["correct"]``. The summary covers mean, sample standard
    deviation (0.0 for a single PDF), min/max, quartiles, and the number
    of PDFs at exactly 0% and 100%.
    """
    verdicts: dict[str, dict[str, list[bool]]] = {}
    for rec in rows:
        arm = rec["arm"]
        pdf = rec["doc_id"]
        is_correct = bool((rec.get("graded") or {}).get("correct"))
        verdicts.setdefault(arm, {}).setdefault(pdf, []).append(is_correct)

    summary: dict[str, dict] = {}
    for arm, per_pdf in verdicts.items():
        accs = [sum(flags) / len(flags) for flags in per_pdf.values() if flags]
        if not accs:
            continue
        spread = statistics.stdev(accs) if len(accs) > 1 else 0.0
        summary[arm] = {
            "n_pdfs": len(accs),
            "mean": statistics.mean(accs),
            "std": spread,
            "min": min(accs),
            "max": max(accs),
            "p25": _percentile(accs, 25),
            "p50": _percentile(accs, 50),
            "p75": _percentile(accs, 75),
            "n_pdfs_zero": sum(1 for acc in accs if acc == 0.0),
            "n_pdfs_perfect": sum(1 for acc in accs if acc == 1.0),
        }
    return summary
# ---------------------------------------------------------------------------
# Latency / token distributions
# ---------------------------------------------------------------------------
def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
    """Summarise each arm's ``latency_ms`` values, reported in seconds.

    Rows whose latency is missing or zero are left out. ``cv`` is the
    coefficient of variation (std / mean), 0.0 when it cannot be computed.
    """
    samples: dict[str, list[float]] = {}
    for rec in rows:
        latency = rec.get("latency_ms")
        if latency is not None and latency != 0:
            samples.setdefault(rec["arm"], []).append(float(latency))

    summary: dict[str, dict] = {}
    for arm, lats in samples.items():
        mean_ms = statistics.mean(lats)
        # Sample standard deviation needs at least two observations.
        std_ms = statistics.stdev(lats) if len(lats) > 1 else None
        summary[arm] = {
            "n": len(lats),
            "mean_s": mean_ms / 1000,
            "std_s": (std_ms / 1000) if std_ms is not None else 0.0,
            "p10_s": _percentile(lats, 10) / 1000,
            "p25_s": _percentile(lats, 25) / 1000,
            "p50_s": _percentile(lats, 50) / 1000,
            "p75_s": _percentile(lats, 75) / 1000,
            "p90_s": _percentile(lats, 90) / 1000,
            "p95_s": _percentile(lats, 95) / 1000,
            "p99_s": _percentile(lats, 99) / 1000,
            "max_s": max(lats) / 1000,
            # Unitless spread: std / mean.
            "cv": (
                std_ms / mean_ms
                if std_ms is not None and mean_ms > 0 else 0.0
            ),
        }
    return summary
def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
    """Summarise each arm's input and output token counts.

    Missing or zero counts are skipped. An arm's entry holds an ``"input"``
    and/or ``"output"`` summary (n, mean, p50, p95, max), depending on which
    of the two it has data for.
    """
    fields = (("input", "input_tokens"), ("output", "output_tokens"))

    def describe(vals: list[float]) -> dict:
        return {
            "n": len(vals),
            "mean": statistics.mean(vals),
            "p50": _percentile(vals, 50),
            "p95": _percentile(vals, 95),
            "max": max(vals),
        }

    series: dict[str, dict[str, list[float]]] = {"input": {}, "output": {}}
    for rec in rows:
        counts = [(kind, rec.get(field) or 0) for kind, field in fields]
        for kind, count in counts:
            if count:
                series[kind].setdefault(rec["arm"], []).append(float(count))

    report: dict[str, dict] = {}
    for arm in sorted(set(series["input"]) | set(series["output"])):
        # Every arm here has data for at least one of the two kinds.
        report[arm] = {
            kind: describe(series[kind][arm])
            for kind, _ in fields
            if arm in series[kind]
        }
    return report
# ---------------------------------------------------------------------------
# Pretty-printing
# ---------------------------------------------------------------------------
def _print_latency(title: str, lat: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'n':>4} {'mean':>7} {'std':>7} "
f"{'p50':>7} {'p90':>7} {'p95':>7} {'p99':>7} {'max':>7} {'CV':>5}")
print(header)
print("-" * len(header))
for arm in sorted(lat, key=lambda a: lat[a]["mean_s"]):
s = lat[arm]
print(f"{arm:<25} {s['n']:>4} "
f"{s['mean_s']:>6.1f}s {s['std_s']:>6.1f}s "
f"{s['p50_s']:>6.1f}s {s['p90_s']:>6.1f}s {s['p95_s']:>6.1f}s "
f"{s['p99_s']:>6.1f}s {s['max_s']:>6.1f}s {s['cv']:>5.2f}")
def _print_tokens(title: str, toks: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'in mean':>9} {'in p50':>9} {'in p95':>9} {'in max':>9}"
f" {'out mean':>9} {'out p95':>9}")
print(header)
print("-" * len(header))
for arm in sorted(toks):
e = toks[arm]
ein = e.get("input")
eout = e.get("output")
if not ein:
continue
print(f"{arm:<25} "
f"{ein['mean']:>9,.0f} {ein['p50']:>9,.0f} {ein['p95']:>9,.0f} {ein['max']:>9,.0f} "
f"{(eout or {}).get('mean', 0):>9,.0f} {(eout or {}).get('p95', 0):>9,.0f}")
def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = (f"{'arm':<25} {'n_pdfs':>7} {'mean':>7} {'std':>7} {'min':>7} "
f"{'p25':>7} {'p50':>7} {'p75':>7} {'max':>7} {'#0%':>5} {'#100%':>6}")
print(header)
print("-" * len(header))
for arm in sorted(var, key=lambda a: -var[a]["mean"]):
s = var[arm]
print(f"{arm:<25} {s['n_pdfs']:>7} "
f"{s['mean']*100:>6.1f}% {s['std']*100:>6.1f}% {s['min']*100:>6.1f}% "
f"{s['p25']*100:>6.1f}% {s['p50']*100:>6.1f}% {s['p75']*100:>6.1f}% "
f"{s['max']*100:>6.1f}% {s['n_pdfs_zero']:>5} {s['n_pdfs_perfect']:>6}")
def _print_mcnemar(title: str, table: dict) -> None:
print()
print(title)
print("-" * len(title))
print(f"n_qids on which all arms have a graded row: {table['n_qids']}")
header = (f"{'arm_i':<25} {'arm_j':<25} {'b':>4} {'c':>4} "
f"{'both ok':>8} {'both wr':>8} {'p (2-sided)':>13} {'sig':>4}")
print(header)
print("-" * len(header))
for pair in sorted(table["pairs"], key=lambda p: p["p_value"]):
sig = ""
if pair["p_value"] < 0.001:
sig = "***"
elif pair["p_value"] < 0.01:
sig = "**"
elif pair["p_value"] < 0.05:
sig = "*"
print(f"{pair['arm_i']:<25} {pair['arm_j']:<25} "
f"{pair['b_i_only']:>4} {pair['c_j_only']:>4} "
f"{pair['both_correct']:>8} {pair['both_wrong']:>8} "
f"{pair['p_value']:>13.4f} {sig:>4}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """Load a run's raw and post-retry rows and print every statistics table.

    Distribution tables (latency, tokens, per-PDF accuracy) are computed
    from the post-retry rows; the McNemar table is printed for both files.
    Returns 0; exits with a message if either input file is missing.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
    options = cli.parse_args()

    run_dir = REPO / "data" / "multimodal_doc" / "runs" / options.run_id / "parser_compare"
    raw_path = run_dir / "raw.jsonl"
    post_path = run_dir / "raw_post_retry.jsonl"
    if not (raw_path.exists() and post_path.exists()):
        raise SystemExit(
            "Missing raw.jsonl or raw_post_retry.jsonl. "
            "Run scripts/compute_post_retry_accuracy.py first."
        )

    raw_rows = _read_jsonl(raw_path)
    post_rows = _read_jsonl(post_path)
    print(f"Run: {options.run_id}")
    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")

    latency = _per_arm_latency(post_rows)
    _print_latency("Per-arm latency (post-retry)", latency)

    tokens = _per_arm_tokens(post_rows)
    _print_tokens("Per-arm token distribution (post-retry)", tokens)

    heterogeneity = _per_pdf_stats(post_rows)
    _print_pdf_var("Per-PDF accuracy heterogeneity (post-retry)", heterogeneity)

    for heading, dataset in (
        ("McNemar pairwise (RAW, no retries)", raw_rows),
        ("McNemar pairwise (POST-RETRY)", post_rows),
    ):
        _print_mcnemar(heading, _mcnemar_table(dataset))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,180 @@
"""Recompute per-arm accuracy/F1 after merging retry survivors into raw.jsonl.
Reads:
- data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
- data/multimodal_doc/runs/<run_id>/parser_compare/raw_retries.jsonl
For each (arm, qid) present in the retry artifact:
- if the retry RECOVERED, the retry row replaces the original row (same
grader is reused see ``mmlongbench/grader.py``);
- if the retry did NOT recover, the original row stays (still a failure,
so ``correct=False`` and ``f1=0``).
Prints two tables side by side:
* Raw run (no retries) matches §1 of the blog.
* Post-retry run final, "what would the headline have been if
the harness had had retries from day one".
It also writes ``data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl``
so any downstream notebook / report can join straight on it.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
# Two levels above this file: the parent of the directory holding this script.
REPO = Path(__file__).resolve().parents[1]
def _read_jsonl(path: Path) -> list[dict]:
out: list[dict] = []
with path.open("r", encoding="utf-8") as fh:
for line in fh:
line = line.strip()
if not line:
continue
out.append(json.loads(line))
return out
def _row_key(row: dict) -> tuple[str, str]:
return (str(row["arm"]), str(row["qid"]))
def _is_failure(row: dict) -> bool:
if row.get("error"):
return True
if not (row.get("raw_text") or "").strip():
return True
return False
def _summarise(rows_by_arm: dict[str, list[dict]]) -> dict[str, dict]:
out: dict[str, dict] = {}
for arm, rows in rows_by_arm.items():
n = len(rows)
n_correct = sum(1 for r in rows if r.get("graded", {}).get("correct"))
f1_sum = sum(float(r.get("graded", {}).get("f1") or 0.0) for r in rows)
n_fail = sum(1 for r in rows if _is_failure(r))
out[arm] = {
"n": n,
"n_correct": n_correct,
"n_failures": n_fail,
"accuracy": (n_correct / n) if n else 0.0,
"f1_mean": (f1_sum / n) if n else 0.0,
"failure_rate": (n_fail / n) if n else 0.0,
}
return out
def _print_table(title: str, summary: dict[str, dict]) -> None:
print()
print(title)
print("-" * len(title))
header = f"{'arm':<25} {'n':>4} {'n_corr':>7} {'acc':>7} {'F1':>7} {'fails':>6} {'fail%':>7}"
print(header)
print("-" * len(header))
# stable order: highest accuracy first
arms_sorted = sorted(summary.items(), key=lambda kv: -kv[1]["accuracy"])
for arm, s in arms_sorted:
print(f"{arm:<25} {s['n']:>4} {s['n_correct']:>7} "
f"{s['accuracy']*100:>6.1f}% {s['f1_mean']*100:>6.1f}% "
f"{s['n_failures']:>6} {s['failure_rate']*100:>6.1f}%")
def main() -> int:
    """Merge recovered retry rows into the raw run and print before/after tables.

    Reads ``raw.jsonl`` and ``raw_retries.jsonl`` from the run directory
    selected by ``--run-id``, writes ``raw_post_retry.jsonl`` next to them,
    then prints the raw summary, the post-retry summary and a per-arm delta.

    Returns:
        ``0`` on success, ``1`` when either input file is missing.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
    args = cli.parse_args()

    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
    raw_path = run_dir / "raw.jsonl"
    retry_path = run_dir / "raw_retries.jsonl"
    out_path = run_dir / "raw_post_retry.jsonl"

    if not raw_path.exists():
        print(f"raw.jsonl not found at {raw_path}", file=sys.stderr)
        return 1
    if not retry_path.exists():
        print(f"raw_retries.jsonl not found at {retry_path}", file=sys.stderr)
        return 1

    raw_rows = _read_jsonl(raw_path)
    retry_rows = _read_jsonl(retry_path)

    # Later duplicates of the same (arm, qid) overwrite earlier ones.
    retry_by_key: dict[tuple[str, str], dict] = {}
    for retry_row in retry_rows:
        retry_by_key[_row_key(retry_row)] = retry_row

    merged_rows: list[dict] = []
    n_replaced_recovered = 0
    n_replaced_still_failed = 0
    n_unchanged = 0
    for row in raw_rows:
        retry = retry_by_key.get(_row_key(row))
        if retry is None:
            n_unchanged += 1
            merged_rows.append(row)
        elif bool(retry.get("retry", {}).get("recovered")):
            # The retry row has the same shape as a raw row plus a "retry"
            # sub-object, so a recovery can stand in for the original.
            n_replaced_recovered += 1
            merged_rows.append(retry)
        else:
            # Still failing after retries: keep the original row, which is
            # the one the headline numbers were computed from.
            n_replaced_still_failed += 1
            merged_rows.append(row)

    # Persist the merged rows for downstream consumers.
    with out_path.open("w", encoding="utf-8") as fh:
        fh.writelines(json.dumps(merged) + "\n" for merged in merged_rows)

    def _bucket(rows: list[dict]) -> dict[str, list[dict]]:
        # Group rows by their "arm" value, preserving row order per arm.
        grouped: dict[str, list[dict]] = {}
        for item in rows:
            grouped.setdefault(item["arm"], []).append(item)
        return grouped

    raw_summary = _summarise(_bucket(raw_rows))
    post_summary = _summarise(_bucket(merged_rows))

    print()
    print(f"Run: {args.run_id}")
    print(f"Replaced (retry recovered): {n_replaced_recovered}")
    print(f"Kept original (retry still failed): {n_replaced_still_failed}")
    print(f"Untouched rows: {n_unchanged}")
    print(f"Wrote merged artifact: {out_path.relative_to(REPO)}")

    _print_table("Raw run (no retries)", raw_summary)
    _print_table("Post-retry run (final)", post_summary)

    print()
    print("Delta (post-retry minus raw):")
    print(f"{'arm':<25} {'d_acc':>7} {'d_fails':>8}")
    print("-" * 42)
    for arm in sorted(set(raw_summary) | set(post_summary)):
        before = raw_summary.get(arm)
        after = post_summary.get(arm)
        # Only arms present in both summaries get a delta line.
        if before and after:
            d_acc = (after["accuracy"] - before["accuracy"]) * 100
            d_fail = after["n_failures"] - before["n_failures"]
            print(f"{arm:<25} {d_acc:>+6.1f}p {d_fail:>+7d}")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View file

@ -1,97 +0,0 @@
"""Download CRAG Task 3's 4 .tar.bz2 parts in parallel.
Run once before ``ingest research crag_t3`` to avoid the ingest
synchronously blocking on a 7 GB download. Skips parts already
present and complete on disk.
"""
from __future__ import annotations
import logging
import sys
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
)
log = logging.getLogger("download_task3")
_BASE = (
"https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
"crag_task_3_dev_v4.tar.bz2.part"
)
_USER_AGENT = "SurfSense-Evals/0.1 (CRAG Task 3 fetch)"
def _expected_size(url: str) -> int:
    """Return the ``content-length`` a HEAD request to *url* reports (0 if absent)."""
    head = urllib.request.Request(
        url, method="HEAD", headers={"User-Agent": _USER_AGENT}
    )
    with urllib.request.urlopen(head, timeout=30) as resp:
        length = resp.headers.get("content-length", 0)
    return int(length)
def download_one(part: int, dest_dir: Path) -> Path:
    """Fetch one CRAG Task 3 archive part into *dest_dir* and return its path.

    The part is skipped when a file of the advertised size is already on
    disk. Otherwise it is streamed in 1 MiB chunks to a ``.part_dl``
    sibling, with a progress line logged at most every 5 seconds, and moved
    into place only once the byte count matches what the server advertised.

    Args:
        part: Part number appended to the base URL and to the file name.
        dest_dir: Directory that receives the part.

    Returns:
        Path of the downloaded (or already cached) part.

    Raises:
        OSError: if the number of bytes received differs from the advertised
            size; the temp file is removed first. Errors raised by
            ``urllib`` propagate unchanged.
    """
    url = f"{_BASE}{part}"
    dest = dest_dir / f"crag_task_3_dev_v4.tar.bz2.part{part}"
    expected = _expected_size(url)
    # A size of 0 means the server sent no content-length, so an existing
    # file cannot be verified and must not be treated as a cache hit.
    if expected and dest.exists() and dest.stat().st_size == expected:
        log.info("part%d: cached (%d bytes)", part, expected)
        return dest

    log.info("part%d: downloading %d bytes ...", part, expected)
    tmp = dest.with_suffix(dest.suffix + ".part_dl")
    started = time.monotonic()
    last_log = started
    downloaded = 0
    request = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    with urllib.request.urlopen(request, timeout=900) as resp, tmp.open("wb") as fh:
        for chunk in iter(lambda: resp.read(1 << 20), b""):
            fh.write(chunk)
            downloaded += len(chunk)
            now = time.monotonic()
            if now - last_log > 5.0:
                pct = 100 * downloaded / expected if expected else 0
                rate_mb = (downloaded / (now - started)) / (1 << 20)
                log.info(
                    "part%d: %5.1f%% (%.1f / %.1f MiB at %.1f MiB/s)",
                    part, pct, downloaded / (1 << 20), expected / (1 << 20), rate_mb,
                )
                last_log = now

    # Never promote a short read to the final name: a truncated part would
    # look "done" to this run.
    if expected and downloaded != expected:
        tmp.unlink()
        raise OSError(
            f"part{part}: incomplete download ({downloaded} of {expected} bytes)"
        )
    # Rename only after the handle is closed (the ``with`` block has exited).
    tmp.replace(dest)

    elapsed = time.monotonic() - started
    log.info(
        "part%d: done in %.1fs (%.1f MiB/s avg)",
        part, elapsed, (downloaded / (1 << 20)) / max(elapsed, 0.001),
    )
    return dest
def main() -> int:
    """Download parts 1-4 concurrently into the CRAG Task 3 raw cache.

    Every part is attempted and every failure is logged, so one bad part
    does not hide problems with the others.

    Returns:
        ``0`` when all four parts are present, ``1`` if any part failed.
    """
    dest_dir = Path("data/research/crag_t3/.raw_cache")
    dest_dir.mkdir(parents=True, exist_ok=True)
    # 4 parts in parallel — typical residential connection saturates around
    # 2 streams; GitHub raw serves these fine in parallel.
    started = time.monotonic()
    failed_parts: list[int] = []
    with ThreadPoolExecutor(max_workers=4) as ex:
        futures = {ex.submit(download_one, i, dest_dir): i for i in range(1, 5)}
        for fut in as_completed(futures):
            part = futures[fut]
            try:
                fut.result()
            except Exception as exc:  # noqa: BLE001
                # Keep draining: the executor waits for the remaining parts
                # anyway, so collect their outcome instead of dropping it.
                log.error("part%d failed: %s", part, exc)
                failed_parts.append(part)
    if failed_parts:
        log.error(
            "%d of 4 parts failed: %s",
            len(failed_parts), ", ".join(f"part{p}" for p in sorted(failed_parts)),
        )
        return 1
    log.info("All 4 parts downloaded in %.1fs", time.monotonic() - started)
    return 0
if __name__ == "__main__":
sys.exit(main())

View file

@ -0,0 +1,59 @@
"""Inspect what the first 30 MMLongBench-Doc PDFs would look like for scoping.
Run from surfsense_evals/ root via:
python scripts/inspect_first30.py
Prints which docs are already ingested (existing 5), which are new (25 to
upload), how many questions cover those 30 PDFs, and the answerable /
unanswerable + format mix.
"""
from __future__ import annotations
import json
from collections import Counter
from pathlib import Path
def main() -> None:
    """Print a scoping report for the first 30 MMLongBench-Doc PDFs.

    Reads ``data/multimodal_doc/mmlongbench/questions.jsonl`` relative to
    the current working directory, takes the first 30 ``doc_id`` values in
    sorted order, and prints how many are new versus already ingested, the
    question counts (answerable / unanswerable), the per-PDF averages, the
    answer-format mix, and the list of new PDFs.
    """
    qpath = Path("data/multimodal_doc/mmlongbench/questions.jsonl")
    lines = qpath.read_text(encoding="utf-8").splitlines()
    rows = [json.loads(line) for line in lines if line.strip()]

    docs_by_id = sorted({r["doc_id"] for r in rows})
    first30 = docs_by_id[:30]
    existing5 = {
        "05-03-18-political-release.pdf",
        "0b85477387a9d0cc33fca0f4becaa0e5.pdf",
        "0e94b4197b10096b1f4c699701570fbf.pdf",
        "11-21-16-Updated-Post-Election-Release.pdf",
        "12-15-15-ISIS-and-terrorism-release-final.pdf",
    }
    new25 = [d for d in first30 if d not in existing5]
    print(
        f"first 30 docs (alphabetical) — {len(new25)} new, "
        f"{len(first30) - len(new25)} already in SurfSense"
    )

    # Build the membership set once instead of once per question row.
    in_scope = set(first30)
    qs_in_30 = [r for r in rows if r["doc_id"] in in_scope]
    fmts = Counter((r.get("answer_format") or "").lower() for r in qs_in_30)
    # An answer_format of "none" marks a question as unanswerable.
    answerable = sum(v for k, v in fmts.items() if k != "none")
    unanswerable = fmts.get("none", 0)
    print(
        f"questions covering first 30 docs: total={len(qs_in_30)} "
        f"answerable={answerable} unanswerable={unanswerable}"
    )

    # Average over the PDFs actually in scope: the dataset may hold fewer
    # than 30 documents, and an empty file must not divide by zero.
    n_docs = len(first30)
    avg_qs = len(qs_in_30) / n_docs if n_docs else 0.0
    avg_answerable = answerable / n_docs if n_docs else 0.0
    print(
        f"avg Qs/PDF: {avg_qs:.1f} "
        f"answerable/PDF: {avg_answerable:.1f}"
    )
    print(f"format mix in scope: {dict(fmts)}")
    print()
    print(f"{len(new25)} new PDFs to ingest:")
    for d in new25:
        print(f" - {d}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,100 @@
"""Stub the mmlongbench manifest so parser_compare can extract in parallel.
The mmlongbench Surfsense ingest writes its manifest only at the very
end of the upload pipeline (~hours of celery work). parser_compare's
ingest, on the other hand, just needs a list of (doc_id, pdf_path)
tuples to know which PDFs to extract; it doesn't care about the
SurfSense ``document_id`` (the runner does, later, after a refresh).
This script extends the existing manifest with the *additional* PDFs
that mmlongbench has already cached on disk (i.e. all 30 PDFs in
``data/multimodal_doc/mmlongbench/pdfs/`` even though only 5 have
SurfSense ``document_id``s yet) so parser_compare can run all four
extractions for them in parallel with the SurfSense ingest.
After mmlongbench finishes, re-run::
python -m surfsense_evals ingest multimodal_doc parser_compare \
--max-docs 30
to refresh ``parser_compare_doc_map.jsonl`` with the now-populated
``document_id`` values for the 25 new PDFs. The extractions
themselves are cached on disk so the second pass is essentially free.
"""
from __future__ import annotations
import json
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
MAP_PATH = REPO / "data" / "multimodal_doc" / "maps" / "mmlongbench_doc_map.jsonl"
PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
QUESTIONS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
def _question_count_per_doc() -> dict[str, int]:
    """Count how many rows of the ``QUESTIONS`` file reference each ``doc_id``.

    Blank lines are ignored; every other line must be a JSON object with a
    ``doc_id`` key.
    """
    tally: dict[str, int] = {}
    with QUESTIONS.open("r", encoding="utf-8") as fh:
        for raw in fh:
            text = raw.strip()
            if text:
                doc_id = json.loads(text)["doc_id"]
                tally[doc_id] = tally.get(doc_id, 0) + 1
    return tally
def main() -> None:
    """Add stub manifest rows for cached PDFs that are not in the manifest yet.

    Existing rows (and the ``__settings__`` header line, if any) are kept
    as they are. Each ``*.pdf`` in ``PDF_DIR`` without a manifest entry gets
    a row with ``document_id`` set to ``None``. The manifest is then
    rewritten in place: settings line first, rows sorted by ``doc_id``.

    Raises:
        SystemExit: if the manifest file does not exist yet.
    """
    if not MAP_PATH.exists():
        # The two literals are concatenated, so the separator must be
        # spelled out or the path runs straight into "run".
        raise SystemExit(
            f"manifest not found at {MAP_PATH}; "
            "run `surfsense_evals ingest multimodal_doc mmlongbench` first."
        )

    existing_lines = MAP_PATH.read_text(encoding="utf-8").splitlines()
    existing_rows: list[dict] = []
    settings_line = None
    for line in existing_lines:
        line = line.strip()
        if not line:
            continue
        row = json.loads(line)
        if "__settings__" in row:
            # Keep the header's original text so it is written back verbatim.
            settings_line = line
        else:
            existing_rows.append(row)

    by_doc_id = {r["doc_id"]: r for r in existing_rows}
    counts = _question_count_per_doc()

    cached_pdfs = sorted(PDF_DIR.glob("*.pdf"))
    print(f"existing manifest entries: {len(existing_rows)}")
    print(f"cached PDFs on disk: {len(cached_pdfs)}")

    added = 0
    for pdf in cached_pdfs:
        if pdf.name in by_doc_id:
            continue
        by_doc_id[pdf.name] = {
            "doc_id": pdf.name,
            # Placeholder until the real ingest assigns an id.
            "document_id": None,
            "pdf_path": str(pdf),
            "n_questions": counts.get(pdf.name, 0),
        }
        added += 1

    out_lines: list[str] = []
    if settings_line:
        out_lines.append(settings_line)
    for doc_id in sorted(by_doc_id):
        out_lines.append(json.dumps(by_doc_id[doc_id]))
    MAP_PATH.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
    print(f"added {added} stub rows; manifest now has {len(by_doc_id)} PDFs")
    print(f"wrote: {MAP_PATH}")
if __name__ == "__main__":
main()

View file

@ -1,40 +0,0 @@
"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
from __future__ import annotations
import json
import sys
from pathlib import Path
def main() -> int:
    """Print a summary of the CRAG Task 3 doc map.

    Reads ``data/research/maps/crag_t3_doc_map.jsonl`` relative to the
    current working directory and prints the settings header, the row
    count, and a four-line digest per row.

    Returns:
        ``0`` after printing, ``1`` when the doc map file is missing.
    """
    map_path = Path("data/research/maps/crag_t3_doc_map.jsonl")
    if not map_path.exists():
        print(f"Doc map missing: {map_path}")
        return 1

    records = [
        json.loads(text)
        for text in map_path.read_text(encoding="utf-8").splitlines()
        if text.strip()
    ]
    # The last record carrying "__settings__" wins as the header; all other
    # records are data rows.
    settings = {}
    rows = []
    for record in records:
        if "__settings__" in record:
            settings = record
        else:
            rows.append(record)

    print(f"Settings header: {settings}")
    print(f"Doc map rows: {len(rows)}")
    for entry in rows:
        print(f" qid={entry['qid']:<10} domain={entry['domain']:<8} qtype={entry['question_type']}")
        print(f" question: {entry['question'][:90]}")
        print(f" gold: {entry['gold_answer'][:90]}")
        print(
            f" pages: {len(entry['page_filenames'])} extracted, "
            f"{len(entry['document_ids'])} doc_ids, "
            f"{len(entry['missing_pages'])} missing"
        )
    return 0
if __name__ == "__main__":
sys.exit(main())

View file

@ -0,0 +1,636 @@
"""Retry only the failed (arm, question) pairs from a previous parser_compare run.
The original parser_compare run records one row per (arm, qid) in
``raw.jsonl``. Some of those rows came back with transient transport
errors (SSL alerts, gateway 502s, empty SSE streams) or empty
``raw_text``. This script re-issues *only* those calls with exponential
backoff so we can see how many recover.
Design constraints / choices:
* **No re-ingest.** All cached PDFs and parser-extracted markdown stay
on disk. We rebuild ``ArmRequest`` objects from the existing manifest
+ the original ``mmlongbench/questions.jsonl``.
* **No SurfSense backend or celery required.** SurfSense had 0
reported failures; this script will skip any ``surfsense_agentic``
rows it encounters and warn rather than try to start the backend.
* **Original ``raw.jsonl`` is never mutated.** Retries land in a
sibling ``raw_retries.jsonl`` so the original artifact stays
citeable.
* **Idempotent.** Re-running this script re-tries the same set of
failed rows from ``raw.jsonl``. If you want to merge survivor rows
back in, do that as a separate aggregation step.
Usage:
python scripts/retry_failed_questions.py \
--run-id 2026-05-14T00-53-19Z \
--max-attempts 5 \
--concurrency 2
Outputs (written next to the original raw.jsonl):
* ``raw_retries.jsonl``: one line per retried (arm, qid). Each line
carries the original error, every retry attempt's timing/error,
and the final result (incl. grade) so you can drop it straight
into a notebook.
* ``raw_retries_summary.json``: per-arm tried/recovered/still-failed
counts and an aggregated retry-success rate.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import os
import random
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
REPO = Path(__file__).resolve().parents[1]
SRC = REPO / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
from dotenv import load_dotenv # noqa: E402
from surfsense_evals.core.arms import ( # noqa: E402
ArmRequest,
ArmResult,
BareLlmArm,
NativePdfArm,
)
from surfsense_evals.core.parse.freeform_answer import ( # noqa: E402
extract_freeform_answer,
)
from surfsense_evals.core.providers.openrouter_chat import ( # noqa: E402
OpenRouterChatProvider,
)
from surfsense_evals.core.providers.openrouter_pdf import ( # noqa: E402
OpenRouterPdfProvider,
PdfEngine,
)
from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade # noqa: E402
from surfsense_evals.suites.multimodal_doc.parser_compare.prompt import ( # noqa: E402
build_long_context_prompt,
build_native_pdf_prompt,
)
logger = logging.getLogger("retry_failed_questions")
LC_ARMS = {
"azure_basic_lc",
"azure_premium_lc",
"llamacloud_basic_lc",
"llamacloud_premium_lc",
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _is_failure_row(row: dict[str, Any]) -> bool:
    """Decide whether a ``raw.jsonl`` row should be retried.

    A row is a failure when it carries a truthy ``error`` or when its
    ``raw_text`` is missing, null, or whitespace-only. Both are retried:
    an empty response is just as unusable as a raised error.
    """
    if row.get("error"):
        return True
    text = row.get("raw_text") or ""
    return not text.strip()
@dataclass
class FailedRow:
    """One failed ``(arm, qid)`` row taken from ``raw.jsonl``."""

    # Arm name and question id; together they identify the row.
    arm: str
    qid: str
    # Document the question belongs to.
    doc_id: str
    # Answer format and gold answer as recorded on the original row.
    answer_format: str
    gold: str
    # Page count recorded on the original row.
    pages: int
    # ``document_id`` value copied from the original row; may be None.
    document_id: int | None
    # Error recorded on the original row; None when the row failed only
    # because its text was empty.
    original_error: str | None
    # The full decoded original row.
    original_row: dict[str, Any]
def _load_failed_rows(raw_path: Path) -> list[FailedRow]:
    """Collect every failed row of *raw_path* as a ``FailedRow``, in file order.

    Blank lines are skipped and rows that are not failures are dropped.
    ``arm``, ``qid`` and ``doc_id`` are required keys; the remaining fields
    fall back to an empty string, ``0`` or ``None`` when absent.
    """
    failures: list[FailedRow] = []
    with raw_path.open("r", encoding="utf-8") as fh:
        for raw_line in fh:
            text = raw_line.strip()
            if not text:
                continue
            row = json.loads(text)
            if _is_failure_row(row):
                failures.append(
                    FailedRow(
                        arm=str(row["arm"]),
                        qid=str(row["qid"]),
                        doc_id=str(row["doc_id"]),
                        answer_format=str(row.get("answer_format") or ""),
                        gold=str(row.get("gold") or ""),
                        pages=int(row.get("pages") or 0),
                        document_id=row.get("document_id"),
                        original_error=row.get("error"),
                        original_row=row,
                    )
                )
    return failures
def _load_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
    """Load a JSON-lines manifest keyed by ``doc_id``.

    Blank lines and ``__settings__`` header rows are skipped. Headers carry
    no ``doc_id``, so indexing them would raise ``KeyError``. When a
    ``doc_id`` appears more than once, the last row wins.

    Args:
        map_path: Path to the manifest file.

    Returns:
        Mapping of ``doc_id`` (as ``str``) to the decoded manifest row.

    Raises:
        KeyError: if a non-header row has no ``doc_id``.
    """
    out: dict[str, dict[str, Any]] = {}
    with map_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            row = json.loads(line)
            if "__settings__" in row:
                continue
            out[str(row["doc_id"])] = row
    return out
def _load_question_text_index(
    questions_jsonl: Path,
) -> dict[tuple[str, int], dict[str, Any]]:
    """Index question rows by ``(doc_id, per-document position)``.

    The file is read in order. Each row with a non-empty ``doc_id`` gets the
    next position for that document, starting at 0. Rows without a
    ``doc_id`` are ignored and do not consume a position. The position is
    what a qid such as ``foo.pdf::Q007`` is looked up by.
    """
    index: dict[tuple[str, int], dict[str, Any]] = {}
    next_position: dict[str, int] = {}
    with questions_jsonl.open("r", encoding="utf-8") as fh:
        for raw_line in fh:
            text = raw_line.strip()
            if not text:
                continue
            row = json.loads(text)
            doc_id = str(row.get("doc_id") or "")
            if doc_id:
                position = next_position.get(doc_id, 0)
                index[(doc_id, position)] = row
                next_position[doc_id] = position + 1
    return index
def _qid_index(qid: str) -> int:
    """Return the integer after the ``Q`` in a qid such as ``foo.pdf::Q007``.

    Only the text after the last ``::`` is inspected. Raises ``ValueError``
    when that text does not start with ``Q`` or the rest is not an integer.
    """
    q_part = qid.rsplit("::", 1)[-1]
    if q_part.startswith("Q"):
        return int(q_part[1:])
    raise ValueError(f"unexpected qid shape: {qid!r}")
# ---------------------------------------------------------------------------
# Request building (mirrors runner.py exactly so prompts are byte-identical)
# ---------------------------------------------------------------------------
def _build_native_request(
    qid: str, question: str, answer_format: str, pdf_path: Path,
    *, max_output_tokens: int,
) -> ArmRequest:
    """Assemble the request for the native-PDF arm.

    The prompt comes from ``build_native_pdf_prompt``, the PDF is attached
    as the only entry of ``pdf_paths``, and the output budget is passed as
    the ``max_tokens`` option.
    """
    prompt = build_native_pdf_prompt(question, answer_format=answer_format)
    options = {"max_tokens": max_output_tokens}
    return ArmRequest(
        question_id=qid,
        prompt=prompt,
        pdf_paths=[pdf_path],
        options=options,
    )
def _build_lc_request(
    qid: str, question: str, answer_format: str, doc_id: str, md_path: Path,
) -> ArmRequest:
    """Assemble the request for a long-context arm from its cached markdown.

    The markdown at *md_path* is read as UTF-8 and embedded in the prompt
    built by ``build_long_context_prompt``, labelled with *doc_id*.

    Raises:
        FileNotFoundError: if *md_path* does not exist.
    """
    if md_path.exists():
        markdown = md_path.read_text(encoding="utf-8")
        prompt = build_long_context_prompt(
            question,
            answer_format=answer_format,
            document_markdown=markdown,
            document_label=doc_id,
        )
        return ArmRequest(question_id=qid, prompt=prompt)
    raise FileNotFoundError(
        f"Missing parser extraction at {md_path}; cannot retry LC arm."
    )
# ---------------------------------------------------------------------------
# Retry driver
# ---------------------------------------------------------------------------
@dataclass
class AttemptLog:
    """Timing and outcome of a single retry attempt."""

    # 1-based attempt number.
    attempt: int
    # UTC start time formatted as ``%Y-%m-%dT%H:%M:%SZ``.
    started_iso: str
    # Wall time of the call in milliseconds.
    latency_ms: int
    # Error for this attempt, or None when it produced text without error.
    error: str | None
    # Length of the stripped response text.
    raw_text_chars: int
@dataclass
class RetryOutcome:
    """Result of retrying one ``(arm, qid)`` pair."""

    # Arm name and question id the retries were issued for.
    arm: str
    qid: str
    # One entry per attempt made, in order.
    attempts: list[AttemptLog]
    # Result object returned by the last attempt.
    final_result: ArmResult
    # True when an attempt returned non-empty text without an error.
    recovered: bool
async def _retry_one(
    arm_obj: Any, request: ArmRequest, *,
    arm_name: str,
    qid: str,
    max_attempts: int,
    base_delay: float,
    max_delay: float,
) -> RetryOutcome:
    """Call ``arm_obj.answer(request)`` until it succeeds or attempts run out.

    An attempt succeeds when the result has no ``error`` and its
    ``raw_text`` is non-empty after stripping. An error-free but empty
    response is logged as an ``EmptyResponse`` error. Between attempts the
    coroutine sleeps for an exponentially growing, jittered delay.

    Args:
        arm_obj: Object exposing an awaitable ``answer(request)`` whose
            result has ``raw_text`` and ``error`` attributes.
        request: Request passed unchanged to every attempt.
        arm_name: Arm label used in logs and in the outcome.
        qid: Question id used in logs and in the outcome.
        max_attempts: Upper bound on calls; at least one call is always made.
        base_delay: Backoff base in seconds (doubles each attempt).
        max_delay: Hard cap in seconds on any single sleep.

    Returns:
        A ``RetryOutcome`` holding every attempt and the last result.
    """
    attempts: list[AttemptLog] = []
    attempt = 0
    # ``while True`` with an in-loop return guarantees one call even when
    # max_attempts <= 0, so there is always a final result to report.
    while True:
        attempt += 1
        started_iso = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        t0 = time.monotonic()
        result = await arm_obj.answer(request)
        latency_ms = int((time.monotonic() - t0) * 1000)

        raw_text = (result.raw_text or "").strip()
        attempt_error = result.error
        if not attempt_error and not raw_text:
            attempt_error = "EmptyResponse: stream ended with no text"
        attempts.append(AttemptLog(
            attempt=attempt,
            started_iso=started_iso,
            latency_ms=latency_ms,
            error=attempt_error,
            raw_text_chars=len(raw_text),
        ))

        # After the empty-text check above, "no error" implies usable text.
        recovered = not attempt_error
        if recovered or attempt >= max_attempts:
            return RetryOutcome(
                arm=arm_name, qid=qid, attempts=attempts,
                final_result=result, recovered=recovered,
            )

        # Jitter first (0.5x-1.5x), cap second, so max_delay is a true
        # upper bound on the sleep.
        backoff = base_delay * (2 ** (attempt - 1))
        delay = min(max_delay, backoff * (0.5 + random.random()))
        logger.info(
            "[%s::%s] attempt %d/%d failed (%s); sleeping %.1fs",
            arm_name, qid, attempt, max_attempts, attempt_error, delay,
        )
        await asyncio.sleep(delay)
async def _gather_with_limit(coros: list, *, concurrency: int) -> list[Any]:
    """Await all *coros* with at most *concurrency* running at once.

    Results are returned in the order of *coros*. A *concurrency* below 1
    is treated as 1.
    """
    gate = asyncio.Semaphore(max(1, concurrency))

    async def _guarded(awaitable):
        # Hold one semaphore slot for the whole lifetime of the awaitable.
        async with gate:
            result = await awaitable
        return result

    guarded = [_guarded(item) for item in coros]
    return await asyncio.gather(*guarded)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def _run(args: argparse.Namespace) -> int:
    """Retry every failed row of one parser_compare run and write the artifacts.

    Steps: load failed rows from ``raw.jsonl``, rebuild one request per row
    from the manifest and ``questions.jsonl``, re-issue each through
    ``_retry_one`` under a concurrency limit, grade the final result, then
    write ``raw_retries.jsonl`` and ``raw_retries_summary.json`` next to the
    original and print a per-arm table. Rows that cannot be rebuilt (bad
    qid, unknown question, missing manifest entry / PDF / extraction,
    unhandled arm) are logged and skipped rather than aborting the pass.

    Args:
        args: Parsed CLI namespace; reads ``run_id``, ``include_surfsense``,
            ``llm_model``, ``pdf_engine``, ``max_output_tokens``,
            ``max_attempts``, ``base_delay``, ``max_delay`` and
            ``concurrency``.

    Returns:
        ``0`` on completion, including when there was nothing to retry.

    Raises:
        SystemExit: if ``raw.jsonl``, the manifest, the questions file or
            ``OPENROUTER_API_KEY`` is missing.
    """
    load_dotenv(REPO / ".env")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
    raw_path = run_dir / "raw.jsonl"
    if not raw_path.exists():
        raise SystemExit(f"raw.jsonl not found at {raw_path}")

    map_path = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
    questions_jsonl = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
    if not map_path.exists():
        raise SystemExit(f"parser_compare manifest not found at {map_path}")
    if not questions_jsonl.exists():
        raise SystemExit(f"mmlongbench questions not found at {questions_jsonl}")

    failed = _load_failed_rows(raw_path)
    if not failed:
        logger.info("No failed rows in %s — nothing to retry.", raw_path)
        return 0

    # SurfSense rows: warn and skip; we don't want to start backend just to
    # defensively retry a 0-failure arm.
    surf_failed = [f for f in failed if f.arm == "surfsense_agentic"]
    if surf_failed:
        if args.include_surfsense:
            # The flag keeps the rows in the plan, but the request builder
            # below only knows native_pdf and the LC arms, so say so instead
            # of telling the user to pass a flag they already passed.
            logger.warning(
                "--include-surfsense was given, but this script has no "
                "surfsense_agentic arm wired up; %d rows will be skipped as unhandled.",
                len(surf_failed),
            )
        else:
            logger.warning(
                "Skipping %d surfsense_agentic failures; this script doesn't drive the backend. "
                "If you want those retried too, start backend + celery and rerun "
                "with --include-surfsense.",
                len(surf_failed),
            )
            failed = [f for f in failed if f.arm != "surfsense_agentic"]
    else:
        logger.info("No surfsense_agentic failures; backend/celery not needed for this retry.")

    if not failed:
        logger.info("Nothing left to retry after filtering.")
        return 0

    by_arm_count: dict[str, int] = {}
    for f in failed:
        by_arm_count[f.arm] = by_arm_count.get(f.arm, 0) + 1
    logger.info(
        "Loaded %d failed rows across %d arms: %s",
        len(failed), len(by_arm_count),
        ", ".join(f"{a}={n}" for a, n in sorted(by_arm_count.items())),
    )

    doc_map = _load_doc_map(map_path)
    qtext_idx = _load_question_text_index(questions_jsonl)

    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        raise SystemExit("OPENROUTER_API_KEY missing from environment / .env")

    native_provider = OpenRouterPdfProvider(
        api_key=api_key,
        base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
        model=args.llm_model,
        engine=PdfEngine(args.pdf_engine),
    )
    native_arm = NativePdfArm(
        provider=native_provider, max_output_tokens=args.max_output_tokens,
    )

    # One chat arm per LC arm name that actually has failures to retry.
    lc_arms: dict[str, BareLlmArm] = {}
    for arm_name in sorted({f.arm for f in failed} & LC_ARMS):
        lc_provider = OpenRouterChatProvider(
            api_key=api_key,
            base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
            model=args.llm_model,
        )
        lc_arms[arm_name] = BareLlmArm(
            provider=lc_provider,
            max_output_tokens=args.max_output_tokens,
            name=arm_name,
        )

    # ``plan`` and ``coros`` are appended in lockstep so outcomes can be
    # zipped back to the row they belong to.
    coros: list = []
    plan: list[tuple[FailedRow, ArmRequest, Any]] = []
    for f in failed:
        # Look up the question text from questions.jsonl
        try:
            q_idx = _qid_index(f.qid)
        except ValueError:
            logger.error("Bad qid %r — skipping", f.qid)
            continue
        qrow = qtext_idx.get((f.doc_id, q_idx))
        if qrow is None:
            logger.error(
                "Could not find question text for %s (idx %d) — skipping",
                f.doc_id, q_idx,
            )
            continue
        question_text = str(qrow.get("question") or "").strip()
        answer_format = str(qrow.get("answer_format") or f.answer_format or "").strip().lower()

        map_row = doc_map.get(f.doc_id)
        if map_row is None:
            logger.error("doc_id %s not in manifest — skipping", f.doc_id)
            continue

        if f.arm == "native_pdf":
            pdf_path = Path(map_row["pdf_path"])
            if not pdf_path.exists():
                logger.error("PDF missing on disk: %s — skipping", pdf_path)
                continue
            request = _build_native_request(
                f.qid, question_text, answer_format, pdf_path,
                max_output_tokens=args.max_output_tokens,
            )
            arm_obj = native_arm
        elif f.arm in LC_ARMS:
            ext_blob = (map_row.get("extractions") or {}).get(f.arm) or {}
            md_path_str = ext_blob.get("markdown_path")
            if not md_path_str or ext_blob.get("status") != "ok":
                logger.error(
                    "Missing extraction for %s on %s — cannot retry; skipping",
                    f.arm, f.doc_id,
                )
                continue
            try:
                request = _build_lc_request(
                    f.qid, question_text, answer_format, f.doc_id, Path(md_path_str),
                )
            except FileNotFoundError as exc:
                # The manifest says "ok" but the markdown is gone from disk:
                # skip this row like every other unbuildable one instead of
                # aborting the whole pass.
                logger.error("%s — skipping", exc)
                continue
            arm_obj = lc_arms[f.arm]
        else:
            logger.warning("Unhandled arm %s — skipping", f.arm)
            continue

        plan.append((f, request, arm_obj))
        coros.append(_retry_one(
            arm_obj, request,
            arm_name=f.arm, qid=f.qid,
            max_attempts=args.max_attempts,
            base_delay=args.base_delay,
            max_delay=args.max_delay,
        ))

    if not coros:
        logger.warning("Nothing to retry after request building.")
        return 0

    logger.info(
        "Retrying %d failed rows with up to %d attempts each "
        "(base_delay=%.1fs, max_delay=%.1fs, concurrency=%d).",
        len(coros), args.max_attempts, args.base_delay, args.max_delay,
        args.concurrency,
    )

    started = time.monotonic()
    outcomes: list[RetryOutcome] = await _gather_with_limit(
        coros, concurrency=args.concurrency,
    )
    elapsed = time.monotonic() - started
    logger.info("Retry pass finished in %.1fs.", elapsed)

    out_path = run_dir / "raw_retries.jsonl"
    summary_path = run_dir / "raw_retries_summary.json"

    per_arm_recovered: dict[str, int] = {}
    per_arm_total: dict[str, int] = {}
    per_arm_attempts_dist: dict[str, list[int]] = {}

    with out_path.open("w", encoding="utf-8") as fh:
        for (f, _req, _arm_obj), outcome in zip(plan, outcomes, strict=True):
            per_arm_total[outcome.arm] = per_arm_total.get(outcome.arm, 0) + 1
            if outcome.recovered:
                per_arm_recovered[outcome.arm] = (
                    per_arm_recovered.get(outcome.arm, 0) + 1
                )
            per_arm_attempts_dist.setdefault(outcome.arm, []).append(
                len(outcome.attempts)
            )

            # Grade the last attempt against the gold answer and format
            # recorded on the original row.
            g = grade(
                pred=extract_freeform_answer(outcome.final_result.raw_text or ""),
                gold=f.gold,
                answer_format=f.answer_format,
            )
            # Key order matters: fields from ``to_jsonl()`` may override the
            # ones above them, and "graded" always comes last.
            row = {
                "qid": f.qid,
                "doc_id": f.doc_id,
                "arm": f.arm,
                "answer_format": f.answer_format,
                "gold": f.gold,
                "pages": f.pages,
                "document_id": f.document_id,
                "original_error": f.original_error,
                "retry": {
                    "max_attempts": args.max_attempts,
                    "n_attempts": len(outcome.attempts),
                    "recovered": outcome.recovered,
                    "attempts": [
                        {
                            "attempt": a.attempt,
                            "started_iso": a.started_iso,
                            "latency_ms": a.latency_ms,
                            "error": a.error,
                            "raw_text_chars": a.raw_text_chars,
                        }
                        for a in outcome.attempts
                    ],
                },
                **outcome.final_result.to_jsonl(),
                "graded": {
                    "correct": g.correct,
                    "f1": g.f1,
                    "method": g.method,
                    "normalised_pred": g.normalised_pred,
                    "normalised_gold": g.normalised_gold,
                },
            }
            fh.write(json.dumps(row) + "\n")

    summary = {
        "run_id": args.run_id,
        "raw_retries_path": str(out_path.relative_to(REPO)),
        "n_failed_rows_input": len(failed),
        "n_retried": len(coros),
        "elapsed_s": round(elapsed, 1),
        "config": {
            "max_attempts": args.max_attempts,
            "base_delay": args.base_delay,
            "max_delay": args.max_delay,
            "concurrency": args.concurrency,
            "llm_model": args.llm_model,
            "pdf_engine": args.pdf_engine,
            "max_output_tokens": args.max_output_tokens,
        },
        "per_arm": {
            arm: {
                "tried": per_arm_total.get(arm, 0),
                "recovered": per_arm_recovered.get(arm, 0),
                "still_failed": (
                    per_arm_total.get(arm, 0) - per_arm_recovered.get(arm, 0)
                ),
                "recovery_rate": (
                    per_arm_recovered.get(arm, 0) / per_arm_total[arm]
                    if per_arm_total.get(arm) else 0.0
                ),
                "attempts_distribution": sorted(per_arm_attempts_dist.get(arm, [])),
            }
            for arm in sorted(per_arm_total)
        },
        "totals": {
            "tried": sum(per_arm_total.values()),
            "recovered": sum(per_arm_recovered.values()),
            "still_failed": sum(per_arm_total.values()) - sum(per_arm_recovered.values()),
        },
    }
    summary_path.write_text(
        json.dumps(summary, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )

    print()
    print("=" * 78)
    print("Retry pass summary")
    print("=" * 78)
    header = f"{'arm':<25} {'tried':>6} {'recovered':>10} {'still fail':>11} {'rate':>7}"
    print(header)
    print("-" * len(header))
    for arm in sorted(per_arm_total):
        tried = per_arm_total[arm]
        rec = per_arm_recovered.get(arm, 0)
        rate = (rec / tried * 100) if tried else 0.0
        print(f"{arm:<25} {tried:>6} {rec:>10} {tried - rec:>11} {rate:>6.1f}%")
    total = sum(per_arm_total.values())
    rec_total = sum(per_arm_recovered.values())
    rate_total = (rec_total / total * 100) if total else 0.0
    print("-" * len(header))
    print(f"{'TOTAL':<25} {total:>6} {rec_total:>10} {total - rec_total:>11} "
          f"{rate_total:>6.1f}%")
    print()
    print(f"Wrote {out_path.relative_to(REPO)}")
    print(f"Wrote {summary_path.relative_to(REPO)}")
    return 0
def main() -> None:
    """Parse the command line, run the retry pass, and exit with its status.

    Always leaves via ``SystemExit`` carrying the value returned by ``_run``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--run-id", default="2026-05-14T00-53-19Z",
        help="Run timestamp under data/multimodal_doc/runs/. Default is the "
             "n=171 production run we wrote up in the blog.",
    )
    cli.add_argument("--max-attempts", type=int, default=5)
    cli.add_argument(
        "--base-delay", type=float, default=1.0,
        help="Base seconds for exponential backoff (default 1s).",
    )
    cli.add_argument(
        "--max-delay", type=float, default=30.0,
        help="Cap on per-retry sleep (default 30s).",
    )
    cli.add_argument(
        "--concurrency", type=int, default=2,
        help="Parallel retries in flight (default 2 — keep low "
             "to avoid the same transport stress that caused "
             "the original failures).",
    )
    cli.add_argument("--llm-model", default="anthropic/claude-sonnet-4.5")
    # Accepted values come straight from the PdfEngine enum.
    engine_choices = [e.value for e in PdfEngine]
    cli.add_argument("--pdf-engine", default="native", choices=engine_choices)
    cli.add_argument("--max-output-tokens", type=int, default=512)
    cli.add_argument(
        "--include-surfsense", action="store_true",
        help="Also retry surfsense_agentic failures (requires backend + celery up). "
             "Default is to skip them since the n=171 run had 0 SurfSense failures.",
    )
    args = cli.parse_args()
    exit_code = asyncio.run(_run(args))
    raise SystemExit(exit_code)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,122 @@
"""Slice the parser_compare raw.jsonl for the n=171 run.
Reports per-arm:
* tokens & cost stats (input/output mean, $/Q distribution)
* failures (status != ok or empty raw_text)
* answer_format breakdown (accuracy by str/int/float/list)
Plus surfsense agentic breakdown so we can compare apples to apples
even though the new_chat SSE doesn't surface per-call token counts.
"""
from __future__ import annotations
import json
import statistics
from collections import defaultdict
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
RUN_DIR = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
RAW = RUN_DIR / "raw.jsonl"
ARTIFACT = RUN_DIR / "run_artifact.json"
def main() -> None:
    """Print per-arm slices of the n=171 ``raw.jsonl``.

    Three reports are written to stdout: a per-arm table (accuracy, F1 from
    ``run_artifact.json``, failures, cost, token and latency statistics),
    accuracy broken down by ``answer_format``, and the aggregated cost
    figures stored in ``run_artifact.json``.

    A row counts as a failure when it carries an ``error`` or when its
    ``raw_text`` is empty; both kinds are summed in the ``fail`` column.
    """
    rows = [json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines() if line.strip()]
    print(f"raw rows: {len(rows)}")
    unique_qids = {row["qid"] for row in rows}
    print(f"unique questions: {len(unique_qids)}")

    arm_metrics: dict[str, dict] = defaultdict(lambda: {
        "n": 0, "n_correct": 0, "n_failed": 0, "n_empty": 0,
        "costs": [], "in_tokens": [], "out_tokens": [], "latency_ms": [],
        "by_format": defaultdict(lambda: {"n": 0, "correct": 0}),
    })

    for row in rows:
        arm = row["arm"]
        m = arm_metrics[arm]
        m["n"] += 1
        graded = row.get("graded") or {}
        if graded.get("correct"):
            m["n_correct"] += 1
        err = row.get("error")
        raw_text = row.get("raw_text") or ""
        # Errors and empty responses are tallied separately but both are
        # failures; a row with an error is never also counted as empty.
        if err:
            m["n_failed"] += 1
        elif not raw_text.strip():
            m["n_empty"] += 1
        cost = row.get("cost_usd")
        if cost is not None:
            m["costs"].append(float(cost))
        ut = row.get("usage") or {}
        # Missing or zero token counts / latencies are left out of the
        # statistics rather than dragging the means down.
        if ut.get("prompt_tokens"):
            m["in_tokens"].append(ut["prompt_tokens"])
        if ut.get("completion_tokens"):
            m["out_tokens"].append(ut["completion_tokens"])
        if row.get("latency_ms"):
            m["latency_ms"].append(row["latency_ms"])
        fmt = row.get("answer_format") or "unknown"
        m["by_format"][fmt]["n"] += 1
        if graded.get("correct"):
            m["by_format"][fmt]["correct"] += 1

    print()
    print("=" * 100)
    print(f"{'arm':<25} {'n':>4} {'acc%':>6} {'F1%':>6} {'fail':>5} {'$ mean':>10} {'$ median':>10} {'in tok mean':>12} {'out tok mean':>12} {'p50 ms':>8}")
    print("=" * 100)
    art = json.loads(ARTIFACT.read_text(encoding="utf-8"))
    per_arm_art = art["metrics"]["per_arm"]
    for arm, m in sorted(arm_metrics.items()):
        acc = m["n_correct"] / m["n"] * 100
        # Report errors and empty responses together, matching the failure
        # definition in the module docstring.
        fail = m["n_failed"] + m["n_empty"]
        cost_mean = statistics.mean(m["costs"]) if m["costs"] else 0.0
        cost_med = statistics.median(m["costs"]) if m["costs"] else 0.0
        in_mean = statistics.mean(m["in_tokens"]) if m["in_tokens"] else 0
        out_mean = statistics.mean(m["out_tokens"]) if m["out_tokens"] else 0
        lat_p50 = statistics.median(m["latency_ms"]) if m["latency_ms"] else 0
        f1 = per_arm_art.get(arm, {}).get("f1_mean", 0.0) * 100
        print(
            f"{arm:<25} {m['n']:>4} {acc:>5.1f}% {f1:>5.1f}% {fail:>5} "
            f"${cost_mean:>9.4f} ${cost_med:>9.4f} {in_mean:>12.0f} {out_mean:>12.0f} {lat_p50:>8.0f}"
        )

    print()
    print("by answer_format (accuracy):")
    formats = sorted({f for m in arm_metrics.values() for f in m["by_format"].keys()})
    header = f"{'arm':<25} " + " ".join(f"{f:>10}" for f in formats)
    print(header)
    print("-" * len(header))
    for arm, m in sorted(arm_metrics.items()):
        cells = []
        for f in formats:
            row = m["by_format"][f]
            if row["n"] == 0:
                cells.append(f"{'-':>10}")
            else:
                pct = row["correct"] / row["n"] * 100
                cells.append(f"{pct:>5.0f}% ({row['correct']:>2}/{row['n']:>2})")
        print(f"{arm:<25} " + " ".join(cells))

    print()
    print("=" * 100)
    print("Aggregated cost (from run_artifact.json):")
    for arm, row in per_arm_art.items():
        print(
            f" {arm:<25} acc={row['accuracy']*100:5.1f}% "
            f" $/Q LLM={row['llm_cost_per_q']:.4f} "
            f" preprocess total=${row['preprocess_cost_total']:.2f} "
            f" $/Q total={row['total_cost_per_q']:.4f}"
        )
if __name__ == "__main__":
main()

View file

@ -0,0 +1,155 @@
"""Test the hypothesis: were the LC-arm errors actually context-window
overflow errors disguised as SSL / network failures?
If true, we'd expect:
(a) literal "prompt is too long" / "context_length_exceeded" / "exceeds .* tokens" strings,
(b) failures correlated with extraction size / input_tokens (large doc -> failure),
(c) failing requests near or over Sonnet 4.5's 200k input-token limit.
If false (transport-layer hypothesis), we'd expect:
(a) only SSL / 502 / empty stream / JSONDecode strings,
(b) failures NOT correlated with size (uniform across PDFs by time, not by tokens),
(c) failing requests well below the 200k limit.
"""
from __future__ import annotations
import json
import statistics
from collections import defaultdict
from pathlib import Path
# Directory one level above the folder that contains this script
# (presumably the repository root -- the script is expected to live in a
# first-level subdirectory of it).
REPO = Path(__file__).resolve().parents[1]

# Output directory of the one specific parser_compare run analysed here.
RUN = REPO.joinpath(
    "data", "multimodal_doc", "runs", "2026-05-14T00-53-19Z", "parser_compare"
)

# Result rows of that run, one JSON object per line.
RAW = RUN.joinpath("raw.jsonl")

# Per-document manifest, one JSON object per line; read for its
# "doc_id" and "extractions" fields.
MANIFEST = REPO.joinpath(
    "data", "multimodal_doc", "maps", "parser_compare_doc_map.jsonl"
)

# Lower-case substrings that, when present in a lower-cased error message,
# flag the error as a possible context-window overflow.
CONTEXT_HINTS = (
    "context_length",
    "context window",
    "prompt is too long",
    "exceeds",
    "maximum context",
    "input tokens",
    "too many tokens",
    "over the maximum",
    "200000",
    "200_000",
)
def main() -> None:
    """Print the four-part context-overflow vs. transport-failure report.

    Reads the run's result rows from ``RAW`` and the per-document extraction
    sizes from ``MANIFEST``, then prints to stdout:

    (a) every error message containing one of ``CONTEXT_HINTS``,
    (b) extraction-size statistics for OK vs. failed rows per parser arm,
    (c) the largest extraction each arm handled successfully vs. failed on,
    (d) the per-question outcome for the one hard-coded overflow candidate.

    Raises:
        OSError: if either input file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a row lacks a field the report indexes directly
            ("arm", "qid", "doc_id").
    """
    rows = _read_jsonl(RAW)
    extraction_size = _load_extraction_sizes(MANIFEST)

    _report_context_error_strings(rows)
    arm_buckets = _bucket_extraction_sizes(rows, extraction_size)
    _report_size_by_outcome(arm_buckets)
    _report_largest_extractions(arm_buckets)
    _report_known_overflow_candidate(rows)


# Arms whose extraction size is compared in sections (b) and (c).
_PARSER_ARMS = (
    "azure_basic_lc", "azure_premium_lc",
    "llamacloud_basic_lc", "llamacloud_premium_lc",
)


def _read_jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of records, skipping blank lines."""
    records = []
    # Iterate the text-mode handle instead of str.splitlines(): splitlines()
    # also breaks on U+2028 / U+2029 / U+0085, which may appear unescaped
    # inside a JSON string and would cut a record in half.
    with path.open(encoding="utf-8") as handle:
        for line in handle:
            if line.strip():
                records.append(json.loads(line))
    return records


def _load_extraction_sizes(manifest_path: Path) -> dict[tuple[str, str], int]:
    """Map (doc_id, arm) to the extraction's "chars" value from the manifest."""
    sizes: dict[tuple[str, str], int] = {}
    for doc in _read_jsonl(manifest_path):
        for arm, ext in (doc.get("extractions") or {}).items():
            # A missing or null "chars" is recorded as 0.
            sizes[(doc["doc_id"], arm)] = int(ext.get("chars") or 0)
    return sizes


def _print_banner(title: str) -> None:
    """Print a section title framed by two 80-column rules."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)


def _report_context_error_strings(rows: list[dict]) -> None:
    """Section (a): list rows whose error text contains a context hint."""
    _print_banner("(a) Literal 'context window' / 'prompt too long' error strings?")
    found = 0
    for row in rows:
        err = row.get("error") or ""
        if not err:
            continue
        # Match case-insensitively, but display the error as it was recorded.
        lowered = err.lower()
        if any(hint in lowered for hint in CONTEXT_HINTS):
            print(f" {row['arm']:<25} {row['qid']:<50}")
            print(f" -> {err[:240]}")
            found += 1
    if not found:
        print(" none found.")
    print()


def _bucket_extraction_sizes(
    rows: list[dict],
    extraction_size: dict[tuple[str, str], int],
) -> dict[str, dict[str, list[int]]]:
    """Group each parser-arm row's extraction size into "ok" / "fail" lists.

    A row counts as failed when it carries an error or its ``raw_text`` is
    empty or whitespace-only. A (doc_id, arm) pair absent from
    *extraction_size* contributes a size of 0.
    """
    buckets: dict[str, dict[str, list[int]]] = defaultdict(
        lambda: {"ok": [], "fail": []}
    )
    for row in rows:
        arm = row["arm"]
        if arm not in _PARSER_ARMS:
            continue
        size = extraction_size.get((row["doc_id"], arm), 0)
        failed = row.get("error") or not (row.get("raw_text") or "").strip()
        buckets[arm]["fail" if failed else "ok"].append(size)
    return buckets


def _report_size_by_outcome(buckets: dict[str, dict[str, list[int]]]) -> None:
    """Section (b): print n / mean / median / max size per arm and outcome."""
    _print_banner("(b) Extraction size for OK vs FAILED rows per arm")
    print(f"{'arm':<25} {'bucket':<5} {'n':>4} {'mean chars':>12} {'median':>10} {'max':>10}")
    for arm in _PARSER_ARMS:
        for bucket in ("ok", "fail"):
            sizes = buckets[arm][bucket]
            if not sizes:
                # statistics.mean / median raise on empty data.
                print(f" {arm:<23} {bucket:<5} {0:>4} -")
                continue
            print(
                f" {arm:<23} {bucket:<5} {len(sizes):>4} "
                f"{statistics.mean(sizes):>12,.0f} "
                f"{statistics.median(sizes):>10,.0f} "
                f"{max(sizes):>10,}"
            )
    print()


def _report_largest_extractions(buckets: dict[str, dict[str, list[int]]]) -> None:
    """Section (c): print the largest OK and largest failed size per arm."""
    _print_banner("(c) Largest extraction each arm processed *successfully* vs *failed*")
    print(
        "(Sonnet 4.5 input limit ~200k tokens ~= 800k chars. If failures were "
        "context-overflow, max-OK would be near that cap. If max-OK is well "
        "above max-FAIL, the model handled bigger contexts than the failed "
        "ones, so size cannot be the cause.)"
    )
    print()
    for arm in _PARSER_ARMS:
        ok_sizes = buckets[arm]["ok"]
        fail_sizes = buckets[arm]["fail"]
        if not ok_sizes:
            # Arms without a single OK row are omitted from this section.
            continue
        max_ok = max(ok_sizes)
        max_fail = max(fail_sizes) if fail_sizes else 0
        # Token counts are estimated at 4 characters per token.
        print(
            f" {arm:<25} max OK = {max_ok:>10,} chars (~{max_ok / 4:>7,.0f} tokens) "
            f"max FAIL = {max_fail:>10,} chars (~{max_fail / 4:>7,.0f} tokens)"
        )
    print()


def _report_known_overflow_candidate(rows: list[dict]) -> None:
    """Section (d): print each result row for the hard-coded oversized document."""
    _print_banner("(d) Did the *known* overflow candidate fail?")
    print(
        " 3M_2018_10K x llamacloud_premium = 908,733 chars (~227k tokens) "
        "-- this is above Sonnet 4.5's 200k window."
    )
    print(" If transport hypothesis is correct, this should still fail with a "
          "real overflow error.")
    print(" If transport hypothesis is correct AND the model truncates silently, "
          "it might 'succeed' but be wrong.")
    print()
    for row in rows:
        if row["doc_id"] != "3M_2018_10K.pdf" or row["arm"] != "llamacloud_premium_lc":
            continue
        err = row.get("error") or "(none)"
        graded = row.get("graded") or {}
        print(
            f" {row['qid']:<40} correct={graded.get('correct')!s:<5} "
            f"err={err[:100]}"
        )
# Script entry point: print the report only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()