feat(evals): publish multimodal_doc parser_compare benchmark + n=171 report

Adds the full parser_compare experiment for the multimodal_doc suite: six arms compared on 30 PDFs / 171 questions from MMLongBench-Doc with anthropic/claude-sonnet-4.5 across the board. Source code: - core/parsers/{azure_di,llamacloud,pdf_pages}.py: direct parser SDK callers (Azure Document Intelligence prebuilt-read/layout, LlamaParse parse_page_with_llm/parse_page_with_agent) used by the LC arms, bypassing the SurfSense backend so each (basic/premium) extraction is a clean A/B independent of backend ETL routing. - suites/multimodal_doc/parser_compare/{ingest,runner,prompt}.py: six-arm benchmark (native_pdf, azure_basic_lc, azure_premium_lc, llamacloud_basic_lc, llamacloud_premium_lc, surfsense_agentic) with byte-identical prompts per question, deterministic grader, Wilson CIs, and the per-page preprocessing tariff cost overlay. Reproducibility: - pyproject.toml + uv.lock pin pypdf, azure-ai-documentintelligence, llama-cloud-services as new deps. - .env.example documents the AZURE_DI_* and LLAMA_CLOUD_API_KEY env vars now required for parser_compare. - 12 analysis scripts under scripts/: retry pass with exponential backoff, post-retry accuracy merge, McNemar / latency / per-PDF stats, context-overflow hypothesis test, etc. Each produces one number cited by the blog report. Citation surface: - reports/blog/multimodal_doc_parser_compare_n171_report.md: 1219-line technical writeup (16 sections) covering headline accuracy, per-format accuracy, McNemar pairwise significance, latency / token / per-PDF distributions, error analysis, retry experiment, post-retry final accuracy, cost amortization model with closed-form derivation, threats to validity, and reproducibility appendix. - data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/{raw, raw_retries,raw_post_retry}.jsonl + run_artifact.json + retry summary whitelisted via data/.gitignore as the verifiable numbers source. 
Gitignore: - ignore logs_*.txt + retry_run.log; structured artifacts cover the citation surface, debug logs are noise. - data/.gitignore default-ignores everything, whitelists the n=171 run artifacts only (parser manifest left ignored to avoid leaking local Windows usernames in absolute paths; manifest is fully regenerable via 'ingest multimodal_doc parser_compare'). - reports/.gitignore now whitelists hand-curated reports/blog/. Also retires the abandoned CRAG Task 3 implementation (download script, streaming Task 3 ingest, CragTask3Benchmark + tests) and trims the runner / ingest module APIs to match. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-07-02 22:01:05 +02:00 · 2026-05-14 19:54:41 -07:00 · 2026-05-14 19:54:41 -07:00 · 9bcd50164d
commit 9bcd50164d
parent 3737118050
40 changed files with 9303 additions and 993 deletions
--- a/surfsense_evals/.env.example
+++ b/surfsense_evals/.env.example
@@ -63,3 +63,21 @@ OPENROUTER_API_KEY=sk-or-...
 # Where generated reports (summary.md / summary.json) get written.
 # Default: <surfsense_evals>/reports/
 # EVAL_REPORTS_DIR=./reports
+
+# ---------------------------------------------------------------------------
+# 5. Parser SDKs — REQUIRED for the multimodal_doc / parser_compare suite
+# ---------------------------------------------------------------------------
+# parser_compare calls Azure Document Intelligence and LlamaParse SDKs
+# directly from the eval harness so each (basic / premium) extraction
+# is a clean A/B test independent of the SurfSense backend's ETL routing.
+#
+# Azure Document Intelligence — used for the `azure_basic_lc` (prebuilt-read)
+# and `azure_premium_lc` (prebuilt-layout) arms. Get an endpoint + key from
+# https://portal.azure.com (Document Intelligence resource, F0 / S0 tier).
+# AZURE_DI_ENDPOINT=https://<your-resource>.cognitiveservices.azure.com/
+# AZURE_DI_KEY=<your-32-char-key>
+#
+# LlamaCloud (LlamaParse) — used for `llamacloud_basic_lc` (parse_page_with_llm)
+# and `llamacloud_premium_lc` (parse_page_with_agent). Get a key from
+# https://cloud.llamaindex.ai/api-key.
+# LLAMA_CLOUD_API_KEY=llx-...
--- a/surfsense_evals/.gitignore
+++ b/surfsense_evals/.gitignore
@@ -27,3 +27,9 @@ htmlcov/
 .env.local
 .env.*.local
 !.env.example
+
+# Run / debug logs — keep these ephemeral. The structured artifacts in
+# `data/.../<run_id>/` are the citation surface, not these logs.
+*.log
+logs_*.txt
+retry_run.log
--- a/surfsense_evals/data/.gitignore
+++ b/surfsense_evals/data/.gitignore
@@ -1,2 +1,22 @@
+# Default: don't track anything under data/ (large benchmarks, PDFs,
+# extracted markdown, ingestion caches, per-run artifacts can balloon).
 *
+
+# Always keep this gitignore file
 !.gitignore
+
+# Whitelist the artifacts the n=171 multimodal_doc / parser_compare blog
+# report (`reports/blog/multimodal_doc_parser_compare_n171_report.md`)
+# cites by path. These are the verifiable numbers source for the report.
+#
+# Each path component must be un-ignored one level at a time: git cannot
+# re-include a file while any of its parent directories is still excluded.
+!multimodal_doc/
+!multimodal_doc/runs/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
+!multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw.jsonl
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_post_retry.jsonl
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries.jsonl
@ -0,0 +1,37 @@
+{"qid": "05-03-18-political-release.pdf::Q000", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "541", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:15Z", "latency_ms": 20985, "error": null, "raw_text_chars": 696}]}, "question_id": "05-03-18-political-release.pdf::Q000", "answer_letter": null, "raw_text": "I need to find information about how many adults rated Trump's government ethical standards as \"poor\" in the survey conducted April 25-May 1, 2018.\n\nLooking through the document, I find relevant information on page 4 (document page 6):\n\nThe document states: \"About four-in-ten Americans rate the ethical standards of Trump administration officials as excellent (9%) or good (30%), while 58% say they are not good (21%) or poor (36%).\"\n\nThe survey was conducted among 1,503 adults (as stated in the methodology section on page 14).\n\nTo find the number of adults who rated the ethical standards as \"poor\":\n36% of 1,503 = 0.36 \u00d7 1,503 = 541.08\n\nRounding to the nearest whole number: 541\n\nAnswer: 541", "citations": [], "input_tokens": 33894, "output_tokens": 191, "cost_micros": 104547, "latency_ms": 20968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "541", "normalised_gold": "541"}}
+{"qid": "05-03-18-political-release.pdf::Q002", "doc_id": "05-03-18-political-release.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Make good decisions about economic policy ', 'Make wise decisions about immigration policy ']", "pages": 17, "document_id": 5219, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:16Z", "latency_ms": 20453, "error": null, "raw_text_chars": 544}]}, "question_id": "05-03-18-political-release.pdf::Q002", "answer_letter": null, "raw_text": "Looking at page 5 of the document, which shows \"Public's confidence in Trump to handle several policy areas mixed,\" I can see the data for \"very confident\" and \"not at all confident\" percentages across different domains.\n\nFor \"Very confident\" (highest percentage):\n- Negotiate favorable trade agreements with other countries: 30%\n\nFor \"Not at all confident\" (highest percentage):\n- Make wise decisions about immigration policy: 55%\n\nAnswer: Make wise decisions about immigration policy, Negotiate favorable trade agreements with other countries", "citations": [], "input_tokens": 33942, "output_tokens": 117, "cost_micros": 103581, "latency_ms": 20438, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "05-03-18-political-release.pdf"}, "graded": {"correct": false, "f1": 0.5, "method": "list_set", "normalised_pred": "make wise decisions about immigration policy, negotiate favorable trade agreements with other countries", "normalised_gold": "make good decisions about economic policy, make wise decisions about immigration policy"}}
+{"qid": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "doc_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "6", "pages": 16, "document_id": 5220, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 41750, "error": null, "raw_text_chars": 1479}]}, "question_id": "0b85477387a9d0cc33fca0f4becaa0e5.pdf::Q005", "answer_letter": null, "raw_text": "I need to count the pictures in the document that contain only one person.\n\nLet me go through the document page by page:\n\nPage 1:\n- Photo of Dr. Xiaoyan \"Cathy\" Zou with Rear Adm. (Ret.) Tim Ziemer - 2 people\n\nPage 2:\n- Photo of Capt. John W. Sanders - 1 person\n- Photo of Capt. Steven M. Wechsler - 1 person\n\nPage 3:\n- Photo of Afghan trainees in laboratory - multiple people\n\nPage 4:\n- Group photo of Pacific Partnership 2012 Internal Medicine Department staff - multiple people\n- Photo of Cmdr. Charmagne Beckett on flight deck - 1 person\n\nPage 5:\n- Photo of Petty Officer 1st Class Dennis Gonzales - 1 person\n\nPage 6:\n- Photo of Lance Cpl. Kip Boker and Cpl. Jacob Echeverri - 2 people\n\nPage 7:\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Buhari Oyofo, Dr. Walter T. Gwenigale, Lt. Joseph Diclaro, and Dr. Fatorma Bolay - 5 people\n- Photo with Lt. Cmdr. Jennifer Curry, Capt. Chris Martinez, Capt. Buhari Oyofo, Col. Vernon Graham, and Lt. Joseph Diclaro - 5 people\n\nPage 8:\n- Group photo of Kazakh scientists and NMRC staff - multiple people\n\nPage 9:\n- Photo of Lt. j.g. Michael Rucker treating a child - 2 people\n- Photo of U.S. Marines and Sailors in aircraft - multiple people\n\nPage 10:\n- Group photo of Joint Combat Casualty Research Team - multiple people\n\nPage 12:\n- Photo of NMRC 2012 Dining Out Event - multiple people\n- Photo with Rear Adm. 
Bruce Doll, Capt. John Sanders and others - multiple people\n\nPage 13:\n- Photo with Ellen Flannery, Emmanuel Adu-Gyamfi, Eil", "citations": [], "input_tokens": 40387, "output_tokens": 512, "cost_micros": 128841, "latency_ms": 41735, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "length", "pdf_filename": "0b85477387a9d0cc33fca0f4becaa0e5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "6"}}
+{"qid": "2005.12872v3.pdf::Q005", "doc_id": "2005.12872v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "36", "pages": 26, "document_id": 7577, "original_error": "JSONDecodeError: Expecting value: line 169 column 1 (char 924)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:50:36Z", "latency_ms": 36889, "error": null, "raw_text_chars": 1214}]}, "question_id": "2005.12872v3.pdf::Q005", "answer_letter": null, "raw_text": "I need to find information about DETR's default structure, specifically the number of multi-head self-attention layers.\n\nLet me search through the document for information about DETR's architecture.\n\nFrom page 2, I can see that DETR has a transformer encoder-decoder architecture.\n\nOn page 10 (Section 4.1), I find: \"To be comparable in the number of parameters we choose a model with 6 transformer and 6 decoder layers of width 256 with 8 attention heads.\"\n\nLooking at the detailed architecture description on page 22 (Section A.3) and Figure 10, I can see that:\n- The encoder has multiple layers, each containing multi-head self-attention\n- The decoder has multiple layers, each containing multi-head self-attention AND multi-head attention (cross-attention)\n\nFrom page 9-10, the baseline model uses \"6 encoder, 6 decoder layers\"\n\nEach encoder layer has one multi-head self-attention layer.\nEach decoder layer has one multi-head self-attention layer and one multi-head cross-attention layer.\n\nSo the total number of multi-head self-attention layers (not counting cross-attention) is:\n- 6 encoder layers \u00d7 1 self-attention per layer = 6\n- 6 decoder layers \u00d7 1 self-attention per layer = 6\n- Total = 12\n\nAnswer: 12", "citations": [], "input_tokens": 59866, "output_tokens": 318, "cost_micros": 184368, "latency_ms": 36875, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", 
"finish_reason": "stop", "pdf_filename": "2005.12872v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "36"}}
+{"qid": "2306.05425v1.pdf::Q002", "doc_id": "2306.05425v1.pdf", "arm": "llamacloud_basic_lc", "answer_format": "int", "gold": "6", "pages": 22, "document_id": 7587, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:13Z", "latency_ms": 11328, "error": null, "raw_text_chars": 1376}]}, "question_id": "2306.05425v1.pdf::Q002", "answer_letter": null, "raw_text": "I need to carefully examine the document for bar plots.\n\nLooking through the figures in the document:\n\n1. **Figure 1**: Shows example images with text descriptions - no bar plots\n2. **Figure 2**: Shows data format comparison with images and text - no bar plots\n3. **Figure 3**: Shows the Syphus overview diagram - no bar plots\n4. **Figure 4**: This appears to show statistics and analysis. Let me examine it carefully:\n   - Part (a): Shows verb-noun structure for instructions - this appears to be a bar chart\n   - Part (b): Shows verb-noun structure for responses - this appears to be a bar chart\n   - Part (c): Shows \"Diversity in our dataset\" with what appears to be distribution plots\n\nLooking at Figure 4 more carefully:\n- (a) contains what looks like a horizontal bar chart showing verbs and nouns from instructions\n- (b) contains what looks like a horizontal bar chart showing verbs and nouns from responses\n- (c) contains multiple distribution plots showing lengths and counts\n\n5. **Figure 5**: Shows response examples - no bar plots\n6. **Figure 6**: The document mentions Fig. 6(b) and Fig. 
6(c) but these are not fully visible in the provided pages\n\nFrom what I can see clearly in Figure 4:\n- Subfigure (a): 1 bar plot (showing instruction verb-noun structure)\n- Subfigure (b): 1 bar plot (showing response verb-noun structure)\n\nTotal visible bar plots: 2\n\nAnswer: 2", "citations": [], "input_tokens": 20240, "output_tokens": 354, "cost_micros": 66030, "latency_ms": 11328, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "6"}}
+{"qid": "2307.09288v2.pdf::Q000", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:18Z", "latency_ms": 52514, "error": null, "raw_text_chars": 860}]}, "question_id": "2307.09288v2.pdf::Q000", "answer_letter": null, "raw_text": "I'll examine pages 5-10 to count the tables.\n\nPage 5: No tables visible in the text content shown.\n\nPage 6: Table 1 is present - \"Llama 2 family of models\" showing training data, parameters, context length, GQA, tokens, and LR for Llama 1 and Llama 2 models.\n\nPage 7: \n- Table 2: \"CO2 emissions during pretraining\" showing time, power consumption, and carbon emitted for different Llama 2 models.\n- Table 3: \"Overall performance on grouped academic benchmarks\" comparing MPT, Falcon, Llama 1, and Llama 2 models.\n\nPage 8:\n- Table 4: \"Comparison to closed-source models on academic benchmarks\" showing results for GPT-3.5, GPT-4, PaLM, PaLM-2-L, and Llama 2.\n- Table 5: \"SFT annotation\" showing examples of helpfulness and safety annotations.\n\nPage 9: No complete tables visible in the shown content.\n\nPage 10: No tables visible in the shown content.\n\nAnswer: 5", "citations": [], "input_tokens": 199213, "output_tokens": 280, "cost_micros": 601839, "latency_ms": 52468, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:51:25Z", "latency_ms": 40452, "error": null, "raw_text_chars": 575}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3 on page 4, which shows \"Safety human evaluation results for Llama 2-Chat compared to other open-source and closed-source models,\" I need to identify the models with the lowest violation percentages (since lower is better for safety).\n\nFrom the bar chart showing \"Violation % (lower is safer)\":\n- Llama 2-70b-chat has the lowest violation percentage (approximately 0-5%)\n- Llama 2-13b-chat has the second lowest (approximately 0-5%)\n- Llama 2-34b-chat has the third lowest (approximately 5-10%)\n\nAnswer: Llama 2-70b-chat, Llama 2-13b-chat, Llama 2-34b-chat", "citations": [], "input_tokens": 199224, "output_tokens": 182, "cost_micros": 600402, "latency_ms": 40406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 34b chat, llama 2 70b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q002", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "list", "gold": "['Vicuna13b-v1.3', 'PaLM Bison', 'Vicuna13b-v1.1']", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:05Z", "latency_ms": 10343, "error": null, "raw_text_chars": 376}]}, "question_id": "2307.09288v2.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Figure 3, which shows \"Safety human evaluation results for LLAMA 2-CHAT compared to other open-source and closed-source models,\" I can see the violation percentages (lower is safer):\n\nThe top-3 safest models (lowest violation %) are:\n1. Llama-2 7b-chat: 3%\n2. Llama-2 13b-chat: 3%\n3. Llama-2 70b-chat: 4%\n\nAnswer: Llama-2 7b-chat, Llama-2 13b-chat, Llama-2 70b-chat", "citations": [], "input_tokens": 105645, "output_tokens": 151, "cost_micros": 319200, "latency_ms": 10343, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "llama 2 13b chat, llama 2 70b chat, llama 2 7b chat", "normalised_gold": "palm bison, vicuna13b v1 1, vicuna13b v1 3"}}
+{"qid": "2307.09288v2.pdf::Q004", "doc_id": "2307.09288v2.pdf", "arm": "llamacloud_premium_lc", "answer_format": "str", "gold": "Llama 1 33B", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:10Z", "latency_ms": 29391, "error": null, "raw_text_chars": 525}]}, "question_id": "2307.09288v2.pdf::Q004", "answer_letter": null, "raw_text": "Looking at Table 23 in the document, which shows \"Comparison to open-source models on reading comprehension (SQUAD and QUAC)\", I need to find the QUAC (f1) 0-shot column.\n\nFrom the table, the QUAC (f1) 0-shot scores are:\n- MPT 7B: 38.0\n- MPT 30B: 40.4\n- Falcon 7B: 24.0\n- Falcon 40B: 41.2\n- LLAMA 1 7B: 38.9\n- LLAMA 1 13B: 39.9\n- LLAMA 1 33B: 44.1\n- LLAMA 1 65B: 41.0\n- LLAMA 2 7B: 39.4\n- LLAMA 2 13B: 42.7\n- LLAMA 2 34B: 42.9\n- LLAMA 2 70B: 42.4\n\nThe highest score is 44.1, which belongs to LLAMA 1 33B.\n\nAnswer: LLAMA 1 33B", "citations": [], "input_tokens": 105649, "output_tokens": 274, "cost_micros": 321057, "latency_ms": 29391, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "llama 1 33b", "normalised_gold": "llama 1 33b"}}
+{"qid": "2307.09288v2.pdf::Q007", "doc_id": "2307.09288v2.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "15", "pages": 77, "document_id": 7588, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:15Z", "latency_ms": 43610, "error": null, "raw_text_chars": 650}]}, "question_id": "2307.09288v2.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 9 and count the emojis in the left and right subfigures.\n\nLooking through the document, I find Figure 9 on page 16 (document page 16). The figure shows \"Issues with multi-turn memory (left) can be improved with GAtt (right).\"\n\nLooking at the left subfigure:\n- It shows a conversation with text-only responses (no emojis visible)\n\nLooking at the right subfigure:\n- First response: 4 emojis (\ud83d\udc4b\ud83d\udc41\ufe0f\ud83d\udc44\ud83d\udc41\ufe0f)\n- Second response: 4 emojis (\u2708\ufe0f\u2708\ufe0f\ud83d\udeeb\ud83d\udcba)\n- Third response: 4 emojis (\ud83e\uddb4\ud83e\udd95\ud83e\uddb4\ud83d\udd25)\n- Fourth response: 2 emojis (\ud83c\uddee\ud83c\uddf9\ud83c\udf5d\ud83d\udc90)\n\nTotal in right subfigure: 4 + 4 + 4 + 2 = 14 emojis\n\nTotal in left subfigure: 0 emojis\n\nDifference: 14 - 0 = 14\n\nAnswer: 14", "citations": [], "input_tokens": 199223, "output_tokens": 267, "cost_micros": 601674, "latency_ms": 43578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2307.09288v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "14", "normalised_gold": "15"}}
+{"qid": "2309.17421v2.pdf::Q000", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Blue", "pages": 166, "document_id": 7589, "original_error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:40Z", "latency_ms": 43593, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. 
This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:53:24Z", "latency_ms": 44391, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:54:10Z", "latency_ms": 35797, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:54:48Z", "latency_ms": 42266, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:55:41Z", "latency_ms": 31983, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 31843, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "blue"}}
+{"qid": "2309.17421v2.pdf::Q001", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "YouTube Music", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:52:59Z", "latency_ms": 33250, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:53:33Z", "latency_ms": 40889, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:54:16Z", "latency_ms": 51515, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:55:10Z", "latency_ms": 54500, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:56:14Z", "latency_ms": 32641, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 32500, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "youtube music"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:13Z", "latency_ms": 42110, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:56:56Z", "latency_ms": 52202, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:57:50Z", "latency_ms": 40469, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:58:34Z", "latency_ms": 49766, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T01:59:30Z", "latency_ms": 42110, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 41968, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q002", "doc_id": "2309.17421v2.pdf", "arm": "azure_premium_lc", "answer_format": "float", "gold": "$49.99", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:56:47Z", "latency_ms": 24578, "error": null, "raw_text_chars": 640}]}, "question_id": "2309.17421v2.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Figure 111 in the document and identify the price of the keyboard shown in the screenshot.\n\nLooking through the document, I can find Figure 111 which is part of Section 9.8 on GUI Navigation, specifically about Online Shopping for an Ergonomic Keyboard.\n\nIn Figure 111, the screenshot shows a shopping cart with a keyboard item. The text indicates:\n\n\"Subtotal $4999\"\n\nAnd in the cart details, it shows:\n\"Kensington Pro Fit Ergonomic Wireless Keyboard - Black (K7540...\n$4999\"\n\nThe price shown is $49.99 (represented as $4999 in the format used in the screenshot, which appears to be missing the decimal point).\n\nAnswer: 49.99", "citations": [], "input_tokens": 133648, "output_tokens": 176, "cost_micros": 403584, "latency_ms": 24578, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "float_tol", "normalised_pred": "49.99", "normalised_gold": "49.99"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T01:57:11Z", "latency_ms": 69062, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T01:58:21Z", "latency_ms": 35639, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T01:58:59Z", "latency_ms": 51312, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T01:59:54Z", "latency_ms": 49218, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:00:48Z", "latency_ms": 56375, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. 
This typically indicates the origin is overlo", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q003", "doc_id": "2309.17421v2.pdf", "arm": "azure_basic_lc", "answer_format": "float", "gold": "76%", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:12Z", "latency_ms": 7967, "error": null, "raw_text_chars": 594}]}, "question_id": "2309.17421v2.pdf::Q003", "answer_letter": null, "raw_text": "I need to find Figure 107 in the document and identify the battery percentage shown in that screenshot.\n\nLooking through the document for Figure 107, I can see it's in Section 9.8 on GUI Navigation, specifically related to online shopping.\n\nIn the image caption list, Figure 107 is described as \"Section 9.8: online shopping.\"\n\nLooking at the actual Figure 107 in the document, I can see it's a smartphone screenshot showing an Amazon shopping interface. At the top of the screenshot, there is a time display \"5:32\" and next to it shows \"75\" which represents the battery percentage.\n\nAnswer: 75", "citations": [], "input_tokens": 117429, "output_tokens": 143, "cost_micros": 354432, "latency_ms": 7967, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "float_tol", "normalised_pred": "75.0", "normalised_gold": "76.0"}}
+{"qid": "2309.17421v2.pdf::Q004", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Ukraine", "pages": 166, "document_id": 7589, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:00:20Z", "latency_ms": 33344, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:00:54Z", "latency_ms": 43733, "error": "HTTPStatusError: OpenRouter HTTP 502: {\"type\":\"https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-502/\",\"title\":\"Error 502: Bad gateway\",\"status\":502,\"detail\":\"The origin web server returned an invalid or incomplete response to Cloudflare. This typically indicates the origin is overlo", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:01:39Z", "latency_ms": 33390, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:02:16Z", "latency_ms": 33641, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:02:59Z", "latency_ms": 38000, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 37859, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "ukraine"}}
+{"qid": "2309.17421v2.pdf::Q005", "doc_id": "2309.17421v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "JoooDeee", "pages": 166, "document_id": 7589, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:01:45Z", "latency_ms": 35218, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:02:21Z", "latency_ms": 44921, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:03:08Z", "latency_ms": 52842, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:04:05Z", "latency_ms": 27250, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:04:41Z", "latency_ms": 48031, "error": "EmptyResponse: stream ended with no text", "raw_text_chars": 0}]}, "question_id": "2309.17421v2.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 47889, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": null, "pdf_filename": "2309.17421v2.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "jooodeee"}}
+{"qid": "2310.05634v2.pdf::Q005", "doc_id": "2310.05634v2.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "If it can be partially verified by the knowledge graph G", "pages": 24, "document_id": 7590, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:37Z", "latency_ms": 20843, "error": null, "raw_text_chars": 345}]}, "question_id": "2310.05634v2.pdf::Q005", "answer_letter": null, "raw_text": "Looking at the document, specifically on page 2, the task formulation section discusses the \"Setting of Conscious Incompetence.\"\n\nThe relevant text states: \"A sentence can map to both [NA] and a list of sub-graph knowledge if it can be partially verified by the knowledge graph G.\"\n\nAnswer: if it can be partially verified by the knowledge graph", "citations": [], "input_tokens": 60205, "output_tokens": 80, "cost_micros": 181815, "latency_ms": 20828, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.05634v2.pdf"}, "graded": {"correct": true, "f1": 0.9473684210526316, "method": "str_norm", "normalised_pred": "if it can be partially verified by knowledge graph", "normalised_gold": "if it can be partially verified by knowledge graph g"}}
+{"qid": "2310.07609v1.pdf::Q002", "doc_id": "2310.07609v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "5", "pages": 10, "document_id": 7591, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:03:57Z", "latency_ms": 14639, "error": null, "raw_text_chars": 1219}]}, "question_id": "2310.07609v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at the document, I need to identify which modules use Large Language Models (LLMs) as their backbone.\n\nFrom Section 3, the QACHECK system has 5 modules:\n1. **Claim Verifier** - Section 3.1 states it's \"based on InstructGPT\" (an LLM)\n2. **Question Generator** - Section 3.2 states \"Similar to the claim verifier, we also leverage InstructGPT\" (an LLM)\n3. **Question Answering Model** - Section 3.3 describes three implementations, with the default being \"GPT Reciter-Reader\" which uses InstructGPT (an LLM). However, this module also has non-LLM options (Retriever-Reader, FLAN-T5).\n4. **QA Validator** - Section 3.4 states \"We again implement the validator by prompting InstructGPT\" (an LLM)\n5. 
**Reasoner** - Section 3.5 provides \"two different implementations: 1) the end-to-end QA model based on FLAN-T5, and 2) the InstructGPT model\"\n\nCounting the modules that use LLM backbones:\n- Claim Verifier: Yes (InstructGPT)\n- Question Generator: Yes (InstructGPT)\n- Question Answering Model: Yes (uses GPT Reciter-Reader by default)\n- QA Validator: Yes (InstructGPT)\n- Reasoner: Yes (InstructGPT is one of the two implementations)\n\nAll 5 modules have LLM implementations, with some offering alternatives.\n\nAnswer: 5", "citations": [], "input_tokens": 27149, "output_tokens": 403, "cost_micros": 87492, "latency_ms": 14639, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2310.07609v1.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "5", "normalised_gold": "5"}}
+{"qid": "2311.16502v3.pdf::Q000", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "287", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:12Z", "latency_ms": 6562, "error": null, "raw_text_chars": 406}]}, "question_id": "2311.16502v3.pdf::Q000", "answer_letter": null, "raw_text": "Looking at the document, I need to find information about the sociology subject in MMMU.\n\nIn Figure 3, which shows the distribution of subjects across disciplines, I can see:\n\nUnder \"Humanities & Social Sci. (9%)\":\n- History (313, 2.71%)\n- Literature (147, 1.27%)\n- Psychology (340, 2.94%)\n- **Sociology (287, 2.48%)**\n\nThe number 287 represents the number of samples in the sociology subject.\n\nAnswer: 287", "citations": [], "input_tokens": 103232, "output_tokens": 123, "cost_micros": 311541, "latency_ms": 6562, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "287", "normalised_gold": "287"}}
+{"qid": "2311.16502v3.pdf::Q002", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "2", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:04:19Z", "latency_ms": 62062, "error": null, "raw_text_chars": 760}]}, "question_id": "2311.16502v3.pdf::Q002", "answer_letter": null, "raw_text": "I need to find Economics-related error cases that fall into the \"Perceptual Error\" category.\n\nLet me search through the document for Economics error cases:\n\nFrom the Case Study section (pages 20-21), I can see Table 10 which shows the index of case study figures by subjects with associated error categories.\n\nLooking at the Economics row in Table 10:\n- Economics has: Correct Case (22), Perceptual Error (23, 24)\n\nThis indicates that there are 2 Economics-related error cases in the Perceptual Error category (Figures 23 and 24).\n\nLet me verify by checking the actual figures:\n- Figure 23 (page 37): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n- Figure 24 (page 38): Economics: Macroeconomics - Error Category: Perceptual Error \u2713\n\nAnswer: 2", "citations": [], "input_tokens": 259127, "output_tokens": 212, "cost_micros": 1559532, "latency_ms": 62014, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "int_eq", "normalised_pred": "2", "normalised_gold": "2"}}
+{"qid": "2311.16502v3.pdf::Q003", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_basic_lc", "answer_format": "list", "gold": "['Thermodynamics', 'Heat Transfer', 'Fluid Mechanics']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:21Z", "latency_ms": 5093, "error": null, "raw_text_chars": 174}]}, "question_id": "2311.16502v3.pdf::Q003", "answer_letter": null, "raw_text": "Looking at Table 11 in the document which lists the subfields of each subject:\n\nFor Energy & Power, the subfields are:\n\nAnswer: Thermodynamics, Heat Transfer, Fluid Mechanics", "citations": [], "input_tokens": 83153, "output_tokens": 47, "cost_micros": 250164, "latency_ms": 5093, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "fluid mechanics, heat transfer, thermodynamics", "normalised_gold": "fluid mechanics, heat transfer, thermodynamics"}}
+{"qid": "2311.16502v3.pdf::Q007", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "17", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:26Z", "latency_ms": 64485, "error": null, "raw_text_chars": 1513}]}, "question_id": "2311.16502v3.pdf::Q007", "answer_letter": null, "raw_text": "I need to find Figure 1 in the document and count the different icons shown as different image types.\n\nLooking at page 1, Figure 1 is titled \"Overview of the MMMU dataset\" and shows four main sections:\n\n1. **Comprehensive Disciplines** - Shows icons/images representing different disciplines (Engineering, Science, Art & Design, Business, Humanities & Social Sci., Medicine)\n\n2. **Heterogeneous Image Types** - Shows various image type icons including:\n   - Diagrams\n   - Tables\n   - Plots and Charts\n   - Photographs\n   - Chemical Structures\n   - Paintings\n   - Medical Images\n   - Sheet Music\n   - Geometric\n   - Pathology images\n   - Microscopic Images\n   - Comics\n\n3. **Interleaved Text and Images** - Shows a medical question example with 3 MRI images\n\n4. **Expert-level Skills Test** - Shows a diagram with three components: Perception, Knowledge, and Reasoning\n\nThe question asks specifically about \"different icon are shown as different image types\" in Figure 1. Looking at the \"Heterogeneous Image Types\" section, I can count the distinct image type icons shown.\n\nCounting the image types listed in the \"Heterogeneous Image Types\" section:\nDiagrams, Tables, Plots and Charts, Photographs, Chemical Structures, Paintings, Medical Images, Sheet Music, Geometric, Pathology images, Microscopic Images, Comics, and there's an ellipsis (...) 
suggesting more types.\n\nBased on what's explicitly visible and labeled in Figure 1, I can count approximately 12 distinct image type representations shown.\n\nAnswer: 12", "citations": [], "input_tokens": 259123, "output_tokens": 378, "cost_micros": 1563243, "latency_ms": 64406, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "12", "normalised_gold": "17"}}
+{"qid": "2311.16502v3.pdf::Q009", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:05:29Z", "latency_ms": 75843, "error": null, "raw_text_chars": 831}]}, "question_id": "2311.16502v3.pdf::Q009", "answer_letter": null, "raw_text": "I need to count the charts that appear in pages 5-10 of the document.\n\nLet me go through each page:\n\n**Page 5:**\n- Figure 4: A comparison chart showing benchmarks plotted on a 2D graph with \"Breadth (Knowledge)\" on x-axis and \"Depth (Reasoning)\" on y-axis\n- A table comparing different datasets\nTotal: 1 chart\n\n**Page 6:**\n- Table 2: Results table (not a chart)\nTotal: 0 charts\n\n**Page 7:**\n- Figure 5: Bar chart showing \"Performance of models on different types of images\"\n- Table 3: Results table (not a chart)\nTotal: 1 chart\n\n**Page 8:**\n- Figure 6: Pie chart showing \"Error distribution over 150 annotated GPT-4V errors\"\nTotal: 1 chart\n\n**Page 9:**\n- No charts, only text\nTotal: 0 charts\n\n**Page 10:**\n- No charts, only text (references section)\nTotal: 0 charts\n\nTotal charts in pages 5-10: 1 + 0 + 1 + 1 + 0 + 0 = 3\n\nAnswer: 3", "citations": [], "input_tokens": 259120, "output_tokens": 289, "cost_micros": 1561222, "latency_ms": 75796, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:30Z", "latency_ms": 77217, "error": null, "raw_text_chars": 404}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 on page 8 of the document, which shows the \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe error categories and their percentages are:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 259144, "output_tokens": 133, "cost_micros": 1557856, "latency_ms": 77171, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q013", "doc_id": "2311.16502v3.pdf", "arm": "azure_premium_lc", "answer_format": "list", "gold": "['Perceptual Error', 'Lack of Knowledge', 'Reasoning Error']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:45Z", "latency_ms": 7750, "error": null, "raw_text_chars": 387}]}, "question_id": "2311.16502v3.pdf::Q013", "answer_letter": null, "raw_text": "Looking at Figure 6 \"Error distribution over 150 annotated GPT-4V errors\":\n\nThe pie chart shows the following error categories and their percentages:\n- Perceptual Error: 35%\n- Lack of Knowledge: 29%\n- Reasoning Error: 26%\n- Textual Understanding: 4%\n- Reject to Answer: 3%\n- Annotation Error: 2%\n- Answer Extraction Error: 1%\n\nAnswer: Perceptual Error, Lack of Knowledge, Reasoning Error", "citations": [], "input_tokens": 109199, "output_tokens": 125, "cost_micros": 329472, "latency_ms": 7750, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q014", "doc_id": "2311.16502v3.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Reasoning Error', 'Perceptual Error', 'Lack of Knowledge']", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:06:53Z", "latency_ms": 59125, "error": null, "raw_text_chars": 830}]}, "question_id": "2311.16502v3.pdf::Q014", "answer_letter": null, "raw_text": "I'll search through pages 95-100 for error types mentioned in the case studies.\n\nFrom page 95 (Figure 80 - Architecture and Engineering: Surveying and Mapping):\n- This is a correct case, no error type mentioned\n\nFrom page 96 (Figure 82 - Architecture and Engineering: Civil Engineering):\n- Error Category: Reasoning Error\n\nFrom page 97 (Figure 83 - Computer Science: Data Structure and Algorithm):\n- This is a correct case, no error type mentioned\n\nFrom page 98 (Figure 84 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error, Lack of Knowledge\n\nFrom page 99 (Figure 85 - Computer Science: Operating System):\n- Error Category: Perceptual Error\n\nFrom page 100 (Figure 86 - Computer Science: Compiler Principle):\n- Error Category: Perceptual Error\n\nAnswer: Reasoning Error, Perceptual Error, Lack of Knowledge", "citations": [], "input_tokens": 259136, "output_tokens": 226, "cost_micros": 1559901, "latency_ms": 59077, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2311.16502v3.pdf"}, "graded": {"correct": true, "f1": 1.0, "method": "list_set", "normalised_pred": "lack of knowledge, perceptual error, reasoning error", "normalised_gold": "lack of knowledge, perceptual error, reasoning error"}}
+{"qid": "2311.16502v3.pdf::Q015", "doc_id": "2311.16502v3.pdf", "arm": "llamacloud_premium_lc", "answer_format": "int", "gold": "4", "pages": 117, "document_id": 7593, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:48Z", "latency_ms": 11110, "error": null, "raw_text_chars": 1296}]}, "question_id": "2311.16502v3.pdf::Q015", "answer_letter": null, "raw_text": "I need to count the successful cases (correct predictions by GPT-4V) in Pages 65-70.\n\nLet me go through each figure in this range:\n\n**Page 65 - Figure 51 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (E) Familial polyposis coli\n- Ground Truth: (E) Familial polyposis coli\n- **SUCCESS**\n\n**Page 66 - Figure 52 (Clinical Medicine: Pathology)**\n- GPT-4V correctly identified (D) Ruptured berry aneurysm\n- Ground Truth: (D) Ruptured berry aneurysm\n- **SUCCESS**\n\n**Page 67 - Figure 53 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 68 - Figure 54 (Clinical Medicine: Pathology)**\n- GPT-4V answered (A) Human papillomavirus infection\n- Ground Truth: (E) Chronic inflammation\n- Error Category: Perceptual Error\n- **FAILURE**\n\n**Page 69 - Figure 55 (Clinical Medicine: Heart Disease)**\n- GPT-4V answered (B) Mitral regurgitation\n- Ground Truth: (C) Aortic value regurgitation\n- Error Category: Lack of Knowledge\n- **FAILURE**\n\n**Page 70 - Figure 56 (Diagnostics and Laboratory Medicine: Radiology)**\n- GPT-4V correctly identified (C) Dandy-Walker syndrome\n- Ground Truth: (C) Dandy-Walker syndrome\n- **SUCCESS**\n\nTotal successful cases: 3\n\nAnswer: 3", "citations": [], "input_tokens": 103232, "output_tokens": 424, "cost_micros": 316056, "latency_ms": 11110, 
"error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "3", "normalised_gold": "4"}}
+{"qid": "2312.10997v5.pdf::Q000", "doc_id": "2312.10997v5.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "Pre-training", "pages": 21, "document_id": 7596, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:52Z", "latency_ms": 23593, "error": null, "raw_text_chars": 1091}]}, "question_id": "2312.10997v5.pdf::Q000", "answer_letter": null, "raw_text": "I need to find the figure with a tree shape and identify which branch has the least leaves.\n\nLooking through the document, I can see Figure 1 on page 2, which shows a \"Technology tree of RAG research\" with a tree-like structure.\n\nExamining this tree diagram, I can see it has a trunk labeled \"Retrieval-Augmented Generation\" and branches extending upward. The main branches appear to be:\n\n1. A left branch (colored in teal/green) showing the evolution from pre-training through fine-tuning stages\n2. A middle branch (colored in orange) labeled \"Instructlens\" \n3. 
A right branch (colored in blue) showing the inference stage with multiple methods\n\nLooking at the number of \"leaves\" (individual methods/papers) on each branch:\n- The left branch (Pre-training/Fine-tuning) has several leaves\n- The middle branch (Instructlens) appears to have the fewest leaves - it's relatively short with minimal branching\n- The right branch (Inference) has many leaves with extensive branching\n\nThe branch labeled \"Instructlens\" in the middle appears to have the least number of leaves.\n\nAnswer: Instructlens", "citations": [], "input_tokens": 64633, "output_tokens": 269, "cost_micros": 197934, "latency_ms": 23577, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "engine": "native", "answer_letter_strategy": "none", "finish_reason": "stop", "pdf_filename": "2312.10997v5.pdf"}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "instructlens", "normalised_gold": "pre training"}}
+{"qid": "2401.18059v1.pdf::Q002", "doc_id": "2401.18059v1.pdf", "arm": "azure_premium_lc", "answer_format": "str", "gold": "Collapsed Tree Algorithm", "pages": 23, "document_id": 7597, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 1, "recovered": true, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:07:59Z", "latency_ms": 7156, "error": null, "raw_text_chars": 752}]}, "question_id": "2401.18059v1.pdf::Q002", "answer_letter": null, "raw_text": "Looking at Appendix F \"PSEUDOCODE FOR RETRIEVAL METHODS\":\n\n**Algorithm 1 Tree Traversal Algorithm:**\n- Lines: function declaration, initialization of Scurrent, for layer loop, initialization of topk, for node loop, score calculation, append to top_k, end for, Slayer assignment, Scurrent update, end for, return statement, end function\n- Total: 13 lines\n\n**Algorithm 2 Collapsed Tree Algorithm:**\n- Lines: function declaration, flatten tree, initialization of top_nodes, for node loop, append with dot product, end for, sort top_nodes, result initialization, total_tokens initialization, for node loop, if condition, result append, end if, total_tokens update, end for, return statement, end function\n- Total: 16 lines\n\nAnswer: Collapsed Tree Algorithm", "citations": [], "input_tokens": 27211, "output_tokens": 190, "cost_micros": 84483, "latency_ms": 7156, "error": null, "extra": {"model": "anthropic/claude-sonnet-4.5", "finish_reason": "stop"}, "graded": {"correct": true, "f1": 1.0, "method": "str_norm", "normalised_pred": "collapsed tree algorithm", "normalised_gold": "collapsed tree algorithm"}}
+{"qid": "2405.09818v1.pdf::Q000", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "list", "gold": "['Figure 5', 'Figure 6']", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:08:06Z", "latency_ms": 16641, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:08:23Z", "latency_ms": 11860, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:08:36Z", "latency_ms": 12422, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:08:54Z", "latency_ms": 19625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned 
error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:09:25Z", "latency_ms": 14078, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q000", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657610 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "list_set", "normalised_pred": "", "normalised_gold": "figure 5, figure 6"}}
+{"qid": "2405.09818v1.pdf::Q001", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "text tokens", "pages": 27, "document_id": 7598, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:08:16Z", "latency_ms": 13922, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:08:30Z", "latency_ms": 25952, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:08:57Z", "latency_ms": 14718, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:09:14Z", "latency_ms": 33452, "error": "HTTPStatusError: OpenRouter HTTP 400: 
{\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:09:54Z", "latency_ms": 22093, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q001", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657593 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "text tokens"}}
+{"qid": "2405.09818v1.pdf::Q003", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "18", "pages": 27, "document_id": 7598, "original_error": "SSLError: [SSL: SSLV3_ALERT_BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2559)", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:09:39Z", "latency_ms": 11906, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:09:52Z", "latency_ms": 15921, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:10:10Z", "latency_ms": 21921, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:10:34Z", "latency_ms": 11906, "error": "HTTPStatusError: OpenRouter HTTP 400: 
{\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:10:53Z", "latency_ms": 13686, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q003", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657608 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "18"}}
+{"qid": "2405.09818v1.pdf::Q004", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "int", "gold": "1", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:10:16Z", "latency_ms": 14281, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:10:32Z", "latency_ms": 12125, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:10:45Z", "latency_ms": 11952, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:11:01Z", "latency_ms": 20641, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n  
  \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:11:30Z", "latency_ms": 13985, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q004", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657583 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "int_eq", "normalised_pred": "None", "normalised_gold": "1"}}
+{"qid": "2405.09818v1.pdf::Q005", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "no", "pages": 27, "document_id": 7598, "original_error": null, "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:11:06Z", "latency_ms": 17157, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"{\\\"message\\\":\\\"Input is too long.\\\"}\",\"provider_name\":\"Amazon Bedrock\",\"is_byok\":false}},\"user_id\":\"user_3CNdnY1vL3Ln9TYRiGAii5kmBvu\"}", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:11:25Z", "latency_ms": 12125, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:11:40Z", "latency_ms": 13734, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:11:59Z", "latency_ms": 15828, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB 
limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:12:25Z", "latency_ms": 19985, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q005", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657607 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "no"}}
+{"qid": "2405.09818v1.pdf::Q007", "doc_id": "2405.09818v1.pdf", "arm": "native_pdf", "answer_format": "str", "gold": "150k", "pages": 27, "document_id": 7598, "original_error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "retry": {"max_attempts": 5, "n_attempts": 5, "recovered": false, "attempts": [{"attempt": 1, "started_iso": "2026-05-15T02:11:44Z", "latency_ms": 15875, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 2, "started_iso": "2026-05-15T02:12:00Z", "latency_ms": 16625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 3, "started_iso": "2026-05-15T02:12:19Z", "latency_ms": 12937, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB 
limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 4, "started_iso": "2026-05-15T02:12:35Z", "latency_ms": 14625, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}, {"attempt": 5, "started_iso": "2026-05-15T02:12:57Z", "latency_ms": 12156, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "raw_text_chars": 0}]}, "question_id": "2405.09818v1.pdf::Q007", "answer_letter": null, "raw_text": "", "citations": [], "input_tokens": 0, "output_tokens": 0, "cost_micros": 0, "latency_ms": 0, "error": "HTTPStatusError: OpenRouter HTTP 400: {\"error\":{\"message\":\"Provider returned error\",\"code\":400,\"metadata\":{\"raw\":\"[{\\n  \\\"error\\\": {\\n    \\\"code\\\": 400,\\n    \\\"message\\\": \\\"The message size (33657603 bytes) exceeds 30.000MB limit.\\\",\\n    \\\"status\\\": \\\"FAILED_PRECONDITION\\\"\\n  }\\n}\\n]\",\"provider_name\":\"Google\",\"is_byok\":false}},\"user_id", "extra": {}, "graded": {"correct": false, "f1": 0.0, "method": "str_norm", "normalised_pred": "", "normalised_gold": "150k"}}
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/raw_retries_summary.json
@ -0,0 +1,100 @@
+{
+  "config": {
+    "base_delay": 1.0,
+    "concurrency": 2,
+    "llm_model": "anthropic/claude-sonnet-4.5",
+    "max_attempts": 5,
+    "max_delay": 30.0,
+    "max_output_tokens": 512,
+    "pdf_engine": "native"
+  },
+  "elapsed_s": 1373.6,
+  "n_failed_rows_input": 37,
+  "n_retried": 37,
+  "per_arm": {
+    "azure_basic_lc": {
+      "attempts_distribution": [
+        1
+      ],
+      "recovered": 1,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 1
+    },
+    "azure_premium_lc": {
+      "attempts_distribution": [
+        1,
+        1,
+        1
+      ],
+      "recovered": 3,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 3
+    },
+    "llamacloud_basic_lc": {
+      "attempts_distribution": [
+        1,
+        1
+      ],
+      "recovered": 2,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 2
+    },
+    "llamacloud_premium_lc": {
+      "attempts_distribution": [
+        1,
+        1,
+        1,
+        1
+      ],
+      "recovered": 4,
+      "recovery_rate": 1.0,
+      "still_failed": 0,
+      "tried": 4
+    },
+    "native_pdf": {
+      "attempts_distribution": [
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        1,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5,
+        5
+      ],
+      "recovered": 15,
+      "recovery_rate": 0.5555555555555556,
+      "still_failed": 12,
+      "tried": 27
+    }
+  },
+  "raw_retries_path": "data\\multimodal_doc\\runs\\2026-05-14T00-53-19Z\\parser_compare\\raw_retries.jsonl",
+  "run_id": "2026-05-14T00-53-19Z",
+  "totals": {
+    "recovered": 25,
+    "still_failed": 12,
+    "tried": 37
+  }
+}
--- a/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
+++ b/surfsense_evals/data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/run_artifact.json
--- a/surfsense_evals/pyproject.toml
+++ b/surfsense_evals/pyproject.toml
@ -23,6 +23,9 @@ dependencies = [
    "python-dotenv>=1.0.0",
    "rich>=13.7.0",
    "trafilatura>=1.12.0",
+    "pypdf>=5.1.0",
+    "azure-ai-documentintelligence>=1.0.2",
+    "llama-cloud-services>=0.6.25",
 ]

 [project.optional-dependencies]
--- a/surfsense_evals/reports/.gitignore
+++ b/surfsense_evals/reports/.gitignore
@ -1,4 +1,13 @@
+# Default: don't track auto-generated `summary.md` / `summary.json` from
+# every benchmark run — those are derivative of `data/.../runs/<id>/`.
 *
 !.gitignore
+
+# Hand-curated sample report kept as a reference for the medical suite.
 !medical/
 !medical/sample_summary.md
+
+# Hand-curated blog-ready writeups (one per experiment) — these *are*
+# the public citation surface and must travel with the repo.
+!blog/
+!blog/*.md
--- a/surfsense_evals/reports/blog/multimodal_doc_parser_compare_n171_report.md
+++ b/surfsense_evals/reports/blog/multimodal_doc_parser_compare_n171_report.md
--- a/surfsense_evals/scripts/analyze_failure_timing.py
+++ b/surfsense_evals/scripts/analyze_failure_timing.py
@ -0,0 +1,125 @@
+"""Were the SSL failures clustered in time (network blip) or evenly
+distributed (sustained limit)? Each row's sequential position within its
+arm is used as a proxy for time: failures whose positions are at most 5
+apart are grouped into one cluster.
+
+Also: for the one *real* intrinsic failure — the 30MB Anthropic limit
+on 2405.09818v1.pdf::Q007 — print the full error message + raw payload
+sizes so the blog has a clean root cause.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+
+# Two levels above this file: the directory that contains ``scripts/``.
+REPO = Path(__file__).resolve().parents[1]
+# Artifact directory of the single run this script analyses (hard-coded run id).
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+# One JSON object per line; main() reads arm / qid / doc_id / error / raw_text / pages.
+RAW = RUN / "raw.jsonl"
+# Directory probed for ``<doc_id>`` when reporting the on-disk PDF size.
+PDFS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+
+
+def main() -> None:
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    # 1) SSL clustering: failures by question index per arm
+    by_arm_idx: dict[str, list[tuple[int, str]]] = defaultdict(list)
+    qid_order: dict[str, int] = {}
+    arm_seen_count: dict[str, int] = defaultdict(int)
+    for row in rows:
+        arm = row["arm"]
+        idx = arm_seen_count[arm]
+        arm_seen_count[arm] += 1
+        qid_order[f"{arm}::{row['qid']}"] = idx
+        err = row.get("error") or ""
+        cluster = "ssl" if "SSLError" in err else (
+            "empty" if not (row.get("raw_text") or "").strip() and not err else (
+                "5xx" if "502" in err or "503" in err else (
+                    "size_limit" if "exceeds" in err.lower() and "limit" in err.lower() else (
+                        "other_err" if err else "ok"
+                    )
+                )
+            )
+        )
+        if cluster != "ok":
+            by_arm_idx[arm].append((idx, cluster))
+
+    print("=" * 80)
+    print("SSL/network-error indices per arm (each arm processes 171 questions in")
+    print("order; index = sequential position within that arm). Tight clustering")
+    print("in time = transient blip, even spread = sustained limit.")
+    print("=" * 80)
+    for arm in sorted(by_arm_idx):
+        items = by_arm_idx[arm]
+        if not items:
+            continue
+        idxs = sorted(set(i for i, _ in items))
+        print(f"\n{arm}: {len(items)} failures at indices {idxs}")
+        # show clusters
+        cluster_runs = []
+        cur = [idxs[0]]
+        for i in idxs[1:]:
+            if i - cur[-1] <= 5:  # within 5 questions = same time window
+                cur.append(i)
+            else:
+                cluster_runs.append(cur)
+                cur = [i]
+        cluster_runs.append(cur)
+        print(f"   clusters (gap<=5): {len(cluster_runs)}: {cluster_runs}")
+
+    # 2) The 30MB intrinsic failure — full details
+    print()
+    print("=" * 80)
+    print("Intrinsic failure: 30MB Anthropic input limit on 2405.09818v1.pdf::Q007")
+    print("=" * 80)
+    for row in rows:
+        if row["qid"] == "2405.09818v1.pdf::Q007" and row["arm"] == "native_pdf":
+            err = row.get("error") or ""
+            print(f"  qid: {row['qid']}")
+            print(f"  doc: {row['doc_id']}, pages: {row.get('pages')}")
+            pdf_path = PDFS / row["doc_id"]
+            if pdf_path.exists():
+                size_mb = pdf_path.stat().st_size / (1024 * 1024)
+                print(f"  PDF size on disk: {size_mb:.1f} MB")
+                # base64 inflates ~33%
+                est_b64 = size_mb * 1.33
+                print(f"  estimated base64 wire size: {est_b64:.1f} MB")
+            print(f"  full error: {err[:600]}")
+            break
+
+    # 3) Per-PDF: which PDFs are pathological?
+    print()
+    print("=" * 80)
+    print("Per-PDF failure breakdown across all 6 arms (only PDFs with failures)")
+    print("=" * 80)
+    by_pdf: dict[str, list[dict]] = defaultdict(list)
+    for row in rows:
+        err = row.get("error") or ""
+        empty = not (row.get("raw_text") or "").strip()
+        if err or empty:
+            by_pdf[row["doc_id"]].append({
+                "arm": row["arm"],
+                "qid": row["qid"],
+                "err_kind": (
+                    "ssl" if "SSLError" in err
+                    else "size_limit" if "exceeds" in err.lower() and "limit" in err.lower()
+                    else "5xx" if "502" in err or "503" in err
+                    else "json_decode" if "JSONDecodeError" in err
+                    else "empty" if empty and not err
+                    else "other"
+                ),
+                "pages": row.get("pages"),
+            })
+    for doc, items in sorted(by_pdf.items(), key=lambda x: (-len(x[1]), x[0])):
+        kinds = Counter(i["err_kind"] for i in items)
+        arms = sorted({i["arm"] for i in items})
+        pages = items[0]["pages"]
+        print(f"  {doc}  pages={pages}  failures={len(items)}  arms={arms}")
+        print(f"     kinds: {dict(kinds)}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/analyze_failures.py
+++ b/surfsense_evals/scripts/analyze_failures.py
@ -0,0 +1,155 @@
+"""Drill into the parser_compare n=171 raw.jsonl to surface every
+failure, group by arm + PDF, and dump the underlying error strings so
+we can write up a clean failure-mode taxonomy for the blog post.
+
+Outputs (printed to stdout + written to `failures_n171.json`):
+* per-arm failure count and rate
+* per-PDF failure count across all arms (which docs are pathological?)
+* error-string clusters per arm (so we can give human-readable causes)
+* sample failure rows (one per cluster) for the appendix
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any
+
+
+# Two levels above this file: the directory that contains ``scripts/``.
+REPO = Path(__file__).resolve().parents[1]
+# Artifact directory of the single run this script analyses (hard-coded run id).
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+# Input: one JSON object per line.
+RAW = RUN / "raw.jsonl"
+# Output: the JSON summary that main() writes next to the scripts.
+OUT = REPO / "scripts" / "failures_n171.json"
+
+
+def _classify(error: str | None, raw_text: str) -> str:
+    """Return a coarse-grained bucket name for one failed row.
+
+    Args:
+        error: The row's error string, or ``None`` / empty when absent.
+        raw_text: The model's raw answer text (may be blank).
+
+    Returns:
+        ``"empty_response"`` when there is neither an error nor any answer
+        text; otherwise the first matching bucket in the order tested below
+        (``rate_limit``, ``context_overflow``, ``image_decode_failure``,
+        ``pdf_decode_failure``, ``timeout``, ``provider_5xx``,
+        ``missing_extraction``, ``provider_400``), ``"other_error"`` for any
+        unmatched error, and ``"unknown"`` when there is answer text but no
+        error.
+    """
+
+    blob = (error or "").lower()
+    if not blob and not raw_text.strip():
+        return "empty_response"
+
+    def has_status(*codes: str) -> bool:
+        # Match a status code only as a standalone token, so digits embedded
+        # in byte counts or opaque ids (e.g. "33650429 bytes") are not
+        # mistaken for an HTTP status.
+        pattern = r"(?<![0-9a-z])(?:" + "|".join(codes) + r")(?![0-9a-z])"
+        return re.search(pattern, blob) is not None
+
+    if "rate limit" in blob or has_status("429"):
+        return "rate_limit"
+    if "context_length" in blob or "context window" in blob or "too many tokens" in blob:
+        return "context_overflow"
+    if "could not process image" in blob or "invalid image" in blob:
+        return "image_decode_failure"
+    # Parenthesised to spell out the precedence the original expression had.
+    if "could not process pdf" in blob or ("invalid_request_error" in blob and "pdf" in blob):
+        return "pdf_decode_failure"
+    if "timeout" in blob or "timed out" in blob:
+        return "timeout"
+    if "5xx" in blob or "internal server error" in blob or has_status("502", "503"):
+        return "provider_5xx"
+    if "filenotfound" in blob:
+        return "missing_extraction"
+    if "badrequest" in blob:
+        return "provider_400"
+    if blob:
+        return "other_error"
+    return "unknown"
+
+
+def main() -> None:
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    by_arm_failures: dict[str, list[dict]] = defaultdict(list)
+    by_pdf_failures: dict[str, list[dict]] = defaultdict(list)
+    error_clusters: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
+
+    n_per_arm: dict[str, int] = defaultdict(int)
+    for row in rows:
+        arm = row["arm"]
+        n_per_arm[arm] += 1
+        err = row.get("error")
+        raw_text = row.get("raw_text") or ""
+        if err or not raw_text.strip():
+            cluster = _classify(err, raw_text)
+            entry = {
+                "qid": row["qid"],
+                "doc_id": row["doc_id"],
+                "answer_format": row["answer_format"],
+                "gold": row["gold"],
+                "error": err,
+                "cluster": cluster,
+                "raw_text_len": len(raw_text),
+                "pages": row.get("pages"),
+            }
+            by_arm_failures[arm].append(entry)
+            by_pdf_failures[row["doc_id"]].append({**entry, "arm": arm})
+            error_clusters[arm][cluster].append(entry)
+
+    print("=" * 90)
+    print("Per-arm failure count & rate")
+    print("=" * 90)
+    print(f"{'arm':<25} {'n':>4} {'fail':>5} {'rate%':>6}")
+    for arm in sorted(n_per_arm):
+        f = len(by_arm_failures[arm])
+        n = n_per_arm[arm]
+        print(f"{arm:<25} {n:>4} {f:>5} {f / n * 100:>5.1f}%")
+
+    print()
+    print("=" * 90)
+    print("Failure clusters per arm")
+    print("=" * 90)
+    for arm in sorted(error_clusters):
+        print(f"\n{arm}:")
+        for cluster, items in sorted(error_clusters[arm].items()):
+            print(f"  {cluster:<22} {len(items):>3}")
+            sample = items[0]
+            err_short = (sample["error"] or "")[:200].replace("\n", " ")
+            print(f"     example: {sample['qid']}  doc={sample['doc_id']} pages={sample['pages']}")
+            print(f"     error: {err_short}")
+
+    print()
+    print("=" * 90)
+    print("Per-PDF failure totals (PDFs with >=2 failures)")
+    print("=" * 90)
+    pdf_counts = Counter({pdf: len(rows) for pdf, rows in by_pdf_failures.items()})
+    for pdf, count in pdf_counts.most_common():
+        if count < 2:
+            break
+        arms_failed = sorted({r["arm"] for r in by_pdf_failures[pdf]})
+        pages = by_pdf_failures[pdf][0].get("pages")
+        print(f"  {pdf}  pages={pages}  failures={count}  arms={arms_failed}")
+
+    print()
+    print("=" * 90)
+    print("All native_pdf failures (one row per failure)")
+    print("=" * 90)
+    for entry in by_arm_failures.get("native_pdf", []):
+        err = (entry["error"] or "(no error string)")[:240].replace("\n", " ")
+        print(f"  {entry['qid']}  doc={entry['doc_id']} pages={entry['pages']} cluster={entry['cluster']}")
+        print(f"    err: {err}")
+
+    summary: dict[str, Any] = {
+        "per_arm": {
+            arm: {
+                "n": n_per_arm[arm],
+                "failures": len(by_arm_failures[arm]),
+                "rate": len(by_arm_failures[arm]) / n_per_arm[arm],
+                "clusters": {
+                    cluster: len(items)
+                    for cluster, items in error_clusters[arm].items()
+                },
+                "rows": by_arm_failures[arm],
+            }
+            for arm in sorted(n_per_arm)
+        },
+        "per_pdf": {
+            pdf: [
+                {**r, "arm": r["arm"]} for r in failures
+            ]
+            for pdf, failures in by_pdf_failures.items()
+        },
+    }
+    OUT.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    print(f"\nWrote: {OUT}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/check_extraction_sizes.py
+++ b/surfsense_evals/scripts/check_extraction_sizes.py
@ -0,0 +1,60 @@
+"""Sanity check extraction sizes against Sonnet 4.5's context window.
+
+Sonnet 4.5 supports ~200k tokens. As a *very* rough heuristic, English
+markdown is ~4 chars/token, so anything over ~790k chars (see
+``SAFE_CHARS`` below) likely won't fit alongside the system + question +
+512 max_output_tokens. Print warnings for any extraction that's at risk.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+# Two levels above this file: the directory that contains ``scripts/``.
+REPO = Path(__file__).resolve().parents[1]
+# JSONL manifest; main() reads ``doc_id`` and ``extractions[arm]["chars"]`` from each row.
+MAP = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+
+# Rough chars-per-token ratio used to convert the token budget into characters.
+CHARS_PER_TOKEN = 4
+CTX_TOKENS = 200_000
+PROMPT_OVERHEAD_TOKENS = 1_000  # system + question + format hint
+MAX_OUTPUT_TOKENS = 512
+# Character budget left for the extraction itself: (200_000 - 1_000 - 512) * 4 = 793_952.
+SAFE_CHARS = (CTX_TOKENS - PROMPT_OVERHEAD_TOKENS - MAX_OUTPUT_TOKENS) * CHARS_PER_TOKEN
+
+
+def main() -> None:
+    rows = [
+        json.loads(line)
+        for line in MAP.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+    total = len(rows)
+    arm_max: dict[str, tuple[int, str]] = {}
+    overflows: list[tuple[str, str, int]] = []
+    for row in rows:
+        for arm, ext in (row.get("extractions") or {}).items():
+            chars = int(ext.get("chars") or 0)
+            if arm not in arm_max or arm_max[arm][0] < chars:
+                arm_max[arm] = (chars, row["doc_id"])
+            if chars > SAFE_CHARS:
+                overflows.append((row["doc_id"], arm, chars))
+
+    print(f"PDFs in manifest: {total}")
+    print(f"safe char budget: {SAFE_CHARS:,}  (~{(SAFE_CHARS // CHARS_PER_TOKEN):,} tokens)")
+    print()
+    print("largest extraction per arm:")
+    for arm, (chars, doc_id) in sorted(arm_max.items()):
+        print(f"  {arm:25s}  {chars:>10,} chars  ({doc_id})")
+
+    print()
+    if overflows:
+        print(f"OVERFLOW RISK ({len(overflows)} extractions > safe budget):")
+        for doc_id, arm, chars in overflows:
+            est_tokens = chars // CHARS_PER_TOKEN
+            print(f"  {doc_id} :: {arm} :: {chars:,} chars (~{est_tokens:,} tokens)")
+    else:
+        print("no overflow risk — all extractions fit Sonnet 4.5's 200k context.")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/check_uploaded_status.py
+++ b/surfsense_evals/scripts/check_uploaded_status.py
@ -0,0 +1,77 @@
+"""Query SurfSense for the status of every MMLongBench PDF in scope.
+
+Calls the SurfSense REST API directly (via ``httpx``) at
+``/api/v1/documents/status?document_ids=...`` for both the known-existing 5
+PDFs (doc ids 5219-5223) and the recently-uploaded mmlongbench batch
+(doc ids 7577-7624).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+
+import httpx
+from dotenv import load_dotenv
+
+
+# Two levels above this file: the directory that contains ``scripts/``.
+REPO = Path(__file__).resolve().parents[1]
+# Local PDF cache; main() lists its ``*.pdf`` file names.
+PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+
+
+async def main() -> None:
+    load_dotenv(REPO / ".env")
+    base = os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/")
+    token = os.environ.get("SURFSENSE_JWT")
+    if not token:
+        raise SystemExit("SURFSENSE_JWT missing from .env")
+
+    pdf_names = sorted(p.name for p in PDF_DIR.glob("*.pdf"))
+    print(f"local cached PDFs: {len(pdf_names)}")
+
+    candidate_ids = list(range(5219, 5224)) + list(range(7577, 7625))
+
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Accept": "application/json",
+    }
+    async with httpx.AsyncClient(timeout=30.0) as http:
+        r = await http.get(
+            f"{base}/api/v1/documents/status",
+            params={
+                "search_space_id": 55,
+                "document_ids": ",".join(str(d) for d in candidate_ids),
+            },
+            headers=headers,
+        )
+        r.raise_for_status()
+        items = r.json().get("items", [])
+
+    by_title: dict[str, dict] = {}
+    for it in items:
+        by_title[it.get("title", "")] = {
+            "id": it.get("id"),
+            "state": (it.get("status") or {}).get("state"),
+            "reason": (it.get("status") or {}).get("reason"),
+        }
+
+    by_state: dict[str, int] = {}
+    print()
+    for name in pdf_names:
+        info = by_title.get(name)
+        if info is None:
+            print(f"  [missing      ]              {name}")
+            by_state["missing"] = by_state.get("missing", 0) + 1
+        else:
+            tag = info["state"] or "?"
+            print(f"  [{tag:13s}] doc_id={info['id']:>5}  {name}")
+            by_state[tag] = by_state.get(tag, 0) + 1
+    print()
+    print("summary:")
+    for k, v in sorted(by_state.items()):
+        print(f"  {k}: {v}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/surfsense_evals/scripts/compute_adjusted_accuracy.py
+++ b/surfsense_evals/scripts/compute_adjusted_accuracy.py
@ -0,0 +1,112 @@
+"""Compute "intrinsic" accuracy by removing transient network errors.
+
+A failure is *transient* if it's:
+  * SSLError: SSL bad-record-mac (TLS hiccup)
+  * Cloudflare 502 / 503 (provider-side load shedding)
+  * empty_response with no error string and no other signal (likely
+    connection reset mid-stream)
+  * JSONDecodeError (parse error mid-stream)
+
+A failure is *intrinsic* if it's a hard limit:
+  * "exceeds .* limit" (size limits)
+  * context_length errors
+  * provider 400 with image / pdf decode failure
+  * malformed-input failures
+
+We re-compute accuracy with two denominators:
+  * raw acc       = correct / 171  (what the headline reports)
+  * adjusted acc  = correct / (171 - transient_failures)  (intrinsic)
+
+Outputs a table that we can drop straight into the blog.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+# Two levels above this file: the directory that contains ``scripts/``.
+REPO = Path(__file__).resolve().parents[1]
+# Artifact directory of the single run this script analyses (hard-coded run id).
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+# Input: one JSON object per line with arm / error / raw_text / graded fields.
+RAW = RUN / "raw.jsonl"
+
+
+# Lower-case substrings; classify() lower-cases the error before matching and
+# treats an error containing any of these as a transient failure.
+TRANSIENT_HINTS = (
+    "sslv3_alert_bad_record_mac",
+    "ssl_alert_bad_record_mac",
+    "ssl: ssl",
+    "cloudflare",
+    "error 502",
+    "error 503",
+    "bad gateway",
+    "service unavailable",
+    "gateway timeout",
+    "jsondecodeerror",
+)
+# Lower-case substrings marking a hard-limit failure; classify() consults
+# these only when no TRANSIENT_HINTS entry matched.
+INTRINSIC_HINTS = (
+    "exceeds",
+    "context_length",
+    "context window",
+    "could not process pdf",
+    "could not process image",
+)
+
+
+def classify(error: str | None, raw_text: str) -> str:
+    """Bucket one row as transient, intrinsic, other, or ok.
+
+    Returns ``"transient_empty"`` when there is no error and no answer text,
+    ``"ok"`` when there is no error but some answer text, and otherwise
+    ``"transient_ssl_or_5xx"``, ``"intrinsic_limit"`` or ``"other_error"``
+    depending on which hint list (checked in that order) matches the
+    lower-cased error.
+    """
+    lowered = error.lower() if error else ""
+    if not lowered:
+        # No error string at all: a blank answer is an empty response.
+        return "ok" if raw_text.strip() else "transient_empty"
+    for hint in TRANSIENT_HINTS:
+        if hint in lowered:
+            return "transient_ssl_or_5xx"
+    for hint in INTRINSIC_HINTS:
+        if hint in lowered:
+            return "intrinsic_limit"
+    return "other_error"
+
+
+def main() -> None:
+    rows = [
+        json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+    by_arm: dict[str, dict] = defaultdict(lambda: {
+        "n": 0, "correct": 0,
+        "transient_ssl_or_5xx": 0, "transient_empty": 0,
+        "intrinsic_limit": 0, "other_error": 0,
+    })
+    for row in rows:
+        arm = row["arm"]
+        m = by_arm[arm]
+        m["n"] += 1
+        graded = row.get("graded") or {}
+        if graded.get("correct"):
+            m["correct"] += 1
+        kind = classify(row.get("error"), row.get("raw_text") or "")
+        if kind != "ok":
+            m[kind] += 1
+
+    print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")
+    print("-" * 88)
+    for arm in sorted(by_arm):
+        m = by_arm[arm]
+        raw = m["correct"] / m["n"] * 100
+        transient = m["transient_ssl_or_5xx"] + m["transient_empty"]
+        intrinsic = m["intrinsic_limit"]
+        other = m["other_error"]
+        usable = m["n"] - transient
+        adj = m["correct"] / usable * 100 if usable else 0
+        print(
+            f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"
+        )
+
+    print()
+    print("transient   = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would")
+    print("              succeed on retry; eval harness has no built-in retry today).")
+    print("intrinsic   = hard limit (e.g. >30MB Anthropic request, model context overflow).")
+    print("adj acc%    = correct / (n - transient) — what the arm scores when network noise")
+    print("              is removed; closest thing we have to a like-for-like quality number.")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/compute_blog_extras.py
+++ b/surfsense_evals/scripts/compute_blog_extras.py
@ -0,0 +1,381 @@
+"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
+per-PDF heterogeneity, latency/token distribution percentiles.
+
+Reads the original run and the merged post-retry artifact:
+
+    data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
+    data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
+
+Outputs to stdout:
+
+  1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
+  2) Per-arm input/output token distribution (mean, p50, p95, max).
+  3) McNemar pairwise table: for every unordered pair (arm_i, arm_j) on the
+     same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and
+     b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial
+     two-sided p-value. We include both raw (using the original raw.jsonl)
+     and post-retry results.
+  4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
+
+Pure stdlib — no scipy/numpy.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import statistics
+from pathlib import Path
+
+# The surfsense_evals/ directory: this script lives in scripts/, so parents[1]
+# of its resolved path is one level above scripts/.
+REPO = Path(__file__).resolve().parents[1]
+
+
+# ---------------------------------------------------------------------------
+# I/O
+# ---------------------------------------------------------------------------
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    """Parse the JSON Lines file at ``path``, ignoring blank lines."""
+
+    with path.open("r", encoding="utf-8") as handle:
+        stripped = (raw.strip() for raw in handle)
+        return [json.loads(text) for text in stripped if text]
+
+
+# ---------------------------------------------------------------------------
+# Distribution helpers
+# ---------------------------------------------------------------------------
+
+
+def _percentile(values: list[float], p: float) -> float:
+    """Return the ``p``-th percentile (``p`` in [0, 100]) of ``values``.
+
+    Uses linear interpolation between the two closest ranks of the sorted
+    data. An empty input yields ``0.0``.
+    """
+
+    if not values:
+        return 0.0
+    ordered = sorted(values)
+    last = len(ordered) - 1
+    if last == 0:
+        return float(ordered[0])
+    rank = last * (p / 100.0)
+    below = math.floor(rank)
+    above = math.ceil(rank)
+    if below == above:
+        # The rank lands exactly on a sample: no interpolation needed.
+        return float(ordered[int(rank)])
+    frac = rank - below
+    return float(ordered[below] + (ordered[above] - ordered[below]) * frac)
+
+
+# ---------------------------------------------------------------------------
+# McNemar exact-binomial p-value
+# ---------------------------------------------------------------------------
+
+
+def _binom_coef(n: int, k: int) -> int:
+    """Return the binomial coefficient C(n, k), or 0 when ``k`` is outside [0, n]."""
+
+    if k < 0 or k > n:
+        return 0
+    return math.comb(n, k)
+
+
+def _mcnemar_exact_pvalue(b: int, c: int) -> float:
+    """Two-sided exact-binomial McNemar p-value.
+
+    Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong)
+    on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5).
+    The two-sided p-value is
+
+        P(X <= min(b, c)) + P(X >= max(b, c))
+
+    which, by the symmetry of Bin(n, 0.5), equals 2 * P(X <= min(b, c));
+    the result is clipped at 1.0 (the two tails overlap when b == c).
+    With no discordant pairs at all the p-value is defined as 1.0.
+    """
+
+    n = b + c
+    if n == 0:
+        return 1.0
+    k = min(b, c)
+    # Sum the exact integer tail first, then divide once by 2**n.
+    cdf = sum(_binom_coef(n, i) for i in range(k + 1))
+    p = 2.0 * cdf / (2 ** n)
+    return min(1.0, p)
+
+
+def _mcnemar_table(rows: list[dict]) -> dict:
+    """Build the pairwise McNemar table from per-(arm, qid) result rows.
+
+    Rows are grouped as ``{qid: {arm: bool_correct}}``; a row whose
+    ``graded`` entry is missing, null, or lacks a truthy ``correct`` counts
+    as wrong, and a later row for the same (qid, arm) overwrites an earlier
+    one. Every unordered pair of arms is then compared on the qids where
+    both arms have a row.
+
+    Returns a dict with:
+      * ``arms``   - sorted arm names;
+      * ``n_qids`` - number of qids that have a row for *every* arm (the
+        quantity the printed report labels this value as);
+      * ``pairs``  - one dict per arm pair with the discordant counts
+        (``b_i_only``, ``c_j_only``), the concordant counts
+        (``both_correct``, ``both_wrong``) and the exact ``p_value``.
+    """
+
+    by_qid: dict[str, dict[str, bool]] = {}
+    arms_seen: set[str] = set()
+    for r in rows:
+        qid = r["qid"]
+        arm = r["arm"]
+        graded = r.get("graded") or {}
+        by_qid.setdefault(qid, {})[arm] = bool(graded.get("correct"))
+        arms_seen.add(arm)
+
+    arms = sorted(arms_seen)
+    # Only qids answered by every arm count towards n_qids; counting every
+    # qid seen would overstate the sample whenever an arm is incomplete.
+    n_complete = sum(1 for per_arm in by_qid.values() if len(per_arm) == len(arms))
+    out: dict[str, dict] = {"arms": arms, "n_qids": n_complete, "pairs": []}
+    for i, ai in enumerate(arms):
+        for aj in arms[i + 1:]:
+            b = c = both = neither = 0
+            for per_arm in by_qid.values():
+                # Pairwise-complete: skip qids missing either arm of the pair.
+                if ai not in per_arm or aj not in per_arm:
+                    continue
+                ci, cj = per_arm[ai], per_arm[aj]
+                if ci and not cj:
+                    b += 1
+                elif cj and not ci:
+                    c += 1
+                elif ci and cj:
+                    both += 1
+                else:
+                    neither += 1
+            out["pairs"].append({
+                "arm_i": ai, "arm_j": aj,
+                "b_i_only": b, "c_j_only": c,
+                "both_correct": both, "both_wrong": neither,
+                "p_value": _mcnemar_exact_pvalue(b, c),
+            })
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Per-PDF heterogeneity
+# ---------------------------------------------------------------------------
+
+
+def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
+    """Summarise, per arm, how accuracy varies from one PDF to the next.
+
+    A PDF's accuracy is the share of its rows graded correct (rows with no
+    truthy ``graded.correct`` count as wrong). For each arm the result holds
+    the number of PDFs, mean / sample std / min / max / quartiles of those
+    per-PDF accuracies, and how many PDFs scored exactly 0% or 100%.
+    """
+
+    grouped: dict[str, dict[str, list[bool]]] = {}
+    for rec in rows:
+        arm, pdf = rec["arm"], rec["doc_id"]
+        verdict = bool((rec.get("graded") or {}).get("correct"))
+        grouped.setdefault(arm, {}).setdefault(pdf, []).append(verdict)
+
+    summary: dict[str, dict] = {}
+    for arm, per_pdf in grouped.items():
+        rates = [sum(flags) / len(flags) for flags in per_pdf.values() if flags]
+        if not rates:
+            continue
+        # stdev needs at least two data points; report 0.0 for a lone PDF.
+        spread = statistics.stdev(rates) if len(rates) > 1 else 0.0
+        summary[arm] = {
+            "n_pdfs": len(rates),
+            "mean": statistics.mean(rates),
+            "std": spread,
+            "min": min(rates),
+            "max": max(rates),
+            "p25": _percentile(rates, 25),
+            "p50": _percentile(rates, 50),
+            "p75": _percentile(rates, 75),
+            "n_pdfs_zero": rates.count(0.0),
+            "n_pdfs_perfect": rates.count(1.0),
+        }
+    return summary
+
+
+# ---------------------------------------------------------------------------
+# Latency / token distributions
+# ---------------------------------------------------------------------------
+
+
+def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
+    """Describe each arm's latency distribution, converted from ms to seconds.
+
+    Rows whose ``latency_ms`` is missing, null, or zero are left out. Each
+    arm gets its sample count, mean, sample std, the p10..p99 percentiles,
+    the max, and the coefficient of variation (std / mean, unitless).
+    """
+
+    samples: dict[str, list[float]] = {}
+    for rec in rows:
+        latency = rec.get("latency_ms")
+        if latency is None or latency == 0:
+            continue
+        samples.setdefault(rec["arm"], []).append(float(latency))
+
+    quantiles = (10, 25, 50, 75, 90, 95, 99)
+    report: dict[str, dict] = {}
+    for arm, lats in samples.items():
+        several = len(lats) > 1  # stdev is undefined for a single sample
+        avg_ms = statistics.mean(lats)
+        dev_ms = statistics.stdev(lats) if several else 0.0
+        report[arm] = {
+            "n": len(lats),
+            "mean_s": avg_ms / 1000,
+            "std_s": dev_ms / 1000 if several else 0.0,
+            **{f"p{q}_s": _percentile(lats, q) / 1000 for q in quantiles},
+            "max_s": max(lats) / 1000,
+            # Coefficient of variation: std / mean (unitless tail-fatness).
+            "cv": dev_ms / avg_ms if several and avg_ms > 0 else 0.0,
+        }
+    return report
+
+
+def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
+    """Describe each arm's input / output token counts.
+
+    Missing, null, or zero token counts are ignored. The result maps each
+    arm (in sorted order) to a dict with an ``"input"`` and/or ``"output"``
+    entry, each holding n, mean, p50, p95 and max; a side with no usable
+    values is simply absent.
+    """
+
+    fields = (("input", "input_tokens"), ("output", "output_tokens"))
+    series: dict[str, dict[str, list[float]]] = {"input": {}, "output": {}}
+    for rec in rows:
+        counts = [(side, rec.get(field) or 0) for side, field in fields]
+        for side, count in counts:
+            if count:
+                series[side].setdefault(rec["arm"], []).append(float(count))
+
+    def _describe(vals: list[float]) -> dict:
+        return {
+            "n": len(vals),
+            "mean": statistics.mean(vals),
+            "p50": _percentile(vals, 50),
+            "p95": _percentile(vals, 95),
+            "max": max(vals),
+        }
+
+    report: dict[str, dict] = {}
+    for arm in sorted(series["input"].keys() | series["output"].keys()):
+        entry = {
+            side: _describe(series[side][arm])
+            for side, _ in fields
+            if series[side].get(arm)
+        }
+        if entry:
+            report[arm] = entry
+    return report
+
+
+# ---------------------------------------------------------------------------
+# Pretty-printing
+# ---------------------------------------------------------------------------
+
+
+def _print_latency(title: str, lat: dict[str, dict]) -> None:
+    """Print the per-arm latency table, fastest mean first.
+
+    ``lat`` maps arm name to a stats dict carrying ``n``, ``cv`` and the
+    ``*_s`` fields shown in the table; every ``*_s`` value is printed with
+    one decimal and an ``s`` suffix.
+    """
+
+    second_fields = ("mean_s", "std_s", "p50_s", "p90_s", "p95_s", "p99_s", "max_s")
+    header = " ".join([
+        f"{'arm':<25}",
+        f"{'n':>4}",
+        *(f"{name:>7}" for name in ("mean", "std", "p50", "p90", "p95", "p99", "max")),
+        f"{'CV':>5}",
+    ])
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(header)
+    print("-" * len(header))
+    for arm, stats in sorted(lat.items(), key=lambda item: item[1]["mean_s"]):
+        seconds = " ".join(f"{stats[field]:>6.1f}s" for field in second_fields)
+        print(f"{arm:<25} {stats['n']:>4} {seconds} {stats['cv']:>5.2f}")
+
+
+def _print_tokens(title: str, toks: dict[str, dict]) -> None:
+    """Print the per-arm token table in sorted arm order.
+
+    Arms without an ``"input"`` entry are skipped entirely; a missing
+    ``"output"`` entry is shown as zeros.
+    """
+
+    in_header = " ".join(f"{name:>9}" for name in ("in mean", "in p50", "in p95", "in max"))
+    out_header = " ".join(f"{name:>9}" for name in ("out mean", "out p95"))
+    header = f"{'arm':<25} {in_header}  {out_header}"
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(toks):
+        stats = toks[arm]
+        inp = stats.get("input")
+        if not inp:
+            continue
+        outp = stats.get("output") or {}
+        in_cols = " ".join(f"{inp[key]:>9,.0f}" for key in ("mean", "p50", "p95", "max"))
+        out_cols = " ".join(f"{outp.get(key, 0):>9,.0f}" for key in ("mean", "p95"))
+        print(f"{arm:<25} {in_cols}  {out_cols}")
+
+
+def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
+    """Print the per-PDF accuracy spread per arm, highest mean first.
+
+    ``var`` maps arm name to a stats dict with fractional accuracies
+    (``mean``, ``std``, ``min``, ``p25``, ``p50``, ``p75``, ``max``) that
+    are shown as percentages, plus the counts ``n_pdfs``, ``n_pdfs_zero``
+    and ``n_pdfs_perfect``.
+    """
+
+    pct_fields = ("mean", "std", "min", "p25", "p50", "p75", "max")
+    header = " ".join([
+        f"{'arm':<25}",
+        f"{'n_pdfs':>7}",
+        *(f"{name:>7}" for name in pct_fields),
+        f"{'#0%':>5}",
+        f"{'#100%':>6}",
+    ])
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(header)
+    print("-" * len(header))
+    for arm, stats in sorted(var.items(), key=lambda item: -item[1]["mean"]):
+        percents = " ".join(f"{stats[field] * 100:>6.1f}%" for field in pct_fields)
+        print(
+            f"{arm:<25} {stats['n_pdfs']:>7} {percents} "
+            f"{stats['n_pdfs_zero']:>5} {stats['n_pdfs_perfect']:>6}"
+        )
+
+
+def _print_mcnemar(title: str, table: dict) -> None:
+    """Print the pairwise McNemar table, most significant pair first.
+
+    ``table`` carries ``n_qids`` and a ``pairs`` list; each pair is shown
+    with its discordant / concordant counts, the two-sided p-value, and a
+    star marker: ``***`` below 0.001, ``**`` below 0.01, ``*`` below 0.05.
+    """
+
+    star_levels = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
+    header = " ".join([
+        f"{'arm_i':<25}", f"{'arm_j':<25}", f"{'b':>4}", f"{'c':>4}",
+        f"{'both ok':>8}", f"{'both wr':>8}", f"{'p (2-sided)':>13}", f"{'sig':>4}",
+    ])
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(f"n_qids on which all arms have a graded row: {table['n_qids']}")
+    print(header)
+    print("-" * len(header))
+    for entry in sorted(table["pairs"], key=lambda item: item["p_value"]):
+        p_value = entry["p_value"]
+        # First (strictest) threshold the p-value beats; none -> no marker.
+        stars = next((mark for cutoff, mark in star_levels if p_value < cutoff), "")
+        print(
+            f"{entry['arm_i']:<25} {entry['arm_j']:<25} "
+            f"{entry['b_i_only']:>4} {entry['c_j_only']:>4} "
+            f"{entry['both_correct']:>8} {entry['both_wrong']:>8} "
+            f"{p_value:>13.4f} {stars:>4}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """Print the latency, token, per-PDF and McNemar tables for one run.
+
+    The run is selected with ``--run-id``. Both ``raw.jsonl`` and
+    ``raw_post_retry.jsonl`` must exist in the run's ``parser_compare``
+    directory, otherwise the script exits with an explanatory message.
+    Returns 0 on success.
+    """
+
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
+    opts = cli.parse_args()
+
+    base = REPO / "data" / "multimodal_doc" / "runs" / opts.run_id / "parser_compare"
+    raw_path, post_path = base / "raw.jsonl", base / "raw_post_retry.jsonl"
+    if not (raw_path.exists() and post_path.exists()):
+        raise SystemExit(
+            "Missing raw.jsonl or raw_post_retry.jsonl. "
+            "Run scripts/compute_post_retry_accuracy.py first."
+        )
+
+    raw_rows = _read_jsonl(raw_path)
+    post_rows = _read_jsonl(post_path)
+
+    print(f"Run: {opts.run_id}")
+    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")
+
+    # Distribution tables use the post-retry rows: per the original author's
+    # note, those carry the retry's own latency for recovered rows, whereas
+    # the raw rows recorded latency=0 for calls the harness saw fail.
+    _print_latency("Per-arm latency (post-retry)", _per_arm_latency(post_rows))
+    _print_tokens("Per-arm token distribution (post-retry)", _per_arm_tokens(post_rows))
+    _print_pdf_var(
+        "Per-PDF accuracy heterogeneity (post-retry)",
+        _per_pdf_stats(post_rows),
+    )
+
+    # The significance table is reported both before and after retries.
+    mcnemar_inputs = (
+        ("McNemar pairwise (RAW, no retries)", raw_rows),
+        ("McNemar pairwise (POST-RETRY)", post_rows),
+    )
+    for heading, subset in mcnemar_inputs:
+        _print_mcnemar(heading, _mcnemar_table(subset))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/surfsense_evals/scripts/compute_post_retry_accuracy.py
+++ b/surfsense_evals/scripts/compute_post_retry_accuracy.py
@ -0,0 +1,180 @@
+"""Recompute per-arm accuracy/F1 after merging retry survivors into raw.jsonl.
+
+Reads:
+  - data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
+  - data/multimodal_doc/runs/<run_id>/parser_compare/raw_retries.jsonl
+
+For each (arm, qid) present in the retry artifact:
+  - if the retry RECOVERED, the retry row replaces the original row (same
+    grader is reused — see ``mmlongbench/grader.py``);
+  - if the retry did NOT recover, the original row stays (still a failure,
+    so ``correct=False`` and ``f1=0``).
+
+Prints two tables one after the other, followed by a per-arm delta table:
+  * Raw run (no retries) — matches §1 of the blog.
+  * Post-retry run        — final, "what would the headline have been if
+                              the harness had had retries from day one".
+
+It also writes ``data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl``
+so any downstream notebook / report can join straight on it.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# The surfsense_evals/ directory: this script lives in scripts/, so parents[1]
+# of its resolved path is one level above scripts/.
+REPO = Path(__file__).resolve().parents[1]
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    """Load every non-blank line of the JSONL file at ``path`` as one object."""
+
+    records: list[dict] = []
+    with path.open("r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            if text:
+                records.append(json.loads(text))
+    return records
+
+
+def _row_key(row: dict) -> tuple[str, str]:
+    """Return the ``(arm, qid)`` join key of a result row, both as strings."""
+
+    arm, qid = row["arm"], row["qid"]
+    return str(arm), str(qid)
+
+
+def _is_failure(row: dict) -> bool:
+    """Return True when a row has a truthy ``error`` or blank/missing ``raw_text``."""
+
+    if row.get("error"):
+        return True
+    return not (row.get("raw_text") or "").strip()
+
+
+def _summarise(rows_by_arm: dict[str, list[dict]]) -> dict[str, dict]:
+    """Compute per-arm counts and rates from rows already bucketed by arm.
+
+    For each arm the result holds ``n``, ``n_correct``, ``n_failures``,
+    ``accuracy``, ``f1_mean`` and ``failure_rate``; the three rates are 0.0
+    for an arm with no rows. A row whose ``graded`` entry is missing *or
+    null* counts as incorrect with an F1 of 0.
+    """
+
+    out: dict[str, dict] = {}
+    for arm, rows in rows_by_arm.items():
+        n = len(rows)
+        # ``or {}`` (not a .get default) so an explicit ``"graded": null`` on
+        # a failed row is tolerated instead of raising AttributeError.
+        grades = [r.get("graded") or {} for r in rows]
+        n_correct = sum(1 for g in grades if g.get("correct"))
+        f1_sum = sum(float(g.get("f1") or 0.0) for g in grades)
+        n_fail = sum(1 for r in rows if _is_failure(r))
+        out[arm] = {
+            "n": n,
+            "n_correct": n_correct,
+            "n_failures": n_fail,
+            "accuracy": (n_correct / n) if n else 0.0,
+            "f1_mean": (f1_sum / n) if n else 0.0,
+            "failure_rate": (n_fail / n) if n else 0.0,
+        }
+    return out
+
+
+def _print_table(title: str, summary: dict[str, dict]) -> None:
+    """Print one accuracy table from a per-arm summary, best accuracy first.
+
+    ``accuracy``, ``f1_mean`` and ``failure_rate`` are fractions in the
+    summary and are rendered as percentages with one decimal.
+    """
+
+    header = " ".join([
+        f"{'arm':<25}", f"{'n':>4}", f"{'n_corr':>7}", f"{'acc':>7}",
+        f"{'F1':>7}", f"{'fails':>6}", f"{'fail%':>7}",
+    ])
+
+    print()
+    print(title)
+    print("-" * len(title))
+    print(header)
+    print("-" * len(header))
+    # Sorting is stable, so arms with equal accuracy keep their input order.
+    ranked = sorted(summary.items(), key=lambda item: -item[1]["accuracy"])
+    for arm, stats in ranked:
+        acc_pct = stats["accuracy"] * 100
+        f1_pct = stats["f1_mean"] * 100
+        fail_pct = stats["failure_rate"] * 100
+        print(
+            f"{arm:<25} {stats['n']:>4} {stats['n_correct']:>7} "
+            f"{acc_pct:>6.1f}% {f1_pct:>6.1f}% "
+            f"{stats['n_failures']:>6} {fail_pct:>6.1f}%"
+        )
+
+
+def main() -> int:
+    """Merge retry results into the raw run and print before/after accuracy.
+
+    Reads ``raw.jsonl`` and ``raw_retries.jsonl`` for the run chosen with
+    ``--run-id``; returns 1 (after a message on stderr) if either is
+    missing. A raw row is replaced by its retry row only when the retry's
+    ``retry.recovered`` flag is truthy. The merged rows are written to
+    ``raw_post_retry.jsonl``, then the raw table, the post-retry table and
+    a per-arm delta are printed. Returns 0 on success.
+    """
+
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
+    opts = cli.parse_args()
+
+    base = REPO / "data" / "multimodal_doc" / "runs" / opts.run_id / "parser_compare"
+    raw_path = base / "raw.jsonl"
+    retry_path = base / "raw_retries.jsonl"
+    out_path = base / "raw_post_retry.jsonl"
+
+    for label, required in (("raw.jsonl", raw_path), ("raw_retries.jsonl", retry_path)):
+        if not required.exists():
+            print(f"{label} not found at {required}", file=sys.stderr)
+            return 1
+
+    raw_rows = _read_jsonl(raw_path)
+    retry_lookup: dict[tuple[str, str], dict] = {
+        _row_key(r): r for r in _read_jsonl(retry_path)
+    }
+
+    # A retry row has the same shape as a raw row plus a "retry" sub-object.
+    # It replaces the original only when it is a recovery; a retry that
+    # failed again leaves the original row (the one the headline numbers
+    # were computed from) in place, since the verdict is a failure either way.
+    merged: list[dict] = []
+    tally = {"recovered": 0, "still_failed": 0, "untouched": 0}
+    for original in raw_rows:
+        attempt = retry_lookup.get(_row_key(original))
+        if attempt is None:
+            outcome, chosen = "untouched", original
+        elif bool(attempt.get("retry", {}).get("recovered")):
+            outcome, chosen = "recovered", attempt
+        else:
+            outcome, chosen = "still_failed", original
+        merged.append(chosen)
+        tally[outcome] += 1
+
+    # Persist the merged rows for downstream consumers.
+    with out_path.open("w", encoding="utf-8") as fh:
+        fh.writelines(json.dumps(r) + "\n" for r in merged)
+
+    def _bucket(rows: list[dict]) -> dict[str, list[dict]]:
+        grouped: dict[str, list[dict]] = {}
+        for r in rows:
+            grouped.setdefault(r["arm"], []).append(r)
+        return grouped
+
+    raw_summary = _summarise(_bucket(raw_rows))
+    post_summary = _summarise(_bucket(merged))
+
+    print()
+    print(f"Run: {opts.run_id}")
+    print(f"Replaced (retry recovered):     {tally['recovered']}")
+    print(f"Kept original (retry still failed): {tally['still_failed']}")
+    print(f"Untouched rows:                 {tally['untouched']}")
+    print(f"Wrote merged artifact: {out_path.relative_to(REPO)}")
+
+    _print_table("Raw run (no retries)", raw_summary)
+    _print_table("Post-retry run (final)", post_summary)
+
+    print()
+    print("Delta (post-retry minus raw):")
+    print(f"{'arm':<25} {'d_acc':>7} {'d_fails':>8}")
+    print("-" * 42)
+    for arm in sorted(set(raw_summary) | set(post_summary)):
+        before = raw_summary.get(arm)
+        after = post_summary.get(arm)
+        if not (before and after):
+            continue
+        d_acc = (after["accuracy"] - before["accuracy"]) * 100
+        d_fail = after["n_failures"] - before["n_failures"]
+        print(f"{arm:<25} {d_acc:>+6.1f}p {d_fail:>+7d}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/surfsense_evals/scripts/download_crag_task3.py
+++ b/surfsense_evals/scripts/download_crag_task3.py
@ -1,97 +0,0 @@
-"""Download CRAG Task 3's 4 .tar.bz2 parts in parallel.
-
-Run once before ``ingest research crag_t3`` to avoid the ingest
-synchronously blocking on a 7 GB download. Skips parts already
-present and complete on disk.
-"""
-
-from __future__ import annotations
-
-import logging
-import sys
-import time
-import urllib.request
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from pathlib import Path
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s %(levelname)s %(message)s",
-)
-log = logging.getLogger("download_task3")
-
-
-_BASE = (
-    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
-    "crag_task_3_dev_v4.tar.bz2.part"
-)
-_USER_AGENT = "SurfSense-Evals/0.1 (CRAG Task 3 fetch)"
-
-
-def _expected_size(url: str) -> int:
-    req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": _USER_AGENT})
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        return int(resp.headers.get("content-length", 0))
-
-
-def download_one(part: int, dest_dir: Path) -> Path:
-    url = f"{_BASE}{part}"
-    dest = dest_dir / f"crag_task_3_dev_v4.tar.bz2.part{part}"
-    expected = _expected_size(url)
-    if dest.exists() and dest.stat().st_size == expected:
-        log.info("part%d: cached (%d bytes)", part, expected)
-        return dest
-    log.info("part%d: downloading %d bytes ...", part, expected)
-    tmp = dest.with_suffix(dest.suffix + ".part_dl")
-    started = time.monotonic()
-    last_log = started
-    with urllib.request.urlopen(
-        urllib.request.Request(url, headers={"User-Agent": _USER_AGENT}),
-        timeout=900,
-    ) as resp, tmp.open("wb") as fh:
-        downloaded = 0
-        chunk = resp.read(1 << 20)
-        while chunk:
-            fh.write(chunk)
-            downloaded += len(chunk)
-            now = time.monotonic()
-            if now - last_log > 5.0:
-                pct = 100 * downloaded / expected if expected else 0
-                rate_mb = (downloaded / (now - started)) / (1 << 20)
-                log.info(
-                    "part%d: %5.1f%% (%.1f / %.1f MiB at %.1f MiB/s)",
-                    part, pct, downloaded / (1 << 20), expected / (1 << 20), rate_mb,
-                )
-                last_log = now
-            chunk = resp.read(1 << 20)
-    tmp.replace(dest)
-    elapsed = time.monotonic() - started
-    log.info(
-        "part%d: done in %.1fs (%.1f MiB/s avg)",
-        part, elapsed, (expected / (1 << 20)) / max(elapsed, 0.001),
-    )
-    return dest
-
-
-def main() -> int:
-    dest_dir = Path("data/research/crag_t3/.raw_cache")
-    dest_dir.mkdir(parents=True, exist_ok=True)
-
-    # 4 parts in parallel — typical residential connection saturates around
-    # 2 streams; GitHub raw serves these fine in parallel.
-    started = time.monotonic()
-    with ThreadPoolExecutor(max_workers=4) as ex:
-        futures = {ex.submit(download_one, i, dest_dir): i for i in range(1, 5)}
-        for fut in as_completed(futures):
-            part = futures[fut]
-            try:
-                fut.result()
-            except Exception as exc:  # noqa: BLE001
-                log.error("part%d failed: %s", part, exc)
-                return 1
-    log.info("All 4 parts downloaded in %.1fs", time.monotonic() - started)
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/surfsense_evals/scripts/inspect_first30.py
+++ b/surfsense_evals/scripts/inspect_first30.py
@ -0,0 +1,59 @@
+"""Inspect what the first 30 MMLongBench-Doc PDFs would look like for scoping.
+
+Run from surfsense_evals/ root via:
+    python scripts/inspect_first30.py
+
+Prints which docs are already ingested (existing 5), which are new (25 to
+upload), how many questions cover those 30 PDFs, and the answerable /
+unanswerable + format mix.
+"""
+
+from __future__ import annotations
+
+import json
+from collections import Counter
+from pathlib import Path
+
+
+def main() -> None:
+    """Print a scoping summary for the first 30 MMLongBench-Doc PDFs.
+
+    Reads ``data/multimodal_doc/mmlongbench/questions.jsonl`` relative to
+    the current working directory, takes the first 30 distinct ``doc_id``
+    values in sorted order, and reports how many of them are not in the
+    hard-coded set of already-ingested PDFs, how many questions cover the
+    selected PDFs, the answerable / unanswerable split (``answer_format``
+    of "none", case-insensitive, means unanswerable), and the format mix.
+
+    The per-PDF averages and the final list heading use the number of PDFs
+    actually selected, so they stay correct when the question file covers
+    fewer than 30 documents.
+    """
+
+    qpath = Path("data/multimodal_doc/mmlongbench/questions.jsonl")
+    lines = qpath.read_text(encoding="utf-8").splitlines()
+    rows = [json.loads(line) for line in lines if line.strip()]
+
+    first30 = sorted({r["doc_id"] for r in rows})[:30]
+    in_scope = set(first30)  # built once, not once per question row
+    existing5 = {
+        "05-03-18-political-release.pdf",
+        "0b85477387a9d0cc33fca0f4becaa0e5.pdf",
+        "0e94b4197b10096b1f4c699701570fbf.pdf",
+        "11-21-16-Updated-Post-Election-Release.pdf",
+        "12-15-15-ISIS-and-terrorism-release-final.pdf",
+    }
+    new_docs = [d for d in first30 if d not in existing5]
+    n_docs = len(first30)
+    print(
+        f"first 30 docs (alphabetical) — {len(new_docs)} new, "
+        f"{n_docs - len(new_docs)} already in SurfSense"
+    )
+
+    qs_in_scope = [r for r in rows if r["doc_id"] in in_scope]
+    fmts = Counter((r.get("answer_format") or "").lower() for r in qs_in_scope)
+    answerable = sum(v for k, v in fmts.items() if k != "none")
+    unanswerable = fmts.get("none", 0)
+
+    print(
+        f"questions covering first 30 docs: total={len(qs_in_scope)}  "
+        f"answerable={answerable}  unanswerable={unanswerable}"
+    )
+    # Divide by the real document count; guard the empty-file case.
+    divisor = max(n_docs, 1)
+    print(
+        f"avg Qs/PDF: {len(qs_in_scope) / divisor:.1f}  "
+        f"answerable/PDF: {answerable / divisor:.1f}"
+    )
+    print(f"format mix in scope: {dict(fmts)}")
+    print()
+    print(f"{len(new_docs)} new PDFs to ingest:")
+    for d in new_docs:
+        print(f"  - {d}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
+++ b/surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
@ -0,0 +1,100 @@
+"""Stub the mmlongbench manifest so parser_compare can extract in parallel.
+
+The mmlongbench Surfsense ingest writes its manifest only at the very
+end of the upload pipeline (~hours of celery work). parser_compare's
+ingest, on the other hand, just needs a list of (doc_id, pdf_path)
+tuples to know which PDFs to extract — it doesn't care about the
+SurfSense ``document_id`` (the runner does, later, after a refresh).
+
+This script extends the existing manifest with the *additional* PDFs
+that mmlongbench has already cached on disk (i.e. all 30 PDFs in
+``data/multimodal_doc/mmlongbench/pdfs/`` even though only 5 have
+SurfSense ``document_id``s yet) so parser_compare can run all four
+extractions for them in parallel with the SurfSense ingest.
+
+After mmlongbench finishes, re-run::
+
+    python -m surfsense_evals ingest multimodal_doc parser_compare \
+        --max-docs 30
+
+…to refresh ``parser_compare_doc_map.jsonl`` with the now-populated
+``document_id`` values for the 25 new PDFs. The extractions
+themselves are cached on disk so the second pass is essentially free.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+
+# The surfsense_evals/ directory: this script lives in scripts/, one level below it.
+REPO = Path(__file__).resolve().parents[1]
+# JSONL manifest (doc map) that main() reads and then rewrites in place.
+MAP_PATH = REPO / "data" / "multimodal_doc" / "maps" / "mmlongbench_doc_map.jsonl"
+# Directory scanned for cached "*.pdf" files.
+PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
+# JSONL question file used to count questions per doc_id.
+QUESTIONS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
+
+
+def _question_count_per_doc() -> dict[str, int]:
+    """Count the question rows in ``QUESTIONS`` per ``doc_id`` (blank lines skipped)."""
+
+    tally: dict[str, int] = {}
+    with QUESTIONS.open("r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            if text:
+                doc_id = json.loads(text)["doc_id"]
+                tally[doc_id] = tally.get(doc_id, 0) + 1
+    return tally
+
+
+def main() -> None:
+    """Add stub manifest rows for cached PDFs that the manifest does not list.
+
+    Exits with a message if ``MAP_PATH`` does not exist. Otherwise the
+    manifest is parsed (a row containing the ``__settings__`` key is kept
+    verbatim as the first output line), every ``*.pdf`` in ``PDF_DIR`` whose
+    file name is not already a ``doc_id`` gets a stub row with
+    ``document_id`` set to None, and the manifest is rewritten in place with
+    its rows sorted by ``doc_id``.
+    """
+
+    if not MAP_PATH.exists():
+        raise SystemExit(
+            f"manifest not found at {MAP_PATH} — "
+            "run `surfsense_evals ingest multimodal_doc mmlongbench` first."
+        )
+
+    manifest_rows: list[dict] = []
+    header = None
+    for raw in MAP_PATH.read_text(encoding="utf-8").splitlines():
+        text = raw.strip()
+        if not text:
+            continue
+        parsed = json.loads(text)
+        if "__settings__" in parsed:
+            # Keep the settings line's original text rather than re-serialising it.
+            header = text
+            continue
+        manifest_rows.append(parsed)
+
+    entries = {row["doc_id"]: row for row in manifest_rows}
+    question_counts = _question_count_per_doc()
+
+    pdfs = sorted(PDF_DIR.glob("*.pdf"))
+    print(f"existing manifest entries: {len(manifest_rows)}")
+    print(f"cached PDFs on disk:       {len(pdfs)}")
+
+    missing = [pdf for pdf in pdfs if pdf.name not in entries]
+    for pdf in missing:
+        entries[pdf.name] = {
+            "doc_id": pdf.name,
+            "document_id": None,
+            "pdf_path": str(pdf),
+            "n_questions": question_counts.get(pdf.name, 0),
+        }
+    added = len(missing)
+
+    body = [json.dumps(entries[doc_id]) for doc_id in sorted(entries)]
+    out_lines = ([header] if header else []) + body
+    MAP_PATH.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
+
+    print(f"added {added} stub rows; manifest now has {len(entries)} PDFs")
+    print(f"wrote: {MAP_PATH}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/peek_t3_doc_map.py
+++ b/surfsense_evals/scripts/peek_t3_doc_map.py
@ -1,40 +0,0 @@
-"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
-
-from __future__ import annotations
-
-import json
-import sys
-from pathlib import Path
-
-
-def main() -> int:
-    p = Path("data/research/maps/crag_t3_doc_map.jsonl")
-    if not p.exists():
-        print(f"Doc map missing: {p}")
-        return 1
-    rows = []
-    settings = {}
-    for line in p.read_text(encoding="utf-8").splitlines():
-        if not line.strip():
-            continue
-        row = json.loads(line)
-        if "__settings__" in row:
-            settings = row
-            continue
-        rows.append(row)
-    print(f"Settings header: {settings}")
-    print(f"Doc map rows:   {len(rows)}")
-    for r in rows:
-        print(f"  qid={r['qid']:<10} domain={r['domain']:<8} qtype={r['question_type']}")
-        print(f"    question: {r['question'][:90]}")
-        print(f"    gold:     {r['gold_answer'][:90]}")
-        print(
-            f"    pages:    {len(r['page_filenames'])} extracted, "
-            f"{len(r['document_ids'])} doc_ids, "
-            f"{len(r['missing_pages'])} missing"
-        )
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/surfsense_evals/scripts/retry_failed_questions.py
+++ b/surfsense_evals/scripts/retry_failed_questions.py
@ -0,0 +1,636 @@
+"""Retry only the failed (arm, question) pairs from a previous parser_compare run.
+
+The original parser_compare run records one row per (arm, qid) in
+``raw.jsonl``. Some of those rows came back with transient transport
+errors (SSL alerts, gateway 502s, empty SSE streams) or empty
+``raw_text``. This script re-issues *only* those calls with exponential
+backoff so we can see how many recover.
+
+Design constraints / choices:
+
+* **No re-ingest.** All cached PDFs and parser-extracted markdown stay
+  on disk. We rebuild ``ArmRequest`` objects from the existing manifest
+  + the original ``mmlongbench/questions.jsonl``.
+* **No SurfSense backend or celery required.** SurfSense had 0
+  reported failures; this script will skip any ``surfsense_agentic``
+  rows it encounters and warn rather than try to start the backend.
+* **Original ``raw.jsonl`` is never mutated.** Retries land in a
+  sibling ``raw_retries.jsonl`` so the original artifact stays
+  citeable.
+* **Idempotent.** Re-running this script re-tries the same set of
+  failed rows from ``raw.jsonl``. If you want to merge survivor rows
+  back in, do that as a separate aggregation step.
+
+Usage:
+
+    python scripts/retry_failed_questions.py \
+        --run-id 2026-05-14T00-53-19Z \
+        --max-attempts 5 \
+        --concurrency 2
+
+Outputs (written next to the original raw.jsonl):
+
+* ``raw_retries.jsonl`` — one line per retried (arm, qid). Each line
+  carries the original error, every retry attempt's timing/error,
+  and the final result (incl. grade) so you can drop it straight
+  into a notebook.
+* ``raw_retries_summary.json`` — per-arm tried/recovered/still-failed
+  counts and an aggregated retry-success rate.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import random
+import sys
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+REPO = Path(__file__).resolve().parents[1]
+SRC = REPO / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from dotenv import load_dotenv  # noqa: E402
+
+from surfsense_evals.core.arms import (  # noqa: E402
+    ArmRequest,
+    ArmResult,
+    BareLlmArm,
+    NativePdfArm,
+)
+from surfsense_evals.core.parse.freeform_answer import (  # noqa: E402
+    extract_freeform_answer,
+)
+from surfsense_evals.core.providers.openrouter_chat import (  # noqa: E402
+    OpenRouterChatProvider,
+)
+from surfsense_evals.core.providers.openrouter_pdf import (  # noqa: E402
+    OpenRouterPdfProvider,
+    PdfEngine,
+)
+from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade  # noqa: E402
+from surfsense_evals.suites.multimodal_doc.parser_compare.prompt import (  # noqa: E402
+    build_long_context_prompt,
+    build_native_pdf_prompt,
+)
+
+logger = logging.getLogger("retry_failed_questions")
+
+# Names of the long-context (parser markdown + chat LLM) arms. ``_run``
+# builds one ``BareLlmArm`` per member that has failed rows and routes
+# those rows through ``_build_lc_request``.
+LC_ARMS = {
+    "azure_basic_lc",
+    "azure_premium_lc",
+    "llamacloud_basic_lc",
+    "llamacloud_premium_lc",
+}
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _is_failure_row(row: dict[str, Any]) -> bool:
+    """Decide whether a ``raw.jsonl`` row needs to be retried.
+
+    Two situations qualify: the row carries a truthy ``error``, or its
+    ``raw_text`` is missing / blank once whitespace is stripped. An empty
+    response is operationally the same failure as an error (nothing
+    usable came back); it simply was not raised as an exception.
+    """
+
+    has_error = bool(row.get("error"))
+    text = row.get("raw_text") or ""
+    return has_error or not text.strip()
+
+
+@dataclass
+class FailedRow:
+    """One failed (arm, qid) row read from ``raw.jsonl``; a retry candidate."""
+
+    arm: str
+    qid: str
+    doc_id: str
+    # Copied from the row's ``answer_format`` / ``gold`` / ``pages`` keys;
+    # the loader substitutes "" / "" / 0 when a key is missing or falsy.
+    answer_format: str
+    gold: str
+    pages: int
+    document_id: int | None
+    # The row's ``error`` value as stored; falsy (typically None) when the
+    # row only failed because its ``raw_text`` was empty.
+    original_error: str | None
+    # The complete parsed JSON row, kept as-is.
+    original_row: dict[str, Any]
+
+
+def _load_failed_rows(raw_path: Path) -> list[FailedRow]:
+    """Collect every failed row of a ``raw.jsonl`` file as ``FailedRow`` objects.
+
+    Blank lines are ignored. Each remaining line is parsed as JSON and
+    kept only when ``_is_failure_row`` flags it. ``arm``, ``qid`` and
+    ``doc_id`` are mandatory keys; the other fields fall back to empty /
+    zero / ``None`` when absent. File order is preserved.
+    """
+
+    with raw_path.open("r", encoding="utf-8") as fh:
+        # Lazy pipeline: strip -> drop blanks -> parse, one line at a time.
+        records = (json.loads(text) for text in map(str.strip, fh) if text)
+        return [
+            FailedRow(
+                arm=str(rec["arm"]),
+                qid=str(rec["qid"]),
+                doc_id=str(rec["doc_id"]),
+                answer_format=str(rec.get("answer_format") or ""),
+                gold=str(rec.get("gold") or ""),
+                pages=int(rec.get("pages") or 0),
+                document_id=rec.get("document_id"),
+                original_error=rec.get("error"),
+                original_row=rec,
+            )
+            for rec in records
+            if _is_failure_row(rec)
+        ]
+
+
+def _load_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
+    """Load the parser_compare manifest (JSONL) into a dict keyed by ``doc_id``.
+
+    Blank lines are ignored. Rows that are not JSON objects, or that have
+    no ``doc_id`` key (for example a settings header line at the top of
+    the manifest), are skipped rather than aborting the load with a
+    ``KeyError``. When the same ``doc_id`` appears more than once the
+    last row wins.
+    """
+
+    out: dict[str, dict[str, Any]] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            # Only per-document rows belong in the lookup table.
+            if not isinstance(row, dict) or "doc_id" not in row:
+                continue
+            out[str(row["doc_id"])] = row
+    return out
+
+
+def _load_question_text_index(
+    questions_jsonl: Path,
+) -> dict[tuple[str, int], dict[str, Any]]:
+    """Index ``questions.jsonl`` rows by ``(doc_id, per_doc_index)``.
+
+    qids in raw.jsonl look like ``{doc_id}::Q{NNN}``, NNN being the
+    position of the question among the questions of its document. The
+    file is therefore walked top to bottom and every document gets its
+    own zero-based counter, which is meant to reproduce the runner's
+    ``per_doc_idx`` numbering in ``_select_questions``. Blank lines and
+    rows without a ``doc_id`` are ignored and consume no index.
+    """
+
+    index: dict[tuple[str, int], dict[str, Any]] = {}
+    next_position: dict[str, int] = {}
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for raw_line in fh:
+            text = raw_line.strip()
+            if not text:
+                continue
+            record = json.loads(text)
+            doc_id = str(record.get("doc_id") or "")
+            if doc_id:
+                position = next_position.get(doc_id, 0)
+                index[(doc_id, position)] = record
+                next_position[doc_id] = position + 1
+    return index
+
+
+def _qid_index(qid: str) -> int:
+    """Return the per-doc question number encoded in a qid such as ``foo.pdf::Q007``.
+
+    Only the text after the last ``::`` is inspected. It must start with
+    ``Q``; the remainder is parsed with ``int``. A ``ValueError`` is
+    raised when the prefix is missing or the digits do not parse.
+    """
+
+    suffix = qid.rsplit("::", 1)[-1]
+    if suffix[:1] != "Q":
+        raise ValueError(f"unexpected qid shape: {qid!r}")
+    return int(suffix[1:])
+
+
+# ---------------------------------------------------------------------------
+# Request building (mirrors runner.py exactly so prompts are byte-identical)
+# ---------------------------------------------------------------------------
+
+
+def _build_native_request(
+    qid: str, question: str, answer_format: str, pdf_path: Path,
+    *, max_output_tokens: int,
+) -> ArmRequest:
+    """Assemble the ``ArmRequest`` for a native-PDF retry.
+
+    The prompt comes from ``build_native_pdf_prompt``; the PDF is
+    attached as the single entry of ``pdf_paths`` and the output budget
+    is forwarded as the ``max_tokens`` option.
+    """
+
+    prompt_text = build_native_pdf_prompt(question, answer_format=answer_format)
+    request_options = {"max_tokens": max_output_tokens}
+    return ArmRequest(
+        question_id=qid,
+        prompt=prompt_text,
+        pdf_paths=[pdf_path],
+        options=request_options,
+    )
+
+
+def _build_lc_request(
+    qid: str, question: str, answer_format: str, doc_id: str, md_path: Path,
+) -> ArmRequest:
+    """Assemble the ``ArmRequest`` for a long-context (parser markdown) retry.
+
+    Reads the cached extraction at ``md_path`` and embeds it in the
+    prompt built by ``build_long_context_prompt``, labelled with
+    ``doc_id``.
+
+    Raises:
+        FileNotFoundError: ``md_path`` does not exist on disk.
+    """
+
+    if md_path.exists():
+        markdown = md_path.read_text(encoding="utf-8")
+    else:
+        raise FileNotFoundError(
+            f"Missing parser extraction at {md_path}; cannot retry LC arm."
+        )
+    prompt_text = build_long_context_prompt(
+        question,
+        answer_format=answer_format,
+        document_markdown=markdown,
+        document_label=doc_id,
+    )
+    return ArmRequest(question_id=qid, prompt=prompt_text)
+
+
+# ---------------------------------------------------------------------------
+# Retry driver
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class AttemptLog:
+    """Timing and outcome of a single retry attempt, as recorded by ``_retry_one``."""
+
+    # 1-based attempt counter.
+    attempt: int
+    # UTC wall-clock start, formatted ``%Y-%m-%dT%H:%M:%SZ``.
+    started_iso: str
+    # Duration of the ``answer`` call, measured with ``time.monotonic``.
+    latency_ms: int
+    # The result's ``error``, or a synthetic "EmptyResponse: ..." message
+    # when the call returned no error but also no text; None on success.
+    error: str | None
+    # Length of the whitespace-stripped ``raw_text``.
+    raw_text_chars: int
+
+
+@dataclass
+class RetryOutcome:
+    """Result of retrying one (arm, qid) pair, as returned by ``_retry_one``."""
+
+    arm: str
+    qid: str
+    # One entry per attempt made, in order.
+    attempts: list[AttemptLog]
+    # The ``ArmResult`` of the last attempt made (the successful one when
+    # ``recovered`` is True).
+    final_result: ArmResult
+    # True when some attempt returned no error and non-empty text.
+    recovered: bool
+
+
+async def _retry_one(
+    arm_obj: Any, request: ArmRequest, *,
+    arm_name: str,
+    qid: str,
+    max_attempts: int,
+    base_delay: float,
+    max_delay: float,
+) -> RetryOutcome:
+    """Call ``arm_obj.answer(request)`` until it yields usable text or attempts run out.
+
+    An attempt succeeds when the result has no ``error`` and its
+    ``raw_text`` is non-blank. An error-free but blank result is logged
+    as a synthetic ``EmptyResponse`` error and retried like any other
+    failure.
+
+    Between attempts the coroutine sleeps for
+    ``base_delay * 2 ** (attempt - 1)`` seconds, clamped to ``max_delay``
+    and multiplied by a random jitter factor in ``[0.5, 1.5)``. The
+    jittered value is clamped again so ``max_delay`` is a true upper
+    bound on every sleep.
+
+    Args:
+        arm_obj: Object exposing an awaitable ``answer(request)``.
+        request: Request forwarded unchanged on every attempt.
+        arm_name: Arm label, used for logging and the returned outcome.
+        qid: Question id, used for logging and the returned outcome.
+        max_attempts: Upper bound on calls; values below 1 are treated as 1.
+        base_delay: Backoff base in seconds.
+        max_delay: Maximum sleep between attempts in seconds.
+
+    Returns:
+        A ``RetryOutcome`` with one ``AttemptLog`` per call made, the last
+        ``ArmResult`` and whether the call recovered.
+    """
+
+    # Always make at least one call so there is a final result to report
+    # (same defensive clamp as ``max(1, concurrency)`` in the gather helper).
+    max_attempts = max(1, max_attempts)
+    attempts: list[AttemptLog] = []
+    final: ArmResult | None = None
+    for attempt in range(1, max_attempts + 1):
+        started_iso = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+        t0 = time.monotonic()
+        result = await arm_obj.answer(request)
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        raw_text = (result.raw_text or "").strip()
+        attempt_error = result.error
+        if not attempt_error and not raw_text:
+            attempt_error = "EmptyResponse: stream ended with no text"
+        attempts.append(AttemptLog(
+            attempt=attempt,
+            started_iso=started_iso,
+            latency_ms=latency_ms,
+            error=attempt_error,
+            raw_text_chars=len(raw_text),
+        ))
+        final = result
+        if not attempt_error and raw_text:
+            return RetryOutcome(
+                arm=arm_name, qid=qid, attempts=attempts,
+                final_result=result, recovered=True,
+            )
+        if attempt < max_attempts:
+            delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
+            delay = delay * (0.5 + random.random())
+            # Jitter can push the value up to 1.5x the cap; clamp again so
+            # the documented per-retry maximum actually holds.
+            delay = min(max_delay, delay)
+            logger.info(
+                "[%s::%s] attempt %d/%d failed (%s); sleeping %.1fs",
+                arm_name, qid, attempt, max_attempts, attempt_error, delay,
+            )
+            await asyncio.sleep(delay)
+    assert final is not None  # invariant: the loop ran at least once
+    return RetryOutcome(
+        arm=arm_name, qid=qid, attempts=attempts,
+        final_result=final, recovered=False,
+    )
+
+
+async def _gather_with_limit(coros: list, *, concurrency: int) -> list[Any]:
+    """Await every awaitable in ``coros`` with at most ``concurrency`` running at once.
+
+    A ``concurrency`` below 1 is treated as 1. Results come back in the
+    same order as ``coros``.
+    """
+
+    limiter = asyncio.Semaphore(max(1, concurrency))
+
+    async def _bounded(pending):
+        # Hold one semaphore slot for the whole lifetime of the awaitable.
+        await limiter.acquire()
+        try:
+            return await pending
+        finally:
+            limiter.release()
+
+    guarded = [_bounded(item) for item in coros]
+    return await asyncio.gather(*guarded)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+async def _run(args: argparse.Namespace) -> int:
+    """Run the retry pass described by ``args`` and write its artifacts.
+
+    Steps:
+
+    1. Load the failed rows of ``raw.jsonl`` for ``args.run_id``; drop
+       ``surfsense_agentic`` rows unless ``--include-surfsense`` is set.
+    2. Rebuild one ``ArmRequest`` per failed row from the manifest and
+       ``questions.jsonl``. Rows whose inputs cannot be rebuilt (bad qid,
+       unknown question or doc, PDF or extraction missing) are logged
+       and skipped.
+    3. Retry the requests with bounded concurrency via ``_retry_one``.
+    4. Grade each final result, write ``raw_retries.jsonl`` and
+       ``raw_retries_summary.json`` next to ``raw.jsonl`` and print a
+       per-arm table.
+
+    Returns:
+        ``0`` on every path that does not raise.
+
+    Raises:
+        SystemExit: ``raw.jsonl``, the manifest, ``questions.jsonl`` or
+            ``OPENROUTER_API_KEY`` is missing.
+    """
+
+    load_dotenv(REPO / ".env")
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+
+    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
+    raw_path = run_dir / "raw.jsonl"
+    if not raw_path.exists():
+        raise SystemExit(f"raw.jsonl not found at {raw_path}")
+
+    map_path = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+    questions_jsonl = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
+    if not map_path.exists():
+        raise SystemExit(f"parser_compare manifest not found at {map_path}")
+    if not questions_jsonl.exists():
+        raise SystemExit(f"mmlongbench questions not found at {questions_jsonl}")
+
+    failed = _load_failed_rows(raw_path)
+    if not failed:
+        logger.info("No failed rows in %s — nothing to retry.", raw_path)
+        return 0
+
+    # SurfSense rows: warn and skip; we don't want to start backend just to
+    # defensively retry a 0-failure arm.
+    surf_failed = [f for f in failed if f.arm == "surfsense_agentic"]
+    if surf_failed:
+        logger.warning(
+            "Skipping %d surfsense_agentic failures; this script doesn't drive the backend. "
+            "If you want those retried too, start backend + celery and rerun "
+            "with --include-surfsense.",
+            len(surf_failed),
+        )
+        if not args.include_surfsense:
+            failed = [f for f in failed if f.arm != "surfsense_agentic"]
+    else:
+        logger.info("No surfsense_agentic failures; backend/celery not needed for this retry.")
+
+    if not failed:
+        logger.info("Nothing left to retry after filtering.")
+        return 0
+
+    by_arm_count: dict[str, int] = {}
+    for f in failed:
+        by_arm_count[f.arm] = by_arm_count.get(f.arm, 0) + 1
+    logger.info(
+        "Loaded %d failed rows across %d arms: %s",
+        len(failed), len(by_arm_count),
+        ", ".join(f"{a}={n}" for a, n in sorted(by_arm_count.items())),
+    )
+
+    doc_map = _load_doc_map(map_path)
+    qtext_idx = _load_question_text_index(questions_jsonl)
+
+    api_key = os.environ.get("OPENROUTER_API_KEY")
+    if not api_key:
+        raise SystemExit("OPENROUTER_API_KEY missing from environment / .env")
+
+    native_provider = OpenRouterPdfProvider(
+        api_key=api_key,
+        base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
+        model=args.llm_model,
+        engine=PdfEngine(args.pdf_engine),
+    )
+    native_arm = NativePdfArm(
+        provider=native_provider, max_output_tokens=args.max_output_tokens,
+    )
+
+    # One chat arm per long-context arm that actually has failed rows.
+    lc_arms: dict[str, BareLlmArm] = {}
+    for arm_name in sorted({f.arm for f in failed} & LC_ARMS):
+        lc_provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
+            model=args.llm_model,
+        )
+        lc_arms[arm_name] = BareLlmArm(
+            provider=lc_provider,
+            max_output_tokens=args.max_output_tokens,
+            name=arm_name,
+        )
+
+    # ``plan`` and ``coros`` are appended in lockstep so outcomes can be
+    # zipped back to the row they belong to.
+    coros: list = []
+    plan: list[tuple[FailedRow, ArmRequest, Any]] = []
+
+    for f in failed:
+        # Look up the question text from questions.jsonl
+        try:
+            q_idx = _qid_index(f.qid)
+        except ValueError:
+            logger.error("Bad qid %r — skipping", f.qid)
+            continue
+        qrow = qtext_idx.get((f.doc_id, q_idx))
+        if qrow is None:
+            logger.error(
+                "Could not find question text for %s (idx %d) — skipping",
+                f.doc_id, q_idx,
+            )
+            continue
+        question_text = str(qrow.get("question") or "").strip()
+        answer_format = str(qrow.get("answer_format") or f.answer_format or "").strip().lower()
+
+        map_row = doc_map.get(f.doc_id)
+        if map_row is None:
+            logger.error("doc_id %s not in manifest — skipping", f.doc_id)
+            continue
+
+        if f.arm == "native_pdf":
+            pdf_path = Path(map_row["pdf_path"])
+            if not pdf_path.exists():
+                logger.error("PDF missing on disk: %s — skipping", pdf_path)
+                continue
+            request = _build_native_request(
+                f.qid, question_text, answer_format, pdf_path,
+                max_output_tokens=args.max_output_tokens,
+            )
+            arm_obj = native_arm
+        elif f.arm in LC_ARMS:
+            ext_blob = (map_row.get("extractions") or {}).get(f.arm) or {}
+            md_path_str = ext_blob.get("markdown_path")
+            if not md_path_str or ext_blob.get("status") != "ok":
+                logger.error(
+                    "Missing extraction for %s on %s — cannot retry; skipping",
+                    f.arm, f.doc_id,
+                )
+                continue
+            try:
+                request = _build_lc_request(
+                    f.qid, question_text, answer_format, f.doc_id, Path(md_path_str),
+                )
+            except FileNotFoundError as exc:
+                # The manifest says "ok" but the markdown is gone from disk.
+                # Skip this row like every other missing artifact instead of
+                # aborting the whole retry pass.
+                logger.error("%s — skipping", exc)
+                continue
+            arm_obj = lc_arms[f.arm]
+        else:
+            logger.warning("Unhandled arm %s — skipping", f.arm)
+            continue
+
+        plan.append((f, request, arm_obj))
+        coros.append(_retry_one(
+            arm_obj, request,
+            arm_name=f.arm, qid=f.qid,
+            max_attempts=args.max_attempts,
+            base_delay=args.base_delay,
+            max_delay=args.max_delay,
+        ))
+
+    if not coros:
+        logger.warning("Nothing to retry after request building.")
+        return 0
+
+    logger.info(
+        "Retrying %d failed rows with up to %d attempts each "
+        "(base_delay=%.1fs, max_delay=%.1fs, concurrency=%d).",
+        len(coros), args.max_attempts, args.base_delay, args.max_delay,
+        args.concurrency,
+    )
+
+    started = time.monotonic()
+    outcomes: list[RetryOutcome] = await _gather_with_limit(
+        coros, concurrency=args.concurrency,
+    )
+    elapsed = time.monotonic() - started
+    logger.info("Retry pass finished in %.1fs.", elapsed)
+
+    out_path = run_dir / "raw_retries.jsonl"
+    summary_path = run_dir / "raw_retries_summary.json"
+
+    per_arm_recovered: dict[str, int] = {}
+    per_arm_total: dict[str, int] = {}
+    per_arm_attempts_dist: dict[str, list[int]] = {}
+
+    # Opened in "w" mode: each run replaces the previous raw_retries.jsonl.
+    with out_path.open("w", encoding="utf-8") as fh:
+        for (f, _req, _arm_obj), outcome in zip(plan, outcomes, strict=True):
+            per_arm_total[outcome.arm] = per_arm_total.get(outcome.arm, 0) + 1
+            if outcome.recovered:
+                per_arm_recovered[outcome.arm] = (
+                    per_arm_recovered.get(outcome.arm, 0) + 1
+                )
+            per_arm_attempts_dist.setdefault(outcome.arm, []).append(
+                len(outcome.attempts)
+            )
+
+            g = grade(
+                pred=extract_freeform_answer(outcome.final_result.raw_text or ""),
+                gold=f.gold,
+                answer_format=f.answer_format,
+            )
+            row = {
+                "qid": f.qid,
+                "doc_id": f.doc_id,
+                "arm": f.arm,
+                "answer_format": f.answer_format,
+                "gold": f.gold,
+                "pages": f.pages,
+                "document_id": f.document_id,
+                "original_error": f.original_error,
+                "retry": {
+                    "max_attempts": args.max_attempts,
+                    "n_attempts": len(outcome.attempts),
+                    "recovered": outcome.recovered,
+                    "attempts": [
+                        {
+                            "attempt": a.attempt,
+                            "started_iso": a.started_iso,
+                            "latency_ms": a.latency_ms,
+                            "error": a.error,
+                            "raw_text_chars": a.raw_text_chars,
+                        }
+                        for a in outcome.attempts
+                    ],
+                },
+                # Keys emitted by ``to_jsonl`` override same-named keys above.
+                **outcome.final_result.to_jsonl(),
+                "graded": {
+                    "correct": g.correct,
+                    "f1": g.f1,
+                    "method": g.method,
+                    "normalised_pred": g.normalised_pred,
+                    "normalised_gold": g.normalised_gold,
+                },
+            }
+            fh.write(json.dumps(row) + "\n")
+
+    summary = {
+        "run_id": args.run_id,
+        "raw_retries_path": str(out_path.relative_to(REPO)),
+        # Counted after the surfsense filter; ``n_retried`` additionally
+        # excludes rows skipped while building requests.
+        "n_failed_rows_input": len(failed),
+        "n_retried": len(coros),
+        "elapsed_s": round(elapsed, 1),
+        "config": {
+            "max_attempts": args.max_attempts,
+            "base_delay": args.base_delay,
+            "max_delay": args.max_delay,
+            "concurrency": args.concurrency,
+            "llm_model": args.llm_model,
+            "pdf_engine": args.pdf_engine,
+            "max_output_tokens": args.max_output_tokens,
+        },
+        "per_arm": {
+            arm: {
+                "tried": per_arm_total.get(arm, 0),
+                "recovered": per_arm_recovered.get(arm, 0),
+                "still_failed": (
+                    per_arm_total.get(arm, 0) - per_arm_recovered.get(arm, 0)
+                ),
+                "recovery_rate": (
+                    per_arm_recovered.get(arm, 0) / per_arm_total[arm]
+                    if per_arm_total.get(arm) else 0.0
+                ),
+                "attempts_distribution": sorted(per_arm_attempts_dist.get(arm, [])),
+            }
+            for arm in sorted(per_arm_total)
+        },
+        "totals": {
+            "tried": sum(per_arm_total.values()),
+            "recovered": sum(per_arm_recovered.values()),
+            "still_failed": sum(per_arm_total.values()) - sum(per_arm_recovered.values()),
+        },
+    }
+    summary_path.write_text(
+        json.dumps(summary, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+
+    print()
+    print("=" * 78)
+    print("Retry pass summary")
+    print("=" * 78)
+    header = f"{'arm':<25} {'tried':>6} {'recovered':>10} {'still fail':>11} {'rate':>7}"
+    print(header)
+    print("-" * len(header))
+    for arm in sorted(per_arm_total):
+        tried = per_arm_total[arm]
+        rec = per_arm_recovered.get(arm, 0)
+        rate = (rec / tried * 100) if tried else 0.0
+        print(f"{arm:<25} {tried:>6} {rec:>10} {tried - rec:>11} {rate:>6.1f}%")
+    total = sum(per_arm_total.values())
+    rec_total = sum(per_arm_recovered.values())
+    rate_total = (rec_total / total * 100) if total else 0.0
+    print("-" * len(header))
+    print(f"{'TOTAL':<25} {total:>6} {rec_total:>10} {total - rec_total:>11} "
+          f"{rate_total:>6.1f}%")
+    print()
+    print(f"Wrote {out_path.relative_to(REPO)}")
+    print(f"Wrote {summary_path.relative_to(REPO)}")
+    return 0
+
+
+def main() -> None:
+    """Parse the command line, run the retry pass and exit with its return code."""
+
+    engine_choices = [engine.value for engine in PdfEngine]
+
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument(
+        "--run-id",
+        default="2026-05-14T00-53-19Z",
+        help=(
+            "Run timestamp under data/multimodal_doc/runs/. Default is the "
+            "n=171 production run we wrote up in the blog."
+        ),
+    )
+    cli.add_argument("--max-attempts", type=int, default=5)
+    cli.add_argument(
+        "--base-delay",
+        type=float,
+        default=1.0,
+        help="Base seconds for exponential backoff (default 1s).",
+    )
+    cli.add_argument(
+        "--max-delay",
+        type=float,
+        default=30.0,
+        help="Cap on per-retry sleep (default 30s).",
+    )
+    cli.add_argument(
+        "--concurrency",
+        type=int,
+        default=2,
+        help=(
+            "Parallel retries in flight (default 2 — keep low "
+            "to avoid the same transport stress that caused "
+            "the original failures)."
+        ),
+    )
+    cli.add_argument("--llm-model", default="anthropic/claude-sonnet-4.5")
+    cli.add_argument("--pdf-engine", default="native", choices=engine_choices)
+    cli.add_argument("--max-output-tokens", type=int, default=512)
+    cli.add_argument(
+        "--include-surfsense",
+        action="store_true",
+        help=(
+            "Also retry surfsense_agentic failures (requires backend + celery up). "
+            "Default is to skip them since the n=171 run had 0 SurfSense failures."
+        ),
+    )
+
+    namespace = cli.parse_args()
+    exit_code = asyncio.run(_run(namespace))
+    raise SystemExit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/summarise_parser_compare_run.py
+++ b/surfsense_evals/scripts/summarise_parser_compare_run.py
@ -0,0 +1,122 @@
+"""Slice the parser_compare raw.jsonl for the n=171 run.
+
+Reports per-arm:
+  * tokens & cost stats (input/output mean, $/Q distribution)
+  * failures (status != ok or empty raw_text)
+  * answer_format breakdown (accuracy by str/int/float/list)
+
+Plus surfsense agentic breakdown so we can compare apples to apples
+even though the new_chat SSE doesn't surface per-call token counts.
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+
+REPO = Path(__file__).resolve().parents[1]
+RUN_DIR = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN_DIR / "raw.jsonl"
+ARTIFACT = RUN_DIR / "run_artifact.json"
+
+
+def main() -> None:
+    """Print per-arm summary tables for the hard-coded n=171 parser_compare run.
+
+    Reads ``RAW`` (one JSON row per (arm, qid)) and ``ARTIFACT`` and
+    prints three sections:
+
+    * a headline table: accuracy, F1 (taken from the artifact), failure
+      count, cost, token and latency statistics per arm;
+    * accuracy per ``answer_format`` per arm;
+    * the aggregated cost figures stored in ``run_artifact.json``.
+
+    The ``fail`` column counts rows with an ``error`` plus rows whose
+    ``raw_text`` is empty, matching the failure definition in the module
+    docstring.
+    """
+
+    rows = [json.loads(line) for line in RAW.read_text(encoding="utf-8").splitlines() if line.strip()]
+    print(f"raw rows: {len(rows)}")
+
+    by_qid: dict[str, list[dict]] = defaultdict(list)
+    for row in rows:
+        by_qid[row["qid"]].append(row)
+    print(f"unique questions: {len(by_qid)}")
+
+    arm_metrics: dict[str, dict] = defaultdict(lambda: {
+        "n": 0, "n_correct": 0, "n_failed": 0, "n_empty": 0,
+        "costs": [], "in_tokens": [], "out_tokens": [], "latency_ms": [],
+        "by_format": defaultdict(lambda: {"n": 0, "correct": 0}),
+    })
+
+    for row in rows:
+        arm = row["arm"]
+        m = arm_metrics[arm]
+        m["n"] += 1
+        graded = row.get("graded") or {}
+        if graded.get("correct"):
+            m["n_correct"] += 1
+
+        # A row is either an explicit error or, failing that, an empty
+        # response; the two buckets are disjoint.
+        err = row.get("error")
+        raw_text = row.get("raw_text") or ""
+        if err:
+            m["n_failed"] += 1
+        elif not raw_text.strip():
+            m["n_empty"] += 1
+
+        cost = row.get("cost_usd")
+        if cost is not None:
+            m["costs"].append(float(cost))
+        # Missing or zero token / latency values are left out of the stats.
+        ut = row.get("usage") or {}
+        if ut.get("prompt_tokens"):
+            m["in_tokens"].append(ut["prompt_tokens"])
+        if ut.get("completion_tokens"):
+            m["out_tokens"].append(ut["completion_tokens"])
+        if row.get("latency_ms"):
+            m["latency_ms"].append(row["latency_ms"])
+
+        fmt = row.get("answer_format") or "unknown"
+        m["by_format"][fmt]["n"] += 1
+        if graded.get("correct"):
+            m["by_format"][fmt]["correct"] += 1
+
+    print()
+    print("=" * 100)
+    print(f"{'arm':<25} {'n':>4} {'acc%':>6} {'F1%':>6} {'fail':>5} {'$ mean':>10} {'$ median':>10} {'in tok mean':>12} {'out tok mean':>12} {'p50 ms':>8}")
+    print("=" * 100)
+    art = json.loads(ARTIFACT.read_text(encoding="utf-8"))
+    per_arm_art = art["metrics"]["per_arm"]
+    for arm, m in sorted(arm_metrics.items()):
+        # ``m["n"]`` is at least 1: an entry only exists once a row was counted.
+        acc = m["n_correct"] / m["n"] * 100
+        # Empty responses are failures too; previously they were counted
+        # in ``n_empty`` but never reported.
+        fail = m["n_failed"] + m["n_empty"]
+        cost_mean = statistics.mean(m["costs"]) if m["costs"] else 0.0
+        cost_med = statistics.median(m["costs"]) if m["costs"] else 0.0
+        in_mean = statistics.mean(m["in_tokens"]) if m["in_tokens"] else 0
+        out_mean = statistics.mean(m["out_tokens"]) if m["out_tokens"] else 0
+        lat_p50 = statistics.median(m["latency_ms"]) if m["latency_ms"] else 0
+        f1 = per_arm_art.get(arm, {}).get("f1_mean", 0.0) * 100
+        print(
+            f"{arm:<25} {m['n']:>4} {acc:>5.1f}% {f1:>5.1f}% {fail:>5} "
+            f"${cost_mean:>9.4f} ${cost_med:>9.4f} {in_mean:>12.0f} {out_mean:>12.0f} {lat_p50:>8.0f}"
+        )
+
+    print()
+    print("by answer_format (accuracy):")
+    formats = sorted({f for m in arm_metrics.values() for f in m["by_format"].keys()})
+    header = f"{'arm':<25} " + " ".join(f"{f:>10}" for f in formats)
+    print(header)
+    print("-" * len(header))
+    for arm, m in sorted(arm_metrics.items()):
+        cells = []
+        for f in formats:
+            row = m["by_format"][f]
+            if row["n"] == 0:
+                cells.append(f"{'-':>10}")
+            else:
+                pct = row["correct"] / row["n"] * 100
+                cells.append(f"{pct:>5.0f}% ({row['correct']:>2}/{row['n']:>2})")
+        print(f"{arm:<25} " + " ".join(cells))
+
+    print()
+    print("=" * 100)
+    print("Aggregated cost (from run_artifact.json):")
+    for arm, row in per_arm_art.items():
+        print(
+            f"  {arm:<25}  acc={row['accuracy']*100:5.1f}% "
+            f"  $/Q LLM={row['llm_cost_per_q']:.4f}  "
+            f"  preprocess total=${row['preprocess_cost_total']:.2f}  "
+            f"  $/Q total={row['total_cost_per_q']:.4f}"
+        )
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/test_context_overflow_hypothesis.py
+++ b/surfsense_evals/scripts/test_context_overflow_hypothesis.py
@ -0,0 +1,155 @@
+"""Test the hypothesis: were the LC-arm errors actually context-window
+overflow errors disguised as SSL / network failures?
+
+If true, we'd expect:
+  (a) literal "prompt is too long" / "context_length_exceeded" / "exceeds .* tokens" strings,
+  (b) failures correlated with extraction size / input_tokens (large doc -> failure),
+  (c) failing requests near or over Sonnet 4.5's 200k input-token limit.
+
+If false (transport-layer hypothesis), we'd expect:
+  (a) only SSL / 502 / empty stream / JSONDecode strings,
+  (b) failures NOT correlated with size (uniform across PDFs by time, not by tokens),
+  (c) failing requests well below the 200k limit.
+"""
+
+from __future__ import annotations
+
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parents[1]
+RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
+RAW = RUN / "raw.jsonl"
+MANIFEST = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
+
+# Lower-case substrings searched for in each row's lower-cased ``error``
+# text to spot context-window overflow messages. Matching is plain
+# substring containment, so broad entries such as "exceeds" can also hit
+# unrelated errors; treat every match as a lead to inspect by hand.
+CONTEXT_HINTS = (
+    "context_length",
+    "context window",
+    "prompt is too long",
+    "exceeds",
+    "maximum context",
+    "input tokens",
+    "too many tokens",
+    "over the maximum",
+    "200000",
+    "200_000",
+)
+
+
+def main() -> None:
+    """Print the evidence for and against the context-overflow hypothesis.
+
+    Loads every row of ``RAW`` and the per-(doc, arm) extraction sizes
+    from ``MANIFEST``, then prints four sections:
+
+    (a) rows whose error text contains one of ``CONTEXT_HINTS``;
+    (b) extraction-size statistics for OK vs failed rows per parser arm;
+    (c) the largest extraction that succeeded vs failed per parser arm;
+    (d) every ``llamacloud_premium_lc`` row of ``3M_2018_10K.pdf``.
+    """
+
+    rule = "=" * 80
+
+    raw_lines = RAW.read_text(encoding="utf-8").splitlines()
+    rows = [json.loads(entry) for entry in raw_lines if entry.strip()]
+
+    # (doc_id, arm) -> extraction size in characters, from the manifest.
+    extraction_size: dict[tuple[str, str], int] = {}
+    for entry in MANIFEST.read_text(encoding="utf-8").splitlines():
+        if entry.strip():
+            manifest_row = json.loads(entry)
+            extractions = manifest_row.get("extractions") or {}
+            for arm, ext in extractions.items():
+                key = (manifest_row["doc_id"], arm)
+                extraction_size[key] = int(ext.get("chars") or 0)
+
+    print(rule)
+    print("(a) Literal 'context window' / 'prompt too long' error strings?")
+    print(rule)
+    found = 0
+    for row in rows:
+        err = (row.get("error") or "").lower()
+        # Each offending row is reported once, however many hints match.
+        if err and any(hint in err for hint in CONTEXT_HINTS):
+            print(f"  {row['arm']:<25} {row['qid']:<50}")
+            print(f"      -> {err[:240]}")
+            found += 1
+    if found == 0:
+        print("  none found.")
+
+    print()
+    print(rule)
+    print("(b) Extraction size for OK vs FAILED rows per arm")
+    print(rule)
+    arm_buckets: dict[str, dict[str, list[int]]] = defaultdict(
+        lambda: {"ok": [], "fail": []}
+    )
+    parser_arms = (
+        "azure_basic_lc", "azure_premium_lc",
+        "llamacloud_basic_lc", "llamacloud_premium_lc",
+    )
+    for row in rows:
+        arm = row["arm"]
+        if arm in parser_arms:
+            # Unknown (doc, arm) pairs are counted with size 0.
+            size = extraction_size.get((row["doc_id"], arm), 0)
+            failed = bool(row.get("error")) or not (row.get("raw_text") or "").strip()
+            bucket = "fail" if failed else "ok"
+            arm_buckets[arm][bucket].append(size)
+
+    print(f"{'arm':<25} {'bucket':<5} {'n':>4} {'mean chars':>12} {'median':>10} {'max':>10}")
+    for arm in parser_arms:
+        for bucket in ("ok", "fail"):
+            sizes = arm_buckets[arm][bucket]
+            if sizes:
+                print(
+                    f"  {arm:<23} {bucket:<5} {len(sizes):>4} "
+                    f"{statistics.mean(sizes):>12,.0f} "
+                    f"{statistics.median(sizes):>10,.0f} "
+                    f"{max(sizes):>10,}"
+                )
+            else:
+                print(f"  {arm:<23} {bucket:<5} {0:>4}  -")
+
+    print()
+    print(rule)
+    print("(c) Largest extraction each arm processed *successfully* vs *failed*")
+    print(rule)
+    print(
+        "(Sonnet 4.5 input limit ~200k tokens ~= 800k chars. If failures were "
+        "context-overflow, max-OK would be near that cap. If max-OK is well "
+        "above max-FAIL, the model handled bigger contexts than the failed "
+        "ones, so size cannot be the cause.)"
+    )
+    print()
+    for arm in parser_arms:
+        ok_sizes = arm_buckets[arm]["ok"]
+        fail_sizes = arm_buckets[arm]["fail"]
+        if ok_sizes:
+            max_ok = max(ok_sizes)
+            max_fail = max(fail_sizes) if fail_sizes else 0
+            # Token figures use the rough 4-characters-per-token estimate.
+            print(
+                f"  {arm:<25} max OK = {max_ok:>10,} chars (~{max_ok / 4:>7,.0f} tokens)  "
+                f"max FAIL = {max_fail:>10,} chars (~{max_fail / 4:>7,.0f} tokens)"
+            )
+
+    print()
+    print(rule)
+    print("(d) Did the *known* overflow candidate fail?")
+    print(rule)
+    print(
+        "  3M_2018_10K x llamacloud_premium = 908,733 chars (~227k tokens) "
+        "-- this is above Sonnet 4.5's 200k window."
+    )
+    print("  If transport hypothesis is correct, this should still fail with a "
+          "real overflow error.")
+    print("  If transport hypothesis is correct AND the model truncates silently, "
+          "it might 'succeed' but be wrong.")
+    print()
+    for row in rows:
+        if row["doc_id"] == "3M_2018_10K.pdf" and row["arm"] == "llamacloud_premium_lc":
+            err = row.get("error") or "(none)"
+            graded = row.get("graded") or {}
+            print(
+                f"  {row['qid']:<40} correct={graded.get('correct')!s:<5}  "
+                f"err={err[:100]}"
+            )
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/src/surfsense_evals/core/parsers/init.py
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/init.py
@ -0,0 +1,35 @@
+"""Direct parser invocations for the parser_compare benchmark.
+
+The SurfSense backend exposes a single ``ETL_SERVICE`` env var that
+picks one parser globally; per-ingestion overrides are not on the
+public API. To drive the four (Azure DI x basic/premium, LlamaCloud x
+basic/premium) extractions we need for ``multimodal_doc/parser_compare``
+we therefore call the Azure DI and LlamaCloud SDKs directly from the
+eval harness, mirroring the production code path in
+``surfsense_backend/app/etl_pipeline/parsers/``.
+
+Two design rules:
+
+* No backend imports — the eval harness cannot pull in the FastAPI
+  app's config layer (it would require the full backend ``.env`` plus a
+  reachable Postgres). We re-read keys from our own environment instead.
+* Same wire shape as the backend's parsers (Azure ``prebuilt-read`` /
+  ``prebuilt-layout`` selected by ``processing_mode``; LlamaCloud
+  ``parse_page_with_llm`` / ``parse_page_with_agent`` selected by
+  ``processing_mode``) so any quality conclusions transfer back to
+  production behaviour.
+"""
+
+from __future__ import annotations
+
+from .azure_di import AzureDIError, parse_with_azure_di
+from .llamacloud import LlamaCloudError, parse_with_llamacloud
+from .pdf_pages import count_pdf_pages
+
+__all__ = [
+    "AzureDIError",
+    "LlamaCloudError",
+    "count_pdf_pages",
+    "parse_with_azure_di",
+    "parse_with_llamacloud",
+]
--- a/surfsense_evals/src/surfsense_evals/core/parsers/azure_di.py
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/azure_di.py
@ -0,0 +1,144 @@
+"""Azure Document Intelligence parser — eval-side mirror of the backend.
+
+Calls ``DocumentIntelligenceClient.begin_analyze_document`` with one
+of two ``model_id`` slugs depending on ``processing_mode``:
+
+* ``basic``   → ``prebuilt-read``   (text OCR only, cheaper, faster)
+* ``premium`` → ``prebuilt-layout`` (text + tables + structure;
+                                     produces real markdown headings,
+                                     pipe-tables, etc.)
+
+These are the same model selections the production
+``surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py``
+makes per ``processing_mode``. Output format is forced to Markdown
+(``DocumentContentFormat.MARKDOWN``) so the long-context arm can stuff
+it into a prompt verbatim.
+
+Retry policy is intentionally light here (the eval harness re-runs
+the whole batch on top-level failure): we make up to ``_MAX_RETRIES``
+attempts, with jittered exponential backoff between them, on transient
+transport and server-side errors.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import random
+
+logger = logging.getLogger(__name__)
+
+
+# processing_mode -> Azure DI model id. Modes missing from this table are
+# resolved to ``prebuilt-read`` by ``parse_with_azure_di``.
+_AZURE_MODEL_BY_MODE = {
+    "basic": "prebuilt-read",
+    "premium": "prebuilt-layout",
+}
+
+# Retry budget for one parse call: total number of attempts, the first
+# backoff delay, and the backoff ceiling (delays are in seconds and are
+# handed to ``asyncio.sleep``).
+_MAX_RETRIES = 4
+_BASE_DELAY = 5.0
+_MAX_DELAY = 60.0
+
+
+class AzureDIError(RuntimeError):
+    """Azure DI could not produce usable markdown for a document.
+
+    Raised by ``parse_with_azure_di`` for an empty result, for a
+    non-retryable 4xx response, and once every attempt has failed.
+    """
+
+
+async def parse_with_azure_di(
+    file_path: str | os.PathLike,
+    *,
+    processing_mode: str = "basic",
+    endpoint: str | None = None,
+    api_key: str | None = None,
+) -> str:
+    """Run Azure DI on ``file_path`` and return the markdown content.
+
+    Args:
+        file_path: Local document to send to Azure DI.
+        processing_mode: ``"basic"`` or ``"premium"``; mapped to a model
+            id through ``_AZURE_MODEL_BY_MODE``. Any other value falls
+            back to ``prebuilt-read`` and logs a warning.
+        endpoint: Service endpoint; defaults to the ``AZURE_DI_ENDPOINT``
+            env var (set in ``surfsense_evals/.env``).
+        api_key: Service key; defaults to the ``AZURE_DI_KEY`` env var.
+
+    Returns:
+        The stripped, non-empty markdown returned by the service.
+
+    Raises:
+        ValueError: If the endpoint or key is missing.
+        AzureDIError: On empty content, on a non-retryable 4xx response,
+            or after ``_MAX_RETRIES`` failed attempts (the last
+            underlying error is named in the message and chained).
+        ClientAuthenticationError: Propagated unchanged from the Azure
+            SDK; bad credentials are never retried.
+    """
+
+    endpoint = endpoint or os.environ.get("AZURE_DI_ENDPOINT")
+    api_key = api_key or os.environ.get("AZURE_DI_KEY")
+    if not endpoint or not api_key:
+        raise ValueError(
+            "AZURE_DI_ENDPOINT and AZURE_DI_KEY must be set "
+            "(see surfsense_evals/.env)."
+        )
+
+    if processing_mode not in _AZURE_MODEL_BY_MODE:
+        # Keep the lenient fallback, but make it visible: a mistyped mode
+        # would otherwise silently run the basic model.
+        logger.warning(
+            "Unknown Azure DI processing_mode %r; falling back to prebuilt-read",
+            processing_mode,
+        )
+    model_id = _AZURE_MODEL_BY_MODE.get(processing_mode, "prebuilt-read")
+
+    # Lazy imports — surfsense_evals shouldn't pay the azure-sdk
+    # import cost on every CLI invocation that doesn't touch
+    # parser_compare.
+    from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
+    from azure.ai.documentintelligence.models import DocumentContentFormat
+    from azure.core.credentials import AzureKeyCredential
+    from azure.core.exceptions import (
+        ClientAuthenticationError,
+        HttpResponseError,
+        ServiceRequestError,
+        ServiceResponseError,
+    )
+
+    # 4xx codes that signal a transient condition (request timeout,
+    # throttling) rather than a malformed request; these are retried.
+    retryable_4xx = (408, 429)
+
+    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
+    logger.info(
+        "Azure DI parsing %s (mode=%s, model=%s, size=%.1fMB)",
+        file_path, processing_mode, model_id, file_size_mb,
+    )
+
+    last_exc: Exception | None = None
+    for attempt in range(1, _MAX_RETRIES + 1):
+        try:
+            # A fresh client per attempt; ``async with`` closes it again.
+            client = DocumentIntelligenceClient(
+                endpoint=endpoint,
+                credential=AzureKeyCredential(api_key),
+            )
+            async with client:
+                with open(file_path, "rb") as fh:
+                    poller = await client.begin_analyze_document(
+                        model_id,
+                        body=fh,
+                        output_content_format=DocumentContentFormat.MARKDOWN,
+                    )
+                result = await poller.result()
+            content = (result.content or "").strip()
+            if not content:
+                # Not matched by any handler below, so it propagates
+                # immediately instead of being retried.
+                raise AzureDIError(
+                    f"Azure DI returned empty content for {file_path}"
+                )
+            logger.info(
+                "Azure DI OK: %s (%s) -> %d chars",
+                file_path, model_id, len(content),
+            )
+            return content
+
+        except ClientAuthenticationError:
+            raise
+        except HttpResponseError as exc:
+            status = exc.status_code
+            # Other 4xx: don't retry, the request itself is broken.
+            if status and 400 <= status < 500 and status not in retryable_4xx:
+                raise AzureDIError(
+                    f"Azure DI {status} on {file_path}: {exc}"
+                ) from exc
+            last_exc = exc
+        except (ServiceRequestError, ServiceResponseError) as exc:
+            last_exc = exc
+
+        if attempt < _MAX_RETRIES:
+            # Exponential backoff capped at _MAX_DELAY, with +/-25% jitter.
+            delay = min(_BASE_DELAY * (2 ** (attempt - 1)), _MAX_DELAY)
+            jitter = delay * 0.25 * (2 * random.random() - 1)
+            sleep_for = delay + jitter
+            logger.warning(
+                "Azure DI attempt %d/%d failed (%s); retrying in %.1fs",
+                attempt, _MAX_RETRIES, type(last_exc).__name__, sleep_for,
+            )
+            await asyncio.sleep(sleep_for)
+
+    # Name the last failure in the message itself: callers that only
+    # record ``str(exc)`` would otherwise lose the actual cause.
+    raise AzureDIError(
+        f"Azure DI failed after {_MAX_RETRIES} attempts on {file_path}: "
+        f"{type(last_exc).__name__}: {last_exc}"
+    ) from last_exc
+
+
+__all__ = ["AzureDIError", "parse_with_azure_di"]
--- a/surfsense_evals/src/surfsense_evals/core/parsers/llamacloud.py
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/llamacloud.py
@ -0,0 +1,168 @@
+"""LlamaParse (LlamaCloud) parser — eval-side mirror of the backend.
+
+Calls ``LlamaParse.aparse`` with one of two ``parse_mode`` slugs
+depending on ``processing_mode``:
+
+* ``basic``   → ``parse_page_with_llm``   (cheap, single-LLM-call/page)
+* ``premium`` → ``parse_page_with_agent`` (multi-step agent per page;
+                                            handles tables / figures
+                                            substantially better)
+
+These are the exact mappings from production
+``surfsense_backend/app/etl_pipeline/parsers/llamacloud.py``. We keep
+``num_workers=1`` and language=``"en"`` to match production.
+
+The result is materialised via ``get_markdown_documents(split_by_page=False)``
+which concatenates every page into a single markdown string, exactly
+the shape we need for long-context stuffing.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import random
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# processing_mode -> LlamaParse ``parse_mode``. Modes missing from this
+# table are resolved to ``parse_page_with_llm`` by ``parse_with_llamacloud``.
+_LLAMA_PARSE_MODE_MAP = {
+    "basic": "parse_page_with_llm",
+    "premium": "parse_page_with_agent",
+}
+
+# Retry budget for one parse call: total number of attempts, the first
+# backoff delay, and the backoff ceiling (delays are in seconds and are
+# handed to ``asyncio.sleep``).
+_MAX_RETRIES = 3
+_BASE_DELAY = 10.0
+_MAX_DELAY = 90.0
+
+
+class LlamaCloudError(RuntimeError):
+    """LlamaCloud could not produce markdown for a document.
+
+    ``parse_with_llamacloud`` raises this once its retry budget is
+    spent; the last underlying exception is chained as ``__cause__``.
+    """
+
+
+def _document_text(doc) -> str:
+    """Return the text payload of one document-like object."""
+
+    for attr in ("text", "page_content"):
+        if hasattr(doc, attr):
+            return getattr(doc, attr) or ""
+    return str(doc)
+
+
+def _extract_markdown(result) -> str:
+    """Pull markdown out of whatever object LlamaParse.aparse returns.
+
+    Mirrors backend's tolerant extraction: the SDK has gone through
+    several response shapes; we accept all of them so a minor SDK bump
+    doesn't silently zero the eval. Shapes are tried in this order:
+
+    1. An object with ``get_markdown_documents`` — the first document's
+       ``text`` (called with ``split_by_page=False``).
+    2. An object exposing ``pages`` — every non-empty ``page.md`` joined
+       with blank lines. This also applies when the object has no
+       ``get_markdown_documents`` method.
+    3. A list of documents — the text of *every* entry joined with blank
+       lines (``text``, else ``page_content``, else ``str``), so
+       multi-document responses are not truncated to their first item.
+    4. Anything else — ``str(result)`` as a last resort.
+    """
+
+    if hasattr(result, "get_markdown_documents"):
+        docs = result.get_markdown_documents(split_by_page=False)
+        if docs and hasattr(docs[0], "text"):
+            return docs[0].text
+
+    pages = getattr(result, "pages", None)
+    if pages:
+        return "\n\n".join(p.md for p in pages if getattr(p, "md", None))
+
+    if isinstance(result, list):
+        texts = (_document_text(doc) for doc in result)
+        return "\n\n".join(text for text in texts if text)
+
+    return str(result)
+
+
+async def parse_with_llamacloud(
+    file_path: str | os.PathLike,
+    *,
+    processing_mode: str = "basic",
+    estimated_pages: int = 50,
+    api_key: str | None = None,
+) -> str:
+    """Run LlamaParse on ``file_path`` and return the markdown content.
+
+    Args:
+        file_path: Local document to parse.
+        processing_mode: ``"basic"`` or ``"premium"``; mapped to a
+            ``parse_mode`` through ``_LLAMA_PARSE_MODE_MAP``. Any other
+            value falls back to ``parse_page_with_llm`` and logs a warning.
+        estimated_pages: Page-count estimate used only to size the job
+            timeout (60s baseline + 30s per page, at least 180s).
+        api_key: Defaults to the ``LLAMA_CLOUD_API_KEY`` env var (set
+            in ``surfsense_evals/.env``).
+
+    Returns:
+        The stripped, non-empty markdown extracted from the parse result.
+
+    Raises:
+        ValueError: If the API key is missing.
+        LlamaCloudError: After ``_MAX_RETRIES`` failed attempts; the last
+            underlying error is named in the message and chained.
+    """
+
+    api_key = api_key or os.environ.get("LLAMA_CLOUD_API_KEY")
+    if not api_key:
+        raise ValueError(
+            "LLAMA_CLOUD_API_KEY must be set (see surfsense_evals/.env)."
+        )
+
+    if processing_mode not in _LLAMA_PARSE_MODE_MAP:
+        # Keep the lenient fallback, but make it visible: a mistyped mode
+        # would otherwise silently run the basic parse mode.
+        logger.warning(
+            "Unknown LlamaCloud processing_mode %r; falling back to "
+            "parse_page_with_llm",
+            processing_mode,
+        )
+    parse_mode = _LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")
+
+    # Lazy import: llama-cloud pulls llama-index-core (~50 MB) on first
+    # touch; defer until the parser actually runs.
+    from llama_cloud_services import LlamaParse
+    from llama_cloud_services.parse.base import JobFailedException
+    from llama_cloud_services.parse.utils import ResultType
+
+    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
+    # Match backend's per-page timeout heuristic so big PDFs don't drop
+    # mid-job: 60s baseline + 30s/page (premium agent runs longer than
+    # basic; both fit comfortably here).
+    job_timeout = max(180.0, 60.0 + 30.0 * estimated_pages)
+    upload_timeout = max(120.0, 30.0 * file_size_mb)
+
+    logger.info(
+        "LlamaCloud parsing %s (mode=%s, parse_mode=%s, %.1fMB, "
+        "job_timeout=%.0fs)",
+        file_path, processing_mode, parse_mode, file_size_mb, job_timeout,
+    )
+
+    custom_timeout = httpx.Timeout(
+        connect=120.0, read=upload_timeout, write=upload_timeout, pool=120.0,
+    )
+
+    last_exc: Exception | None = None
+    for attempt in range(1, _MAX_RETRIES + 1):
+        try:
+            # A fresh HTTP client per attempt; closed by ``async with``.
+            async with httpx.AsyncClient(timeout=custom_timeout) as client:
+                parser = LlamaParse(
+                    api_key=api_key,
+                    num_workers=1,
+                    verbose=False,
+                    language="en",
+                    result_type=ResultType.MD,
+                    parse_mode=parse_mode,
+                    ignore_errors=False,
+                    max_timeout=int(max(2000.0, job_timeout + upload_timeout)),
+                    job_timeout_in_seconds=job_timeout,
+                    job_timeout_extra_time_per_page_in_seconds=60,
+                    custom_client=client,
+                )
+                result = await parser.aparse(str(file_path))
+            content = _extract_markdown(result).strip()
+            if not content:
+                # LlamaCloudError subclasses RuntimeError, so the handler
+                # below catches it and an empty result is retried too.
+                raise LlamaCloudError(
+                    f"LlamaCloud returned empty content for {file_path}"
+                )
+            logger.info(
+                "LlamaCloud OK: %s (%s) -> %d chars",
+                file_path, parse_mode, len(content),
+            )
+            return content
+
+        except (
+            httpx.HTTPError,
+            JobFailedException,
+            RuntimeError,
+        ) as exc:
+            last_exc = exc
+            if attempt < _MAX_RETRIES:
+                # Exponential backoff capped at _MAX_DELAY, +/-25% jitter.
+                delay = min(_BASE_DELAY * (2 ** (attempt - 1)), _MAX_DELAY)
+                jitter = delay * 0.25 * (2 * random.random() - 1)
+                sleep_for = delay + jitter
+                logger.warning(
+                    "LlamaCloud attempt %d/%d failed (%s); retrying in %.1fs",
+                    attempt, _MAX_RETRIES, type(last_exc).__name__, sleep_for,
+                )
+                await asyncio.sleep(sleep_for)
+
+    # Name the last failure in the message itself: callers that only
+    # record ``str(exc)`` would otherwise lose the actual cause.
+    raise LlamaCloudError(
+        f"LlamaCloud failed after {_MAX_RETRIES} attempts on {file_path}: "
+        f"{type(last_exc).__name__}: {last_exc}"
+    ) from last_exc
+
+
+__all__ = ["LlamaCloudError", "parse_with_llamacloud"]
--- a/surfsense_evals/src/surfsense_evals/core/parsers/pdf_pages.py
+++ b/surfsense_evals/src/surfsense_evals/core/parsers/pdf_pages.py
@ -0,0 +1,35 @@
+"""Tiny pypdf wrapper for "how many pages does this PDF have?".
+
+Used by ``parser_compare`` to:
+
+* Decide LlamaCloud's per-page job timeout.
+* Compute the SurfSense preprocessing dollar cost
+  (``$1 / 1k pages`` for basic, ``$10 / 1k pages`` for premium) so the
+  report can show "ingest + LLM" total cost per arm.
+
+Returns ``0`` (and logs) on parse failure rather than raising — costs
+shown as ``?`` are always better than a benchmark that crashes mid-run.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def count_pdf_pages(path: Path) -> int:
+    """Count the pages of the PDF at ``path``.
+
+    Any failure — pypdf not importable, file unreadable or malformed —
+    is logged as a warning and reported as ``0`` instead of being raised.
+    """
+
+    page_total = 0
+    try:
+        from pypdf import PdfReader
+
+        page_total = len(PdfReader(str(path)).pages)
+    except Exception as exc:  # noqa: BLE001
+        logger.warning("Failed to count pages for %s: %s", path, exc)
+    return page_total
+
+
+__all__ = ["count_pdf_pages"]
--- a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/init.py
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/init.py
@ -0,0 +1,46 @@
+"""parser_compare — six-way head-to-head on long multimodal PDFs.
+
+Same 5 mmlongbench PDFs that ``mmlongbench`` already ingested
+(``search_space_id=55``), one question per PDF for the smoke run.
+
+The point of this benchmark is to disentangle TWO orthogonal
+dimensions of "how good is our multimodal pipeline?":
+
+1. **Parser quality** — Azure DI prebuilt-read vs prebuilt-layout vs
+   LlamaParse parse_page_with_llm vs parse_page_with_agent. We run
+   each parser directly (bypassing ``/documents/fileupload`` because
+   the backend's parser routing is global, not per-call) and stuff the
+   resulting markdown into a long-context prompt.
+
+2. **Context-management strategy** — full-context stuffing (no chunk
+   selection, the model sees everything) vs SurfSense's agentic
+   retrieval over chunks of the same documents.
+
+Six arms, all answered by ``anthropic/claude-sonnet-4.5``:
+
+* ``native_pdf``           — PDF attached natively via OpenRouter
+                              (gold-standard reference).
+* ``azure_basic_lc``       — Azure DI ``prebuilt-read`` markdown stuffed
+                              into the prompt.
+* ``azure_premium_lc``     — Azure DI ``prebuilt-layout`` markdown stuffed.
+* ``llamacloud_basic_lc``  — LlamaParse ``parse_page_with_llm`` markdown stuffed.
+* ``llamacloud_premium_lc`` — LlamaParse ``parse_page_with_agent`` markdown stuffed.
+* ``surfsense_agentic``    — SurfSense ``/api/v1/new_chat`` with
+                              ``mentioned_document_ids`` scoped to the
+                              one source PDF, retrieving chunks from
+                              the existing search_space=55 ingestion
+                              (vision_llm=on, processing_mode=premium,
+                              ETL_SERVICE=LLAMACLOUD with Azure DI
+                              fallback ⇒ effectively azure_premium).
+
+The report includes preprocessing cost ($1 / 1k pages basic, $10 / 1k
+pages premium) on top of the OpenRouter LLM cost so each arm's true
+total-cost-per-question is directly comparable.
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import ParserCompareBenchmark
+
+# Import-time side effect: importing this package registers one
+# ParserCompareBenchmark instance with the core registry.
+_registry.register(ParserCompareBenchmark())
--- a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/ingest.py
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/ingest.py
@ -0,0 +1,356 @@
+"""parser_compare ingestion: pre-extract markdown 4 ways per PDF.
+
+For each PDF in scope, we run all four (parser × mode) combinations
+in parallel and persist the resulting markdown alongside the PDF:
+
+    data/multimodal_doc/parser_compare/extractions/
+      <pdf_stem>.azure_basic_lc.md
+      <pdf_stem>.azure_premium_lc.md
+      <pdf_stem>.llamacloud_basic_lc.md
+      <pdf_stem>.llamacloud_premium_lc.md
+
+A manifest at ``maps/parser_compare_doc_map.jsonl`` records, per PDF:
+
+* ``doc_id``         — filename of the source PDF.
+* ``pdf_path``       — local cached PDF path.
+* ``document_id``    — SurfSense document id (carried over from
+                        mmlongbench's existing ingestion so the
+                        SurfSense agentic arm can scope retrieval).
+* ``pages``          — page count via pypdf (drives preprocessing cost).
+* ``extractions``    — map of ``arm_name -> {markdown_path, chars,
+                        elapsed_s, status, error}``.
+
+The runner reads this manifest, loads the markdown for each long-context
+arm, and uses ``document_id`` for the SurfSense arm.
+
+Source PDFs come from the existing mmlongbench ingestion — no new
+download or upload happens here. The point of this benchmark is
+parser quality on the same physical PDFs SurfSense already has, so
+re-using mmlongbench's PDF cache is correct.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from ....core.config import set_suite_state
+from ....core.parsers import (
+    AzureDIError,
+    LlamaCloudError,
+    count_pdf_pages,
+    parse_with_azure_di,
+    parse_with_llamacloud,
+)
+from ....core.registry import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+# One entry per long-context arm: (arm_name, parser, mode). ``arm_name``
+# keys the manifest's ``extractions`` map and is part of the cached
+# markdown filename; ``parser`` / ``mode`` pick the SDK call.
+# Order matters for the manifest only (deterministic JSONL diffs);
+# the runner doesn't rely on it.
+PARSER_ARMS: tuple[tuple[str, str, str], ...] = (
+    ("azure_basic_lc",       "azure",      "basic"),
+    ("azure_premium_lc",     "azure",      "premium"),
+    ("llamacloud_basic_lc",  "llamacloud", "basic"),
+    ("llamacloud_premium_lc", "llamacloud", "premium"),
+)
+
+
+@dataclass
+class ExtractionResult:
+    """Outcome of one (parser, mode) extraction for a single PDF."""
+
+    arm: str
+    parser: str
+    mode: str
+    markdown_path: Path | None = None
+    chars: int = 0
+    elapsed_s: float = 0.0
+    status: str = "ok"  # "ok" | "failed"
+    error: str | None = None
+
+    def to_jsonl(self) -> dict[str, Any]:
+        """Return a JSON-serialisable dict for the manifest.
+
+        The path is stringified (``None`` when unset) and ``elapsed_s``
+        is rounded to two decimals.
+        """
+
+        saved_to = str(self.markdown_path) if self.markdown_path else None
+        payload: dict[str, Any] = {
+            "arm": self.arm,
+            "parser": self.parser,
+            "mode": self.mode,
+        }
+        payload["markdown_path"] = saved_to
+        payload["chars"] = self.chars
+        payload["elapsed_s"] = round(self.elapsed_s, 2)
+        payload["status"] = self.status
+        payload["error"] = self.error
+        return payload
+
+
+@dataclass
+class PdfManifestRow:
+    """One manifest line: a source PDF plus its per-arm extractions."""
+
+    doc_id: str
+    pdf_path: Path
+    document_id: int | None
+    pages: int
+    extractions: dict[str, ExtractionResult] = field(default_factory=dict)
+
+    def to_jsonl(self) -> dict[str, Any]:
+        """Return a JSON-serialisable dict, serialising each extraction in turn."""
+
+        per_arm: dict[str, Any] = {}
+        for arm_name, extraction in self.extractions.items():
+            per_arm[arm_name] = extraction.to_jsonl()
+        return dict(
+            doc_id=self.doc_id,
+            pdf_path=str(self.pdf_path),
+            document_id=self.document_id,
+            pages=self.pages,
+            extractions=per_arm,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Single-PDF extraction
+# ---------------------------------------------------------------------------
+
+
+async def _run_one_extraction(
+    pdf_path: Path,
+    *,
+    parser: str,
+    mode: str,
+    out_path: Path,
+    estimated_pages: int,
+) -> tuple[str, float]:
+    """Run one parser on ``pdf_path`` and write its markdown to ``out_path``.
+
+    Returns ``(markdown, elapsed_s)``; the elapsed time spans both the
+    parser call and the file write. Raises ``ValueError`` for a parser
+    name other than ``"azure"`` or ``"llamacloud"``.
+    """
+
+    t0 = time.monotonic()
+    if parser not in ("azure", "llamacloud"):
+        raise ValueError(f"Unknown parser {parser!r}")
+
+    if parser == "azure":
+        text = await parse_with_azure_di(pdf_path, processing_mode=mode)
+    else:
+        text = await parse_with_llamacloud(
+            pdf_path,
+            processing_mode=mode,
+            estimated_pages=estimated_pages,
+        )
+
+    # Create the target directory on demand before persisting.
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(text, encoding="utf-8")
+    elapsed = time.monotonic() - t0
+    return text, elapsed
+
+
+async def _extract_one_pdf(
+    pdf_path: Path,
+    *,
+    extractions_dir: Path,
+    force_reextract: bool,
+) -> dict[str, ExtractionResult]:
+    """Run all four parser combos for ``pdf_path``, returning per-arm results.
+
+    Re-uses any cached ``.md`` already on disk unless ``force_reextract``;
+    cached arms are reported with ``status="ok"`` and ``error="(cached)"``.
+    The remaining parser invocations run concurrently — they're
+    independent HTTP-bound jobs and the providers don't share state.
+
+    A parser failure is recorded as a ``status="failed"`` result rather
+    than raised. Non-``Exception`` outcomes (for example
+    ``asyncio.CancelledError``) are re-raised so cancellation is never
+    mistaken for a parser result.
+    """
+
+    # Fall back to 50 pages when the count is unavailable (0).
+    estimated_pages = count_pdf_pages(pdf_path) or 50
+    out: dict[str, ExtractionResult] = {}
+    pending: list[tuple[str, str, str, Path]] = []
+
+    for arm_name, parser, mode in PARSER_ARMS:
+        out_path = extractions_dir / f"{pdf_path.stem}.{arm_name}.md"
+
+        if out_path.exists() and not force_reextract:
+            cached = out_path.read_text(encoding="utf-8")
+            out[arm_name] = ExtractionResult(
+                arm=arm_name,
+                parser=parser,
+                mode=mode,
+                markdown_path=out_path,
+                chars=len(cached),
+                elapsed_s=0.0,
+                status="ok",
+                error="(cached)",
+            )
+            logger.info(
+                "Cached extraction reused: %s (%d chars)", out_path.name, len(cached),
+            )
+            continue
+
+        pending.append((arm_name, parser, mode, out_path))
+
+    # Only the arms that actually need a parser call are awaited.
+    results = await asyncio.gather(
+        *(
+            _run_one_extraction(
+                pdf_path,
+                parser=parser, mode=mode,
+                out_path=out_path,
+                estimated_pages=estimated_pages,
+            )
+            for _, parser, mode, out_path in pending
+        ),
+        return_exceptions=True,
+    )
+    for (arm_name, parser, mode, out_path), result in zip(pending, results, strict=True):
+        if isinstance(result, BaseException):
+            if not isinstance(result, Exception):
+                # gather(return_exceptions=True) also hands back
+                # BaseException instances such as CancelledError.
+                raise result
+            err_msg = f"{type(result).__name__}: {result}"
+            logger.warning(
+                "Extraction FAILED for %s [%s/%s]: %s",
+                pdf_path.name, parser, mode, err_msg,
+            )
+            out[arm_name] = ExtractionResult(
+                arm=arm_name, parser=parser, mode=mode,
+                status="failed", error=err_msg,
+            )
+        else:
+            markdown, elapsed = result
+            out[arm_name] = ExtractionResult(
+                arm=arm_name, parser=parser, mode=mode,
+                markdown_path=out_path,
+                chars=len(markdown),
+                elapsed_s=elapsed,
+                status="ok",
+            )
+    return out
+
+
+async def _noop() -> tuple[str, float]:
+    """Resolve to an empty ``(markdown, elapsed_s)`` pair without doing any work."""
+
+    empty_markdown, zero_elapsed = "", 0.0
+    return empty_markdown, zero_elapsed
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+def _read_existing_mmlongbench_map(map_path: Path) -> list[dict[str, Any]]:
+    """Load the mmlongbench doc map as a list of JSON rows.
+
+    Blank lines and any row containing a ``__settings__`` key are
+    dropped. Raises ``RuntimeError`` when ``map_path`` does not exist.
+    """
+
+    if not map_path.exists():
+        raise RuntimeError(
+            f"mmlongbench doc map not found at {map_path}. Run "
+            "`python -m surfsense_evals ingest multimodal_doc mmlongbench` first."
+        )
+    with map_path.open("r", encoding="utf-8") as handle:
+        stripped = (raw.strip() for raw in handle)
+        parsed = (json.loads(text) for text in stripped if text)
+        return [entry for entry in parsed if "__settings__" not in entry]
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    docs_filter: list[str] | None = None,
+    max_docs: int | None = None,
+    force_reextract: bool = False,
+    pdf_concurrency: int = 2,
+) -> None:
+    """Extract every parser arm's markdown for the PDFs in scope.
+
+    Source rows (``doc_id``, ``pdf_path``, optional ``document_id``) come
+    from the mmlongbench ingestion map recorded in the suite state. The
+    resulting manifest is written to ``parser_compare_doc_map.jsonl``
+    under ``ctx.maps_dir()`` and registered in the suite state under
+    the ``"parser_compare"`` key.
+
+    Parameters
+    ----------
+    docs_filter : list[str] | None
+        Keep only rows whose ``doc_id`` is listed. Default: all rows.
+    max_docs : int | None
+        Keep at most this many rows (applied after ``docs_filter``);
+        ``None`` or a non-positive value means no cap.
+    force_reextract : bool
+        Call the parsers even when a cached ``.md`` exists.
+    pdf_concurrency : int
+        Number of PDFs processed at once (values below 1 are treated
+        as 1). Every PDF fans out to one call per parser arm.
+
+    Raises
+    ------
+    RuntimeError
+        If the suite state has no mmlongbench map, or if no PDF is left
+        in scope after filtering.
+    """
+
+    # parser_compare doesn't re-upload anything: PDFs and document ids
+    # are taken from mmlongbench's existing ingestion.
+    source_map = ctx.suite_state.ingestion_maps.get("mmlongbench")
+    if not source_map:
+        raise RuntimeError(
+            "Suite state has no mmlongbench ingestion map. Run "
+            "`python -m surfsense_evals ingest multimodal_doc mmlongbench` first "
+            "so parser_compare can re-use those PDFs."
+        )
+
+    selected = _read_existing_mmlongbench_map(Path(source_map))
+    if docs_filter:
+        keep = set(docs_filter)
+        selected = [row for row in selected if row["doc_id"] in keep]
+    if max_docs is not None and max_docs > 0:
+        selected = selected[:max_docs]
+    if not selected:
+        raise RuntimeError(
+            "No PDFs in scope for parser_compare. Check --docs / --max-docs."
+        )
+
+    extractions_dir = ctx.benchmark_data_dir() / "extractions"
+    extractions_dir.mkdir(parents=True, exist_ok=True)
+
+    gate = asyncio.Semaphore(max(1, pdf_concurrency))
+
+    async def _process(row: dict[str, Any]) -> PdfManifestRow:
+        pdf_path = Path(row["pdf_path"])
+        # Only the extraction itself is throttled by the semaphore.
+        async with gate:
+            per_arm = await _extract_one_pdf(
+                pdf_path,
+                extractions_dir=extractions_dir,
+                force_reextract=force_reextract,
+            )
+        return PdfManifestRow(
+            doc_id=str(row["doc_id"]),
+            pdf_path=pdf_path,
+            document_id=row.get("document_id"),
+            pages=count_pdf_pages(pdf_path),
+            extractions=per_arm,
+        )
+
+    logger.info(
+        "parser_compare: extracting %d PDFs x 4 parsers (concurrency=%d)",
+        len(selected), pdf_concurrency,
+    )
+    manifest_rows = await asyncio.gather(*map(_process, selected))
+
+    # Persist the manifest, one JSON object per line.
+    map_path = ctx.maps_dir() / "parser_compare_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        fh.writelines(
+            json.dumps(entry.to_jsonl()) + "\n" for entry in manifest_rows
+        )
+    logger.info("parser_compare manifest -> %s", map_path)
+
+    # Record the manifest location in the suite state's ingestion maps.
+    state = ctx.suite_state
+    state.ingestion_maps["parser_compare"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, state)
+
+    # Summary: every extraction's status, flattened across PDFs.
+    statuses = [
+        ext.status
+        for entry in manifest_rows
+        for ext in entry.extractions.values()
+    ]
+    failed = sum(status != "ok" for status in statuses)
+    logger.info(
+        "parser_compare ingest done: %d PDFs, %d extractions, %d failures",
+        len(manifest_rows), len(statuses), failed,
+    )
+
+
+__all__ = [
+    "ExtractionResult",
+    "PARSER_ARMS",
+    "PdfManifestRow",
+    "run_ingest",
+]
--- a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/prompt.py
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/prompt.py
@ -0,0 +1,120 @@
+"""Prompt templates for the three input modalities in parser_compare.
+
+We deliberately reuse the *same* core question framing as
+``mmlongbench/prompt.py`` so byte-identical questions reach all six
+arms; only the document delivery channel changes.
+
+Three templates:
+
+* ``build_native_pdf_prompt``       — bare question + format hint.
+                                       The PDF rides as a separate file
+                                       part (``NativePdfArm`` handles it).
+* ``build_long_context_prompt``     — question + format hint + the
+                                       parser-extracted markdown wrapped
+                                       in fenced ``<document>`` tags so
+                                       the model can clearly delimit
+                                       "context" from "instruction".
+* ``build_surfsense_prompt``        — bare question + format hint
+                                       (chunks come from RAG retrieval,
+                                       not from the prompt).
+
+The ``<document>`` tag is doc-aware: it carries a ``name`` attribute, so
+even though parser_compare runs one PDF per question today, repeating
+the wrapper once per document makes this trivial to extend to
+multi-doc later.
+"""
+
+from __future__ import annotations
+
+# ---------------------------------------------------------------------------
+# Per-format hint blocks (same lookup as mmlongbench/prompt.py)
+# ---------------------------------------------------------------------------
+
+# Lowercase answer-format name -> instruction appended to every prompt.
+# ``_format_hint`` normalises its input and uses the "str" entry for any
+# format not listed here.
+_FORMAT_HINTS: dict[str, str] = {
+    "str": (
+        "Respond with the answer as a short phrase, no full sentence. "
+        "Format your final line as `Answer: <text>`."
+    ),
+    "int": (
+        "Respond with a single integer only. "
+        "Format your final line as `Answer: <integer>`."
+    ),
+    "float": (
+        "Respond with a single decimal number only (no units). "
+        "Format your final line as `Answer: <number>`."
+    ),
+    "list": (
+        "Respond with a comma-separated list of items, no extra text. "
+        "Format your final line as `Answer: item1, item2, item3`."
+    ),
+    "none": (
+        "If the answer cannot be determined from the document, say so explicitly. "
+        "Format your final line as `Answer: Not answerable`."
+    ),
+}
+
+
+def _format_hint(answer_format: str) -> str:
+    """Return the hint block for ``answer_format``.
+
+    The name is stripped and lower-cased before lookup; an empty or
+    unrecognised format gets the ``"str"`` hint.
+    """
+
+    key = (answer_format or "str").strip().lower()
+    if key in _FORMAT_HINTS:
+        return _FORMAT_HINTS[key]
+    return _FORMAT_HINTS["str"]
+
+
+# Shared system-style preamble: every prompt builder in this module
+# starts its prompt with this exact text.
+_BASE_INSTRUCTION = (
+    "You are a document-understanding assistant. Use ONLY the provided "
+    "document to answer the question. The document may contain text, "
+    "tables, charts, figures, and images. If the answer is in a chart "
+    "or image, read it carefully. Do not use external knowledge."
+)
+
+
+def build_native_pdf_prompt(question: str, *, answer_format: str) -> str:
+    """Build the ``NativePdfArm`` prompt: instruction, question, format hint.
+
+    No document text is embedded; the PDF is attached separately as a
+    file part.
+    """
+
+    sections = [
+        _BASE_INSTRUCTION,
+        f"Question: {question.strip()}",
+        _format_hint(answer_format),
+    ]
+    # Sections are separated by blank lines; the prompt ends with a newline.
+    return "\n\n".join(sections) + "\n"
+
+
+def build_surfsense_prompt(question: str, *, answer_format: str) -> str:
+    """Build the ``SurfSenseArm`` prompt: instruction, question, format hint."""
+
+    # SurfSense's agent already injects retrieved chunks via its tool
+    # loop, so nothing but the user-visible question and the format
+    # hint is carried here — the way a human would ask the SurfSense UI.
+    sections = [
+        _BASE_INSTRUCTION,
+        f"Question: {question.strip()}",
+        _format_hint(answer_format),
+    ]
+    return "\n\n".join(sections) + "\n"
+
+
+def build_long_context_prompt(
+    question: str,
+    *,
+    answer_format: str,
+    document_markdown: str,
+    document_label: str,
+) -> str:
+    """Build the prompt for the four long-context arms.
+
+    The stripped ``document_markdown`` is embedded inline between
+    ``<document name="...">`` and ``</document>``, placed after the base
+    instruction and before the question. ``document_label`` is a short
+    human-readable name (e.g. the PDF filename) written verbatim into
+    the ``name`` attribute.
+    """
+
+    document_block = (
+        f"<document name=\"{document_label}\">\n"
+        f"{document_markdown.strip()}\n"
+        f"</document>"
+    )
+    sections = [
+        _BASE_INSTRUCTION,
+        document_block,
+        f"Question: {question.strip()}",
+        _format_hint(answer_format),
+    ]
+    return "\n\n".join(sections) + "\n"
+
+
+__all__ = [
+    "build_long_context_prompt",
+    "build_native_pdf_prompt",
+    "build_surfsense_prompt",
+]
--- a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/runner.py
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/parser_compare/runner.py
@ -0,0 +1,797 @@
+"""parser_compare runner — six-arm head-to-head on n shared questions.
+
+For each (PDF, question) pair we issue six LLM calls (all sonnet 4.5):
+
+* ``native_pdf``           — PDF attached natively.
+* ``azure_basic_lc``       — Azure prebuilt-read markdown stuffed.
+* ``azure_premium_lc``     — Azure prebuilt-layout markdown stuffed.
+* ``llamacloud_basic_lc``  — LlamaParse parse_page_with_llm markdown stuffed.
+* ``llamacloud_premium_lc`` — LlamaParse parse_page_with_agent markdown stuffed.
+* ``surfsense_agentic``    — SurfSense /api/v1/new_chat retrieval over chunks.
+
+Cost reporting:
+
+* ``llm_cost_per_q``       — mean OpenRouter ``usage.cost`` reported by
+                              the chat-completions API. Zero for the
+                              SurfSense agentic arm because the SSE
+                              stream doesn't surface per-call cost yet
+                              (a known gap; we annotate it in the
+                              report rather than estimating).
+* ``preprocess_cost_total`` — per-page rate * pages, summed over the
+                              unique PDFs in scope, per the user's tariff:
+                                * basic   : $1   / 1k pages
+                                * premium : $10  / 1k pages
+                                * native_pdf : $0  (no preprocessing)
+                                * surfsense_agentic : $10 / 1k pages
+                                  (existing mmlongbench ingest used
+                                  processing_mode=premium with Azure DI).
+* ``preprocess_cost_per_q`` — preprocess_cost_total / n_questions.
+* ``total_cost_per_q``      — llm_cost_per_q + preprocess_cost_per_q.
+
+The grader is reused from ``mmlongbench/grader.py`` (deterministic,
+format-aware) so the metric is directly comparable to the existing
+mmlongbench runs.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import (
+    ArmRequest,
+    ArmResult,
+    BareLlmArm,
+    NativePdfArm,
+    SurfSenseArm,
+)
+from ....core.config import utc_iso_timestamp
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.parse.freeform_answer import extract_freeform_answer
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+from ....core.providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from ....core.registry import ReportSection, RunArtifact, RunContext
+from ..mmlongbench.grader import GradeResult, grade
+from .ingest import PARSER_ARMS
+from .prompt import (
+    build_long_context_prompt,
+    build_native_pdf_prompt,
+    build_surfsense_prompt,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# Cost tariff (per the user's spec: $1 / 1k pages basic, $10 / 1k pages premium).
+# Held as dollars-per-page so per-PDF math is a pure multiply.
+PREPROCESS_USD_PER_PAGE = {
+    "basic":   1.0  / 1000.0,
+    "premium": 10.0 / 1000.0,
+}
+
+# Canonical arm order. ``run`` filters this tuple by ``--skip-arms``, so it
+# also fixes the order of arms in raw.jsonl and in the report tables.
+ARM_NAMES = (
+    "native_pdf",
+    "azure_basic_lc",
+    "azure_premium_lc",
+    "llamacloud_basic_lc",
+    "llamacloud_premium_lc",
+    "surfsense_agentic",
+)
+
+# What ingest mode each LC arm corresponds to (drives preprocess cost).
+_LC_ARM_MODE: dict[str, str] = {
+    "azure_basic_lc": "basic",
+    "azure_premium_lc": "premium",
+    "llamacloud_basic_lc": "basic",
+    "llamacloud_premium_lc": "premium",
+}
+
+# The SurfSense agentic arm is fed by the existing mmlongbench
+# ingestion. That ingestion was performed with vision_llm=on and
+# processing_mode=premium, and the backend's ETL routes premium-mode
+# PDFs through Azure DI prebuilt-layout when AZURE_DI_KEY is set. So
+# the preprocessing cost is the premium tariff.
+SURFSENSE_INGEST_MODE = "premium"
+
+
+# ---------------------------------------------------------------------------
+# Question + PDF row shapes
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class PCQuestion:
+    """One benchmark question joined with its PDF's doc-map row."""
+
+    # ``<doc_id>::Q<idx>``; idx is the question's 0-based position among that
+    # doc's rows in questions.jsonl, counted before any format filtering.
+    qid: str
+    doc_id: str
+    question: str
+    gold_answer: str
+    # Lower-cased ``answer_format`` taken from the question row.
+    answer_format: str
+    pdf_path: Path
+    # Doc-map ``document_id``; used as the SurfSense mention target when set.
+    document_id: int | None
+    # Page count from the doc map (0 when the row carries none).
+    pages: int
+    extractions: dict[str, Path]  # arm_name -> markdown path (only successes)
+
+
+def _read_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
+    """Load the JSONL doc map at ``map_path`` keyed by stringified ``doc_id``.
+
+    Blank lines are ignored; when a ``doc_id`` repeats, the last row wins.
+    """
+    by_doc: dict[str, dict[str, Any]] = {}
+    with map_path.open("r", encoding="utf-8") as handle:
+        # Lazy pipeline: strip each line, drop blanks, then parse one at a time.
+        non_blank = (text for text in map(str.strip, handle) if text)
+        for record in map(json.loads, non_blank):
+            by_doc[str(record["doc_id"])] = record
+    return by_doc
+
+
+def _select_questions(
+    questions_jsonl: Path,
+    doc_map: dict[str, dict[str, Any]],
+    *,
+    docs_filter: list[str] | None,
+    sample_per_doc: int,
+    skip_unanswerable: bool,
+    skip_format: list[str] | None,
+) -> list[PCQuestion]:
+    """Pick the first ``sample_per_doc`` eligible questions per PDF in scope.
+
+    A question row is in scope when its ``doc_id`` is present in
+    ``doc_map`` and, if ``docs_filter`` is non-empty, listed there.
+    Rows whose lower-cased ``answer_format`` is ``"none"`` (when
+    ``skip_unanswerable`` is set) or appears in ``skip_format`` are
+    dropped before the per-PDF quota is applied.
+
+    Each ``qid`` is ``<doc_id>::Q<idx>`` where ``idx`` counts every
+    in-scope row of that PDF, including dropped ones, so ids do not
+    shift when the format filters change.
+
+    Returns the selected questions sorted by ``(doc_id, qid)``.
+    """
+
+    out: list[PCQuestion] = []
+    per_doc_taken: dict[str, int] = {}
+    per_doc_idx: dict[str, int] = {}
+    skip_format_set = {f.lower() for f in (skip_format or [])}
+    # Set membership keeps the per-row doc filter O(1); an empty or missing
+    # filter still means "no filtering".
+    allowed_docs = set(docs_filter) if docs_filter else None
+
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            doc_id = str(row.get("doc_id") or "")
+            if not doc_id:
+                continue
+            if allowed_docs is not None and doc_id not in allowed_docs:
+                continue
+            map_row = doc_map.get(doc_id)
+            if map_row is None:
+                continue
+
+            answer_format = str(row.get("answer_format") or "").strip().lower()
+            # Advance the index before filtering so qids stay stable.
+            idx = per_doc_idx.get(doc_id, 0)
+            per_doc_idx[doc_id] = idx + 1
+
+            if skip_unanswerable and answer_format == "none":
+                continue
+            if answer_format in skip_format_set:
+                continue
+
+            if per_doc_taken.get(doc_id, 0) >= sample_per_doc:
+                continue
+
+            extractions: dict[str, Path] = {}
+            for arm_name, ext_blob in (map_row.get("extractions") or {}).items():
+                # A null / malformed per-arm entry counts as "no extraction"
+                # rather than crashing question selection.
+                if not isinstance(ext_blob, dict):
+                    continue
+                if ext_blob.get("status") == "ok" and ext_blob.get("markdown_path"):
+                    extractions[arm_name] = Path(ext_blob["markdown_path"])
+
+            out.append(PCQuestion(
+                qid=f"{doc_id}::Q{idx:03d}",
+                doc_id=doc_id,
+                question=str(row.get("question") or "").strip(),
+                gold_answer=str(row.get("answer") or "").strip(),
+                answer_format=answer_format,
+                pdf_path=Path(map_row["pdf_path"]),
+                document_id=map_row.get("document_id"),
+                # ``or 0`` also covers an explicit ``"pages": null`` in the map.
+                pages=int(map_row.get("pages") or 0),
+                extractions=extractions,
+            ))
+            per_doc_taken[doc_id] = per_doc_taken.get(doc_id, 0) + 1
+
+    out.sort(key=lambda q: (q.doc_id, q.qid))
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Bounded concurrency helper
+# ---------------------------------------------------------------------------
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    """Await every awaitable in ``coros`` with at most ``concurrency`` in flight.
+
+    ``concurrency`` is clamped to a minimum of 1. Results come back in
+    the same order as ``coros``.
+    """
+    limiter = asyncio.Semaphore(max(1, concurrency))
+
+    async def _run_bounded(pending):
+        # Hold one semaphore slot for the whole lifetime of the awaitable.
+        await limiter.acquire()
+        try:
+            return await pending
+        finally:
+            limiter.release()
+
+    guarded = [_run_bounded(item) for item in coros]
+    return await asyncio.gather(*guarded)
+
+
+# ---------------------------------------------------------------------------
+# Benchmark
+# ---------------------------------------------------------------------------
+
+
+# One-paragraph summary exposed as ``ParserCompareBenchmark.description``.
+_DESCRIPTION = (
+    "parser_compare — 6-arm head-to-head on shared MMLongBench-Doc PDFs: "
+    "native PDF + (Azure DI / LlamaCloud) x (basic / premium) long-context "
+    "stuffing + SurfSense agentic retrieval. Reports preprocessing dollars "
+    "($1 / 1k pages basic, $10 / 1k pages premium) on top of LLM cost."
+)
+
+
+class ParserCompareBenchmark:
+    """6-arm parser + agentic-vs-non-agentic head-to-head."""
+
+    suite: str = "multimodal_doc"
+    name: str = "parser_compare"
+    headline: bool = True
+    description: str = _DESCRIPTION
+
+    # ------------------------------------------------------------------
+    # CLI flags
+    # ------------------------------------------------------------------
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        """Register the parser_compare flags on ``parser``.
+
+        Question-selection and arm flags come first; the last three
+        flags are ingest-only and are forwarded to ``ingest.run_ingest``.
+        """
+        add = parser.add_argument
+
+        # ---- question selection ----
+        add(
+            "--docs",
+            default=None,
+            help="Comma-separated doc_ids to include (default: all in manifest).",
+        )
+        add(
+            "--sample-per-doc",
+            type=int,
+            default=1,
+            help="Take the first N answerable questions per PDF (default 1).",
+        )
+        add(
+            "--skip-unanswerable",
+            dest="skip_unanswerable",
+            action="store_true",
+            default=True,
+            help="Drop 'None' format probes (default true; we want signal not "
+                 "hallucination probes for n=5).",
+        )
+        add(
+            "--include-unanswerable",
+            dest="skip_unanswerable",
+            action="store_false",
+            help="Override --skip-unanswerable; include unanswerable probes too.",
+        )
+        add(
+            "--skip-format",
+            default=None,
+            help="Comma-separated answer_format values to skip (e.g. 'none,float').",
+        )
+
+        # ---- execution / arm configuration ----
+        add(
+            "--concurrency",
+            type=int,
+            default=2,
+            help="Parallel question workers per arm (default 2).",
+        )
+        add(
+            "--no-mentions",
+            dest="no_mentions",
+            action="store_true",
+            help="SurfSense arm: skip mentioned_document_ids (full-corpus retrieval).",
+        )
+        engine_choices = [engine.value for engine in PdfEngine]
+        add(
+            "--pdf-engine",
+            default="native",
+            choices=engine_choices,
+            help="OpenRouter file-parser engine for native_pdf arm.",
+        )
+        add(
+            "--max-output-tokens",
+            type=int,
+            default=512,
+            help="Cap on completion length for every arm.",
+        )
+        add(
+            "--llm-model",
+            default="anthropic/claude-sonnet-4.5",
+            help="OpenRouter slug used by the 5 OpenRouter-driven arms. "
+                 "SurfSense arm uses whatever provider_model is pinned on the suite.",
+        )
+        add(
+            "--skip-arms",
+            default=None,
+            help="Comma-separated arm names to skip (e.g. 'llamacloud_premium_lc').",
+        )
+
+        # ---- ingest-only flags (forwarded by the CLI to ingest.run_ingest) ----
+        add(
+            "--max-docs",
+            type=int,
+            default=None,
+            help="(ingest only) cap number of unique PDFs to process.",
+        )
+        add(
+            "--force-reextract",
+            action="store_true",
+            help="(ingest only) re-call parsers even if cached .md exists.",
+        )
+        add(
+            "--pdf-concurrency",
+            type=int,
+            default=2,
+            help="(ingest only) parallel PDFs (each fans out to 4 parsers).",
+        )
+
+    # ------------------------------------------------------------------
+    # Lifecycle: ingest delegates to .ingest.run_ingest
+    # ------------------------------------------------------------------
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        """Parse the ingest options and hand off to ``.ingest.run_ingest``.
+
+        ``docs`` is a comma-separated string; blank entries are dropped.
+        A missing or empty ``docs`` value means no doc filter.
+        """
+        from .ingest import run_ingest
+
+        docs_filter: list[str] | None = None
+        docs_raw: str | None = opts.get("docs")
+        if docs_raw:
+            docs_filter = [
+                piece.strip() for piece in docs_raw.split(",") if piece.strip()
+            ]
+
+        force = bool(opts.get("force_reextract", False))
+        # A missing or zero --pdf-concurrency falls back to 2.
+        parallel_pdfs = int(opts.get("pdf_concurrency") or 2)
+        await run_ingest(
+            ctx,
+            docs_filter=docs_filter,
+            max_docs=opts.get("max_docs"),
+            force_reextract=force,
+            pdf_concurrency=parallel_pdfs,
+        )
+
+    # ------------------------------------------------------------------
+    # Run
+    # ------------------------------------------------------------------
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        """Run every active arm on the selected questions and persist results.
+
+        Steps, in order: parse ``opts``; select questions from
+        mmlongbench's ``questions.jsonl`` joined with the parser_compare
+        doc map; build one arm object per active arm; answer all
+        questions on all arms concurrently; grade; write ``raw.jsonl``
+        (one line per question x arm) and ``run_artifact.json`` into the
+        run directory.
+
+        Raises:
+            RuntimeError: if every arm is skipped, an input file is
+                missing, no question matches the filters, or
+                ``OPENROUTER_API_KEY`` is unset.
+        """
+        docs_raw: str | None = opts.get("docs")
+        docs_filter = (
+            [d.strip() for d in docs_raw.split(",") if d.strip()] if docs_raw else None
+        )
+        sample_per_doc = int(opts.get("sample_per_doc") or 1)
+        skip_unanswerable = bool(opts.get("skip_unanswerable", True))
+        skip_format_raw: str | None = opts.get("skip_format")
+        skip_format = (
+            [f.strip() for f in skip_format_raw.split(",") if f.strip()]
+            if skip_format_raw else None
+        )
+        concurrency = int(opts.get("concurrency") or 2)
+        no_mentions = bool(opts.get("no_mentions"))
+        pdf_engine_name = opts.get("pdf_engine") or "native"
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+        llm_model = str(opts.get("llm_model") or "anthropic/claude-sonnet-4.5")
+        skip_arms_raw: str | None = opts.get("skip_arms")
+        skip_arms = (
+            {a.strip() for a in skip_arms_raw.split(",") if a.strip()}
+            if skip_arms_raw else set()
+        )
+
+        # Names in skip_arms that are not in ARM_NAMES are ignored silently.
+        active_arms = [a for a in ARM_NAMES if a not in skip_arms]
+        if not active_arms:
+            raise RuntimeError("All arms skipped; nothing to run.")
+
+        bench_dir = ctx.benchmark_data_dir()
+        # parser_compare reuses mmlongbench's questions.jsonl (already
+        # downloaded by `ingest multimodal_doc mmlongbench`).
+        questions_jsonl = bench_dir.parent / "mmlongbench" / "questions.jsonl"
+        map_path = ctx.maps_dir() / "parser_compare_doc_map.jsonl"
+        if not questions_jsonl.exists():
+            raise RuntimeError(
+                "Missing mmlongbench questions at "
+                f"{questions_jsonl}. Run "
+                "`python -m surfsense_evals ingest multimodal_doc mmlongbench` first."
+            )
+        if not map_path.exists():
+            raise RuntimeError(
+                "parser_compare doc map missing. Run "
+                "`python -m surfsense_evals ingest multimodal_doc parser_compare` first."
+            )
+
+        doc_map = _read_doc_map(map_path)
+        questions = _select_questions(
+            questions_jsonl, doc_map,
+            docs_filter=docs_filter,
+            sample_per_doc=sample_per_doc,
+            skip_unanswerable=skip_unanswerable,
+            skip_format=skip_format,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No questions matched filters; broaden --docs / --skip-format."
+            )
+        logger.info(
+            "parser_compare: scheduled %d questions across %d arms (%s)",
+            len(questions), len(active_arms), ",".join(active_arms),
+        )
+
+        # NOTE: the key is required unconditionally, even when only the
+        # SurfSense arm (which does not receive it) is active.
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key:
+            raise RuntimeError("OPENROUTER_API_KEY env var is required.")
+
+        # Build arms
+        arms: dict[str, Any] = {}
+        if "native_pdf" in active_arms:
+            native_provider = OpenRouterPdfProvider(
+                api_key=api_key, base_url=ctx.config.openrouter_base_url,
+                model=llm_model, engine=PdfEngine(pdf_engine_name),
+            )
+            arms["native_pdf"] = NativePdfArm(
+                provider=native_provider, max_output_tokens=max_output_tokens,
+            )
+        # One provider + BareLlmArm per active long-context arm, all on llm_model.
+        for arm_name, _, _ in PARSER_ARMS:
+            if arm_name in active_arms:
+                lc_provider = OpenRouterChatProvider(
+                    api_key=api_key, base_url=ctx.config.openrouter_base_url,
+                    model=llm_model,
+                )
+                arms[arm_name] = BareLlmArm(
+                    provider=lc_provider,
+                    max_output_tokens=max_output_tokens,
+                    name=arm_name,
+                )
+        if "surfsense_agentic" in active_arms:
+            surf = SurfSenseArm(
+                client=ctx.new_chat_client(),
+                search_space_id=ctx.search_space_id,
+                ephemeral_threads=True,
+            )
+            # Override the default "surfsense" name so the metrics
+            # bucket lines up with the rest of parser_compare's arms.
+            surf.name = "surfsense_agentic"
+            arms["surfsense_agentic"] = surf
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        # ---- per-arm answer coroutine helpers ----
+
+        def _native_req(q: PCQuestion) -> ArmRequest:
+            return ArmRequest(
+                question_id=q.qid,
+                prompt=build_native_pdf_prompt(q.question, answer_format=q.answer_format),
+                pdf_paths=[q.pdf_path],
+                options={"max_tokens": max_output_tokens},
+            )
+
+        def _lc_req(q: PCQuestion, arm_name: str) -> ArmRequest:
+            # Raises FileNotFoundError when this arm has no usable extraction
+            # for the PDF; _answer_one turns that into an error result.
+            md_path = q.extractions.get(arm_name)
+            if md_path is None or not md_path.exists():
+                raise FileNotFoundError(
+                    f"Missing extraction for {arm_name} on {q.doc_id}"
+                )
+            # The markdown is re-read from disk for every question on the PDF.
+            markdown = md_path.read_text(encoding="utf-8")
+            return ArmRequest(
+                question_id=q.qid,
+                prompt=build_long_context_prompt(
+                    q.question,
+                    answer_format=q.answer_format,
+                    document_markdown=markdown,
+                    document_label=q.doc_id,
+                ),
+            )
+
+        def _surf_req(q: PCQuestion) -> ArmRequest:
+            # Mention the question's own document unless --no-mentions is set
+            # or the doc map has no document_id for it.
+            mentions: list[int] | None = None
+            if not no_mentions and q.document_id is not None:
+                mentions = [int(q.document_id)]
+            return ArmRequest(
+                question_id=q.qid,
+                prompt=build_surfsense_prompt(q.question, answer_format=q.answer_format),
+                mentioned_document_ids=mentions,
+            )
+
+        async def _answer_one(arm_name: str, q: PCQuestion) -> ArmResult:
+            # Only FileNotFoundError becomes an error result; any other
+            # exception raised here propagates out of ``run``.
+            arm = arms[arm_name]
+            try:
+                if arm_name == "native_pdf":
+                    return await arm.answer(_native_req(q))
+                if arm_name == "surfsense_agentic":
+                    return await arm.answer(_surf_req(q))
+                return await arm.answer(_lc_req(q, arm_name))
+            except FileNotFoundError as exc:
+                return ArmResult(
+                    arm=arm_name,
+                    question_id=q.qid,
+                    raw_text="",
+                    error=f"FileNotFoundError: {exc}",
+                )
+
+        # Run all arms in parallel (each arm bounded by `concurrency`).
+        # Each _gather_with_limit call owns its own semaphore, so up to
+        # len(active_arms) * concurrency requests can be in flight at once.
+        per_arm_tasks: dict[str, list] = {
+            arm_name: [_answer_one(arm_name, q) for q in questions]
+            for arm_name in active_arms
+        }
+        per_arm_results: dict[str, list[ArmResult]] = {}
+        gathered = await asyncio.gather(*[
+            _gather_with_limit(per_arm_tasks[arm_name], concurrency=concurrency)
+            for arm_name in active_arms
+        ])
+        for arm_name, results in zip(active_arms, gathered, strict=True):
+            per_arm_results[arm_name] = results
+
+        # Grade
+        # An empty raw_text (e.g. from an error result) is graded as-is.
+        per_arm_grades: dict[str, list[GradeResult]] = {}
+        for arm_name in active_arms:
+            per_arm_grades[arm_name] = [
+                grade(
+                    pred=extract_freeform_answer(r.raw_text or ""),
+                    gold=q.gold_answer,
+                    answer_format=q.answer_format,
+                )
+                for q, r in zip(questions, per_arm_results[arm_name], strict=True)
+            ]
+
+        # Persist raw.jsonl
+        # Written only after every arm has finished and been graded; rows are
+        # grouped by question, with arms in active_arms order.
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for i, q in enumerate(questions):
+                base = {
+                    "qid": q.qid,
+                    "doc_id": q.doc_id,
+                    "answer_format": q.answer_format,
+                    "gold": q.gold_answer,
+                    "pages": q.pages,
+                    "document_id": q.document_id,
+                }
+                for arm_name in active_arms:
+                    res = per_arm_results[arm_name][i]
+                    g = per_arm_grades[arm_name][i]
+                    fh.write(json.dumps({
+                        **base,
+                        **res.to_jsonl(),
+                        "graded": {
+                            "correct": g.correct,
+                            "f1": g.f1,
+                            "method": g.method,
+                            "normalised_pred": g.normalised_pred,
+                            "normalised_gold": g.normalised_gold,
+                        },
+                    }) + "\n")
+
+        # Aggregate per-arm metrics + cost
+        metrics = _compute_metrics(
+            questions, per_arm_results, per_arm_grades, active_arms,
+        )
+
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "n_pdfs": len({q.doc_id for q in questions}),
+                "active_arms": list(active_arms),
+                "concurrency": concurrency,
+                "no_mentions": no_mentions,
+                "pdf_engine": pdf_engine_name,
+                "llm_model": llm_model,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "preprocess_tariff": {
+                    "basic_per_1k_pages": 1.0,
+                    "premium_per_1k_pages": 10.0,
+                },
+            },
+        )
+
+        # The on-disk manifest stores raw_path relative to the run directory
+        # and omits run_timestamp.
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    # ------------------------------------------------------------------
+    # Report
+    # ------------------------------------------------------------------
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        """Render the markdown report section for the most recent run.
+
+        Only the artifact with the greatest ``run_timestamp`` is
+        reported. The section holds a per-arm summary table, a fixed
+        list of caveats, and, when the metrics carry a ``per_pdf``
+        entry, a per-PDF correctness grid. With no artifacts a
+        placeholder section is returned.
+
+        The tariff line is rendered from the ``preprocess_tariff``
+        recorded in the artifact, falling back to $1 / $10 per 1k pages
+        for artifacts that do not carry it.
+        """
+        if not artifacts:
+            return ReportSection(
+                title="Parser × agent-vs-stuffing comparison",
+                headline=True,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        extra = latest.extra
+        per_arm = m.get("per_arm", {})
+        active_arms = list(extra.get("active_arms", per_arm.keys()))
+
+        n_q = extra.get("n_questions", "?")
+        n_pdfs = extra.get("n_pdfs", "?")
+
+        # Report the tariff the run actually recorded rather than a
+        # hard-coded one; ``:g`` renders 1.0 / 10.0 as "1" / "10".
+        tariff = extra.get("preprocess_tariff") or {}
+        basic_rate = float(tariff.get("basic_per_1k_pages", 1.0))
+        premium_rate = float(tariff.get("premium_per_1k_pages", 10.0))
+
+        body: list[str] = []
+        body.append(
+            f"- Sample size: **{n_q} questions across {n_pdfs} PDFs** "
+            f"(LLM: `{extra.get('llm_model', '?')}`, "
+            f"engine: `{extra.get('pdf_engine', 'native')}`)."
+        )
+        body.append(
+            f"- Preprocess tariff: basic = ${basic_rate:g} / 1k pages, "
+            f"premium = ${premium_rate:g} / 1k pages."
+        )
+        body.append("")
+        body.append("### Per-arm summary")
+        body.append("")
+        body.append(
+            "| Arm | Accuracy | F1 mean | LLM $/Q | Preprocess $ total | Total $/Q | Latency p50 |"
+        )
+        body.append("|---|---:|---:|---:|---:|---:|---:|")
+        for arm_name in active_arms:
+            row = per_arm.get(arm_name)
+            if not row:
+                body.append(f"| `{arm_name}` | (no data) | | | | | |")
+                continue
+            body.append(
+                f"| `{arm_name}` "
+                f"| {row['accuracy']*100:.1f}% "
+                f"({row['n_correct']}/{row['n']}) "
+                f"| {row['f1_mean']*100:.1f}% "
+                f"| ${row['llm_cost_per_q']:.4f} "
+                f"| ${row['preprocess_cost_total']:.4f} "
+                f"| ${row['total_cost_per_q']:.4f} "
+                f"| {row['latency_ms_median']/1000:.1f}s |"
+            )
+        body.append("")
+
+        # Notes / caveats
+        body.append("### Notes")
+        body.append("")
+        body.append(
+            "- `surfsense_agentic` LLM cost shows as $0.0000 because the "
+            "`/api/v1/new_chat` SSE stream does not surface per-call token "
+            "or cost yet (a known instrumentation gap). Preprocessing cost "
+            "is the premium tariff because the underlying mmlongbench "
+            "ingestion was performed with `processing_mode=premium` + "
+            "`vision_llm=on` + Azure DI."
+        )
+        body.append(
+            "- Long-context arms include the **same PDF text** for every "
+            "question against that PDF, so the OpenRouter input cost is "
+            "dominated by markdown size; preprocessing cost is paid once "
+            "across all questions sharing a PDF."
+        )
+        body.append(
+            "- Preprocessing $ total is computed as "
+            "`pages_processed_per_arm × tariff`, summed across the unique "
+            "PDFs in scope. With one question per PDF (n=5), preprocess $ "
+            "= preprocess $ / Q."
+        )
+        if extra.get("scenario"):
+            body.append(
+                f"- Scenario: `{extra.get('scenario')}` "
+                f"(suite-pinned `provider_model`: "
+                f"`{extra.get('provider_model', '?')}`)."
+            )
+
+        # Per-PDF breakdown if useful
+        per_pdf = m.get("per_pdf", {})
+        if per_pdf:
+            body.append("")
+            body.append("### Per-PDF correctness")
+            body.append("")
+            header = "| Doc | Pages | " + " | ".join(f"`{a}`" for a in active_arms) + " |"
+            sep = "|---|---:|" + "|".join(":---:" for _ in active_arms) + "|"
+            body.append(header)
+            body.append(sep)
+            for doc_id, info in sorted(per_pdf.items()):
+                row_cells = []
+                for arm_name in active_arms:
+                    g = info.get("arms", {}).get(arm_name, {})
+                    # "?" marks an arm with no recorded grade for this PDF.
+                    if not g:
+                        row_cells.append("?")
+                    else:
+                        row_cells.append("✓" if g.get("correct") else "✗")
+                body.append(
+                    f"| `{doc_id}` | {info.get('pages', '?')} | "
+                    + " | ".join(row_cells) + " |"
+                )
+
+        return ReportSection(
+            title="Parser × agent-vs-stuffing — long PDFs (sonnet 4.5)",
+            headline=True,
+            body_md="\n".join(body),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Metrics
+# ---------------------------------------------------------------------------
+
+
+def _compute_metrics(
+    questions: list[PCQuestion],
+    per_arm_results: dict[str, list[ArmResult]],
+    per_arm_grades: dict[str, list[GradeResult]],
+    active_arms: Iterable[str],
+) -> dict[str, Any]:
+    """Aggregate per-arm metrics + the user's preprocessing cost overlay.
+
+    ``per_arm_results[arm]`` and ``per_arm_grades[arm]`` are indexed in
+    step with ``questions``.
+
+    Returns a dict with:
+
+    * ``per_arm`` — accuracy (with Wilson CI), mean F1, LLM /
+      preprocessing / total cost, latency and token statistics per arm.
+    * ``per_pdf`` — page count plus, per arm, the grade of the first
+      selected question of each PDF.
+    * ``n_questions``, ``n_unique_pdfs``, ``total_pages_in_scope``.
+    """
+
+    # ``active_arms`` is walked once for the per-arm table and again for
+    # every question below; materialise it so a one-shot iterator cannot
+    # silently produce empty per-PDF rows.
+    arm_names = list(active_arms)
+
+    # Sum unique PDF pages — preprocessing pays per unique PDF, not per question.
+    pdf_pages: dict[str, int] = {}
+    for q in questions:
+        pdf_pages.setdefault(q.doc_id, q.pages)
+
+    per_arm: dict[str, dict[str, Any]] = {}
+    for arm_name in arm_names:
+        results = per_arm_results[arm_name]
+        grades = per_arm_grades[arm_name]
+        n = len(grades)
+        n_correct = sum(1 for g in grades if g.correct)
+        f1_sum = sum(g.f1 for g in grades)
+        acc_with_ci = accuracy_with_wilson_ci(n_correct, n)
+
+        # LLM cost: sum of per-call cost_micros across questions, then average.
+        cost_micros_total = sum(int(r.cost_micros or 0) for r in results)
+        llm_cost_per_q = (cost_micros_total / 1_000_000.0) / n if n else 0.0
+
+        # Preprocessing cost depends on which mode this arm corresponds to.
+        if arm_name == "native_pdf":
+            preprocess_per_page = 0.0
+            preprocess_label = "n/a (PDF attached natively)"
+        elif arm_name in _LC_ARM_MODE:
+            mode = _LC_ARM_MODE[arm_name]
+            preprocess_per_page = PREPROCESS_USD_PER_PAGE[mode]
+            preprocess_label = f"{mode} tier ($/{mode}/page = ${preprocess_per_page:.4f})"
+        elif arm_name == "surfsense_agentic":
+            preprocess_per_page = PREPROCESS_USD_PER_PAGE[SURFSENSE_INGEST_MODE]
+            preprocess_label = (
+                f"{SURFSENSE_INGEST_MODE} tier (ingested by SurfSense at "
+                f"processing_mode=premium + vision_llm=on)"
+            )
+        else:
+            preprocess_per_page = 0.0
+            preprocess_label = "unknown"
+
+        # Every in-scope PDF is charged, whether or not this arm's
+        # extraction for it succeeded.
+        preprocess_cost_total = sum(
+            pages * preprocess_per_page for pages in pdf_pages.values()
+        )
+        preprocess_cost_per_q = preprocess_cost_total / n if n else 0.0
+        total_cost_per_q = llm_cost_per_q + preprocess_cost_per_q
+
+        # Median is the upper-middle element for even counts. p95 falls back
+        # to the maximum when there are fewer than 20 samples.
+        latencies = sorted(int(r.latency_ms or 0) for r in results)
+        latency_median = latencies[len(latencies) // 2] if latencies else 0
+        if len(latencies) >= 20:
+            latency_p95 = latencies[int(len(latencies) * 0.95)]
+        else:
+            latency_p95 = latencies[-1] if latencies else 0
+
+        in_tokens = [int(r.input_tokens or 0) for r in results]
+        out_tokens = [int(r.output_tokens or 0) for r in results]
+
+        per_arm[arm_name] = {
+            **acc_with_ci.to_dict(),
+            "n": n,
+            "n_correct": n_correct,
+            "f1_mean": f1_sum / n if n else 0.0,
+            "llm_cost_per_q": llm_cost_per_q,
+            "preprocess_per_page_usd": preprocess_per_page,
+            "preprocess_cost_total": preprocess_cost_total,
+            "preprocess_cost_per_q": preprocess_cost_per_q,
+            "total_cost_per_q": total_cost_per_q,
+            "preprocess_label": preprocess_label,
+            "latency_ms_median": latency_median,
+            "latency_ms_p95": latency_p95,
+            "input_tokens_mean": (sum(in_tokens) / len(in_tokens)) if in_tokens else 0.0,
+            "output_tokens_mean": (sum(out_tokens) / len(out_tokens)) if out_tokens else 0.0,
+        }
+
+    # Per-PDF breakdown (correct / not for each arm)
+    # NOTE: ``setdefault`` keeps only the first question's grade per
+    # (PDF, arm); later questions on the same PDF are not reflected here.
+    per_pdf: dict[str, dict[str, Any]] = {}
+    for i, q in enumerate(questions):
+        slot = per_pdf.setdefault(q.doc_id, {
+            "pages": q.pages,
+            "arms": {},
+        })
+        for arm_name in arm_names:
+            slot["arms"].setdefault(arm_name, {
+                "correct": per_arm_grades[arm_name][i].correct,
+                "f1": per_arm_grades[arm_name][i].f1,
+            })
+
+    return {
+        "per_arm": per_arm,
+        "per_pdf": per_pdf,
+        "n_questions": len(questions),
+        "n_unique_pdfs": len(pdf_pages),
+        "total_pages_in_scope": sum(pdf_pages.values()),
+    }
+
+
+# Public API of this module: the benchmark class and its question row type.
+__all__ = ["ParserCompareBenchmark", "PCQuestion"]
--- a/surfsense_evals/src/surfsense_evals/suites/research/crag/init.py
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/init.py
@@ -1,22 +1,10 @@
 """CRAG — Comprehensive RAG Benchmark (Yang et al., Meta, KDD Cup 2024).

-Source: https://github.com/facebookresearch/CRAG  (Tasks 1, 2, and 3)
+Source: https://github.com/facebookresearch/CRAG (Tasks 1 & 2)
 Paper:  https://arxiv.org/abs/2406.04744

-This package registers two siblings:
-
-* ``crag``    — Tasks 1 & 2: 5 candidate pages per question.
-* ``crag_t3`` — Task 3:       50 candidate pages per question. The
-  long-context arm is capped to the top-5 (the realistic "naive
-  RAG = pick top-K results" baseline); SurfSense retrieves over
-  all 50, where its rerank becomes the entire contribution.
-
-Both share the grader, prompt, runner, and report code; only the
-ingest path differs (single bz2 vs 4-part tar.bz2 streamed).
-
 CRAG ships ~2,706 factual QA pairs, each paired with **5 full HTML
-pages** retrieved as the top-5 of a real web search at ``query_time``
-(50 in Task 3).
+pages** retrieved as the top-5 of a real web search at ``query_time``.
 The benchmark spans 5 domains (finance, music, movie, sports, open)
 and 8 question types (simple, comparison, aggregation, set, multi-hop,
 post-processing, false_premise, simple_w_condition) — heads/torsos/
@@ -51,7 +39,6 @@ relative to refusals.
 from __future__ import annotations

 from ....core import registry as _registry
-from .runner import CragBenchmark, CragTask3Benchmark
+from .runner import CragBenchmark

 _registry.register(CragBenchmark())
-_registry.register(CragTask3Benchmark())
--- a/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset_task3.py
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset_task3.py
@@ -1,263 +0,0 @@
-"""CRAG Task 3 dataset loader — 4-part tar.bz2 → streaming JSONL.
-
-Task 3 ships ~7 GB of compressed data split into 4 parts on GitHub:
-
-    crag_task_3_dev_v4.tar.bz2.part1    (≈2 GB)
-    crag_task_3_dev_v4.tar.bz2.part2    (≈2 GB)
-    crag_task_3_dev_v4.tar.bz2.part3    (≈2 GB)
-    crag_task_3_dev_v4.tar.bz2.part4    (≈1.3 GB)
-
-Concatenated, they form a tar archive containing a single JSONL file.
-Decompressed, that JSONL is on the order of 30-50 GB because each row
-embeds 50 full HTML pages (vs 5 in Tasks 1 & 2).
-
-Materialising the JSONL would blow the disk budget (we have ~50 GB
-free at the time of writing), so we stream the whole thing instead:
-
-  1. Download parts (idempotent; ``scripts/download_crag_task3.py``).
-  2. Concat them into a virtual file via ``_MultiPartReader``.
-  3. Wrap in ``bz2.BZ2File`` for on-the-fly decompression.
-  4. Wrap in ``tarfile.open(fileobj=..., mode="r|")`` for streaming
-     tar member iteration.
-  5. For the JSONL member inside, ``tar.extractfile()`` returns a
-     binary file-like; we iterate lines and yield parsed dicts.
-
-The caller can ``break`` out as soon as they have enough samples —
-nothing past the consumed point is decompressed.
-
-Schema is identical to Tasks 1 & 2 (see ``dataset.py``); only
-``search_results`` is bigger (50 entries instead of 5).
-"""
-
-from __future__ import annotations
-
-import bz2
-import json
-import logging
-import tarfile
-from collections.abc import Iterator
-from pathlib import Path
-from typing import IO
-
-from .dataset import (
-    CragPage,
-    CragQuestion,
-    _parse_alt_answers,
-    _parse_pages,
-)
-
-logger = logging.getLogger(__name__)
-
-
-CRAG_TASK_3_PART_URLS: tuple[str, ...] = tuple(
-    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
-    f"crag_task_3_dev_v4.tar.bz2.part{i}"
-    for i in (1, 2, 3, 4)
-)
-CRAG_TASK_3_PART_NAMES: tuple[str, ...] = tuple(
-    f"crag_task_3_dev_v4.tar.bz2.part{i}" for i in (1, 2, 3, 4)
-)
-
-
-# ---------------------------------------------------------------------------
-# Multi-part virtual file (concatenates N files transparently)
-# ---------------------------------------------------------------------------
-
-
-class _MultiPartReader:
-    """Read N files end-to-end as if they were one big file.
-
-    Implements just enough of the file protocol for ``bz2.BZ2File``
-    to consume it: ``read(n)``, ``readable()``, ``close()``.
-    Doesn't implement ``seek`` — the bz2 + tarfile streaming path
-    is forward-only, which is what we want here.
-    """
-
-    def __init__(self, paths: list[Path]) -> None:
-        if not paths:
-            raise ValueError("_MultiPartReader needs at least one path")
-        for p in paths:
-            if not p.exists():
-                raise FileNotFoundError(p)
-        self._paths = list(paths)
-        self._idx = 0
-        self._fh: IO[bytes] | None = self._paths[0].open("rb")
-        self._closed = False
-
-    def read(self, n: int = -1) -> bytes:
-        if self._closed:
-            raise ValueError("read of closed _MultiPartReader")
-        if n is None or n < 0:
-            chunks: list[bytes] = []
-            while self._fh is not None:
-                chunks.append(self._fh.read())
-                self._advance()
-            return b"".join(chunks)
-        out: list[bytes] = []
-        remaining = n
-        while remaining > 0 and self._fh is not None:
-            chunk = self._fh.read(remaining)
-            if not chunk:
-                self._advance()
-                continue
-            out.append(chunk)
-            remaining -= len(chunk)
-        return b"".join(out)
-
-    def _advance(self) -> None:
-        if self._fh is not None:
-            self._fh.close()
-            self._fh = None
-        self._idx += 1
-        if self._idx < len(self._paths):
-            self._fh = self._paths[self._idx].open("rb")
-
-    def readable(self) -> bool:
-        return not self._closed
-
-    def close(self) -> None:
-        if self._fh is not None:
-            self._fh.close()
-            self._fh = None
-        self._closed = True
-
-    def __enter__(self) -> _MultiPartReader:
-        return self
-
-    def __exit__(self, exc_type, exc, tb) -> None:  # type: ignore[no-untyped-def]
-        self.close()
-
-
-# ---------------------------------------------------------------------------
-# Stream the JSONL inside the tar.bz2
-# ---------------------------------------------------------------------------
-
-
-def _is_jsonl_member(name: str) -> bool:
-    return name.endswith(".jsonl") or name.endswith(".jsonl.txt")
-
-
-def iter_questions_task3(
-    parts_dir: Path,
-    *,
-    max_questions: int | None = None,
-) -> list[CragQuestion]:
-    """Stream-parse Task 3 rows into ``CragQuestion`` objects.
-
-    The Task 3 archive ships its 2,706 questions sharded across
-    multiple JSONL files inside the tar (e.g.
-    ``crag_task_3_dev_v4_0.jsonl``, ``..._1.jsonl``, …). We iterate
-    members in-stream, parse every JSONL one we encounter, and stop
-    as soon as ``max_questions`` is reached — at which point we
-    don't decompress any further members.
-
-    For a typical n=50 sample at ~3 MB per row we touch ~150 MB of
-    decompressed JSONL — almost always inside the first shard.
-    """
-
-    parts = [parts_dir / name for name in CRAG_TASK_3_PART_NAMES]
-    multi = _MultiPartReader(parts)
-    bz = bz2.BZ2File(multi, mode="rb")
-    tar = tarfile.open(fileobj=bz, mode="r|")
-    out: list[CragQuestion] = []
-    raw_idx = 0
-    found_jsonl = False
-    try:
-        for member in tar:
-            if not member.isfile() or not _is_jsonl_member(member.name):
-                continue
-            found_jsonl = True
-            logger.info(
-                "CRAG Task 3: streaming JSONL shard %s (size: %d bytes)",
-                member.name, member.size,
-            )
-            fh = tar.extractfile(member)
-            if fh is None:
-                logger.warning("tar.extractfile returned None for %s; skipping", member.name)
-                continue
-            try:
-                for raw_line in fh:
-                    line = raw_line.decode("utf-8", errors="replace").strip()
-                    if not line:
-                        continue
-                    try:
-                        row = json.loads(line)
-                    except json.JSONDecodeError as exc:
-                        logger.warning(
-                            "Skipping malformed CRAG Task 3 row %d in %s: %s",
-                            raw_idx, member.name, exc,
-                        )
-                        raw_idx += 1
-                        continue
-                    query = str(row.get("query") or "").strip()
-                    answer = str(row.get("answer") or "").strip()
-                    if not query or not answer:
-                        raw_idx += 1
-                        continue
-                    out.append(CragQuestion(
-                        qid=f"T3_{raw_idx:05d}",
-                        interaction_id=str(row.get("interaction_id") or "").strip(),
-                        query_time=str(row.get("query_time") or "").strip(),
-                        query=query,
-                        gold_answer=answer,
-                        alt_answers=_parse_alt_answers(row.get("alt_ans")),
-                        domain=str(row.get("domain") or "").strip().lower(),
-                        question_type=str(row.get("question_type") or "").strip().lower(),
-                        static_or_dynamic=str(row.get("static_or_dynamic") or "").strip().lower(),
-                        popularity=str(row.get("popularity") or "").strip().lower(),
-                        split=int(row.get("split") or 0),
-                        raw_index=raw_idx,
-                        pages=_parse_pages(row.get("search_results")),
-                    ))
-                    raw_idx += 1
-                    if max_questions is not None and len(out) >= max_questions:
-                        return out
-            finally:
-                try:
-                    fh.close()
-                except Exception:  # noqa: BLE001
-                    pass
-        if not found_jsonl:
-            raise RuntimeError(
-                "No JSONL member found inside Task 3 tar.bz2 archive; "
-                "schema may have changed upstream."
-            )
-    finally:
-        try:
-            tar.close()
-        except Exception:  # noqa: BLE001
-            pass
-        try:
-            bz.close()
-        except Exception:  # noqa: BLE001
-            pass
-        try:
-            multi.close()
-        except Exception:  # noqa: BLE001
-            pass
-    return out
-
-
-def parts_present(parts_dir: Path) -> bool:
-    """``True`` iff all 4 parts exist on disk and are non-empty."""
-
-    for name in CRAG_TASK_3_PART_NAMES:
-        p = parts_dir / name
-        if not p.exists() or p.stat().st_size == 0:
-            return False
-    return True
-
-
-# ---------------------------------------------------------------------------
-# Re-exports for convenience
-# ---------------------------------------------------------------------------
-
-
-__all__ = [
-    "CRAG_TASK_3_PART_NAMES",
-    "CRAG_TASK_3_PART_URLS",
-    "CragPage",
-    "CragQuestion",
-    "iter_questions_task3",
-    "parts_present",
-]
--- a/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py
@@ -436,12 +436,4 @@ async def _retry_upload_idempotent(  # noqa: D401 - hidden helper
    return {}


-__all__ = [
-    "_IngestStats",
-    "_materialise_pages",
-    "_page_filename",
-    "_resolve_question_doc_ids",
-    "_upload_pages",
-    "read_page_markdown",
-    "run_ingest",
-]
+__all__ = ["read_page_markdown", "run_ingest"]
--- a/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest_task3.py
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest_task3.py
@@ -1,191 +0,0 @@
-"""CRAG Task 3 ingestion: 4-part download → streaming JSONL → upload.
-
-Same flow as ``ingest.run_ingest`` for Tasks 1 & 2 (extract HTML →
-upload markdown → resolve doc_ids → write doc map), but:
-
-* Source: 4 .tar.bz2 parts streamed via ``dataset_task3``.
-* Page count: 50 per question instead of 5 — the whole point of
-  Task 3 (the long-context arm now structurally has to choose what
-  to keep, while SurfSense's retrieval becomes mandatory).
-* Stratified sampling re-uses the Task 1 helper since the question
-  schema is identical.
-
-Doc map lands at ``<suite_data>/maps/crag_t3_doc_map.jsonl`` with the
-same row shape as Task 1's map (so the runner only needs to know
-which file to load; everything else is shared).
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-from pathlib import Path
-
-from ....core.config import set_suite_state
-from ....core.ingest_settings import IngestSettings, settings_header_line
-from ....core.registry import RunContext
-from .dataset import stratified_sample, write_questions_jsonl
-from .dataset_task3 import (
-    CRAG_TASK_3_PART_NAMES,
-    iter_questions_task3,
-    parts_present,
-)
-from .ingest import (
-    _IngestStats,
-    _materialise_pages,
-    _resolve_question_doc_ids,
-    _upload_pages,
-)
-
-logger = logging.getLogger(__name__)
-
-
-_INSTRUCTIONS_TO_DOWNLOAD = (
-    "Run `python scripts/download_crag_task3.py` first to fetch the "
-    "4 tar.bz2 parts (~7 GB total) into "
-    "data/research/crag_t3/.raw_cache/. The downloader is idempotent "
-    "and parallel."
-)
-
-
-async def run_ingest_task3(
-    ctx: RunContext,
-    *,
-    n_questions: int | None = None,
-    upload_batch_size: int = 16,
-    skip_upload: bool = False,
-    overwrite_extract: bool = False,
-    settings: IngestSettings | None = None,
-    sample_seed: int = 17,
-    parse_cap: int | None = None,
-) -> None:
-    """Ingest CRAG Task 3 (50 pages per question) into the research suite.
-
-    Parameters
-    ----------
-    n_questions
-        Cap on the post-stratified-sample question count. ``None`` =
-        "use whatever ``parse_cap`` produced". For real runs aim for
-        50 (~2,500 pages) — n=200 (10k pages) is doable but slow.
-    parse_cap
-        Hard cap on how many rows we *parse* from the streaming
-        archive before stratified sampling. Defaults to
-        ``max(400, 6*n_questions)`` — enough to cover all (domain,
-        question_type) buckets ~5x but small enough to fit in the
-        first shard or two (each shard is ≈5 GB decompressed and
-        holds ~300 rows; bz2 throughput is ~50 MB/s). Lowering this
-        is the only knob that bounds streaming cost since we can
-        ``break`` out of the JSONL stream early without decompressing
-        the rest of the ~50 GB archive body.
-    upload_batch_size
-        Markdown files per ``/documents/fileupload`` call.
-    skip_upload
-        Extract markdown locally, don't push to SurfSense.
-    overwrite_extract
-        Re-run trafilatura even when a cached markdown is present.
-    settings
-        Per-upload knobs override (default: text-only basic ETL).
-    sample_seed
-        RNG seed for stratified sampling (deterministic).
-    """
-
-    settings = settings or IngestSettings(
-        use_vision_llm=False,
-        processing_mode="basic",
-        should_summarize=False,
-    )
-    bench_dir = ctx.benchmark_data_dir()
-    pages_dir = bench_dir / "pages"
-    raw_cache = bench_dir / ".raw_cache"
-    raw_cache.mkdir(parents=True, exist_ok=True)
-
-    if not parts_present(raw_cache):
-        missing = [
-            n for n in CRAG_TASK_3_PART_NAMES
-            if not (raw_cache / n).exists()
-        ]
-        raise RuntimeError(
-            f"CRAG Task 3 parts missing from {raw_cache}: {missing}. "
-            f"{_INSTRUCTIONS_TO_DOWNLOAD}"
-        )
-
-    # 1. Stream-parse (capped). For n=50 we don't need the full 2,706
-    #    rows — just enough that the stratified sampler can balance.
-    #    Each tar shard ~5 GB / ~300 rows / ~2 min decompress, so
-    #    400-500 rows = shard 0 + a slice of shard 1 ≈ 3-4 min.
-    parse_cap = parse_cap or (
-        max(400, 6 * (n_questions or 50)) if n_questions else None
-    )
-    logger.info(
-        "CRAG Task 3: streaming JSONL (parse_cap=%s) ...",
-        parse_cap if parse_cap else "no-cap",
-    )
-    all_questions = iter_questions_task3(raw_cache, max_questions=parse_cap)
-    logger.info("CRAG Task 3: parsed %d rows", len(all_questions))
-
-    if not all_questions:
-        raise RuntimeError("CRAG Task 3 streaming returned 0 rows; check archive integrity.")
-
-    if n_questions is not None and n_questions > 0:
-        questions = stratified_sample(all_questions, n=n_questions, seed=sample_seed)
-        logger.info(
-            "CRAG Task 3: stratified sample of %d questions across %d (domain, qtype) buckets",
-            len(questions),
-            len({(q.domain, q.question_type) for q in questions}),
-        )
-    else:
-        questions = all_questions
-
-    questions_jsonl = bench_dir / "questions.jsonl"
-    write_questions_jsonl(questions, questions_jsonl)
-
-    n_pages_total = sum(len(q.pages) for q in questions)
-    logger.info(
-        "CRAG Task 3: extracting up to %d pages across %d questions ...",
-        n_pages_total, len(questions),
-    )
-    qid_to_files, _file_to_url = _materialise_pages(
-        questions, pages_dir=pages_dir, overwrite=overwrite_extract,
-    )
-    n_pages_extracted = sum(len(v) for v in qid_to_files.values())
-
-    name_to_id: dict[str, int] = {}
-    if skip_upload:
-        logger.info("CRAG Task 3: --skip-upload; skipping SurfSense ingestion")
-    else:
-        all_filenames = sorted({fn for fns in qid_to_files.values() for fn in fns})
-        logger.info("CRAG Task 3: uploading %d unique pages ...", len(all_filenames))
-        name_to_id = await _upload_pages(
-            ctx,
-            pages_dir=pages_dir,
-            filenames=all_filenames,
-            batch_size=upload_batch_size,
-            settings=settings,
-        )
-
-    doc_rows = _resolve_question_doc_ids(questions, qid_to_files, name_to_id)
-    map_path = ctx.maps_dir() / "crag_t3_doc_map.jsonl"
-    with map_path.open("w", encoding="utf-8") as fh:
-        fh.write(settings_header_line(settings) + "\n")
-        for row in doc_rows:
-            fh.write(json.dumps(row) + "\n")
-    logger.info("Wrote CRAG Task 3 doc map to %s (%d rows)", map_path, len(doc_rows))
-
-    new_state = ctx.suite_state
-    new_state.ingestion_maps["crag_t3"] = str(map_path)
-    set_suite_state(ctx.config, ctx.suite, new_state)
-
-    stats = _IngestStats(
-        n_questions=len(questions),
-        n_pages_total=n_pages_total,
-        n_pages_extracted=n_pages_extracted,
-        n_pages_empty=n_pages_total - n_pages_extracted,
-        n_uploaded=len(name_to_id),
-        n_existing=0,
-        bench_dir=bench_dir,
-        map_path=map_path,
-    )
-    logger.info("CRAG Task 3 ingest done: %s", stats)
-
-
-__all__ = ["run_ingest_task3"]
--- a/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py
@@ -189,18 +189,6 @@ class CragBenchmark:
    headline: bool = True
    description: str = _DESCRIPTION

-    # Subclasses (e.g. Task 3) override these without re-implementing run().
-    doc_map_filename: str = "crag_doc_map.jsonl"
-    # 0 = use ALL pages in the long-context arm. Task 3 defaults to 5
-    # so the long-context arm models the realistic "stuff the top-5
-    # search results into the prompt" baseline rather than blowing
-    # past the 128k-token context window with all 50 pages.
-    default_long_context_top_n: int = 0
-    pages_per_question_label: str = "5 pages"
-    ingest_hint: str = (
-        "`python -m surfsense_evals ingest research crag --n-questions 200`"
-    )
-
    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
        parser.add_argument(
            "--n", dest="sample_n", type=int, default=None,
@@ -230,15 +218,6 @@ class CragBenchmark:
            "--per-page-char-cap", dest="per_page_char_cap", type=int, default=12_000,
            help="Long-context arm: max chars per page before truncation (default 12k).",
        )
-        parser.add_argument(
-            "--long-context-top-n-pages", dest="long_context_top_n_pages",
-            type=int, default=self.default_long_context_top_n,
-            help=(
-                "Long-context arm: keep only the first N pages from the "
-                "question's candidate list (0 = use all). Task 3 defaults "
-                "to 5 (the realistic 'naive RAG' top-K baseline)."
-            ),
-        )
        parser.add_argument(
            "--skip-bare", dest="skip_bare", action="store_true",
            help="Skip the bare-LLM arm (saves cost on re-runs).",
@@ -317,11 +296,6 @@ class CragBenchmark:
        concurrency = int(opts.get("concurrency") or 4)
        max_output_tokens = int(opts.get("max_output_tokens") or 512)
        per_page_char_cap = int(opts.get("per_page_char_cap") or 12_000)
-        long_context_top_n_pages = int(
-            opts.get("long_context_top_n_pages")
-            if opts.get("long_context_top_n_pages") is not None
-            else self.default_long_context_top_n
-        )
        skip_bare = bool(opts.get("skip_bare"))
        skip_long_context = bool(opts.get("skip_long_context"))
        skip_surfsense = bool(opts.get("skip_surfsense"))
@@ -331,11 +305,11 @@ class CragBenchmark:
        judge_concurrency = int(opts.get("judge_concurrency") or 4)

        bench_dir = ctx.benchmark_data_dir()
-        map_path = ctx.maps_dir() / self.doc_map_filename
+        map_path = ctx.maps_dir() / "crag_doc_map.jsonl"
        if not map_path.exists():
            raise RuntimeError(
-                f"{self.name} not ingested for this suite. Run "
-                f"{self.ingest_hint} first."
+                "CRAG not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest research crag --n-questions 200` first."
            )

        rows, ingest_settings = _load_doc_map(map_path)
@@ -407,13 +381,7 @@ class CragBenchmark:
        async def _long_context_one(q: CragRunnerQuestion) -> ArmResult:
            assert long_context_arm is not None
            return await long_context_arm.answer(
-                _make_long_context_request(
-                    q,
-                    bench_dir,
-                    max_output_tokens,
-                    per_page_char_cap,
-                    top_n_pages=long_context_top_n_pages,
-                )
+                _make_long_context_request(q, bench_dir, max_output_tokens, per_page_char_cap)
            )

        async def _surf_one(q: CragRunnerQuestion) -> ArmResult:
@@ -503,8 +471,6 @@ class CragBenchmark:
                "agent_llm_id": ctx.agent_llm_id,
                "ingest_settings": ingest_settings,
                "per_page_char_cap": per_page_char_cap,
-                "long_context_top_n_pages": long_context_top_n_pages,
-                "pages_per_question_label": self.pages_per_question_label,
                "max_output_tokens": max_output_tokens,
                "arms_active": {
                    "bare_llm": bare_arm is not None,
@@ -565,29 +531,18 @@ class CragBenchmark:
        if not active.get("long_context", True):
            body_lines.append("- Long-context arm: SKIPPED.")
        else:
-            top_n = int(extra.get("long_context_top_n_pages") or 0)
-            page_phrase = (
-                f"top-{top_n} of {extra.get('pages_per_question_label') or 'pages'}"
-                if top_n > 0
-                else f"all of {extra.get('pages_per_question_label') or 'pages'}"
-            )
            body_lines.append(
                f"- Long-context arm (`{extra.get('native_arm_model') or '?'}`, "
-                f"{page_phrase} stuffed into prompt; per-page cap "
+                f"all 5 pages stuffed into prompt; per-page cap "
                f"{extra.get('per_page_char_cap', 12_000):,} chars):"
            )
            body_lines.append(_arm_summary_lines(lc, indent="  "))
        if not active.get("surfsense", True):
            body_lines.append("- SurfSense arm: SKIPPED.")
        else:
-            scope_phrase = (
-                "whole SearchSpace"
-                if extra.get("no_mention_scope")
-                else f"per-question {extra.get('pages_per_question_label') or 'pages'}"
-            )
            body_lines.append(
                f"- SurfSense arm (`{extra.get('provider_model', '?')}`, retrieval over "
-                f"{scope_phrase}):"
+                f"{'whole SearchSpace' if extra.get('no_mention_scope') else 'per-question 5 pages'}):"
            )
            body_lines.append(_arm_summary_lines(surf, indent="  "))

@@ -673,17 +628,9 @@ def _make_long_context_request(
    bench_dir: Path,
    max_tokens: int,
    per_page_char_cap: int,
-    *,
-    top_n_pages: int = 0,
 ) -> ArmRequest:
-    # The CRAG search_results list is already ranked top-K from the
-    # original web search at query_time; slicing the prefix is the
-    # honest "naive RAG: take the top-K results" baseline.
-    page_iter = q.page_filenames
-    if top_n_pages and top_n_pages > 0:
-        page_iter = page_iter[:top_n_pages]
    contexts: list[tuple[str, str]] = []
-    for fn in page_iter:
+    for fn in q.page_filenames:
        text = read_page_markdown(bench_dir, fn) or ""
        if not text.strip():
            continue
@@ -993,61 +940,4 @@ def _fmt(value: Any, ndigits: int) -> str:
        return "?"


-_TASK3_DESCRIPTION = (
-    "CRAG Task 3 (Meta KDD Cup 2024) — same 3 arms but the corpus per "
-    "question now has **50 candidate web pages** (vs 5 in Tasks 1 & 2). "
-    "The long-context arm uses only the top-5 (the realistic naive-RAG "
-    "baseline); SurfSense retrieves over all 50, where its rerank "
-    "becomes the actual contribution."
-)
-
-
-class CragTask3Benchmark(CragBenchmark):
-    """3-arm CRAG runner over Task 3 (50 pages per question).
-
-    Reuses the entire Task 1/2 runtime (grader, prompt, metrics,
-    reporting) — the only deltas are: the doc map filename, the
-    long-context arm's default page cap (5 instead of all 50), and
-    the ingest entrypoint (4-part archive instead of single bz2).
-    """
-
-    name: str = "crag_t3"
-    description: str = _TASK3_DESCRIPTION
-    doc_map_filename: str = "crag_t3_doc_map.jsonl"
-    default_long_context_top_n: int = 5
-    pages_per_question_label: str = "50 pages"
-    ingest_hint: str = (
-        "`python -m surfsense_evals ingest research crag_t3 --n-questions 50` "
-        "(after `python scripts/download_crag_task3.py`)"
-    )
-
-    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
-        # Local import: keep dataset_task3's lazy-streaming module out
-        # of the import graph until someone actually wants Task 3.
-        from .ingest_task3 import run_ingest_task3
-
-        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
-        await run_ingest_task3(
-            ctx,
-            n_questions=opts.get("n_questions"),
-            upload_batch_size=int(opts.get("upload_batch_size") or 16),
-            skip_upload=bool(opts.get("skip_upload", False)),
-            overwrite_extract=bool(opts.get("overwrite_extract", False)),
-            settings=settings,
-            sample_seed=int(opts.get("sample_seed") or 17),
-            parse_cap=opts.get("parse_cap"),
-        )
-
-    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
-        super().add_run_args(parser)
-        parser.add_argument(
-            "--parse-cap", dest="parse_cap", type=int, default=None,
-            help=(
-                "(ingest only) Hard cap on rows parsed from the streaming "
-                "Task 3 archive before stratified sampling. Default: "
-                "max(2000, 10 * n_questions). Lower = less decompression."
-            ),
-        )
-
-
-__all__ = ["CragBenchmark", "CragRunnerQuestion", "CragTask3Benchmark"]
+__all__ = ["CragBenchmark", "CragRunnerQuestion"]
--- a/surfsense_evals/tests/suites/test_crag_dataset_task3.py
+++ b/surfsense_evals/tests/suites/test_crag_dataset_task3.py
@ -1,259 +0,0 @@
-"""Unit tests for CRAG Task 3 streaming dataset loader.
-
-We don't (and shouldn't) hit the real 7 GB upstream archive in
-unit tests. Instead we construct tiny tar.bz2 archives split across
-N parts and verify:
-
-* ``_MultiPartReader`` correctly stitches N files together.
-* The streaming path (multi → bz2 → tar → JSONL) yields parsed
-  ``CragQuestion`` rows with the right shape.
-* ``max_questions`` cap is honoured (early break, no greedy read).
-* ``parts_present`` correctly detects missing/empty parts.
-"""
-
-from __future__ import annotations
-
-import bz2
-import io
-import json
-import tarfile
-from pathlib import Path
-
-import pytest
-
-from surfsense_evals.suites.research.crag.dataset_task3 import (
-    _MultiPartReader,
-    iter_questions_task3,
-    parts_present,
-)
-
-
-# ---------------------------------------------------------------------------
-# Fixtures: build a tiny synthetic Task 3 archive
-# ---------------------------------------------------------------------------
-
-
-def _make_jsonl_payload(n_rows: int) -> bytes:
-    rows = []
-    for i in range(n_rows):
-        rows.append({
-            "interaction_id": f"int_{i:04d}",
-            "query_time": "2024-01-01 00:00:00",
-            "domain": ["finance", "music", "movie", "sports", "open"][i % 5],
-            "question_type": ["simple", "comparison", "aggregation", "multi-hop"][i % 4],
-            "static_or_dynamic": "static",
-            "popularity": "head",
-            "split": 0,
-            "query": f"Synthetic CRAG question {i}?",
-            "answer": f"answer-{i}",
-            "alt_ans": [f"alt-{i}-a", f"alt-{i}-b"],
-            "search_results": [
-                {
-                    "page_name": f"Page {j} for q{i}",
-                    "page_url": f"https://example.com/q{i}/p{j}",
-                    "page_snippet": "snippet",
-                    "page_result": f"<html><body><p>q{i} p{j} body</p></body></html>",
-                    "page_last_modified": "",
-                }
-                for j in range(50)
-            ],
-        })
-    return b"\n".join(json.dumps(r).encode("utf-8") for r in rows) + b"\n"
-
-
-def _make_tar_bz2(jsonl_bytes: bytes, *, member_name: str = "data.jsonl") -> bytes:
-    bio = io.BytesIO()
-    with bz2.BZ2File(bio, mode="wb") as bz:
-        with tarfile.open(fileobj=bz, mode="w") as tar:
-            info = tarfile.TarInfo(name=member_name)
-            info.size = len(jsonl_bytes)
-            tar.addfile(info, io.BytesIO(jsonl_bytes))
-    return bio.getvalue()
-
-
-def _make_tar_bz2_multi(shards: list[tuple[str, bytes]]) -> bytes:
-    """Build a tar.bz2 archive containing multiple JSONL shards.
-
-    Mirrors the real CRAG Task 3 layout: one tar with N JSONL members
-    named ``crag_task_3_dev_v4_{i}.jsonl`` (or whatever the caller
-    passes in).
-    """
-
-    bio = io.BytesIO()
-    with bz2.BZ2File(bio, mode="wb") as bz:
-        with tarfile.open(fileobj=bz, mode="w") as tar:
-            for name, payload in shards:
-                info = tarfile.TarInfo(name=name)
-                info.size = len(payload)
-                tar.addfile(info, io.BytesIO(payload))
-    return bio.getvalue()
-
-
-def _split_into_parts(blob: bytes, n_parts: int) -> list[bytes]:
-    """Split byte string into N roughly-equal chunks (last gets remainder)."""
-    chunk = max(1, len(blob) // n_parts)
-    parts = [blob[i * chunk : (i + 1) * chunk] for i in range(n_parts - 1)]
-    parts.append(blob[(n_parts - 1) * chunk :])
-    return parts
-
-
-@pytest.fixture
-def task3_parts_dir(tmp_path: Path) -> Path:
-    """A directory containing a 4-part synthetic CRAG Task 3 archive (12 rows)."""
-    blob = _make_tar_bz2(_make_jsonl_payload(12))
-    parts = _split_into_parts(blob, 4)
-    parts_dir = tmp_path / ".raw_cache"
-    parts_dir.mkdir()
-    for i, b in enumerate(parts, start=1):
-        (parts_dir / f"crag_task_3_dev_v4.tar.bz2.part{i}").write_bytes(b)
-    return parts_dir
-
-
-# ---------------------------------------------------------------------------
-# _MultiPartReader
-# ---------------------------------------------------------------------------
-
-
-class TestMultiPartReader:
-    def test_concatenates_parts_in_order(self, tmp_path: Path) -> None:
-        a = tmp_path / "a"
-        b = tmp_path / "b"
-        c = tmp_path / "c"
-        a.write_bytes(b"hello, ")
-        b.write_bytes(b"streaming ")
-        c.write_bytes(b"world!")
-        with _MultiPartReader([a, b, c]) as r:
-            assert r.read() == b"hello, streaming world!"
-
-    def test_read_n_crosses_part_boundary(self, tmp_path: Path) -> None:
-        a = tmp_path / "a"
-        b = tmp_path / "b"
-        a.write_bytes(b"AAA")
-        b.write_bytes(b"BBBB")
-        with _MultiPartReader([a, b]) as r:
-            # Read 5 bytes — straddles boundary between parts.
-            assert r.read(5) == b"AAABB"
-            assert r.read(5) == b"BB"
-            assert r.read(5) == b""
-
-    def test_close_is_idempotent(self, tmp_path: Path) -> None:
-        a = tmp_path / "a"
-        a.write_bytes(b"x")
-        r = _MultiPartReader([a])
-        r.close()
-        r.close()
-        with pytest.raises(ValueError):
-            r.read(1)
-
-    def test_missing_part_raises(self, tmp_path: Path) -> None:
-        with pytest.raises(FileNotFoundError):
-            _MultiPartReader([tmp_path / "does-not-exist"])
-
-    def test_empty_paths_raises(self) -> None:
-        with pytest.raises(ValueError):
-            _MultiPartReader([])
-
-
-# ---------------------------------------------------------------------------
-# iter_questions_task3
-# ---------------------------------------------------------------------------
-
-
-@pytest.fixture
-def task3_multi_shard_dir(tmp_path: Path) -> Path:
-    """A 4-part archive whose tar contains 3 JSONL shards (4 + 4 + 4 rows)."""
-    payload_a = _make_jsonl_payload(4)
-    payload_b = _make_jsonl_payload(4)
-    payload_c = _make_jsonl_payload(4)
-    blob = _make_tar_bz2_multi([
-        ("crag_task_3_dev_v4_0.jsonl", payload_a),
-        ("crag_task_3_dev_v4_1.jsonl", payload_b),
-        ("crag_task_3_dev_v4_2.jsonl", payload_c),
-    ])
-    parts = _split_into_parts(blob, 4)
-    parts_dir = tmp_path / ".raw_cache"
-    parts_dir.mkdir()
-    for i, b in enumerate(parts, start=1):
-        (parts_dir / f"crag_task_3_dev_v4.tar.bz2.part{i}").write_bytes(b)
-    return parts_dir
-
-
-class TestIterQuestionsTask3:
-    def test_streams_full_archive(self, task3_parts_dir: Path) -> None:
-        questions = iter_questions_task3(task3_parts_dir)
-        assert len(questions) == 12
-        # All questions get the T3_ prefix and 50 pages each.
-        assert all(q.qid.startswith("T3_") for q in questions)
-        assert all(len(q.pages) == 50 for q in questions)
-        # Schema fields preserved.
-        first = questions[0]
-        assert first.query == "Synthetic CRAG question 0?"
-        assert first.gold_answer == "answer-0"
-        assert first.domain == "finance"
-        assert "alt-0-a" in first.alt_answers
-
-    def test_max_questions_caps_early(self, task3_parts_dir: Path) -> None:
-        questions = iter_questions_task3(task3_parts_dir, max_questions=3)
-        assert len(questions) == 3
-        # Sequential indices 0..2 — we don't skip rows.
-        assert [q.raw_index for q in questions] == [0, 1, 2]
-
-    def test_streams_multi_shard_archive(self, task3_multi_shard_dir: Path) -> None:
-        # Three shards × four rows each = twelve rows total.
-        questions = iter_questions_task3(task3_multi_shard_dir)
-        assert len(questions) == 12
-        # raw_index increments monotonically across shards.
-        assert [q.raw_index for q in questions] == list(range(12))
-        # qids are unique and sequential across shards.
-        assert len({q.qid for q in questions}) == 12
-
-    def test_max_questions_short_circuits_first_shard(self, task3_multi_shard_dir: Path) -> None:
-        # Cap < shard size — shouldn't touch shards 1 or 2 at all.
-        questions = iter_questions_task3(task3_multi_shard_dir, max_questions=2)
-        assert len(questions) == 2
-        # Both come from shard 0 (raw_index 0, 1).
-        assert [q.raw_index for q in questions] == [0, 1]
-
-    def test_max_questions_spans_shards(self, task3_multi_shard_dir: Path) -> None:
-        # Cap = 6 → all 4 from shard 0 + first 2 from shard 1.
-        questions = iter_questions_task3(task3_multi_shard_dir, max_questions=6)
-        assert len(questions) == 6
-        assert [q.raw_index for q in questions] == [0, 1, 2, 3, 4, 5]
-
-    def test_raises_when_no_jsonl_member(self, tmp_path: Path) -> None:
-        # Archive containing a non-jsonl member.
-        bio = io.BytesIO()
-        with bz2.BZ2File(bio, mode="wb") as bz:
-            with tarfile.open(fileobj=bz, mode="w") as tar:
-                info = tarfile.TarInfo(name="README.md")
-                payload = b"not jsonl"
-                info.size = len(payload)
-                tar.addfile(info, io.BytesIO(payload))
-        parts_dir = tmp_path / ".raw_cache"
-        parts_dir.mkdir()
-        for i, name in enumerate(
-            ("part1", "part2", "part3", "part4"), start=1,
-        ):
-            half = len(bio.getvalue()) // 4
-            chunk = bio.getvalue()[(i - 1) * half : i * half if i < 4 else len(bio.getvalue())]
-            (parts_dir / f"crag_task_3_dev_v4.tar.bz2.{name}").write_bytes(chunk)
-        with pytest.raises(RuntimeError, match="No JSONL member"):
-            iter_questions_task3(parts_dir)
-
-
-# ---------------------------------------------------------------------------
-# parts_present
-# ---------------------------------------------------------------------------
-
-
-class TestPartsPresent:
-    def test_all_present(self, task3_parts_dir: Path) -> None:
-        assert parts_present(task3_parts_dir) is True
-
-    def test_one_missing(self, task3_parts_dir: Path) -> None:
-        (task3_parts_dir / "crag_task_3_dev_v4.tar.bz2.part2").unlink()
-        assert parts_present(task3_parts_dir) is False
-
-    def test_one_empty(self, task3_parts_dir: Path) -> None:
-        (task3_parts_dir / "crag_task_3_dev_v4.tar.bz2.part3").write_bytes(b"")
-        assert parts_present(task3_parts_dir) is False
--- a/surfsense_evals/uv.lock
+++ b/surfsense_evals/uv.lock