mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
feat(evals): publish multimodal_doc parser_compare benchmark + n=171 report
Adds the full parser_compare experiment for the multimodal_doc suite:
six arms compared on 30 PDFs / 171 questions from MMLongBench-Doc with
anthropic/claude-sonnet-4.5 across the board.
Source code:
- core/parsers/{azure_di,llamacloud,pdf_pages}.py: direct parser SDK
callers (Azure Document Intelligence prebuilt-read/layout, LlamaParse
parse_page_with_llm/parse_page_with_agent) used by the LC arms,
bypassing the SurfSense backend so each (basic/premium) extraction
is a clean A/B independent of backend ETL routing.
- suites/multimodal_doc/parser_compare/{ingest,runner,prompt}.py:
six-arm benchmark (native_pdf, azure_basic_lc, azure_premium_lc,
llamacloud_basic_lc, llamacloud_premium_lc, surfsense_agentic) with
byte-identical prompts per question, deterministic grader, Wilson
CIs, and the per-page preprocessing tariff cost overlay.
Reproducibility:
- pyproject.toml + uv.lock pin pypdf, azure-ai-documentintelligence,
llama-cloud-services as new deps.
- .env.example documents the AZURE_DI_* and LLAMA_CLOUD_API_KEY env
vars now required for parser_compare.
- 12 analysis scripts under scripts/: retry pass with exponential
backoff, post-retry accuracy merge, McNemar / latency / per-PDF
stats, context-overflow hypothesis test, etc. Each produces one
number cited by the blog report.
Citation surface:
- reports/blog/multimodal_doc_parser_compare_n171_report.md: 1219-line
technical writeup (16 sections) covering headline accuracy, per-format
accuracy, McNemar pairwise significance, latency / token / per-PDF
distributions, error analysis, retry experiment, post-retry final
accuracy, cost amortization model with closed-form derivation, threats
to validity, and reproducibility appendix.
- data/multimodal_doc/runs/2026-05-14T00-53-19Z/parser_compare/{raw,
raw_retries,raw_post_retry}.jsonl + run_artifact.json + retry summary
whitelisted via data/.gitignore as the verifiable numbers source.
Gitignore:
- ignore logs_*.txt + retry_run.log; structured artifacts cover the
citation surface, debug logs are noise.
- data/.gitignore default-ignores everything, whitelists the n=171 run
artifacts only (parser manifest left ignored to avoid leaking local
Windows usernames in absolute paths; manifest is fully regenerable
via 'ingest multimodal_doc parser_compare').
- reports/.gitignore now whitelists hand-curated reports/blog/.
Also retires the abandoned CRAG Task 3 implementation (download script,
streaming Task 3 ingest, CragTask3Benchmark + tests) and trims the
runner / ingest module APIs to match.
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
3737118050
commit
9bcd50164d
40 changed files with 9303 additions and 993 deletions
125
surfsense_evals/scripts/analyze_failure_timing.py
Normal file
125
surfsense_evals/scripts/analyze_failure_timing.py
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
"""Were the SSL failures clustered in time (network blip) or evenly
|
||||
distributed (sustained limit)? Group failures by 1-min buckets using
|
||||
the run start time and the per-row latency_ms / answer order.
|
||||
|
||||
Also: for the one *real* intrinsic failure — the 30MB Anthropic limit
|
||||
on 2405.09818v1.pdf::Q007 — print the full error message + raw payload
|
||||
sizes so the blog has a clean root cause.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# Directory one level above this script's folder (the script lives in
# ``surfsense_evals/scripts/``).
REPO = Path(__file__).resolve().parents[1]
# Artifact directory of the specific parser_compare run being analysed.
RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
# One JSON object per line; each row carries at least ``arm``, ``qid`` and
# ``doc_id`` (see how ``main`` indexes it).
RAW = RUN / "raw.jsonl"
# Local PDF cache; ``doc_id`` is joined onto this path to stat the file.
PDFS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
|
||||
|
||||
|
||||
def _timing_cluster(err: str, raw_text: str) -> str:
    """Bucket one row for the per-arm clustering view; ``"ok"`` means no failure."""
    if "SSLError" in err:
        return "ssl"
    if not err and not raw_text.strip():
        return "empty"
    if "502" in err or "503" in err:
        return "5xx"
    lowered = err.lower()
    if "exceeds" in lowered and "limit" in lowered:
        return "size_limit"
    return "other_err" if err else "ok"


def _pdf_err_kind(err: str, empty: bool) -> str:
    """Bucket one failing row for the per-PDF breakdown.

    Precedence differs from ``_timing_cluster`` on purpose (size limits
    are checked before 5xx, and JSON decode errors get their own bucket)
    so the printed breakdown matches the original script.
    """
    if "SSLError" in err:
        return "ssl"
    lowered = err.lower()
    if "exceeds" in lowered and "limit" in lowered:
        return "size_limit"
    if "502" in err or "503" in err:
        return "5xx"
    if "JSONDecodeError" in err:
        return "json_decode"
    if empty and not err:
        return "empty"
    return "other"


def _gap_runs(idxs: list[int], max_gap: int = 5) -> list[list[int]]:
    """Split sorted indices into runs whose neighbours differ by <= ``max_gap``."""
    runs: list[list[int]] = []
    for i in idxs:
        if runs and i - runs[-1][-1] <= max_gap:
            runs[-1].append(i)
        else:
            runs.append([i])
    return runs


def main() -> None:
    """Print three failure views of ``RAW``.

    1. Per arm, the sequential positions of failing rows and how they
       group into runs (gap <= 5 positions).
    2. Details of the single ``native_pdf`` row for
       ``2405.09818v1.pdf::Q007`` including the on-disk PDF size.
    3. Per PDF, how many rows failed, in which arms, and of which kind.
    """
    rows = [
        json.loads(line)
        for line in RAW.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]

    # 1) SSL clustering: failures by question index per arm
    by_arm_idx: dict[str, list[tuple[int, str]]] = defaultdict(list)
    arm_seen_count: dict[str, int] = defaultdict(int)
    for row in rows:
        arm = row["arm"]
        # Position of this row within its own arm, in file order.
        idx = arm_seen_count[arm]
        arm_seen_count[arm] += 1
        cluster = _timing_cluster(row.get("error") or "", row.get("raw_text") or "")
        if cluster != "ok":
            by_arm_idx[arm].append((idx, cluster))

    print("=" * 80)
    print("SSL/network-error indices per arm (each arm processes 171 questions in")
    print("order; index = sequential position within that arm). Tight clustering")
    print("in time = transient blip, even spread = sustained limit.")
    print("=" * 80)
    for arm in sorted(by_arm_idx):
        # Every key in by_arm_idx has at least one entry, so idxs is non-empty.
        items = by_arm_idx[arm]
        idxs = sorted({i for i, _ in items})
        print(f"\n{arm}: {len(items)} failures at indices {idxs}")
        # within 5 questions = same time window
        cluster_runs = _gap_runs(idxs)
        print(f" clusters (gap<=5): {len(cluster_runs)}: {cluster_runs}")

    # 2) The 30MB intrinsic failure — full details
    print()
    print("=" * 80)
    print("Intrinsic failure: 30MB Anthropic input limit on 2405.09818v1.pdf::Q007")
    print("=" * 80)
    for row in rows:
        if row["qid"] == "2405.09818v1.pdf::Q007" and row["arm"] == "native_pdf":
            err = row.get("error") or ""
            print(f" qid: {row['qid']}")
            print(f" doc: {row['doc_id']}, pages: {row.get('pages')}")
            pdf_path = PDFS / row["doc_id"]
            if pdf_path.exists():
                size_mb = pdf_path.stat().st_size / (1024 * 1024)
                print(f" PDF size on disk: {size_mb:.1f} MB")
                # base64 inflates ~33%
                est_b64 = size_mb * 1.33
                print(f" estimated base64 wire size: {est_b64:.1f} MB")
            print(f" full error: {err[:600]}")
            break

    # 3) Per-PDF: which PDFs are pathological?
    print()
    print("=" * 80)
    print("Per-PDF failure breakdown across all 6 arms (only PDFs with failures)")
    print("=" * 80)
    by_pdf: dict[str, list[dict]] = defaultdict(list)
    for row in rows:
        err = row.get("error") or ""
        empty = not (row.get("raw_text") or "").strip()
        if err or empty:
            by_pdf[row["doc_id"]].append({
                "arm": row["arm"],
                "qid": row["qid"],
                "err_kind": _pdf_err_kind(err, empty),
                "pages": row.get("pages"),
            })
    # Most failures first; ties broken alphabetically by doc id.
    for doc, items in sorted(by_pdf.items(), key=lambda x: (-len(x[1]), x[0])):
        kinds = Counter(i["err_kind"] for i in items)
        arms = sorted({i["arm"] for i in items})
        pages = items[0]["pages"]
        print(f" {doc} pages={pages} failures={len(items)} arms={arms}")
        print(f" kinds: {dict(kinds)}")


if __name__ == "__main__":
    main()
|
||||
155
surfsense_evals/scripts/analyze_failures.py
Normal file
155
surfsense_evals/scripts/analyze_failures.py
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
"""Drill into the parser_compare n=171 raw.jsonl to surface every
|
||||
failure, group by arm + PDF, and dump the underlying error strings so
|
||||
we can write up a clean failure-mode taxonomy for the blog post.
|
||||
|
||||
Outputs (printed to stdout + written to `failures_n171.json`):
|
||||
* per-arm failure count and rate
|
||||
* per-PDF failure count across all arms (which docs are pathological?)
|
||||
* error-string clusters per arm (so we can give human-readable causes)
|
||||
* sample failure rows (one per cluster) for the appendix
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Directory one level above this script's folder.
REPO = Path(__file__).resolve().parents[1]
# Artifact directory of the specific parser_compare run being analysed.
RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
# Input: one JSON object per line.
RAW = RUN / "raw.jsonl"
# Output: JSON summary written by ``main`` next to the scripts.
OUT = REPO / "scripts" / "failures_n171.json"
|
||||
|
||||
|
||||
def _classify(error: str | None, raw_text: str) -> str:
|
||||
"""Coarse-grained bucket for an error message."""
|
||||
|
||||
blob = (error or "").lower()
|
||||
if not blob and not raw_text.strip():
|
||||
return "empty_response"
|
||||
if "rate limit" in blob or "429" in blob:
|
||||
return "rate_limit"
|
||||
if "context_length" in blob or "context window" in blob or "too many tokens" in blob:
|
||||
return "context_overflow"
|
||||
if "could not process image" in blob or "invalid image" in blob:
|
||||
return "image_decode_failure"
|
||||
if "could not process pdf" in blob or "invalid_request_error" in blob and "pdf" in blob:
|
||||
return "pdf_decode_failure"
|
||||
if "timeout" in blob or "timed out" in blob:
|
||||
return "timeout"
|
||||
if "5xx" in blob or "internal server error" in blob or "503" in blob or "502" in blob:
|
||||
return "provider_5xx"
|
||||
if "filenotfound" in blob:
|
||||
return "missing_extraction"
|
||||
if "badrequest" in blob:
|
||||
return "provider_400"
|
||||
if blob:
|
||||
return "other_error"
|
||||
return "unknown"
|
||||
|
||||
|
||||
def main() -> None:
    """Summarise every failing row of ``RAW`` and write ``OUT``.

    A row counts as a failure when it has a truthy ``error`` or a blank
    ``raw_text``. Prints per-arm failure rates, per-arm error clusters
    with one example each, PDFs with two or more failures, and every
    ``native_pdf`` failure; then dumps the same data as JSON to ``OUT``.
    """
    rows = [
        json.loads(line)
        for line in RAW.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]

    by_arm_failures: dict[str, list[dict]] = defaultdict(list)
    by_pdf_failures: dict[str, list[dict]] = defaultdict(list)
    error_clusters: dict[str, dict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))

    n_per_arm: dict[str, int] = defaultdict(int)
    for row in rows:
        arm = row["arm"]
        n_per_arm[arm] += 1
        err = row.get("error")
        raw_text = row.get("raw_text") or ""
        if err or not raw_text.strip():
            cluster = _classify(err, raw_text)
            entry = {
                "qid": row["qid"],
                "doc_id": row["doc_id"],
                "answer_format": row["answer_format"],
                "gold": row["gold"],
                "error": err,
                "cluster": cluster,
                "raw_text_len": len(raw_text),
                "pages": row.get("pages"),
            }
            by_arm_failures[arm].append(entry)
            # Per-PDF entries additionally record which arm failed.
            by_pdf_failures[row["doc_id"]].append({**entry, "arm": arm})
            error_clusters[arm][cluster].append(entry)

    print("=" * 90)
    print("Per-arm failure count & rate")
    print("=" * 90)
    print(f"{'arm':<25} {'n':>4} {'fail':>5} {'rate%':>6}")
    for arm in sorted(n_per_arm):
        f = len(by_arm_failures[arm])
        n = n_per_arm[arm]
        print(f"{arm:<25} {n:>4} {f:>5} {f / n * 100:>5.1f}%")

    print()
    print("=" * 90)
    print("Failure clusters per arm")
    print("=" * 90)
    for arm in sorted(error_clusters):
        print(f"\n{arm}:")
        for cluster, items in sorted(error_clusters[arm].items()):
            print(f" {cluster:<22} {len(items):>3}")
            sample = items[0]
            err_short = (sample["error"] or "")[:200].replace("\n", " ")
            print(f" example: {sample['qid']} doc={sample['doc_id']} pages={sample['pages']}")
            print(f" error: {err_short}")

    print()
    print("=" * 90)
    print("Per-PDF failure totals (PDFs with >=2 failures)")
    print("=" * 90)
    pdf_counts = Counter({pdf: len(failures) for pdf, failures in by_pdf_failures.items()})
    # most_common() is sorted by count descending, so stop at the first
    # PDF that falls below the threshold.
    for pdf, count in pdf_counts.most_common():
        if count < 2:
            break
        arms_failed = sorted({r["arm"] for r in by_pdf_failures[pdf]})
        pages = by_pdf_failures[pdf][0].get("pages")
        print(f" {pdf} pages={pages} failures={count} arms={arms_failed}")

    print()
    print("=" * 90)
    print("All native_pdf failures (one row per failure)")
    print("=" * 90)
    for entry in by_arm_failures.get("native_pdf", []):
        err = (entry["error"] or "(no error string)")[:240].replace("\n", " ")
        print(f" {entry['qid']} doc={entry['doc_id']} pages={entry['pages']} cluster={entry['cluster']}")
        print(f" err: {err}")

    summary: dict[str, Any] = {
        "per_arm": {
            arm: {
                "n": n_per_arm[arm],
                "failures": len(by_arm_failures[arm]),
                "rate": len(by_arm_failures[arm]) / n_per_arm[arm],
                "clusters": {
                    cluster: len(items)
                    for cluster, items in error_clusters[arm].items()
                },
                "rows": by_arm_failures[arm],
            }
            for arm in sorted(n_per_arm)
        },
        # Entries already carry their "arm" key; no per-row rebuild needed.
        "per_pdf": {
            pdf: list(failures)
            for pdf, failures in by_pdf_failures.items()
        },
    }
    OUT.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print(f"\nWrote: {OUT}")


if __name__ == "__main__":
    main()
|
||||
60
surfsense_evals/scripts/check_extraction_sizes.py
Normal file
60
surfsense_evals/scripts/check_extraction_sizes.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
"""Sanity check extraction sizes against Sonnet 4.5's context window.
|
||||
|
||||
Sonnet 4.5 supports ~200k tokens. As a *very* rough heuristic, English
|
||||
markdown is ~4 chars/token, so anything over ~790k chars likely won't
|
||||
fit alongside the system + question + 512 max_output_tokens. Print
|
||||
warnings for any extraction that's at risk.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Directory one level above this script's folder.
REPO = Path(__file__).resolve().parents[1]
# JSONL manifest; each row has a ``doc_id`` and an ``extractions`` mapping of
# arm -> {"chars": ...} (see how ``main`` reads it).
MAP = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"

# Rough chars-per-token ratio used to convert the token budget to characters.
CHARS_PER_TOKEN = 4
CTX_TOKENS = 200_000
PROMPT_OVERHEAD_TOKENS = 1_000  # system + question + format hint
MAX_OUTPUT_TOKENS = 512
# Character budget left for the extraction: (200000 - 1000 - 512) * 4 = 793,952.
SAFE_CHARS = (CTX_TOKENS - PROMPT_OVERHEAD_TOKENS - MAX_OUTPUT_TOKENS) * CHARS_PER_TOKEN
|
||||
|
||||
|
||||
def main() -> None:
    """Report the largest extraction per arm and any that exceed ``SAFE_CHARS``."""
    manifest: list[dict] = []
    for raw_line in MAP.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            manifest.append(json.loads(raw_line))

    largest: dict[str, tuple[int, str]] = {}
    at_risk: list[tuple[str, str, int]] = []
    for entry in manifest:
        extractions = entry.get("extractions") or {}
        for arm, ext in extractions.items():
            size = int(ext.get("chars") or 0)
            best = largest.get(arm)
            # Strict comparison: the first doc seen keeps the title on ties.
            if best is None or best[0] < size:
                largest[arm] = (size, entry["doc_id"])
            if size > SAFE_CHARS:
                at_risk.append((entry["doc_id"], arm, size))

    print(f"PDFs in manifest: {len(manifest)}")
    print(f"safe char budget: {SAFE_CHARS:,} (~{(SAFE_CHARS // CHARS_PER_TOKEN):,} tokens)")
    print()
    print("largest extraction per arm:")
    for arm, (size, doc_id) in sorted(largest.items()):
        print(f" {arm:25s} {size:>10,} chars ({doc_id})")

    print()
    if not at_risk:
        print("no overflow risk — all extractions fit Sonnet 4.5's 200k context.")
        return
    print(f"OVERFLOW RISK ({len(at_risk)} extractions > safe budget):")
    for doc_id, arm, size in at_risk:
        est_tokens = size // CHARS_PER_TOKEN
        print(f" {doc_id} :: {arm} :: {size:,} chars (~{est_tokens:,} tokens)")


if __name__ == "__main__":
    main()
|
||||
77
surfsense_evals/scripts/check_uploaded_status.py
Normal file
77
surfsense_evals/scripts/check_uploaded_status.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
"""Query SurfSense for the status of every MMLongBench PDF in scope.
|
||||
|
||||
Uses the existing SurfSense documents client to query
|
||||
``/documents/status?document_ids=...`` for both the known-existing 5
|
||||
PDFs (doc ids 5219-5223) and the recently-uploaded mmlongbench batch
|
||||
(7577-7624 range).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
||||
# Directory one level above this script's folder; ``.env`` is loaded from here.
REPO = Path(__file__).resolve().parents[1]
# Local PDF cache whose file names are matched against document titles.
PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
|
||||
|
||||
|
||||
async def main() -> None:
    """Print the SurfSense status of every locally cached PDF.

    Reads ``SURFSENSE_API_BASE`` (default ``http://localhost:8000``) and
    ``SURFSENSE_JWT`` from the environment / ``.env``, issues one GET to
    ``/api/v1/documents/status`` for a fixed candidate id range, matches
    returned items to local files by ``title == file name``, and prints
    one line per PDF followed by a per-state tally.

    Raises:
        SystemExit: if ``SURFSENSE_JWT`` is not set.
        httpx.HTTPStatusError: if the status endpoint returns an error code.
    """
    load_dotenv(REPO / ".env")
    base = os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/")
    token = os.environ.get("SURFSENSE_JWT")
    if not token:
        raise SystemExit("SURFSENSE_JWT missing from .env")

    pdf_names = sorted(p.name for p in PDF_DIR.glob("*.pdf"))
    print(f"local cached PDFs: {len(pdf_names)}")

    # Hard-coded id windows: 5219-5223 plus 7577-7624.
    candidate_ids = list(range(5219, 5224)) + list(range(7577, 7625))

    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
    }
    async with httpx.AsyncClient(timeout=30.0) as http:
        r = await http.get(
            f"{base}/api/v1/documents/status",
            params={
                "search_space_id": 55,
                "document_ids": ",".join(str(d) for d in candidate_ids),
            },
            headers=headers,
        )
        r.raise_for_status()
        items = r.json().get("items", [])

    # Later items overwrite earlier ones that share a title.
    by_title: dict[str, dict] = {}
    for it in items:
        status = it.get("status") or {}
        by_title[it.get("title", "")] = {
            "id": it.get("id"),
            "state": status.get("state"),
            "reason": status.get("reason"),
        }

    by_state: dict[str, int] = {}
    print()
    for name in pdf_names:
        info = by_title.get(name)
        if info is None:
            print(f" [missing ] {name}")
            by_state["missing"] = by_state.get("missing", 0) + 1
        else:
            # str() keeps the width specs valid when the API omits ``id``
            # (None rejects ">5") or returns a non-string state.
            tag = str(info["state"] or "?")
            doc_id = str(info["id"])
            print(f" [{tag:13s}] doc_id={doc_id:>5} {name}")
            by_state[tag] = by_state.get(tag, 0) + 1
    print()
    print("summary:")
    for k, v in sorted(by_state.items()):
        print(f" {k}: {v}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
112
surfsense_evals/scripts/compute_adjusted_accuracy.py
Normal file
112
surfsense_evals/scripts/compute_adjusted_accuracy.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
"""Compute "intrinsic" accuracy by removing transient network errors.
|
||||
|
||||
A failure is *transient* if it's:
|
||||
* SSLError: SSL bad-record-mac (TLS hiccup)
|
||||
* Cloudflare 502 / 503 (provider-side load shedding)
|
||||
* empty_response with no error string and no other signal (likely
|
||||
connection reset mid-stream)
|
||||
* JSONDecodeError (parse error mid-stream)
|
||||
|
||||
A failure is *intrinsic* if it's a hard limit:
|
||||
* "exceeds .* limit" (size limits)
|
||||
* context_length errors
|
||||
* provider 400 with image / pdf decode failure
|
||||
* malformed-input failures
|
||||
|
||||
We re-compute accuracy with two denominators:
|
||||
* raw acc = correct / 171 (what the headline reports)
|
||||
* adjusted acc = correct / (171 - transient_failures) (intrinsic)
|
||||
|
||||
Outputs a table that we can drop straight into the blog.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# Directory one level above this script's folder.
REPO = Path(__file__).resolve().parents[1]
# Artifact directory of the specific parser_compare run being analysed.
RUN = REPO / "data" / "multimodal_doc" / "runs" / "2026-05-14T00-53-19Z" / "parser_compare"
RAW = RUN / "raw.jsonl"


# Lower-case substrings; ``classify`` lower-cases the error before matching.
# Any hit here marks the failure as transient and is checked BEFORE the
# intrinsic hints below.
TRANSIENT_HINTS = (
    "sslv3_alert_bad_record_mac",
    "ssl_alert_bad_record_mac",
    "ssl: ssl",
    "cloudflare",
    "error 502",
    "error 503",
    "bad gateway",
    "service unavailable",
    "gateway timeout",
    "jsondecodeerror",
)
# Lower-case substrings that mark a failure as an intrinsic (hard-limit) one.
INTRINSIC_HINTS = (
    "exceeds",
    "context_length",
    "context window",
    "could not process pdf",
    "could not process image",
)
|
||||
|
||||
|
||||
def classify(error: str | None, raw_text: str) -> str:
    """Label a row as transient, intrinsic, other, or ``"ok"``.

    Returns ``"transient_empty"`` when there is neither an error string
    nor any answer text; otherwise the first matching hint family wins
    (transient before intrinsic), then ``"other_error"`` for any
    remaining error, and ``"ok"`` when there is no error at all.
    """
    lowered = (error or "").lower()
    if not (lowered or raw_text.strip()):
        return "transient_empty"
    families = (
        ("transient_ssl_or_5xx", TRANSIENT_HINTS),
        ("intrinsic_limit", INTRINSIC_HINTS),
    )
    for label, hints in families:
        for hint in hints:
            if hint in lowered:
                return label
    return "other_error" if lowered else "ok"
|
||||
|
||||
|
||||
def main() -> None:
    """Print raw vs. transient-adjusted accuracy for each arm in ``RAW``."""
    records: list[dict] = []
    for raw_line in RAW.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            records.append(json.loads(raw_line))

    counter_names = (
        "n", "correct",
        "transient_ssl_or_5xx", "transient_empty",
        "intrinsic_limit", "other_error",
    )
    per_arm: dict[str, dict] = {}
    for rec in records:
        stats = per_arm.setdefault(rec["arm"], dict.fromkeys(counter_names, 0))
        stats["n"] += 1
        if (rec.get("graded") or {}).get("correct"):
            stats["correct"] += 1
        outcome = classify(rec.get("error"), rec.get("raw_text") or "")
        if outcome != "ok":
            stats[outcome] += 1

    print(f"{'arm':<25} {'raw acc%':>8} {'transient':>10} {'intrinsic':>10} {'other':>6} {'adj acc% (no transient)':>22}")
    print("-" * 88)
    for arm in sorted(per_arm):
        stats = per_arm[arm]
        raw = stats["correct"] / stats["n"] * 100
        transient = stats["transient_ssl_or_5xx"] + stats["transient_empty"]
        intrinsic = stats["intrinsic_limit"]
        other = stats["other_error"]
        # Adjusted denominator drops transient failures only.
        usable = stats["n"] - transient
        if usable:
            adj = stats["correct"] / usable * 100
        else:
            adj = 0
        print(
            f"{arm:<25} {raw:>7.1f}% {transient:>10} {intrinsic:>10} {other:>6} {adj:>21.1f}%"
        )

    print()
    print("transient = SSLError / 502 / 503 / empty stream / mid-stream JSON decode (would")
    print(" succeed on retry; eval harness has no built-in retry today).")
    print("intrinsic = hard limit (e.g. >30MB Anthropic request, model context overflow).")
    print("adj acc% = correct / (n - transient) — what the arm scores when network noise")
    print(" is removed; closest thing we have to a like-for-like quality number.")


if __name__ == "__main__":
    main()
|
||||
381
surfsense_evals/scripts/compute_blog_extras.py
Normal file
381
surfsense_evals/scripts/compute_blog_extras.py
Normal file
|
|
@ -0,0 +1,381 @@
|
|||
"""Compute the deeper statistics the blog needs: McNemar pairwise tests,
|
||||
per-PDF heterogeneity, latency/token distribution percentiles.
|
||||
|
||||
Reads the merged post-retry artifact:
|
||||
|
||||
data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl
|
||||
|
||||
Outputs to stdout:
|
||||
|
||||
1) Per-arm latency distribution (n, mean, std, p10, p25, p50, p75, p90, p95, p99, max).
|
||||
2) Per-arm input/output token distribution (mean, p50, p95, max).
|
||||
3) McNemar pairwise table: for every (arm_i, arm_j) ordered pair on the
|
||||
same 171 questions, count b_ij = #(arm_i correct & arm_j wrong) and
|
||||
b_ji = #(arm_i wrong & arm_j correct), and report the exact-binomial
|
||||
two-sided p-value. We include both raw (using the original raw.jsonl)
|
||||
and post-retry results.
|
||||
4) Per-PDF accuracy variance per arm (n_pdfs=30): mean, std, min, max.
|
||||
|
||||
Pure stdlib — no scipy/numpy.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
|
||||
# Directory one level above this script's folder.
REPO = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# I/O
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _read_jsonl(path: Path) -> list[dict]:
|
||||
out: list[dict] = []
|
||||
with path.open("r", encoding="utf-8") as fh:
|
||||
for line in fh:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
out.append(json.loads(line))
|
||||
return out
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Distribution helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _percentile(values: list[float], p: float) -> float:
|
||||
"""Linear-interpolation percentile (p in [0, 100])."""
|
||||
|
||||
if not values:
|
||||
return 0.0
|
||||
s = sorted(values)
|
||||
if len(s) == 1:
|
||||
return float(s[0])
|
||||
k = (len(s) - 1) * (p / 100.0)
|
||||
lo, hi = math.floor(k), math.ceil(k)
|
||||
if lo == hi:
|
||||
return float(s[int(k)])
|
||||
return float(s[lo] + (s[hi] - s[lo]) * (k - lo))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# McNemar exact-binomial p-value
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _binom_coef(n: int, k: int) -> int:
|
||||
if k < 0 or k > n:
|
||||
return 0
|
||||
return math.comb(n, k)
|
||||
|
||||
|
||||
def _mcnemar_exact_pvalue(b: int, c: int) -> float:
|
||||
"""Two-sided exact-binomial McNemar p-value.
|
||||
|
||||
Tests H0: P(arm_i wrong, arm_j right) == P(arm_i right, arm_j wrong)
|
||||
on discordant pairs only. Under H0 the count b ~ Bin(b+c, 0.5).
|
||||
The two-sided p-value is
|
||||
|
||||
P(X <= min(b, c)) + P(X >= max(b, c))
|
||||
|
||||
computed exactly (cheap because b+c <= 27 in our run).
|
||||
"""
|
||||
|
||||
n = b + c
|
||||
if n == 0:
|
||||
return 1.0
|
||||
k = min(b, c)
|
||||
# Two-sided exact: 2 * P(X <= k) clipped at 1.0
|
||||
cdf = sum(_binom_coef(n, i) for i in range(k + 1))
|
||||
p = 2.0 * cdf / (2 ** n)
|
||||
return min(1.0, p)
|
||||
|
||||
|
||||
def _mcnemar_table(rows: list[dict]) -> dict:
    """Build the pairwise McNemar table from graded rows.

    Rows are pivoted to ``{qid: {arm: correct}}`` (a later row for the
    same qid/arm overwrites an earlier one). For every unordered arm
    pair, questions answered by both arms are tallied into the four
    agreement cells and the exact p-value is attached.

    Returns:
        ``{"arms": sorted arm names, "n_qids": distinct qids seen,
        "pairs": [one dict per arm pair]}``.
    """
    outcomes: dict[str, dict[str, bool]] = {}
    for rec in rows:
        qid, arm = rec["qid"], rec["arm"]
        verdict = bool((rec.get("graded") or {}).get("correct"))
        outcomes.setdefault(qid, {})[arm] = verdict

    arms = sorted({arm for per_q in outcomes.values() for arm in per_q})
    pairs: list[dict] = []
    for pos, left in enumerate(arms):
        for right in arms[pos + 1:]:
            left_only = right_only = agree_ok = agree_bad = 0
            for per_q in outcomes.values():
                # Only questions that both arms answered are comparable.
                if not (left in per_q and right in per_q):
                    continue
                ok_left, ok_right = per_q[left], per_q[right]
                if ok_left and ok_right:
                    agree_ok += 1
                elif ok_left:
                    left_only += 1
                elif ok_right:
                    right_only += 1
                else:
                    agree_bad += 1
            pairs.append({
                "arm_i": left, "arm_j": right,
                "b_i_only": left_only, "c_j_only": right_only,
                "both_correct": agree_ok, "both_wrong": agree_bad,
                "p_value": _mcnemar_exact_pvalue(left_only, right_only),
            })
    return {"arms": arms, "n_qids": len(outcomes), "pairs": pairs}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-PDF heterogeneity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _per_pdf_stats(rows: list[dict]) -> dict[str, dict]:
    """Summarise, per arm, the spread of per-PDF accuracy.

    Per-PDF accuracy is correct/total over that PDF's rows for the arm.
    The result maps each arm to count, mean, sample std (0.0 for a single
    PDF), min/max, quartiles, and how many PDFs scored exactly 0 or 1.
    """
    grades: dict[str, dict[str, list[bool]]] = {}
    for rec in rows:
        arm, pdf = rec["arm"], rec["doc_id"]
        verdict = bool((rec.get("graded") or {}).get("correct"))
        grades.setdefault(arm, {}).setdefault(pdf, []).append(verdict)

    summary: dict[str, dict] = {}
    for arm, per_pdf in grades.items():
        accs = [sum(flags) / len(flags) for flags in per_pdf.values() if flags]
        if not accs:
            continue
        spread = statistics.stdev(accs) if len(accs) > 1 else 0.0
        summary[arm] = {
            "n_pdfs": len(accs),
            "mean": statistics.mean(accs),
            "std": spread,
            "min": min(accs),
            "max": max(accs),
            "p25": _percentile(accs, 25),
            "p50": _percentile(accs, 50),
            "p75": _percentile(accs, 75),
            "n_pdfs_zero": accs.count(0.0),
            "n_pdfs_perfect": accs.count(1.0),
        }
    return summary
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Latency / token distributions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _per_arm_latency(rows: list[dict]) -> dict[str, dict]:
    """Summarise ``latency_ms`` per arm, reported in seconds.

    Rows whose latency is missing or zero are ignored. Each arm gets n,
    mean, sample std, p10..p99, max, and the coefficient of variation.
    """
    samples: dict[str, list[float]] = {}
    for rec in rows:
        latency = rec.get("latency_ms")
        if latency is not None and latency != 0:
            samples.setdefault(rec["arm"], []).append(float(latency))

    report: dict[str, dict] = {}
    for arm, lats in samples.items():
        several = len(lats) > 1
        mean_ms = statistics.mean(lats)
        # Sample std needs at least two points.
        std_ms = statistics.stdev(lats) if several else None
        report[arm] = {
            "n": len(lats),
            "mean_s": mean_ms / 1000,
            "std_s": (std_ms / 1000) if several else 0.0,
            "p10_s": _percentile(lats, 10) / 1000,
            "p25_s": _percentile(lats, 25) / 1000,
            "p50_s": _percentile(lats, 50) / 1000,
            "p75_s": _percentile(lats, 75) / 1000,
            "p90_s": _percentile(lats, 90) / 1000,
            "p95_s": _percentile(lats, 95) / 1000,
            "p99_s": _percentile(lats, 99) / 1000,
            "max_s": max(lats) / 1000,
            # Coefficient of variation: std / mean (unitless tail-fatness).
            "cv": (std_ms / mean_ms) if several and mean_ms > 0 else 0.0,
        }
    return report
|
||||
|
||||
|
||||
def _per_arm_tokens(rows: list[dict]) -> dict[str, dict]:
    """Summarise ``input_tokens`` / ``output_tokens`` per arm.

    Missing or zero counts are ignored. Each arm maps to up to two
    sub-dicts, ``"input"`` and ``"output"``, holding n, mean, p50, p95
    and max; a side with no samples is omitted.
    """
    sides = (("input", "input_tokens"), ("output", "output_tokens"))
    samples: dict[str, dict[str, list[float]]] = {"input": {}, "output": {}}
    for rec in rows:
        for side, field in sides:
            count = rec.get(field) or 0
            if count:
                samples[side].setdefault(rec["arm"], []).append(float(count))

    def describe(vals: list[float]) -> dict:
        return {
            "n": len(vals),
            "mean": statistics.mean(vals),
            "p50": _percentile(vals, 50),
            "p95": _percentile(vals, 95),
            "max": max(vals),
        }

    report: dict[str, dict] = {}
    for arm in sorted(set(samples["input"]) | set(samples["output"])):
        entry = {
            side: describe(samples[side][arm])
            for side, _ in sides
            if samples[side].get(arm)
        }
        if entry:
            report[arm] = entry
    return report
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pretty-printing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _print_latency(title: str, lat: dict[str, dict]) -> None:
|
||||
print()
|
||||
print(title)
|
||||
print("-" * len(title))
|
||||
header = (f"{'arm':<25} {'n':>4} {'mean':>7} {'std':>7} "
|
||||
f"{'p50':>7} {'p90':>7} {'p95':>7} {'p99':>7} {'max':>7} {'CV':>5}")
|
||||
print(header)
|
||||
print("-" * len(header))
|
||||
for arm in sorted(lat, key=lambda a: lat[a]["mean_s"]):
|
||||
s = lat[arm]
|
||||
print(f"{arm:<25} {s['n']:>4} "
|
||||
f"{s['mean_s']:>6.1f}s {s['std_s']:>6.1f}s "
|
||||
f"{s['p50_s']:>6.1f}s {s['p90_s']:>6.1f}s {s['p95_s']:>6.1f}s "
|
||||
f"{s['p99_s']:>6.1f}s {s['max_s']:>6.1f}s {s['cv']:>5.2f}")
|
||||
|
||||
|
||||
def _print_tokens(title: str, toks: dict[str, dict]) -> None:
    """Print a per-arm token-usage table to stdout, arms in alphabetical order.

    Args:
        title: Heading printed above the table (underlined with dashes).
        toks: Maps arm name to an entry with optional ``"input"`` and
            ``"output"`` stats dicts. Input stats must provide ``mean``,
            ``p50``, ``p95`` and ``max``; output stats are read for ``mean``
            and ``p95`` and shown as 0 when missing.

    Arms without input stats are not printed.
    """
    in_cols = ("mean", "p50", "p95", "max")
    out_cols = ("mean", "p95")
    header = " ".join(
        [f"{'arm':<25}"]
        + [f"{'in ' + col:>9}" for col in in_cols]
        + [f"{'out ' + col:>9}" for col in out_cols]
    )

    print()
    print(title)
    print("-" * len(title))
    print(header)
    print("-" * len(header))

    for arm in sorted(toks):
        entry = toks[arm]
        in_stats = entry.get("input")
        if not in_stats:
            continue
        out_stats = entry.get("output") or {}
        cells = [f"{in_stats[col]:>9,.0f}" for col in in_cols]
        cells += [f"{out_stats.get(col, 0):>9,.0f}" for col in out_cols]
        print(f"{arm:<25} " + " ".join(cells))
|
||||
|
||||
|
||||
def _print_pdf_var(title: str, var: dict[str, dict]) -> None:
    """Print per-arm statistics of per-PDF accuracy, highest mean first.

    Args:
        title: Heading printed above the table (underlined with dashes).
        var: Maps arm name to a stats dict providing ``n_pdfs``, the
            fractional values ``mean``, ``std``, ``min``, ``p25``, ``p50``,
            ``p75``, ``max`` (rendered as percentages), and the counts
            ``n_pdfs_zero`` and ``n_pdfs_perfect``.
    """
    pct_keys = ("mean", "std", "min", "p25", "p50", "p75", "max")
    header = " ".join(
        [f"{'arm':<25}", f"{'n_pdfs':>7}"]
        + [f"{key:>7}" for key in pct_keys]
        + [f"{'#0%':>5}", f"{'#100%':>6}"]
    )

    print()
    print(title)
    print("-" * len(title))
    print(header)
    print("-" * len(header))

    # A stable descending sort keeps tied arms in their original dict order.
    ranked = sorted(var, key=lambda name: var[name]["mean"], reverse=True)
    for arm in ranked:
        stats = var[arm]
        percents = " ".join(f"{stats[key] * 100:>6.1f}%" for key in pct_keys)
        print(
            f"{arm:<25} {stats['n_pdfs']:>7} {percents} "
            f"{stats['n_pdfs_zero']:>5} {stats['n_pdfs_perfect']:>6}"
        )
|
||||
|
||||
|
||||
def _print_mcnemar(title: str, table: dict) -> None:
    """Print the pairwise McNemar results, smallest p-value first.

    Args:
        title: Heading printed above the table (underlined with dashes).
        table: Dict with ``n_qids`` and ``pairs``; each pair provides
            ``arm_i``, ``arm_j``, ``b_i_only``, ``c_j_only``,
            ``both_correct``, ``both_wrong`` and ``p_value``.

    The ``sig`` column shows ``***`` for p < 0.001, ``**`` for p < 0.01,
    ``*`` for p < 0.05 and is blank otherwise.
    """
    # Ordered from strictest to loosest so the first match wins.
    star_cutoffs = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
    header = " ".join([
        f"{'arm_i':<25}", f"{'arm_j':<25}", f"{'b':>4}", f"{'c':>4}",
        f"{'both ok':>8}", f"{'both wr':>8}", f"{'p (2-sided)':>13}", f"{'sig':>4}",
    ])

    print()
    print(title)
    print("-" * len(title))
    print(f"n_qids on which all arms have a graded row: {table['n_qids']}")
    print(header)
    print("-" * len(header))

    for pair in sorted(table["pairs"], key=lambda entry: entry["p_value"]):
        p_value = pair["p_value"]
        stars = next((mark for cutoff, mark in star_cutoffs if p_value < cutoff), "")
        cells = [
            f"{pair['arm_i']:<25}", f"{pair['arm_j']:<25}",
            f"{pair['b_i_only']:>4}", f"{pair['c_j_only']:>4}",
            f"{pair['both_correct']:>8}", f"{pair['both_wrong']:>8}",
            f"{p_value:>13.4f}", f"{stars:>4}",
        ]
        print(" ".join(cells))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> int:
    """Print latency, token, per-PDF and McNemar tables for one run.

    Reads ``raw.jsonl`` and ``raw_post_retry.jsonl`` from the run's
    ``parser_compare`` directory (selected with ``--run-id``).

    Returns:
        0 on success.

    Raises:
        SystemExit: If either input file is missing.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
    args = cli.parse_args()

    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
    raw_path = run_dir / "raw.jsonl"
    post_path = run_dir / "raw_post_retry.jsonl"
    if not (raw_path.exists() and post_path.exists()):
        raise SystemExit(
            "Missing raw.jsonl or raw_post_retry.jsonl. "
            "Run scripts/compute_post_retry_accuracy.py first."
        )

    raw_rows = _read_jsonl(raw_path)
    post_rows = _read_jsonl(post_path)

    print(f"Run: {args.run_id}")
    print(f"raw rows: {len(raw_rows)}, post-retry rows: {len(post_rows)}")

    # The distribution tables all use post-retry rows: those carry the
    # retry's own latency for recovered rows, whereas in the raw file a
    # recovered row has latency=0 because the harness recorded a failure.
    post_sections = (
        (_print_latency, "Per-arm latency (post-retry)", _per_arm_latency),
        (_print_tokens, "Per-arm token distribution (post-retry)", _per_arm_tokens),
        (_print_pdf_var, "Per-PDF accuracy heterogeneity (post-retry)", _per_pdf_stats),
    )
    for render, heading, compute in post_sections:
        render(heading, compute(post_rows))

    # McNemar is reported for both the raw and the post-retry outcome sets.
    mcnemar_inputs = (
        ("McNemar pairwise (RAW, no retries)", raw_rows),
        ("McNemar pairwise (POST-RETRY)", post_rows),
    )
    for heading, rows in mcnemar_inputs:
        _print_mcnemar(heading, _mcnemar_table(rows))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
180
surfsense_evals/scripts/compute_post_retry_accuracy.py
Normal file
180
surfsense_evals/scripts/compute_post_retry_accuracy.py
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
"""Recompute per-arm accuracy/F1 after merging retry survivors into raw.jsonl.
|
||||
|
||||
Reads:
|
||||
- data/multimodal_doc/runs/<run_id>/parser_compare/raw.jsonl
|
||||
- data/multimodal_doc/runs/<run_id>/parser_compare/raw_retries.jsonl
|
||||
|
||||
For each (arm, qid) present in the retry artifact:
|
||||
- if the retry RECOVERED, the retry row replaces the original row (same
|
||||
grader is reused — see ``mmlongbench/grader.py``);
|
||||
- if the retry did NOT recover, the original row stays (still a failure,
|
||||
so ``correct=False`` and ``f1=0``).
|
||||
|
||||
Prints two tables one after the other, followed by a per-arm delta:
|
||||
* Raw run (no retries) — matches §1 of the blog.
|
||||
* Post-retry run — final, "what would the headline have been if
|
||||
the harness had had retries from day one".
|
||||
|
||||
It also writes ``data/multimodal_doc/runs/<run_id>/parser_compare/raw_post_retry.jsonl``
|
||||
so any downstream notebook / report can join straight on it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
REPO = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def _read_jsonl(path: Path) -> list[dict]:
    """Parse a UTF-8 JSON Lines file into a list of objects.

    Lines that are empty after stripping whitespace are skipped.
    """
    with path.open("r", encoding="utf-8") as fh:
        stripped = (raw.strip() for raw in fh)
        return [json.loads(text) for text in stripped if text]
|
||||
|
||||
|
||||
def _row_key(row: dict) -> tuple[str, str]:
    """Return the ``(arm, qid)`` pair, both as strings, that identifies a row."""
    arm, qid = row["arm"], row["qid"]
    return str(arm), str(qid)
|
||||
|
||||
|
||||
def _is_failure(row: dict) -> bool:
    """Return True when the row has a truthy ``error`` or blank ``raw_text``."""
    if row.get("error"):
        return True
    text = row.get("raw_text") or ""
    return not text.strip()
|
||||
|
||||
|
||||
def _summarise(rows_by_arm: dict[str, list[dict]]) -> dict[str, dict]:
    """Aggregate accuracy, mean F1 and failure counts for each arm.

    Args:
        rows_by_arm: Maps arm name to that arm's result rows. Each row may
            carry a ``graded`` dict with ``correct`` and ``f1``.

    Returns:
        Maps arm name to a dict with ``n``, ``n_correct``, ``n_failures``,
        ``accuracy``, ``f1_mean`` and ``failure_rate``. The three ratios are
        0.0 for an arm with no rows.
    """
    out: dict[str, dict] = {}
    for arm, rows in rows_by_arm.items():
        # ``graded`` can be missing or an explicit JSON null; treat both as an
        # ungraded row (not correct, f1 = 0) instead of failing on ``None.get``.
        grades = [r.get("graded") or {} for r in rows]
        n = len(rows)
        n_correct = sum(1 for g in grades if g.get("correct"))
        f1_sum = sum(float(g.get("f1") or 0.0) for g in grades)
        n_fail = sum(1 for r in rows if _is_failure(r))
        out[arm] = {
            "n": n,
            "n_correct": n_correct,
            "n_failures": n_fail,
            "accuracy": (n_correct / n) if n else 0.0,
            "f1_mean": (f1_sum / n) if n else 0.0,
            "failure_rate": (n_fail / n) if n else 0.0,
        }
    return out
|
||||
|
||||
|
||||
def _print_table(title: str, summary: dict[str, dict]) -> None:
    """Print the per-arm accuracy summary to stdout, highest accuracy first.

    Args:
        title: Heading printed above the table (underlined with dashes).
        summary: Maps arm name to a dict providing ``n``, ``n_correct``,
            ``accuracy``, ``f1_mean``, ``n_failures`` and ``failure_rate``.
    """
    header = " ".join([
        f"{'arm':<25}", f"{'n':>4}", f"{'n_corr':>7}", f"{'acc':>7}",
        f"{'F1':>7}", f"{'fails':>6}", f"{'fail%':>7}",
    ])

    print()
    print(title)
    print("-" * len(title))
    print(header)
    print("-" * len(header))

    # A stable descending sort keeps tied arms in their original dict order.
    ranked = sorted(summary.items(), key=lambda item: item[1]["accuracy"], reverse=True)
    for arm, stats in ranked:
        cells = [
            f"{arm:<25}",
            f"{stats['n']:>4}",
            f"{stats['n_correct']:>7}",
            f"{stats['accuracy'] * 100:>6.1f}%",
            f"{stats['f1_mean'] * 100:>6.1f}%",
            f"{stats['n_failures']:>6}",
            f"{stats['failure_rate'] * 100:>6.1f}%",
        ]
        print(" ".join(cells))
|
||||
|
||||
|
||||
def main() -> int:
    """Merge recovered retries into the raw rows and report per-arm accuracy.

    Reads ``raw.jsonl`` and ``raw_retries.jsonl`` from the run directory
    selected with ``--run-id``, writes ``raw_post_retry.jsonl`` next to them,
    then prints the raw table, the post-retry table and a per-arm delta.

    Returns:
        0 on success, 1 when either input file is missing.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--run-id", default="2026-05-14T00-53-19Z")
    args = cli.parse_args()

    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
    raw_path = run_dir / "raw.jsonl"
    retry_path = run_dir / "raw_retries.jsonl"
    out_path = run_dir / "raw_post_retry.jsonl"

    for label, required in (("raw.jsonl", raw_path), ("raw_retries.jsonl", retry_path)):
        if not required.exists():
            print(f"{label} not found at {required}", file=sys.stderr)
            return 1

    raw_rows = _read_jsonl(raw_path)
    retry_by_key: dict[tuple[str, str], dict] = {}
    for retry_row in _read_jsonl(retry_path):
        # A later retry row for the same (arm, qid) overrides an earlier one.
        retry_by_key[_row_key(retry_row)] = retry_row

    merged_rows: list[dict] = []
    tally = {"recovered": 0, "still_failed": 0, "unchanged": 0}
    for row in raw_rows:
        retry = retry_by_key.get(_row_key(row))
        if retry is None:
            outcome, chosen = "unchanged", row
        elif retry.get("retry", {}).get("recovered"):
            # The retry row has the same shape as a raw row plus a "retry"
            # sub-object, so it can stand in for the original directly.
            outcome, chosen = "recovered", retry
        else:
            # Keep the original: it is the row the headline numbers were
            # computed from, and the failure verdict is the same either way.
            outcome, chosen = "still_failed", row
        merged_rows.append(chosen)
        tally[outcome] += 1

    # Persist the merged rows for downstream consumers.
    with out_path.open("w", encoding="utf-8") as fh:
        fh.writelines(json.dumps(merged) + "\n" for merged in merged_rows)

    def _bucket(rows: list[dict]) -> dict[str, list[dict]]:
        # Group rows by arm, preserving row order within each arm.
        grouped: dict[str, list[dict]] = {}
        for item in rows:
            grouped.setdefault(item["arm"], []).append(item)
        return grouped

    raw_summary = _summarise(_bucket(raw_rows))
    post_summary = _summarise(_bucket(merged_rows))

    print()
    print(f"Run: {args.run_id}")
    print(f"Replaced (retry recovered): {tally['recovered']}")
    print(f"Kept original (retry still failed): {tally['still_failed']}")
    print(f"Untouched rows: {tally['unchanged']}")
    print(f"Wrote merged artifact: {out_path.relative_to(REPO)}")

    _print_table("Raw run (no retries)", raw_summary)
    _print_table("Post-retry run (final)", post_summary)

    print()
    print("Delta (post-retry minus raw):")
    print(f"{'arm':<25} {'d_acc':>7} {'d_fails':>8}")
    print("-" * 42)
    for arm in sorted(raw_summary.keys() | post_summary.keys()):
        before = raw_summary.get(arm)
        after = post_summary.get(arm)
        if not (before and after):
            continue
        d_acc = (after["accuracy"] - before["accuracy"]) * 100
        d_fail = after["n_failures"] - before["n_failures"]
        print(f"{arm:<25} {d_acc:>+6.1f}p {d_fail:>+7d}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
"""Download CRAG Task 3's 4 .tar.bz2 parts in parallel.
|
||||
|
||||
Run once before ``ingest research crag_t3`` to avoid the ingest
|
||||
synchronously blocking on a 7 GB download. Skips parts already
|
||||
present and complete on disk.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
)
|
||||
log = logging.getLogger("download_task3")
|
||||
|
||||
|
||||
_BASE = (
|
||||
"https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
|
||||
"crag_task_3_dev_v4.tar.bz2.part"
|
||||
)
|
||||
_USER_AGENT = "SurfSense-Evals/0.1 (CRAG Task 3 fetch)"
|
||||
|
||||
|
||||
def _expected_size(url: str) -> int:
    """Return the ``content-length`` reported by a HEAD request to ``url``.

    Returns 0 when the response carries no ``content-length`` header.
    """
    head_request = urllib.request.Request(
        url,
        method="HEAD",
        headers={"User-Agent": _USER_AGENT},
    )
    with urllib.request.urlopen(head_request, timeout=30) as resp:
        length = resp.headers.get("content-length", 0)
    return int(length)
|
||||
|
||||
|
||||
def download_one(part: int, dest_dir: Path) -> Path:
    """Download one ``.tar.bz2`` part into ``dest_dir`` and return its path.

    The part is skipped when a file of the expected size (as reported by a
    HEAD request) is already present. Otherwise it is streamed to a
    temporary ``*.part_dl`` file and renamed into place only after the byte
    count matches the expected size.

    Args:
        part: Part number appended to the base URL and the file name.
        dest_dir: Existing directory that receives the file.

    Returns:
        Path of the downloaded (or already cached) part.

    Raises:
        OSError: If the stream ends before the expected number of bytes
            arrived. Errors raised by ``urllib`` propagate unchanged.
    """
    url = f"{_BASE}{part}"
    dest = dest_dir / f"crag_task_3_dev_v4.tar.bz2.part{part}"
    expected = _expected_size(url)
    if dest.exists() and dest.stat().st_size == expected:
        log.info("part%d: cached (%d bytes)", part, expected)
        return dest

    log.info("part%d: downloading %d bytes ...", part, expected)
    tmp = dest.with_suffix(dest.suffix + ".part_dl")
    started = time.monotonic()
    last_log = started
    downloaded = 0
    request = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=900) as resp, tmp.open("wb") as fh:
            # Read in 1 MiB chunks until the stream is exhausted.
            for chunk in iter(lambda: resp.read(1 << 20), b""):
                fh.write(chunk)
                downloaded += len(chunk)
                now = time.monotonic()
                if now - last_log > 5.0:
                    pct = 100 * downloaded / expected if expected else 0
                    rate_mb = (downloaded / (now - started)) / (1 << 20)
                    log.info(
                        "part%d: %5.1f%% (%.1f / %.1f MiB at %.1f MiB/s)",
                        part, pct, downloaded / (1 << 20), expected / (1 << 20), rate_mb,
                    )
                    last_log = now
        # A chunked read can end early without raising, so check the size
        # before promoting the temp file; otherwise a truncated part would be
        # reported as done. Skipped when the server sent no content-length.
        if expected and downloaded != expected:
            raise OSError(
                f"part{part}: truncated download ({downloaded} of {expected} bytes)"
            )
        tmp.replace(dest)
    finally:
        # After a successful replace the temp name is gone; on any failure
        # remove the partial file so it does not linger in the cache dir.
        if tmp.exists():
            tmp.unlink()

    elapsed = time.monotonic() - started
    log.info(
        "part%d: done in %.1fs (%.1f MiB/s avg)",
        part, elapsed, (downloaded / (1 << 20)) / max(elapsed, 0.001),
    )
    return dest
|
||||
|
||||
|
||||
def main() -> int:
    """Fetch the four CRAG Task 3 parts concurrently into the raw cache.

    Returns:
        0 when every part was downloaded (or was already cached), 1 as soon
        as one part is seen to have failed.
    """
    dest_dir = Path("data/research/crag_t3/.raw_cache")
    dest_dir.mkdir(parents=True, exist_ok=True)

    # One worker per part: a typical residential connection saturates around
    # 2 streams, and GitHub raw serves these fine in parallel.
    t_start = time.monotonic()
    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = {}
        for number in range(1, 5):
            pending[pool.submit(download_one, number, dest_dir)] = number
        for finished in as_completed(pending):
            number = pending[finished]
            try:
                finished.result()
            except Exception as exc:  # noqa: BLE001
                log.error("part%d failed: %s", number, exc)
                return 1
    log.info("All 4 parts downloaded in %.1fs", time.monotonic() - t_start)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
59
surfsense_evals/scripts/inspect_first30.py
Normal file
59
surfsense_evals/scripts/inspect_first30.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
"""Inspect what the first 30 MMLongBench-Doc PDFs would look like for scoping.
|
||||
|
||||
Run from surfsense_evals/ root via:
|
||||
python scripts/inspect_first30.py
|
||||
|
||||
Prints which docs are already ingested (existing 5), which are new (25 to
|
||||
upload), how many questions cover those 30 PDFs, and the answerable /
|
||||
unanswerable + format mix.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main() -> None:
    """Summarise the question mix for the first 30 MMLongBench-Doc PDFs.

    Reads ``data/multimodal_doc/mmlongbench/questions.jsonl`` relative to the
    current working directory, takes the first 30 ``doc_id`` values in sorted
    order and prints how many are new versus already ingested, the number of
    answerable/unanswerable questions, per-PDF averages, the answer-format
    mix, and the list of PDFs that still need ingesting.

    Counts and averages are derived from the documents actually found, so
    the report stays correct when fewer than 30 documents are present.
    """
    scope_limit = 30
    qpath = Path("data/multimodal_doc/mmlongbench/questions.jsonl")
    lines = qpath.read_text(encoding="utf-8").splitlines()
    rows = [json.loads(line) for line in lines if line.strip()]

    first30 = sorted({r["doc_id"] for r in rows})[:scope_limit]
    n_docs = len(first30)
    existing5 = {
        "05-03-18-political-release.pdf",
        "0b85477387a9d0cc33fca0f4becaa0e5.pdf",
        "0e94b4197b10096b1f4c699701570fbf.pdf",
        "11-21-16-Updated-Post-Election-Release.pdf",
        "12-15-15-ISIS-and-terrorism-release-final.pdf",
    }
    new25 = [d for d in first30 if d not in existing5]
    print(
        f"first {n_docs} docs (alphabetical) — {len(new25)} new, "
        f"{n_docs - len(new25)} already in SurfSense"
    )

    # Build the membership set once instead of once per question row.
    in_scope = set(first30)
    qs_in_30 = [r for r in rows if r["doc_id"] in in_scope]
    fmts = Counter((r.get("answer_format") or "").lower() for r in qs_in_30)
    unanswerable = fmts.get("none", 0)
    answerable = len(qs_in_30) - unanswerable

    # Average over the PDFs actually in scope; guard the empty-file case.
    divisor = n_docs or 1
    print(
        f"questions covering first {n_docs} docs: total={len(qs_in_30)} "
        f"answerable={answerable} unanswerable={unanswerable}"
    )
    print(
        f"avg Qs/PDF: {len(qs_in_30) / divisor:.1f} "
        f"answerable/PDF: {answerable / divisor:.1f}"
    )
    print(f"format mix in scope: {dict(fmts)}")
    print()
    print(f"{len(new25)} new PDFs to ingest:")
    for d in new25:
        print(f" - {d}")


if __name__ == "__main__":
    main()
|
||||
100
surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
Normal file
100
surfsense_evals/scripts/patch_manifest_for_parallel_ingest.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
"""Stub the mmlongbench manifest so parser_compare can extract in parallel.
|
||||
|
||||
The mmlongbench Surfsense ingest writes its manifest only at the very
|
||||
end of the upload pipeline (~hours of celery work). parser_compare's
|
||||
ingest, on the other hand, just needs a list of (doc_id, pdf_path)
|
||||
tuples to know which PDFs to extract — it doesn't care about the
|
||||
SurfSense ``document_id`` (the runner does, later, after a refresh).
|
||||
|
||||
This script extends the existing manifest with the *additional* PDFs
|
||||
that mmlongbench has already cached on disk (i.e. all 30 PDFs in
|
||||
``data/multimodal_doc/mmlongbench/pdfs/`` even though only 5 have
|
||||
SurfSense ``document_id``s yet) so parser_compare can run all four
|
||||
extractions for them in parallel with the SurfSense ingest.
|
||||
|
||||
After mmlongbench finishes, re-run::
|
||||
|
||||
python -m surfsense_evals ingest multimodal_doc parser_compare \
|
||||
--max-docs 30
|
||||
|
||||
…to refresh ``parser_compare_doc_map.jsonl`` with the now-populated
|
||||
``document_id`` values for the 25 new PDFs. The extractions
|
||||
themselves are cached on disk so the second pass is essentially free.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
REPO = Path(__file__).resolve().parents[1]
|
||||
MAP_PATH = REPO / "data" / "multimodal_doc" / "maps" / "mmlongbench_doc_map.jsonl"
|
||||
PDF_DIR = REPO / "data" / "multimodal_doc" / "mmlongbench" / "pdfs"
|
||||
QUESTIONS = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
|
||||
|
||||
|
||||
def _question_count_per_doc() -> dict[str, int]:
    """Count the rows in ``QUESTIONS`` for each ``doc_id``.

    Blank lines are skipped; every other line must be a JSON object with a
    ``doc_id`` key.
    """
    tally: dict[str, int] = {}
    with QUESTIONS.open("r", encoding="utf-8") as handle:
        records = (json.loads(text) for text in map(str.strip, handle) if text)
        for record in records:
            doc = record["doc_id"]
            tally[doc] = tally.get(doc, 0) + 1
    return tally
|
||||
|
||||
|
||||
def main() -> None:
    """Add stub manifest rows for cached PDFs that the manifest lacks.

    Rewrites ``MAP_PATH`` in place: the ``__settings__`` line (if any) is
    kept verbatim as the first line, followed by one JSON row per ``doc_id``
    in sorted order. Stub rows get ``document_id`` set to ``None``.

    Raises:
        SystemExit: If the manifest does not exist yet.
    """
    if not MAP_PATH.exists():
        raise SystemExit(
            f"manifest not found at {MAP_PATH} — "
            "run `surfsense_evals ingest multimodal_doc mmlongbench` first."
        )

    settings_line = None
    existing_rows: list[dict] = []
    for raw in MAP_PATH.read_text(encoding="utf-8").splitlines():
        text = raw.strip()
        if not text:
            continue
        parsed = json.loads(text)
        if "__settings__" in parsed:
            settings_line = text
        else:
            existing_rows.append(parsed)

    by_doc_id = {entry["doc_id"]: entry for entry in existing_rows}
    counts = _question_count_per_doc()

    cached_pdfs = sorted(PDF_DIR.glob("*.pdf"))
    print(f"existing manifest entries: {len(existing_rows)}")
    print(f"cached PDFs on disk: {len(cached_pdfs)}")

    n_before = len(by_doc_id)
    for pdf in cached_pdfs:
        # setdefault leaves rows that are already in the manifest untouched.
        by_doc_id.setdefault(pdf.name, {
            "doc_id": pdf.name,
            "document_id": None,
            "pdf_path": str(pdf),
            "n_questions": counts.get(pdf.name, 0),
        })
    added = len(by_doc_id) - n_before

    out_lines = [settings_line] if settings_line else []
    out_lines.extend(json.dumps(by_doc_id[doc_id]) for doc_id in sorted(by_doc_id))
    MAP_PATH.write_text("\n".join(out_lines) + "\n", encoding="utf-8")

    print(f"added {added} stub rows; manifest now has {len(by_doc_id)} PDFs")
    print(f"wrote: {MAP_PATH}")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main() -> int:
    """Print a human-readable dump of the CRAG Task 3 doc map.

    The map is read from ``data/research/maps/crag_t3_doc_map.jsonl``
    relative to the current working directory.

    Returns:
        0 after printing the map, 1 when the map file does not exist.
    """
    map_path = Path("data/research/maps/crag_t3_doc_map.jsonl")
    if not map_path.exists():
        print(f"Doc map missing: {map_path}")
        return 1

    parsed = [
        json.loads(text)
        for text in map_path.read_text(encoding="utf-8").splitlines()
        if text.strip()
    ]
    settings = {}
    rows = []
    for record in parsed:
        # A row containing "__settings__" is the header, not a question row.
        if "__settings__" in record:
            settings = record
        else:
            rows.append(record)

    print(f"Settings header: {settings}")
    print(f"Doc map rows: {len(rows)}")
    for entry in rows:
        print(f" qid={entry['qid']:<10} domain={entry['domain']:<8} qtype={entry['question_type']}")
        print(f" question: {entry['question'][:90]}")
        print(f" gold: {entry['gold_answer'][:90]}")
        print(
            f" pages: {len(entry['page_filenames'])} extracted, "
            f"{len(entry['document_ids'])} doc_ids, "
            f"{len(entry['missing_pages'])} missing"
        )
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
636
surfsense_evals/scripts/retry_failed_questions.py
Normal file
636
surfsense_evals/scripts/retry_failed_questions.py
Normal file
|
|
@ -0,0 +1,636 @@
|
|||
"""Retry only the failed (arm, question) pairs from a previous parser_compare run.
|
||||
|
||||
The original parser_compare run records one row per (arm, qid) in
|
||||
``raw.jsonl``. Some of those rows came back with transient transport
|
||||
errors (SSL alerts, gateway 502s, empty SSE streams) or empty
|
||||
``raw_text``. This script re-issues *only* those calls with exponential
|
||||
backoff so we can see how many recover.
|
||||
|
||||
Design constraints / choices:
|
||||
|
||||
* **No re-ingest.** All cached PDFs and parser-extracted markdown stay
|
||||
on disk. We rebuild ``ArmRequest`` objects from the existing manifest
|
||||
+ the original ``mmlongbench/questions.jsonl``.
|
||||
* **No SurfSense backend or celery required.** SurfSense had 0
|
||||
reported failures; this script will skip any ``surfsense_agentic``
|
||||
rows it encounters and warn rather than try to start the backend.
|
||||
* **Original ``raw.jsonl`` is never mutated.** Retries land in a
|
||||
sibling ``raw_retries.jsonl`` so the original artifact stays
|
||||
citeable.
|
||||
* **Idempotent.** Re-running this script re-tries the same set of
|
||||
failed rows from ``raw.jsonl``. If you want to merge survivor rows
|
||||
back in, do that as a separate aggregation step.
|
||||
|
||||
Usage:
|
||||
|
||||
python scripts/retry_failed_questions.py \
|
||||
--run-id 2026-05-14T00-53-19Z \
|
||||
--max-attempts 5 \
|
||||
--concurrency 2
|
||||
|
||||
Outputs (written next to the original raw.jsonl):
|
||||
|
||||
* ``raw_retries.jsonl`` — one line per retried (arm, qid). Each line
|
||||
carries the original error, every retry attempt's timing/error,
|
||||
and the final result (incl. grade) so you can drop it straight
|
||||
into a notebook.
|
||||
* ``raw_retries_summary.json`` — per-arm tried/recovered/still-failed
|
||||
counts and an aggregated retry-success rate.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
REPO = Path(__file__).resolve().parents[1]
|
||||
SRC = REPO / "src"
|
||||
if str(SRC) not in sys.path:
|
||||
sys.path.insert(0, str(SRC))
|
||||
|
||||
from dotenv import load_dotenv # noqa: E402
|
||||
|
||||
from surfsense_evals.core.arms import ( # noqa: E402
|
||||
ArmRequest,
|
||||
ArmResult,
|
||||
BareLlmArm,
|
||||
NativePdfArm,
|
||||
)
|
||||
from surfsense_evals.core.parse.freeform_answer import ( # noqa: E402
|
||||
extract_freeform_answer,
|
||||
)
|
||||
from surfsense_evals.core.providers.openrouter_chat import ( # noqa: E402
|
||||
OpenRouterChatProvider,
|
||||
)
|
||||
from surfsense_evals.core.providers.openrouter_pdf import ( # noqa: E402
|
||||
OpenRouterPdfProvider,
|
||||
PdfEngine,
|
||||
)
|
||||
from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade # noqa: E402
|
||||
from surfsense_evals.suites.multimodal_doc.parser_compare.prompt import ( # noqa: E402
|
||||
build_long_context_prompt,
|
||||
build_native_pdf_prompt,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("retry_failed_questions")
|
||||
|
||||
LC_ARMS = {
|
||||
"azure_basic_lc",
|
||||
"azure_premium_lc",
|
||||
"llamacloud_basic_lc",
|
||||
"llamacloud_premium_lc",
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _is_failure_row(row: dict[str, Any]) -> bool:
    """Return True when a row errored or came back with no usable text.

    An empty (or whitespace-only) ``raw_text`` is treated like an error
    because operationally it is the same outcome: the call returned nothing
    usable, it just was not surfaced as an exception.
    """
    if row.get("error"):
        return True
    text = row.get("raw_text") or ""
    return not text.strip()
|
||||
|
||||
|
||||
@dataclass
class FailedRow:
    """One failed (arm, question) row from ``raw.jsonl`` that is due a retry."""

    # Identity of the failed call.
    arm: str
    qid: str
    doc_id: str
    # Question metadata copied from the row ("" when the row lacks the field).
    answer_format: str
    gold: str
    # ``pages`` value from the row; 0 when absent or falsy.
    pages: int
    # ``document_id`` from the row, if any.
    document_id: int | None
    # ``error`` from the row; None/empty for the empty-``raw_text`` case.
    original_error: str | None
    # The full parsed row, kept so the retry artifact can cite the original.
    original_row: dict[str, Any]
|
||||
|
||||
|
||||
def _load_failed_rows(raw_path: Path) -> list[FailedRow]:
    """Read ``raw.jsonl`` and return only the rows that count as failures.

    Blank lines are skipped. A row is kept when ``_is_failure_row`` says so;
    optional fields fall back to ``""`` / ``0`` / ``None`` when missing.
    """
    with raw_path.open("r", encoding="utf-8") as fh:
        parsed = (json.loads(text) for text in map(str.strip, fh) if text)
        return [
            FailedRow(
                arm=str(row["arm"]),
                qid=str(row["qid"]),
                doc_id=str(row["doc_id"]),
                answer_format=str(row.get("answer_format") or ""),
                gold=str(row.get("gold") or ""),
                pages=int(row.get("pages") or 0),
                document_id=row.get("document_id"),
                original_error=row.get("error"),
                original_row=row,
            )
            for row in parsed
            if _is_failure_row(row)
        ]
|
||||
|
||||
|
||||
def _load_doc_map(map_path: Path) -> dict[str, dict[str, Any]]:
    """Load a doc-map manifest keyed by ``doc_id`` (as a string).

    Blank lines are skipped. A ``__settings__`` header row, as found in the
    other doc-map manifests, is skipped too, because it describes the
    manifest rather than a document and has no ``doc_id``. When a
    ``doc_id`` occurs more than once the last row wins.

    Raises:
        KeyError: If a non-settings row has no ``doc_id``.
    """
    out: dict[str, dict[str, Any]] = {}
    with map_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            row = json.loads(line)
            if "__settings__" in row:
                continue
            out[str(row["doc_id"])] = row
    return out
|
||||
|
||||
|
||||
def _load_question_text_index(
    questions_jsonl: Path,
) -> dict[tuple[str, int], dict[str, Any]]:
    """Index the question rows by ``(doc_id, per-document position)``.

    qids in raw.jsonl look like ``{doc_id}::Q{NNN}``, where NNN is the
    per-doc index. The index is rebuilt here by walking ``questions.jsonl``
    in file order and numbering each document's questions from 0 in order
    of appearance, which is intended to mirror the runner's ``per_doc_idx``
    logic in ``_select_questions``. Blank lines and rows without a
    ``doc_id`` are skipped and do not consume an index.
    """
    index: dict[tuple[str, int], dict[str, Any]] = {}
    seen_per_doc: dict[str, int] = {}
    with questions_jsonl.open("r", encoding="utf-8") as fh:
        for text in map(str.strip, fh):
            if not text:
                continue
            record = json.loads(text)
            doc_id = str(record.get("doc_id") or "")
            if doc_id:
                position = seen_per_doc.get(doc_id, 0)
                index[(doc_id, position)] = record
                seen_per_doc[doc_id] = position + 1
    return index
|
||||
|
||||
|
||||
def _qid_index(qid: str) -> int:
    """Return the per-doc question index encoded in a qid such as ``foo.pdf::Q007``.

    Raises:
        ValueError: If the part after the last ``::`` does not start with
            ``Q`` or is not followed by an integer.
    """
    suffix = qid.rsplit("::", 1)[-1]
    if suffix[:1] != "Q":
        raise ValueError(f"unexpected qid shape: {qid!r}")
    digits = suffix[1:]
    return int(digits)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Request building (mirrors runner.py exactly so prompts are byte-identical)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _build_native_request(
    qid: str, question: str, answer_format: str, pdf_path: Path,
    *, max_output_tokens: int,
) -> ArmRequest:
    """Build the request for the native-PDF arm.

    The prompt comes from ``build_native_pdf_prompt``; the PDF is attached
    as the only entry in ``pdf_paths`` and ``max_output_tokens`` is passed
    through as the ``max_tokens`` option.
    """
    prompt = build_native_pdf_prompt(question, answer_format=answer_format)
    options = {"max_tokens": max_output_tokens}
    return ArmRequest(
        question_id=qid,
        prompt=prompt,
        pdf_paths=[pdf_path],
        options=options,
    )
|
||||
|
||||
|
||||
def _build_lc_request(
    qid: str, question: str, answer_format: str, doc_id: str, md_path: Path,
) -> ArmRequest:
    """Build the request for a long-context arm from cached parser markdown.

    The markdown at ``md_path`` is read as UTF-8 and embedded in the prompt
    via ``build_long_context_prompt``, labelled with ``doc_id``.

    Raises:
        FileNotFoundError: If ``md_path`` does not exist.
    """
    if not md_path.exists():
        raise FileNotFoundError(
            f"Missing parser extraction at {md_path}; cannot retry LC arm."
        )
    document_markdown = md_path.read_text(encoding="utf-8")
    prompt = build_long_context_prompt(
        question,
        answer_format=answer_format,
        document_markdown=document_markdown,
        document_label=doc_id,
    )
    return ArmRequest(question_id=qid, prompt=prompt)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Retry driver
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class AttemptLog:
    """Timing and outcome of a single retry attempt."""

    # 1-based attempt number.
    attempt: int
    # UTC start time, formatted as ``%Y-%m-%dT%H:%M:%SZ``.
    started_iso: str
    # Wall time of the arm call in milliseconds.
    latency_ms: int
    # Error for this attempt; None/empty when the attempt succeeded.
    error: str | None
    # Length of the whitespace-stripped response text.
    raw_text_chars: int
|
||||
|
||||
|
||||
@dataclass
class RetryOutcome:
    """Result of retrying one (arm, qid) pair, with its attempt history."""

    # Identity of the retried call.
    arm: str
    qid: str
    # One entry per attempt made, in order.
    attempts: list[AttemptLog]
    # Result of the last attempt made, whether or not it succeeded.
    final_result: ArmResult
    # True when an attempt returned non-empty text without an error.
    recovered: bool
|
||||
|
||||
|
||||
async def _retry_one(
    arm_obj: Any, request: ArmRequest, *,
    arm_name: str,
    qid: str,
    max_attempts: int,
    base_delay: float,
    max_delay: float,
) -> RetryOutcome:
    """Call ``arm_obj.answer(request)`` until it yields text or attempts run out.

    An attempt succeeds when the result has no error and its stripped
    ``raw_text`` is non-empty; an error-free but empty response is recorded
    as an ``EmptyResponse`` error. Between failed attempts the coroutine
    sleeps for an exponentially growing delay (``base_delay * 2**(k-1)``,
    capped at ``max_delay``) scaled by a random factor in [0.5, 1.5).

    Args:
        arm_obj: Object exposing an awaitable ``answer(request)``.
        request: Request passed unchanged to every attempt.
        arm_name: Arm name, used for logging and in the outcome.
        qid: Question id, used for logging and in the outcome.
        max_attempts: Maximum number of calls to make; must be at least 1.
        base_delay: Delay in seconds before the second attempt (pre-jitter).
        max_delay: Upper bound in seconds on the pre-jitter delay.

    Returns:
        A ``RetryOutcome`` holding every attempt's log, the result of the
        last attempt made, and whether it recovered.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
    """
    # Validate explicitly rather than with ``assert``: assertions are removed
    # under ``python -O``, which would let an outcome without a result escape.
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")

    attempts: list[AttemptLog] = []
    for attempt in range(1, max_attempts + 1):
        started_iso = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        t0 = time.monotonic()
        result = await arm_obj.answer(request)
        latency_ms = int((time.monotonic() - t0) * 1000)
        raw_text = (result.raw_text or "").strip()
        attempt_error = result.error
        if not attempt_error and not raw_text:
            attempt_error = "EmptyResponse: stream ended with no text"
        attempts.append(AttemptLog(
            attempt=attempt,
            started_iso=started_iso,
            latency_ms=latency_ms,
            error=attempt_error,
            raw_text_chars=len(raw_text),
        ))
        # No error at this point implies non-empty text (see above).
        if not attempt_error:
            return RetryOutcome(
                arm=arm_name, qid=qid, attempts=attempts,
                final_result=result, recovered=True,
            )
        if attempt < max_attempts:
            delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
            # Jitter so concurrent retries do not hit the provider in lockstep.
            delay = delay * (0.5 + random.random())
            logger.info(
                "[%s::%s] attempt %d/%d failed (%s); sleeping %.1fs",
                arm_name, qid, attempt, max_attempts, attempt_error, delay,
            )
            await asyncio.sleep(delay)

    # The loop ran at least once, so ``result`` is the last failed attempt.
    return RetryOutcome(
        arm=arm_name, qid=qid, attempts=attempts,
        final_result=result, recovered=False,
    )
|
||||
|
||||
|
||||
async def _gather_with_limit(coros: list, *, concurrency: int) -> list[Any]:
|
||||
sem = asyncio.Semaphore(max(1, concurrency))
|
||||
|
||||
async def _wrap(coro):
|
||||
async with sem:
|
||||
return await coro
|
||||
|
||||
return await asyncio.gather(*(_wrap(c) for c in coros))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _run(args: argparse.Namespace) -> int:
    """Retry the failed rows of one parser_compare run and write the results.

    Loads failed rows from the run's ``raw.jsonl``, rebuilds the request for
    each ``native_pdf`` / long-context arm row, retries them with bounded
    concurrency, grades the final answers, then writes ``raw_retries.jsonl``
    and ``raw_retries_summary.json`` next to ``raw.jsonl`` and prints a
    per-arm recovery table.

    Args:
        args: Parsed CLI namespace (``run_id``, ``max_attempts``,
            ``base_delay``, ``max_delay``, ``concurrency``, ``llm_model``,
            ``pdf_engine``, ``max_output_tokens``, ``include_surfsense``).

    Returns:
        0 on completion, including the "nothing to retry" cases.

    Raises:
        SystemExit: If ``raw.jsonl``, the doc-map manifest, the questions
            file, or ``OPENROUTER_API_KEY`` is missing.
    """
    load_dotenv(REPO / ".env")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    run_dir = REPO / "data" / "multimodal_doc" / "runs" / args.run_id / "parser_compare"
    raw_path = run_dir / "raw.jsonl"
    if not raw_path.exists():
        raise SystemExit(f"raw.jsonl not found at {raw_path}")

    map_path = REPO / "data" / "multimodal_doc" / "maps" / "parser_compare_doc_map.jsonl"
    questions_jsonl = REPO / "data" / "multimodal_doc" / "mmlongbench" / "questions.jsonl"
    if not map_path.exists():
        raise SystemExit(f"parser_compare manifest not found at {map_path}")
    if not questions_jsonl.exists():
        raise SystemExit(f"mmlongbench questions not found at {questions_jsonl}")

    failed = _load_failed_rows(raw_path)
    if not failed:
        logger.info("No failed rows in %s — nothing to retry.", raw_path)
        return 0

    # SurfSense rows: warn and skip by default; we don't want to start the
    # backend just to defensively retry a 0-failure arm.
    surf_failed = [f for f in failed if f.arm == "surfsense_agentic"]
    if not surf_failed:
        logger.info("No surfsense_agentic failures; backend/celery not needed for this retry.")
    elif args.include_surfsense:
        # The rows stay in the retry set, so do not claim they are being
        # dropped or tell the user to pass a flag that is already set.
        logger.warning(
            "--include-surfsense set: keeping %d surfsense_agentic failures in the "
            "retry set; rows this script cannot build a request for are logged "
            "as unhandled and skipped.",
            len(surf_failed),
        )
    else:
        logger.warning(
            "Skipping %d surfsense_agentic failures; this script doesn't drive the backend. "
            "If you want those retried too, start backend + celery and rerun "
            "with --include-surfsense.",
            len(surf_failed),
        )
        failed = [f for f in failed if f.arm != "surfsense_agentic"]

    if not failed:
        logger.info("Nothing left to retry after filtering.")
        return 0

    by_arm_count: dict[str, int] = {}
    for f in failed:
        by_arm_count[f.arm] = by_arm_count.get(f.arm, 0) + 1
    logger.info(
        "Loaded %d failed rows across %d arms: %s",
        len(failed), len(by_arm_count),
        ", ".join(f"{a}={n}" for a, n in sorted(by_arm_count.items())),
    )

    doc_map = _load_doc_map(map_path)
    qtext_idx = _load_question_text_index(questions_jsonl)

    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        raise SystemExit("OPENROUTER_API_KEY missing from environment / .env")

    # Resolved once: the same endpoint is shared by every provider below.
    base_url = os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")

    native_provider = OpenRouterPdfProvider(
        api_key=api_key,
        base_url=base_url,
        model=args.llm_model,
        engine=PdfEngine(args.pdf_engine),
    )
    native_arm = NativePdfArm(
        provider=native_provider, max_output_tokens=args.max_output_tokens,
    )

    # One bare-LLM arm per long-context arm that actually has failures.
    lc_arms: dict[str, BareLlmArm] = {}
    for arm_name in sorted({f.arm for f in failed} & LC_ARMS):
        lc_provider = OpenRouterChatProvider(
            api_key=api_key,
            base_url=base_url,
            model=args.llm_model,
        )
        lc_arms[arm_name] = BareLlmArm(
            provider=lc_provider,
            max_output_tokens=args.max_output_tokens,
            name=arm_name,
        )

    # ``plan`` and ``coros`` are appended in lockstep so outcomes can be
    # zipped back onto the failed rows after the gather.
    coros: list = []
    plan: list[tuple[FailedRow, ArmRequest, Any]] = []

    for f in failed:
        # Look up the question text from questions.jsonl
        try:
            q_idx = _qid_index(f.qid)
        except Exception:
            logger.error("Bad qid %r — skipping", f.qid)
            continue
        qrow = qtext_idx.get((f.doc_id, q_idx))
        if qrow is None:
            logger.error(
                "Could not find question text for %s (idx %d) — skipping",
                f.doc_id, q_idx,
            )
            continue
        question_text = str(qrow.get("question") or "").strip()
        answer_format = str(qrow.get("answer_format") or f.answer_format or "").strip().lower()

        map_row = doc_map.get(f.doc_id)
        if map_row is None:
            logger.error("doc_id %s not in manifest — skipping", f.doc_id)
            continue

        if f.arm == "native_pdf":
            pdf_path = Path(map_row["pdf_path"])
            if not pdf_path.exists():
                logger.error("PDF missing on disk: %s — skipping", pdf_path)
                continue
            request = _build_native_request(
                f.qid, question_text, answer_format, pdf_path,
                max_output_tokens=args.max_output_tokens,
            )
            arm_obj = native_arm
        elif f.arm in LC_ARMS:
            ext_blob = (map_row.get("extractions") or {}).get(f.arm) or {}
            md_path_str = ext_blob.get("markdown_path")
            if not md_path_str or ext_blob.get("status") != "ok":
                logger.error(
                    "Missing extraction for %s on %s — cannot retry; skipping",
                    f.arm, f.doc_id,
                )
                continue
            request = _build_lc_request(
                f.qid, question_text, answer_format, f.doc_id, Path(md_path_str),
            )
            arm_obj = lc_arms[f.arm]
        else:
            logger.warning("Unhandled arm %s — skipping", f.arm)
            continue

        plan.append((f, request, arm_obj))
        coros.append(_retry_one(
            arm_obj, request,
            arm_name=f.arm, qid=f.qid,
            max_attempts=args.max_attempts,
            base_delay=args.base_delay,
            max_delay=args.max_delay,
        ))

    if not coros:
        logger.warning("Nothing to retry after request building.")
        return 0

    logger.info(
        "Retrying %d failed rows with up to %d attempts each "
        "(base_delay=%.1fs, max_delay=%.1fs, concurrency=%d).",
        len(coros), args.max_attempts, args.base_delay, args.max_delay,
        args.concurrency,
    )

    started = time.monotonic()
    outcomes: list[RetryOutcome] = await _gather_with_limit(
        coros, concurrency=args.concurrency,
    )
    elapsed = time.monotonic() - started
    logger.info("Retry pass finished in %.1fs.", elapsed)

    out_path = run_dir / "raw_retries.jsonl"
    summary_path = run_dir / "raw_retries_summary.json"

    per_arm_recovered: dict[str, int] = {}
    per_arm_total: dict[str, int] = {}
    per_arm_attempts_dist: dict[str, list[int]] = {}

    with out_path.open("w", encoding="utf-8") as fh:
        for (f, _req, _arm_obj), outcome in zip(plan, outcomes, strict=True):
            per_arm_total[outcome.arm] = per_arm_total.get(outcome.arm, 0) + 1
            if outcome.recovered:
                per_arm_recovered[outcome.arm] = (
                    per_arm_recovered.get(outcome.arm, 0) + 1
                )
            per_arm_attempts_dist.setdefault(outcome.arm, []).append(
                len(outcome.attempts)
            )

            # Grade whatever the last attempt returned, recovered or not.
            g = grade(
                pred=extract_freeform_answer(outcome.final_result.raw_text or ""),
                gold=f.gold,
                answer_format=f.answer_format,
            )
            row = {
                "qid": f.qid,
                "doc_id": f.doc_id,
                "arm": f.arm,
                "answer_format": f.answer_format,
                "gold": f.gold,
                "pages": f.pages,
                "document_id": f.document_id,
                "original_error": f.original_error,
                "retry": {
                    "max_attempts": args.max_attempts,
                    "n_attempts": len(outcome.attempts),
                    "recovered": outcome.recovered,
                    "attempts": [
                        {
                            "attempt": a.attempt,
                            "started_iso": a.started_iso,
                            "latency_ms": a.latency_ms,
                            "error": a.error,
                            "raw_text_chars": a.raw_text_chars,
                        }
                        for a in outcome.attempts
                    ],
                },
                **outcome.final_result.to_jsonl(),
                "graded": {
                    "correct": g.correct,
                    "f1": g.f1,
                    "method": g.method,
                    "normalised_pred": g.normalised_pred,
                    "normalised_gold": g.normalised_gold,
                },
            }
            fh.write(json.dumps(row) + "\n")

    summary = {
        "run_id": args.run_id,
        "raw_retries_path": str(out_path.relative_to(REPO)),
        "n_failed_rows_input": len(failed),
        "n_retried": len(coros),
        "elapsed_s": round(elapsed, 1),
        "config": {
            "max_attempts": args.max_attempts,
            "base_delay": args.base_delay,
            "max_delay": args.max_delay,
            "concurrency": args.concurrency,
            "llm_model": args.llm_model,
            "pdf_engine": args.pdf_engine,
            "max_output_tokens": args.max_output_tokens,
        },
        "per_arm": {
            arm: {
                "tried": per_arm_total.get(arm, 0),
                "recovered": per_arm_recovered.get(arm, 0),
                "still_failed": (
                    per_arm_total.get(arm, 0) - per_arm_recovered.get(arm, 0)
                ),
                "recovery_rate": (
                    per_arm_recovered.get(arm, 0) / per_arm_total[arm]
                    if per_arm_total.get(arm) else 0.0
                ),
                "attempts_distribution": sorted(per_arm_attempts_dist.get(arm, [])),
            }
            for arm in sorted(per_arm_total)
        },
        "totals": {
            "tried": sum(per_arm_total.values()),
            "recovered": sum(per_arm_recovered.values()),
            "still_failed": sum(per_arm_total.values()) - sum(per_arm_recovered.values()),
        },
    }
    summary_path.write_text(
        json.dumps(summary, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )

    print()
    print("=" * 78)
    print("Retry pass summary")
    print("=" * 78)
    header = f"{'arm':<25} {'tried':>6} {'recovered':>10} {'still fail':>11} {'rate':>7}"
    print(header)
    print("-" * len(header))
    for arm in sorted(per_arm_total):
        tried = per_arm_total[arm]
        rec = per_arm_recovered.get(arm, 0)
        rate = (rec / tried * 100) if tried else 0.0
        print(f"{arm:<25} {tried:>6} {rec:>10} {tried - rec:>11} {rate:>6.1f}%")
    total = sum(per_arm_total.values())
    rec_total = sum(per_arm_recovered.values())
    rate_total = (rec_total / total * 100) if total else 0.0
    print("-" * len(header))
    print(f"{'TOTAL':<25} {total:>6} {rec_total:>10} {total - rec_total:>11} "
          f"{rate_total:>6.1f}%")
    print()
    print(f"Wrote {out_path.relative_to(REPO)}")
    print(f"Wrote {summary_path.relative_to(REPO)}")
    return 0
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the CLI flags and run the retry pass.

    Exits the process with the integer returned by ``_run``. An invalid
    ``--max-attempts`` is rejected through argparse (usage message, exit
    status 2) before any work starts.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--run-id", default="2026-05-14T00-53-19Z",
        help="Run timestamp under data/multimodal_doc/runs/. Default is the "
             "n=171 production run we wrote up in the blog.",
    )
    parser.add_argument("--max-attempts", type=int, default=5)
    parser.add_argument("--base-delay", type=float, default=1.0,
                        help="Base seconds for exponential backoff (default 1s).")
    parser.add_argument("--max-delay", type=float, default=30.0,
                        help="Cap on per-retry sleep (default 30s).")
    parser.add_argument("--concurrency", type=int, default=2,
                        help="Parallel retries in flight (default 2 — keep low "
                             "to avoid the same transport stress that caused "
                             "the original failures).")
    parser.add_argument("--llm-model", default="anthropic/claude-sonnet-4.5")
    parser.add_argument("--pdf-engine", default="native",
                        choices=[e.value for e in PdfEngine])
    parser.add_argument("--max-output-tokens", type=int, default=512)
    parser.add_argument(
        "--include-surfsense", action="store_true",
        help="Also retry surfsense_agentic failures (requires backend + celery up). "
             "Default is to skip them since the n=171 run had 0 SurfSense failures.",
    )
    args = parser.parse_args()

    # With fewer than one attempt no request is ever sent and there is no
    # result to record, so fail fast with a usage error instead of letting
    # every retry coroutine blow up mid-run.
    if args.max_attempts < 1:
        parser.error("--max-attempts must be >= 1")

    raise SystemExit(asyncio.run(_run(args)))


if __name__ == "__main__":
    main()
|
||||
122
surfsense_evals/scripts/summarise_parser_compare_run.py
Normal file
122
surfsense_evals/scripts/summarise_parser_compare_run.py
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
"""Slice the parser_compare raw.jsonl for the n=171 run.
|
||||
|
||||
Reports per-arm:
|
||||
* tokens & cost stats (input/output mean, $/Q distribution)
|
||||
* failures (status != ok or empty raw_text)
|
||||
* answer_format breakdown (accuracy by str/int/float/list)
|
||||
|
||||
Plus surfsense agentic breakdown so we can compare apples to apples
|
||||
even though the new_chat SSE doesn't surface per-call token counts.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import statistics
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Directory that contains this script's parent folder.
REPO = Path(__file__).resolve().parents[1]
# Output directory of the parser_compare run this script summarises.
RUN_DIR = REPO.joinpath(
    "data", "multimodal_doc", "runs", "2026-05-14T00-53-19Z", "parser_compare",
)
# Per-question result rows, one JSON object per line.
RAW = RUN_DIR.joinpath("raw.jsonl")
# Aggregated run metrics.
ARTIFACT = RUN_DIR.joinpath("run_artifact.json")
|
||||
|
||||
|
||||
def main() -> None:
    """Print per-arm statistics for the run's ``raw.jsonl``.

    Emits three sections on stdout: a per-arm table (accuracy, F1 taken from
    ``run_artifact.json``, failure count, cost, token and latency stats), an
    accuracy-by-``answer_format`` table, and the aggregated cost figures from
    ``run_artifact.json``.

    The ``fail`` column counts rows that carry an ``error`` plus rows whose
    ``raw_text`` is empty.
    """
    rows = [
        json.loads(line)
        for line in RAW.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]
    print(f"raw rows: {len(rows)}")
    print(f"unique questions: {len({row['qid'] for row in rows})}")

    arm_metrics: dict[str, dict] = defaultdict(lambda: {
        "n": 0, "n_correct": 0, "n_failed": 0, "n_empty": 0,
        "costs": [], "in_tokens": [], "out_tokens": [], "latency_ms": [],
        "by_format": defaultdict(lambda: {"n": 0, "correct": 0}),
    })

    for row in rows:
        m = arm_metrics[row["arm"]]
        m["n"] += 1
        is_correct = bool((row.get("graded") or {}).get("correct"))
        if is_correct:
            m["n_correct"] += 1

        # An explicit error and a silent empty response are tracked
        # separately; both count as failures in the table below.
        if row.get("error"):
            m["n_failed"] += 1
        elif not (row.get("raw_text") or "").strip():
            m["n_empty"] += 1

        cost = row.get("cost_usd")
        if cost is not None:
            m["costs"].append(float(cost))
        ut = row.get("usage") or {}
        if ut.get("prompt_tokens"):
            m["in_tokens"].append(ut["prompt_tokens"])
        if ut.get("completion_tokens"):
            m["out_tokens"].append(ut["completion_tokens"])
        if row.get("latency_ms"):
            m["latency_ms"].append(row["latency_ms"])

        fmt_stats = m["by_format"][row.get("answer_format") or "unknown"]
        fmt_stats["n"] += 1
        if is_correct:
            fmt_stats["correct"] += 1

    print()
    print("=" * 100)
    print(f"{'arm':<25} {'n':>4} {'acc%':>6} {'F1%':>6} {'fail':>5} {'$ mean':>10} {'$ median':>10} {'in tok mean':>12} {'out tok mean':>12} {'p50 ms':>8}")
    print("=" * 100)
    art = json.loads(ARTIFACT.read_text(encoding="utf-8"))
    per_arm_art = art["metrics"]["per_arm"]
    for arm, m in sorted(arm_metrics.items()):
        acc = m["n_correct"] / m["n"] * 100
        # Empty responses are failures too; reporting only n_failed would
        # leave n_empty collected but never shown.
        fail = m["n_failed"] + m["n_empty"]
        cost_mean = statistics.mean(m["costs"]) if m["costs"] else 0.0
        cost_med = statistics.median(m["costs"]) if m["costs"] else 0.0
        in_mean = statistics.mean(m["in_tokens"]) if m["in_tokens"] else 0
        out_mean = statistics.mean(m["out_tokens"]) if m["out_tokens"] else 0
        lat_p50 = statistics.median(m["latency_ms"]) if m["latency_ms"] else 0
        f1 = per_arm_art.get(arm, {}).get("f1_mean", 0.0) * 100
        print(
            f"{arm:<25} {m['n']:>4} {acc:>5.1f}% {f1:>5.1f}% {fail:>5} "
            f"${cost_mean:>9.4f} ${cost_med:>9.4f} {in_mean:>12.0f} {out_mean:>12.0f} {lat_p50:>8.0f}"
        )

    print()
    print("by answer_format (accuracy):")
    formats = sorted({f for m in arm_metrics.values() for f in m["by_format"]})
    # A data cell renders as "NNNNN% (cc/nn)", i.e. 14 characters, so the
    # header and the "-" placeholder use the same width to keep columns aligned.
    cell_width = 14
    header = f"{'arm':<25} " + " ".join(f"{f:>{cell_width}}" for f in formats)
    print(header)
    print("-" * len(header))
    for arm, m in sorted(arm_metrics.items()):
        cells = []
        for f in formats:
            stats = m["by_format"].get(f)
            if not stats or stats["n"] == 0:
                cells.append(f"{'-':>{cell_width}}")
            else:
                pct = stats["correct"] / stats["n"] * 100
                cells.append(f"{pct:>5.0f}% ({stats['correct']:>2}/{stats['n']:>2})")
        print(f"{arm:<25} " + " ".join(cells))

    print()
    print("=" * 100)
    print("Aggregated cost (from run_artifact.json):")
    for arm, row in per_arm_art.items():
        print(
            f" {arm:<25} acc={row['accuracy']*100:5.1f}% "
            f" $/Q LLM={row['llm_cost_per_q']:.4f} "
            f" preprocess total=${row['preprocess_cost_total']:.2f} "
            f" $/Q total={row['total_cost_per_q']:.4f}"
        )


if __name__ == "__main__":
    main()
|
||||
155
surfsense_evals/scripts/test_context_overflow_hypothesis.py
Normal file
155
surfsense_evals/scripts/test_context_overflow_hypothesis.py
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
"""Test the hypothesis: were the LC-arm errors actually context-window
|
||||
overflow errors disguised as SSL / network failures?
|
||||
|
||||
If true, we'd expect:
|
||||
(a) literal "prompt is too long" / "context_length_exceeded" / "exceeds .* tokens" strings,
|
||||
(b) failures correlated with extraction size / input_tokens (large doc -> failure),
|
||||
(c) failing requests near or over Sonnet 4.5's 200k input-token limit.
|
||||
|
||||
If false (transport-layer hypothesis), we'd expect:
|
||||
(a) only SSL / 502 / empty stream / JSONDecode strings,
|
||||
(b) failures NOT correlated with size (uniform across PDFs by time, not by tokens),
|
||||
(c) failing requests well below the 200k limit.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import statistics
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# Directory that contains this script's parent folder.
REPO = Path(__file__).resolve().parents[1]
# Output directory of the parser_compare run under test.
RUN = REPO.joinpath(
    "data", "multimodal_doc", "runs", "2026-05-14T00-53-19Z", "parser_compare",
)
# Per-question result rows, one JSON object per line.
RAW = RUN.joinpath("raw.jsonl")
# Per-document manifest; read here for the per-arm extraction sizes.
MANIFEST = REPO.joinpath(
    "data", "multimodal_doc", "maps", "parser_compare_doc_map.jsonl",
)

# Lower-case substrings that mark an error message as a context-window
# overflow; matched against the lower-cased error text.
CONTEXT_HINTS = (
    "context_length",
    "context window",
    "prompt is too long",
    "exceeds",
    "maximum context",
    "input tokens",
    "too many tokens",
    "over the maximum",
    "200000",
    "200_000",
)
|
||||
|
||||
|
||||
def main() -> None:
    """Print the four diagnostics (a)-(d) for the context-overflow hypothesis.

    (a) lists rows whose error text contains any ``CONTEXT_HINTS`` substring,
    (b) compares extraction sizes of OK vs failed rows per parser arm,
    (c) shows the largest extraction each arm handled vs failed on, and
    (d) dumps every llamacloud_premium_lc row for ``3M_2018_10K.pdf``.
    """

    def _banner(title: str) -> None:
        # Section heading framed by two 80-character rules.
        rule = "=" * 80
        print(rule)
        print(title)
        print(rule)

    records = [
        json.loads(text)
        for text in RAW.read_text(encoding="utf-8").splitlines()
        if text.strip()
    ]

    # (doc_id, arm) -> extraction size in characters, taken from the manifest.
    manifest_entries = (
        json.loads(text)
        for text in MANIFEST.read_text(encoding="utf-8").splitlines()
        if text.strip()
    )
    extraction_size: dict[tuple[str, str], int] = {
        (entry["doc_id"], arm_name): int(ext.get("chars") or 0)
        for entry in manifest_entries
        for arm_name, ext in (entry.get("extractions") or {}).items()
    }

    _banner("(a) Literal 'context window' / 'prompt too long' error strings?")
    matches = 0
    for rec in records:
        message = (rec.get("error") or "").lower()
        if message and any(hint in message for hint in CONTEXT_HINTS):
            print(f" {rec['arm']:<25} {rec['qid']:<50}")
            print(f" -> {message[:240]}")
            matches += 1
    if matches == 0:
        print(" none found.")

    print()
    _banner("(b) Extraction size for OK vs FAILED rows per arm")
    arm_buckets: dict[str, dict[str, list[int]]] = defaultdict(
        lambda: {"ok": [], "fail": []}
    )
    parser_arms = (
        "azure_basic_lc", "azure_premium_lc",
        "llamacloud_basic_lc", "llamacloud_premium_lc",
    )
    for rec in records:
        arm_name = rec["arm"]
        if arm_name in parser_arms:
            n_chars = extraction_size.get((rec["doc_id"], arm_name), 0)
            # A row is OK only with no error and non-blank response text.
            has_text = bool((rec.get("raw_text") or "").strip())
            outcome = "ok" if (not rec.get("error") and has_text) else "fail"
            arm_buckets[arm_name][outcome].append(n_chars)

    print(f"{'arm':<25} {'bucket':<5} {'n':>4} {'mean chars':>12} {'median':>10} {'max':>10}")
    for arm in parser_arms:
        for bucket in ("ok", "fail"):
            sizes = arm_buckets[arm][bucket]
            if sizes:
                mean_chars = statistics.mean(sizes)
                median_chars = statistics.median(sizes)
                print(
                    f" {arm:<23} {bucket:<5} {len(sizes):>4} "
                    f"{mean_chars:>12,.0f} "
                    f"{median_chars:>10,.0f} "
                    f"{max(sizes):>10,}"
                )
            else:
                print(f" {arm:<23} {bucket:<5} {0:>4} -")

    print()
    _banner("(c) Largest extraction each arm processed *successfully* vs *failed*")
    print(
        "(Sonnet 4.5 input limit ~200k tokens ~= 800k chars. If failures were "
        "context-overflow, max-OK would be near that cap. If max-OK is well "
        "above max-FAIL, the model handled bigger contexts than the failed "
        "ones, so size cannot be the cause.)"
    )
    print()
    for arm in parser_arms:
        ok_sizes = arm_buckets[arm]["ok"]
        fail_sizes = arm_buckets[arm]["fail"]
        if ok_sizes:
            max_ok = max(ok_sizes)
            max_fail = max(fail_sizes) if fail_sizes else 0
            print(
                f" {arm:<25} max OK = {max_ok:>10,} chars (~{max_ok / 4:>7,.0f} tokens) "
                f"max FAIL = {max_fail:>10,} chars (~{max_fail / 4:>7,.0f} tokens)"
            )

    print()
    _banner("(d) Did the *known* overflow candidate fail?")
    print(
        " 3M_2018_10K x llamacloud_premium = 908,733 chars (~227k tokens) "
        "-- this is above Sonnet 4.5's 200k window."
    )
    print(" If transport hypothesis is correct, this should still fail with a "
          "real overflow error.")
    print(" If transport hypothesis is correct AND the model truncates silently, "
          "it might 'succeed' but be wrong.")
    print()
    for rec in records:
        if rec["doc_id"] == "3M_2018_10K.pdf" and rec["arm"] == "llamacloud_premium_lc":
            err = rec.get("error") or "(none)"
            graded = rec.get("graded") or {}
            print(
                f" {rec['qid']:<40} correct={graded.get('correct')!s:<5} "
                f"err={err[:100]}"
            )


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue