chore: evals

2026-07-02 22:01:05 +02:00 · 2026-05-13 14:02:26 -07:00 · 2026-05-13 14:02:26 -07:00 · 3737118050
commit 3737118050
parent 2402b730fa
122 changed files with 22598 additions and 13 deletions
--- a/surfsense_evals/scripts/download_crag_task3.py
+++ b/surfsense_evals/scripts/download_crag_task3.py
@ -0,0 +1,97 @@
+"""Download CRAG Task 3's 4 .tar.bz2 parts in parallel.
+
+Run once before ``ingest research crag_t3`` to avoid the ingest
+synchronously blocking on a 7 GB download. Skips parts already
+present and complete on disk.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+import urllib.request
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+logging.basicConfig(  # root logger: INFO and above, formatted as timestamp + level + message
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+)
+log = logging.getLogger("download_task3")  # logger used by download_one() and main()
+
+
+_BASE = (  # URL prefix shared by every part; download_one() appends the part number
+    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
+    "crag_task_3_dev_v4.tar.bz2.part"
+)
+_USER_AGENT = "SurfSense-Evals/0.1 (CRAG Task 3 fetch)"  # User-Agent header sent with the HEAD and GET requests
+
+
+def _expected_size(url: str) -> int:
+    req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": _USER_AGENT})
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return int(resp.headers.get("content-length", 0))
+
+
+def download_one(part: int, dest_dir: Path) -> Path:
+    url = f"{_BASE}{part}"
+    dest = dest_dir / f"crag_task_3_dev_v4.tar.bz2.part{part}"
+    expected = _expected_size(url)
+    if dest.exists() and dest.stat().st_size == expected:
+        log.info("part%d: cached (%d bytes)", part, expected)
+        return dest
+    log.info("part%d: downloading %d bytes ...", part, expected)
+    tmp = dest.with_suffix(dest.suffix + ".part_dl")
+    started = time.monotonic()
+    last_log = started
+    with urllib.request.urlopen(
+        urllib.request.Request(url, headers={"User-Agent": _USER_AGENT}),
+        timeout=900,
+    ) as resp, tmp.open("wb") as fh:
+        downloaded = 0
+        chunk = resp.read(1 << 20)
+        while chunk:
+            fh.write(chunk)
+            downloaded += len(chunk)
+            now = time.monotonic()
+            if now - last_log > 5.0:
+                pct = 100 * downloaded / expected if expected else 0
+                rate_mb = (downloaded / (now - started)) / (1 << 20)
+                log.info(
+                    "part%d: %5.1f%% (%.1f / %.1f MiB at %.1f MiB/s)",
+                    part, pct, downloaded / (1 << 20), expected / (1 << 20), rate_mb,
+                )
+                last_log = now
+            chunk = resp.read(1 << 20)
+    tmp.replace(dest)
+    elapsed = time.monotonic() - started
+    log.info(
+        "part%d: done in %.1fs (%.1f MiB/s avg)",
+        part, elapsed, (expected / (1 << 20)) / max(elapsed, 0.001),
+    )
+    return dest
+
+
+def main() -> int:
+    dest_dir = Path("data/research/crag_t3/.raw_cache")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    # 4 parts in parallel — typical residential connection saturates around
+    # 2 streams; GitHub raw serves these fine in parallel.
+    started = time.monotonic()
+    with ThreadPoolExecutor(max_workers=4) as ex:
+        futures = {ex.submit(download_one, i, dest_dir): i for i in range(1, 5)}
+        for fut in as_completed(futures):
+            part = futures[fut]
+            try:
+                fut.result()
+            except Exception as exc:  # noqa: BLE001
+                log.error("part%d failed: %s", part, exc)
+                return 1
+    log.info("All 4 parts downloaded in %.1fs", time.monotonic() - started)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/surfsense_evals/scripts/peek_crag_run.py
+++ b/surfsense_evals/scripts/peek_crag_run.py
@ -0,0 +1,37 @@
+"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
+
+from __future__ import annotations
+
+import glob
+import json
+from collections import defaultdict
+
+
+def main() -> None:
+    raw_path = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))[-1]
+    print(f"Reading: {raw_path}")
+    rows = [json.loads(line) for line in open(raw_path, encoding="utf-8") if line.strip()]
+    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
+    for r in rows:
+        by_q[r["qid"]][r["arm"]] = r
+
+    for qid, arms in list(by_q.items()):
+        b = arms.get("bare_llm", {})
+        l = arms.get("long_context", {})
+        s = arms.get("surfsense", {})
+        print(f"\n=== {qid} ({b.get('domain')}/{b.get('question_type')}) ===")
+        print(f"  question: {b.get('extra', {}).get('question', '?')!r}")
+        print(f"  gold: {b.get('gold')!r}")
+        for arm_name, a in (("bare_llm", b), ("long_context", l), ("surfsense", s)):
+            grade = a.get("graded", {})
+            text = (a.get("raw_text") or "").strip()
+            tail = text[-200:] if text else ""
+            print(
+                f"  [{arm_name}] grade={grade.get('grade')} "
+                f"method={grade.get('method')}"
+            )
+            print(f"    -> {tail!r}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/peek_disagreements.py
+++ b/surfsense_evals/scripts/peek_disagreements.py
@ -0,0 +1,64 @@
+"""Show questions where SurfSense was wrong but long-context was right (and vice versa)."""
+
+from __future__ import annotations
+
+import glob
+import json
+from collections import defaultdict
+
+
+def main() -> None:
+    raw_path = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))[-1]
+    print(f"Reading: {raw_path}")
+    rows = [json.loads(line) for line in open(raw_path, encoding="utf-8") if line.strip()]
+    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
+    for r in rows:
+        by_q[r["qid"]][r["arm"]] = r
+
+    surf_wrong_lc_right = []
+    lc_wrong_surf_right = []
+    surf_wrong_bare_right = []
+    for qid, arms in by_q.items():
+        b = arms.get("bare_llm", {}).get("graded", {}).get("grade")
+        lc = arms.get("long_context", {}).get("graded", {}).get("grade")
+        s = arms.get("surfsense", {}).get("graded", {}).get("grade")
+        if s == "incorrect" and lc == "correct":
+            surf_wrong_lc_right.append(qid)
+        if lc == "incorrect" and s == "correct":
+            lc_wrong_surf_right.append(qid)
+        if s == "incorrect" and b == "correct":
+            surf_wrong_bare_right.append(qid)
+
+    print(f"\nSurfSense INCORRECT but Long-Context CORRECT: {len(surf_wrong_lc_right)}")
+    print(f"Long-Context INCORRECT but SurfSense CORRECT: {len(lc_wrong_surf_right)}")
+    print(f"SurfSense INCORRECT but Bare CORRECT: {len(surf_wrong_bare_right)}")
+
+    print("\n=== Where SurfSense is wrong but long-context is right (top 5) ===")
+    for qid in surf_wrong_lc_right[:5]:
+        arms = by_q[qid]
+        b = arms.get("bare_llm", {})
+        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
+        print(f"  GOLD: {b.get('gold')!r}")
+        for arm_name in ("bare_llm", "long_context", "surfsense"):
+            a = arms.get(arm_name, {})
+            t = (a.get("raw_text") or "").strip()
+            tail = t[-180:] if t else ""
+            grade = a.get("graded", {})
+            print(f"  [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")
+
+    print("\n=== Where Long-Context is wrong but SurfSense is right (top 5) ===")
+    for qid in lc_wrong_surf_right[:5]:
+        arms = by_q[qid]
+        b = arms.get("bare_llm", {})
+        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
+        print(f"  GOLD: {b.get('gold')!r}")
+        for arm_name in ("bare_llm", "long_context", "surfsense"):
+            a = arms.get(arm_name, {})
+            t = (a.get("raw_text") or "").strip()
+            tail = t[-180:] if t else ""
+            grade = a.get("graded", {})
+            print(f"  [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")
+
+
+if __name__ == "__main__":
+    main()
--- a/surfsense_evals/scripts/peek_t3_doc_map.py
+++ b/surfsense_evals/scripts/peek_t3_doc_map.py
@ -0,0 +1,40 @@
+"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+
+def main() -> int:
+    p = Path("data/research/maps/crag_t3_doc_map.jsonl")
+    if not p.exists():
+        print(f"Doc map missing: {p}")
+        return 1
+    rows = []
+    settings = {}
+    for line in p.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        row = json.loads(line)
+        if "__settings__" in row:
+            settings = row
+            continue
+        rows.append(row)
+    print(f"Settings header: {settings}")
+    print(f"Doc map rows:   {len(rows)}")
+    for r in rows:
+        print(f"  qid={r['qid']:<10} domain={r['domain']:<8} qtype={r['question_type']}")
+        print(f"    question: {r['question'][:90]}")
+        print(f"    gold:     {r['gold_answer'][:90]}")
+        print(
+            f"    pages:    {len(r['page_filenames'])} extracted, "
+            f"{len(r['document_ids'])} doc_ids, "
+            f"{len(r['missing_pages'])} missing"
+        )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/surfsense_evals/scripts/summarise_crag_run.py
+++ b/surfsense_evals/scripts/summarise_crag_run.py
@ -0,0 +1,65 @@
+"""Render a quick textual summary of the latest CRAG run."""
+
+from __future__ import annotations
+
+import glob
+import json
+
+
+def main() -> None:
+    runs = sorted(glob.glob("data/research/runs/*/crag/run_artifact.json"))
+    if not runs:
+        print("(no CRAG runs found)")
+        return
+    m = json.load(open(runs[-1], encoding="utf-8"))
+    metrics = m["metrics"]
+
+    print(f"Reading: {runs[-1]}")
+    print(f"n_questions: {m['extra']['n_questions']}")
+    print()
+    print("=== ARMS ===")
+    for arm in ("bare_llm", "long_context", "surfsense"):
+        d = metrics[arm]
+        print(
+            f"{arm:14s}: "
+            f"acc={d['accuracy']*100:5.1f}% (Wilson 95% CI "
+            f"{d['ci_low']*100:.1f}-{d['ci_high']*100:.1f}) | "
+            f"correct={d['correct_rate']*100:5.1f}% "
+            f"missing={d['missing_rate']*100:5.1f}% "
+            f"incorrect={d['incorrect_rate']*100:5.1f}% | "
+            f"truth={d['truthfulness_score']*100:+5.1f}%"
+        )
+
+    print()
+    print("=== DELTAS ===")
+    for key, d in metrics["deltas"].items():
+        print(
+            f"{key:30s}: acc={d['accuracy_pp']:+5.1f}pp "
+            f"truth={d['truthfulness_score_pp']:+5.1f}pp "
+            f"McNemar p={d['mcnemar_p_value']:.4f} ({d['mcnemar_method']}) "
+            f"bootstrap CI [{d['bootstrap_ci_low']:+.1f}, {d['bootstrap_ci_high']:+.1f}]"
+        )
+
+    print()
+    print("=== PER-QUESTION-TYPE TRUTHFULNESS ===")
+    for qt, row in sorted(metrics["per_question_type"].items()):
+        n = row["n"]
+        pieces = [f"{qt:20s} (n={n:3d}):"]
+        for arm in ("bare_llm", "long_context", "surfsense"):
+            if arm in row:
+                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
+        print(" ".join(pieces))
+
+    print()
+    print("=== PER-DOMAIN TRUTHFULNESS ===")
+    for dom, row in sorted(metrics["per_domain"].items()):
+        n = row["n"]
+        pieces = [f"{dom:10s} (n={n:3d}):"]
+        for arm in ("bare_llm", "long_context", "surfsense"):
+            if arm in row:
+                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
+        print(" ".join(pieces))
+
+
+if __name__ == "__main__":
+    main()