chore: evals

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-05-13 14:02:26 -07:00
parent 2402b730fa
commit 3737118050
122 changed files with 22598 additions and 13 deletions

View file

@ -0,0 +1,97 @@
"""Download CRAG Task 3's 4 .tar.bz2 parts in parallel.
Run once before ``ingest research crag_t3`` to avoid the ingest
synchronously blocking on a 7 GB download. Skips parts already
present and complete on disk.
"""
from __future__ import annotations
import logging
import sys
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
# Configure the root logger at import time so every progress line carries a
# timestamp and a level.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
)
# Logger used by all functions in this script.
log = logging.getLogger("download_task3")
# URL prefix shared by every archive part; download_one appends the part
# number to it.
_BASE = (
"https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
"crag_task_3_dev_v4.tar.bz2.part"
)
# User-Agent header value sent with both the HEAD size probe and the GET
# download request.
_USER_AGENT = "SurfSense-Evals/0.1 (CRAG Task 3 fetch)"
def _expected_size(url: str) -> int:
    """Return the byte size the server advertises for *url*.

    Sends a ``HEAD`` request with a 30 second timeout and reads the
    ``Content-Length`` response header. The result is ``0`` when the header
    is absent.
    """
    head_request = urllib.request.Request(
        url,
        headers={"User-Agent": _USER_AGENT},
        method="HEAD",
    )
    with urllib.request.urlopen(head_request, timeout=30) as response:
        advertised = response.headers.get("content-length", 0)
    return int(advertised)
def download_one(part: int, dest_dir: Path) -> Path:
    """Fetch one archive part into *dest_dir*, reusing a complete cached copy.

    The data is streamed into a sibling ``*.part_dl`` file and only renamed
    to its final name once the byte count matches the size reported by the
    HEAD probe, so the final path never holds a truncated archive.

    Args:
        part: Part number appended to the base URL and to the local file name.
        dest_dir: Directory that receives the file; it must already exist.

    Returns:
        Path of the downloaded (or already cached) part.

    Raises:
        OSError: If the transfer ends with a byte count that differs from the
            advertised ``Content-Length``. Errors raised by ``urllib`` or by
            file I/O propagate unchanged.
    """
    url = f"{_BASE}{part}"
    dest = dest_dir / f"crag_task_3_dev_v4.tar.bz2.part{part}"
    expected = _expected_size(url)
    # A size of 0 means the server sent no Content-Length; that cannot prove
    # a file on disk is complete, so the cache is only trusted for a known size.
    if expected > 0 and dest.exists() and dest.stat().st_size == expected:
        log.info("part%d: cached (%d bytes)", part, expected)
        return dest

    log.info("part%d: downloading %d bytes ...", part, expected)
    tmp = dest.with_suffix(dest.suffix + ".part_dl")
    request = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    started = time.monotonic()
    last_log = started
    downloaded = 0
    try:
        with urllib.request.urlopen(request, timeout=900) as resp, tmp.open("wb") as fh:
            # Read in 1 MiB chunks until the response yields an empty read.
            for chunk in iter(lambda: resp.read(1 << 20), b""):
                fh.write(chunk)
                downloaded += len(chunk)
                now = time.monotonic()
                # Emit a progress line at most once every five seconds.
                if now - last_log > 5.0:
                    pct = 100 * downloaded / expected if expected else 0
                    rate_mb = (downloaded / (now - started)) / (1 << 20)
                    log.info(
                        "part%d: %5.1f%% (%.1f / %.1f MiB at %.1f MiB/s)",
                        part, pct, downloaded / (1 << 20), expected / (1 << 20), rate_mb,
                    )
                    last_log = now
        # A connection that closes early ends the loop without an error, so
        # the byte count is the only evidence that the part is complete.
        if expected > 0 and downloaded != expected:
            raise OSError(
                f"part{part}: incomplete download "
                f"({downloaded} of {expected} bytes)"
            )
        tmp.replace(dest)
    except BaseException:
        # Best-effort removal of the partial file; the original error is the
        # one that matters, so a failed cleanup is deliberately ignored.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

    elapsed = time.monotonic() - started
    log.info(
        "part%d: done in %.1fs (%.1f MiB/s avg)",
        part, elapsed, (downloaded / (1 << 20)) / max(elapsed, 0.001),
    )
    return dest
def main() -> int:
    """Download parts 1-4 concurrently into the raw cache directory.

    Every failed part is logged, not just the first one to fail.

    Returns:
        ``0`` when all parts succeeded, ``1`` if at least one part failed.
    """
    dest_dir = Path("data/research/crag_t3/.raw_cache")
    dest_dir.mkdir(parents=True, exist_ok=True)
    # 4 parts in parallel — typical residential connection saturates around
    # 2 streams; GitHub raw serves these fine in parallel.
    started = time.monotonic()
    failed: list[int] = []
    with ThreadPoolExecutor(max_workers=4) as ex:
        futures = {ex.submit(download_one, i, dest_dir): i for i in range(1, 5)}
        for fut in as_completed(futures):
            part = futures[fut]
            try:
                fut.result()
            except Exception as exc:  # noqa: BLE001
                # Keep draining the remaining futures: leaving the ``with``
                # block waits for them anyway, so their outcome should be
                # reported as well.
                log.error("part%d failed: %s", part, exc)
                failed.append(part)
    if failed:
        log.error(
            "%d of %d parts failed: %s",
            len(failed), len(futures), ", ".join(f"part{p}" for p in sorted(failed)),
        )
        return 1
    log.info("All 4 parts downloaded in %.1fs", time.monotonic() - started)
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,37 @@
"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
from __future__ import annotations
import glob
import json
from collections import defaultdict
def main() -> None:
    """Print question, gold answer and per-arm grades for the latest CRAG run.

    "Latest" is the lexicographically last path matching
    ``data/research/runs/*/crag/raw.jsonl`` relative to the working
    directory. For each question id the three arms (``bare_llm``,
    ``long_context``, ``surfsense``) are listed with their grade, grading
    method and the last 200 characters of the raw answer text. Prints a
    short notice and returns when no run file exists.
    """
    candidates = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
    if not candidates:
        print("(no CRAG runs found)")
        return
    raw_path = candidates[-1]
    print(f"Reading: {raw_path}")
    # Read inside a context manager so the handle is closed deterministically.
    with open(raw_path, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    # qid -> arm name -> row
    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
    for r in rows:
        by_q[r["qid"]][r["arm"]] = r

    arm_names = ("bare_llm", "long_context", "surfsense")
    for qid, arms in by_q.items():
        # Question metadata is read from the bare_llm row.
        base = arms.get("bare_llm", {})
        print(f"\n=== {qid} ({base.get('domain')}/{base.get('question_type')}) ===")
        # ``or {}`` also covers an explicit JSON null, not just a missing key.
        print(f" question: {(base.get('extra') or {}).get('question', '?')!r}")
        print(f" gold: {base.get('gold')!r}")
        for arm_name in arm_names:
            a = arms.get(arm_name, {})
            grade = a.get("graded") or {}
            text = (a.get("raw_text") or "").strip()
            tail = text[-200:]
            print(
                f" [{arm_name}] grade={grade.get('grade')} "
                f"method={grade.get('method')}"
            )
            print(f" -> {tail!r}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,64 @@
"""Show questions where SurfSense was wrong but long-context was right (and vice versa)."""
from __future__ import annotations
import glob
import json
from collections import defaultdict
def _grade_of(arms: dict, arm_name: str):
    """Return the ``graded.grade`` value of one arm, or ``None`` if absent."""
    return ((arms.get(arm_name) or {}).get("graded") or {}).get("grade")


def _print_cases(by_q: dict, qids: list, limit: int = 5) -> None:
    """Print gold answer and every arm's grade/answer tail for the first *limit* qids."""
    for qid in qids[:limit]:
        arms = by_q[qid]
        # Question metadata is read from the bare_llm row.
        base = arms.get("bare_llm", {})
        print(f"\n[{qid}] domain={base.get('domain')} qtype={base.get('question_type')}")
        print(f" GOLD: {base.get('gold')!r}")
        for arm_name in ("bare_llm", "long_context", "surfsense"):
            a = arms.get(arm_name, {})
            tail = (a.get("raw_text") or "").strip()[-180:]
            grade = a.get("graded") or {}
            print(f" [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")


def main() -> None:
    """Show where SurfSense and the other arms disagree in the latest CRAG run.

    "Latest" is the lexicographically last path matching
    ``data/research/runs/*/crag/raw.jsonl`` relative to the working
    directory. Prints three disagreement counts, then up to five example
    questions for each direction of the SurfSense / long-context
    disagreement. Prints a short notice and returns when no run file exists.
    """
    candidates = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
    if not candidates:
        print("(no CRAG runs found)")
        return
    raw_path = candidates[-1]
    print(f"Reading: {raw_path}")
    # Read inside a context manager so the handle is closed deterministically.
    with open(raw_path, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    # qid -> arm name -> row
    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
    for r in rows:
        by_q[r["qid"]][r["arm"]] = r

    surf_wrong_lc_right = []
    lc_wrong_surf_right = []
    surf_wrong_bare_right = []
    for qid, arms in by_q.items():
        b = _grade_of(arms, "bare_llm")
        lc = _grade_of(arms, "long_context")
        s = _grade_of(arms, "surfsense")
        if s == "incorrect" and lc == "correct":
            surf_wrong_lc_right.append(qid)
        if lc == "incorrect" and s == "correct":
            lc_wrong_surf_right.append(qid)
        if s == "incorrect" and b == "correct":
            surf_wrong_bare_right.append(qid)

    print(f"\nSurfSense INCORRECT but Long-Context CORRECT: {len(surf_wrong_lc_right)}")
    print(f"Long-Context INCORRECT but SurfSense CORRECT: {len(lc_wrong_surf_right)}")
    print(f"SurfSense INCORRECT but Bare CORRECT: {len(surf_wrong_bare_right)}")

    print("\n=== Where SurfSense is wrong but long-context is right (top 5) ===")
    _print_cases(by_q, surf_wrong_lc_right)
    print("\n=== Where Long-Context is wrong but SurfSense is right (top 5) ===")
    _print_cases(by_q, lc_wrong_surf_right)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,40 @@
"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
from __future__ import annotations
import json
import sys
from pathlib import Path
def main() -> int:
    """Print a per-question overview of the CRAG Task 3 doc map.

    Reads ``data/research/maps/crag_t3_doc_map.jsonl`` relative to the
    working directory. A record containing the key ``__settings__`` is
    treated as the settings header; every other record is printed with its
    question id, domain, question type, truncated question and gold answer,
    and page/document counts.

    Returns:
        ``1`` when the doc map file does not exist, otherwise ``0``.
    """
    p = Path("data/research/maps/crag_t3_doc_map.jsonl")
    if not p.exists():
        print(f"Doc map missing: {p}")
        return 1
    rows = []
    settings = {}
    # Iterate the file rather than calling ``str.splitlines``: the latter also
    # breaks on U+2028, U+2029 and U+0085, which may appear unescaped inside
    # JSON strings and would cut one record into invalid fragments.
    with p.open(encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            row = json.loads(line)
            if "__settings__" in row:
                settings = row
                continue
            rows.append(row)
    print(f"Settings header: {settings}")
    print(f"Doc map rows: {len(rows)}")
    for r in rows:
        print(f" qid={r['qid']:<10} domain={r['domain']:<8} qtype={r['question_type']}")
        print(f" question: {r['question'][:90]}")
        print(f" gold: {r['gold_answer'][:90]}")
        print(
            f" pages: {len(r['page_filenames'])} extracted, "
            f"{len(r['document_ids'])} doc_ids, "
            f"{len(r['missing_pages'])} missing"
        )
    return 0


if __name__ == "__main__":
    sys.exit(main())

View file

@ -0,0 +1,65 @@
"""Render a quick textual summary of the latest CRAG run."""
from __future__ import annotations
import glob
import json
def _print_truthfulness_table(heading: str, table: dict, label_width: int) -> None:
    """Print one breakdown section: a heading, then one line per sorted label.

    Each line shows the label padded to *label_width*, the group size ``n``
    and the truthfulness score of every arm present in that row.
    """
    print(heading)
    for label, row in sorted(table.items()):
        n = row["n"]
        pieces = [f"{label:{label_width}s} (n={n:3d}):"]
        for arm in ("bare_llm", "long_context", "surfsense"):
            if arm in row:
                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
        print(" ".join(pieces))


def main() -> None:
    """Render a textual summary of the latest CRAG run artifact.

    "Latest" is the lexicographically last path matching
    ``data/research/runs/*/crag/run_artifact.json`` relative to the working
    directory. Prints per-arm metrics, pairwise deltas, and truthfulness
    broken down by question type and by domain. Prints a short notice and
    returns when no artifact exists.
    """
    runs = sorted(glob.glob("data/research/runs/*/crag/run_artifact.json"))
    if not runs:
        print("(no CRAG runs found)")
        return
    # Load inside a context manager so the handle is closed deterministically.
    with open(runs[-1], encoding="utf-8") as fh:
        m = json.load(fh)
    metrics = m["metrics"]
    print(f"Reading: {runs[-1]}")
    print(f"n_questions: {m['extra']['n_questions']}")
    print()
    print("=== ARMS ===")
    for arm in ("bare_llm", "long_context", "surfsense"):
        # Skip arms missing from the artifact, as the breakdown sections do.
        if arm not in metrics:
            continue
        d = metrics[arm]
        print(
            f"{arm:14s}: "
            f"acc={d['accuracy']*100:5.1f}% (Wilson 95% CI "
            f"{d['ci_low']*100:.1f}-{d['ci_high']*100:.1f}) | "
            f"correct={d['correct_rate']*100:5.1f}% "
            f"missing={d['missing_rate']*100:5.1f}% "
            f"incorrect={d['incorrect_rate']*100:5.1f}% | "
            f"truth={d['truthfulness_score']*100:+5.1f}%"
        )
    print()
    print("=== DELTAS ===")
    for key, d in metrics["deltas"].items():
        print(
            f"{key:30s}: acc={d['accuracy_pp']:+5.1f}pp "
            f"truth={d['truthfulness_score_pp']:+5.1f}pp "
            f"McNemar p={d['mcnemar_p_value']:.4f} ({d['mcnemar_method']}) "
            f"bootstrap CI [{d['bootstrap_ci_low']:+.1f}, {d['bootstrap_ci_high']:+.1f}]"
        )
    print()
    _print_truthfulness_table(
        "=== PER-QUESTION-TYPE TRUTHFULNESS ===", metrics["per_question_type"], 20
    )
    print()
    _print_truthfulness_table(
        "=== PER-DOMAIN TRUTHFULNESS ===", metrics["per_domain"], 10
    )


if __name__ == "__main__":
    main()