SurfSense/surfsense_evals/scripts/peek_crag_run.py
DESKTOP-RTLN3BA\$punk 3737118050 chore: evals
2026-05-13 14:02:26 -07:00

37 lines
1.3 KiB
Python

"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
from __future__ import annotations
import glob
import json
from collections import defaultdict
def main() -> None:
    """Print a per-question summary of the most recent CRAG run.

    Picks the last ``data/research/runs/*/crag/raw.jsonl`` path in sorted
    order (relative to the current working directory), groups its rows by
    ``qid`` and ``arm``, and prints for every question the header fields
    (domain, question type, question text, gold answer) followed by each
    arm's grade, grading method and the last 200 characters of its raw text.

    Raises:
        SystemExit: if no ``raw.jsonl`` file matches the run glob.
    """
    arm_names = ("bare_llm", "long_context", "surfsense")

    # "Latest" here means last in lexicographic order of the matched paths;
    # presumably run directories are named so that this is chronological —
    # TODO confirm against the code that creates them.
    candidates = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
    if not candidates:
        raise SystemExit(
            "No CRAG run found: nothing matches "
            "data/research/runs/*/crag/raw.jsonl"
        )
    raw_path = candidates[-1]
    print(f"Reading: {raw_path}")

    # qid -> arm name -> row; a later row for the same (qid, arm) wins.
    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
    with open(raw_path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            row = json.loads(line)
            by_q[row["qid"]][row["arm"]] = row

    for qid, arms in by_q.items():
        # Header fields come from the first arm that has a row, so a question
        # lacking a bare_llm row still shows its metadata. This assumes every
        # arm's row repeats the same question-level fields — verify against
        # the writer of raw.jsonl.
        meta = next((arms[name] for name in arm_names if name in arms), {})
        extra = meta.get("extra") or {}
        print(f"\n=== {qid} ({meta.get('domain')}/{meta.get('question_type')}) ===")
        print(f" question: {extra.get('question', '?')!r}")
        print(f" gold: {meta.get('gold')!r}")

        for arm_name in arm_names:
            arm = arms.get(arm_name, {})
            # `or {}` / `or ""` also cover keys that are present but null.
            grade = arm.get("graded") or {}
            text = (arm.get("raw_text") or "").strip()
            tail = text[-200:]
            print(
                f" [{arm_name}] grade={grade.get('grade')} "
                f"method={grade.get('method')}"
            )
            print(f" -> {tail!r}")
# Script entry point: run the inspection only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()