mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-21 18:55:16 +02:00
chore: evals
This commit is contained in:
parent
2402b730fa
commit
3737118050
122 changed files with 22598 additions and 13 deletions
37
surfsense_evals/scripts/peek_crag_run.py
Normal file
37
surfsense_evals/scripts/peek_crag_run.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import glob
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def main() -> None:
|
||||
raw_path = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))[-1]
|
||||
print(f"Reading: {raw_path}")
|
||||
rows = [json.loads(line) for line in open(raw_path, encoding="utf-8") if line.strip()]
|
||||
by_q: dict[str, dict[str, dict]] = defaultdict(dict)
|
||||
for r in rows:
|
||||
by_q[r["qid"]][r["arm"]] = r
|
||||
|
||||
for qid, arms in list(by_q.items()):
|
||||
b = arms.get("bare_llm", {})
|
||||
l = arms.get("long_context", {})
|
||||
s = arms.get("surfsense", {})
|
||||
print(f"\n=== {qid} ({b.get('domain')}/{b.get('question_type')}) ===")
|
||||
print(f" question: {b.get('extra', {}).get('question', '?')!r}")
|
||||
print(f" gold: {b.get('gold')!r}")
|
||||
for arm_name, a in (("bare_llm", b), ("long_context", l), ("surfsense", s)):
|
||||
grade = a.get("graded", {})
|
||||
text = (a.get("raw_text") or "").strip()
|
||||
tail = text[-200:] if text else ""
|
||||
print(
|
||||
f" [{arm_name}] grade={grade.get('grade')} "
|
||||
f"method={grade.get('method')}"
|
||||
)
|
||||
print(f" -> {tail!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue