mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
chore: evals
This commit is contained in:
parent
2402b730fa
commit
3737118050
122 changed files with 22598 additions and 13 deletions
97
surfsense_evals/scripts/download_crag_task3.py
Normal file
97
surfsense_evals/scripts/download_crag_task3.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
"""Download CRAG Task 3's 4 .tar.bz2 parts in parallel.
|
||||
|
||||
Run once before ``ingest research crag_t3`` to avoid the ingest
|
||||
synchronously blocking on a 7 GB download. Skips parts already
|
||||
present and complete on disk.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
# Configure logging at import time: INFO level, each record prefixed with
# a timestamp and its level name.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
# Logger used by the download helpers in this script.
log = logging.getLogger("download_task3")
|
||||
|
||||
|
||||
# URL prefix shared by every archive part; the part number is appended to it.
_BASE = (
    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
    + "crag_task_3_dev_v4.tar.bz2.part"
)
# Value of the User-Agent header sent with the requests built in this script.
_USER_AGENT = "SurfSense-Evals/0.1 (CRAG Task 3 fetch)"
|
||||
|
||||
|
||||
def _expected_size(url: str) -> int:
    """Return the size in bytes that the server reports for *url*.

    Issues a HEAD request (30 second timeout) and reads the
    ``content-length`` response header. Returns 0 when the header is
    absent.
    """
    head_request = urllib.request.Request(
        url,
        headers={"User-Agent": _USER_AGENT},
        method="HEAD",
    )
    with urllib.request.urlopen(head_request, timeout=30) as response:
        reported = response.headers.get("content-length", 0)
    return int(reported)
|
||||
|
||||
|
||||
def download_one(part: int, dest_dir: Path) -> Path:
    """Download one archive part into *dest_dir*, reusing a complete cached copy.

    The expected size comes from ``_expected_size``. If the destination file
    already exists with exactly that size it is returned untouched. Otherwise
    the part is streamed to a temporary sibling file (``.part_dl`` suffix)
    and renamed into place only after the received byte count has been
    checked against the expected size.

    Args:
        part: Part number; appended to the base URL and to the file name.
        dest_dir: Directory that receives the file. It is not created here.

    Returns:
        Path of the downloaded (or already cached) part file.

    Raises:
        OSError: If the transfer ends with a byte count different from the
            advertised size. Errors raised by ``urllib`` or by file I/O
            propagate unchanged.
    """
    url = f"{_BASE}{part}"
    dest = dest_dir / f"crag_task_3_dev_v4.tar.bz2.part{part}"
    expected = _expected_size(url)
    if dest.exists() and dest.stat().st_size == expected:
        log.info("part%d: cached (%d bytes)", part, expected)
        return dest
    log.info("part%d: downloading %d bytes ...", part, expected)
    tmp = dest.with_suffix(dest.suffix + ".part_dl")
    started = time.monotonic()
    last_log = started
    downloaded = 0
    try:
        with urllib.request.urlopen(
            urllib.request.Request(url, headers={"User-Agent": _USER_AGENT}),
            timeout=900,
        ) as resp, tmp.open("wb") as fh:
            chunk = resp.read(1 << 20)
            while chunk:
                fh.write(chunk)
                downloaded += len(chunk)
                now = time.monotonic()
                # Throttle progress output to one line every five seconds.
                if now - last_log > 5.0:
                    pct = 100 * downloaded / expected if expected else 0
                    rate_mb = (downloaded / (now - started)) / (1 << 20)
                    log.info(
                        "part%d: %5.1f%% (%.1f / %.1f MiB at %.1f MiB/s)",
                        part, pct, downloaded / (1 << 20), expected / (1 << 20), rate_mb,
                    )
                    last_log = now
                chunk = resp.read(1 << 20)
        # A connection that drops mid-transfer can end the read loop early;
        # never promote a short file to the cache. The check is skipped when
        # the server advertised no size (expected == 0).
        if expected and downloaded != expected:
            raise OSError(
                f"part{part}: incomplete download "
                f"({downloaded} of {expected} bytes received)"
            )
        tmp.replace(dest)
    finally:
        # After a successful rename tmp no longer exists. On any failure the
        # partial file is useless (the next attempt reopens it with "wb",
        # truncating it), so remove it instead of leaving an orphan behind.
        if tmp.exists():
            tmp.unlink()
    elapsed = time.monotonic() - started
    log.info(
        "part%d: done in %.1fs (%.1f MiB/s avg)",
        part, elapsed, (downloaded / (1 << 20)) / max(elapsed, 0.001),
    )
    return dest
|
||||
|
||||
|
||||
def main() -> int:
    """Download the four CRAG Task 3 archive parts concurrently.

    Parts are written to ``data/research/crag_t3/.raw_cache`` (created if
    missing). Every failing part is logged, not just the first one.

    Returns:
        0 when every part was downloaded or already cached, 1 if any part
        failed.
    """
    dest_dir = Path("data/research/crag_t3/.raw_cache")
    dest_dir.mkdir(parents=True, exist_ok=True)

    # 4 parts in parallel — typical residential connection saturates around
    # 2 streams; GitHub raw serves these fine in parallel.
    started = time.monotonic()
    failed: list[int] = []
    with ThreadPoolExecutor(max_workers=4) as ex:
        futures = {ex.submit(download_one, i, dest_dir): i for i in range(1, 5)}
        for fut in as_completed(futures):
            part = futures[fut]
            try:
                fut.result()
            except Exception as exc:  # noqa: BLE001
                # Keep draining the remaining futures: leaving the ``with``
                # block waits for them anyway, so returning here would only
                # hide their failures.
                log.error("part%d failed: %s", part, exc)
                failed.append(part)
    if failed:
        log.error("%d of 4 parts failed: %s", len(failed), sorted(failed))
        return 1
    log.info("All 4 parts downloaded in %.1fs", time.monotonic() - started)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
37
surfsense_evals/scripts/peek_crag_run.py
Normal file
37
surfsense_evals/scripts/peek_crag_run.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import glob
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def main() -> None:
    """Print per-question, per-arm outputs from the latest CRAG run.

    The "latest" run is the lexicographically last path matching
    ``data/research/runs/*/crag/raw.jsonl``. Each non-blank line of that
    file is parsed as a JSON object and grouped by its ``qid`` and ``arm``
    keys. For every question the header fields are taken from the
    ``bare_llm`` row, followed by the grade, grading method and the last
    200 characters of ``raw_text`` for each of the three arms.

    Prints a short notice and returns when no run file exists.
    """
    candidates = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
    if not candidates:
        # Without this guard, indexing the empty list raises IndexError.
        print("(no CRAG runs found)")
        return
    raw_path = candidates[-1]
    print(f"Reading: {raw_path}")
    # Context manager so the file handle is closed deterministically.
    with open(raw_path, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]
    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
    for r in rows:
        # A later row for the same (qid, arm) pair overwrites an earlier one.
        by_q[r["qid"]][r["arm"]] = r

    for qid, arms in by_q.items():
        bare = arms.get("bare_llm", {})
        long_ctx = arms.get("long_context", {})
        surf = arms.get("surfsense", {})
        print(f"\n=== {qid} ({bare.get('domain')}/{bare.get('question_type')}) ===")
        print(f" question: {bare.get('extra', {}).get('question', '?')!r}")
        print(f" gold: {bare.get('gold')!r}")
        for arm_name, a in (("bare_llm", bare), ("long_context", long_ctx), ("surfsense", surf)):
            grade = a.get("graded", {})
            # raw_text may be missing or None; treat both as empty text.
            text = (a.get("raw_text") or "").strip()
            tail = text[-200:] if text else ""
            print(
                f" [{arm_name}] grade={grade.get('grade')} "
                f"method={grade.get('method')}"
            )
            print(f" -> {tail!r}")


if __name__ == "__main__":
    main()
|
||||
64
surfsense_evals/scripts/peek_disagreements.py
Normal file
64
surfsense_evals/scripts/peek_disagreements.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
"""Show questions where SurfSense was wrong but long-context was right (and vice versa)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import glob
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def _print_examples(title: str, qids: list, by_q: dict) -> None:
    """Print *title*, then gold answer and per-arm tails for up to five qids."""
    print(title)
    for qid in qids[:5]:
        arms = by_q[qid]
        # Header fields are read from the bare_llm row of the question.
        b = arms.get("bare_llm", {})
        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
        print(f" GOLD: {b.get('gold')!r}")
        for arm_name in ("bare_llm", "long_context", "surfsense"):
            a = arms.get(arm_name, {})
            # raw_text may be missing or None; treat both as empty text.
            t = (a.get("raw_text") or "").strip()
            tail = t[-180:] if t else ""
            grade = a.get("graded", {})
            print(f" [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")


def main() -> None:
    """Report questions on which the arms of the latest CRAG run disagree.

    The "latest" run is the lexicographically last path matching
    ``data/research/runs/*/crag/raw.jsonl``. Rows are grouped by ``qid``
    and ``arm``; three disagreement counts are printed, followed by up to
    five examples for each of the two SurfSense / long-context
    disagreement directions.

    Prints a short notice and returns when no run file exists.
    """
    candidates = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))
    if not candidates:
        # Without this guard, indexing the empty list raises IndexError.
        print("(no CRAG runs found)")
        return
    raw_path = candidates[-1]
    print(f"Reading: {raw_path}")
    # Context manager so the file handle is closed deterministically.
    with open(raw_path, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]
    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
    for r in rows:
        by_q[r["qid"]][r["arm"]] = r

    surf_wrong_lc_right = []
    lc_wrong_surf_right = []
    surf_wrong_bare_right = []
    for qid, arms in by_q.items():
        b = arms.get("bare_llm", {}).get("graded", {}).get("grade")
        lc = arms.get("long_context", {}).get("graded", {}).get("grade")
        s = arms.get("surfsense", {}).get("graded", {}).get("grade")
        # The three checks are independent: one question can land in
        # more than one list.
        if s == "incorrect" and lc == "correct":
            surf_wrong_lc_right.append(qid)
        if lc == "incorrect" and s == "correct":
            lc_wrong_surf_right.append(qid)
        if s == "incorrect" and b == "correct":
            surf_wrong_bare_right.append(qid)

    print(f"\nSurfSense INCORRECT but Long-Context CORRECT: {len(surf_wrong_lc_right)}")
    print(f"Long-Context INCORRECT but SurfSense CORRECT: {len(lc_wrong_surf_right)}")
    print(f"SurfSense INCORRECT but Bare CORRECT: {len(surf_wrong_bare_right)}")

    _print_examples(
        "\n=== Where SurfSense is wrong but long-context is right (top 5) ===",
        surf_wrong_lc_right,
        by_q,
    )
    _print_examples(
        "\n=== Where Long-Context is wrong but SurfSense is right (top 5) ===",
        lc_wrong_surf_right,
        by_q,
    )


if __name__ == "__main__":
    main()
|
||||
40
surfsense_evals/scripts/peek_t3_doc_map.py
Normal file
40
surfsense_evals/scripts/peek_t3_doc_map.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main() -> int:
    """Print a summary of the CRAG Task 3 doc map file.

    Reads ``data/research/maps/crag_t3_doc_map.jsonl`` as JSON lines. A
    line whose object contains the ``__settings__`` key is kept as the
    settings header (the last such line wins); every other non-blank line
    is a doc-map row. Prints the header, the row count and a short
    description of each row.

    Returns:
        1 if the doc map file does not exist, otherwise 0.
    """
    doc_map_path = Path("data/research/maps/crag_t3_doc_map.jsonl")
    if not doc_map_path.exists():
        print(f"Doc map missing: {doc_map_path}")
        return 1

    settings = {}
    rows = []
    for raw_line in doc_map_path.read_text(encoding="utf-8").splitlines():
        if raw_line.strip():
            entry = json.loads(raw_line)
            if "__settings__" in entry:
                settings = entry
            else:
                rows.append(entry)

    print(f"Settings header: {settings}")
    print(f"Doc map rows: {len(rows)}")
    for entry in rows:
        print(f" qid={entry['qid']:<10} domain={entry['domain']:<8} qtype={entry['question_type']}")
        # Question and gold answer are cut to their first 90 characters.
        print(f" question: {entry['question'][:90]}")
        print(f" gold: {entry['gold_answer'][:90]}")
        n_pages = len(entry['page_filenames'])
        n_doc_ids = len(entry['document_ids'])
        n_missing = len(entry['missing_pages'])
        print(f" pages: {n_pages} extracted, {n_doc_ids} doc_ids, {n_missing} missing")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
65
surfsense_evals/scripts/summarise_crag_run.py
Normal file
65
surfsense_evals/scripts/summarise_crag_run.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
"""Render a quick textual summary of the latest CRAG run."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import glob
|
||||
import json
|
||||
|
||||
|
||||
def _print_breakdown(title: str, table: dict, label_width: int) -> None:
    """Print *title*, then one truthfulness line per key of *table*, sorted by key."""
    print()
    print(title)
    for label, row in sorted(table.items()):
        n = row["n"]
        pieces = [f"{label:{label_width}s} (n={n:3d}):"]
        # Arms absent from a row are simply left out of its line.
        for arm in ("bare_llm", "long_context", "surfsense"):
            if arm in row:
                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
        print(" ".join(pieces))


def main() -> None:
    """Print a textual summary of the latest CRAG run artifact.

    The "latest" run is the lexicographically last path matching
    ``data/research/runs/*/crag/run_artifact.json``. Prints the question
    count, per-arm metrics, pairwise deltas, and truthfulness broken down
    by question type and by domain. Prints a short notice and returns when
    no artifact exists.
    """
    runs = sorted(glob.glob("data/research/runs/*/crag/run_artifact.json"))
    if not runs:
        print("(no CRAG runs found)")
        return
    # Context manager so the file handle is closed deterministically.
    with open(runs[-1], encoding="utf-8") as fh:
        m = json.load(fh)
    metrics = m["metrics"]

    print(f"Reading: {runs[-1]}")
    print(f"n_questions: {m['extra']['n_questions']}")
    print()
    print("=== ARMS ===")
    for arm in ("bare_llm", "long_context", "surfsense"):
        d = metrics[arm]
        # Rates are stored as fractions and shown as percentages.
        print(
            f"{arm:14s}: "
            f"acc={d['accuracy']*100:5.1f}% (Wilson 95% CI "
            f"{d['ci_low']*100:.1f}-{d['ci_high']*100:.1f}) | "
            f"correct={d['correct_rate']*100:5.1f}% "
            f"missing={d['missing_rate']*100:5.1f}% "
            f"incorrect={d['incorrect_rate']*100:5.1f}% | "
            f"truth={d['truthfulness_score']*100:+5.1f}%"
        )

    print()
    print("=== DELTAS ===")
    for key, d in metrics["deltas"].items():
        # The *_pp and bootstrap values are printed as stored, unscaled.
        print(
            f"{key:30s}: acc={d['accuracy_pp']:+5.1f}pp "
            f"truth={d['truthfulness_score_pp']:+5.1f}pp "
            f"McNemar p={d['mcnemar_p_value']:.4f} ({d['mcnemar_method']}) "
            f"bootstrap CI [{d['bootstrap_ci_low']:+.1f}, {d['bootstrap_ci_high']:+.1f}]"
        )

    _print_breakdown(
        "=== PER-QUESTION-TYPE TRUTHFULNESS ===", metrics["per_question_type"], 20
    )
    _print_breakdown("=== PER-DOMAIN TRUTHFULNESS ===", metrics["per_domain"], 10)


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue