mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-19 18:45:15 +02:00
chore: evals
This commit is contained in:
parent
2402b730fa
commit
3737118050
122 changed files with 22598 additions and 13 deletions
40
surfsense_evals/scripts/peek_t3_doc_map.py
Normal file
40
surfsense_evals/scripts/peek_t3_doc_map.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main(
    doc_map_path: str | Path = "data/research/maps/crag_t3_doc_map.jsonl",
) -> int:
    """Print a human-readable summary of the CRAG Task 3 doc map.

    The doc map is read as JSONL: one JSON object per non-blank line. An
    object containing the key ``"__settings__"`` is treated as the settings
    header (the last one wins); every other object is a question row.

    Args:
        doc_map_path: Location of the doc map file. Defaults to the path the
            script has always used, resolved against the current working
            directory.

    Returns:
        ``0`` after printing the summary; ``1`` if the file is missing or a
        line is not a JSON object.

    Raises:
        KeyError: If a question row lacks one of the printed fields. This is
            left loud on purpose so schema drift is noticed.
    """
    p = Path(doc_map_path)
    # is_file() rather than exists(): a directory at this path is just as
    # unusable as a missing file and should get the same friendly message.
    if not p.is_file():
        print(f"Doc map missing: {p}")
        return 1

    rows = []
    settings = {}
    # Iterate the file object instead of ``read_text().splitlines()``:
    # ``str.splitlines`` also breaks on U+2028, U+2029, U+0085 and friends,
    # which may appear unescaped inside a JSON string and would cut a valid
    # record in half. Text-mode iteration only splits on real newlines.
    with p.open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, start=1):
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as exc:
                print(f"Doc map line {lineno} is not valid JSON: {exc}")
                return 1
            if not isinstance(row, dict):
                print(f"Doc map line {lineno} is not a JSON object")
                return 1
            if "__settings__" in row:
                settings = row
                continue
            rows.append(row)

    print(f"Settings header: {settings}")
    print(f"Doc map rows: {len(rows)}")
    for r in rows:
        # str() keeps slicing and the ``<`` alignment spec working even when
        # a field holds a non-string value such as a number or null.
        qid = str(r["qid"])
        domain = str(r["domain"])
        print(f" qid={qid:<10} domain={domain:<8} qtype={r['question_type']}")
        print(f" question: {str(r['question'])[:90]}")
        print(f" gold: {str(r['gold_answer'])[:90]}")
        print(
            f" pages: {len(r['page_filenames'])} extracted, "
            f"{len(r['document_ids'])} doc_ids, "
            f"{len(r['missing_pages'])} missing"
        )
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the check and hand its return value to the shell as the exit status.
    exit_status = main()
    sys.exit(exit_status)
|
||||
Loading…
Add table
Add a link
Reference in a new issue