mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-19 18:45:15 +02:00
41 lines
1.1 KiB
Python
41 lines
1.1 KiB
Python
|
|
"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> int:
    """Print a human-readable summary of the CRAG Task 3 doc map.

    Reads ``data/research/maps/crag_t3_doc_map.jsonl`` (relative to the
    current working directory) as JSON Lines. Blank lines are skipped. Any
    record containing a ``"__settings__"`` key is treated as the settings
    header (the last such record wins); every other record is reported as a
    doc-map row.

    Returns:
        ``1`` if the doc map file does not exist, ``0`` otherwise.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks one of the fields printed below.
    """
    doc_map_path = Path("data/research/maps/crag_t3_doc_map.jsonl")
    if not doc_map_path.exists():
        print(f"Doc map missing: {doc_map_path}")
        return 1

    rows: list[dict] = []
    settings: dict = {}
    # Iterate the file handle instead of ``read_text().splitlines()``:
    # ``str.splitlines`` also breaks on U+2028/U+2029/U+0085, which may occur
    # unescaped inside JSON strings and would cut a valid record in half.
    # File iteration only splits on real newlines.
    with doc_map_path.open(encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            row = json.loads(line)
            if "__settings__" in row:
                settings = row
                continue
            rows.append(row)

    print(f"Settings header: {settings}")
    print(f"Doc map rows: {len(rows)}")
    for r in rows:
        print(f" qid={r['qid']:<10} domain={r['domain']:<8} qtype={r['question_type']}")
        # Long question / answer texts are truncated to keep the report compact.
        print(f" question: {r['question'][:90]}")
        print(f" gold: {r['gold_answer'][:90]}")
        print(
            f" pages: {len(r['page_filenames'])} extracted, "
            f"{len(r['document_ids'])} doc_ids, "
            f"{len(r['missing_pages'])} missing"
        )
    return 0
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Run the check and hand its return value to the shell as the exit status.
    exit_status = main()
    sys.exit(exit_status)
|