"""Cleaned-dataset adapter for LongMemEval-S — D-02. Mempalace's reference benchmark uses ``xiaowu0162/longmemeval-cleaned`` (commit-pinned via ``huggingface_hub.repo_info()``). This adapter mirrors the ``LongMemEvalAdapter`` shape from ``bench/adapters/longmemeval.py`` so the orchestrator (`bench/longmemeval_blind.py`) can swap raw vs cleaned purely via the ``--dataset {cleaned, raw}`` CLI flag. ## boundary This adapter is NEW (Phase 9 Task 1). The raw adapter at ``bench/adapters/longmemeval.py`` is byte-identical to its v2 state — Phase 9 does NOT modify the v1/v2 baseline path. ``--dataset raw`` continues to load the raw revision ``2ec2a557f339...``; ``--dataset cleaned`` (the new v3 default) routes to this module. ## Pinning discipline Phase 9 LOCKED: pin via ``huggingface_hub.repo_info(...)``, NEVER hardcode a magic string. The cleaned dataset's HEAD SHA is auto-discovered on first instantiation and stored on ``self.revision`` so v3 output JSON records exactly which dataset variant was measured. On reproducer runs, the caller may pass ``revision=`` to pin a specific historical SHA. ## Schema The cleaned dataset uses the same row schema as the raw dataset (cleaned removed bad evidence; field names preserved). Each row in ``longmemeval_s_cleaned.json`` is: { "question_id": str, "question_type": str, "question": str, "haystack_session_ids": list[str], "haystack_sessions": list[list[{"role","content"}]], "answer_session_ids": list[str], } The adapter emits one ``LMESession`` per haystack session with the eval query attached (matching the raw adapter's emission shape exactly), so ``main()`` in ``longmemeval_blind.py`` does NOT branch on adapter type — it groups LMESessions by ``question_id`` either way. ## Split support Only ``split="S"`` is supported. The cleaned dataset ships only the S split as ``longmemeval_s_cleaned.json``; M and oracle remain in the raw dataset. """ from __future__ import annotations import json import sys from typing import Iterable from bench.adapters.longmemeval import LMESession CLEANED_DATASET_ID: str = "xiaowu0162/longmemeval-cleaned" CLEANED_FILENAME: str = "longmemeval_s_cleaned.json" class CleanedLongMemEvalAdapter: """Loads ``xiaowu0162/longmemeval-cleaned`` via ``huggingface_hub``. Mirrors ``LongMemEvalAdapter`` so ``bench/longmemeval_blind.py`` can treat them interchangeably (same ``LMESession`` iterator shape). Pin discipline: ``revision`` defaults to the current HEAD SHA of the HuggingFace dataset, auto-discovered via ``repo_info()``. Pass an explicit revision to reproduce a historical run. """ DATASET_ID: str = CLEANED_DATASET_ID def __init__(self, revision: str | None = None) -> None: if revision is not None: self.revision = revision return try: from huggingface_hub import repo_info except ImportError as exc: # pragma: no cover — dev extra raise RuntimeError( "huggingface_hub not installed; run " "`pip install 'datasets>=2.18' huggingface_hub`" ) from exc info = repo_info(repo_id=CLEANED_DATASET_ID, repo_type="dataset") self.revision = info.sha def load_dataset(self, split: str = "S") -> Iterable[LMESession]: """Stream LMESessions out of ``longmemeval_s_cleaned.json``. Only ``split="S"`` is supported (the cleaned dataset ships the S split only). Raises ``ValueError`` on any other split value. """ if split != "S": raise ValueError( f"unknown LongMemEval cleaned split {split!r}; " f"the cleaned dataset ships only the 'S' split" ) try: from huggingface_hub import hf_hub_download except ImportError as exc: # pragma: no cover — dev extra raise RuntimeError( "huggingface_hub not installed; run " "`pip install 'datasets>=2.18' huggingface_hub`" ) from exc print( f"[LongMemEval-cleaned] resolving split={split} " f"revision={self.revision} filename={CLEANED_FILENAME}", file=sys.stderr, flush=True, ) path = hf_hub_download( repo_id=CLEANED_DATASET_ID, filename=CLEANED_FILENAME, repo_type="dataset", revision=self.revision, ) with open(path, "r", encoding="utf-8") as f: rows = json.load(f) for row in rows: qid = row["question_id"] question = row["question"] question_type = str(row.get("question_type", "unknown")) answer_session_ids = list(row.get("answer_session_ids", [])) haystack_session_ids: list[str] = list( row.get("haystack_session_ids", []) ) haystack_sessions: list[list[dict]] = list( row.get("haystack_sessions", []) ) # Emit one LMESession per haystack session; attach the eval # query to every one so the orchestrator can run ONE recall # per row after inserting all haystack turns. Matches the # raw adapter's emission shape exactly. for sess_id, turns in zip( haystack_session_ids, haystack_sessions ): yield LMESession( session_id=sess_id, turns=list(turns), queries=[ { "query": question, "question_id": qid, "question_type": question_type, "relevant_turn_ids": answer_session_ids, "is_gold_session": sess_id in answer_session_ids, } ], ) __all__ = [ "CLEANED_DATASET_ID", "CLEANED_FILENAME", "CleanedLongMemEvalAdapter", ]