Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
163 lines
6 KiB
Python
163 lines
6 KiB
Python
"""Cleaned-dataset adapter for LongMemEval-S — D-02.

Mempalace's reference benchmark uses ``xiaowu0162/longmemeval-cleaned``
(commit-pinned via ``huggingface_hub.repo_info()``). This adapter mirrors
the ``LongMemEvalAdapter`` shape from ``bench/adapters/longmemeval.py`` so
the orchestrator (``bench/longmemeval_blind.py``) can swap raw vs cleaned
purely via the ``--dataset {cleaned, raw}`` CLI flag.

## Boundary

This adapter is NEW (Phase 9 Task 1). The raw adapter at
``bench/adapters/longmemeval.py`` is byte-identical to its v2 state — Phase
9 does NOT modify the v1/v2 baseline path. ``--dataset raw`` continues to
load the raw revision ``2ec2a557f339...``; ``--dataset cleaned`` (the new
v3 default) routes to this module.

## Pinning discipline

Phase 9 LOCKED: pin via ``huggingface_hub.repo_info(...)``, NEVER
hardcode a magic string. The cleaned dataset's HEAD SHA is auto-discovered
on first instantiation and stored on ``self.revision`` so v3 output JSON
records exactly which dataset variant was measured. On reproducer runs,
the caller may pass ``revision=`` to pin a specific historical SHA.

## Schema

The cleaned dataset uses the same row schema as the raw dataset (cleaned
removed bad evidence; field names preserved). Each row in
``longmemeval_s_cleaned.json`` is:

    {
        "question_id": str,
        "question_type": str,
        "question": str,
        "haystack_session_ids": list[str],
        "haystack_sessions": list[list[{"role","content"}]],
        "answer_session_ids": list[str],
    }

The adapter emits one ``LMESession`` per haystack session with the eval
query attached (matching the raw adapter's emission shape exactly), so
``main()`` in ``longmemeval_blind.py`` does NOT branch on adapter type —
it groups LMESessions by ``question_id`` either way.

## Split support

Only ``split="S"`` is supported. The cleaned dataset ships only the S split
as ``longmemeval_s_cleaned.json``; M and oracle remain in the raw dataset.
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sys
|
|
from typing import Iterable
|
|
|
|
from bench.adapters.longmemeval import LMESession
|
|
|
|
|
|
# Hugging Face Hub dataset repository that hosts the cleaned LongMemEval data.
CLEANED_DATASET_ID: str = "xiaowu0162/longmemeval-cleaned"
# The single JSON file this adapter downloads from that repository.
CLEANED_FILENAME: str = "longmemeval_s_cleaned.json"


class CleanedLongMemEvalAdapter:
    """Loads ``xiaowu0162/longmemeval-cleaned`` via ``huggingface_hub``.

    Yields ``LMESession`` objects, one per haystack session of every row in
    ``longmemeval_s_cleaned.json``, each carrying the row's evaluation query.

    Pin discipline: when ``revision`` is omitted, the constructor asks the
    Hub for the dataset's repo info and stores its ``sha`` on
    ``self.revision``. Pass an explicit revision to reproduce a historical
    run; in that case the Hub is not contacted by the constructor.
    """

    DATASET_ID: str = CLEANED_DATASET_ID

    def __init__(self, revision: str | None = None) -> None:
        """Record the dataset revision to load.

        Args:
            revision: Revision to pin. When ``None``, the revision is taken
                from ``repo_info(...).sha`` for the cleaned dataset.

        Raises:
            RuntimeError: ``huggingface_hub`` is not installed and no
                explicit ``revision`` was given.
        """
        if revision is not None:
            self.revision = revision
            return
        try:
            from huggingface_hub import repo_info
        except ImportError as exc:  # pragma: no cover — dev extra
            raise RuntimeError(
                "huggingface_hub not installed; run "
                "`pip install 'datasets>=2.18' huggingface_hub`"
            ) from exc
        info = repo_info(repo_id=CLEANED_DATASET_ID, repo_type="dataset")
        self.revision = info.sha

    def load_dataset(self, split: str = "S") -> Iterable[LMESession]:
        """Stream LMESessions out of ``longmemeval_s_cleaned.json``.

        The ``split`` argument is validated immediately, at call time. The
        download and JSON parsing stay lazy: they run when the returned
        iterator is first advanced.

        Args:
            split: Must be ``"S"``; the cleaned dataset ships the S split
                only.

        Returns:
            A lazy iterator of ``LMESession`` objects.

        Raises:
            ValueError: ``split`` is anything other than ``"S"``.
        """
        # Validate here rather than inside the generator: a generator body
        # does not execute until the first ``next()``, which would defer the
        # error until the caller starts iterating.
        if split != "S":
            raise ValueError(
                f"unknown LongMemEval cleaned split {split!r}; "
                "the cleaned dataset ships only the 'S' split"
            )
        return self._iter_sessions(split)

    def _iter_sessions(self, split: str) -> Iterable[LMESession]:
        """Download the pinned JSON file and yield one LMESession per session.

        Raises ``RuntimeError`` on first iteration when ``huggingface_hub``
        is not installed.
        """
        try:
            from huggingface_hub import hf_hub_download
        except ImportError as exc:  # pragma: no cover — dev extra
            raise RuntimeError(
                "huggingface_hub not installed; run "
                "`pip install 'datasets>=2.18' huggingface_hub`"
            ) from exc

        print(
            f"[LongMemEval-cleaned] resolving split={split} "
            f"revision={self.revision} filename={CLEANED_FILENAME}",
            file=sys.stderr,
            flush=True,
        )
        path = hf_hub_download(
            repo_id=CLEANED_DATASET_ID,
            filename=CLEANED_FILENAME,
            repo_type="dataset",
            revision=self.revision,
        )
        with open(path, "r", encoding="utf-8") as f:
            rows = json.load(f)

        for row in rows:
            # ``question_id`` and ``question`` are required (KeyError if
            # absent); the remaining fields fall back to defaults.
            qid = row["question_id"]
            question = row["question"]
            question_type = str(row.get("question_type", "unknown"))
            answer_session_ids = list(row.get("answer_session_ids", []))
            haystack_session_ids: list[str] = list(
                row.get("haystack_session_ids", [])
            )
            haystack_sessions: list[list[dict]] = list(
                row.get("haystack_sessions", [])
            )

            # Emit one LMESession per haystack session; attach the eval
            # query to every one so the orchestrator can run ONE recall
            # per row after inserting all haystack turns. Matches the
            # raw adapter's emission shape exactly.
            # NOTE: ``zip`` stops at the shorter of the two lists, so ids
            # or sessions without a counterpart are not emitted.
            for sess_id, turns in zip(
                haystack_session_ids, haystack_sessions
            ):
                yield LMESession(
                    session_id=sess_id,
                    turns=list(turns),
                    queries=[
                        {
                            "query": question,
                            "question_id": qid,
                            "question_type": question_type,
                            "relevant_turn_ids": answer_session_ids,
                            "is_gold_session": sess_id in answer_session_ids,
                        }
                    ],
                )
|
|
|
|
|
|
# Public API of this module: the two dataset constants and the adapter class.
__all__ = [
    "CLEANED_DATASET_ID",
    "CLEANED_FILENAME",
    "CleanedLongMemEvalAdapter",
]
|