Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
163
bench/adapters/longmemeval_cleaned.py
Normal file
163
bench/adapters/longmemeval_cleaned.py
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
"""Cleaned-dataset adapter for LongMemEval-S — D-02.
|
||||
|
||||
Mempalace's reference benchmark uses ``xiaowu0162/longmemeval-cleaned``
|
||||
(commit-pinned via ``huggingface_hub.repo_info()``). This adapter mirrors
|
||||
the ``LongMemEvalAdapter`` shape from ``bench/adapters/longmemeval.py`` so
|
||||
the orchestrator (`bench/longmemeval_blind.py`) can swap raw vs cleaned
|
||||
purely via the ``--dataset {cleaned, raw}`` CLI flag.
|
||||
|
||||
## Boundary
|
||||
|
||||
This adapter is NEW (Phase 9 Task 1). The raw adapter at
|
||||
``bench/adapters/longmemeval.py`` is byte-identical to its v2 state — Phase
|
||||
9 does NOT modify the v1/v2 baseline path. ``--dataset raw`` continues to
|
||||
load the raw revision ``2ec2a557f339...``; ``--dataset cleaned`` (the new
|
||||
v3 default) routes to this module.
|
||||
|
||||
## Pinning discipline
|
||||
|
||||
Phase 9 LOCKED: pin via ``huggingface_hub.repo_info(...)``, NEVER
|
||||
hardcode a magic string. The cleaned dataset's HEAD SHA is auto-discovered
|
||||
on first instantiation and stored on ``self.revision`` so v3 output JSON
|
||||
records exactly which dataset variant was measured. On reproducer runs,
|
||||
the caller may pass ``revision=`` to pin a specific historical SHA.
|
||||
|
||||
## Schema
|
||||
|
||||
The cleaned dataset uses the same row schema as the raw dataset (cleaned
|
||||
removed bad evidence; field names preserved). Each row in
|
||||
``longmemeval_s_cleaned.json`` is:
|
||||
|
||||
{
|
||||
"question_id": str,
|
||||
"question_type": str,
|
||||
"question": str,
|
||||
"haystack_session_ids": list[str],
|
||||
"haystack_sessions": list[list[{"role","content"}]],
|
||||
"answer_session_ids": list[str],
|
||||
}
|
||||
|
||||
The adapter emits one ``LMESession`` per haystack session with the eval
|
||||
query attached (matching the raw adapter's emission shape exactly), so
|
||||
``main()`` in ``longmemeval_blind.py`` does NOT branch on adapter type —
|
||||
it groups LMESessions by ``question_id`` either way.
|
||||
|
||||
## Split support
|
||||
|
||||
Only ``split="S"`` is supported. The cleaned dataset ships only the S split
|
||||
as ``longmemeval_s_cleaned.json``; M and oracle remain in the raw dataset.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from typing import Iterable
|
||||
|
||||
from bench.adapters.longmemeval import LMESession
|
||||
|
||||
|
||||
CLEANED_DATASET_ID: str = "xiaowu0162/longmemeval-cleaned"
CLEANED_FILENAME: str = "longmemeval_s_cleaned.json"

# Shared by both lazy ``huggingface_hub`` imports below so the install hint
# cannot drift between them.
_HF_HUB_MISSING_MSG: str = (
    "huggingface_hub not installed; run "
    "`pip install 'datasets>=2.18' huggingface_hub`"
)


class CleanedLongMemEvalAdapter:
    """Loads ``xiaowu0162/longmemeval-cleaned`` via ``huggingface_hub``.

    Mirrors ``LongMemEvalAdapter`` so ``bench/longmemeval_blind.py`` can
    treat them interchangeably (same ``LMESession`` iterator shape).

    Pin discipline: ``revision`` defaults to the current HEAD SHA of the
    HuggingFace dataset, auto-discovered via ``repo_info()``. Pass an
    explicit revision to reproduce a historical run.
    """

    DATASET_ID: str = CLEANED_DATASET_ID

    def __init__(self, revision: str | None = None) -> None:
        """Record the dataset revision this adapter will read from.

        When ``revision`` is given it is stored as-is and no network call
        is made. Otherwise the dataset's current SHA is looked up with
        ``huggingface_hub.repo_info`` and stored on ``self.revision``.

        Raises ``RuntimeError`` if ``huggingface_hub`` is not importable, or
        if the lookup returns no SHA (an unpinned run must not proceed
        silently).
        """
        if revision is not None:
            self.revision = revision
            return
        try:
            # Imported lazily so an explicit ``revision=`` needs no extra.
            from huggingface_hub import repo_info
        except ImportError as exc:  # pragma: no cover — dev extra
            raise RuntimeError(_HF_HUB_MISSING_MSG) from exc
        info = repo_info(repo_id=CLEANED_DATASET_ID, repo_type="dataset")
        if not info.sha:
            # A missing SHA would leave ``revision`` unset and let the
            # download fall back to the default branch, i.e. an unpinned run.
            raise RuntimeError(
                f"could not resolve a HEAD revision for dataset "
                f"{CLEANED_DATASET_ID!r}; pass revision= explicitly"
            )
        self.revision = info.sha

    def load_dataset(self, split: str = "S") -> Iterable[LMESession]:
        """Stream LMESessions out of ``longmemeval_s_cleaned.json``.

        Only ``split="S"`` is supported (the cleaned dataset ships the S
        split only). Raises ``ValueError`` on any other split value; the
        check runs at call time, before any iteration starts.

        The returned iterable is lazy: the file is downloaded and parsed
        on the first item request, not when this method returns.
        """
        # Validate here rather than inside the generator: a generator body
        # does not run until its first ``next()``, which would defer the
        # error past the call site.
        if split != "S":
            raise ValueError(
                f"unknown LongMemEval cleaned split {split!r}; "
                "the cleaned dataset ships only the 'S' split"
            )
        return self._iter_sessions(split)

    def _iter_sessions(self, split: str) -> Iterable[LMESession]:
        """Download the pinned JSON file and yield one LMESession per
        haystack session of every row. ``split`` is only used for logging.
        """
        try:
            from huggingface_hub import hf_hub_download
        except ImportError as exc:  # pragma: no cover — dev extra
            raise RuntimeError(_HF_HUB_MISSING_MSG) from exc

        # Progress line goes to stderr so stdout stays free for results.
        print(
            f"[LongMemEval-cleaned] resolving split={split} "
            f"revision={self.revision} filename={CLEANED_FILENAME}",
            file=sys.stderr,
            flush=True,
        )
        path = hf_hub_download(
            repo_id=CLEANED_DATASET_ID,
            filename=CLEANED_FILENAME,
            repo_type="dataset",
            revision=self.revision,
        )
        with open(path, "r", encoding="utf-8") as f:
            rows = json.load(f)

        for row in rows:
            # ``question_id`` and ``question`` are required (KeyError if
            # absent); the remaining fields fall back to neutral defaults.
            qid = row["question_id"]
            question = row["question"]
            question_type = str(row.get("question_type", "unknown"))
            answer_session_ids = list(row.get("answer_session_ids", []))
            haystack_session_ids: list[str] = list(
                row.get("haystack_session_ids", [])
            )
            haystack_sessions: list[list[dict]] = list(
                row.get("haystack_sessions", [])
            )

            # Emit one LMESession per haystack session; attach the eval
            # query to every one so the orchestrator can run ONE recall
            # per row after inserting all haystack turns. Matches the
            # raw adapter's emission shape exactly.
            # NOTE: ``zip`` stops at the shorter list, so a row whose id
            # and session lists differ in length drops the surplus.
            for sess_id, turns in zip(
                haystack_session_ids, haystack_sessions
            ):
                yield LMESession(
                    session_id=sess_id,
                    turns=list(turns),
                    queries=[
                        {
                            "query": question,
                            "question_id": qid,
                            "question_type": question_type,
                            "relevant_turn_ids": answer_session_ids,
                            "is_gold_session": sess_id in answer_session_ids,
                        }
                    ],
                )
|
||||
|
||||
|
||||
# Names exported by ``from bench.adapters.longmemeval_cleaned import *``.
__all__ = [
    "CLEANED_DATASET_ID",
    "CLEANED_FILENAME",
    "CleanedLongMemEvalAdapter",
]
|
||||
Loading…
Add table
Add a link
Reference in a new issue