Initial release: iai-mcp v0.1.0

Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
2026-05-06 01:04:47 -07:00 · 2026-05-06 01:04:47 -07:00 · f6b876fbe7
commit f6b876fbe7
332 changed files with 97258 additions and 0 deletions
--- a/bench/adapters/longmemeval_cleaned.py
+++ b/bench/adapters/longmemeval_cleaned.py
@ -0,0 +1,163 @@
+"""Cleaned-dataset adapter for LongMemEval-S — D-02.
+
+Mempalace's reference benchmark uses ``xiaowu0162/longmemeval-cleaned``
+(commit-pinned via ``huggingface_hub.repo_info()``). This adapter mirrors
+the ``LongMemEvalAdapter`` shape from ``bench/adapters/longmemeval.py`` so
+the orchestrator (`bench/longmemeval_blind.py`) can swap raw vs cleaned
+purely via the ``--dataset {cleaned, raw}`` CLI flag.
+
+## Boundary
+
+This adapter is NEW (Phase 9 Task 1). The raw adapter at
+``bench/adapters/longmemeval.py`` is byte-identical to its v2 state — Phase
+9 does NOT modify the v1/v2 baseline path. ``--dataset raw`` continues to
+load the raw revision ``2ec2a557f339...``; ``--dataset cleaned`` (the new
+v3 default) routes to this module.
+
+## Pinning discipline
+
+Phase 9 LOCKED: pin via ``huggingface_hub.repo_info(...)``, NEVER
+hardcode a magic string. The cleaned dataset's HEAD SHA is auto-discovered
+on first instantiation and stored on ``self.revision`` so v3 output JSON
+records exactly which dataset variant was measured. On reproducer runs,
+the caller may pass ``revision=`` to pin a specific historical SHA.
+
+## Schema
+
+The cleaned dataset uses the same row schema as the raw dataset (cleaned
+removed bad evidence; field names preserved). Each row in
+``longmemeval_s_cleaned.json`` is:
+
+    {
+      "question_id":          str,
+      "question_type":        str,
+      "question":             str,
+      "haystack_session_ids": list[str],
+      "haystack_sessions":    list[list[{"role","content"}]],
+      "answer_session_ids":   list[str],
+    }
+
+The adapter emits one ``LMESession`` per haystack session with the eval
+query attached (matching the raw adapter's emission shape exactly), so
+``main()`` in ``longmemeval_blind.py`` does NOT branch on adapter type —
+it groups LMESessions by ``question_id`` either way.
+
+## Split support
+
+Only ``split="S"`` is supported. The cleaned dataset ships only the S split
+as ``longmemeval_s_cleaned.json``; M and oracle remain in the raw dataset.
+"""
+from __future__ import annotations
+
+import json
+import sys
+from typing import Iterable
+
+from bench.adapters.longmemeval import LMESession
+
+
+# HuggingFace dataset repo id; passed to both ``repo_info`` (revision
+# discovery) and ``hf_hub_download`` (file fetch) below.
+CLEANED_DATASET_ID: str = "xiaowu0162/longmemeval-cleaned"
+# Name of the JSON file inside that repo that holds the cleaned S split.
+CLEANED_FILENAME: str = "longmemeval_s_cleaned.json"
+
+
+class CleanedLongMemEvalAdapter:
+    """Loads ``xiaowu0162/longmemeval-cleaned`` via ``huggingface_hub``.
+
+    Mirrors ``LongMemEvalAdapter`` so ``bench/longmemeval_blind.py`` can
+    treat them interchangeably (same ``LMESession`` iterator shape).
+
+    Pin discipline: ``revision`` defaults to the current HEAD SHA of the
+    HuggingFace dataset, auto-discovered via ``repo_info()``. Pass an
+    explicit revision to reproduce a historical run.
+    """
+
+    DATASET_ID: str = CLEANED_DATASET_ID
+
+    def __init__(self, revision: str | None = None) -> None:
+        if revision is not None:
+            self.revision = revision
+            return
+        try:
+            from huggingface_hub import repo_info
+        except ImportError as exc:  # pragma: no cover — dev extra
+            raise RuntimeError(
+                "huggingface_hub not installed; run "
+                "`pip install 'datasets>=2.18' huggingface_hub`"
+            ) from exc
+        info = repo_info(repo_id=CLEANED_DATASET_ID, repo_type="dataset")
+        self.revision = info.sha
+
+    def load_dataset(self, split: str = "S") -> Iterable[LMESession]:
+        """Stream LMESessions out of ``longmemeval_s_cleaned.json``.
+
+        Only ``split="S"`` is supported (the cleaned dataset ships the S
+        split only). Raises ``ValueError`` on any other split value.
+        """
+        if split != "S":
+            raise ValueError(
+                f"unknown LongMemEval cleaned split {split!r}; "
+                f"the cleaned dataset ships only the 'S' split"
+            )
+
+        try:
+            from huggingface_hub import hf_hub_download
+        except ImportError as exc:  # pragma: no cover — dev extra
+            raise RuntimeError(
+                "huggingface_hub not installed; run "
+                "`pip install 'datasets>=2.18' huggingface_hub`"
+            ) from exc
+
+        print(
+            f"[LongMemEval-cleaned] resolving split={split} "
+            f"revision={self.revision} filename={CLEANED_FILENAME}",
+            file=sys.stderr,
+            flush=True,
+        )
+        path = hf_hub_download(
+            repo_id=CLEANED_DATASET_ID,
+            filename=CLEANED_FILENAME,
+            repo_type="dataset",
+            revision=self.revision,
+        )
+        with open(path, "r", encoding="utf-8") as f:
+            rows = json.load(f)
+
+        for row in rows:
+            qid = row["question_id"]
+            question = row["question"]
+            question_type = str(row.get("question_type", "unknown"))
+            answer_session_ids = list(row.get("answer_session_ids", []))
+            haystack_session_ids: list[str] = list(
+                row.get("haystack_session_ids", [])
+            )
+            haystack_sessions: list[list[dict]] = list(
+                row.get("haystack_sessions", [])
+            )
+
+            # Emit one LMESession per haystack session; attach the eval
+            # query to every one so the orchestrator can run ONE recall
+            # per row after inserting all haystack turns. Matches the
+            # raw adapter's emission shape exactly.
+            for sess_id, turns in zip(
+                haystack_session_ids, haystack_sessions
+            ):
+                yield LMESession(
+                    session_id=sess_id,
+                    turns=list(turns),
+                    queries=[
+                        {
+                            "query": question,
+                            "question_id": qid,
+                            "question_type": question_type,
+                            "relevant_turn_ids": answer_session_ids,
+                            "is_gold_session": sess_id in answer_session_ids,
+                        }
+                    ],
+                )
+
+
+# Public names of this module: the two dataset constants and the adapter.
+__all__ = [
+    "CLEANED_DATASET_ID",
+    "CLEANED_FILENAME",
+    "CleanedLongMemEvalAdapter",
+]