"""LongMemEval adapter — / external-bench gate. Wires the public LongMemEval memory benchmark (Xie et al., 2024) into the IAI-MCP public API (MemoryStore.insert + retrieve.recall). Strict blind-run discipline: no per-dataset tuning, no field-mapping optimisation, no embedder finetune. The adapter is the ONLY translation layer; everything downstream is stock IAI-MCP. ## Dataset source The plan text (05-11-PLAN.md) cites ``lxucs/longmemeval`` — that repo does NOT exist on HuggingFace Hub (returns 401/Not Found). The canonical public mirror shipped by the paper authors is ``xiaowu0162/longmemeval``. Discovered mid-execution; documented as a Rule 3 deviation in the Plan 05-11 SUMMARY. DATASET_ID points at the live mirror; PINNED_REVISION is the 40-char commit hash resolved at execution time so numbers reproduce. ## Row schema (longmemeval_s split, 500 rows) Each row is: { "question_id": str (8-hex), "question_type": str (single-session-user, multi-session, ...), "question": str, "answer": str, "question_date": str ("YYYY/MM/DD (Day) HH:MM"), "haystack_dates": list[str], "haystack_session_ids": list[str] # len ~54 "haystack_sessions": list[list[{"role","content"}]] "answer_session_ids": list[str] # gold evidence (len typically 1) } ## LMESession mapping (Plan 05-11 deviation, Rule 1/3) The plan's interface says "one session -> many queries". The actual dataset is "one query -> many haystack sessions". We therefore flatten each row to a list of LMESession objects — one per haystack session — with the single eval query attached to every session in the row (so bench/longmemeval_blind.py can iterate LMESessions, insert haystack turns, and run the query against the store). The orchestrator (not the adapter) scores at the standard LongMemEval session-ID granularity. The ``score_r_at_k`` method in this module implements the plan's literal formula ``|retrieved ∩ relevant| / |relevant|`` over UUIDs — it is unit- testable and matches the Test 4 contract. The orchestrator also reports session-level R@k using the dataset's native session_id gold. """ from __future__ import annotations import os import sys from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from typing import Iterable from uuid import UUID, uuid4 # Local imports kept lazy-friendly by using a distinct alias so tests can # mock ``bench.adapters.longmemeval.retrieve_recall`` without touching the # production retrieve module wholesale. from iai_mcp.retrieve import recall as retrieve_recall from iai_mcp.embed import embedder_for_store from iai_mcp.types import MemoryRecord DATASET_ID: str = "xiaowu0162/longmemeval" # Pinned at execution time (2026-04-20) against the # canonical LongMemEval HuggingFace mirror. Reproducers MUST load this # exact revision or disclose the drift. PINNED_REVISION: str = "2ec2a557f339b6c0369619b1ed5793734cc87533" # Split -> filename (the repo ships configs ``longmemeval_s``, # ``longmemeval_m``, ``longmemeval_oracle``). runs the S split. _SPLIT_FILENAMES: dict[str, str] = { "S": "longmemeval_s", "M": "longmemeval_m", "oracle": "longmemeval_oracle", } @dataclass class LMESession: """One flattened haystack session + its attached eval query. See module docstring for why this differs from the plan's original "one session many queries" spec. """ session_id: str turns: list[dict] # [{"role": "user"|"assistant", "content": str}] queries: list[dict] # [{"query": str, "relevant_turn_ids": list[str]}] class LongMemEvalAdapter: """Public API: load_dataset / session_to_inserts / query_to_recall / score_r_at_k.""" DATASET_ID: str = DATASET_ID PINNED_REVISION: str = PINNED_REVISION def __init__(self, revision: str | None = None) -> None: self.revision = revision or self.PINNED_REVISION # --------------------------------------------------------------- load def load_dataset(self, split: str = "S") -> Iterable[LMESession]: """Stream LMESessions out of the LongMemEval- JSON file. Uses ``huggingface_hub.hf_hub_download`` to grab the split file at the pinned revision (the datasets library's JSON auto-detection breaks on this repo because the files ship without a ``.json`` extension — see README). Falls back to raising a clear error if HuggingFace is unreachable and nothing is cached. """ import json filename = _SPLIT_FILENAMES.get(split) if filename is None: raise ValueError( f"unknown LongMemEval split {split!r}; " f"expected one of {sorted(_SPLIT_FILENAMES)}" ) try: from huggingface_hub import hf_hub_download except ImportError as exc: # pragma: no cover — dev extra raise RuntimeError( "huggingface_hub not installed; run " "`pip install 'datasets>=2.18' huggingface_hub`" ) from exc print( f"[LongMemEval] resolving split={split} " f"revision={self.revision} filename={filename}", file=sys.stderr, flush=True, ) path = hf_hub_download( repo_id=self.DATASET_ID, filename=filename, repo_type="dataset", revision=self.revision, ) with open(path, "r", encoding="utf-8") as f: rows = json.load(f) for row in rows: qid = row["question_id"] question = row["question"] # bench/lme500: capture question_type for per-type breakdown. question_type = str(row.get("question_type", "unknown")) answer_session_ids = list(row.get("answer_session_ids", [])) haystack_session_ids: list[str] = list( row.get("haystack_session_ids", []) ) haystack_sessions: list[list[dict]] = list( row.get("haystack_sessions", []) ) # Emit one LMESession per haystack session; attach the eval # query to every one so the orchestrator can run ONE recall # per row after inserting all haystack turns. # # The "relevant_turn_ids" field stays session-id-based (the # paper's native gold). We record which session is "gold" so # the orchestrator can score hits. for sess_id, turns in zip( haystack_session_ids, haystack_sessions ): yield LMESession( session_id=sess_id, turns=list(turns), queries=[ { "query": question, "question_id": qid, "question_type": question_type, # Gold at session granularity; the orchestrator # decides how to use it. score_r_at_k in this # adapter takes whatever the caller passes. "relevant_turn_ids": answer_session_ids, "is_gold_session": sess_id in answer_session_ids, } ], ) # ------------------------------------------------------- session_to_inserts def session_to_inserts(self, session: LMESession) -> list[MemoryRecord]: """Map each turn to one MemoryRecord (tier=episodic, literal_surface=content). Produces a placeholder embedding sized to the default embed dim. The blind-run orchestrator overrides the embedding with the real one from ``embedder_for_store(store).embed(text)`` before calling ``store.insert`` — this keeps ``session_to_inserts`` cheap for unit tests that don't want to load sentence-transformers. """ from iai_mcp.embed import Embedder dim = Embedder.DEFAULT_DIM records: list[MemoryRecord] = [] now = datetime.now(timezone.utc) for turn in session.turns: content = str(turn.get("content", "")) rec = MemoryRecord( id=uuid4(), tier="episodic", literal_surface=content, aaak_index="", embedding=[0.0] * dim, # placeholder; orchestrator overrides community_id=None, centrality=0.0, detail_level=2, pinned=False, stability=0.0, difficulty=0.0, last_reviewed=None, never_decay=False, never_merge=False, provenance=[], created_at=now, updated_at=now, tags=[ "longmemeval", f"role:{turn.get('role','user')}", f"session:{session.session_id}", ], language="en", ) records.append(rec) return records # ------------------------------------------------------- query_to_recall def query_to_recall(self, query: dict, store) -> list[UUID]: """Call retrieve.recall(cue_text=query['query'], k_hits=10). Returns the retrieved record ids in rank order. The orchestrator uses these ids to compute R@k. """ cue_text = str(query["query"]) embedder = embedder_for_store(store) cue_embedding = embedder.embed(cue_text) resp = retrieve_recall( store=store, cue_embedding=cue_embedding, cue_text=cue_text, session_id="longmemeval-blind", budget_tokens=1500, k_hits=10, k_anti=0, ) return [hit.record_id for hit in resp.hits] # ------------------------------------------------------- score_r_at_k def score_r_at_k( self, retrieved_ids: list, gold_turn_ids: list, k: int = 5, ) -> float: """R@k = |retrieved_top_k ∩ relevant| / |relevant|. Empty ``gold_turn_ids`` returns 1.0 (convention — avoids div-by-zero and matches the "no evidence to miss" semantics). Both lists are normalised to ``str`` so UUID vs session-id ids work. """ if not gold_turn_ids: return 1.0 top_k = retrieved_ids[: max(0, int(k))] gold_set = {str(g) for g in gold_turn_ids} hit = sum(1 for rid in top_k if str(rid) in gold_set) return hit / float(len(gold_set))