"""bench/verbatim.py -- verbatim-recall benchmark harness + diagnostics.

Simulates a session gap by inserting N pinned records, flooding the store with
`session_gap * noise_per_session` unrelated records, then retrieving each
pinned record using its own literal_surface as the cue text. Counts byte-exact
matches.

Target: >= ACCURACY_FLOOR (0.99) on pinned records -- OPS-04 / MEM-10.

Exit codes:
  - 0 if accuracy >= 0.99
  - 1 otherwise

JSON output (one line to stdout):
  {"accuracy": float, "n_records": int, "session_gap": int,
   "noise_per_session": int, "hits_exact": int, "passed": bool,
   "floor": 0.99, "noise_mode": str, "noise_seed": int,
   "skip_l0_seed": bool, "storage_direct": bool, "k": int}

Plan 05-01 (D5-01) diagnostic flags -- BENCH-ONLY (no production change):
  --skip-l0-seed       : skip _seed_l0_identity to isolate L0 crowding
                         (effect b)
  --storage-direct     : bypass recall(), call store.query_similar directly
                         (isolates provenance-write amplification, effect c)
  --n                  : override n_records (default 20)
  --gap                : override session_gap (default 20)
  --noise-per-session  : override noise_per_session (default 10)
  --k                  : override k_hits (default max(n_records + 10, 20))

Design note -- why we bypass dispatch("memory_recall"):
  The Plan-02 core.memory_recall routes non-empty stores through
  recall_for_response (Phase 8 entry-point split) which instantiates an
  Embedder() (downloads bge-small-en-v1.5 from HuggingFace on first call).
  That's fine for a real runtime but wrong for an offline bench: we need to
  measure storage-layer verbatim-recall correctness, not embedder warm-up
  latency. So we call `retrieve.recall` directly with a fixed cue embedding
  aligned with the pinned records (all-ones vector).

H-03 noise model (review finding, 2026-04-16):
  The original noise vector was [-0.5]^384, which gives cosine=-1.0 against
  the [1.0]^384 cue -- making pinned-vs-noise discrimination a geometric
  artifact rather than a measurement of the storage layer. The fix uses seeded
  numpy.random.standard_normal(EMBED_DIM) normalised to unit length. Against a
  [1.0]^384 cue the expected cosine of a random unit vector is 0 with stddev
  1/sqrt(EMBED_DIM) ~= 0.05 -- realistic noise geometry, but pinned still wins
  because cos=+1 >> cos~=0. The bench remains honest about what it measures
  (literal_surface round-trip under realistic embedding noise, given a fixed
  cue). A real bge-small-en-v1.5 bench is deferred to Phase 2.
"""
from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from uuid import uuid4

import numpy as np

from iai_mcp.core import _seed_l0_identity
from iai_mcp.retrieve import recall
from iai_mcp.store import EMBED_DIM, MemoryStore
from iai_mcp.types import MemoryRecord

ACCURACY_FLOOR = 0.99  # OPS-04
NOISE_SEED = 20260416  # fixed for reproducibility across runs / CI


def _make_pinned(text: str, dim: int = EMBED_DIM) -> MemoryRecord:
    """Build a pinned verbatim record for the benchmark.

    The record is created with detail_level=5, pinned=True, never_merge=True
    and never_decay=True, and carries an all-ones embedding of length `dim`.
    Because the bench cue is also all-ones, every pinned record has maximal
    cosine similarity to the cue simultaneously; the literal_surface match is
    the only correctness signal the bench checks.

    language="en" is set explicitly (required by the record type).

    Args:
        text: literal_surface to store byte-for-byte.
        dim: embedding length. Defaults to the module constant `EMBED_DIM`;
            `run_verbatim_bench` passes the store's own `embed_dim` so a
            pre-existing store with a different dimension (e.g. a legacy 384d
            store) works unchanged.

    Returns:
        A fresh MemoryRecord with a new random UUID.
    """
    # Single clock read so created_at == updated_at on a freshly built record.
    now = datetime.now(timezone.utc)
    return MemoryRecord(
        id=uuid4(),
        tier="semantic",
        literal_surface=text,
        aaak_index="",
        embedding=[1.0] * dim,
        community_id=None,
        centrality=0.0,
        detail_level=5,
        pinned=True,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=True,
        never_merge=True,
        provenance=[],
        created_at=now,
        updated_at=now,
        tags=["benchmark", "pinned"],
        language="en",
    )


def _random_unit_vector(rng: np.random.Generator, dim: int = EMBED_DIM) -> list[float]:
    """Return a unit-norm Gaussian vector of length `dim` as a Python list.

    Expected cosine against the [1.0]^dim cue is 0 with stddev 1/sqrt(dim)
    (~= 0.05 at 384d, ~= 0.03 at 1024d). Draws come from the caller-supplied
    seeded Generator, so a given seed reproduces identical noise.
    """
    v = rng.standard_normal(dim)
    v = v / np.linalg.norm(v)
    return v.tolist()


def _make_noise(i: int, rng: np.random.Generator, dim: int = EMBED_DIM) -> MemoryRecord:
    """Build an unpinned noise record with a random unit-vector embedding.

    H-03 honesty fix: an earlier implementation used [-0.5]^EMBED_DIM, which
    gave cosine=-1 against the cue and made pinned-vs-noise discrimination
    trivial by geometry. Seeded Gaussian unit vectors reproduce
    deterministically and approximate the orthogonality-on-average of real
    embeddings.

    language="en" is set explicitly (required by the record type).

    Args:
        i: running index embedded in the record's literal_surface.
        rng: seeded generator shared across all noise draws.
        dim: embedding length; defaults to `EMBED_DIM`.

    Returns:
        A fresh episodic MemoryRecord with a new random UUID.
    """
    # Single clock read so created_at == updated_at on a freshly built record.
    now = datetime.now(timezone.utc)
    return MemoryRecord(
        id=uuid4(),
        tier="episodic",
        literal_surface=f"unrelated session noise record #{i}: " + ("y " * 20),
        aaak_index="",
        embedding=_random_unit_vector(rng, dim=dim),
        community_id=None,
        centrality=0.0,
        detail_level=2,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        provenance=[],
        created_at=now,
        updated_at=now,
        tags=[],
        language="en",
    )


def run_verbatim_bench(
    store: MemoryStore | None = None,
    n_records: int = 20,
    session_gap: int = 20,
    noise_per_session: int = 10,
    seed: int = NOISE_SEED,
    *,
    skip_l0_seed: bool = False,
    storage_direct: bool = False,
    k: int | None = None,
) -> dict:
    """Run the verbatim-recall benchmark.

    Parameters:
        store: optional; isolated tmp_path store in tests, default
            MemoryStore() in the CLI.
        n_records: how many pinned records to store and recall.
        session_gap: how many "sessions" of noise to interpose between write
            and recall.
        noise_per_session: noise records per simulated session.
        seed: RNG seed for noise vectors (H-03: reproducibility across runs).
        skip_l0_seed: D5-01 effect (b) isolation -- skip the L0 identity seed
            so pinned records are not competed against by a fixed-embedding
            identity record. BENCH-SCOPE ONLY; production _seed_l0_identity is
            unchanged.
        storage_direct: D5-01 effect (c) isolation -- bypass retrieve.recall()
            and call store.query_similar directly, so the per-hit provenance
            write amplification is removed from the hot loop. BENCH-SCOPE
            ONLY; production recall() is unchanged.
        k: override the top-k passed into recall(k_hits=K) or
            query_similar(k=K); None keeps the historic default of
            max(n_records + 10, 20).

    Returns:
        A dict with the keys documented in the module docstring. `accuracy`
        is hits_exact / n_records, or 0.0 when n_records is not positive.
    """
    s = store if store is not None else MemoryStore()
    if not skip_l0_seed:
        _seed_l0_identity(s)

    # Consult the store's actual embedding dim. An existing Phase 1 store may
    # still have 384d records pre-D-35-migration; a fresh store has the
    # default (1024d). Match either transparently.
    dim = s.embed_dim

    pinned_texts = [
        f"Alice said on day {i}: verbatim phrase #{i}-{'x' * 10}"
        for i in range(n_records)
    ]
    pinned_records = [_make_pinned(t, dim=dim) for t in pinned_texts]
    for r in pinned_records:
        s.insert(r)

    # Simulate session_gap * noise_per_session unrelated records.
    # H-03: seeded RNG shared across every noise draw so results are
    # reproducible.
    rng = np.random.default_rng(seed)
    for session_idx in range(session_gap):
        for j in range(noise_per_session):
            s.insert(_make_noise(session_idx * noise_per_session + j, rng, dim=dim))

    cue_emb = [1.0] * dim
    # k must be >= n_records for every pinned record to have a chance of
    # surfacing. Plus a buffer for the L0 seed + anti-hits tail, so we
    # retrieve a generous top-k.
    effective_k = k if k is not None else max(n_records + 10, 20)

    hits_exact = 0
    for text in pinned_texts:
        if storage_direct:
            # D5-01 (c): bypass recall() -> no per-hit provenance write
            # amplification.
            # NOTE(review): cue_emb and effective_k are loop-invariant, so
            # every iteration issues the same query. Kept per-record to mirror
            # the recall() path -- confirm before hoisting.
            raw = s.query_similar(cue_emb, k=effective_k)
            literal_surfaces = [rec.literal_surface for rec, _score in raw]
        else:
            # retrieve.recall now defaults to mode='verbatim'
            # (conservative North-Star fallback). The bench's _make_pinned
            # uses tier='semantic' which the verbatim filter would drop.
            # The bench is measuring "verbatim TEXT exact-match recall under
            # noise" -- that is independent of the cue-router's
            # verbatim/concept mode (the bench uses synthetic cues, not
            # classifier-tagged natural-language queries). Pin mode='concept'
            # so the bench measures what it has always measured.
            resp = recall(
                store=s,
                cue_embedding=cue_emb,
                cue_text=text,
                session_id="bench-verbatim",
                budget_tokens=5000,
                k_hits=effective_k,
                k_anti=3,
                mode="concept",
            )
            literal_surfaces = [h.literal_surface for h in resp.hits]
        # Byte-exact match of the stored text is the only success criterion.
        if text in literal_surfaces:
            hits_exact += 1

    # Guard the division so n_records <= 0 reports 0.0 instead of raising.
    accuracy = hits_exact / n_records if n_records > 0 else 0.0
    return {
        "accuracy": accuracy,
        "n_records": n_records,
        "session_gap": session_gap,
        "noise_per_session": noise_per_session,
        "hits_exact": hits_exact,
        "passed": accuracy >= ACCURACY_FLOOR,
        "floor": ACCURACY_FLOOR,
        "noise_mode": "random-unit-vectors",
        "noise_seed": seed,
        # diagnostic traceability keys.
        "skip_l0_seed": bool(skip_l0_seed),
        "storage_direct": bool(storage_direct),
        "k": int(effective_k),
    }


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the bench and its D5-01 diagnostic flags."""
    parser = argparse.ArgumentParser(
        prog="bench.verbatim",
        description="OPS-04 / verbatim recall benchmark + diagnostics",
    )
    parser.add_argument(
        "--skip-l0-seed",
        action="store_true",
        help="D5-01 diagnostic: skip _seed_l0_identity to isolate L0 crowding effect",
    )
    parser.add_argument(
        "--storage-direct",
        action="store_true",
        help="D5-01 diagnostic: bypass recall(), call store.query_similar directly",
    )
    parser.add_argument(
        "--n",
        "--n-records",
        dest="n_records",
        type=int,
        default=20,
        help="pinned record count (default 20)",
    )
    parser.add_argument(
        "--gap",
        "--session-gap",
        dest="session_gap",
        type=int,
        default=20,
        help="session gap -- how many noise sessions between writes and recall (default 20)",
    )
    parser.add_argument(
        "--noise-per-session",
        type=int,
        default=10,
        help="noise records per simulated session (default 10)",
    )
    parser.add_argument(
        "--k",
        type=int,
        default=None,
        help="override k_hits (default: max(n_records + 10, 20))",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: run the bench, print one JSON line, return exit code.

    Args:
        argv: argument list to parse; None lets argparse read sys.argv.

    Returns:
        0 when the result's "passed" flag is true, 1 otherwise.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)
    # No store is passed, so run_verbatim_bench constructs a default
    # MemoryStore().
    result = run_verbatim_bench(
        n_records=args.n_records,
        session_gap=args.session_gap,
        noise_per_session=args.noise_per_session,
        skip_l0_seed=args.skip_l0_seed,
        storage_direct=args.storage_direct,
        k=args.k,
    )
    print(json.dumps(result))
    return 0 if result["passed"] else 1


if __name__ == "__main__":
    sys.exit(main())