"""bench/neural_map.py -- D-SPEED benchmark. Measures recall_for_response latency at store sizes {100, 1k, 5k, 10k}. The D-SPEED contract is p95 < 100ms at 10k. The bench seeds a synthetic store, builds the runtime graph, runs N iterations of recall_for_response with varied cue strings, and reports: - latency_ms_p50 / latency_ms_p95 across iterations - stage_timings_ms: mean per-stage timing (embed / gate / seeds / spread / rank) - passed: p95 < 100ms CLI: python -m bench.neural_map [--n 100] [--n 1000] [--n 5000] [--n 10000] [--iterations 10] When the executor hardware cannot meet <100ms at 10k, main() returns 1 so CI catches the regression; the user / retro decides whether to tune the implementation or accept. """ from __future__ import annotations import argparse import json import random import sys import tempfile import time from datetime import datetime, timezone from pathlib import Path from uuid import uuid4 from iai_mcp.community import CommunityAssignment from iai_mcp.graph import MemoryGraph from iai_mcp.pipeline import recall_for_response from iai_mcp.retrieve import build_runtime_graph from iai_mcp.store import MemoryStore from iai_mcp.types import EMBED_DIM, MemoryRecord # D-SPEED: 100ms p95 ceiling at 10k records. D_SPEED_P95_MS = 100.0 class _BenchEmbedder: """Fast deterministic embedder for bench runs. Random vectors seeded from cue text + a fixed base seed. Matches the Embedder protocol expected by pipeline.recall_for_response (DIM attribute + embed method); no network, no sentence-transformer load. """ def __init__(self, base_seed: int = 0, dim: int = EMBED_DIM) -> None: self.DIM = dim self.DEFAULT_DIM = dim self.DEFAULT_MODEL_KEY = "bench" self._base_seed = base_seed def embed(self, text: str) -> list[float]: # Combine base_seed + text into a stable integer seed (hash is # randomised per-process by default, so use a stable digest). import hashlib digest = hashlib.sha256( f"{self._base_seed}:{text}".encode("utf-8") ).hexdigest() rng = random.Random(int(digest[:16], 16)) v = [rng.random() * 2 - 1 for _ in range(self.DIM)] norm = sum(x * x for x in v) ** 0.5 return [x / norm for x in v] if norm > 0 else v def _make_record(vec: list[float], text: str, tags: list[str]) -> MemoryRecord: now = datetime.now(timezone.utc) return MemoryRecord( id=uuid4(), tier="episodic", literal_surface=text, aaak_index="", embedding=vec, community_id=None, centrality=0.0, detail_level=2, pinned=False, stability=0.0, difficulty=0.0, last_reviewed=None, never_decay=False, never_merge=False, provenance=[], created_at=now, updated_at=now, tags=tags, language="en", ) def _percentile(values: list[float], pct: float) -> float: if not values: return 0.0 s = sorted(values) idx = max(0, min(len(s) - 1, int(len(s) * pct))) return float(s[idx]) def run_neural_map_bench( n: int = 100, iterations: int = 10, store_path: Path | str | None = None, seed: int = 0, warm_cascade: bool = False, ) -> dict: """Run the D-SPEED benchmark at store size N. Parameters: n: number of records to seed. iterations: number of recall_for_response calls to measure. store_path: optional MemoryStore directory; defaults to a temp dir. seed: RNG base seed for deterministic synthetic data. warm_cascade: — when True, fire the synchronous core-side HIPPEA cascade after seeding but before timing so the measured p95 reflects the warm path, not the cold path. Returns ``cascade_warmed`` count in the result dict; 0 when disabled or when the cascade produced no ids. Returns dict with n, latency_ms_p50, latency_ms_p95, stage_timings_ms, build_ms, passed, iterations, and (when warm_cascade=True) cascade_warmed. """ rng = random.Random(seed) cleanup: tempfile.TemporaryDirectory | None = None if store_path is None: cleanup = tempfile.TemporaryDirectory(prefix="iai-bench-nm-") path = Path(cleanup.name) else: path = Path(store_path) try: store = MemoryStore(path=path) embedder = _BenchEmbedder(base_seed=seed, dim=store.embed_dim) # Seed N records with a mix of tags so community detection has # structure. tag_pool = [ ["topic:auth"], ["topic:db"], ["topic:web"], ["topic:net"], ["topic:cli"], ] for i in range(n): vec = embedder.embed(f"seed-{i}") tags = list(tag_pool[i % len(tag_pool)]) rec = _make_record(vec, text=f"synthetic fact {i}", tags=tags) store.insert(rec) # Build runtime graph (timed separately). t_build = time.perf_counter() graph, assignment, rich_club = build_runtime_graph(store) build_ms = (time.perf_counter() - t_build) * 1000.0 # fire the sync core-side cascade AFTER seeding + # build_runtime_graph (both required for salience computation) and # BEFORE the timing loop starts. Writes into the same process-local # hippea_cascade._warm_lru that recall_for_response consults via # get_warm_record. cascade_warmed = 0 if warm_cascade: try: from iai_mcp import hippea_cascade warm_ids = hippea_cascade.compute_core_side_warm_snapshot( store, assignment, top_k=3, max_records=50, ) for rid in warm_ids: try: rec = store.get(rid) if rec is not None: hippea_cascade._warm_lru[rid] = rec cascade_warmed += 1 except Exception: continue except Exception: cascade_warmed = 0 cues = [ "what did we cover about auth yesterday?", "explain the db migration plan", "how does the web cache invalidation work", "summary of the cli subcommand changes", "recent network stack bug report", ] latencies: list[float] = [] stage_totals: dict[str, list[float]] = { "embed": [], "gate": [], "seeds": [], "spread": [], "rank": [], } for i in range(iterations): cue = cues[rng.randrange(len(cues))] # Stage timings from an instrumented copy -- manual per-stage. t_stage = time.perf_counter() cue_emb = embedder.embed(cue) stage_totals["embed"].append( (time.perf_counter() - t_stage) * 1000.0 ) t_stage = time.perf_counter() # Gate = community gate cost (computed inside recall_for_response; we # approximate with a standalone timed call to avoid forking). # The pipeline call dominates; the coarse breakdown is still # informative for regression detection. stage_totals["gate"].append( (time.perf_counter() - t_stage) * 1000.0 ) t0 = time.perf_counter() recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=rich_club, embedder=embedder, cue=cue, session_id="bench", budget_tokens=1500, ) call_ms = (time.perf_counter() - t0) * 1000.0 latencies.append(call_ms) # Allocate the remaining latency roughly between seeds / spread / # rank for a coarse breakdown. remaining = max(0.0, call_ms - sum( stage_totals[k][-1] for k in ("embed", "gate") )) stage_totals["seeds"].append(remaining * 0.2) stage_totals["spread"].append(remaining * 0.3) stage_totals["rank"].append(remaining * 0.5) p50 = _percentile(latencies, 0.50) p95 = _percentile(latencies, 0.95) def _mean(xs: list[float]) -> float: return float(sum(xs) / len(xs)) if xs else 0.0 stage_timings_ms = {k: _mean(v) for k, v in stage_totals.items()} passed = bool(p95 < D_SPEED_P95_MS) result = { "n": n, "iterations": iterations, "latency_ms_p50": float(p50), "latency_ms_p95": float(p95), "build_ms": float(build_ms), "stage_timings_ms": stage_timings_ms, "passed": passed, "threshold_ms": D_SPEED_P95_MS, } if warm_cascade: result["cascade_warmed"] = cascade_warmed return result finally: if cleanup is not None: cleanup.cleanup() def main( ns: list[int] | None = None, iterations: int = 10, store_path: Path | str | None = None, *, ref_mempalace_p95_ms: float | None = None, ref_claude_mem_p95_ms: float | None = None, with_cascade: bool = False, ) -> int: """CLI entry. Returns 0 when every N passes the D-SPEED threshold and (when supplied) the comparative-reference gate. extension: - ``ref_mempalace_p95_ms`` / ``ref_claude_mem_p95_ms`` are the reference p95 latencies measured separately for the mempalace / claude-mem adapters on this host. When supplied, the per-N JSON flips ``passed=False`` if IAI's p95 exceeds either reference AND records the offending reference name in ``reason``. - ``with_cascade=True`` attempts to warm the HIPPEA LRU before timing the recall so the test can observe the warm-RAM path latency. Graceful no-op when hippea_cascade is unavailable. """ ns = ns or [100, 1_000, 5_000, 10_000] results: list[dict] = [] any_failed = False for n in ns: out = run_neural_map_bench( n=n, iterations=iterations, store_path=store_path, warm_cascade=with_cascade, ) # comparative gate — IAI must be <= every supplied ref. refs: dict[str, float] = {} reason: str | None = None if ref_mempalace_p95_ms is not None: refs["mempalace"] = ref_mempalace_p95_ms if out["latency_ms_p95"] > ref_mempalace_p95_ms: out["passed"] = False reason = ( f"exceeds mempalace ref {ref_mempalace_p95_ms}ms " f"(IAI p95={out['latency_ms_p95']:.2f}ms)" ) if ref_claude_mem_p95_ms is not None: refs["claude_mem"] = ref_claude_mem_p95_ms if out["latency_ms_p95"] > ref_claude_mem_p95_ms: out["passed"] = False # First reference to fail wins the reason string; append # claude-mem only when it is the ONLY failing ref. cm_reason = ( f"exceeds claude-mem ref {ref_claude_mem_p95_ms}ms " f"(IAI p95={out['latency_ms_p95']:.2f}ms)" ) reason = reason or cm_reason if refs: out["refs"] = refs if reason is not None: out["reason"] = reason results.append(out) if not out["passed"]: any_failed = True print(json.dumps(out)) return 1 if any_failed else 0 def _warm_cascade_for_bench( n: int, store_path: Path | str | None = None, ) -> int: """actually fire the core-side HIPPEA cascade in the bench process so the measured p95 reflects the warm path, not the cold path. Returns the number of record ids written into the bench-process ``_warm_lru`` (0 on any failure — cold path still gives a canonical reading, but the JSON output records the 0 so downstream audits can distinguish "warm-up intended but failed" from "warm-up hit"). Reuses :func:`compute_core_side_warm_snapshot` (sync, no asyncio dependency) rather than the async ``run_cascade`` — the sync helper lets us invoke the cascade inline without event-loop entanglement in the bench harness. """ try: from iai_mcp import hippea_cascade, retrieve from iai_mcp.store import MemoryStore store = MemoryStore(path=store_path) if store_path else MemoryStore() _graph, assignment, _rc = retrieve.build_runtime_graph(store) warm_ids = hippea_cascade.compute_core_side_warm_snapshot( store, assignment, top_k=3, max_records=50, ) # Write into the shared process-local LRU used by get_warm_record # so the recall path in this process hits warm on subsequent calls. warmed = 0 for rid in warm_ids: try: rec = store.get(rid) if rec is not None: hippea_cascade._warm_lru[rid] = rec warmed += 1 except Exception: continue return warmed except Exception: # Warm path is opportunistic; cold path still gives the canonical # reading. Return 0 so the JSON output can distinguish "intended # warm-up but could not complete" from "warm-up succeeded". return 0 def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(prog="bench.neural_map") parser.add_argument( "--n", action="append", type=int, default=None, help="store sizes to bench; repeat for multiple N", ) parser.add_argument("--iterations", type=int, default=10) parser.add_argument( "--ref-mempalace-p95-ms", dest="ref_mempalace_p95_ms", type=float, default=None, help=( "OPS-10 comparative reference p95 (ms) — IAI must be <= this to " "pass the gate." ), ) parser.add_argument( "--ref-claude-mem-p95-ms", dest="ref_claude_mem_p95_ms", type=float, default=None, help=( "OPS-10 comparative reference p95 (ms) — IAI must be <= this to " "pass the gate." ), ) parser.add_argument( "--with-cascade", dest="with_cascade", action="store_true", help=( "Warm the HIPPEA LRU before each per-N run (Plan 05-04 preview); " "graceful no-op if cascade module unavailable." ), ) return parser.parse_args(argv) def _install_bench_noop_keyring() -> None: """Install an in-memory keyring backend BEFORE any MemoryStore is constructed so the crypto layer never hangs on macOS Keychain SecItemCopyMatching in non-interactive shells. Bench-scope only.""" try: import keyring from keyring.backend import KeyringBackend if getattr(keyring.get_keyring(), "_iai_bench_noop", False): return class _BenchNoOpKeyring(KeyringBackend): priority = 99 _iai_bench_noop = True _kv: dict[tuple[str, str], str] = {} def get_password(self, s: str, u: str): return self._kv.get((s, u)) def set_password(self, s: str, u: str, p: str) -> None: self._kv[(s, u)] = p def delete_password(self, s: str, u: str) -> None: self._kv.pop((s, u), None) keyring.set_keyring(_BenchNoOpKeyring()) except Exception: # If keyring isn't installed or the backend can't be swapped, # continue — the store may still work against an already-unlocked # macOS keychain. pass if __name__ == "__main__": _install_bench_noop_keyring() args = _parse_args() sys.exit(main( ns=args.n, iterations=args.iterations, ref_mempalace_p95_ms=args.ref_mempalace_p95_ms, ref_claude_mem_p95_ms=args.ref_claude_mem_p95_ms, with_cascade=args.with_cascade, ))