Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
449 lines
16 KiB
Python
449 lines
16 KiB
Python
"""bench/neural_map.py -- D-SPEED benchmark.
|
|
|
|
Measures recall_for_response latency at store sizes {100, 1k, 5k, 10k}. The
|
|
D-SPEED contract is p95 < 100ms at 10k. The bench seeds a synthetic store,
|
|
builds the runtime graph, runs N iterations of recall_for_response with varied
|
|
cue strings, and reports:
|
|
|
|
- latency_ms_p50 / latency_ms_p95 across iterations
|
|
- stage_timings_ms: mean per-stage timing (embed / gate / seeds / spread / rank)
|
|
- passed: p95 < 100ms
|
|
|
|
CLI:
|
|
python -m bench.neural_map [--n 100] [--n 1000] [--n 5000] [--n 10000]
|
|
[--iterations 10]
|
|
|
|
When the executor hardware cannot meet <100ms at 10k, main() returns 1 so
|
|
CI catches the regression; the user / retro decides whether to
|
|
tune the implementation or accept.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import random
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from uuid import uuid4
|
|
|
|
from iai_mcp.community import CommunityAssignment
|
|
from iai_mcp.graph import MemoryGraph
|
|
from iai_mcp.pipeline import recall_for_response
|
|
from iai_mcp.retrieve import build_runtime_graph
|
|
from iai_mcp.store import MemoryStore
|
|
from iai_mcp.types import EMBED_DIM, MemoryRecord
|
|
|
|
|
|
# D-SPEED contract: the p95 recall latency ceiling, in milliseconds, that the
# bench must stay under at a 10k-record store.
D_SPEED_P95_MS: float = 100.0
|
|
|
|
|
|
class _BenchEmbedder:
    """Deterministic, dependency-free embedder used by the bench runs.

    Every text maps to a pseudo-random unit vector derived from a SHA-256
    digest of ``"<base_seed>:<text>"``. A digest is used instead of the
    builtin ``hash`` because the latter is randomised per process by default.
    The object exposes a ``DIM`` attribute and an ``embed`` method, which is
    what this bench hands to ``pipeline.recall_for_response``; there is no
    network access and no sentence-transformer load.
    """

    def __init__(self, base_seed: int = 0, dim: int = EMBED_DIM) -> None:
        """Store the base seed and advertise the vector dimensionality."""
        self._base_seed = base_seed
        self.DEFAULT_MODEL_KEY = "bench"
        self.DEFAULT_DIM = dim
        self.DIM = dim

    def embed(self, text: str) -> list[float]:
        """Return the pseudo-random unit vector associated with *text*.

        The same ``(base_seed, text)`` pair always yields the same vector.
        If the raw vector has zero length it is returned un-normalised.
        """
        import hashlib

        key = f"{self._base_seed}:{text}".encode("utf-8")
        # The first 16 hex characters (64 bits) of the digest seed the RNG.
        seed_int = int(hashlib.sha256(key).hexdigest()[:16], 16)
        source = random.Random(seed_int)

        # Components are drawn uniformly from [-1, 1).
        components = []
        for _ in range(self.DIM):
            components.append(source.random() * 2 - 1)

        length = sum(c * c for c in components) ** 0.5
        if length > 0:
            return [c / length for c in components]
        return components
|
|
|
|
|
|
def _make_record(vec: list[float], text: str, tags: list[str]) -> MemoryRecord:
    """Build a synthetic episodic ``MemoryRecord`` for bench seeding.

    Parameters:
        vec: embedding stored on the record.
        text: value used for ``literal_surface``.
        tags: tag list stored on the record.

    The record gets a fresh random id, ``created_at`` / ``updated_at`` set to
    the same current UTC timestamp, language ``"en"``, and neutral values for
    every other field.
    """
    stamp = datetime.now(timezone.utc)
    fields = dict(
        id=uuid4(),
        tier="episodic",
        literal_surface=text,
        aaak_index="",
        embedding=vec,
        community_id=None,
        centrality=0.0,
        detail_level=2,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        provenance=[],
        created_at=stamp,
        updated_at=stamp,
        tags=tags,
        language="en",
    )
    return MemoryRecord(**fields)
|
|
|
|
|
|
def _percentile(values: list[float], pct: float) -> float:
|
|
if not values:
|
|
return 0.0
|
|
s = sorted(values)
|
|
idx = max(0, min(len(s) - 1, int(len(s) * pct)))
|
|
return float(s[idx])
|
|
|
|
|
|
def run_neural_map_bench(
    n: int = 100,
    iterations: int = 10,
    store_path: Path | str | None = None,
    seed: int = 0,
    warm_cascade: bool = False,
) -> dict:
    """Run the D-SPEED benchmark against a store seeded with *n* records.

    Parameters:
        n: number of synthetic records to insert before measuring.
        iterations: number of timed ``recall_for_response`` calls.
        store_path: MemoryStore directory; when ``None`` a temporary
            directory is created and removed again before returning.
        seed: base seed for the synthetic embeddings and the cue selection.
        warm_cascade: when True, run the synchronous core-side HIPPEA
            cascade after seeding and graph build but before the timing
            loop, and write the resulting records into
            ``hippea_cascade._warm_lru``. Any failure in this step is
            swallowed and reported as ``cascade_warmed == 0``.

    Returns:
        dict with ``n``, ``iterations``, ``latency_ms_p50``,
        ``latency_ms_p95``, ``build_ms``, ``stage_timings_ms``, ``passed``
        (p95 below ``D_SPEED_P95_MS``), ``threshold_ms`` and, only when
        ``warm_cascade`` is True, ``cascade_warmed``.

    Note on ``stage_timings_ms``: only ``embed`` is measured directly. The
    ``gate`` sample is the interval between two back-to-back clock reads,
    and ``seeds`` / ``spread`` / ``rank`` are a fixed 20% / 30% / 50% split
    of the remaining call latency -- a coarse breakdown, not a profile.
    """
    cue_rng = random.Random(seed)

    # Only a directory we created ourselves is cleaned up afterwards.
    scratch: tempfile.TemporaryDirectory | None = None
    if store_path is not None:
        root = Path(store_path)
    else:
        scratch = tempfile.TemporaryDirectory(prefix="iai-bench-nm-")
        root = Path(scratch.name)

    try:
        store = MemoryStore(path=root)
        embedder = _BenchEmbedder(base_seed=seed, dim=store.embed_dim)

        # Rotate through a handful of topic tags so community detection has
        # some structure to work with.
        tag_pool = [
            ["topic:auth"],
            ["topic:db"],
            ["topic:web"],
            ["topic:net"],
            ["topic:cli"],
        ]
        for idx in range(n):
            store.insert(
                _make_record(
                    embedder.embed(f"seed-{idx}"),
                    text=f"synthetic fact {idx}",
                    tags=list(tag_pool[idx % len(tag_pool)]),
                )
            )

        # Graph construction is timed on its own and reported as build_ms.
        build_started = time.perf_counter()
        graph, assignment, rich_club = build_runtime_graph(store)
        build_ms = (time.perf_counter() - build_started) * 1000.0

        # Optional warm-up: runs after seeding and graph build, before the
        # timing loop, and fills the process-local hippea_cascade._warm_lru.
        warmed_count = 0
        if warm_cascade:
            try:
                from iai_mcp import hippea_cascade

                snapshot_ids = hippea_cascade.compute_core_side_warm_snapshot(
                    store, assignment, top_k=3, max_records=50,
                )
                for record_id in snapshot_ids:
                    try:
                        warm_rec = store.get(record_id)
                        if warm_rec is None:
                            continue
                        hippea_cascade._warm_lru[record_id] = warm_rec
                        warmed_count += 1
                    except Exception:
                        # Best effort per record: skip ids that fail to load.
                        continue
            except Exception:
                # Warm-up is opportunistic; report a cold run instead.
                warmed_count = 0

        cues = [
            "what did we cover about auth yesterday?",
            "explain the db migration plan",
            "how does the web cache invalidation work",
            "summary of the cli subcommand changes",
            "recent network stack bug report",
        ]

        latencies: list[float] = []
        stage_samples: dict[str, list[float]] = {
            "embed": [],
            "gate": [],
            "seeds": [],
            "spread": [],
            "rank": [],
        }

        for _ in range(iterations):
            cue = cues[cue_rng.randrange(len(cues))]

            # Embed stage: a standalone embed of the cue; the vector itself
            # is discarded, only its cost is sampled.
            mark = time.perf_counter()
            cue_vec = embedder.embed(cue)
            embed_ms = (time.perf_counter() - mark) * 1000.0
            stage_samples["embed"].append(embed_ms)

            # Gate stage: the community gate runs inside recall_for_response,
            # so nothing is executed here and this sample is just the gap
            # between two clock reads.
            mark = time.perf_counter()
            gate_ms = (time.perf_counter() - mark) * 1000.0
            stage_samples["gate"].append(gate_ms)

            call_started = time.perf_counter()
            recall_for_response(
                store=store,
                graph=graph,
                assignment=assignment,
                rich_club=rich_club,
                embedder=embedder,
                cue=cue,
                session_id="bench",
                budget_tokens=1500,
            )
            call_ms = (time.perf_counter() - call_started) * 1000.0
            latencies.append(call_ms)

            # Split what is left of the call latency across the remaining
            # stages with fixed weights.
            leftover = max(0.0, call_ms - (embed_ms + gate_ms))
            stage_samples["seeds"].append(leftover * 0.2)
            stage_samples["spread"].append(leftover * 0.3)
            stage_samples["rank"].append(leftover * 0.5)

        p50 = _percentile(latencies, 0.50)
        p95 = _percentile(latencies, 0.95)

        # Mean per stage; 0.0 when no iterations were run.
        stage_timings_ms = {
            stage: (float(sum(samples) / len(samples)) if samples else 0.0)
            for stage, samples in stage_samples.items()
        }

        result = {
            "n": n,
            "iterations": iterations,
            "latency_ms_p50": float(p50),
            "latency_ms_p95": float(p95),
            "build_ms": float(build_ms),
            "stage_timings_ms": stage_timings_ms,
            "passed": bool(p95 < D_SPEED_P95_MS),
            "threshold_ms": D_SPEED_P95_MS,
        }
        if warm_cascade:
            result["cascade_warmed"] = warmed_count
        return result
    finally:
        if scratch is not None:
            scratch.cleanup()
|
|
|
|
|
|
def main(
    ns: list[int] | None = None,
    iterations: int = 10,
    store_path: Path | str | None = None,
    *,
    ref_mempalace_p95_ms: float | None = None,
    ref_claude_mem_p95_ms: float | None = None,
    with_cascade: bool = False,
) -> int:
    """CLI entry: bench every store size and print one JSON line per size.

    Parameters:
        ns: store sizes to bench; defaults to 100, 1k, 5k and 10k when
            ``None`` or empty.
        iterations: timed recall calls per store size.
        store_path: forwarded unchanged to ``run_neural_map_bench`` for
            every size. NOTE(review): with an explicit path every size seeds
            into the same store location, so records from earlier sizes may
            accumulate -- confirm against MemoryStore before relying on it.
        ref_mempalace_p95_ms / ref_claude_mem_p95_ms: optional reference p95
            latencies for the mempalace / claude-mem adapters. When supplied
            they are recorded under ``refs`` in the per-size JSON; if the
            measured p95 exceeds a reference, ``passed`` is forced to False
            and ``reason`` names the first reference that was exceeded.
        with_cascade: forwarded as ``warm_cascade`` so the HIPPEA LRU is
            warmed before timing.

    Returns:
        0 when every size passed the D-SPEED threshold and every supplied
        reference gate, 1 otherwise.
    """
    sizes = ns or [100, 1_000, 5_000, 10_000]

    # (key under "refs", label used in the reason text, supplied value).
    # Order matters: the first exceeded reference wins the reason string.
    gates = (
        ("mempalace", "mempalace", ref_mempalace_p95_ms),
        ("claude_mem", "claude-mem", ref_claude_mem_p95_ms),
    )

    exit_code = 0
    for size in sizes:
        out = run_neural_map_bench(
            n=size,
            iterations=iterations,
            store_path=store_path,
            warm_cascade=with_cascade,
        )

        # Comparative gate: the measured p95 must be <= every supplied ref.
        refs: dict[str, float] = {}
        reason: str | None = None
        for key, label, ref_ms in gates:
            if ref_ms is None:
                continue
            refs[key] = ref_ms
            if out["latency_ms_p95"] > ref_ms:
                out["passed"] = False
                if reason is None:
                    reason = (
                        f"exceeds {label} ref {ref_ms}ms "
                        f"(IAI p95={out['latency_ms_p95']:.2f}ms)"
                    )

        if refs:
            out["refs"] = refs
        if reason is not None:
            out["reason"] = reason

        if not out["passed"]:
            exit_code = 1
        print(json.dumps(out))

    return exit_code
|
|
|
|
|
|
def _warm_cascade_for_bench(
|
|
n: int, store_path: Path | str | None = None,
|
|
) -> int:
|
|
"""actually fire the core-side HIPPEA cascade in the bench
|
|
process so the measured p95 reflects the warm path, not the cold path.
|
|
|
|
Returns the number of record ids written into the bench-process
|
|
``_warm_lru`` (0 on any failure — cold path still gives a canonical
|
|
reading, but the JSON output records the 0 so downstream audits
|
|
can distinguish "warm-up intended but failed" from "warm-up hit").
|
|
|
|
Reuses :func:`compute_core_side_warm_snapshot` (sync, no asyncio
|
|
dependency) rather than the async ``run_cascade`` — the sync helper
|
|
lets us invoke the cascade inline without event-loop entanglement in
|
|
the bench harness.
|
|
"""
|
|
try:
|
|
from iai_mcp import hippea_cascade, retrieve
|
|
from iai_mcp.store import MemoryStore
|
|
|
|
store = MemoryStore(path=store_path) if store_path else MemoryStore()
|
|
_graph, assignment, _rc = retrieve.build_runtime_graph(store)
|
|
warm_ids = hippea_cascade.compute_core_side_warm_snapshot(
|
|
store, assignment, top_k=3, max_records=50,
|
|
)
|
|
# Write into the shared process-local LRU used by get_warm_record
|
|
# so the recall path in this process hits warm on subsequent calls.
|
|
warmed = 0
|
|
for rid in warm_ids:
|
|
try:
|
|
rec = store.get(rid)
|
|
if rec is not None:
|
|
hippea_cascade._warm_lru[rid] = rec
|
|
warmed += 1
|
|
except Exception:
|
|
continue
|
|
return warmed
|
|
except Exception:
|
|
# Warm path is opportunistic; cold path still gives the canonical
|
|
# reading. Return 0 so the JSON output can distinguish "intended
|
|
# warm-up but could not complete" from "warm-up succeeded".
|
|
return 0
|
|
|
|
|
|
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
parser = argparse.ArgumentParser(prog="bench.neural_map")
|
|
parser.add_argument(
|
|
"--n", action="append", type=int, default=None,
|
|
help="store sizes to bench; repeat for multiple N",
|
|
)
|
|
parser.add_argument("--iterations", type=int, default=10)
|
|
parser.add_argument(
|
|
"--ref-mempalace-p95-ms",
|
|
dest="ref_mempalace_p95_ms",
|
|
type=float, default=None,
|
|
help=(
|
|
"OPS-10 comparative reference p95 (ms) — IAI must be <= this to "
|
|
"pass the gate."
|
|
),
|
|
)
|
|
parser.add_argument(
|
|
"--ref-claude-mem-p95-ms",
|
|
dest="ref_claude_mem_p95_ms",
|
|
type=float, default=None,
|
|
help=(
|
|
"OPS-10 comparative reference p95 (ms) — IAI must be <= this to "
|
|
"pass the gate."
|
|
),
|
|
)
|
|
parser.add_argument(
|
|
"--with-cascade",
|
|
dest="with_cascade",
|
|
action="store_true",
|
|
help=(
|
|
"Warm the HIPPEA LRU before each per-N run (Plan 05-04 preview); "
|
|
"graceful no-op if cascade module unavailable."
|
|
),
|
|
)
|
|
return parser.parse_args(argv)
|
|
|
|
|
|
def _install_bench_noop_keyring() -> None:
|
|
"""Install an in-memory keyring backend BEFORE any MemoryStore is
|
|
constructed so the crypto layer never hangs on macOS Keychain
|
|
SecItemCopyMatching in non-interactive shells. Bench-scope only."""
|
|
try:
|
|
import keyring
|
|
from keyring.backend import KeyringBackend
|
|
|
|
if getattr(keyring.get_keyring(), "_iai_bench_noop", False):
|
|
return
|
|
|
|
class _BenchNoOpKeyring(KeyringBackend):
|
|
priority = 99
|
|
_iai_bench_noop = True
|
|
_kv: dict[tuple[str, str], str] = {}
|
|
|
|
def get_password(self, s: str, u: str):
|
|
return self._kv.get((s, u))
|
|
|
|
def set_password(self, s: str, u: str, p: str) -> None:
|
|
self._kv[(s, u)] = p
|
|
|
|
def delete_password(self, s: str, u: str) -> None:
|
|
self._kv.pop((s, u), None)
|
|
|
|
keyring.set_keyring(_BenchNoOpKeyring())
|
|
except Exception:
|
|
# If keyring isn't installed or the backend can't be swapped,
|
|
# continue — the store may still work against an already-unlocked
|
|
# macOS keychain.
|
|
pass
|
|
|
|
|
|
if __name__ == "__main__":
    # The keyring swap comes first: it has to happen before main()
    # constructs any MemoryStore.
    _install_bench_noop_keyring()
    cli_args = _parse_args()
    exit_status = main(
        ns=cli_args.n,
        iterations=cli_args.iterations,
        ref_mempalace_p95_ms=cli_args.ref_mempalace_p95_ms,
        ref_claude_mem_p95_ms=cli_args.ref_claude_mem_p95_ms,
        with_cascade=cli_args.with_cascade,
    )
    sys.exit(exit_status)
|