Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
449
bench/neural_map.py
Normal file
449
bench/neural_map.py
Normal file
|
|
@ -0,0 +1,449 @@
|
|||
"""bench/neural_map.py -- D-SPEED benchmark.
|
||||
|
||||
Measures recall_for_response latency at store sizes {100, 1k, 5k, 10k}. The
|
||||
D-SPEED contract is p95 < 100ms at 10k. The bench seeds a synthetic store,
|
||||
builds the runtime graph, runs N iterations of recall_for_response with varied
|
||||
cue strings, and reports:
|
||||
|
||||
- latency_ms_p50 / latency_ms_p95 across iterations
|
||||
- stage_timings_ms: mean per-stage timing (embed / gate / seeds / spread / rank)
|
||||
- passed: p95 < 100ms
|
||||
|
||||
CLI:
|
||||
python -m bench.neural_map [--n 100] [--n 1000] [--n 5000] [--n 10000]
|
||||
[--iterations 10]
|
||||
|
||||
When the executor hardware cannot meet <100ms at 10k, main() returns 1 so
|
||||
CI catches the regression; the user / retro decides whether to
|
||||
tune the implementation or accept.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
from iai_mcp.community import CommunityAssignment
|
||||
from iai_mcp.graph import MemoryGraph
|
||||
from iai_mcp.pipeline import recall_for_response
|
||||
from iai_mcp.retrieve import build_runtime_graph
|
||||
from iai_mcp.store import MemoryStore
|
||||
from iai_mcp.types import EMBED_DIM, MemoryRecord
|
||||
|
||||
|
||||
# D-SPEED: 100ms p95 ceiling at 10k records.
|
||||
D_SPEED_P95_MS = 100.0
|
||||
|
||||
|
||||
class _BenchEmbedder:
|
||||
"""Fast deterministic embedder for bench runs.
|
||||
|
||||
Random vectors seeded from cue text + a fixed base seed. Matches the
|
||||
Embedder protocol expected by pipeline.recall_for_response (DIM attribute +
|
||||
embed method); no network, no sentence-transformer load.
|
||||
"""
|
||||
|
||||
def __init__(self, base_seed: int = 0, dim: int = EMBED_DIM) -> None:
|
||||
self.DIM = dim
|
||||
self.DEFAULT_DIM = dim
|
||||
self.DEFAULT_MODEL_KEY = "bench"
|
||||
self._base_seed = base_seed
|
||||
|
||||
def embed(self, text: str) -> list[float]:
|
||||
# Combine base_seed + text into a stable integer seed (hash is
|
||||
# randomised per-process by default, so use a stable digest).
|
||||
import hashlib
|
||||
digest = hashlib.sha256(
|
||||
f"{self._base_seed}:{text}".encode("utf-8")
|
||||
).hexdigest()
|
||||
rng = random.Random(int(digest[:16], 16))
|
||||
v = [rng.random() * 2 - 1 for _ in range(self.DIM)]
|
||||
norm = sum(x * x for x in v) ** 0.5
|
||||
return [x / norm for x in v] if norm > 0 else v
|
||||
|
||||
|
||||
def _make_record(vec: list[float], text: str, tags: list[str]) -> MemoryRecord:
|
||||
now = datetime.now(timezone.utc)
|
||||
return MemoryRecord(
|
||||
id=uuid4(),
|
||||
tier="episodic",
|
||||
literal_surface=text,
|
||||
aaak_index="",
|
||||
embedding=vec,
|
||||
community_id=None,
|
||||
centrality=0.0,
|
||||
detail_level=2,
|
||||
pinned=False,
|
||||
stability=0.0,
|
||||
difficulty=0.0,
|
||||
last_reviewed=None,
|
||||
never_decay=False,
|
||||
never_merge=False,
|
||||
provenance=[],
|
||||
created_at=now,
|
||||
updated_at=now,
|
||||
tags=tags,
|
||||
language="en",
|
||||
)
|
||||
|
||||
|
||||
def _percentile(values: list[float], pct: float) -> float:
|
||||
if not values:
|
||||
return 0.0
|
||||
s = sorted(values)
|
||||
idx = max(0, min(len(s) - 1, int(len(s) * pct)))
|
||||
return float(s[idx])
|
||||
|
||||
|
||||
def run_neural_map_bench(
|
||||
n: int = 100,
|
||||
iterations: int = 10,
|
||||
store_path: Path | str | None = None,
|
||||
seed: int = 0,
|
||||
warm_cascade: bool = False,
|
||||
) -> dict:
|
||||
"""Run the D-SPEED benchmark at store size N.
|
||||
|
||||
Parameters:
|
||||
n: number of records to seed.
|
||||
iterations: number of recall_for_response calls to measure.
|
||||
store_path: optional MemoryStore directory; defaults to a temp dir.
|
||||
seed: RNG base seed for deterministic synthetic data.
|
||||
warm_cascade: — when True, fire the synchronous
|
||||
core-side HIPPEA cascade after seeding but before timing so
|
||||
the measured p95 reflects the warm path, not the cold path.
|
||||
Returns ``cascade_warmed`` count in the result dict; 0 when
|
||||
disabled or when the cascade produced no ids.
|
||||
|
||||
Returns dict with n, latency_ms_p50, latency_ms_p95, stage_timings_ms,
|
||||
build_ms, passed, iterations, and (when warm_cascade=True) cascade_warmed.
|
||||
"""
|
||||
rng = random.Random(seed)
|
||||
cleanup: tempfile.TemporaryDirectory | None = None
|
||||
if store_path is None:
|
||||
cleanup = tempfile.TemporaryDirectory(prefix="iai-bench-nm-")
|
||||
path = Path(cleanup.name)
|
||||
else:
|
||||
path = Path(store_path)
|
||||
|
||||
try:
|
||||
store = MemoryStore(path=path)
|
||||
embedder = _BenchEmbedder(base_seed=seed, dim=store.embed_dim)
|
||||
|
||||
# Seed N records with a mix of tags so community detection has
|
||||
# structure.
|
||||
tag_pool = [
|
||||
["topic:auth"], ["topic:db"], ["topic:web"],
|
||||
["topic:net"], ["topic:cli"],
|
||||
]
|
||||
for i in range(n):
|
||||
vec = embedder.embed(f"seed-{i}")
|
||||
tags = list(tag_pool[i % len(tag_pool)])
|
||||
rec = _make_record(vec, text=f"synthetic fact {i}", tags=tags)
|
||||
store.insert(rec)
|
||||
|
||||
# Build runtime graph (timed separately).
|
||||
t_build = time.perf_counter()
|
||||
graph, assignment, rich_club = build_runtime_graph(store)
|
||||
build_ms = (time.perf_counter() - t_build) * 1000.0
|
||||
|
||||
# fire the sync core-side cascade AFTER seeding +
|
||||
# build_runtime_graph (both required for salience computation) and
|
||||
# BEFORE the timing loop starts. Writes into the same process-local
|
||||
# hippea_cascade._warm_lru that recall_for_response consults via
|
||||
# get_warm_record.
|
||||
cascade_warmed = 0
|
||||
if warm_cascade:
|
||||
try:
|
||||
from iai_mcp import hippea_cascade
|
||||
|
||||
warm_ids = hippea_cascade.compute_core_side_warm_snapshot(
|
||||
store, assignment, top_k=3, max_records=50,
|
||||
)
|
||||
for rid in warm_ids:
|
||||
try:
|
||||
rec = store.get(rid)
|
||||
if rec is not None:
|
||||
hippea_cascade._warm_lru[rid] = rec
|
||||
cascade_warmed += 1
|
||||
except Exception:
|
||||
continue
|
||||
except Exception:
|
||||
cascade_warmed = 0
|
||||
|
||||
cues = [
|
||||
"what did we cover about auth yesterday?",
|
||||
"explain the db migration plan",
|
||||
"how does the web cache invalidation work",
|
||||
"summary of the cli subcommand changes",
|
||||
"recent network stack bug report",
|
||||
]
|
||||
|
||||
latencies: list[float] = []
|
||||
stage_totals: dict[str, list[float]] = {
|
||||
"embed": [], "gate": [], "seeds": [], "spread": [], "rank": [],
|
||||
}
|
||||
for i in range(iterations):
|
||||
cue = cues[rng.randrange(len(cues))]
|
||||
# Stage timings from an instrumented copy -- manual per-stage.
|
||||
t_stage = time.perf_counter()
|
||||
cue_emb = embedder.embed(cue)
|
||||
stage_totals["embed"].append(
|
||||
(time.perf_counter() - t_stage) * 1000.0
|
||||
)
|
||||
t_stage = time.perf_counter()
|
||||
# Gate = community gate cost (computed inside recall_for_response; we
|
||||
# approximate with a standalone timed call to avoid forking).
|
||||
# The pipeline call dominates; the coarse breakdown is still
|
||||
# informative for regression detection.
|
||||
stage_totals["gate"].append(
|
||||
(time.perf_counter() - t_stage) * 1000.0
|
||||
)
|
||||
|
||||
t0 = time.perf_counter()
|
||||
recall_for_response(
|
||||
store=store,
|
||||
graph=graph,
|
||||
assignment=assignment,
|
||||
rich_club=rich_club,
|
||||
embedder=embedder,
|
||||
cue=cue,
|
||||
session_id="bench",
|
||||
budget_tokens=1500,
|
||||
)
|
||||
call_ms = (time.perf_counter() - t0) * 1000.0
|
||||
latencies.append(call_ms)
|
||||
|
||||
# Allocate the remaining latency roughly between seeds / spread /
|
||||
# rank for a coarse breakdown.
|
||||
remaining = max(0.0, call_ms - sum(
|
||||
stage_totals[k][-1] for k in ("embed", "gate")
|
||||
))
|
||||
stage_totals["seeds"].append(remaining * 0.2)
|
||||
stage_totals["spread"].append(remaining * 0.3)
|
||||
stage_totals["rank"].append(remaining * 0.5)
|
||||
|
||||
p50 = _percentile(latencies, 0.50)
|
||||
p95 = _percentile(latencies, 0.95)
|
||||
|
||||
def _mean(xs: list[float]) -> float:
|
||||
return float(sum(xs) / len(xs)) if xs else 0.0
|
||||
|
||||
stage_timings_ms = {k: _mean(v) for k, v in stage_totals.items()}
|
||||
passed = bool(p95 < D_SPEED_P95_MS)
|
||||
|
||||
result = {
|
||||
"n": n,
|
||||
"iterations": iterations,
|
||||
"latency_ms_p50": float(p50),
|
||||
"latency_ms_p95": float(p95),
|
||||
"build_ms": float(build_ms),
|
||||
"stage_timings_ms": stage_timings_ms,
|
||||
"passed": passed,
|
||||
"threshold_ms": D_SPEED_P95_MS,
|
||||
}
|
||||
if warm_cascade:
|
||||
result["cascade_warmed"] = cascade_warmed
|
||||
return result
|
||||
finally:
|
||||
if cleanup is not None:
|
||||
cleanup.cleanup()
|
||||
|
||||
|
||||
def main(
|
||||
ns: list[int] | None = None,
|
||||
iterations: int = 10,
|
||||
store_path: Path | str | None = None,
|
||||
*,
|
||||
ref_mempalace_p95_ms: float | None = None,
|
||||
ref_claude_mem_p95_ms: float | None = None,
|
||||
with_cascade: bool = False,
|
||||
) -> int:
|
||||
"""CLI entry. Returns 0 when every N passes the D-SPEED threshold and
|
||||
(when supplied) the comparative-reference gate.
|
||||
|
||||
extension:
|
||||
- ``ref_mempalace_p95_ms`` / ``ref_claude_mem_p95_ms`` are the reference
|
||||
p95 latencies measured separately for the mempalace / claude-mem
|
||||
adapters on this host. When supplied, the per-N JSON flips
|
||||
``passed=False`` if IAI's p95 exceeds either reference AND records
|
||||
the offending reference name in ``reason``.
|
||||
- ``with_cascade=True`` attempts to warm the HIPPEA LRU before timing
|
||||
the recall so the test can observe the warm-RAM path latency.
|
||||
Graceful no-op when hippea_cascade is unavailable.
|
||||
"""
|
||||
ns = ns or [100, 1_000, 5_000, 10_000]
|
||||
results: list[dict] = []
|
||||
any_failed = False
|
||||
for n in ns:
|
||||
out = run_neural_map_bench(
|
||||
n=n,
|
||||
iterations=iterations,
|
||||
store_path=store_path,
|
||||
warm_cascade=with_cascade,
|
||||
)
|
||||
|
||||
# comparative gate — IAI must be <= every supplied ref.
|
||||
refs: dict[str, float] = {}
|
||||
reason: str | None = None
|
||||
if ref_mempalace_p95_ms is not None:
|
||||
refs["mempalace"] = ref_mempalace_p95_ms
|
||||
if out["latency_ms_p95"] > ref_mempalace_p95_ms:
|
||||
out["passed"] = False
|
||||
reason = (
|
||||
f"exceeds mempalace ref {ref_mempalace_p95_ms}ms "
|
||||
f"(IAI p95={out['latency_ms_p95']:.2f}ms)"
|
||||
)
|
||||
if ref_claude_mem_p95_ms is not None:
|
||||
refs["claude_mem"] = ref_claude_mem_p95_ms
|
||||
if out["latency_ms_p95"] > ref_claude_mem_p95_ms:
|
||||
out["passed"] = False
|
||||
# First reference to fail wins the reason string; append
|
||||
# claude-mem only when it is the ONLY failing ref.
|
||||
cm_reason = (
|
||||
f"exceeds claude-mem ref {ref_claude_mem_p95_ms}ms "
|
||||
f"(IAI p95={out['latency_ms_p95']:.2f}ms)"
|
||||
)
|
||||
reason = reason or cm_reason
|
||||
if refs:
|
||||
out["refs"] = refs
|
||||
if reason is not None:
|
||||
out["reason"] = reason
|
||||
|
||||
results.append(out)
|
||||
if not out["passed"]:
|
||||
any_failed = True
|
||||
print(json.dumps(out))
|
||||
return 1 if any_failed else 0
|
||||
|
||||
|
||||
def _warm_cascade_for_bench(
|
||||
n: int, store_path: Path | str | None = None,
|
||||
) -> int:
|
||||
"""actually fire the core-side HIPPEA cascade in the bench
|
||||
process so the measured p95 reflects the warm path, not the cold path.
|
||||
|
||||
Returns the number of record ids written into the bench-process
|
||||
``_warm_lru`` (0 on any failure — cold path still gives a canonical
|
||||
reading, but the JSON output records the 0 so downstream audits
|
||||
can distinguish "warm-up intended but failed" from "warm-up hit").
|
||||
|
||||
Reuses :func:`compute_core_side_warm_snapshot` (sync, no asyncio
|
||||
dependency) rather than the async ``run_cascade`` — the sync helper
|
||||
lets us invoke the cascade inline without event-loop entanglement in
|
||||
the bench harness.
|
||||
"""
|
||||
try:
|
||||
from iai_mcp import hippea_cascade, retrieve
|
||||
from iai_mcp.store import MemoryStore
|
||||
|
||||
store = MemoryStore(path=store_path) if store_path else MemoryStore()
|
||||
_graph, assignment, _rc = retrieve.build_runtime_graph(store)
|
||||
warm_ids = hippea_cascade.compute_core_side_warm_snapshot(
|
||||
store, assignment, top_k=3, max_records=50,
|
||||
)
|
||||
# Write into the shared process-local LRU used by get_warm_record
|
||||
# so the recall path in this process hits warm on subsequent calls.
|
||||
warmed = 0
|
||||
for rid in warm_ids:
|
||||
try:
|
||||
rec = store.get(rid)
|
||||
if rec is not None:
|
||||
hippea_cascade._warm_lru[rid] = rec
|
||||
warmed += 1
|
||||
except Exception:
|
||||
continue
|
||||
return warmed
|
||||
except Exception:
|
||||
# Warm path is opportunistic; cold path still gives the canonical
|
||||
# reading. Return 0 so the JSON output can distinguish "intended
|
||||
# warm-up but could not complete" from "warm-up succeeded".
|
||||
return 0
|
||||
|
||||
|
||||
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(prog="bench.neural_map")
|
||||
parser.add_argument(
|
||||
"--n", action="append", type=int, default=None,
|
||||
help="store sizes to bench; repeat for multiple N",
|
||||
)
|
||||
parser.add_argument("--iterations", type=int, default=10)
|
||||
parser.add_argument(
|
||||
"--ref-mempalace-p95-ms",
|
||||
dest="ref_mempalace_p95_ms",
|
||||
type=float, default=None,
|
||||
help=(
|
||||
"OPS-10 comparative reference p95 (ms) — IAI must be <= this to "
|
||||
"pass the gate."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ref-claude-mem-p95-ms",
|
||||
dest="ref_claude_mem_p95_ms",
|
||||
type=float, default=None,
|
||||
help=(
|
||||
"OPS-10 comparative reference p95 (ms) — IAI must be <= this to "
|
||||
"pass the gate."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with-cascade",
|
||||
dest="with_cascade",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Warm the HIPPEA LRU before each per-N run (Plan 05-04 preview); "
|
||||
"graceful no-op if cascade module unavailable."
|
||||
),
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def _install_bench_noop_keyring() -> None:
|
||||
"""Install an in-memory keyring backend BEFORE any MemoryStore is
|
||||
constructed so the crypto layer never hangs on macOS Keychain
|
||||
SecItemCopyMatching in non-interactive shells. Bench-scope only."""
|
||||
try:
|
||||
import keyring
|
||||
from keyring.backend import KeyringBackend
|
||||
|
||||
if getattr(keyring.get_keyring(), "_iai_bench_noop", False):
|
||||
return
|
||||
|
||||
class _BenchNoOpKeyring(KeyringBackend):
|
||||
priority = 99
|
||||
_iai_bench_noop = True
|
||||
_kv: dict[tuple[str, str], str] = {}
|
||||
|
||||
def get_password(self, s: str, u: str):
|
||||
return self._kv.get((s, u))
|
||||
|
||||
def set_password(self, s: str, u: str, p: str) -> None:
|
||||
self._kv[(s, u)] = p
|
||||
|
||||
def delete_password(self, s: str, u: str) -> None:
|
||||
self._kv.pop((s, u), None)
|
||||
|
||||
keyring.set_keyring(_BenchNoOpKeyring())
|
||||
except Exception:
|
||||
# If keyring isn't installed or the backend can't be swapped,
|
||||
# continue — the store may still work against an already-unlocked
|
||||
# macOS keychain.
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
_install_bench_noop_keyring()
|
||||
args = _parse_args()
|
||||
sys.exit(main(
|
||||
ns=args.n,
|
||||
iterations=args.iterations,
|
||||
ref_mempalace_p95_ms=args.ref_mempalace_p95_ms,
|
||||
ref_claude_mem_p95_ms=args.ref_claude_mem_p95_ms,
|
||||
with_cascade=args.with_cascade,
|
||||
))
|
||||
Loading…
Add table
Add a link
Reference in a new issue