Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
316
bench/verbatim.py
Normal file
316
bench/verbatim.py
Normal file
|
|
@ -0,0 +1,316 @@
|
|||
"""bench/verbatim.py -- benchmark harness + diagnostics.
|
||||
|
||||
Simulates a session gap by inserting N pinned records, flooding the store with
|
||||
`session_gap * noise_per_session` unrelated records, then retrieving each
|
||||
pinned record by its own literal_surface as the cue. Counts byte-exact matches.
|
||||
|
||||
Target: >= ACCURACY_FLOOR (0.99) on pinned records -- OPS-04 / MEM-10.
|
||||
|
||||
Exit codes:
|
||||
- 0 if accuracy >= 0.99
|
||||
- 1 otherwise
|
||||
|
||||
JSON output (one line to stdout):
    {"accuracy": float, "n_records": int, "session_gap": int,
     "noise_per_session": int, "hits_exact": int, "passed": bool,
     "floor": 0.99, "noise_mode": str, "noise_seed": int,
     "skip_l0_seed": bool, "storage_direct": bool, "k": int}
|
||||
|
||||
Plan 05-01 (D5-01) diagnostic flags -- BENCH-ONLY (no production change):
|
||||
--skip-l0-seed : skip _seed_l0_identity to isolate L0 crowding (effect b)
|
||||
--storage-direct : bypass recall(), call store.query_similar directly
|
||||
(isolates provenance-write amplification, effect c)
|
||||
--n : override n_records (default 20)
|
||||
--gap : override session_gap (default 20)
|
||||
--noise-per-session : override noise_per_session (default 10)
|
||||
--k : override k_hits (default max(n_records + 10, 20))
|
||||
|
||||
Design note -- why we bypass dispatch("memory_recall"):
|
||||
The Plan-02 core.memory_recall routes non-empty stores through recall_for_response
|
||||
(Phase 8 entry-point split) which instantiates an Embedder() (downloads
|
||||
bge-small-en-v1.5 from HuggingFace
|
||||
on first call). That's fine for a real runtime but wrong for an offline bench:
|
||||
we need to measure storage-layer verbatim-recall correctness, not embedder
|
||||
warm-up latency. So we call `retrieve.recall` directly with a fixed cue
|
||||
embedding aligned with the pinned records (all-ones vector).
|
||||
|
||||
H-03 noise model (review finding, 2026-04-16):
|
||||
The original noise vector was [-0.5]^384, which gives cosine=-1.0 against the
|
||||
[1.0]^384 cue -- making pinned-vs-noise discrimination a geometric artifact
|
||||
rather than a measurement of the storage layer. The fix uses seeded
|
||||
numpy.random.standard_normal(EMBED_DIM) normalised to unit length. Against a
|
||||
[1.0]^384 cue the expected cosine of a random unit vector is 0 with stddev
|
||||
1/sqrt(EMBED_DIM) ~= 0.05 -- realistic noise geometry, but pinned still wins
|
||||
because cos=+1 >> cos~=0. The bench remains honest about what it measures
|
||||
(literal_surface round-trip under realistic embedding noise, given a fixed
|
||||
cue). A real bge-small-en-v1.5 bench is deferred to Phase 2.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from uuid import uuid4
|
||||
|
||||
import numpy as np
|
||||
|
||||
from iai_mcp.core import _seed_l0_identity
|
||||
from iai_mcp.retrieve import recall
|
||||
from iai_mcp.store import EMBED_DIM, MemoryStore
|
||||
from iai_mcp.types import MemoryRecord
|
||||
|
||||
# Minimum accuracy (exact hits / n_records) for the bench to report passed=True.
ACCURACY_FLOOR = 0.99  # OPS-04
|
||||
# Default seed for the RNG that draws the noise-record embeddings.
NOISE_SEED = 20260416  # fixed for reproducibility across runs / CI
|
||||
|
||||
|
||||
def _make_pinned(text: str, dim: int = EMBED_DIM) -> MemoryRecord:
    """Build one pinned benchmark record whose literal_surface is `text`.

    The record is tier="semantic", detail_level=5, pinned, never_decay and
    never_merge, tagged ["benchmark", "pinned"], with language="en".

    Its embedding is an all-ones vector of length `dim`. The bench cue is
    all-ones as well, so every pinned record is maximally aligned with the
    cue at once; the only correctness signal the bench looks at is whether
    the literal_surface text comes back unchanged.

    `dim` defaults to the module constant EMBED_DIM. Callers working against
    a store with a different embedding width (e.g. a legacy 384d store) pass
    that store's width instead.
    """
    all_ones = [1.0] * dim
    fields = {
        "id": uuid4(),
        "tier": "semantic",
        "literal_surface": text,
        "aaak_index": "",
        "embedding": all_ones,
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 5,
        "pinned": True,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": True,
        "never_merge": True,
        "provenance": [],
        # Two independent clock reads, created first and updated second.
        "created_at": datetime.now(timezone.utc),
        "updated_at": datetime.now(timezone.utc),
        "tags": ["benchmark", "pinned"],
        "language": "en",
    }
    return MemoryRecord(**fields)
|
||||
|
||||
|
||||
def _random_unit_vector(rng: np.random.Generator, dim: int = EMBED_DIM) -> list[float]:
    """Draw a Gaussian vector of length `dim` and scale it to unit norm.

    The sample comes from the caller's Generator, so a seeded `rng` yields
    the same sequence of vectors on every run. Against an all-ones cue the
    cosine of such a vector is centred on 0 with a standard deviation of
    about 1/sqrt(dim) (~0.05 at 384d, ~0.03 at 1024d).

    Returns the normalised vector as a plain list of floats.
    """
    sample = rng.standard_normal(dim)
    length = np.linalg.norm(sample)
    return (sample / length).tolist()
|
||||
|
||||
|
||||
def _make_noise(i: int, rng: np.random.Generator, dim: int = EMBED_DIM) -> MemoryRecord:
    """Build the `i`-th unrelated noise record (H-03 honesty fix).

    The embedding is a random unit vector drawn from `rng`, replacing the
    earlier [-0.5]^EMBED_DIM vector whose cosine of -1 against the cue made
    pinned-vs-noise discrimination trivial by geometry. Seeded Gaussian unit
    vectors are reproducible and are orthogonal to the cue on average.

    The record is tier="episodic", detail_level=2, not pinned, and may decay
    or merge. It has no tags and language="en".
    """
    surface = f"unrelated session noise record #{i}: " + ("y " * 20)
    noise_embedding = _random_unit_vector(rng, dim=dim)
    fields = {
        "id": uuid4(),
        "tier": "episodic",
        "literal_surface": surface,
        "aaak_index": "",
        "embedding": noise_embedding,
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 2,
        "pinned": False,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": False,
        "never_merge": False,
        "provenance": [],
        # Two independent clock reads, created first and updated second.
        "created_at": datetime.now(timezone.utc),
        "updated_at": datetime.now(timezone.utc),
        "tags": [],
        "language": "en",
    }
    return MemoryRecord(**fields)
|
||||
|
||||
|
||||
def run_verbatim_bench(
    store: MemoryStore | None = None,
    n_records: int = 20,
    session_gap: int = 20,
    noise_per_session: int = 10,
    seed: int = NOISE_SEED,
    *,
    skip_l0_seed: bool = False,
    storage_direct: bool = False,
    k: int | None = None,
) -> dict:
    """Insert pinned records, bury them under noise, and count exact recalls.

    Steps:
      1. Optionally seed the L0 identity record.
      2. Insert `n_records` pinned records with all-ones embeddings.
      3. Insert `session_gap * noise_per_session` noise records whose
         embeddings come from an RNG seeded with `seed`.
      4. For each pinned text, retrieve top-k with an all-ones cue and count
         it as a hit when that exact text is among the returned
         literal_surface values.

    Parameters:
        store: store to run against; a fresh MemoryStore() is created when
            None.
        n_records: number of pinned records to write and look up.
        session_gap: number of simulated noise "sessions" between write and
            recall.
        noise_per_session: noise records inserted per simulated session.
        seed: seed for the noise-vector RNG (H-03 reproducibility).
        skip_l0_seed: D5-01 effect (b) isolation -- do not call
            _seed_l0_identity, so no identity record competes with the
            pinned ones. Bench-only switch.
        storage_direct: D5-01 effect (c) isolation -- call
            store.query_similar directly instead of retrieve.recall(), taking
            recall()'s per-hit provenance writes out of the loop. Bench-only
            switch.
        k: top-k handed to recall(k_hits=...) or query_similar(k=...); None
            selects max(n_records + 10, 20).

    Returns:
        dict with keys accuracy, n_records, session_gap, noise_per_session,
        hits_exact, passed, floor, noise_mode, noise_seed, skip_l0_seed,
        storage_direct and k. accuracy is 0.0 when n_records is not positive.
    """
    bench_store = MemoryStore() if store is None else store
    if not skip_l0_seed:
        _seed_l0_identity(bench_store)

    # Use the width the store reports, not the module default: an existing
    # store may still hold 384d records while a fresh one uses the current
    # default.
    width = bench_store.embed_dim

    phrases = []
    for i in range(n_records):
        phrases.append(f"Alice said on day {i}: verbatim phrase #{i}-{'x' * 10}")

    # Build every pinned record first, then insert them in order.
    pinned = [_make_pinned(phrase, dim=width) for phrase in phrases]
    for record in pinned:
        bench_store.insert(record)

    # H-03: one seeded generator feeds every noise draw so runs reproduce.
    noise_rng = np.random.default_rng(seed)
    noise_indices = (
        session * noise_per_session + offset
        for session in range(session_gap)
        for offset in range(noise_per_session)
    )
    for noise_idx in noise_indices:
        bench_store.insert(_make_noise(noise_idx, noise_rng, dim=width))

    cue = [1.0] * width

    # top-k has to cover every pinned record, plus headroom for the L0 seed
    # and the anti-hits tail.
    if k is None:
        top_k = max(n_records + 10, 20)
    else:
        top_k = k

    def _surfaces_for(phrase: str) -> list:
        """Return the literal_surface of every record retrieved for `phrase`."""
        if storage_direct:
            # D5-01 (c): straight to the store, skipping recall().
            ranked = bench_store.query_similar(cue, k=top_k)
            return [found.literal_surface for found, _score in ranked]
        # Per the original note, recall() defaults to mode='verbatim', whose
        # filter would drop the tier='semantic' records _make_pinned creates.
        # The bench measures exact-text recall under noise with synthetic
        # cues, independent of the cue router's mode, so mode='concept' is
        # pinned to keep measuring what the bench always measured.
        response = recall(
            store=bench_store,
            cue_embedding=cue,
            cue_text=phrase,
            session_id="bench-verbatim",
            budget_tokens=5000,
            k_hits=top_k,
            k_anti=3,
            mode="concept",
        )
        return [hit.literal_surface for hit in response.hits]

    hits_exact = sum(1 for phrase in phrases if phrase in _surfaces_for(phrase))

    if n_records > 0:
        accuracy = hits_exact / n_records
    else:
        accuracy = 0.0

    report = {
        "accuracy": accuracy,
        "n_records": n_records,
        "session_gap": session_gap,
        "noise_per_session": noise_per_session,
        "hits_exact": hits_exact,
        "passed": accuracy >= ACCURACY_FLOOR,
        "floor": ACCURACY_FLOOR,
        "noise_mode": "random-unit-vectors",
        "noise_seed": seed,
        # Diagnostic traceability: which isolation switches and k were used.
        "skip_l0_seed": bool(skip_l0_seed),
        "storage_direct": bool(storage_direct),
        "k": int(top_k),
    }
    return report
|
||||
|
||||
|
||||
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the verbatim bench.

    Options, in registration order:
      --skip-l0-seed            boolean switch (default False)
      --storage-direct          boolean switch (default False)
      --n / --n-records         int, stored as n_records (default 20)
      --gap / --session-gap     int, stored as session_gap (default 20)
      --noise-per-session       int (default 10)
      --k                       int or None (default None)
    """
    cli = argparse.ArgumentParser(
        description="OPS-04 / verbatim recall benchmark + diagnostics",
        prog="bench.verbatim",
    )

    # D5-01 diagnostic on/off switches.
    switches = (
        (
            "--skip-l0-seed",
            "D5-01 diagnostic: skip _seed_l0_identity to isolate L0 crowding effect",
        ),
        (
            "--storage-direct",
            "D5-01 diagnostic: bypass recall(), call store.query_similar directly",
        ),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    # Integer-valued overrides: (option strings, remaining keyword arguments).
    int_options = (
        (
            ("--n", "--n-records"),
            {
                "dest": "n_records",
                "default": 20,
                "help": "pinned record count (default 20)",
            },
        ),
        (
            ("--gap", "--session-gap"),
            {
                "dest": "session_gap",
                "default": 20,
                "help": "session gap -- how many noise sessions between writes and recall (default 20)",
            },
        ),
        (
            ("--noise-per-session",),
            {
                "default": 10,
                "help": "noise records per simulated session (default 10)",
            },
        ),
        (
            ("--k",),
            {
                "default": None,
                "help": "override k_hits (default: max(n_records + 10, 20))",
            },
        ),
    )
    for option_strings, extra in int_options:
        cli.add_argument(*option_strings, type=int, **extra)

    return cli
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Run the bench from the command line and report the verdict.

    Parses `argv` (argparse falls back to sys.argv when it is None), runs
    run_verbatim_bench with the parsed options, and prints the result dict
    as a single JSON line on stdout.

    Returns 0 when the result's "passed" flag is true, otherwise 1.
    """
    options = _build_arg_parser().parse_args(argv)
    report = run_verbatim_bench(
        n_records=options.n_records,
        session_gap=options.session_gap,
        noise_per_session=options.noise_per_session,
        skip_l0_seed=options.skip_l0_seed,
        storage_direct=options.storage_direct,
        k=options.k,
    )
    print(json.dumps(report))
    if report["passed"]:
        return 0
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry: run the bench and pass main()'s 0/1 verdict to the shell.
    exit_status = main()
    sys.exit(exit_status)
|
||||
Loading…
Add table
Add a link
Reference in a new issue