# iai-mcp-opencode/bench/verbatim.py

"""bench/verbatim.py -- benchmark harness + diagnostics.

Simulates a session gap by inserting N pinned records, flooding the store with
`session_gap * noise_per_session` unrelated records, then retrieving each
pinned record by its own literal_surface as the cue. Counts byte-exact matches.

Target: >= ACCURACY_FLOOR (0.99) on pinned records -- OPS-04 / MEM-10.

Exit codes:
- 0 if accuracy >= 0.99
- 1 otherwise

JSON output (one line to stdout):
    {"accuracy": float, "n_records": int, "session_gap": int,
     "noise_per_session": int, "hits_exact": int, "passed": bool,
     "floor": 0.99, "noise_mode": str, "noise_seed": int,
     "skip_l0_seed": bool, "storage_direct": bool, "k": int}

Plan 05-01 (D5-01) diagnostic flags -- BENCH-ONLY (no production change):
  --skip-l0-seed   : skip _seed_l0_identity to isolate L0 crowding (effect b)
  --storage-direct : bypass recall(), call store.query_similar directly
                     (isolates provenance-write amplification, effect c)
  --n              : override n_records (default 20)
  --gap            : override session_gap (default 20)
  --noise-per-session : override noise_per_session (default 10)
  --k              : override k_hits (default max(n_records + 10, 20))

Design note -- why we bypass dispatch("memory_recall"):
The Plan-02 core.memory_recall routes non-empty stores through recall_for_response
(Phase 8 entry-point split), which instantiates an Embedder() that downloads
bge-small-en-v1.5 from HuggingFace on first call. That's fine for a real runtime
but wrong for an offline bench:
we need to measure storage-layer verbatim-recall correctness, not embedder
warm-up latency. So we call `retrieve.recall` directly with a fixed cue
embedding aligned with the pinned records (all-ones vector).

H-03 noise model (review finding, 2026-04-16):
The original noise vector was [-0.5]^384, which gives cosine=-1.0 against the
[1.0]^384 cue -- making pinned-vs-noise discrimination a geometric artifact
rather than a measurement of the storage layer. The fix draws each noise vector
from a seeded numpy Generator (np.random.default_rng(seed).standard_normal(dim))
and normalises it to unit length. Against a
[1.0]^384 cue the expected cosine of a random unit vector is 0 with stddev
1/sqrt(EMBED_DIM) ~= 0.05 -- realistic noise geometry, but pinned still wins
because cos=+1 >> cos~=0. The bench remains honest about what it measures
(literal_surface round-trip under realistic embedding noise, given a fixed
cue). A real bge-small-en-v1.5 bench is deferred to Phase 2.
"""
from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from uuid import uuid4

import numpy as np

from iai_mcp.core import _seed_l0_identity
from iai_mcp.retrieve import recall
from iai_mcp.store import EMBED_DIM, MemoryStore
from iai_mcp.types import MemoryRecord

# Minimum accuracy (hits_exact / n_records) at which run_verbatim_bench reports
# passed=True; the same value is echoed in the result dict under "floor".
ACCURACY_FLOOR = 0.99   # OPS-04
# Default seed for the noise-vector RNG created inside run_verbatim_bench.
NOISE_SEED = 20260416   # fixed for reproducibility across runs / CI


def _make_pinned(text: str, dim: int = EMBED_DIM) -> MemoryRecord:
    """Build one pinned benchmark record carrying `text` as its literal_surface.

    The record is tier="semantic" with detail_level=5, pinned=True,
    never_decay=True and never_merge=True, tagged ["benchmark", "pinned"],
    language="en". It gets a fresh uuid4 id and UTC timestamps read at call
    time.

    The embedding is the all-ones vector of length `dim` -- the same vector
    run_verbatim_bench uses as its cue -- so every pinned record is maximally
    cosine-similar to the cue. Embedding geometry therefore cannot tell the
    pinned records apart; the bench's correctness signal is whether `text`
    comes back unchanged among the retrieved literal_surface values.

    Args:
        text: literal text to store in the record.
        dim: embedding length. Defaults to the module-level EMBED_DIM;
            run_verbatim_bench passes the store's own embed_dim instead.

    Returns:
        A new MemoryRecord; nothing is inserted into any store here.
    """
    # Same vector as the bench cue, so cosine(cue, pinned) is maximal.
    all_ones = [1.0] * dim
    fields = {
        "id": uuid4(),
        "tier": "semantic",
        "literal_surface": text,
        "aaak_index": "",
        "embedding": all_ones,
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 5,
        "pinned": True,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": True,
        "never_merge": True,
        "provenance": [],
        # Two separate clock reads, exactly as many as there are timestamps.
        "created_at": datetime.now(timezone.utc),
        "updated_at": datetime.now(timezone.utc),
        "tags": ["benchmark", "pinned"],
        "language": "en",
    }
    return MemoryRecord(**fields)


def _random_unit_vector(rng: np.random.Generator, dim: int = EMBED_DIM) -> list[float]:
    """Draw a Gaussian vector of length `dim` from `rng` and scale it to unit L2 norm.

    Against an all-ones cue the cosine of such a vector has mean 0 and
    standard deviation 1/sqrt(dim) (about 0.05 at 384d, about 0.03 at 1024d).
    All randomness comes from the caller-supplied Generator, so a seeded
    `rng` yields the same sequence of vectors on every run.

    Args:
        rng: numpy Generator to draw from; its state is advanced by the draw.
        dim: number of components; defaults to the module-level EMBED_DIM.

    Returns:
        The normalised vector as a plain Python list of floats.
    """
    gaussian = rng.standard_normal(dim)
    return (gaussian / np.linalg.norm(gaussian)).tolist()


def _make_noise(i: int, rng: np.random.Generator, dim: int = EMBED_DIM) -> MemoryRecord:
    """Build one unpinned filler record with a random unit-vector embedding.

    H-03 honesty fix: an earlier version embedded noise as [-0.5]^EMBED_DIM,
    which is exactly anti-parallel to the all-ones cue and so made
    pinned-vs-noise separation a geometric artifact. Drawing the embedding
    from _random_unit_vector keeps noise roughly orthogonal to the cue on
    average while staying reproducible for a seeded `rng`.

    The record is tier="episodic", detail_level=2, not pinned, with
    never_decay and never_merge both False, no tags, and language="en".

    Args:
        i: ordinal embedded in the record's literal_surface text.
        rng: numpy Generator supplying the embedding; advanced by one draw.
        dim: embedding length; defaults to the module-level EMBED_DIM.

    Returns:
        A new MemoryRecord; nothing is inserted into any store here.
    """
    fields = {
        "id": uuid4(),
        "tier": "episodic",
        "literal_surface": f"unrelated session noise record #{i}: " + ("y " * 20),
        "aaak_index": "",
        "embedding": _random_unit_vector(rng, dim=dim),
        "community_id": None,
        "centrality": 0.0,
        "detail_level": 2,
        "pinned": False,
        "stability": 0.0,
        "difficulty": 0.0,
        "last_reviewed": None,
        "never_decay": False,
        "never_merge": False,
        "provenance": [],
        "created_at": datetime.now(timezone.utc),
        "updated_at": datetime.now(timezone.utc),
        "tags": [],
        "language": "en",
    }
    return MemoryRecord(**fields)


def run_verbatim_bench(
    store: MemoryStore | None = None,
    n_records: int = 20,
    session_gap: int = 20,
    noise_per_session: int = 10,
    seed: int = NOISE_SEED,
    *,
    skip_l0_seed: bool = False,
    storage_direct: bool = False,
    k: int | None = None,
) -> dict:
    """Insert pinned records, bury them under noise, and count exact recalls.

    Steps, in order:
      1. Unless `skip_l0_seed`, call _seed_l0_identity on the store.
      2. Insert `n_records` pinned records (see _make_pinned).
      3. Insert `session_gap * noise_per_session` noise records
         (see _make_noise), all drawn from one RNG seeded with `seed`.
      4. For each pinned text, retrieve the top-k with the all-ones cue and
         count a hit when that exact text is among the returned
         literal_surface values.

    Args:
        store: store to run against; a default-constructed MemoryStore() is
            used when None. The bench inserts into whichever store it gets.
        n_records: number of pinned records to write and then look for.
        session_gap: number of simulated noise sessions.
        noise_per_session: noise records inserted per simulated session.
        seed: seed for the noise RNG (H-03: reproducible noise).
        skip_l0_seed: D5-01 effect (b) isolation -- do not seed the L0
            identity record, so it cannot compete with the pinned records.
            Bench-scope only.
        storage_direct: D5-01 effect (c) isolation -- query
            store.query_similar directly instead of retrieve.recall(), taking
            recall()'s per-hit provenance writes out of the loop.
            Bench-scope only.
        k: top-k passed as recall(k_hits=...) or query_similar(k=...);
            None selects max(n_records + 10, 20).

    Returns:
        dict with keys, in this order: accuracy, n_records, session_gap,
        noise_per_session, hits_exact, passed, floor, noise_mode, noise_seed,
        skip_l0_seed, storage_direct, k. `accuracy` is hits_exact / n_records
        (0.0 when n_records is not positive) and `passed` is
        accuracy >= ACCURACY_FLOOR.
    """
    bench_store = MemoryStore() if store is None else store
    if not skip_l0_seed:
        _seed_l0_identity(bench_store)

    # Size every vector from the store itself rather than the module constant,
    # so a store created at a different dimensionality (e.g. a 384d store that
    # predates the D-35 migration) is matched transparently.
    embed_dim = bench_store.embed_dim

    pinned_texts = [
        f"Alice said on day {i}: verbatim phrase #{i}-{'x' * 10}"
        for i in range(n_records)
    ]
    # Build the whole pinned batch first, then write it.
    pinned_batch = [_make_pinned(phrase, dim=embed_dim) for phrase in pinned_texts]
    for record in pinned_batch:
        bench_store.insert(record)

    # H-03: a single seeded RNG feeds every noise draw, so the noise set is
    # reproducible. Indices run 0 .. session_gap * noise_per_session - 1,
    # session by session.
    noise_rng = np.random.default_rng(seed)
    noise_indices = (
        session * noise_per_session + slot
        for session in range(session_gap)
        for slot in range(noise_per_session)
    )
    for noise_idx in noise_indices:
        bench_store.insert(_make_noise(noise_idx, noise_rng, dim=embed_dim))

    cue_vector = [1.0] * embed_dim
    # Every pinned record shares the cue's embedding, so k has to be at least
    # n_records for all of them to be able to surface; the default adds
    # headroom for the L0 seed and the anti-hits tail.
    if k is None:
        effective_k = max(n_records + 10, 20)
    else:
        effective_k = k

    def _retrieved_surfaces(cue_text: str) -> list:
        """Return the literal_surface of every record retrieved for one cue."""
        if storage_direct:
            # D5-01 (c): straight to the store, skipping recall() and with it
            # the per-hit provenance write amplification.
            ranked = bench_store.query_similar(cue_vector, k=effective_k)
            return [record.literal_surface for record, _score in ranked]
        # mode="concept" is pinned on purpose. Per the original bench notes,
        # recall() defaults to mode="verbatim", whose filter would drop the
        # tier="semantic" pinned records (recall is defined outside this file;
        # confirm there). The bench measures exact-text recall under noise
        # with synthetic cues, which does not depend on the cue router's mode.
        response = recall(
            store=bench_store,
            cue_embedding=cue_vector,
            cue_text=cue_text,
            session_id="bench-verbatim",
            budget_tokens=5000,
            k_hits=effective_k,
            k_anti=3,
            mode="concept",
        )
        return [hit.literal_surface for hit in response.hits]

    # One retrieval per pinned text; a hit requires the exact text to be
    # present among the returned surfaces.
    hits_exact = sum(
        1 for phrase in pinned_texts if phrase in _retrieved_surfaces(phrase)
    )

    if n_records > 0:
        accuracy = hits_exact / n_records
    else:
        accuracy = 0.0

    # Key order is significant: main() serialises this dict as-is.
    return {
        "accuracy": accuracy,
        "n_records": n_records,
        "session_gap": session_gap,
        "noise_per_session": noise_per_session,
        "hits_exact": hits_exact,
        "passed": accuracy >= ACCURACY_FLOOR,
        "floor": ACCURACY_FLOOR,
        "noise_mode": "random-unit-vectors",
        "noise_seed": seed,
        # Diagnostic traceability: which isolation flags and k were in effect.
        "skip_l0_seed": bool(skip_l0_seed),
        "storage_direct": bool(storage_direct),
        "k": int(effective_k),
    }


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="bench.verbatim",
        description="OPS-04 / verbatim recall benchmark + diagnostics",
    )
    parser.add_argument(
        "--skip-l0-seed",
        action="store_true",
        help="D5-01 diagnostic: skip _seed_l0_identity to isolate L0 crowding effect",
    )
    parser.add_argument(
        "--storage-direct",
        action="store_true",
        help="D5-01 diagnostic: bypass recall(), call store.query_similar directly",
    )
    parser.add_argument(
        "--n", "--n-records",
        dest="n_records",
        type=int,
        default=20,
        help="pinned record count (default 20)",
    )
    parser.add_argument(
        "--gap", "--session-gap",
        dest="session_gap",
        type=int,
        default=20,
        help="session gap -- how many noise sessions between writes and recall (default 20)",
    )
    parser.add_argument(
        "--noise-per-session",
        type=int,
        default=10,
        help="noise records per simulated session (default 10)",
    )
    parser.add_argument(
        "--k",
        type=int,
        default=None,
        help="override k_hits (default: max(n_records + 10, 20))",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse CLI options, run the bench once, and print the result as JSON.

    The store and seed arguments of run_verbatim_bench are not passed, so the
    run uses that function's defaults for both.

    Args:
        argv: argument list to parse; None lets argparse read sys.argv[1:].

    Returns:
        0 when the result's "passed" entry is truthy, otherwise 1.
    """
    opts = _build_arg_parser().parse_args(argv)
    bench_kwargs = {
        "n_records": opts.n_records,
        "session_gap": opts.session_gap,
        "noise_per_session": opts.noise_per_session,
        "skip_l0_seed": opts.skip_l0_seed,
        "storage_direct": opts.storage_direct,
        "k": opts.k,
    }
    report = run_verbatim_bench(**bench_kwargs)
    # Single-line JSON on stdout is the machine-readable contract.
    print(json.dumps(report))
    if report["passed"]:
        return 0
    return 1


# Script entry point: run the bench and use main()'s return value
# (0 = passed, 1 = failed) as the process exit status.
if __name__ == "__main__":
    sys.exit(main())