# iai-mcp-opencode/tests/test_recall_topk_stability.py

"""Plan 05-02 regression fence — rank stability + C5 invariant.

Scoped to the DIAGNOSTIC-NOTE.md dominant-effect verdict:

    Dominant effect: (c) provenance-write amplification

Effect (a) and (b) each contributed 0% to accuracy on the reference host, so
this test file covers the three tests that directly fence effect (c) and the
C5/MEM-05 invariants that must survive the Task 2 batching fix. Test 4 (L0
crowding) from the plan is NOT included because the 05-01 verdict disconfirmed
effect (b) on this host.

Expected state PRE-Task 2 fix on THIS host (16 GB+):
- Test 1 (rank stability) likely PASSES on a fresh store with baseline recall
  — the 05-01 diagnostic showed accuracy=1.0 on this host even with the per-hit
  provenance loop. Test 1's rank-stability fence is still load-bearing because
  on memory-pressed hosts (pressplay 8 GB) the same per-hit loop tips into
  swap thrash and perturbs ranks. Test 1 locks the invariant in place so that
  future regressions (any change that restores the N+1 append pattern, e.g.
  accidental revert) are caught in CI regardless of host memory.
- Test 2 (top-60 pinned coverage) PASSES (the bench numbers matched at 1.0).
- Test 3 (literal preservation) PASSES (C5 invariant is already enforced).

Expected state POST-Task 2 fix: all three tests PASS.

Constitutional invariants covered:
- C5 literal preservation (Test 3)
- provenance creation (Test 1 auxiliary assertion — batched write still
  produces k_hits new provenance entries per recall call; the test enforces
  this as a lower bound of >= 20 entries per pinned record after 20 recalls)
- verbatim recall at runbook profile (Test 2)
"""
from __future__ import annotations

from datetime import datetime, timezone
from uuid import uuid4

import numpy as np
import pytest

from bench.verbatim import _make_noise, _make_pinned
from iai_mcp.retrieve import recall
from iai_mcp.store import EMBED_DIM, MemoryStore
from iai_mcp.types import MemoryRecord


# Fixed seed handed to np.random.default_rng() in _seed_store so the generator
# that drives the noise records yields the same stream on every run.
NOISE_SEED = 20260419


def _seed_store(tmp_path, n_pinned: int, n_noise: int, dim: int = EMBED_DIM):
    """Create an isolated store populated with pinned and noise records.

    Pinned records are built by ``_make_pinned`` from generated texts and
    inserted first; noise records come from ``_make_noise`` driven by a
    generator seeded with ``NOISE_SEED``.

    NOTE(review): the tie-break stress profile relies on ``_make_pinned``
    giving every pinned record the same embedding ([1.0] * dim) and on
    ``_make_noise`` producing random unit vectors — both live in
    ``bench.verbatim``; confirm there.

    Args:
        tmp_path: Directory handed to ``MemoryStore`` as its ``path``.
        n_pinned: Number of pinned records to insert.
        n_noise: Number of noise records to insert.
        dim: Embedding dimension forwarded to both record factories.

    Returns:
        A ``(store, pinned_records, pinned_texts)`` tuple; the two lists are
        index-aligned and in insertion order.
    """
    seeded = MemoryStore(path=tmp_path)

    texts = []
    records = []
    for day in range(n_pinned):
        text = f"Alice pinned verbatim day {day}: phrase-{day}-{'x' * 10}"
        texts.append(text)
        records.append(_make_pinned(text, dim=dim))

    # Insert only once every pinned record exists, so construction and
    # insertion stay two separate phases.
    for record in records:
        seeded.insert(record)

    noise_rng = np.random.default_rng(NOISE_SEED)
    for idx in range(n_noise):
        seeded.insert(_make_noise(idx, noise_rng, dim=dim))

    return seeded, records, texts


def test_topk_rank_identical_across_sequential_queries(tmp_path):
    """Effect (c) rank-stability fence.

    Seeds 30 pinned + 100 noise records, then issues 20 recalls with the SAME
    cue: one baseline call followed by 19 repeats. Every repeat must return
    the same ordered sequence of (record_id, literal_surface) pairs as the
    baseline; any difference is reported as rank drift.

    The concern being fenced is that provenance writes performed by
    ``recall()`` could perturb the vector index between calls (the pressplay
    failure mode) and reshuffle the tied pinned records.

    Auxiliary check: after the 20 recalls, every pinned record must still be
    retrievable via ``store.get`` and carry at least 20 provenance entries
    (one per recall that returned it). This is a lower bound, not an
    exact-count assertion.
    """
    store, pinned, _texts = _seed_store(tmp_path, n_pinned=30, n_noise=100)
    cue = [1.0] * store.embed_dim

    # retrieve.recall now defaults to mode='verbatim'
    # (conservative North-Star fallback). The fixture pinned records are
    # tier='semantic' (per bench/verbatim._make_pinned), which verbatim mode
    # filters out — leaving zero hits. The rank-stability invariant
    # this test covers is mode-agnostic (it tests provenance-batch ordering
    # under recall pressure), so pin to mode='concept' explicitly.
    def _probe(session_id):
        # k_hits equals n_pinned so noise is not mixed into the
        # deterministic head of the ranking.
        return recall(
            store=store,
            cue_embedding=cue,
            cue_text="probe",
            session_id=session_id,
            budget_tokens=5000,
            k_hits=30,
            k_anti=3,
            mode="concept",
        )

    def _fingerprint(response):
        # Ordered, so a reshuffle among tied records counts as drift.
        return tuple((hit.record_id, hit.literal_surface) for hit in response.hits)

    baseline_ids = _fingerprint(_probe("t0"))
    assert baseline_ids, "recall returned zero hits; harness broken"

    # 19 repeats on top of the baseline call => 20 recalls in total.
    for step in range(1, 20):
        current = _fingerprint(_probe(f"t{step}"))
        assert current == baseline_ids, (
            f"rank drift at iteration {step}: top-k set changed between sequential "
            f"recalls with identical cue. Baseline={baseline_ids}, current={current}. "
            f"This indicates effect (c) provenance-write amplification is perturbing "
            f"the LanceDB vector index."
        )

    # Auxiliary: each pinned record is expected in the top-30 of every call,
    # so it should have accumulated at least one provenance entry per recall.
    for rec in pinned:
        updated = store.get(rec.id)
        assert updated is not None, f"pinned record {rec.id} vanished"
        entry_count = len(updated.provenance)
        assert entry_count >= 20, (
            f"MEM-05 violation: pinned {rec.id} has "
            f"{entry_count} provenance entries after 20 recalls "
            f"(expected >= 20)."
        )


def test_topk_contains_all_pinned_at_runbook_profile(tmp_path):
    """OPS-04 gate at the runbook profile (n=50 pinned, k=60, 200 noise).

    Runs a single recall with k_hits=60 against a store of 50 pinned + 200
    noise records and requires every pinned record id to appear among the
    returned hits. This is the in-process mirror of `bench/verbatim.py --n 50
    --gap 5 --noise-per-session 40 --k 60`, without the repeated-recall
    provenance angle that Test 1 covers.
    """
    store, pinned, _texts = _seed_store(tmp_path, n_pinned=50, n_noise=200)
    cue = [1.0] * store.embed_dim

    # pin mode='concept' so tier='semantic' pinned records
    # survive the candidate filter (verbatim mode would drop them).
    response = recall(
        store=store,
        cue_embedding=cue,
        cue_text="probe",
        session_id="runbook",
        budget_tokens=50_000,
        k_hits=60,
        k_anti=3,
        mode="concept",
    )

    returned = set()
    for hit in response.hits:
        returned.add(hit.record_id)
    expected = {rec.id for rec in pinned}

    # Pinned ids that the recall failed to surface.
    missing = expected.difference(returned)
    assert not missing, (
        f"OPS-04 violation at runbook profile: "
        f"{len(missing)}/{len(expected)} pinned records missing from top-60. "
        f"Missing surface (first 3): "
        f"{sorted(str(m)[:8] for m in list(missing)[:3])}"
    )


def test_no_literal_surface_mutation(tmp_path):
    """C5 invariant: literal_surface is byte-identical pre/post recalls.

    Seeds 10 pinned + 40 noise records, snapshots each pinned record's
    ``literal_surface`` as read back from the store, runs 20 sequential
    recalls, snapshots again, and requires both snapshots to match key for
    key and value for value.

    Belt-and-suspenders against any future change that would write to
    `literal_surface` during the recall path. The batching fix (Task 2) does
    not touch this field, but the invariant test locks it in so a regression
    in any other part of recall is caught immediately.
    """
    store, pinned, _texts = _seed_store(tmp_path, n_pinned=10, n_noise=40)
    cue = [1.0] * store.embed_dim

    def _snapshot(stage):
        """Map each pinned record id to its stored literal_surface."""
        surfaces = {}
        for rec in pinned:
            stored = store.get(rec.id)
            # The rank-stability test in this file guards store.get() against
            # None; do the same here so a vanished record fails with a clear
            # message instead of an AttributeError on `.literal_surface`.
            assert stored is not None, (
                f"pinned record {rec.id} vanished ({stage})"
            )
            surfaces[rec.id] = stored.literal_surface
        return surfaces

    # Snapshot literal_surface before recalls.
    pre = _snapshot("before recalls")

    # 20 sequential recalls.
    # mode='concept' so tier='semantic' pinned records
    # survive the candidate filter.
    for i in range(20):
        recall(
            store=store,
            cue_embedding=cue,
            cue_text=f"probe-{i}",
            session_id=f"s{i}",
            budget_tokens=5000,
            k_hits=15,
            k_anti=3,
            mode="concept",
        )

    # Post-recall snapshot: every value unchanged.
    post = _snapshot("after recalls")
    assert pre.keys() == post.keys()
    for rid, before in pre.items():
        after = post[rid]
        assert before == after, (
            f"C5 violation: literal_surface of record {rid} mutated "
            f"by recall path. Before={before!r}, after={after!r}."
        )