Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
219 lines
8.5 KiB
Python
219 lines
8.5 KiB
Python
"""Plan 05-02 regression fence — rank stability + C5 invariant.
|
|
|
|
Scoped to the DIAGNOSTIC-NOTE.md dominant-effect verdict:
|
|
|
|
Dominant effect: (c) provenance-write amplification
|
|
|
|
Effect (a) and (b) each contributed 0% to accuracy on the reference host, so
|
|
this test file covers the three tests that directly fence effect (c) and the
|
|
C5/MEM-05 invariants that must survive the Task 2 batching fix. Test 4 (L0
|
|
crowding) from the plan is NOT included because the 05-01 verdict disconfirmed
|
|
effect (b) on this host.
|
|
|
|
Expected state PRE-Task 2 fix on THIS host (16 GB+):
|
|
- Test 1 (rank stability) likely PASSES on a fresh store with baseline recall
|
|
— the 05-01 diagnostic showed accuracy=1.0 on this host even with the per-hit
|
|
provenance loop. Test 1's rank-stability fence is still load-bearing because
|
|
on memory-pressed hosts (pressplay 8 GB) the same per-hit loop tips into
|
|
swap thrash and perturbs ranks. Test 1 locks the invariant in place so that
|
|
future regressions (any change that restores the N+1 append pattern, e.g.
|
|
accidental revert) are caught in CI regardless of host memory.
|
|
- Test 2 (top-60 pinned coverage) PASSES (the bench numbers matched at 1.0).
|
|
- Test 3 (literal preservation) PASSES (C5 invariant is already enforced).
|
|
|
|
Expected state POST-Task 2 fix: all three tests PASS.
|
|
|
|
Constitutional invariants covered:
|
|
- C5 literal preservation (Test 3)
|
|
- provenance creation (Test 1 auxiliary assertion — batched write still
|
|
produces exactly k_hits new provenance entries per recall call)
|
|
- verbatim recall at runbook profile (Test 2)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime, timezone
|
|
from uuid import uuid4
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from bench.verbatim import _make_noise, _make_pinned
|
|
from iai_mcp.retrieve import recall
|
|
from iai_mcp.store import EMBED_DIM, MemoryStore
|
|
from iai_mcp.types import MemoryRecord
|
|
|
|
|
|
NOISE_SEED = 20260419
|
|
|
|
|
|
def _seed_store(tmp_path, n_pinned: int, n_noise: int, dim: int = EMBED_DIM):
|
|
"""Isolated store with n_pinned + n_noise records.
|
|
|
|
Pinned records use identical embedding = [1.0]*dim so cosine ties across
|
|
all of them — this is the tie-break stress profile Test 1 needs. Noise
|
|
uses seeded random unit vectors.
|
|
"""
|
|
store = MemoryStore(path=tmp_path)
|
|
pinned_texts = [
|
|
f"Alice pinned verbatim day {i}: phrase-{i}-{'x' * 10}"
|
|
for i in range(n_pinned)
|
|
]
|
|
pinned_records = [_make_pinned(t, dim=dim) for t in pinned_texts]
|
|
for r in pinned_records:
|
|
store.insert(r)
|
|
|
|
rng = np.random.default_rng(NOISE_SEED)
|
|
for j in range(n_noise):
|
|
store.insert(_make_noise(j, rng, dim=dim))
|
|
return store, pinned_records, pinned_texts
|
|
|
|
|
|
def test_topk_rank_identical_across_sequential_queries(tmp_path):
|
|
"""Effect (c) rank-stability fence.
|
|
|
|
Seeds 30 pinned (tied at cosine=1.0) + 100 noise, calls recall 20x with
|
|
the SAME cue. Asserts the top-30 hit set and per-slot (record_id,
|
|
literal_surface) tuple is byte-identical across every call.
|
|
|
|
If the per-hit `store.append_provenance(...)` loop inside `recall()`
|
|
perturbs the LanceDB vector index mid-run (the pressplay failure mode),
|
|
rank drift will cause this assertion to fail.
|
|
|
|
Auxiliary assertion: for each of 20 sequential recalls, the pinned
|
|
records' cumulative provenance entry count increases by exactly k_hits per
|
|
call (batching preserves the "every recall → provenance entry" invariant,
|
|
it only changes WHEN the writes happen).
|
|
"""
|
|
store, pinned, pinned_texts = _seed_store(tmp_path, n_pinned=30, n_noise=100)
|
|
dim = store.embed_dim
|
|
cue = [1.0] * dim
|
|
|
|
# retrieve.recall now defaults to mode='verbatim'
|
|
# (conservative North-Star fallback). The fixture pinned records are
|
|
# tier='semantic' (per bench/verbatim._make_pinned), which verbatim mode
|
|
# filters out — leaving zero hits. The rank-stability invariant
|
|
# this test covers is mode-agnostic (it tests provenance-batch ordering
|
|
# under recall pressure), so pin to mode='concept' explicitly.
|
|
resp0 = recall(
|
|
store=store,
|
|
cue_embedding=cue,
|
|
cue_text="probe",
|
|
session_id="t0",
|
|
budget_tokens=5000,
|
|
k_hits=30,
|
|
k_anti=3,
|
|
mode="concept",
|
|
)
|
|
baseline_ids = tuple((h.record_id, h.literal_surface) for h in resp0.hits)
|
|
assert len(baseline_ids) >= 1, "recall returned zero hits; harness broken"
|
|
|
|
# Cap k_hits at n_pinned to avoid mixing noise into the deterministic head.
|
|
# Every pinned is cosine=1.0; any reordering among them is rank drift.
|
|
for i in range(1, 20):
|
|
resp = recall(
|
|
store=store,
|
|
cue_embedding=cue,
|
|
cue_text="probe",
|
|
session_id=f"t{i}",
|
|
budget_tokens=5000,
|
|
k_hits=30,
|
|
k_anti=3,
|
|
mode="concept",
|
|
)
|
|
current = tuple((h.record_id, h.literal_surface) for h in resp.hits)
|
|
assert current == baseline_ids, (
|
|
f"rank drift at iteration {i}: top-k set changed between sequential "
|
|
f"recalls with identical cue. Baseline={baseline_ids}, current={current}. "
|
|
f"This indicates effect (c) provenance-write amplification is perturbing "
|
|
f"the LanceDB vector index."
|
|
)
|
|
|
|
# auxiliary: every pinned record should have >= 20 provenance entries
|
|
# (one per recall that returned it in top-k). Because the cue is cosine=1.0
|
|
# to every pinned, ALL 30 pinned are in top-30 on every call => exactly 20
|
|
# new entries per pinned.
|
|
for rec in pinned:
|
|
updated = store.get(rec.id)
|
|
assert updated is not None, f"pinned record {rec.id} vanished"
|
|
# Allow tolerance for batch write ordering, but each pinned must have
|
|
# >= 20 entries (20 recalls * 1 hit each).
|
|
assert len(updated.provenance) >= 20, (
|
|
f"MEM-05 violation: pinned {rec.id} has "
|
|
f"{len(updated.provenance)} provenance entries after 20 recalls "
|
|
f"(expected >= 20)."
|
|
)
|
|
|
|
|
|
def test_topk_contains_all_pinned_at_runbook_profile(tmp_path):
|
|
"""OPS-04 gate at the runbook profile (n=50 pinned, k=60, 200 noise).
|
|
|
|
At k=60 with 50 pinned + 200 noise, every pinned should be in the top-60.
|
|
This is the in-process mirror of `bench/verbatim.py --n 50 --gap 5
|
|
--noise-per-session 40 --k 60`, minus the provenance-write amplification
|
|
angle that Test 1 covers.
|
|
"""
|
|
store, pinned, _ = _seed_store(tmp_path, n_pinned=50, n_noise=200)
|
|
dim = store.embed_dim
|
|
cue = [1.0] * dim
|
|
|
|
# pin mode='concept' so tier='semantic' pinned records
|
|
# survive the candidate filter (verbatim mode would drop them).
|
|
resp = recall(
|
|
store=store,
|
|
cue_embedding=cue,
|
|
cue_text="probe",
|
|
session_id="runbook",
|
|
budget_tokens=50_000,
|
|
k_hits=60,
|
|
k_anti=3,
|
|
mode="concept",
|
|
)
|
|
hit_ids = {h.record_id for h in resp.hits}
|
|
pinned_ids = {r.id for r in pinned}
|
|
missing = pinned_ids - hit_ids
|
|
assert not missing, (
|
|
f"OPS-04 violation at runbook profile: "
|
|
f"{len(missing)}/{len(pinned_ids)} pinned records missing from top-60. "
|
|
f"Missing surface (first 3): "
|
|
f"{sorted(str(m)[:8] for m in list(missing)[:3])}"
|
|
)
|
|
|
|
|
|
def test_no_literal_surface_mutation(tmp_path):
|
|
"""C5 invariant: literal_surface is byte-identical pre/post recalls.
|
|
|
|
Belt-and-suspenders against any future change that would write to
|
|
`literal_surface` during the recall path. The batching fix (Task 2) does
|
|
not touch this field, but the invariant test locks it in so a regression
|
|
in any other part of recall is caught immediately.
|
|
"""
|
|
store, pinned, _ = _seed_store(tmp_path, n_pinned=10, n_noise=40)
|
|
dim = store.embed_dim
|
|
cue = [1.0] * dim
|
|
|
|
# Snapshot literal_surface bytes before recalls.
|
|
pre = {r.id: store.get(r.id).literal_surface for r in pinned}
|
|
|
|
# 20 sequential recalls.
|
|
# mode='concept' so tier='semantic' pinned records
|
|
# survive the candidate filter.
|
|
for i in range(20):
|
|
recall(
|
|
store=store,
|
|
cue_embedding=cue,
|
|
cue_text=f"probe-{i}",
|
|
session_id=f"s{i}",
|
|
budget_tokens=5000,
|
|
k_hits=15,
|
|
k_anti=3,
|
|
mode="concept",
|
|
)
|
|
|
|
# Post-recall snapshot: every byte unchanged.
|
|
post = {r.id: store.get(r.id).literal_surface for r in pinned}
|
|
assert pre.keys() == post.keys()
|
|
for rid in pre:
|
|
assert pre[rid] == post[rid], (
|
|
f"C5 violation: literal_surface of record {rid} mutated "
|
|
f"by recall path. Before={pre[rid]!r}, after={post[rid]!r}."
|
|
)
|