"""Plan 06-04 R7: baseline parity tests. R7 acceptance per SPEC.md: - retrieve.recall accepts mode kwarg (default 'verbatim' per D-14). - mode='verbatim' applies the same tier filter + schema exclusion as pipeline_recall verbatim mode. - core.dispatch falls back to retrieve.recall when build_runtime_graph fails — the classified mode is preserved (verbatim default protects the North-Star essential variable on the degraded path). - regression fence (test_recall_topk_stability) continues to pass. Constitutional framing — Ashby ultrastability: the North-Star verbatim ≥99% essential variable is defended even when the full pipeline is unreachable. The fallback path inherits the same contract on hits[] so the user never silently lands on a schema-dominated surface. """ from __future__ import annotations from datetime import datetime, timezone from uuid import uuid4 import pytest from iai_mcp.types import EMBED_DIM, MemoryRecord # --------------------------------------------------------- Fixture machinery def _make_episodic(text: str) -> MemoryRecord: now = datetime.now(timezone.utc) return MemoryRecord( id=uuid4(), tier="episodic", literal_surface=text, aaak_index="", embedding=[1.0] + [0.0] * (EMBED_DIM - 1), community_id=None, centrality=0.0, detail_level=2, pinned=False, stability=0.0, difficulty=0.0, last_reviewed=None, never_decay=False, never_merge=False, provenance=[], created_at=now, updated_at=now, tags=[], language="en", ) def _make_schema(text: str, pattern: str) -> MemoryRecord: now = datetime.now(timezone.utc) return MemoryRecord( id=uuid4(), tier="semantic", literal_surface=text, aaak_index="", embedding=[1.0] + [0.0] * (EMBED_DIM - 1), community_id=None, centrality=0.0, detail_level=3, pinned=False, stability=0.0, difficulty=0.0, last_reviewed=None, never_decay=True, never_merge=False, provenance=[], created_at=now, updated_at=now, tags=["schema", "draft", f"pattern:{pattern}"], language="en", ) @pytest.fixture(autouse=True) def _isolated_keyring(monkeypatch: pytest.MonkeyPatch): import keyring as _keyring fake: dict[tuple[str, str], str] = {} monkeypatch.setattr(_keyring, "get_password", lambda s, u: fake.get((s, u))) monkeypatch.setattr( _keyring, "set_password", lambda s, u, p: fake.__setitem__((s, u), p) ) monkeypatch.setattr( _keyring, "delete_password", lambda s, u: fake.pop((s, u), None) ) yield fake def _seed_mixed_tier_store(tmp_path): """Seed: 3 episodic + 2 schema (semantic + pattern:*) — all share the same embedding so cosine ties to the cue.""" from iai_mcp.store import MemoryStore store = MemoryStore(path=tmp_path / "lancedb") episodic_records = [_make_episodic(f"episodic verbatim text {i}") for i in range(3)] schema_records = [ _make_schema(f"schema record {i}", pattern=f"test:r7:{i}") for i in range(2) ] for r in episodic_records: store.insert(r) for r in schema_records: store.insert(r) return store, episodic_records, schema_records # ============================================================================ # R7 acceptance tests # ============================================================================ def test_baseline_recall_default_mode_is_verbatim_per_d14(): """retrieve.recall mode kwarg default is 'verbatim' per D-14 (conservative North-Star fallback).""" import inspect from iai_mcp.retrieve import recall sig = inspect.signature(recall) assert "mode" in sig.parameters, "retrieve.recall must accept mode kwarg" assert sig.parameters["mode"].default == "verbatim", ( f"retrieve.recall default mode must be 'verbatim' per D-14, " f"got {sig.parameters['mode'].default!r}" ) def test_baseline_recall_verbatim_filters_to_episodic_only(tmp_path): """Direct call: recall(store, ...) without mode kwarg returns hits filtered to tier='episodic' (D-14 default) — schema records excluded.""" from iai_mcp.retrieve import recall store, episodic_records, schema_records = _seed_mixed_tier_store(tmp_path) cue = [1.0] + [0.0] * (EMBED_DIM - 1) # No mode kwarg -> verbatim default per D-14. resp = recall( store=store, cue_embedding=cue, cue_text="probe", session_id="r7_default", k_hits=5, k_anti=2, ) assert resp.cue_mode == "verbatim", ( f"baseline default mode must be 'verbatim', got {resp.cue_mode!r}" ) schema_id_set = {r.id for r in schema_records} for h in resp.hits: assert h.record_id not in schema_id_set, ( f"verbatim mode baseline must exclude schema records; " f"schema {h.record_id} appeared in hits" ) rec = store.get(h.record_id) assert rec is not None assert rec.tier == "episodic", ( f"verbatim mode hit {h.record_id} has tier {rec.tier!r}, expected 'episodic'" ) def test_baseline_recall_concept_mode_returns_all_tiers(tmp_path): """recall(..., mode='concept') returns the existing pure-cosine top-k INCLUDING all tiers (no filter — concept mode preserves baseline behaviour).""" from iai_mcp.retrieve import recall store, episodic_records, schema_records = _seed_mixed_tier_store(tmp_path) cue = [1.0] + [0.0] * (EMBED_DIM - 1) resp = recall( store=store, cue_embedding=cue, cue_text="probe", session_id="r7_concept", k_hits=5, k_anti=2, mode="concept", ) assert resp.cue_mode == "concept" # All 5 records (3 episodic + 2 schema) tied at cosine=1.0; with k_hits=5 # we should see all 5. Schema records ARE included on concept mode (the # baseline does not filter; only the full pipeline applies R6 split). hit_ids = {h.record_id for h in resp.hits} schema_id_set = {r.id for r in schema_records} assert schema_id_set & hit_ids, ( f"concept mode baseline must include schema tier (no filter); " f"schema_ids={schema_id_set}, hit_ids={hit_ids}" ) def test_dispatch_falls_back_to_baseline_on_graph_build_failure(tmp_path, monkeypatch): """R7 acceptance: monkeypatch retrieve.build_runtime_graph to raise. dispatch(..., 'memory_recall', {...verbatim cue...}) must: (a) complete (not propagate the exception); (b) return a non-empty response; (c) cue_mode == 'verbatim'; (d) all hits are tier='episodic' (verbatim filter applied via fallback). """ from iai_mcp import core from iai_mcp import retrieve as _retrieve_mod store, episodic_records, schema_records = _seed_mixed_tier_store(tmp_path) def fake_build(*args, **kwargs): raise RuntimeError("simulated graph build failure") monkeypatch.setattr(_retrieve_mod, "build_runtime_graph", fake_build) response = core.dispatch( store, "memory_recall", {"cue": "verbatim quote about migration", "session_id": "r7_fallback", "cue_embedding": [1.0] + [0.0] * (EMBED_DIM - 1)}, ) # (a) dispatch completed without raising — we have a response. assert isinstance(response, dict) # (c) classified mode preserved on the fallback path. assert response["cue_mode"] == "verbatim", ( f"verbatim cue must classify to verbatim even when graph build fails; " f"got {response['cue_mode']!r}" ) # (b) + (d) hits are episodic-only (when present). schema_id_strs = {str(r.id) for r in schema_records} for h in response["hits"]: assert h["record_id"] not in schema_id_strs, ( f"fallback path must apply verbatim filter; schema {h['record_id']} " f"appeared in hits despite graph build failure + verbatim cue" ) def test_recall_topk_stability_smoke(tmp_path): """Smoke check: tests/test_recall_topk_stability.py still passes with the mode='concept' explicit pin we added in Task 2 GREEN. The actual lock is the dedicated test file; this test merely imports + runs one of its representative invariants here as a sentinel. """ # Direct import + smoke run of the canonical helper from the existing # regression-fence module. If the module can't import at all under the # changes, this test catches it (import-time errors). import importlib mod = importlib.import_module("tests.test_recall_topk_stability") assert hasattr(mod, "test_no_literal_surface_mutation"), ( "regression-fence module must still expose its sentinel test" ) # Run one of the lighter assertions inline so this test does meaningful # work — the C5 literal_surface invariant. Runs in <2s. mod.test_no_literal_surface_mutation(tmp_path)