"""Phase 8 redesign (08-CONTEXT.md D-07): production answer-packing entry-point contract. Tests the new public function `recall_for_response(...)` introduced by Plan 08-02. Contract: - Signature: store, graph, assignment, rich_club, embedder, cue, session_id, budget_tokens=1500, profile_state=None, turn=0, mode='concept'. - NO `k_hits` parameter — calling with `k_hits=10` MUST raise TypeError. - Returns RecallResponse (not _RecallCoreResult). - Packs hits under `budget_tokens` per the pre-Phase-8 production contract: each hit contributes `len(literal_surface) // 4` tokens to the running budget; loop breaks when `budget_used + tokens > budget_tokens` AND `len(hits) >= 1` (always at least one hit when one exists). - mode plumbing: the `mode` parameter threads through to `_recall_core` unchanged. Cross-file: see `tests/test_recall_for_benchmark.py` for the top-K contract, and `tests/test_recall_core_unit.py` for the underlying `_recall_core` shape and stage-internal behaviour. """ from __future__ import annotations from datetime import datetime, timezone from uuid import uuid4 import pytest from iai_mcp.community import CommunityAssignment from iai_mcp.graph import MemoryGraph from iai_mcp.store import MemoryStore from iai_mcp.types import EMBED_DIM, MemoryRecord, RecallResponse # ------------------------------------------------------------ test fixtures class _FakeEmbedder: """Stand-in embedder. The cue's embedding is configurable per-test.""" DIM = EMBED_DIM def __init__(self, vec: list[float] | None = None) -> None: self._vec = vec if vec is not None else [1.0] + [0.0] * (EMBED_DIM - 1) def embed(self, text: str) -> list[float]: return list(self._vec) def embed_batch(self, texts: list[str]) -> list[list[float]]: return [list(self._vec) for _ in texts] def _make( vec: list[float], text: str = "rec", aaak: str = "", tier: str = "episodic", ) -> MemoryRecord: now = datetime.now(timezone.utc) return MemoryRecord( id=uuid4(), tier=tier, literal_surface=text, aaak_index=aaak, embedding=vec, community_id=None, centrality=0.0, detail_level=2, pinned=False, stability=0.0, difficulty=0.0, last_reviewed=None, never_decay=False, never_merge=False, provenance=[], created_at=now, updated_at=now, tags=[], language="en", ) def _build_store_and_graph( tmp_path, n: int, surface_len: int = 4, ) -> tuple[MemoryStore, MemoryGraph, list[MemoryRecord]]: """Build N records with primary-axis distinct embeddings + matching graph. Each record's literal_surface has `surface_len` characters so the per-hit token estimate is `surface_len // 4`. Tune `surface_len` to control budget-pack behaviour deterministically. """ store = MemoryStore(path=tmp_path / "lancedb") recs: list[MemoryRecord] = [] for i in range(n): vec = [0.0] * EMBED_DIM vec[i % EMBED_DIM] = 1.0 text = "x" * surface_len rec = _make(vec, text=text) store.insert(rec) recs.append(rec) graph = MemoryGraph() for rec in recs: graph.add_node( rec.id, community_id=None, embedding=list(rec.embedding), ) graph._nx.nodes[str(rec.id)].update({ "embedding": list(rec.embedding), "surface": rec.literal_surface, "centrality": 0.0, "tier": rec.tier, "tags": [], "language": "en", }) return store, graph, recs def _flat_assignment(recs: list[MemoryRecord]) -> CommunityAssignment: """Single flat community covering all records (healthy graph baseline).""" cid = uuid4() centroid = [1.0] + [0.0] * (EMBED_DIM - 1) return CommunityAssignment( node_to_community={r.id: cid for r in recs}, community_centroids={cid: centroid}, modularity=0.0, backend="flat", top_communities=[cid], mid_regions={cid: [r.id for r in recs]}, ) # -------------------------------------------------- contract / signature tests def test_recall_for_response_no_k_hits_param(tmp_path) -> None: """Test 1: calling with `k_hits=10` raises TypeError. The contract split is the whole point: production answer-packing cannot accept a top-K cap parameter, otherwise an optional argument would let the two contracts silently swap semantics. """ from iai_mcp.pipeline import recall_for_response store, graph, recs = _build_store_and_graph(tmp_path, n=5) assignment = _flat_assignment(recs) with pytest.raises(TypeError): recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s1", k_hits=10, # this kwarg does not exist ) def test_recall_for_response_returns_recall_response_type(tmp_path) -> None: """Test 2: returns a RecallResponse with all 7 fields populated.""" from iai_mcp.pipeline import recall_for_response store, graph, recs = _build_store_and_graph(tmp_path, n=5) assignment = _flat_assignment(recs) resp = recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s2", ) assert isinstance(resp, RecallResponse) assert isinstance(resp.hits, list) assert isinstance(resp.anti_hits, list) assert isinstance(resp.activation_trace, list) assert isinstance(resp.budget_used, int) assert isinstance(resp.hints, list) assert isinstance(resp.cue_mode, str) assert isinstance(resp.patterns_observed, list) def test_recall_for_response_packs_under_budget(tmp_path) -> None: """Test 3: hits packed under `budget_tokens` per the pre-Phase-8 contract. Each record's literal_surface = 200 chars -> tokens = 200 // 4 = 50. With budget_tokens=120, the loop breaks after the first hit (50 tokens). Adding a second would push us to 100; adding a third would push us to 150 > 120 AND len(hits) >= 1, so we break. """ from iai_mcp.pipeline import recall_for_response # surface_len=200 -> 50 tokens per hit. store, graph, recs = _build_store_and_graph(tmp_path, n=5, surface_len=200) assignment = _flat_assignment(recs) resp = recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s3", budget_tokens=120, ) # Tight budget: 1 fits (50 tokens, budget_used=50), 2nd would push # to 100 (still <= 120, fits), 3rd would push to 150 > 120 AND # len(hits) >= 1, break. So we get exactly 2 hits. assert len(resp.hits) == 2 assert resp.budget_used == 100 def test_recall_for_response_returns_all_with_unlimited_budget(tmp_path) -> None: """Test 4: with budget_tokens=10000 (effectively unlimited), all hits are returned. The exhaustion is the ranker's natural stop, not the budget cap. """ from iai_mcp.pipeline import recall_for_response store, graph, recs = _build_store_and_graph(tmp_path, n=5, surface_len=4) assignment = _flat_assignment(recs) resp = recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s4", budget_tokens=10000, ) # All 5 records fit (5 * 1 token = 5 tokens, budget = 10000). assert len(resp.hits) == 5 def test_recall_for_response_minimum_one_hit(tmp_path) -> None: """Test 5: with extremely tight budget, the minimum-1-hit guard returns 1 hit. Even when the first hit's tokens exceed `budget_tokens`, the contract guarantees `len(hits) >= 1` when at least one ranked hit exists. """ from iai_mcp.pipeline import recall_for_response # surface_len=400 -> 100 tokens per hit; budget=50 (tighter than even 1 hit). store, graph, recs = _build_store_and_graph(tmp_path, n=5, surface_len=400) assignment = _flat_assignment(recs) resp = recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s5", budget_tokens=50, ) # One hit always survives (the production "always at least one" guard). assert len(resp.hits) == 1 def test_recall_for_response_threads_mode_to_core(tmp_path) -> None: """Test 5b: wiring — `mode` flows from entry point to `_recall_core` unchanged. Calling with `mode="verbatim"` must produce a response whose `cue_mode == "verbatim"` (proves the parameter threaded through). """ from iai_mcp.pipeline import recall_for_response store, graph, recs = _build_store_and_graph(tmp_path, n=5) assignment = _flat_assignment(recs) resp_v = recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s5b", budget_tokens=10000, mode="verbatim", ) assert resp_v.cue_mode == "verbatim", ( f"verbatim mode did not propagate; cue_mode={resp_v.cue_mode}" ) resp_c = recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s5c", budget_tokens=10000, mode="concept", ) assert resp_c.cue_mode == "concept" def test_recall_for_response_signature_has_no_k_hits_param() -> None: """The function signature exposes `budget_tokens` and `mode` but NOT `k_hits`.""" import inspect from iai_mcp.pipeline import recall_for_response sig = inspect.signature(recall_for_response) assert "budget_tokens" in sig.parameters assert "mode" in sig.parameters assert "k_hits" not in sig.parameters, ( "recall_for_response signature must NOT carry a k_hits parameter " "(D-07 contract split — the entry-point split exists so the two " "response shapes can never silently swap via an optional kwarg)." ) def test_recall_for_response_default_budget_tokens_1500() -> None: """The default budget_tokens is 1500 (matches pre-Phase-8 production default).""" import inspect from iai_mcp.pipeline import recall_for_response sig = inspect.signature(recall_for_response) assert sig.parameters["budget_tokens"].default == 1500 # ------------------------------------------------------ shared / parity tests def test_recall_for_response_shares_core_with_benchmark(tmp_path) -> None: """Both entry points share `_recall_core` — only the final pack/cap differs. This test proves ("only the final pack/cap differs"): when called with the same fixture and the same `mode`, the cue-matched record (cosine=1.0) must be the top hit on BOTH entry points, and both must surface the same set of record_ids (only ordering of tied-cosine records may differ across calls due to age-penalty floating-point drift between the two `datetime.now()` calls). """ from iai_mcp.pipeline import recall_for_benchmark, recall_for_response store, graph, recs = _build_store_and_graph(tmp_path, n=8, surface_len=4) assignment = _flat_assignment(recs) resp_y = recall_for_response( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s-shared-r", budget_tokens=10000, # unlimited so all ranked hits surface ) resp_b = recall_for_benchmark( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=_FakeEmbedder(), cue="test", session_id="s-shared-b", k_hits=100, # > graph size so all ranked hits surface ) # Top hit must be the cue-matched record (cosine=1.0 vs orthogonal 0.0 # for the rest) on both entry points — this is the load-bearing # ranking claim of D-07. assert resp_y.hits[0].record_id == resp_b.hits[0].record_id, ( "top scored hit (cosine=1.0 cue-match) must be identical across " "entry points; only the final pack/cap is supposed to differ" ) # Both entry points must surface the same SET of record_ids when # neither cap is binding. The within-set ordering may vary among # tied-cosine records due to age-penalty floating-point drift. r_set = {h.record_id for h in resp_y.hits} b_set = {h.record_id for h in resp_b.hits} assert r_set == b_set, ( f"recall_for_response and recall_for_benchmark must surface the " f"same record-id set when neither cap binds; got\n" f" response only: {r_set - b_set}\n" f" benchmark only: {b_set - r_set}" )