"""Phase 8 redesign (08-CONTEXT.md D-02): regression-fence — community gate is a MODE-DEPENDENT diagnostic, not a hard filter. The redesign's load-bearing claim has two parts: 1. Records OUTSIDE the top-3 gated communities can still surface in `scored_hits[:K]` when their cosine rank is high. The gate never filters; it only biases. 2. The bias is mode-dependent (D-02 grounded in CLS / EPF / HIPPEA / Ashby / Beer VSM): - verbatim mode -> 0.0 (HIPPEA pure / EPF literal / hippocampal episodic; categorical filtering is anti-aSD here) - concept mode -> 0.1 (CLS neocortical semantic; soft +10% categorical hint to records inside top-3 gated communities) Pre-08 the gate was a HARD FILTER: `pipeline_recall` reduced `candidates` to records inside the top-3 communities. On a degenerate one-record-per-community graph (the cold-start bug class smoking gun in the published LongMemEval-S bench report) only 3 candidates survived; gold (12-24 records) could not. The redesign closes this by reading the candidate pool from cosine top-K_CANDIDATES instead, and applying a mode-dependent soft bias only at the Stage-5 ranking step. This fence catches both: (a) someone re-introducing a hard filter (test 1 below); (b) someone changing the bias constants or removing the mode dispatch (test 2 below). """ from __future__ import annotations from datetime import datetime, timezone from uuid import UUID, uuid4 import numpy as np import pytest from iai_mcp.community import CommunityAssignment from iai_mcp.graph import MemoryGraph from iai_mcp.pipeline import ( COMMUNITY_BIAS_CONCEPT, COMMUNITY_BIAS_VERBATIM, _gate_bias_for_mode, _recall_core, recall_for_benchmark, ) from iai_mcp.store import MemoryStore from iai_mcp.types import EMBED_DIM, MemoryRecord # --------------------------------------------------------------- test fixtures class _FakeEmbedder: """Stand-in embedder; cue's embedding is configurable.""" DIM = EMBED_DIM def __init__(self, vec: list[float] | None = None) -> None: self._vec = vec if vec is not None else [1.0] + [0.0] * (EMBED_DIM - 1) def embed(self, text: str) -> list[float]: return list(self._vec) def embed_batch(self, texts: list[str]) -> list[list[float]]: return [list(self._vec) for _ in texts] def _make(vec: list[float], text: str = "rec") -> MemoryRecord: now = datetime.now(timezone.utc) return MemoryRecord( id=uuid4(), tier="episodic", literal_surface=text, aaak_index="", embedding=vec, community_id=None, centrality=0.0, detail_level=2, pinned=False, stability=0.0, difficulty=0.0, last_reviewed=None, never_decay=False, never_merge=False, provenance=[], created_at=now, updated_at=now, tags=[], language="en", ) def _build_one_record_per_community( tmp_path, n: int = 50, ) -> tuple[MemoryStore, MemoryGraph, list[MemoryRecord], CommunityAssignment]: """Replicates the cold-start bug class: 50 records, 1 community each. Each record's embedding is the i-th unit basis vector in EMBED_DIM space, so all records are mutually orthogonal AND aligned to a distinct primary axis. The assignment is constructed directly (bypassing Leiden), placing each record in its OWN community whose centroid equals the record's embedding. This means the community nearest the cue (by centroid cosine) is the community containing the record nearest the cue (by record cosine). Mirrors the deleted tests/test_pipeline_community_gate_augment.py helper `_build_degenerate_graph_and_assignment` (Phase 8 patch era). Kept as a private helper here since the patch tests are gone. """ store = MemoryStore(path=tmp_path / "lancedb") recs: list[MemoryRecord] = [] for i in range(n): vec = [0.0] * EMBED_DIM vec[i % EMBED_DIM] = 1.0 rec = _make(vec, text=f"rec{i}") store.insert(rec) recs.append(rec) graph = MemoryGraph() for rec in recs: graph.add_node( rec.id, community_id=None, embedding=list(rec.embedding), ) graph._nx.nodes[str(rec.id)].update({ "embedding": list(rec.embedding), "surface": rec.literal_surface, "centrality": 0.0, "tier": rec.tier, "tags": [], "language": "en", }) # One record per community: centroid = record's embedding. cids = [uuid4() for _ in recs] centroids = {cids[i]: list(recs[i].embedding) for i in range(len(recs))} node_to_community = {recs[i].id: cids[i] for i in range(len(recs))} mid_regions = {cids[i]: [recs[i].id] for i in range(len(recs))} assignment = CommunityAssignment( node_to_community=node_to_community, community_centroids=centroids, modularity=0.0, backend="leiden-test-degenerate", top_communities=cids[:3], mid_regions=mid_regions, ) return store, graph, recs, assignment # ------------------------------------------------------------------- tests def test_records_outside_gated_communities_surface_via_cosine(tmp_path): """D-02 anti-hard-filter fence: gold OUTSIDE top-3 communities still surfaces. Build a 50-record fixture where each record has a distinct primary axis. The cue points at axis 5 (rec[5] is the gold). The community gate (top-3 by centroid cosine) returns the community of rec[5] plus two arbitrary others (the orthogonal axes all tie at cosine 0 so the secondary order is by stable-sort UUID — out of our control, but reliably NOT covering all 50 communities). The cue points at axis 5, NOT at axis 0; rec[5] is therefore in its own community (because each record is in its own community in this fixture). The cosine top-K pool surfaces rec[5] regardless of whether the gate's secondary picks happen to include it. Mode is "concept" so the +0.1*cos bias for top-3-gated records is active; the gold record (cosine 1.0 to the cue) wins on its raw cosine alone, even when the gate's bias goes to other communities. If a future change re-introduces a hard filter (pre-08 behavior where `candidates` are reduced to gate members only), this test fails: rec[5] has cosine 1.0 but only the 3 gated communities survive, and on the orthogonal-axes geometry the gate may rank rec[5]'s community OUTSIDE the top-3, dropping the gold record from the candidate pool. """ store, graph, recs, assignment = _build_one_record_per_community(tmp_path, n=50) # Cue points at axis 5; rec[5] has cosine 1.0; all others are 0. cue_vec = [0.0] * EMBED_DIM cue_vec[5] = 1.0 embedder = _FakeEmbedder(vec=cue_vec) resp = recall_for_benchmark( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=embedder, cue="cue at axis 5", session_id="s-gate-diag-1", k_hits=10, mode="concept", ) found_ids = {h.record_id for h in resp.hits} assert recs[5].id in found_ids, ( "D-02 violation: gold record (cosine 1.0 to cue, on axis 5) " "is NOT in top-10 hits. The gate must NEVER filter — only " "bias. If this fails, someone re-introduced the pre-Phase-8 " "hard-filter behavior (candidates restricted to top-3 " "gated-community members)." ) # Stronger version: the gold record is the TOP hit (cosine 1.0 vs # all others tied at 0; even with concept-mode +0.1*cos bias for # records in the gated set, the gold's cosine 1.0 beats anything # the bias can synthesize on a 0-cosine record). assert resp.hits[0].record_id == recs[5].id, ( f"gold should be top-1 by cosine alone (1.0 vs ~0); " f"got {resp.hits[0].record_id} as top hit. Possible cause: " "Stage 5 weights were re-tuned, or community-bias scalar is " "being applied multiplicatively/subtractively instead of " "additively to records inside the gated set." ) def test_mode_bias_verbatim_zero_concept_nonzero(tmp_path): """D-02 canonical fence: verbatim mode bias=0.0; concept mode bias=0.1. Records inside top-3 gated communities get a score bonus ONLY in concept mode. A record outside top-3 communities never gets the bonus regardless of mode. The same fixture is recalled in both modes; we assert: - Both calls return the SAME record list (gate never filters, only biases ranking). - In verbatim mode, the gated record's score reflects ZERO community contribution (cosine + AAAK + degree + age only). - In concept mode, the gated record's score is approximately `verbatim_score + 0.1 * cos` higher than its verbatim counterpart. - The non-gated control record's score is unchanged across modes (the bias only applies to records inside top-3 gated communities). This catches: (a) someone changing COMMUNITY_BIAS_VERBATIM away from 0.0 or COMMUNITY_BIAS_CONCEPT away from 0.1; (b) someone removing the `mode` dispatch from `_gate_bias_for_mode` or `_recall_core`'s Stage 5; (c) someone reintroducing a hard filter that drops non-gated records. Symbol-level pre-flight: `_gate_bias_for_mode("verbatim") == 0.0` and `_gate_bias_for_mode("concept") == 0.1` (constants intact). Fixture geometry — keep it simple to make scores byte-identical across the two modes for the non-bias terms: - All records have the SAME aaak (empty), SAME tier (episodic), SAME literal_surface length (so age, deg_norm contribute identically across records). - No edges in the graph -> max_deg = 0 -> log_max_deg = 0 -> deg_norm == 0 for every record -> W_DEGREE * deg_norm == 0. - No profile_state -> no per-record gain product. - No structural_weight -> no structural-similarity term. => base_s = W_COSINE * cos - W_AGE * age (everything else constant or zero across records). """ # Symbol-level pre-flight assertions — contract surface intact. assert COMMUNITY_BIAS_VERBATIM == 0.0 assert COMMUNITY_BIAS_CONCEPT == 0.1 assert _gate_bias_for_mode("verbatim") == 0.0 assert _gate_bias_for_mode("concept") == 0.1 assert _gate_bias_for_mode("unknown") == 0.0 # defensive default # Build a 50-record fixture: 1 record per community on distinct # primary axes (orthogonal). The cue points at axis 0; rec[0] sits # in community c0 whose centroid is the axis-0 unit vector — so the # gate places c0 first by centroid cosine. store, graph, recs, assignment = _build_one_record_per_community(tmp_path, n=50) # Cue points at axis 0 (matching rec[0]'s primary axis). cue_vec = [0.0] * EMBED_DIM cue_vec[0] = 1.0 embedder = _FakeEmbedder(vec=cue_vec) # Identify the GATED record (rec[0], in community c0 at top-1 by # centroid cosine) and the CONTROL record. The control is whichever # record sits in a community OUTSIDE the top-3 gated set; we pick # rec[20] (community c20, axis 20 — definitely orthogonal to cue, # cosine 0.0). We also pre-compute the gold expectation that rec[0] # gets the +0.1 community bonus in concept mode. rec_GATED = recs[0] rec_CONTROL = recs[20] # --- recall in verbatim mode --- result_v = _recall_core( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=embedder, cue="cue at axis 0", session_id="s-mode-bias-v", mode="verbatim", ) # --- recall in concept mode --- result_c = _recall_core( store=store, graph=graph, assignment=assignment, rich_club=[], embedder=embedder, cue="cue at axis 0", session_id="s-mode-bias-c", mode="concept", ) # --- 1. Same record list — the gate must NEVER filter. --- verbatim_ids = {h.record_id for h in result_v.scored_hits} concept_ids = {h.record_id for h in result_c.scored_hits} assert verbatim_ids == concept_ids, ( "D-02 fence: gate must NEVER filter; mode change should not " f"alter the record list. verbatim_only={verbatim_ids - concept_ids}, " f"concept_only={concept_ids - verbatim_ids}" ) # --- 2. Lookup GATED and CONTROL records' scores in both modes. --- v_gated = next(h for h in result_v.scored_hits if h.record_id == rec_GATED.id) c_gated = next(h for h in result_c.scored_hits if h.record_id == rec_GATED.id) v_ctrl = next(h for h in result_v.scored_hits if h.record_id == rec_CONTROL.id) c_ctrl = next(h for h in result_c.scored_hits if h.record_id == rec_CONTROL.id) # --- 3. Concept mode: GATED record gains COMMUNITY_BIAS_CONCEPT * cos. --- # The score delta is the *only* term that changes across modes for # the gated record (everything else is identical: same record, same # cue, same fixture, same time, same profile_state). The cosine of # rec_GATED to the cue is 1.0 (axis 0 vs axis-0 cue), so the # expected bonus is 0.1 * 1.0 == 0.1. cos_GATED = 1.0 expected_bonus = COMMUNITY_BIAS_CONCEPT * cos_GATED delta_gated = c_gated.score - v_gated.score assert delta_gated == pytest.approx(expected_bonus, abs=1e-4), ( f"D-02 concept mode: GATED record (rec[0], cosine={cos_GATED}) " f"should gain ~{expected_bonus:.4f} from " f"COMMUNITY_BIAS_CONCEPT * cos when transitioning verbatim -> " f"concept. Got delta = c_gated.score - v_gated.score = " f"{delta_gated:.4f}.\n" f"v_gated.score = {v_gated.score:.4f}; " f"c_gated.score = {c_gated.score:.4f}." ) # --- 4. CONTROL record (outside top-3 gated): score UNCHANGED. --- # rec_CONTROL is in c20 — definitely NOT in top_communities[:3] for # this fixture (c0/c1/c2 dominate by centroid cosine since cue is # at axis 0; the orthogonal-axes geometry sorts the rest by stable- # sort UUID). The control record's score must be byte-identical # across modes. delta_ctrl = c_ctrl.score - v_ctrl.score assert delta_ctrl == pytest.approx(0.0, abs=1e-6), ( f"D-02 concept mode: CONTROL record (rec[20], cosine 0 to cue, " f"in non-gated community c20) must NOT receive the community " f"bias. Got delta = c_ctrl.score - v_ctrl.score = {delta_ctrl:.6f}; " f"expected 0.0.\n" f"v_ctrl.score = {v_ctrl.score:.6f}; " f"c_ctrl.score = {c_ctrl.score:.6f}." ) # --- 5. Verbatim mode: bias contribution is identically zero. --- # In verbatim mode COMMUNITY_BIAS_VERBATIM == 0.0, so the gated # record's score does NOT receive any community contribution. # Because cosine for rec_CONTROL is 0 (axis 20 vs axis-0 cue) and # rec_GATED has cosine 1.0, the verbatim-mode score difference is # purely W_COSINE * (1.0 - 0.0) = W_COSINE — the cosine term alone. # No additive bias term sneaks in; all other contributions # (aaak, deg_norm, age) are identical by fixture construction. from iai_mcp.pipeline import W_COSINE expected_verbatim_delta = W_COSINE * (1.0 - 0.0) actual_verbatim_delta = v_gated.score - v_ctrl.score assert actual_verbatim_delta == pytest.approx( expected_verbatim_delta, abs=1e-4 ), ( f"D-02 verbatim mode: gated vs control score delta should be " f"W_COSINE * cos_diff = {W_COSINE} * 1.0 = {expected_verbatim_delta:.4f} " f"with NO community-bias contribution. Got delta = " f"{actual_verbatim_delta:.4f}. If this differs, either " f"COMMUNITY_BIAS_VERBATIM is non-zero, or the mode dispatch " f"in _gate_bias_for_mode is broken." )