Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
366 lines
16 KiB
Python
366 lines
16 KiB
Python
"""Phase 8 redesign (08-CONTEXT.md D-02): regression-fence — community
gate is a MODE-DEPENDENT diagnostic, not a hard filter.

The redesign's load-bearing claim has two parts:

  1. Records OUTSIDE the top-3 gated communities can still surface in
     `scored_hits[:K]` when their cosine rank is high. The gate never
     filters; it only biases.
  2. The bias is mode-dependent (D-02 grounded in CLS / EPF / HIPPEA /
     Ashby / Beer VSM):
       - verbatim mode -> 0.0 (HIPPEA pure / EPF literal / hippocampal
                               episodic; categorical filtering is
                               anti-aSD here)
       - concept mode  -> 0.1 (CLS neocortical semantic; soft +10%
                               categorical hint to records inside
                               top-3 gated communities)

Pre-08 the gate was a HARD FILTER: `pipeline_recall` reduced
`candidates` to records inside the top-3 communities. On a degenerate
one-record-per-community graph (the cold-start bug class smoking gun
in the published LongMemEval-S bench report) only 3 candidates
survived; gold (12-24 records) could not. The redesign closes this by
reading the candidate pool from cosine top-K_CANDIDATES instead, and
applying a mode-dependent soft bias only at the Stage-5 ranking step.

This fence catches both:
  (a) someone re-introducing a hard filter (test 1 below);
  (b) someone changing the bias constants or removing the mode
      dispatch (test 2 below).
"""
|
|
from __future__ import annotations

from datetime import datetime, timezone
from uuid import UUID, uuid4

import numpy as np
import pytest

from iai_mcp.community import CommunityAssignment
from iai_mcp.graph import MemoryGraph
from iai_mcp.pipeline import (
    COMMUNITY_BIAS_CONCEPT,
    COMMUNITY_BIAS_VERBATIM,
    W_COSINE,
    _gate_bias_for_mode,
    _recall_core,
    recall_for_benchmark,
)
from iai_mcp.store import MemoryStore
from iai_mcp.types import EMBED_DIM, MemoryRecord
|
|
|
|
|
|
# --------------------------------------------------------------- test fixtures
|
|
|
|
|
|
class _FakeEmbedder:
    """Embedder test double that answers every request with one fixed vector.

    The vector is chosen at construction time, which lets a test decide
    exactly where the cue lands in embedding space. The input text is
    ignored entirely.
    """

    DIM = EMBED_DIM

    def __init__(self, vec: list[float] | None = None) -> None:
        """Store ``vec``; when omitted, use the unit vector along axis 0."""
        if vec is None:
            vec = [1.0] + [0.0] * (EMBED_DIM - 1)
        self._vec = vec

    def embed(self, text: str) -> list[float]:
        """Return a fresh list copy of the configured vector."""
        # Copy so callers can mutate the result without touching our state.
        return list(self._vec)

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Return one independent copy of the configured vector per text."""
        return [self.embed(text) for text in texts]
|
|
|
|
|
|
def _make(vec: list[float], text: str = "rec") -> MemoryRecord:
    """Build an episodic ``MemoryRecord`` whose only varying parts are
    its id, embedding and literal surface.

    Args:
        vec: Embedding stored on the record as-is.
        text: Value for ``literal_surface``.

    Returns:
        A record with a fresh ``uuid4`` id, ``created_at`` and
        ``updated_at`` both set to the same current-UTC timestamp, and
        every remaining field set to a fixed neutral value (no
        community, zero centrality/stability/difficulty, no tags, no
        provenance, language ``"en"``).
    """
    stamp = datetime.now(timezone.utc)
    fields = dict(
        id=uuid4(),
        tier="episodic",
        literal_surface=text,
        aaak_index="",
        embedding=vec,
        community_id=None,
        centrality=0.0,
        detail_level=2,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        provenance=[],
        created_at=stamp,
        updated_at=stamp,
        tags=[],
        language="en",
    )
    return MemoryRecord(**fields)
|
|
|
|
|
|
def _build_one_record_per_community(
    tmp_path,
    n: int = 50,
) -> tuple[MemoryStore, MemoryGraph, list[MemoryRecord], CommunityAssignment]:
    """Build the degenerate "one record per community" fixture.

    Record ``i`` gets a one-hot embedding with its 1.0 on axis
    ``i % EMBED_DIM``, so (while ``n <= EMBED_DIM``) the records are
    mutually orthogonal. Each record is inserted into a store under
    ``tmp_path / "lancedb"`` and added as an edge-less node to a fresh
    graph. The community assignment is written by hand rather than
    computed: every record sits alone in its own community, that
    community's centroid is a copy of the record's embedding, and
    ``top_communities`` is simply the first three community ids.

    Args:
        tmp_path: Directory under which the store is created.
        n: Number of records (and therefore communities) to create.

    Returns:
        ``(store, graph, recs, assignment)`` with ``recs`` in creation
        order, so ``recs[i]`` is the record on axis ``i % EMBED_DIM``.
    """
    store = MemoryStore(path=tmp_path / "lancedb")

    recs: list[MemoryRecord] = []
    for idx in range(n):
        hot_axis = idx % EMBED_DIM
        one_hot = [0.0] * EMBED_DIM
        one_hot[hot_axis] = 1.0
        record = _make(one_hot, text=f"rec{idx}")
        store.insert(record)
        recs.append(record)

    graph = MemoryGraph()
    for record in recs:
        graph.add_node(
            record.id,
            community_id=None,
            embedding=list(record.embedding),
        )
        # Write the node attributes straight onto the underlying graph
        # node; the embedding gets its own list copy, separate from the
        # one handed to add_node above.
        node_attrs = {
            "embedding": list(record.embedding),
            "surface": record.literal_surface,
            "centrality": 0.0,
            "tier": record.tier,
            "tags": [],
            "language": "en",
        }
        graph._nx.nodes[str(record.id)].update(node_attrs)

    # One community per record; the centroid is the record's own embedding.
    cids = [uuid4() for _ in recs]
    paired = list(zip(cids, recs))
    centroids = {cid: list(record.embedding) for cid, record in paired}
    node_to_community = {record.id: cid for cid, record in paired}
    mid_regions = {cid: [record.id] for cid, record in paired}

    assignment = CommunityAssignment(
        node_to_community=node_to_community,
        community_centroids=centroids,
        modularity=0.0,
        backend="leiden-test-degenerate",
        top_communities=cids[:3],
        mid_regions=mid_regions,
    )
    return store, graph, recs, assignment
|
|
|
|
|
|
# ------------------------------------------------------------------- tests
|
|
|
|
|
|
def test_records_outside_gated_communities_surface_via_cosine(tmp_path):
    """D-02 anti-hard-filter fence: a perfect cosine match must surface.

    Fixture: 50 mutually orthogonal records, one per community, with
    the assignment's ``top_communities`` set to the first three
    community ids. The cue is the unit vector on axis 5, so ``recs[5]``
    (the gold record) has cosine 1.0 to the cue and every other record
    has cosine 0.

    Recall runs in ``"concept"`` mode with ``k_hits=10`` and the test
    asserts two things:

      1. The gold record is among the returned hits. This is the
         regression fence against a hard filter that restricts
         candidates to gated-community members.
      2. The gold record is the first hit. Per the D-02 design the
         concept-mode community bias is a small additive bonus, so it
         should not lift a cosine-0 record above a cosine-1.0 record.

    NOTE(review): whether the pipeline uses ``top_communities`` as
    given or re-ranks communities by centroid cosine is not visible
    from this file; the assertions are intended to hold either way.
    """
    store, graph, recs, assignment = _build_one_record_per_community(
        tmp_path, n=50,
    )
    gold = recs[5]

    # Unit vector on axis 5: cosine 1.0 to the gold record, 0 to the rest.
    axis5_cue = [0.0] * EMBED_DIM
    axis5_cue[5] = 1.0

    response = recall_for_benchmark(
        store=store,
        graph=graph,
        assignment=assignment,
        rich_club=[],
        embedder=_FakeEmbedder(vec=axis5_cue),
        cue="cue at axis 5",
        session_id="s-gate-diag-1",
        k_hits=10,
        mode="concept",
    )

    hit_ids = {hit.record_id for hit in response.hits}
    assert gold.id in hit_ids, (
        "D-02 violation: gold record (cosine 1.0 to cue, on axis 5) "
        "is NOT in top-10 hits. The gate must NEVER filter — only "
        "bias. If this fails, someone re-introduced the pre-Phase-8 "
        "hard-filter behavior (candidates restricted to top-3 "
        "gated-community members)."
    )

    # Stronger claim: the gold record ranks first, because a bias applied
    # to cosine-0 records should not be able to outscore a cosine of 1.0.
    top_hit = response.hits[0]
    assert top_hit.record_id == gold.id, (
        f"gold should be top-1 by cosine alone (1.0 vs ~0); "
        f"got {top_hit.record_id} as top hit. Possible cause: "
        "Stage 5 weights were re-tuned, or community-bias scalar is "
        "being applied multiplicatively/subtractively instead of "
        "additively to records inside the gated set."
    )
|
|
|
|
|
|
def _require_hit(scored_hits, record_id, label: str):
    """Return the first hit in ``scored_hits`` whose ``record_id`` matches.

    Raises:
        AssertionError: if no hit matches. The message names the missing
            record, so the failure is readable instead of surfacing as a
            bare ``StopIteration`` from ``next()``.
    """
    for hit in scored_hits:
        if hit.record_id == record_id:
            return hit
    raise AssertionError(
        f"{label} record {record_id} is missing from scored_hits; "
        "cannot compare its score across modes."
    )


def test_mode_bias_verbatim_zero_concept_nonzero(tmp_path):
    """D-02 canonical fence: verbatim mode bias=0.0; concept mode bias=0.1.

    The same degenerate fixture (50 orthogonal records, one per
    community, ``top_communities`` = first three community ids) is
    recalled twice through ``_recall_core`` with a cue on axis 0, once
    per mode. ``recs[0]`` is the GATED record (cosine 1.0 to the cue,
    its community is first in ``top_communities``); ``recs[20]`` is the
    CONTROL record (cosine 0, its community is not one of the first
    three).

    Checks, in order:
      0. Symbol-level pre-flight: both bias constants and
         ``_gate_bias_for_mode`` return the documented values, with an
         unknown mode falling back to 0.0.
      1. Both modes return the same set of record ids (the gate biases,
         it never filters).
      2. The gated record's score rises by
         ``COMMUNITY_BIAS_CONCEPT * cos`` (= 0.1) from verbatim to
         concept mode.
      3. The control record's score is the same in both modes.
      4. In verbatim mode, gated-minus-control equals ``W_COSINE``,
         i.e. only the cosine term separates the two records.

    NOTE(review): check 4 assumes every non-cosine score term (aaak,
    degree, age, profile gain, structural weight) is identical for the
    two records under this fixture — no graph edges, empty
    ``aaak_index``, same tier. Confirm against the scoring code in
    ``iai_mcp.pipeline`` if this check starts failing.
    """
    # --- 0. Contract surface intact. ---
    assert COMMUNITY_BIAS_VERBATIM == 0.0
    assert COMMUNITY_BIAS_CONCEPT == 0.1
    assert _gate_bias_for_mode("verbatim") == 0.0
    assert _gate_bias_for_mode("concept") == 0.1
    assert _gate_bias_for_mode("unknown") == 0.0  # defensive default

    store, graph, recs, assignment = _build_one_record_per_community(
        tmp_path, n=50,
    )

    # Cue on axis 0: cosine 1.0 to recs[0], 0 to every other record.
    cue_vec = [0.0] * EMBED_DIM
    cue_vec[0] = 1.0
    embedder = _FakeEmbedder(vec=cue_vec)

    rec_gated = recs[0]
    rec_control = recs[20]

    def recall(mode: str, session_id: str):
        """Run ``_recall_core`` on the shared fixture in the given mode."""
        return _recall_core(
            store=store,
            graph=graph,
            assignment=assignment,
            rich_club=[],
            embedder=embedder,
            cue="cue at axis 0",
            session_id=session_id,
            mode=mode,
        )

    result_v = recall("verbatim", "s-mode-bias-v")
    result_c = recall("concept", "s-mode-bias-c")

    # --- 1. Same record list — the gate must NEVER filter. ---
    verbatim_ids = {hit.record_id for hit in result_v.scored_hits}
    concept_ids = {hit.record_id for hit in result_c.scored_hits}
    assert verbatim_ids == concept_ids, (
        "D-02 fence: gate must NEVER filter; mode change should not "
        f"alter the record list. verbatim_only={verbatim_ids - concept_ids}, "
        f"concept_only={concept_ids - verbatim_ids}"
    )

    # Look up both records in both result sets. A missing record fails
    # with a descriptive AssertionError rather than a bare StopIteration.
    v_gated = _require_hit(result_v.scored_hits, rec_gated.id, "GATED (verbatim)")
    c_gated = _require_hit(result_c.scored_hits, rec_gated.id, "GATED (concept)")
    v_ctrl = _require_hit(result_v.scored_hits, rec_control.id, "CONTROL (verbatim)")
    c_ctrl = _require_hit(result_c.scored_hits, rec_control.id, "CONTROL (concept)")

    # --- 2. Concept mode: GATED record gains COMMUNITY_BIAS_CONCEPT * cos. ---
    # Same record, cue and fixture in both calls, so the mode-dependent
    # bias is the only term expected to change. The gated record's
    # cosine to the cue is 1.0, so the expected bonus is 0.1 * 1.0.
    cos_gated = 1.0
    expected_bonus = COMMUNITY_BIAS_CONCEPT * cos_gated
    delta_gated = c_gated.score - v_gated.score
    assert delta_gated == pytest.approx(expected_bonus, abs=1e-4), (
        f"D-02 concept mode: GATED record (rec[0], cosine={cos_gated}) "
        f"should gain ~{expected_bonus:.4f} from "
        "COMMUNITY_BIAS_CONCEPT * cos when transitioning verbatim -> "
        "concept. Got delta = c_gated.score - v_gated.score = "
        f"{delta_gated:.4f}.\n"
        f"v_gated.score = {v_gated.score:.4f}; "
        f"c_gated.score = {c_gated.score:.4f}."
    )

    # --- 3. CONTROL record (outside top-3 gated): score UNCHANGED. ---
    delta_ctrl = c_ctrl.score - v_ctrl.score
    assert delta_ctrl == pytest.approx(0.0, abs=1e-6), (
        "D-02 concept mode: CONTROL record (rec[20], cosine 0 to cue, "
        "in non-gated community c20) must NOT receive the community "
        f"bias. Got delta = c_ctrl.score - v_ctrl.score = {delta_ctrl:.6f}; "
        "expected 0.0.\n"
        f"v_ctrl.score = {v_ctrl.score:.6f}; "
        f"c_ctrl.score = {c_ctrl.score:.6f}."
    )

    # --- 4. Verbatim mode: bias contribution is identically zero. ---
    # With a zero bias, the gated-vs-control gap is expected to come from
    # the cosine term alone: W_COSINE * (1.0 - 0.0).
    expected_verbatim_delta = W_COSINE * (1.0 - 0.0)
    actual_verbatim_delta = v_gated.score - v_ctrl.score
    assert actual_verbatim_delta == pytest.approx(
        expected_verbatim_delta, abs=1e-4
    ), (
        "D-02 verbatim mode: gated vs control score delta should be "
        f"W_COSINE * cos_diff = {W_COSINE} * 1.0 = {expected_verbatim_delta:.4f} "
        "with NO community-bias contribution. Got delta = "
        f"{actual_verbatim_delta:.4f}. If this differs, either "
        "COMMUNITY_BIAS_VERBATIM is non-zero, or the mode dispatch "
        "in _gate_bias_for_mode is broken."
    )
|