"""Tests for migrate_reembed_to_current_dim. Contract: re-embed every record in the store under a target Embedder, even if the target dim differs from the current records-table schema dim. Rebuild the table in a staging location and atomically swap. Invariants preserved (constitutional): - literal_surface byte-for-byte identical before and after. - All non-embedding fields preserved (tags, tier, language, schema_version, s5_trust_score, detail_level, pinned, never_*, stability, difficulty, last_reviewed, provenance, profile_modulation_gain, structure_hv). - Idempotent: running with the same dim is a no-op and returns updated=0. - After migration: store.embed_dim == target_embedder.DIM. - After migration: retrieval at the new dim actually succeeds (no shape mismatch). """ from __future__ import annotations from datetime import datetime, timezone from uuid import UUID, uuid4 import pytest class _DimEmbedder: """Deterministic fake embedder with configurable dim. Turns text into a normalised vector by hashing char offsets into the target length.""" def __init__(self, dim: int): self.DIM = dim self.model_key = f"fake-dim-{dim}" def embed(self, text: str) -> list[float]: import math vec = [0.0] * self.DIM for i, ch in enumerate(text or ""): vec[i % self.DIM] += ord(ch) / 256.0 norm = math.sqrt(sum(x * x for x in vec)) or 1.0 return [x / norm for x in vec] def embed_batch(self, texts: list[str]) -> list[list[float]]: return [self.embed(t) for t in texts] def _fresh_store(tmp_path, dim: int, monkeypatch): """Make a MemoryStore at an explicit dim via env override. Env is torn down automatically by monkeypatch so other tests aren't polluted.""" monkeypatch.setenv("IAI_MCP_STORE", str(tmp_path / "iai")) monkeypatch.setenv("IAI_MCP_EMBED_DIM", str(dim)) from iai_mcp.store import MemoryStore return MemoryStore() def _seed_records(store, embedder, n: int = 3) -> list[UUID]: """Insert n deterministic records. Returns their ids.""" from iai_mcp.types import MemoryRecord ids = [] now = datetime.now(timezone.utc) for i in range(n): rid = uuid4() text = f"Record #{i} with literal surface content that must survive migration." rec = MemoryRecord( id=rid, tier="episodic", literal_surface=text, aaak_index="", embedding=embedder.embed(text), structure_hv=b"", community_id="", centrality=0.0, detail_level=1, pinned=False, stability=0.5, difficulty=0.3, last_reviewed=now, never_decay=False, never_merge=False, provenance=[{"ts": "2026-04-17T00:00:00+00:00", "cue": f"seed-{i}", "session_id": "seed"}], created_at=now, updated_at=now, tags=["test", "migration"], language="en", s5_trust_score=0.5, profile_modulation_gain={}, schema_version=4, ) store.insert(rec) ids.append(rid) return ids def test_reembed_upgrades_dim_and_preserves_all_non_embedding_fields(tmp_path, monkeypatch): """Start at 384d, migrate to 1024d, verify every field except embedding stays identical.""" src_embedder = _DimEmbedder(384) target_embedder = _DimEmbedder(1024) store = _fresh_store(tmp_path, 384, monkeypatch) assert store.embed_dim == 384 seeded_ids = _seed_records(store, src_embedder, n=3) pre = {rid: store.get(rid) for rid in seeded_ids} from iai_mcp.migrate import migrate_reembed_to_current_dim result = migrate_reembed_to_current_dim(store, target_embedder) assert result["target_dim"] == 1024 assert result["source_dim"] == 384 assert result["updated"] == 3 assert store.embed_dim == 1024 for rid in seeded_ids: post = store.get(rid) assert post is not None assert post.literal_surface == pre[rid].literal_surface, "literal_surface byte-identical" assert post.tier == pre[rid].tier assert post.tags == pre[rid].tags assert post.language == pre[rid].language assert post.schema_version == pre[rid].schema_version assert post.s5_trust_score == pre[rid].s5_trust_score assert post.pinned == pre[rid].pinned assert post.detail_level == pre[rid].detail_level assert post.never_decay == pre[rid].never_decay assert post.never_merge == pre[rid].never_merge assert post.provenance == pre[rid].provenance assert len(post.embedding) == 1024, "new embedding must have target dim" assert len(pre[rid].embedding) == 384, "old embedding was at source dim" assert post.embedding != pre[rid].embedding, "embedding must be re-computed" def test_reembed_idempotent_same_dim_no_op(tmp_path, monkeypatch): src = _DimEmbedder(384) store = _fresh_store(tmp_path, 384, monkeypatch) _seed_records(store, src, n=2) from iai_mcp.migrate import migrate_reembed_to_current_dim # Target matches current: should be no-op. result = migrate_reembed_to_current_dim(store, _DimEmbedder(384)) assert result["updated"] == 0 assert result["skipped"] == 2 or result.get("no_op") is True assert store.embed_dim == 384 def test_reembed_dry_run_reports_without_mutating(tmp_path, monkeypatch): src = _DimEmbedder(384) store = _fresh_store(tmp_path, 384, monkeypatch) seeded = _seed_records(store, src, n=2) from iai_mcp.migrate import migrate_reembed_to_current_dim result = migrate_reembed_to_current_dim(store, _DimEmbedder(1024), dry_run=True) assert result["would_update"] == 2 # Store unchanged after dry-run. assert store.embed_dim == 384 post = store.get(seeded[0]) assert len(post.embedding) == 384 def test_reembed_emits_migration_event(tmp_path, monkeypatch): from iai_mcp.events import query_events src = _DimEmbedder(384) store = _fresh_store(tmp_path, 384, monkeypatch) _seed_records(store, src, n=1) from iai_mcp.migrate import migrate_reembed_to_current_dim migrate_reembed_to_current_dim(store, _DimEmbedder(1024)) events = query_events(store, kind="migration_reembed", limit=5) assert len(events) >= 1 data = events[0]["data"] assert data.get("source_dim") == 384 assert data.get("target_dim") == 1024 assert data.get("updated") == 1