iai-mcp-opencode/src/iai_mcp/migrate.py
2026-05-12 16:45:15 +02:00

1977 lines
73 KiB
Python

"""D-35 schema/data migrations: v1 -> v2 re-embed, v2 -> v3 encryption, and
Plan 03-01 CONN-05 TEM factorization (v3 -> v4 column rename + structure_hv fill).
Plan 02-01 (v1 -> v2):
One-time batch migration that re-embeds every record with the
configured embedder (bge-small-en-v1.5 by default per Plan 05-08; bge-m3
remains opt-in via IAI_MCP_EMBED_MODEL), backfills the v2 fields with
their defaults, detects language via langdetect on literal_surface
for legacy provenance, and marks each record schema_version=2.
Plan 02-08 (v2 -> v3 data upgrade):
In-place AES-256-GCM encryption of literal_surface / provenance_json /
profile_modulation_gain_json on the records table, and data_json on the
events table. Runs lazily via `migrate_encryption_v2_to_v3(store)` and
is idempotent (skips rows that already carry the iai:enc:v1: prefix).
Plan 03-01 (v3 -> v4 TEM factorization):
Renames the LanceDB records column `hd_vector_json` (pa.string(), JSON-
encoded list[int]|None reservation slot from Phase 1/2) to `structure_hv`
(pa.binary(), packed D=10000 BSC bits = 1250 bytes per row). For stores
created on the new schema (the typical case after this plan ships), the
column name is already correct; the migration just (a) backfills any row
whose `structure_hv` is still empty bytes via `tem.bind_structure(record)`,
and (b) bumps schema_version from 3 to 4. Idempotent: rows already at v4
with a populated `structure_hv` are skipped.
Invariants preserved (constitutional):
- literal_surface is byte-for-byte preserved through ALL migrations.
- Provenance entries preserved.
- All flags (detail_level, pinned, never_merge, never_decay, etc.) unchanged.
- Tags list unchanged.
- CR-01: every WHERE/DELETE predicate routes through store._uuid_literal so
injection content cannot ride a poisoned UUID.
Idempotent: records that are already schema_version=2 are skipped by v1->v2.
Records whose sensitive columns already start with iai:enc:v1: are skipped
by v2->v3. Records that are already schema_version=4 with a non-empty
structure_hv are skipped by v3->v4.
Resumable: each record is committed individually via delete + insert. If the
process crashes mid-batch, re-running picks up where it left off.
Emits events of kind='migration_v1_to_v2', 'migration_v2_to_v3', and
'migration_v3_to_v4' (D-STORAGE).
CLI wrappers:
iai-mcp migrate --from=1 --to=2 [--dry-run] # (v1 -> v2)
iai-mcp migrate --from=2 --to=3 [--dry-run] # (encryption)
iai-mcp migrate --from=3 --to=4 [--dry-run] # (TEM factorization)
"""
from __future__ import annotations
import json
import logging
import os
import sys
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Callable, Optional
from uuid import UUID
from iai_mcp.crypto import encrypt_field, is_encrypted
from iai_mcp.events import write_event
from iai_mcp.store import (
EVENTS_TABLE,
MemoryStore,
RECORDS_TABLE,
_uuid_literal,
)
from iai_mcp.types import (
SCHEMA_VERSION_CURRENT,
SCHEMA_VERSION_LEGACY,
MemoryRecord,
)
log = logging.getLogger(__name__)
# Plan 07.11-03 / crash-safe reembed migration constants.
# `STAGING_TABLE` is the LanceDB table that receives re-embedded rows during
# the stage phase of the four-phase flow (stage -> validate -> atomic swap ->
# deferred cleanup). `OLD_TABLE_PREFIX` is the prefix of the timestamp-suffixed
# name given to the rolled-aside original records table after a successful
# swap. `PROGRESS_FILE` sits next to the LanceDB store and lets `--resume`
# pick up at the last successfully-staged row index after a crash.
STAGING_TABLE = "records_v_new"
OLD_TABLE_PREFIX = "records_old_"
PROGRESS_FILE = "migration_progress.json"
# Prior-key AES recovery uses its own staging table name, disjoint from the
# reembed staging table, so the state taxonomy of detect_partial_migration
# (which only looks at STAGING_TABLE / OLD_TABLE_PREFIX) stays unchanged.
CRYPTO_RECOVER_STAGING = "records_crypto_recover_stage"
def _db_table_names_set(db) -> set[str]:
"""LanceDB 0.30+ list_tables() paginated response vs legacy list."""
res = db.list_tables()
if hasattr(res, "tables"):
return set(res.tables)
return set(res)
def _detect_language(text: str) -> str:
"""Best-effort language detection; fall back to 'en' on low confidence."""
text = (text or "").strip()
if not text:
return "en"
try:
from langdetect import DetectorFactory, detect_langs
DetectorFactory.seed = 42
cands = detect_langs(text)
if cands and cands[0].prob >= 0.7:
return cands[0].lang
except Exception:
pass
return "en"
def migrate_v1_to_v2(
    store: MemoryStore,
    embedder: Optional["Embedder"] = None,
    dry_run: bool = False,
    progress: Optional[Callable[[int, int], None]] = None,
) -> dict:
    """Re-embed + language-tag + default-backfill every v1 record.

    Parameters
    ----------
    store:
        Open MemoryStore. Migration rewrites in-place via delete+insert per
        record.
    embedder:
        Embedder instance; when omitted, ``embedder_for_store(store)`` is
        used. The store's records table schema must match the embedder's
        DIM; if they differ, the caller is responsible for passing an
        embedder whose dimension matches the table.
    dry_run:
        If True, counts what would be migrated without mutating the store
        and without emitting a migration event.
    progress:
        Optional callable(idx, total) invoked before each record migration
        so CLI / external tooling can render a progress bar. Exceptions it
        raises are ignored.

    Returns
    -------
    dict
        ``records_migrated``, ``skipped``, ``duration_sec``,
        ``previous_model`` and ``new_model``.

    Raises
    ------
    Exception
        Whatever ``emb.embed`` / ``tbl.delete`` / ``store.insert`` raise.
        When the insert of the upgraded row fails after the v1 row was
        already deleted, a best-effort re-insert of the original record is
        attempted before the error is re-raised.
    """
    t0 = time.time()
    if embedder is not None:
        emb = embedder
    else:
        from iai_mcp.embed import embedder_for_store

        emb = embedder_for_store(store)
    all_records = store.all_records()
    v1_records = [r for r in all_records if r.schema_version == SCHEMA_VERSION_LEGACY]
    total = len(v1_records)
    migrated = 0
    for idx, record in enumerate(v1_records):
        if progress is not None:
            try:
                progress(idx, total)
            except Exception:
                # A broken progress callback must not abort the migration.
                pass
        if dry_run:
            # Count only; skip language detection and embedding entirely.
            migrated += 1
            continue
        # Keep an existing non-blank language tag; otherwise detect it from
        # the literal surface.
        if record.language and record.language.strip():
            new_lang = record.language
        else:
            new_lang = _detect_language(record.literal_surface)
        # Re-embed with the configured model. If the embedder's DIM differs
        # from the store's current schema, insert will raise; callers must
        # pass an embedder that matches the table.
        new_embedding = emb.embed(record.literal_surface)
        updated = MemoryRecord(
            id=record.id,
            tier=record.tier,
            literal_surface=record.literal_surface,  # verbatim preserved
            aaak_index=record.aaak_index,
            embedding=new_embedding,
            structure_hv=record.structure_hv,
            community_id=record.community_id,
            centrality=record.centrality,
            detail_level=record.detail_level,
            pinned=record.pinned,
            stability=record.stability,
            difficulty=record.difficulty,
            last_reviewed=record.last_reviewed,
            never_decay=record.never_decay,
            never_merge=record.never_merge,
            provenance=record.provenance,
            created_at=record.created_at,
            updated_at=record.updated_at,
            tags=record.tags,
            language=new_lang,
            s5_trust_score=0.5,
            profile_modulation_gain={},
            schema_version=SCHEMA_VERSION_CURRENT,
        )
        # Delete old v1 row, insert new v2 row. The id is routed through
        # _uuid_literal so the DELETE predicate cannot carry injected SQL.
        tbl = store.db.open_table(RECORDS_TABLE)
        tbl.delete(f"id = '{_uuid_literal(record.id)}'")
        try:
            store.insert(updated)
        except Exception:
            # The v1 row is already gone at this point. Try to put the
            # original back so a failed insert (e.g. embedding-dimension
            # mismatch) does not silently drop the record, then surface the
            # original error to the caller.
            try:
                store.insert(record)
            except Exception:
                log.exception(
                    "migrate_v1_to_v2_restore_failed",
                    extra={"record_id": str(record.id)},
                )
            raise
        migrated += 1
    duration_sec = time.time() - t0
    # Emit a single migration event, but only for real (non-dry-run) runs
    # that actually migrated at least one record.
    if not dry_run and migrated > 0:
        write_event(
            store,
            kind="migration_v1_to_v2",
            data={
                "record_count": migrated,
                "duration_sec": duration_sec,
            },
            severity="info",
        )
    return {
        "records_migrated": migrated,
        "skipped": max(0, len(all_records) - total),
        "duration_sec": duration_sec,
        # NOTE(review): hard-coded label; it is not read from the store.
        "previous_model": "bge-small-en-v1.5",
        "new_model": emb.model_key,
    }
def _records_schema_at_dim(dim: int) -> "pa.Schema":
    """Build the records-table Arrow schema for an explicit embedding width.

    The staged-swap reembed migration creates its staging table at a
    dimension that differs from the live store's, so the schema is built
    here with ``dim`` as the fixed size of the ``embedding`` list column.
    NOTE(review): the column list is meant to mirror the one in
    ``MemoryStore._ensure_tables`` — keep the two in sync.
    """
    import pyarrow as pa

    text = pa.string()
    real = pa.float32()
    whole = pa.int32()
    flag = pa.bool_()
    stamp = pa.timestamp("us", tz="UTC")
    columns = [
        ("id", text),
        ("tier", text),
        ("literal_surface", text),
        ("aaak_index", text),
        ("embedding", pa.list_(pa.float32(), dim)),
        ("structure_hv", pa.binary()),
        ("community_id", text),
        ("centrality", real),
        ("detail_level", whole),
        ("pinned", flag),
        ("stability", real),
        ("difficulty", real),
        ("last_reviewed", stamp),
        ("never_decay", flag),
        ("never_merge", flag),
        ("provenance_json", text),
        ("created_at", stamp),
        ("updated_at", stamp),
        ("tags_json", text),
        ("language", text),
        ("s5_trust_score", real),
        ("profile_modulation_gain_json", text),
        ("schema_version", whole),
    ]
    return pa.schema(columns)
def _progress_path(store: MemoryStore) -> Path:
    """Return the location of the migration progress checkpoint file.

    The file is named ``PROGRESS_FILE`` and lives directly under
    ``store.root``.
    """
    root = Path(store.root)
    return root / PROGRESS_FILE
def _progress_read(store: MemoryStore) -> dict:
    """Self-healing reader for `migration_progress.json`.

    Returns `{}` when the file is missing, unreadable, not valid JSON, or
    valid JSON that is not an object (e.g. a list or ``null``). Callers MUST
    tolerate an empty dict as "no checkpoint, start from row 0".
    """
    try:
        # EAFP: a missing file raises FileNotFoundError (an OSError), so no
        # separate exists() check is needed and there is no check/read race.
        state = json.loads(_progress_path(store).read_text())
    except (OSError, json.JSONDecodeError, ValueError):
        return {}
    # Honour the declared return type even for well-formed non-object JSON.
    return state if isinstance(state, dict) else {}
def _progress_write(store: MemoryStore, state: dict) -> None:
    """Atomically write *state* to `migration_progress.json`.

    Uses the tempfile + fsync + ``os.replace`` pattern: the JSON is written
    to a temp file in the target directory, flushed and fsynced, restricted
    to mode 0o600, then moved over the target in a single ``os.replace``.

    Raises whatever the underlying file operations or ``json.dump`` raise;
    the temp file is removed first. Cleanup also runs for KeyboardInterrupt
    and SystemExit, so an interrupted migration does not leave stray
    ``.migration-progress.*.tmp`` files next to the store.
    """
    target = _progress_path(store)
    target.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp = tempfile.mkstemp(
        prefix=".migration-progress.",
        suffix=".tmp",
        dir=str(target.parent),
    )
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(state, f, indent=2)
            f.flush()
            os.fsync(f.fileno())
        os.chmod(tmp, 0o600)
        os.replace(tmp, target)
    except BaseException:
        # BaseException (not Exception) so Ctrl-C / SystemExit mid-write
        # still removes the temp file before propagating.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise
def _progress_clear(store: MemoryStore) -> None:
    """Remove the progress checkpoint file if it exists; never raises OSError."""
    checkpoint = _progress_path(store)
    try:
        checkpoint.unlink()
    except OSError:
        # Covers a missing file (FileNotFoundError is an OSError) as well as
        # permission errors / odd FS states — don't crash the migration.
        pass
def _stage_record_to_table(
    store: MemoryStore,
    target_tbl,
    rec: MemoryRecord,
    new_embedding: list[float],
) -> None:
    """Append one re-embedded copy of *rec* to *target_tbl*.

    A new ``MemoryRecord`` is built that is field-for-field equal to *rec*
    except for ``embedding``, which is replaced by *new_embedding*. It is
    converted with ``store._to_row`` and added to *target_tbl* (rather than
    the live records table).

    If ``rec.structure_hv`` is empty it is first filled in via
    ``tem.bind_structure(rec)``; note this assigns onto *rec* itself, so the
    caller's record is updated too.
    NOTE(review): field encryption is presumably applied inside
    ``store._to_row`` — confirm against store.py.
    """
    if not rec.structure_hv:
        from iai_mcp.tem import bind_structure

        rec.structure_hv = bind_structure(rec)
    fields = {
        "id": rec.id,
        "tier": rec.tier,
        "literal_surface": rec.literal_surface,  # verbatim
        "aaak_index": rec.aaak_index,
        "embedding": new_embedding,
        "structure_hv": rec.structure_hv,
        "community_id": rec.community_id,
        "centrality": rec.centrality,
        "detail_level": rec.detail_level,
        "pinned": rec.pinned,
        "stability": rec.stability,
        "difficulty": rec.difficulty,
        "last_reviewed": rec.last_reviewed,
        "never_decay": rec.never_decay,
        "never_merge": rec.never_merge,
        "provenance": rec.provenance,
        "created_at": rec.created_at,
        "updated_at": rec.updated_at,
        "tags": rec.tags,
        "language": rec.language,
        "s5_trust_score": rec.s5_trust_score,
        "profile_modulation_gain": rec.profile_modulation_gain,
        "schema_version": rec.schema_version,
    }
    staged = MemoryRecord(**fields)
    row = store._to_row(staged)
    target_tbl.add([row])
def _stage_loop(
    store: MemoryStore,
    target_embedder,
    target_dim: int,
    target_tbl,
    source_iter,
    *,
    total: int,
    started_at_iso: str,
    started_idx: int = 0,
    already_staged_ids: Optional[set[str]] = None,
    progress: Optional[Callable[[int, int], None]] = None,
) -> tuple[int, list[str]]:
    """Stage every record from *source_iter* into *target_tbl*.

    For each record whose id is not in *already_staged_ids*: call
    ``progress(position, total)`` (errors ignored), re-embed
    ``literal_surface`` with *target_embedder*, append the row via
    ``_stage_record_to_table``, then rewrite the progress checkpoint with
    ``_progress_write``. The position counter starts at *started_idx* and
    advances once per attempted record (staged or failed), not for skipped
    ones.

    Ordinary exceptions from embedding/staging are logged, recorded in the
    failures list and do not stop the loop. KeyboardInterrupt / SystemExit
    are not ``Exception`` subclasses and therefore propagate to the caller,
    as does any error raised by the checkpoint write.

    Returns ``(staged_count, failures)`` where ``failures`` holds the id
    strings of records that could not be staged in this run.
    """
    failures: list[str] = []
    staged_ids: list[str] = list(already_staged_ids or [])
    # Snapshot of ids staged by a previous run; ids staged during this run
    # are deliberately not added to it.
    prior_ids: set[str] = set(staged_ids)
    staged_count = 0
    position = started_idx
    for rec in source_iter:
        rid = str(rec.id)
        if rid in prior_ids:
            # Already in the staging table from a prior run.
            continue
        if progress is not None:
            try:
                progress(position, total)
            except Exception:
                pass
        try:
            vector = target_embedder.embed(rec.literal_surface)
            _stage_record_to_table(store, target_tbl, rec, vector)
        except Exception as exc:
            # Recoverable per-row failure: note it and carry on. A mid-flight
            # kill (KeyboardInterrupt / SystemExit) is not caught here.
            log.warning(
                "migrate_reembed_per_row_failed",
                extra={
                    "record_id": rid,
                    "error": str(exc)[:160],
                },
            )
            failures.append(rid)
        else:
            staged_count += 1
            staged_ids.append(rid)
            # Atomic checkpoint write — every successful row.
            checkpoint = {
                "started_at": started_at_iso,
                "ts": int(time.time()),
                "row_index": position,
                "last_rid": rid,
                "total": total,
                "target_dim": target_dim,
                "target_model_key": getattr(target_embedder, "model_key", "unknown"),
                "staged_ids": staged_ids,
                "failures": failures,
            }
            _progress_write(store, checkpoint)
        position += 1
    return staged_count, failures
def _lancedb_root(db) -> Path:
"""Resolve the on-disk root of the LanceDB connection.
Tables live as `<name>.lance` directories under this root. Used by the
filesystem-level atomic-swap fallback (LanceDB 0.30.2 OSS does NOT
implement `db.rename_table` — calling it raises `NotImplementedError:
rename_table is not supported in LanceDB OSS` despite the method
existing on the connection object). The fallback uses `os.replace` on
the table directories — POSIX `rename(2)` semantics on the same
filesystem give us the atomicity LanceDB OSS withholds.
"""
return Path(db.uri)
def _swap_tables_filesystem(db, *, source: str, dest: str) -> None:
    """Rename the table directory ``<source>.lance`` to ``<dest>.lance``.

    Both paths are resolved under ``_lancedb_root(db)`` and the move is a
    single ``os.replace`` call. The destination slot should be absent or
    empty; when swapping two tables, the caller must move the current
    occupant aside first (A -> A_old, then B -> A).

    Raises ``OSError`` (from ``os.replace``) if the rename cannot be done.
    """
    base = _lancedb_root(db)
    os.replace(base / f"{source}.lance", base / f"{dest}.lance")
def _validate_and_swap(
    store: MemoryStore,
    *,
    source_dim: int,
    target_dim: int,
    target_embedder,
    staged_count: int,
    failures: list[str],
    duration_sec: float,
) -> dict:
    """Validate the staging table, then swap it in as the records table.

    Steps:
      1. Validate: refuse to swap if the staging table holds fewer than 99%
         of the live table's rows (gross-mismatch guard).
      2. Emit a ``migration_reembed`` event BEFORE the rename so the audit
         trail exists even if the process dies mid-rename. A failure to
         write the event is logged and does not block the swap.
      3. Swap via filesystem renames (``_swap_tables_filesystem``):
         ``records`` -> ``records_old_<ts>``, then ``records_v_new`` ->
         ``records``. If the second rename raises, the first is undone
         (best-effort) so ``records`` is not left missing, and the error is
         re-raised.
      4. Set ``store._embed_dim`` to *target_dim* and drop the progress
         checkpoint.

    Returns a summary dict (``source_dim``, ``target_dim``, ``updated``,
    ``skipped``, ``failures``, ``duration_sec``, ``old_table``).

    Raises
    ------
    RuntimeError
        When the staged row count fails the 99% gate; both tables are left
        as they were.
    OSError
        When a filesystem rename fails.
    """
    orig = store.db.open_table(RECORDS_TABLE).count_rows()
    staged = store.db.open_table(STAGING_TABLE).count_rows()
    if orig > 0 and staged < orig * 0.99:
        ratio = staged / max(orig, 1)
        log.error(
            "migrate_reembed_validate_failed",
            extra={
                "orig": orig,
                "staged": staged,
                "ratio": ratio,
                "failures": len(failures),
            },
        )
        raise RuntimeError(
            f"reembed staging produced {staged}/{orig} rows "
            f"({ratio:.3%}); refusing to swap. Inspect tables "
            f"manually or run `iai-mcp migrate --rollback`."
        )
    # Emit BEFORE rename so the audit trail survives a mid-rename crash.
    try:
        write_event(
            store,
            kind="migration_reembed",
            data={
                "source_dim": source_dim,
                "target_dim": target_dim,
                "updated": staged_count,
                "duration_sec": duration_sec,
                "target_model_key": getattr(target_embedder, "model_key", "unknown"),
                "failures": len(failures),
            },
            severity="info",
        )
    except Exception:
        # Still best-effort, but leave a trace instead of failing silently.
        log.warning("migrate_reembed_event_emit_failed", exc_info=True)
    ts = int(time.time())
    old_name = f"{OLD_TABLE_PREFIX}{ts}"
    # Step 1: records -> records_old_<ts> (frees the `records` slot).
    _swap_tables_filesystem(store.db, source=RECORDS_TABLE, dest=old_name)
    # Step 2: records_v_new -> records.
    try:
        _swap_tables_filesystem(store.db, source=STAGING_TABLE, dest=RECORDS_TABLE)
    except Exception:
        # The `records` slot is empty right now. Move the original back so
        # the store stays usable, then surface the original error.
        try:
            _swap_tables_filesystem(store.db, source=old_name, dest=RECORDS_TABLE)
        except Exception:
            log.exception(
                "migrate_reembed_swap_restore_failed",
                extra={"old_table": old_name},
            )
        raise
    # Refresh the in-memory dim binding so later inserts against the swapped
    # table are checked against the new dimension.
    store._embed_dim = target_dim
    # Drop the progress checkpoint; the rolled-aside table is left in place
    # for a later cleanup pass.
    _progress_clear(store)
    return {
        "source_dim": source_dim,
        "target_dim": target_dim,
        "updated": staged_count,
        "skipped": 0,
        "failures": len(failures),
        "duration_sec": duration_sec,
        "old_table": old_name,
    }
def migrate_reembed_to_current_dim(
    store: MemoryStore,
    target_embedder,
    dry_run: bool = False,
    progress: Optional[Callable[[int, int], None]] = None,
) -> dict:
    """Crash-safe re-embed migration (Plan 07.11-03 / four-phase flow).

    Re-embeds every record under *target_embedder* using
    stage -> validate -> atomic swap -> deferred cleanup, so that the live
    `records` table is not modified until the swap.

    Phase 1 (stage):
      - Drop any pre-existing `records_v_new` (defensive).
      - Create `records_v_new` with the records schema at ``target_dim``.
      - Iterate `store.iter_records()`; re-embed each record and append it
        to `records_v_new` (see `_stage_loop`).
      - After every successfully staged row, `migration_progress.json` is
        rewritten atomically.
      - Per-row exceptions are logged + counted; KeyboardInterrupt /
        SystemExit propagate.
    Phase 2 (validate):
      - `staged >= orig * 0.99` gate (allows up to 1% per-row failure).
      - Below that, RuntimeError is raised and both tables are left in
        place for inspection or `iai-mcp migrate --rollback`.
    Phase 3 (atomic swap):
      - Performed by `_validate_and_swap` with filesystem-level
        `os.replace` on the table directories: `records` ->
        `records_old_<ts>`, then `records_v_new` -> `records`.
      - `migration_reembed` is emitted BEFORE the renames.
      - `store._embed_dim` is set to ``target_dim`` and
        `migration_progress.json` is removed.
    Phase 4 (deferred cleanup):
      - `records_old_<ts>` is retained by this function;
        `detect_partial_migration` reports it as `needs_cleanup`.

    No-op: when the store's embedding dimension already equals the target
    embedder's ``DIM`` the store is left untouched and ``no_op=True`` is
    returned (a `migration_reembed` event with ``no_op`` is emitted,
    best-effort). Only the dimensions are compared — a different model with
    the same dimension is also treated as a no-op.

    Parameters
    ----------
    store:
        Open MemoryStore.
    target_embedder:
        Object providing ``DIM``, ``embed(text)`` and optionally
        ``model_key``.
    dry_run:
        When True (and dimensions differ), return a ``would_update`` count
        without touching the store.
    progress:
        Optional callable(idx, total) invoked before each row is embedded.

    Returns
    -------
    dict
        The no-op summary, the dry-run summary, or the result of
        `_validate_and_swap`.
    """
    t0 = time.time()
    source_dim = int(store.embed_dim)
    target_dim = int(target_embedder.DIM)
    started_at_iso = datetime.now(timezone.utc).isoformat()
    # Fast path: nothing to do when the dimensions already agree.
    if source_dim == target_dim:
        # Emit a no-op event so an idempotent rerun is witnessable.
        try:
            write_event(
                store,
                kind="migration_reembed",
                data={
                    "source_dim": source_dim,
                    "target_dim": target_dim,
                    "updated": 0,
                    "no_op": True,
                    "duration_sec": time.time() - t0,
                    "target_model_key": getattr(
                        target_embedder, "model_key", "unknown"
                    ),
                },
                severity="info",
            )
        except Exception:
            # Best-effort audit event; record the failure instead of hiding it.
            log.warning("migrate_reembed_event_emit_failed", exc_info=True)
        return {
            "source_dim": source_dim,
            "target_dim": target_dim,
            "updated": 0,
            "skipped": store.db.open_table(RECORDS_TABLE).count_rows(),
            "no_op": True,
            "duration_sec": time.time() - t0,
        }
    if dry_run:
        return {
            "source_dim": source_dim,
            "target_dim": target_dim,
            "would_update": store.db.open_table(RECORDS_TABLE).count_rows(),
            "duration_sec": time.time() - t0,
        }
    # Phase 1 — stage.
    # Defensive drop of any pre-existing staging table. Table names are read
    # through the module's _db_table_names_set helper, like the other
    # migrations in this file.
    if STAGING_TABLE in _db_table_names_set(store.db):
        store.db.drop_table(STAGING_TABLE)
    target_tbl = store.db.create_table(
        STAGING_TABLE, schema=_records_schema_at_dim(target_dim)
    )
    total = store.db.open_table(RECORDS_TABLE).count_rows()
    source_iter = store.iter_records()
    staged_count, failures = _stage_loop(
        store,
        target_embedder,
        target_dim,
        target_tbl,
        source_iter,
        total=total,
        started_at_iso=started_at_iso,
        progress=progress,
    )
    # Phases 2-3 (validate + atomic swap); cleanup of the old table is deferred.
    duration_sec = time.time() - t0
    return _validate_and_swap(
        store,
        source_dim=source_dim,
        target_dim=target_dim,
        target_embedder=target_embedder,
        staged_count=staged_count,
        failures=failures,
        duration_sec=duration_sec,
    )
# ---------------------------------------------------------------------------
# Plan 07.11-03 / boot-time partial-migration detector + rollback /
# resume entry points. The detector runs at daemon boot BEFORE ready-state
# advertisement (see daemon.py main() — the wire-up makes the rollback
# handler actually fire, closing the V2-07 anti-pattern of declared-but-
# unwired knobs).
# ---------------------------------------------------------------------------
def detect_partial_migration(db) -> dict:
    """Classify the store by which reembed-migration tables are present.

    Only ``db.table_names()`` is consulted; nothing is modified. The result
    always carries a ``state`` key:

    - "clean": neither the staging table nor any ``records_old_*`` table.
    - "needs_rollback": staging table present together with ``records``
      (staging did not complete), OR staging + ``records_old_*`` present
      with ``records`` absent (swap interrupted between the two renames).
    - "partial_swap_inconsistent": staging present, but neither ``records``
      nor any ``records_old_*`` table.
    - "needs_cleanup": no staging, ``records`` present, and at least one
      ``records_old_*`` table left over from a completed swap.
    - "unknown": any other combination (``records_old_*`` present with
      neither staging nor ``records``).

    Apart from "clean", results also include ``old_tables`` (sorted) and
    either a ``reason`` string or, for "unknown", the raw presence flags.
    """
    names = set(db.table_names())
    has_records = RECORDS_TABLE in names
    has_staging = STAGING_TABLE in names
    old_tables = sorted(n for n in names if n.startswith(OLD_TABLE_PREFIX))

    if has_staging:
        if has_records:
            return {
                "state": "needs_rollback",
                "old_tables": old_tables,
                "reason": (
                    "records_v_new present alongside records — staging did not "
                    "complete; recover by dropping records_v_new (rollback) or "
                    "resuming from migration_progress.json."
                ),
            }
        if old_tables:
            return {
                "state": "needs_rollback",
                "old_tables": old_tables,
                "reason": (
                    "records_v_new + records_old_<ts> present, records absent — "
                    "swap interrupted between renames; rollback from records_old_<ts>."
                ),
            }
        return {
            "state": "partial_swap_inconsistent",
            "staging": STAGING_TABLE,
            "old_tables": old_tables,
            "reason": (
                "records_v_new present but neither records nor records_old_<ts> "
                "exist; manual recovery required."
            ),
        }

    # From here on there is no staging table.
    if not old_tables:
        return {"state": "clean"}
    if has_records:
        return {
            "state": "needs_cleanup",
            "old_tables": old_tables,
            "reason": "successful swap from prior boot; drop old tables.",
        }
    return {
        "state": "unknown",
        "has_records": has_records,
        "has_staging": has_staging,
        "old_tables": old_tables,
    }
def _decrypt_field_try_keys(
    ciphertext: str,
    record_id: UUID,
    keys: list[bytes],
) -> str:
    """Decrypt an encrypted field, trying each key in order.

    Values that ``is_encrypted`` does not recognise are returned unchanged
    (``None``/empty becomes ``""``). Otherwise each candidate key is tried
    with the record's UUID literal as associated data; the first successful
    plaintext is returned. Keys that are ``None`` or not ``KEY_BYTES`` long
    are skipped.

    Raises
    ------
    InvalidTag, ValueError
        The error from the last key tried, when every usable key failed.
    ValueError
        When *keys* contains no usable key at all.
    """
    from cryptography.exceptions import InvalidTag
    from iai_mcp.crypto import KEY_BYTES, decrypt_field

    if not is_encrypted(ciphertext):
        return str(ciphertext or "")
    ad = _uuid_literal(record_id).encode("ascii")
    last_exc: Optional[Exception] = None
    for key in keys:
        # Use the crypto module's key-length constant rather than a literal,
        # matching the prior-key validation elsewhere in this module.
        if key is None or len(key) != KEY_BYTES:
            continue
        try:
            return decrypt_field(ciphertext, key, associated_data=ad)
        except (InvalidTag, ValueError) as exc:
            last_exc = exc
    if last_exc is not None:
        raise last_exc
    raise ValueError("no valid keys supplied for decrypt")
def _memory_record_from_raw_row_multikey(
    store: MemoryStore,
    row: dict,
    keys: list[bytes],
) -> MemoryRecord:
    """Build a MemoryRecord from a raw records-table row dict.

    The three possibly-encrypted columns (``literal_surface``,
    ``provenance_json``, ``profile_modulation_gain_json``) are decrypted
    with ``_decrypt_field_try_keys`` using *keys* in order. Missing/null
    columns fall back to defaults (empty bytes, ``None``, 0.0, ``"en"``,
    etc.). *store* is accepted for signature compatibility and is not used.

    Raises whatever ``_decrypt_field_try_keys`` raises when a field cannot
    be decrypted with any key, and ``ValueError`` for a malformed ``id`` /
    ``community_id``.
    """
    import pandas as pd
    from uuid import UUID as _UUID

    row_uuid = _UUID(row["id"])

    # structure_hv: only real bytes are kept; anything else becomes empty.
    structure_raw = row.get("structure_hv")
    if isinstance(structure_raw, (bytes, bytearray)):
        structure_hv = bytes(structure_raw)
    else:
        structure_hv = b""

    community_raw = row.get("community_id") or ""
    community_id = _UUID(community_raw) if community_raw else None

    raw_version = row.get("schema_version")
    try:
        schema_version = (
            int(raw_version) if raw_version is not None else SCHEMA_VERSION_CURRENT
        )
    except (TypeError, ValueError):
        schema_version = SCHEMA_VERSION_CURRENT

    # A v1 row with an empty language is built with a placeholder and reset
    # to "" after construction (see the end of this function).
    # NOTE(review): presumably MemoryRecord rejects an empty language at
    # construction time — confirm against types.py.
    lang_raw = row.get("language")
    is_empty_language = lang_raw is None or (
        isinstance(lang_raw, str) and lang_raw == ""
    )
    if is_empty_language and schema_version == 1:
        language = "__LEGACY_EMPTY__"
    elif is_empty_language:
        language = "en"
    else:
        language = str(lang_raw)

    s5_raw = row.get("s5_trust_score")
    s5_trust_score = float(s5_raw) if s5_raw is not None else 0.5

    gain_raw = row.get("profile_modulation_gain_json") or "{}"
    gain_plain = _decrypt_field_try_keys(str(gain_raw), row_uuid, keys)
    try:
        profile_modulation_gain = json.loads(gain_plain) or {}
    except (TypeError, json.JSONDecodeError):
        profile_modulation_gain = {}

    last_reviewed_raw = row.get("last_reviewed")
    try:
        last_reviewed = None if pd.isna(last_reviewed_raw) else last_reviewed_raw
    except (TypeError, ValueError):
        last_reviewed = last_reviewed_raw

    # `or ""` (not a .get default) so a NULL cell becomes an empty string
    # instead of the literal text "None" after str().
    literal_raw = row.get("literal_surface") or ""
    literal_plain = _decrypt_field_try_keys(str(literal_raw), row_uuid, keys)

    provenance_raw = row.get("provenance_json") or "[]"
    provenance_plain = _decrypt_field_try_keys(str(provenance_raw), row_uuid, keys)
    try:
        provenance_list = json.loads(provenance_plain) if provenance_plain else []
    except (TypeError, json.JSONDecodeError):
        provenance_list = []

    rec = MemoryRecord(
        id=row_uuid,
        tier=row.get("tier", "episodic"),
        literal_surface=literal_plain,
        aaak_index=row.get("aaak_index") or "",
        embedding=(
            list(row["embedding"])
            if row.get("embedding") is not None
            else []
        ),
        community_id=community_id,
        centrality=float(row.get("centrality", 0.0) or 0.0),
        detail_level=int(row.get("detail_level", 1)),
        pinned=bool(row.get("pinned", False)),
        stability=float(row.get("stability") or 0.0),
        difficulty=float(row.get("difficulty") or 0.0),
        last_reviewed=last_reviewed,
        never_decay=bool(row.get("never_decay", False)),
        never_merge=bool(row.get("never_merge", False)),
        provenance=provenance_list,
        created_at=row.get("created_at") or datetime.now(timezone.utc),
        updated_at=row.get("updated_at") or datetime.now(timezone.utc),
        tags=json.loads(row.get("tags_json") or "[]"),
        language=language,
        s5_trust_score=s5_trust_score,
        profile_modulation_gain=profile_modulation_gain,
        schema_version=schema_version,
        structure_hv=structure_hv,
    )
    if language == "__LEGACY_EMPTY__":
        # Restore the genuine (empty) legacy value now the record exists.
        rec.language = ""
    return rec
def migrate_crypto_recover_prior_key(
    store: MemoryStore,
    prior_key: bytes,
    *,
    dry_run: bool = False,
) -> dict:
    """Re-encrypt all records under the current file key, using a prior AES key.

    Meant for the case where ``.crypto.key`` was rotated or replaced while
    rows still carry ciphertext from the old key (``InvalidTag`` under the
    live key).

    Steps, in order:
      1. Validate ``prior_key`` length and the reembed-migration state.
      2. Drop a stale ``CRYPTO_RECOVER_STAGING`` table if one exists.
      3. Scan ``literal_surface`` of every row and count rows that only
         decrypt with ``prior_key``; a row that decrypts with neither key
         aborts the whole run.
      4. Unless ``dry_run``: rebuild every row into the staging table,
         verify the staged row count, emit a ``migration_crypto_recover``
         event (best-effort), rename ``records`` aside to
         ``<OLD_TABLE_PREFIX><ts>`` and promote staging to ``records``.

    Preconditions:
      - ``detect_partial_migration`` reports ``clean`` or ``needs_cleanup``.
      - ``prior_key`` is exactly ``KEY_BYTES`` raw bytes.

    Idempotent: when no encrypted ``literal_surface`` needs the prior key the
    call returns ``{"no_op": True, ...}`` without staging or swapping.

    Parameters
    ----------
    store : open MemoryStore.
    prior_key : raw key bytes the old ciphertext was produced with.
    dry_run : when True, report what would be staged without writing.

    Returns
    -------
    dict
        No-op runs: ``no_op``, ``reason``, ``records_staged``, ``dry_run``.
        Dry runs: ``no_op``, ``dry_run``, ``would_stage``,
        ``rows_needing_prior_key``.
        Real runs: ``no_op``, ``records_staged``, ``duration_sec``,
        ``dry_run``, ``old_table``, ``rows_needed_prior_key``.

    Raises
    ------
    ValueError
        ``prior_key`` has the wrong length.
    RuntimeError
        Reembed migration is in a partial state, the stale staging table
        cannot be dropped, a row decrypts with neither key, or the staged
        row count differs from the source row count.
    """
    from cryptography.exceptions import InvalidTag
    from iai_mcp.crypto import KEY_BYTES

    if len(prior_key) != KEY_BYTES:
        raise ValueError(f"prior_key must be {KEY_BYTES} raw bytes")

    reembed_state = detect_partial_migration(store.db)
    if reembed_state["state"] not in ("clean", "needs_cleanup"):
        raise RuntimeError(
            "crypto recover requires a non-partial reembed state "
            f"(got {reembed_state['state']!r}); resolve migrate --rollback/--resume first."
        )

    cur_key = store._key()
    # Current key always goes first; the prior key is appended only when it
    # is actually a different key.
    key_chain = [cur_key] if prior_key == cur_key else [cur_key, prior_key]

    # A staging table left behind by an earlier aborted run is discarded.
    if CRYPTO_RECOVER_STAGING in _db_table_names_set(store.db):
        try:
            store.db.drop_table(CRYPTO_RECOVER_STAGING)
        except Exception as exc:
            raise RuntimeError(
                f"drop stale {CRYPTO_RECOVER_STAGING} failed: {exc}"
            ) from exc

    live_tbl = store.db.open_table(RECORDS_TABLE)
    live_count = int(live_tbl.count_rows())
    if live_count == 0:
        return {"no_op": True, "reason": "empty_store", "records_staged": 0, "dry_run": dry_run}

    frame = live_tbl.to_pandas()

    # --- Pre-scan: how many rows need the prior key? ---
    needs_prior = 0
    for _, raw in frame.iterrows():
        rid = UUID(str(raw["id"]))
        ciphertext = str(raw.get("literal_surface") or "")
        if not is_encrypted(ciphertext):
            continue
        try:
            _decrypt_field_try_keys(ciphertext, rid, [cur_key])
            continue  # readable with the live key, nothing to recover
        except (InvalidTag, ValueError):
            pass
        try:
            _decrypt_field_try_keys(ciphertext, rid, [prior_key])
        except (InvalidTag, ValueError):
            raise RuntimeError(
                f"record {rid}: literal_surface not decryptable with current "
                "or prior key — run crypto redact-undecryptable or restore backup"
            ) from None
        needs_prior += 1

    if needs_prior == 0:
        return {
            "no_op": True,
            "reason": "all_rows_decrypt_with_current_key",
            "records_staged": 0,
            "dry_run": dry_run,
        }
    if dry_run:
        return {
            "no_op": False,
            "dry_run": True,
            "would_stage": live_count,
            "rows_needing_prior_key": needs_prior,
        }

    # --- Stage: rebuild every row into a fresh table with the live schema. ---
    staging_tbl = store.db.create_table(
        CRYPTO_RECOVER_STAGING, schema=live_tbl.schema
    )
    t0 = time.time()
    staged = 0
    for staged, (_, raw) in enumerate(frame.iterrows(), start=1):
        rec = _memory_record_from_raw_row_multikey(store, raw.to_dict(), key_chain)
        staging_tbl.add([store._to_row(rec)])

    if staged != live_count:
        # Never promote an incomplete staging table.
        try:
            store.db.drop_table(CRYPTO_RECOVER_STAGING)
        except Exception:
            pass
        raise RuntimeError(
            f"staging row count mismatch: staged={staged} orig={live_count}"
        )
    duration_sec = time.time() - t0

    # Audit event is best-effort: a failure here must not block the swap.
    try:
        write_event(
            store,
            kind="migration_crypto_recover",
            data={
                "records_staged": staged,
                "duration_sec": duration_sec,
                "rows_needed_prior_key": needs_prior,
            },
            severity="info",
        )
    except Exception:
        pass

    # --- Swap: records -> records_old_<ts>, staging -> records. ---
    old_name = f"{OLD_TABLE_PREFIX}{int(time.time())}"
    _swap_tables_filesystem(store.db, source=RECORDS_TABLE, dest=old_name)
    _swap_tables_filesystem(
        store.db, source=CRYPTO_RECOVER_STAGING, dest=RECORDS_TABLE
    )
    return {
        "no_op": False,
        "records_staged": staged,
        "duration_sec": duration_sec,
        "dry_run": False,
        "old_table": old_name,
        "rows_needed_prior_key": needs_prior,
    }
REDACT_UNDECRYPTABLE_MARKER = "<REDACTED: pre-2026-04-30 key rotation>"


def migrate_redact_undecryptable_records(store: MemoryStore) -> dict:
    """Replace undecryptable ``literal_surface`` with ``REDACT_UNDECRYPTABLE_MARKER``.

    Walks the records table row by row:
      - plaintext ``literal_surface``          -> counted as ``skipped_plain``;
      - ciphertext that decrypts               -> counted as ``skipped_ok``
        (this includes rows redacted by an earlier run, so the call is
        idempotent);
      - ciphertext that does not decrypt       -> redacted.

    For a redacted row the three sensitive columns are rewritten via
    ``store._encrypt_for_record``: ``literal_surface`` becomes the marker,
    while ``provenance_json`` / ``profile_modulation_gain_json`` keep their
    content when it is plaintext or decryptable and fall back to ``"[]"`` /
    ``"{}"`` otherwise. ``updated_at`` is set to now. Other columns are not
    part of the update. One ``crypto_redaction`` warning event is emitted per
    redacted row (best-effort).

    Returns
    -------
    dict
        ``{"redacted": int, "skipped_ok": int, "skipped_plain": int}``.
    """
    from cryptography.exceptions import InvalidTag

    tbl = store.db.open_table(RECORDS_TABLE)
    if tbl.count_rows() == 0:
        return {"redacted": 0, "skipped_ok": 0, "skipped_plain": 0}

    def _plain_or_default(record_id: UUID, raw: str, default: str) -> str:
        # Plaintext passes through; undecryptable ciphertext collapses to
        # the supplied empty-JSON default.
        try:
            if is_encrypted(raw):
                return store._decrypt_for_record(record_id, raw)
            return raw
        except (InvalidTag, ValueError):
            return default

    counts = {"redacted": 0, "skipped_ok": 0, "skipped_plain": 0}
    for _, row in tbl.to_pandas().iterrows():
        rid = UUID(str(row["id"]))
        literal = str(row.get("literal_surface") or "")
        if not is_encrypted(literal):
            counts["skipped_plain"] += 1
            continue

        try:
            decrypted = store._decrypt_for_record(rid, literal)
        except (InvalidTag, ValueError):
            decrypted = None
        if decrypted is not None:
            # Already decryptable (includes idempotent prior redaction).
            counts["skipped_ok"] += 1
            continue

        prov_plain = _plain_or_default(
            rid, str(row.get("provenance_json") or "[]"), "[]"
        )
        gain_plain = _plain_or_default(
            rid, str(row.get("profile_modulation_gain_json") or "{}"), "{}"
        )

        new_lit = store._encrypt_for_record(rid, REDACT_UNDECRYPTABLE_MARKER)
        new_prov = store._encrypt_for_record(rid, prov_plain)
        new_gain = store._encrypt_for_record(rid, gain_plain)
        tbl.update(
            where=f"id = '{_uuid_literal(rid)}'",
            values={
                "literal_surface": new_lit,
                "provenance_json": new_prov,
                "profile_modulation_gain_json": new_gain,
                "updated_at": datetime.now(timezone.utc),
            },
        )
        counts["redacted"] += 1

        # Audit trail is best-effort; the redaction itself already landed.
        try:
            write_event(
                store,
                kind="crypto_redaction",
                data={"record_id": str(rid), "reason": "undecryptable_literal"},
                severity="warning",
            )
        except Exception:
            pass

    return {
        "redacted": counts["redacted"],
        "skipped_ok": counts["skipped_ok"],
        "skipped_plain": counts["skipped_plain"],
    }
def _rollback(db, store: MemoryStore) -> int:
    """Roll back a partial reembed migration. Plan 07.11-03 / D-03.

    Behaviour by state (per `detect_partial_migration` taxonomy), checked in
    this order:
      - records present + records_v_new present (mid-stage crash):
        DROP records_v_new; records is intact, no rename needed.
      - records absent + records_old_<ts> present (mid-swap crash variant):
        Rename records_old_<newest_ts> -> records; drop records_v_new if
        present; refresh ``store._embed_dim`` from the restored schema.
      - records present + records_old_<ts> present (deferred-cleanup state):
        Drop every records_old_<ts> (treats rollback as "discard old
        snapshot" when the new table is already in place). A failed drop is
        logged and does not change the return code.
      - clean: no-op.
    The progress checkpoint is cleared via ``_progress_clear`` in every
    handled state.

    Parameters
    ----------
    db : LanceDB connection holding the tables.
    store : MemoryStore whose progress file / embed_dim are updated.

    Returns
    -------
    int
        0 when one of the four states above was handled, 2 when records is
        absent and there is no old table to restore from, 1 when any step
        raised.
    """
    # Use the module's shared table-listing helper (same one the crypto
    # recover path uses) rather than a bare `db.table_names()` call, so that
    # every state check in this file sees the same view of the database.
    # LanceDB's `table_names` is a paginated API; a truncated listing here
    # could hide `records` or an old snapshot and misclassify the state.
    names = _db_table_names_set(db)
    has_records = RECORDS_TABLE in names
    has_staging = STAGING_TABLE in names
    # Lexicographic sort; the newest snapshot is taken as the last element.
    old_tables = sorted(n for n in names if n.startswith(OLD_TABLE_PREFIX))
    try:
        # Mid-stage crash: drop the partial staging.
        if has_staging and has_records:
            db.drop_table(STAGING_TABLE)
            _progress_clear(store)
            log.info(
                "migrate_reembed_rollback_drop_staging",
                extra={"records_count": db.open_table(RECORDS_TABLE).count_rows()},
            )
            return 0

        # Mid-swap crash: restore from the newest old table.
        if not has_records and old_tables:
            newest_old = old_tables[-1]
            if has_staging:
                db.drop_table(STAGING_TABLE)
            # Filesystem-level rename: records_old_<ts>.lance -> records.lance.
            _swap_tables_filesystem(db, source=newest_old, dest=RECORDS_TABLE)
            # Refresh embed_dim from the restored table's schema. Best-effort:
            # a schema without a fixed-size embedding leaves the old value.
            try:
                tbl = db.open_table(RECORDS_TABLE)
                emb_field = tbl.schema.field("embedding")
                actual_dim = getattr(emb_field.type, "list_size", None)
                if actual_dim and int(actual_dim) > 0:
                    store._embed_dim = int(actual_dim)
            except Exception:
                pass
            _progress_clear(store)
            log.info(
                "migrate_reembed_rollback_restore_old",
                extra={
                    "restored_from": newest_old,
                    "records_count": db.open_table(RECORDS_TABLE).count_rows(),
                },
            )
            return 0

        # Deferred-cleanup state: discard the old snapshot at the user's
        # request (rollback semantics here treat "discard old after
        # successful swap" as a valid operator action).
        if has_records and old_tables and not has_staging:
            for old in old_tables:
                try:
                    db.drop_table(old)
                except Exception as exc:
                    log.warning(
                        "migrate_reembed_rollback_drop_old_failed",
                        extra={"table": old, "error": str(exc)[:160]},
                    )
            _progress_clear(store)
            return 0

        # Clean state: nothing to roll back.
        if has_records and not has_staging and not old_tables:
            _progress_clear(store)
            return 0

        # Catastrophic: records absent + no old table to restore.
        log.error(
            "migrate_reembed_rollback_unrecoverable",
            extra={
                "has_records": has_records,
                "has_staging": has_staging,
                "old_tables": old_tables,
            },
        )
        return 2
    except Exception as exc:
        log.error(
            "migrate_reembed_rollback_failed",
            extra={"error": str(exc)[:200]},
        )
        return 1
def _resume(db, store: MemoryStore, target_embedder) -> int:
    """Resume a partial reembed migration from `migration_progress.json`.

    Reads the checkpoint to recover `staged_ids` and `target_dim`, continues
    the staging loop over `store.iter_records()` (passing the already-staged
    ids to `_stage_loop`), then runs `_validate_and_swap` on the combined
    staged count.

    Parameters
    ----------
    db : LanceDB connection holding the tables.
    store : MemoryStore being migrated.
    target_embedder : embedder whose ``DIM`` must match the checkpoint's
        ``target_dim`` (when the checkpoint recorded one).

    Returns
    -------
    int
        0 on success; 1 on user-correctable error (no progress file,
        target_dim mismatch with the embedder); 2 on unrecoverable error
        (records table missing, staging loop raised, validate/swap raised
        ``RuntimeError``).

    ``KeyboardInterrupt`` / ``SystemExit`` raised during staging propagate
    unchanged so the process can be killed and resumed again.
    """
    progress_state = _progress_read(store)
    if not progress_state:
        log.error(
            "migrate_reembed_resume_no_progress_file",
            extra={"path": str(_progress_path(store))},
        )
        return 1

    target_dim = int(target_embedder.DIM)
    saved_target_dim = int(progress_state.get("target_dim") or 0)
    # A checkpoint without a recorded target_dim (0) is accepted as-is.
    if saved_target_dim and saved_target_dim != target_dim:
        log.error(
            "migrate_reembed_resume_dim_mismatch",
            extra={
                "saved_target_dim": saved_target_dim,
                "embedder_dim": target_dim,
            },
        )
        return 1

    # Use the module's shared table-listing helper (same one the crypto
    # recover path uses) rather than a bare `db.table_names()` call:
    # LanceDB's `table_names` is paginated, and a truncated listing could
    # wrongly report `records` missing or re-create an existing staging table.
    names = _db_table_names_set(db)
    if RECORDS_TABLE not in names:
        log.error("migrate_reembed_resume_records_missing")
        return 2

    if STAGING_TABLE not in names:
        # Staging table was dropped (or never created). Re-create it at
        # the target dim and re-stage everything.
        target_tbl = db.create_table(
            STAGING_TABLE, schema=_records_schema_at_dim(target_dim)
        )
        already_staged: set[str] = set()
    else:
        target_tbl = db.open_table(STAGING_TABLE)
        already_staged = set(progress_state.get("staged_ids") or [])

    source_dim = int(store.embed_dim)
    started_at_iso = progress_state.get(
        "started_at", datetime.now(timezone.utc).isoformat()
    )
    total = db.open_table(RECORDS_TABLE).count_rows()
    # NOTE(review): `row_index` from the checkpoint is forwarded even when
    # the staging table was just re-created and everything is re-staged.
    # Presumably `_stage_loop` uses `started_idx` only for progress
    # numbering, not for skipping rows — confirm against `_stage_loop`.
    last_idx = int(progress_state.get("row_index") or 0)

    t0 = time.time()
    try:
        staged_count, failures = _stage_loop(
            store,
            target_embedder,
            target_dim,
            target_tbl,
            store.iter_records(),
            total=total,
            started_at_iso=started_at_iso,
            started_idx=last_idx + 1,
            already_staged_ids=already_staged,
        )
    except (KeyboardInterrupt, SystemExit):
        # Re-kill mid-resume: progress file is up-to-date; another --resume
        # picks up where this one left off.
        raise
    except Exception as exc:
        log.error(
            "migrate_reembed_resume_stage_failed",
            extra={"error": str(exc)[:200]},
        )
        return 2

    # Combine prior-run staged count with this run's staged count for the
    # event payload — total updated rows is what the user/audit cares about.
    total_staged = len(already_staged) + staged_count
    duration_sec = time.time() - t0
    try:
        _validate_and_swap(
            store,
            source_dim=source_dim,
            target_dim=target_dim,
            target_embedder=target_embedder,
            staged_count=total_staged,
            failures=failures,
            duration_sec=duration_sec,
        )
    except RuntimeError as exc:
        log.error(
            "migrate_reembed_resume_validate_failed",
            extra={"error": str(exc)[:200]},
        )
        return 2
    return 0
# ---------------------------------------------------------------------------
# v2 -> v3 encryption migration
# ---------------------------------------------------------------------------
def _encrypt_or_passthrough(
    store: MemoryStore,
    record_id: UUID,
    value: str,
) -> tuple[str, bool]:
    """Return `value` as ciphertext, encrypting it only if it is still plaintext.

    The record's UUID literal (ASCII-encoded) is passed to `encrypt_field`
    as associated data, and the key comes from `store._key()`.

    Returns
    -------
    (new_value, was_encrypted_now)
        `was_encrypted_now` is True only when this call turned plaintext
        into ciphertext; an already-encrypted `value` is returned untouched
        with False.
    """
    if not is_encrypted(value):
        aad = _uuid_literal(record_id).encode("ascii")
        # A falsy value is encrypted as the empty string.
        return encrypt_field(value or "", store._key(), associated_data=aad), True
    return value, False
def migrate_encryption_v2_to_v3(
    store: MemoryStore,
    dry_run: bool = False,
    progress: Optional[Callable[[int, int], None]] = None,
) -> dict:
    """One-shot encryption-at-rest migration (SEC-ENCRYPTION-AT-REST).

    Scans both the records table and the events table; anything whose
    sensitive column currently lives as plaintext is re-encrypted in place.
    Idempotent: values for which `is_encrypted` is already true are left
    alone.

    Records columns re-encrypted:
      - literal_surface
      - provenance_json
      - profile_modulation_gain_json
    Events columns re-encrypted:
      - data_json

    Rows whose `id` does not parse as a UUID are skipped in both sweeps, so
    every UPDATE predicate is built from `_uuid_literal` output (CR-01) and
    never from a raw column value.

    Parameters
    ----------
    store: open MemoryStore.
    dry_run: when True, count migrable rows without writing.
    progress: optional callback(idx, total) invoked once per scanned record;
        exceptions raised by the callback are ignored.

    Returns
    -------
    dict with keys `records_migrated`, `events_migrated`, `records_scanned`,
    `events_scanned`, `duration_sec`. On a dry run the `*_migrated` values
    count rows that would be written; on a real run they count rows whose
    write actually succeeded (failed per-row updates are logged and not
    counted).

    A `migration_v2_to_v3` audit event is written on a real run when at
    least one row was migrated.
    """
    t0 = time.time()
    result = {
        "records_migrated": 0,
        "events_migrated": 0,
        "records_scanned": 0,
        "events_scanned": 0,
        "duration_sec": 0.0,
    }

    # ----- records table sweep -----
    records_tbl = store.db.open_table(RECORDS_TABLE)
    records_df = records_tbl.to_pandas()
    record_total = len(records_df)
    result["records_scanned"] = int(record_total)
    records_updates: list[dict] = []
    for idx, (_, row) in enumerate(records_df.iterrows()):
        if progress is not None:
            try:
                progress(idx, record_total)
            except Exception:
                # A broken progress UI must never abort the migration.
                pass
        try:
            rid = UUID(str(row["id"]))
        except (ValueError, TypeError):
            continue
        literal_raw = row.get("literal_surface") or ""
        prov_raw = row.get("provenance_json") or "[]"
        gain_raw = row.get("profile_modulation_gain_json") or "{}"
        if all(is_encrypted(v) for v in (literal_raw, prov_raw, gain_raw)):
            continue  # Row fully encrypted already -- skip (idempotent).
        if dry_run:
            result["records_migrated"] += 1
            continue
        new_literal, _ = _encrypt_or_passthrough(store, rid, literal_raw)
        new_prov, _ = _encrypt_or_passthrough(store, rid, prov_raw)
        new_gain, _ = _encrypt_or_passthrough(store, rid, gain_raw)
        records_updates.append(
            {
                "id": _uuid_literal(rid),
                "literal_surface": new_literal,
                "provenance_json": new_prov,
                "profile_modulation_gain_json": new_gain,
            }
        )

    # `records_updates` is only ever populated on a real run.
    if not dry_run and records_updates:
        now = datetime.now(timezone.utc)
        import pyarrow as pa

        update_tbl = pa.table(
            {
                "id": [u["id"] for u in records_updates],
                "literal_surface": [u["literal_surface"] for u in records_updates],
                "provenance_json": [u["provenance_json"] for u in records_updates],
                "profile_modulation_gain_json": [
                    u["profile_modulation_gain_json"] for u in records_updates
                ],
                "updated_at": [now] * len(records_updates),
            }
        )
        try:
            records_tbl.merge_insert("id").when_matched_update_all().execute(update_tbl)
        except Exception:
            # Rule 1 fallback: per-id tbl.update when merge_insert is unavailable.
            # Only rows whose update succeeds are counted, so the returned
            # total and the audit event never over-report.
            for u in records_updates:
                try:
                    records_tbl.update(
                        where=f"id = '{u['id']}'",
                        values={
                            "literal_surface": u["literal_surface"],
                            "provenance_json": u["provenance_json"],
                            "profile_modulation_gain_json": u[
                                "profile_modulation_gain_json"
                            ],
                            "updated_at": now,
                        },
                    )
                except Exception as exc:
                    log.warning(
                        "migrate_encryption_v2_to_v3_record_update_failed",
                        extra={"record_id": u["id"], "error": str(exc)[:160]},
                    )
                    continue
                result["records_migrated"] += 1
        else:
            result["records_migrated"] = len(records_updates)

    # ----- events table sweep -----
    events_tbl = store.db.open_table(EVENTS_TABLE)
    events_df = events_tbl.to_pandas()
    result["events_scanned"] = int(len(events_df))
    events_updates: list[dict] = []
    for _, row in events_df.iterrows():
        data_raw = row.get("data_json") or "{}"
        if is_encrypted(data_raw):
            continue
        event_id = str(row["id"])
        # CR-01: the predicate below must be built from a validated UUID
        # literal, exactly like the records sweep. Malformed ids are skipped
        # rather than interpolated into the WHERE clause.
        try:
            event_literal = _uuid_literal(UUID(event_id))
        except (ValueError, TypeError):
            continue
        if dry_run:
            result["events_migrated"] += 1
            continue
        # Associated data stays the id string exactly as stored, unchanged
        # from the previous behaviour.
        ad = event_id.encode("ascii")
        new_data = encrypt_field(data_raw, store._key(), associated_data=ad)
        events_updates.append({"where_id": event_literal, "data_json": new_data})

    for u in events_updates:
        try:
            events_tbl.update(
                where=f"id = '{u['where_id']}'",
                values={"data_json": u["data_json"]},
            )
        except Exception as exc:
            log.warning(
                "migrate_encryption_v2_to_v3_event_update_failed",
                extra={"event_id": u["where_id"], "error": str(exc)[:160]},
            )
            continue
        result["events_migrated"] += 1

    result["duration_sec"] = time.time() - t0

    # ----- emit audit event -----
    if not dry_run and (
        result["records_migrated"] > 0 or result["events_migrated"] > 0
    ):
        write_event(
            store,
            kind="migration_v2_to_v3",
            data={
                "record_count": result["records_migrated"],
                "event_count": result["events_migrated"],
                "duration_sec": result["duration_sec"],
                "columns_encrypted": [
                    "records.literal_surface",
                    "records.provenance_json",
                    "records.profile_modulation_gain_json",
                    "events.data_json",
                ],
                "algorithm": "AES-256-GCM",
                "format": "iai:enc:v1:",
            },
            severity="info",
        )
    return result
# ---------------------------------------------------------------------------
# CONN-05: v3 -> v4 TEM factorization migration
# ---------------------------------------------------------------------------
def migrate_hd_vector_to_structure_hv_v3_to_v4(
    store: MemoryStore,
    dry_run: bool = False,
    progress: Optional[Callable[[int, int], None]] = None,
) -> dict:
    """Plan 03-01 CONN-05: move records from `hd_vector_json` to `structure_hv`.

    Every record returned by `store.all_records()` is inspected:
      - schema_version >= SCHEMA_VERSION_V4 AND structure_hv is exactly
        STRUCTURE_HV_BYTES long  -> skipped (idempotency);
      - anything else            -> structure_hv is filled from
        `tem.bind_structure(record)` when it is not already full-length,
        schema_version is set to SCHEMA_VERSION_V4, and the row is replaced
        by a delete followed by `store.insert(record)`.

    CR-01: the delete predicate is built from `_uuid_literal(record.id)`.

    Rows are replaced one at a time, so an interrupted run leaves a mix of
    migrated and unmigrated rows that a later run continues from.

    Parameters
    ----------
    store: open MemoryStore.
    dry_run: when True, count migrable rows without writing.
    progress: optional callback(idx, total) for CLI / external progress UIs;
        exceptions raised by the callback are ignored.

    Returns
    -------
    dict with keys: processed, updated, skipped, duration_ms,
    column_renamed_from, column_renamed_to.

    A `migration_v3_to_v4` audit event is written on a real run when at
    least one row was updated or skipped.
    """
    started = time.time()
    result: dict = {
        "processed": 0,
        "updated": 0,
        "skipped": 0,
        "duration_ms": 0.0,
        "column_renamed_from": "hd_vector_json",
        "column_renamed_to": "structure_hv",
    }

    # Reading through the store's own read path yields MemoryRecord
    # instances regardless of which column layout the row was stored in.
    records = store.all_records()
    total = len(records)
    result["processed"] = total

    # Imported lazily to keep tem.py out of this module's import-time graph.
    from iai_mcp.tem import bind_structure
    from iai_mcp.types import (
        SCHEMA_VERSION_V4,
        STRUCTURE_HV_BYTES,
    )

    tbl = store.db.open_table(RECORDS_TABLE)
    updated = 0
    skipped = 0
    for position, rec in enumerate(records):
        if progress is not None:
            try:
                progress(position, total)
            except Exception:
                pass

        at_v4 = rec.schema_version >= SCHEMA_VERSION_V4
        hv = rec.structure_hv
        hv_complete = (
            isinstance(hv, (bytes, bytearray)) and len(hv) == STRUCTURE_HV_BYTES
        )
        if at_v4 and hv_complete:
            skipped += 1
            continue

        # From here on the row counts as (to be) updated.
        updated += 1
        if dry_run:
            continue

        # Only structure_hv and schema_version are changed on the record
        # object before it is written back.
        if not hv_complete:
            rec.structure_hv = bind_structure(rec)
        rec.schema_version = SCHEMA_VERSION_V4

        # NOTE(review): any exception from the delete (including one raised
        # by _uuid_literal) is swallowed and the insert still runs; if the
        # old row was not actually removed this could leave two rows with
        # the same id — confirm this is acceptable.
        try:
            tbl.delete(f"id = '{_uuid_literal(rec.id)}'")
        except Exception:
            pass
        store.insert(rec)

    result["updated"] = updated
    result["skipped"] = skipped
    result["duration_ms"] = (time.time() - started) * 1000.0

    # No audit event on dry runs or when there was nothing to look at.
    if dry_run or not (updated > 0 or skipped > 0):
        return result

    write_event(
        store,
        kind="migration_v3_to_v4",
        data={
            "processed": result["processed"],
            "updated": result["updated"],
            "skipped": result["skipped"],
            "duration_ms": result["duration_ms"],
            "column_renamed_from": result["column_renamed_from"],
            "column_renamed_to": result["column_renamed_to"],
        },
        severity="info",
    )
    return result
# ---------------------------------------------------------------------------
# R8: cleanup migration for accumulated schema duplicates
# ---------------------------------------------------------------------------
def _schema_pattern_of(rec: MemoryRecord) -> Optional[str]:
    """Return the name carried by the record's first `pattern:*` tag, else None."""
    for tag in rec.tags or []:
        if tag.startswith("pattern:"):
            return tag.split(":", 1)[1]
    return None


def cleanup_schema_duplicates(
    store: MemoryStore,
    *,
    apply: bool = False,
    store_path: "Path | None" = None,
) -> dict:
    """Group semantic schema records by `pattern:*` tag; keep oldest; soft-delete the rest.

    R8: a one-shot reversible cleanup of schema duplicates that accumulated
    in the store. NOT a schema_version bump — this is a maintenance op that
    runs on demand. It is dry-run by default, snapshots before writing,
    soft-deletes via a tier rename, and is idempotent (pruned rows no longer
    have tier "semantic", so a re-run does not pick them up again).

    Steps:
      1. Group records with tier "semantic" by their first `pattern:*` tag.
      2. Per group with more than one record, keep the one with the smallest
         `created_at`; the rest are duplicates.
      3. Count `schema_instance_of` edges touching any duplicate.
      4. (apply only) Copy the LanceDB directory to a timestamped sibling.
      5. (apply only) Boost `schema_instance_of` edges from each duplicate's
         neighbours onto the group's keeper. Failure here is logged and does
         not block step 6.
      6. (apply only) Replace each duplicate with a copy whose tier is
         `SEMANTIC_PRUNED_TIER` and whose pinned / never_decay flags are
         False; all other fields are carried over. Per-record failures are
         logged and skipped.
      7. Emit a `schema_cleanup_run` event (best-effort) and return.

    Parameters
    ----------
    store : MemoryStore
        Open store (connected to the LanceDB directory under inspection).
    apply : bool
        False (default) -- dry-run, mutate nothing, return diff summary.
        True -- snapshot, reinforce edges, soft-delete duplicates.
    store_path : Path | None
        IAI root directory containing the `lancedb/` subdir. When None,
        falls back to `store.root`. The snapshot lands at
        `<root>/lancedb-pre-cleanup-{ts}`; when `<root>/lancedb` does not
        exist the root itself is snapshotted instead.

    Returns
    -------
    dict
        {
          "mode": "dry-run" | "apply",
          "groups": int,            # patterns with N>1 duplicates
          "keepers": int,           # one per group
          "pruned": int,            # dry-run: duplicates found;
                                    # apply: duplicates actually soft-deleted
          "edges_reinforced": int,  # schema_instance_of edges touching a
                                    # duplicate, counted before any write
          "snapshot_dir": str | None,  # set only on apply
        }
        If reading the records fails, an all-zero summary is returned.
    """
    import shutil
    from pathlib import Path
    from datetime import datetime, timezone
    from iai_mcp.store import EDGES_TABLE
    from iai_mcp.types import SEMANTIC_PRUNED_TIER

    mode = "apply" if apply else "dry-run"

    # --- 1. Discover pattern groups: tier='semantic' AND tag matches pattern:*
    try:
        all_records = store.all_records()
    except Exception:
        # Diagnostic-only: a read failure leaves the store untouched and
        # returns an empty summary instead of raising. Operators see the
        # empty result and can investigate.
        return {
            "mode": mode,
            "groups": 0,
            "keepers": 0,
            "pruned": 0,
            "edges_reinforced": 0,
            "snapshot_dir": None,
        }

    groups: dict[str, list[MemoryRecord]] = {}
    for rec in all_records:
        if rec.tier != "semantic":
            continue
        pattern = _schema_pattern_of(rec)
        if pattern is None:
            continue
        groups.setdefault(pattern, []).append(rec)

    # Single-record groups are not duplicates -- nothing to do.
    dup_groups = {p: recs for p, recs in groups.items() if len(recs) > 1}

    # --- 2. Select keepers (oldest first per pattern) + identify duplicates.
    # The keeper is remembered next to each duplicate so later steps do not
    # have to re-derive it from tags.
    keepers: list[MemoryRecord] = []
    duplicates: list[MemoryRecord] = []
    dup_with_keeper: list[tuple[MemoryRecord, MemoryRecord]] = []
    for recs in dup_groups.values():
        oldest, *rest = sorted(recs, key=lambda r: r.created_at)
        keepers.append(oldest)
        duplicates.extend(rest)
        dup_with_keeper.extend((dup, oldest) for dup in rest)

    # --- 3. Plan edge redirects: count incoming schema_instance_of edges
    # to duplicates so the dry-run can report what would be reinforced.
    edges_to_reinforce = 0
    try:
        edges_df = store.db.open_table(EDGES_TABLE).to_pandas()
        dup_id_strs = {str(d.id) for d in duplicates}
        if dup_id_strs and "edge_type" in edges_df.columns:
            # The duplicate may sit in EITHER endpoint column, so both are
            # checked; a row is counted once even if both endpoints match.
            mask = (
                (edges_df["edge_type"] == "schema_instance_of")
                & (
                    edges_df["dst"].isin(dup_id_strs)
                    | edges_df["src"].isin(dup_id_strs)
                )
            )
            edges_to_reinforce = int(mask.sum())
    except Exception:
        edges_to_reinforce = 0

    snapshot_dir: str | None = None
    pruned_count = len(duplicates)  # dry-run reports the planned number

    if apply and (keepers or duplicates):
        # --- 4. Snapshot the LanceDB tables dir BEFORE any write.
        # The tables live at <root>/lancedb; the snapshot is a sibling at
        # <root>/lancedb-pre-cleanup-{ts}, so manual recovery is
        # `mv <root>/lancedb-pre-cleanup-{ts} <root>/lancedb`.
        iai_root = Path(store_path) if store_path is not None else Path(store.root)
        src_lancedb = iai_root / "lancedb"
        ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        snap = iai_root / f"lancedb-pre-cleanup-{ts}"
        # If src_lancedb does not exist (e.g. legacy layout), fall back to
        # snapshotting the IAI root itself so the operator still has rollback.
        snapshot_source = src_lancedb if src_lancedb.exists() else iai_root
        shutil.copytree(snapshot_source, snap)
        snapshot_dir = str(snap)

        # --- 5. Redirect edges: copy incoming schema_instance_of edges from
        # each duplicate onto its keeper BEFORE the duplicate's tier is renamed.
        # Edge reinforcement failure must NOT block the tier rename — the
        # operator can re-run cleanup to complete edge consolidation.
        try:
            edges_df = store.db.open_table(EDGES_TABLE).to_pandas()
            for dup, keeper in dup_with_keeper:
                if keeper.id == dup.id:
                    continue
                dup_str = str(dup.id)
                incoming = edges_df[
                    (edges_df["edge_type"] == "schema_instance_of")
                    & ((edges_df["dst"] == dup_str) | (edges_df["src"] == dup_str))
                ]
                if incoming.empty:
                    continue
                pairs: list[tuple[UUID, UUID]] = []
                for _, row in incoming.iterrows():
                    # The OTHER side of the edge (the evidence node) is
                    # whichever column does NOT carry the duplicate's id.
                    other_str = (
                        row["src"] if row["dst"] == dup_str else row["dst"]
                    )
                    if other_str == dup_str:
                        # Self-edge sanity guard.
                        continue
                    try:
                        other_id = UUID(str(other_str))
                    except (TypeError, ValueError):
                        continue
                    if other_id == keeper.id:
                        # Redirecting a keeper<->duplicate edge would boost
                        # a keeper->keeper self-edge; skip it.
                        continue
                    pairs.append((other_id, keeper.id))
                if pairs:
                    store.boost_edges(
                        pairs,
                        edge_type="schema_instance_of",
                        delta=0.1,
                    )
        except Exception as exc:
            log.warning(
                "schema_cleanup_edge_reinforce_failed",
                extra={"error": str(exc)[:160]},
            )

        # --- 6. Soft-delete via tier rename: delete + re-insert each duplicate
        # with tier=semantic_pruned, pinned=False, never_decay=False.
        # Other fields preserved (literal_surface, embedding, provenance, etc.)
        # for reverse-migration recoverability.
        pruned_count = 0
        for dup in duplicates:
            try:
                # Build the replacement BEFORE deleting, so a constructor
                # failure can never leave the row deleted with nothing to
                # put back.
                pruned_rec = MemoryRecord(
                    id=dup.id,
                    tier=SEMANTIC_PRUNED_TIER,
                    literal_surface=dup.literal_surface,
                    aaak_index=dup.aaak_index,
                    embedding=dup.embedding,
                    community_id=dup.community_id,
                    centrality=dup.centrality,
                    detail_level=dup.detail_level,
                    pinned=False,  # pruned rows are unpinned
                    stability=dup.stability,
                    difficulty=dup.difficulty,
                    last_reviewed=dup.last_reviewed,
                    never_decay=False,  # pruned rows can decay
                    never_merge=dup.never_merge,
                    provenance=dup.provenance,
                    created_at=dup.created_at,
                    updated_at=datetime.now(timezone.utc),
                    tags=dup.tags,
                    language=dup.language,
                    s5_trust_score=dup.s5_trust_score,
                    profile_modulation_gain=dup.profile_modulation_gain,
                    schema_version=dup.schema_version,
                    structure_hv=dup.structure_hv,
                )
                store.delete(dup.id)
                store.insert(pruned_rec)
            except Exception as exc:
                # Per-record continuation: a single failed soft-delete must
                # not abort the rest of the batch. Operator can re-run (or
                # restore from the snapshot taken above).
                log.warning(
                    "schema_cleanup_soft_delete_failed",
                    extra={"record_id": str(dup.id), "error": str(exc)[:160]},
                )
                continue
            pruned_count += 1

    # --- 7. Emit summary event + return summary dict
    summary: dict = {
        "mode": mode,
        "groups": len(dup_groups),
        "keepers": len(keepers),
        "pruned": pruned_count,
        "edges_reinforced": int(edges_to_reinforce),
        "snapshot_dir": snapshot_dir,
    }
    try:
        write_event(
            store,
            kind="schema_cleanup_run",
            data=summary,
            severity="info",
            source_ids=[k.id for k in keepers[:5]] if keepers else None,
        )
    except Exception:
        # Diagnostic-only: an event-write failure must not invalidate the
        # cleanup itself.
        pass
    return summary