"""Embedding layer -- configurable embedder with a 4-model registry.

Plan 05-08 (2026-04-20): the DEFAULT is now ``bge-small-en-v1.5`` (384d
English-only), reverting the Phase-2 deviation. PROJECT.md line 125 always
specified bge-small-en-v1.5 as the intended default; Phase-2 swapped in bge-m3
(1024d multilingual) as D-08a. User directive 2026-04-19: the brain stores
English, surface translation is Claude's job. bge-m3 stays selectable via env
var / kwarg for anyone who needs multilingual semantic match at the 5x RAM cost.

Configurable 4-model registry:
  - "bge-m3"                -> BAAI/bge-m3                             -> 1024d (opt-in, multilingual)
  - "multilingual-e5-small" -> intfloat/multilingual-e5-small          -> 384d (compromise)
  - "bge-small-en-v1.5"     -> BAAI/bge-small-en-v1.5                  -> 384d (DEFAULT, English)
  - "all-MiniLM-L6-v2"      -> sentence-transformers/all-MiniLM-L6-v2  -> 384d (English
    alternative embedder option; included for compatibility testing)

Selection priority at Embedder() instantiation:
  1. Explicit `model_key` constructor arg
  2. IAI_MCP_EMBED_MODEL environment variable
  3. MODEL_REGISTRY default ("bge-small-en-v1.5")

The model is loaded once per process and cached in a module-level dict so
multiple Embedder() instances share the underlying SentenceTransformer.

Deterministic: `normalize_embeddings=True` is always passed,
`show_progress_bar=False`. Same input text always produces the same output
vector across calls within a process.
"""
from __future__ import annotations

import os
import threading

from sentence_transformers import SentenceTransformer

# 4-model registry. Name convention: short logical key -> HF repo id + dim.
# (2026-04-29): all-MiniLM-L6-v2 added as additive ablation entry;
# DEFAULT_MODEL_KEY unchanged (English-Only Brain lock from Plan 05-08).
MODEL_REGISTRY: dict[str, dict] = {
    "bge-m3": {"hf": "BAAI/bge-m3", "dim": 1024},
    "multilingual-e5-small": {"hf": "intfloat/multilingual-e5-small", "dim": 384},
    "bge-small-en-v1.5": {"hf": "BAAI/bge-small-en-v1.5", "dim": 384},
    "all-MiniLM-L6-v2": {"hf": "sentence-transformers/all-MiniLM-L6-v2", "dim": 384},
}

DEFAULT_MODEL_KEY = "bge-small-en-v1.5"


def _resolve_model_key(model_key: str | None = None) -> str:
    """Resolve the logical registry key to use.

    Priority: explicit ``model_key`` argument, then the ``IAI_MCP_EMBED_MODEL``
    environment variable, then ``DEFAULT_MODEL_KEY``. An unset or empty
    environment variable falls through to the default.

    Raises
    ------
    ValueError
        If the explicit key, or the key read from the environment, is not a
        key of ``MODEL_REGISTRY``.
    """
    if model_key is not None:
        if model_key not in MODEL_REGISTRY:
            raise ValueError(
                f"unknown embed model key {model_key!r}; valid: {sorted(MODEL_REGISTRY)}"
            )
        return model_key

    env_key = os.environ.get("IAI_MCP_EMBED_MODEL")
    if env_key:
        if env_key not in MODEL_REGISTRY:
            raise ValueError(
                f"unknown embed model key {env_key!r} from IAI_MCP_EMBED_MODEL; "
                f"valid: {sorted(MODEL_REGISTRY)}"
            )
        return env_key

    return DEFAULT_MODEL_KEY


_MODEL_LOCK = threading.Lock()
# Keyed by HF repo id so every Embedder resolving to the same model shares it.
_MODEL_CACHE: "dict[str, SentenceTransformer]" = {}


def _get_model(hf_id: str) -> "SentenceTransformer":
    """Process-local lazy-load + cache. Thread-safe via lock around cache mutation.

    The lock is held for the whole check-and-load, so a given ``hf_id`` is
    constructed at most once per process.
    """
    with _MODEL_LOCK:
        if hf_id not in _MODEL_CACHE:
            _MODEL_CACHE[hf_id] = SentenceTransformer(hf_id)
        return _MODEL_CACHE[hf_id]


class Embedder:
    """English-Only Brain embedder with a configurable model registry.

    Default model is `bge-small-en-v1.5` (384d, English) per Plan 05-08.
    Used by the retrieval pipeline (stage 1, cue embedding) and by
    session-start assembler. `.DIM` is per-instance (varies by model).
    `.DEFAULT_DIM` is a class-level default pointing at the registry's
    default model dimension.

    The opt-in `bge-m3` (1024d multilingual) path stays in the registry for
    users who explicitly need multilingual semantic match at the 5x RAM cost,
    but it is opt-in via `IAI_MCP_EMBED_MODEL=bge-m3` — not the product.

    Backward compatibility:
    - `Embedder.DIM` is kept as a class attribute aliased to the default model
      dimension so tests that reference `Embedder.DIM` still work; new code
      should prefer `Embedder().DIM` (instance attr) for correctness.
    - `Embedder.DEFAULT_MODEL` is the HF id of the default model
      (bge-small-en-v1.5).
    """

    DEFAULT_MODEL_KEY: str = DEFAULT_MODEL_KEY
    DEFAULT_DIM: int = MODEL_REGISTRY[DEFAULT_MODEL_KEY]["dim"]

    # Legacy class-level attributes (Phase 1 test compatibility).
    # New code should construct Embedder() and read .DIM from the instance.
    DEFAULT_MODEL: str = MODEL_REGISTRY[DEFAULT_MODEL_KEY]["hf"]
    DIM: int = DEFAULT_DIM

    def __init__(
        self,
        model_key: str | None = None,
        *,
        model_name: str | None = None,
    ) -> None:
        """Initialise an Embedder.

        Parameters
        ----------
        model_key:
            Logical key from MODEL_REGISTRY ("bge-m3" | "multilingual-e5-small"
            | "bge-small-en-v1.5" | "all-MiniLM-L6-v2"). If None, uses
            IAI_MCP_EMBED_MODEL env var or the registry default.
        model_name:
            Legacy parameter: full HuggingFace repo id (e.g.
            "BAAI/bge-small-en-v1.5"). Prefer model_key for new code. If both
            are provided, model_key wins. When only model_name is given it is
            used directly and the env var is not consulted.

        Raises
        ------
        ValueError
            If the key (explicit or from the env var) is not in MODEL_REGISTRY,
            or if model_name matches no registry entry's HF id.
        """
        if model_key is None and model_name is not None:
            # Reverse-lookup: find the key whose hf matches this name.
            match = next(
                (k for k, v in MODEL_REGISTRY.items() if v["hf"] == model_name),
                None,
            )
            if match is None:
                raise ValueError(
                    f"model_name {model_name!r} is not in MODEL_REGISTRY; "
                    f"valid hf ids: {[v['hf'] for v in MODEL_REGISTRY.values()]}"
                )
            key = match
        else:
            key = _resolve_model_key(model_key)

        self.model_key: str = key
        spec = MODEL_REGISTRY[key]
        self.model_name: str = spec["hf"]
        self.DIM: int = int(spec["dim"])  # instance attr overrides class attr
        self._model = _get_model(self.model_name)

    def embed(self, text: str) -> list[float]:
        """Encode a single string to a DIM-length list[float]. Normalised, deterministic."""
        vec = self._model.encode(
            text, normalize_embeddings=True, show_progress_bar=False
        )
        return vec.tolist()

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Batch-encode preserving input order. Returns N vectors for N inputs.

        An empty input returns ``[]`` without invoking the model.
        """
        batch = list(texts)
        if not batch:
            # Nothing to encode: skip the model call entirely.
            return []
        vecs = self._model.encode(
            batch,
            normalize_embeddings=True,
            show_progress_bar=False,
            batch_size=32,
        )
        return [v.tolist() for v in vecs]


def embedder_for_store(store) -> "Embedder":
    """Store-aware Embedder factory.

    Picks the model whose output dim matches the existing LanceDB records
    schema, so a legacy 1024d store from the pre-Plan-05-08 bge-m3 era stays
    queryable until it is re-embedded down to the 384d English-Only-Brain
    default.

    Resolution order:
      1. If store.embed_dim has an exact match in MODEL_REGISTRY, prefer the
         model whose logical key name indicates the canonical model at that
         dim (bge-small-en-v1.5 for 384d default; bge-m3 for legacy/opt-in
         1024d). For any other dim, the first registry entry with that dim
         is used.
      2. Otherwise fall through to the env/registry default via Embedder().

    This decouples runtime model selection from a global env var so a single
    process can operate multiple stores at different dims while the migration
    from a legacy 1024d store down to 384d completes.
    """
    target_dim = getattr(store, "embed_dim", None)
    if target_dim is None:
        return Embedder()

    dim = int(target_dim)
    preferred = {384: "bge-small-en-v1.5", 1024: "bge-m3"}
    key = preferred.get(dim)
    if key is None or key not in MODEL_REGISTRY:
        # No canonical model for this dim: take the first registry entry
        # whose declared dim matches, if any.
        key = next(
            (k for k, spec in MODEL_REGISTRY.items() if int(spec["dim"]) == dim),
            None,
        )
    if key is None:
        return Embedder()

    # Tests and migrations may monkey-patch `Embedder` with a stub that takes no
    # kwargs. Fall back to the zero-arg form in that case so the fake surface
    # stays compatible; real production code still respects store.embed_dim.
    # Only the constructor call is guarded so unrelated TypeErrors surface.
    try:
        return Embedder(model_key=key)
    except TypeError:
        return Embedder()