Initial release: iai-mcp v0.1.0

Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
2026-05-06 01:04:47 -07:00 · 2026-05-06 01:04:47 -07:00 · f6b876fbe7
commit f6b876fbe7
332 changed files with 97258 additions and 0 deletions
--- a/src/iai_mcp/embed.py
+++ b/src/iai_mcp/embed.py
@ -0,0 +1,193 @@
+"""Embedding layer -- configurable embedder with a 4-model registry.
+
+Plan 05-08 (2026-04-20): the DEFAULT is now ``bge-small-en-v1.5`` (384d
+English-only), reverting the Phase-2 deviation. PROJECT.md line
+125 always specified bge-small-en-v1.5 as the intended default; Phase-2
+swapped in bge-m3 (1024d multilingual) as D-08a. User directive
+2026-04-19: the brain stores English, surface translation is Claude's
+job. bge-m3 stays selectable via env var / kwarg for anyone who needs
+multilingual semantic match at the 5x RAM cost.
+
+Configurable 4-model registry:
+- "bge-m3"                 -> BAAI/bge-m3               -> 1024d (opt-in, multilingual)
+- "multilingual-e5-small"  -> intfloat/multilingual-e5-small -> 384d (compromise)
+- "bge-small-en-v1.5"      -> BAAI/bge-small-en-v1.5    -> 384d (DEFAULT, English)
+- "all-MiniLM-L6-v2"       -> sentence-transformers/all-MiniLM-L6-v2 -> 384d (English alternative embedder option; included for compatibility testing)
+
+Selection priority at Embedder() instantiation:
+1. Explicit `model_key` constructor arg
+2. IAI_MCP_EMBED_MODEL environment variable
+3. MODEL_REGISTRY default ("bge-small-en-v1.5")
+
+The model is loaded once per process and cached in a module-level dict so
+multiple Embedder() instances share the underlying SentenceTransformer.
+
+Deterministic: `normalize_embeddings=True` is always passed,
+`show_progress_bar=False`. Same input text always produces the same output
+vector across calls within a process.
+"""
+from __future__ import annotations
+
+import os
+import threading
+
+from sentence_transformers import SentenceTransformer
+
+
+# 4-model registry. Name convention: short logical key -> HF repo id + dim.
+# Each entry holds "hf" (the HuggingFace repo id used to load the model) and
+# "dim" (the declared embedding dimension, surfaced as Embedder.DIM).
+# Insertion order is observable: embedder_for_store() walks the registry in
+# order when it falls back to a dimension match.
+# (2026-04-29): all-MiniLM-L6-v2 was added as an additive ablation entry;
+# DEFAULT_MODEL_KEY is unchanged (English-Only Brain lock from Plan 05-08).
+MODEL_REGISTRY: dict[str, dict] = {
+    "bge-m3": {"hf": "BAAI/bge-m3", "dim": 1024},
+    "multilingual-e5-small": {"hf": "intfloat/multilingual-e5-small", "dim": 384},
+    "bge-small-en-v1.5": {"hf": "BAAI/bge-small-en-v1.5", "dim": 384},
+    "all-MiniLM-L6-v2": {"hf": "sentence-transformers/all-MiniLM-L6-v2", "dim": 384},
+}
+# Logical key used when neither a constructor argument nor the
+# IAI_MCP_EMBED_MODEL environment variable selects a model.
+DEFAULT_MODEL_KEY = "bge-small-en-v1.5"
+
+
+def _resolve_model_key(model_key: str | None = None) -> str:
+    if model_key is not None:
+        if model_key not in MODEL_REGISTRY:
+            raise ValueError(
+                f"unknown embed model key {model_key!r}; valid: {sorted(MODEL_REGISTRY)}"
+            )
+        return model_key
+    env_key = os.environ.get("IAI_MCP_EMBED_MODEL")
+    if env_key:
+        if env_key not in MODEL_REGISTRY:
+            raise ValueError(
+                f"unknown embed model key {env_key!r} from IAI_MCP_EMBED_MODEL; "
+                f"valid: {sorted(MODEL_REGISTRY)}"
+            )
+        return env_key
+    return DEFAULT_MODEL_KEY
+
+
+# Held by _get_model() around every lookup and insertion on _MODEL_CACHE.
+_MODEL_LOCK = threading.Lock()
+# Loaded models keyed by HuggingFace repo id; module-level, so every Embedder
+# created in this process reuses the same SentenceTransformer per id.
+_MODEL_CACHE: dict[str, SentenceTransformer] = {}
+
+
+def _get_model(hf_id: str) -> SentenceTransformer:
+    """Process-local lazy-load + cache. Thread-safe via lock around cache mutation."""
+    with _MODEL_LOCK:
+        if hf_id not in _MODEL_CACHE:
+            _MODEL_CACHE[hf_id] = SentenceTransformer(hf_id)
+        return _MODEL_CACHE[hf_id]
+
+
+class Embedder:
+    """English-Only Brain embedder with a configurable model registry.
+
+    Default model is `bge-small-en-v1.5` (384d, English) per Plan 05-08.
+    Used by the retrieval pipeline (stage 1, cue embedding) and by session-start
+    assembler. `.DIM` is per-instance (varies by model). `.DEFAULT_DIM` is a
+    class-level default pointing at the registry's default model dimension.
+
+    The opt-in `bge-m3` (1024d multilingual) path stays in the registry for
+    users who explicitly need multilingual semantic match at the 5x RAM cost,
+    but it is opt-in via `IAI_MCP_EMBED_MODEL=bge-m3` — not the product.
+
+    Backward compatibility:
+    - `Embedder.DIM` is kept as a class attribute aliased to the default model
+      dimension so tests that reference `Embedder.DIM` still work; new
+      code should prefer `Embedder().DIM` (instance attr) for correctness.
+    - `Embedder.DEFAULT_MODEL` is the HF id of the default model (bge-small-en-v1.5).
+    """
+
+    DEFAULT_MODEL_KEY: str = DEFAULT_MODEL_KEY
+    DEFAULT_DIM: int = MODEL_REGISTRY[DEFAULT_MODEL_KEY]["dim"]
+    # Legacy class-level attributes (Phase 1 test compatibility).
+    # New code should construct Embedder() and read .DIM from the instance.
+    DEFAULT_MODEL: str = MODEL_REGISTRY[DEFAULT_MODEL_KEY]["hf"]
+    DIM: int = DEFAULT_DIM
+
+    def __init__(
+        self,
+        model_key: str | None = None,
+        *,
+        model_name: str | None = None,
+    ) -> None:
+        """Initialise an Embedder.
+
+        Parameters
+        ----------
+        model_key:
+            Logical key from MODEL_REGISTRY ("bge-m3" | "multilingual-e5-small" |
+            "bge-small-en-v1.5"). If None, uses IAI_MCP_EMBED_MODEL env var or
+            the registry default.
+        model_name:
+            Legacy parameter: full HuggingFace repo id (e.g. "BAAI/bge-small-en-v1.5").
+            Prefer model_key for new code. If both are provided, model_key wins.
+        """
+        if model_key is None and model_name is not None:
+            # Reverse-lookup: find the key whose hf matches this name.
+            match = next(
+                (k for k, v in MODEL_REGISTRY.items() if v["hf"] == model_name),
+                None,
+            )
+            if match is None:
+                raise ValueError(
+                    f"model_name {model_name!r} is not in MODEL_REGISTRY; "
+                    f"valid hf ids: {[v['hf'] for v in MODEL_REGISTRY.values()]}"
+                )
+            key = match
+        else:
+            key = _resolve_model_key(model_key)
+        self.model_key: str = key
+        spec = MODEL_REGISTRY[key]
+        self.model_name: str = spec["hf"]
+        self.DIM: int = int(spec["dim"])  # instance attr overrides class attr
+        self._model = _get_model(self.model_name)
+
+    def embed(self, text: str) -> list[float]:
+        """Encode a single string to a DIM-length list[float]. Normalised, deterministic."""
+        vec = self._model.encode(
+            text, normalize_embeddings=True, show_progress_bar=False
+        )
+        return vec.tolist()
+
+    def embed_batch(self, texts: list[str]) -> list[list[float]]:
+        """Batch-encode preserving input order. Returns N vectors for N inputs."""
+        vecs = self._model.encode(
+            list(texts),
+            normalize_embeddings=True,
+            show_progress_bar=False,
+            batch_size=32,
+        )
+        return [v.tolist() for v in vecs]
+
+
+def embedder_for_store(store) -> "Embedder":
+    """Store-aware Embedder factory. Picks the model whose output dim matches
+    the existing LanceDB records schema, so a legacy 1024d store from the
+    pre-Plan-05-08 bge-m3 era stays queryable until it is re-embedded down to
+    the 384d English-Only-Brain default.
+
+    Resolution order:
+    1. If store.embed_dim has an exact match in MODEL_REGISTRY, prefer the
+       model whose logical key name indicates the canonical model at that dim
+       (bge-small-en-v1.5 for 384d default; bge-m3 for legacy/opt-in 1024d).
+    2. Otherwise fall through to the env/registry default via Embedder().
+
+    This decouples runtime model selection from a global env var so a single
+    process can operate multiple stores at different dims while the migration
+    from a legacy 1024d store down to 384d completes.
+    """
+    target_dim = getattr(store, "embed_dim", None)
+    if target_dim is None:
+        return Embedder()
+    preferred = {384: "bge-small-en-v1.5", 1024: "bge-m3"}
+    key = preferred.get(int(target_dim))
+    # Tests and migrations may monkey-patch `Embedder` with a stub that takes no
+    # kwargs. Fall back to the zero-arg form in that case so the fake surface
+    # stays compatible; real production code still respects store.embed_dim.
+    try:
+        if key is not None and key in MODEL_REGISTRY:
+            return Embedder(model_key=key)
+        for reg_key, spec in MODEL_REGISTRY.items():
+            if int(spec["dim"]) == int(target_dim):
+                return Embedder(model_key=reg_key)
+    except TypeError:
+        pass
+    return Embedder()