Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
151 lines
4.6 KiB
Python
151 lines
4.6 KiB
Python
"""Tests for the multilingual embedder path in the 3-model registry.
|
|
|
|
Plan 05-08 (2026-04-20) flipped the DEFAULT to bge-small-en-v1.5 (384d
|
|
English-only). bge-m3 remains selectable via env var or explicit
|
|
``Embedder(model_key="bge-m3")`` — these tests pin the key explicitly
|
|
so the multilingual coverage keeps running under the new default.
|
|
|
|
These tests import SentenceTransformer and pull the bge-m3 weights once on
|
|
first run (HuggingFace cache is re-used thereafter). If bge-m3 is already
|
|
cached by any previous dev session the test runs in seconds.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
|
|
# ------------------------------------------------------------- bge-m3 opt-in
|
|
|
|
|
|
def test_bge_m3_opt_in_produces_1024d() -> None:
|
|
"""Explicit Embedder(model_key="bge-m3") still yields the multilingual
|
|
1024d path after Plan 05-08's default revert."""
|
|
from iai_mcp.embed import Embedder
|
|
|
|
e = Embedder(model_key="bge-m3")
|
|
assert e.model_key == "bge-m3"
|
|
assert e.model_name == "BAAI/bge-m3"
|
|
assert e.DIM == 1024
|
|
|
|
|
|
def test_bge_m3_embeds_english() -> None:
|
|
from iai_mcp.embed import Embedder
|
|
|
|
e = Embedder(model_key="bge-m3")
|
|
v = e.embed("Hello, how are you?")
|
|
assert len(v) == 1024
|
|
# bge-m3 returns normalised vectors (|v| == 1)
|
|
n = float(np.linalg.norm(np.asarray(v)))
|
|
assert abs(n - 1.0) < 1e-4
|
|
|
|
|
|
def test_bge_m3_embeds_russian() -> None:
|
|
from iai_mcp.embed import Embedder
|
|
|
|
e = Embedder(model_key="bge-m3")
|
|
v = e.embed("Привет, как дела?")
|
|
assert len(v) == 1024
|
|
n = float(np.linalg.norm(np.asarray(v)))
|
|
assert abs(n - 1.0) < 1e-4
|
|
|
|
|
|
def test_bge_m3_embeds_japanese() -> None:
|
|
from iai_mcp.embed import Embedder
|
|
|
|
e = Embedder(model_key="bge-m3")
|
|
v = e.embed("こんにちは、今日は元気ですか?")
|
|
assert len(v) == 1024
|
|
n = float(np.linalg.norm(np.asarray(v)))
|
|
assert abs(n - 1.0) < 1e-4
|
|
|
|
|
|
def test_bge_m3_cross_language_similarity() -> None:
|
|
"""bge-m3 encodes cross-lingual concepts. Pinned explicitly because
|
|
Plan 05-08's default is now English-only bge-small."""
|
|
from iai_mcp.embed import Embedder
|
|
|
|
e = Embedder(model_key="bge-m3")
|
|
en = np.asarray(e.embed("hello"))
|
|
ru = np.asarray(e.embed("привет"))
|
|
cos = float(en @ ru / (np.linalg.norm(en) * np.linalg.norm(ru)))
|
|
assert cos > 0.5, f"cross-language cosine too low: {cos}"
|
|
|
|
|
|
# ----------------------------------------------------------- env-var selection
|
|
|
|
|
|
def test_embed_model_selectable_via_env(monkeypatch) -> None:
|
|
"""IAI_MCP_EMBED_MODEL selects from the 3-model registry."""
|
|
import importlib
|
|
|
|
# Clear the process-level cache so re-import exposes the correct default.
|
|
import iai_mcp.embed as embed_mod
|
|
|
|
monkeypatch.setenv("IAI_MCP_EMBED_MODEL", "bge-small-en-v1.5")
|
|
importlib.reload(embed_mod)
|
|
e = embed_mod.Embedder()
|
|
assert e.model_key == "bge-small-en-v1.5"
|
|
assert e.DIM == 384
|
|
|
|
# Restore default for remaining tests.
|
|
monkeypatch.delenv("IAI_MCP_EMBED_MODEL", raising=False)
|
|
importlib.reload(embed_mod)
|
|
|
|
|
|
def test_embed_model_explicit_key_overrides_env(monkeypatch) -> None:
|
|
from iai_mcp.embed import Embedder
|
|
|
|
monkeypatch.setenv("IAI_MCP_EMBED_MODEL", "bge-m3")
|
|
e = Embedder(model_key="bge-small-en-v1.5")
|
|
# Explicit key wins over env.
|
|
assert e.model_key == "bge-small-en-v1.5"
|
|
assert e.DIM == 384
|
|
|
|
|
|
def test_embed_model_dimension_registered() -> None:
|
|
"""Registry reports the correct DIM for every entry."""
|
|
from iai_mcp.embed import MODEL_REGISTRY
|
|
|
|
assert MODEL_REGISTRY["bge-m3"]["dim"] == 1024
|
|
assert MODEL_REGISTRY["multilingual-e5-small"]["dim"] == 384
|
|
assert MODEL_REGISTRY["bge-small-en-v1.5"]["dim"] == 384
|
|
|
|
|
|
def test_embed_model_rejects_unknown_key() -> None:
|
|
from iai_mcp.embed import Embedder
|
|
|
|
with pytest.raises(ValueError):
|
|
Embedder(model_key="this-model-does-not-exist")
|
|
|
|
|
|
def test_embed_model_rejects_unknown_env(monkeypatch) -> None:
|
|
from iai_mcp.embed import Embedder
|
|
|
|
monkeypatch.setenv("IAI_MCP_EMBED_MODEL", "garbage")
|
|
with pytest.raises(ValueError):
|
|
Embedder()
|
|
|
|
|
|
# ------------------------------------------------------- batch + determinism
|
|
|
|
|
|
def test_embed_batch_preserves_order_and_dim() -> None:
|
|
from iai_mcp.embed import Embedder
|
|
|
|
e = Embedder(model_key="bge-m3")
|
|
texts = ["one", "два", "三"]
|
|
vecs = e.embed_batch(texts)
|
|
assert len(vecs) == 3
|
|
assert all(len(v) == 1024 for v in vecs)
|
|
|
|
|
|
def test_embed_deterministic_same_input() -> None:
|
|
from iai_mcp.embed import Embedder
|
|
|
|
e = Embedder()
|
|
a = e.embed("deterministic test")
|
|
b = e.embed("deterministic test")
|
|
assert a == b
|