Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
180
tests/test_schema_multilingual.py
Normal file
180
tests/test_schema_multilingual.py
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
"""Tests for 02-REVIEW.md (persist_schema hardcodes language='en' --
|
||||
D-08a constitutional violation for multilingual users).
|
||||
|
||||
Bug: every schema hub record was created with language='en' regardless of
|
||||
the language of the source cluster. A user storing Russian records saw
|
||||
schema hubs derived from their Russian clusters tagged as English, so
|
||||
language-filtered retrieval ('ru' filter) missed their own schemas.
|
||||
|
||||
Fix:
|
||||
- Add helper _majority_language(evidence_ids, store) -> str. Tie-break
|
||||
is deterministic (max with key=count on a stable input order).
|
||||
- persist_schema derives language from the helper; fallback 'en' only
|
||||
when evidence is empty or all evidence records are missing.
|
||||
|
||||
Constitutional contract (D-08a native-language storage):
|
||||
Records are stored in the language they were recorded in. This extends
|
||||
to derived records (schema hubs). mandates 7+ language support;
|
||||
hardcoded 'en' broke the contract silently.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from uuid import uuid4
|
||||
|
||||
import pytest
|
||||
|
||||
from iai_mcp.schema import SchemaCandidate, persist_schema
|
||||
from iai_mcp.store import MemoryStore
|
||||
from iai_mcp.types import EMBED_DIM, MemoryRecord
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- helpers
|
||||
|
||||
|
||||
def _rec(*, language: str, text: str = "seed") -> MemoryRecord:
|
||||
now = datetime.now(timezone.utc)
|
||||
return MemoryRecord(
|
||||
id=uuid4(),
|
||||
tier="episodic",
|
||||
literal_surface=text,
|
||||
aaak_index="",
|
||||
embedding=[1.0] + [0.0] * (EMBED_DIM - 1),
|
||||
community_id=None,
|
||||
centrality=0.0,
|
||||
detail_level=2,
|
||||
pinned=False,
|
||||
stability=0.5,
|
||||
difficulty=0.3,
|
||||
last_reviewed=None,
|
||||
never_decay=False,
|
||||
never_merge=False,
|
||||
provenance=[],
|
||||
created_at=now,
|
||||
updated_at=now,
|
||||
tags=[],
|
||||
language=language,
|
||||
)
|
||||
|
||||
|
||||
def _seed_cluster(
|
||||
store: MemoryStore,
|
||||
lang_counts: dict[str, int],
|
||||
) -> list[uuid4]:
|
||||
"""Insert N records per language. Returns the list of evidence ids in
|
||||
INSERT ORDER (deterministic tie-break)."""
|
||||
evidence: list = []
|
||||
for lang, count in lang_counts.items():
|
||||
for i in range(count):
|
||||
r = _rec(language=lang, text=f"{lang}_seed_{i}")
|
||||
store.insert(r)
|
||||
evidence.append(r.id)
|
||||
return evidence
|
||||
|
||||
|
||||
# ================================================= core cases
|
||||
|
||||
|
||||
def test_persist_schema_derives_language_from_majority_evidence(tmp_path):
|
||||
"""5 ru + 2 en + 1 ja evidence -> schema.language == 'ru'."""
|
||||
store = MemoryStore(path=tmp_path)
|
||||
evidence = _seed_cluster(store, {"ru": 5, "en": 2, "ja": 1})
|
||||
|
||||
cand = SchemaCandidate(
|
||||
pattern="tags:tech+python",
|
||||
confidence=0.9,
|
||||
evidence_count=len(evidence),
|
||||
evidence_ids=list(evidence),
|
||||
status="auto",
|
||||
)
|
||||
schema_id = persist_schema(store, cand)
|
||||
|
||||
fresh = store.get(schema_id)
|
||||
assert fresh is not None
|
||||
assert fresh.language == "ru", (
|
||||
f"persist_schema must read majority language from evidence, got {fresh.language!r}"
|
||||
)
|
||||
|
||||
|
||||
def test_persist_schema_fallback_en_on_empty_evidence(tmp_path):
|
||||
"""No evidence -> fallback to 'en' (Phase-1 default, safe)."""
|
||||
store = MemoryStore(path=tmp_path)
|
||||
cand = SchemaCandidate(
|
||||
pattern="tags:orphan",
|
||||
confidence=0.9,
|
||||
evidence_count=0,
|
||||
evidence_ids=[],
|
||||
status="auto",
|
||||
)
|
||||
schema_id = persist_schema(store, cand)
|
||||
fresh = store.get(schema_id)
|
||||
assert fresh is not None
|
||||
assert fresh.language == "en"
|
||||
|
||||
|
||||
def test_persist_schema_tie_is_deterministic(tmp_path):
|
||||
"""3 ru + 3 en (tied) -> deterministic winner governed by input order.
|
||||
max(..., key=list.count) with a list preserves first-seen-wins; 'ru'
|
||||
inserted first wins the tie."""
|
||||
store = MemoryStore(path=tmp_path)
|
||||
evidence = _seed_cluster(store, {"ru": 3, "en": 3})
|
||||
|
||||
cand = SchemaCandidate(
|
||||
pattern="tags:tied",
|
||||
confidence=0.9,
|
||||
evidence_count=len(evidence),
|
||||
evidence_ids=list(evidence),
|
||||
status="auto",
|
||||
)
|
||||
schema_id = persist_schema(store, cand)
|
||||
fresh = store.get(schema_id)
|
||||
assert fresh is not None
|
||||
# Tie-break: first distinct language in the evidence list wins.
|
||||
# Seeded as {ru:3, en:3} in that order -> 'ru' appears first.
|
||||
assert fresh.language == "ru"
|
||||
|
||||
|
||||
def test_persist_schema_ignores_missing_evidence_records(tmp_path):
|
||||
"""evidence_ids can point to records that were deleted/never existed.
|
||||
The helper must filter those out gracefully and use only the surviving
|
||||
records' language values."""
|
||||
store = MemoryStore(path=tmp_path)
|
||||
|
||||
# Seed 2 real records in Japanese
|
||||
surviving = _seed_cluster(store, {"ja": 2})
|
||||
|
||||
# Add 3 phantom ids that were never inserted
|
||||
phantom_ids = [uuid4() for _ in range(3)]
|
||||
|
||||
cand = SchemaCandidate(
|
||||
pattern="tags:graceful",
|
||||
confidence=0.85,
|
||||
evidence_count=5,
|
||||
evidence_ids=list(surviving) + phantom_ids,
|
||||
status="auto",
|
||||
)
|
||||
schema_id = persist_schema(store, cand)
|
||||
fresh = store.get(schema_id)
|
||||
assert fresh is not None
|
||||
# Only the 2 surviving Japanese records contribute -> 'ja'
|
||||
assert fresh.language == "ja", (
|
||||
f"persist_schema must ignore missing evidence records, got {fresh.language!r}"
|
||||
)
|
||||
|
||||
|
||||
def test_persist_schema_no_hardcoded_english(tmp_path):
|
||||
"""Structural guard: persist_schema source must not carry `language='en'`
|
||||
hardcoded; it must route language through _majority_language."""
|
||||
import inspect
|
||||
from iai_mcp import schema as schema_mod
|
||||
|
||||
src = inspect.getsource(schema_mod.persist_schema)
|
||||
assert "language=\"en\"," not in src, (
|
||||
"persist_schema still hardcodes language='en'"
|
||||
)
|
||||
assert "_majority_language" in src, (
|
||||
"persist_schema must call _majority_language to derive schema language"
|
||||
)
|
||||
assert hasattr(schema_mod, "_majority_language"), (
|
||||
"_majority_language helper must exist at schema.py module scope"
|
||||
)
|
||||
Loading…
Add table
Add a link
Reference in a new issue