Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
341
tests/test_schema_dedup.py
Normal file
341
tests/test_schema_dedup.py
Normal file
|
|
@ -0,0 +1,341 @@
|
|||
"""Tests for R1 — schema-pattern dedup in persist_schema.

Locked decisions covered (06-CONTEXT.md):
- persist_schema dedups by tag `pattern:{candidate.pattern}` against
  existing tier="semantic" records; reinforces schema_instance_of edges
  onto the keeper instead of inserting a duplicate row.
- new event kind `schema_reinforced` with payload
  `{schema_id, pattern, evidence_added, total_evidence}`; severity "info";
  source_ids `[keeper_id, *new_evidence_ids[:5]]`.
- single test file, pytest convention (`tmp_path` LanceDB root).

R1 acceptance (06-SPEC.md): N persist_schema calls for the same pattern
collapse to ONE schema record, with the keeper's incoming
`schema_instance_of` edge count equal to the cumulative distinct evidence
count across all calls.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from uuid import uuid4
|
||||
|
||||
import pytest
|
||||
|
||||
from iai_mcp.events import query_events
|
||||
from iai_mcp.store import EDGES_TABLE, MemoryStore
|
||||
from iai_mcp.types import EMBED_DIM, MemoryRecord
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- helpers
|
||||
|
||||
|
||||
def _rec(
    *,
    text: str = "t",
    tags: list[str] | None = None,
    language: str = "en",
    tier: str = "episodic",
    detail_level: int = 2,
) -> MemoryRecord:
    """Build a MemoryRecord filled with placeholder values for tests.

    Only ``text`` (stored as ``literal_surface``), ``tags``, ``language``,
    ``tier`` and ``detail_level`` are configurable; every other field is
    fixed. Each call gets a fresh UUID and a single UTC timestamp shared by
    ``created_at`` and ``updated_at``.
    """
    stamp = datetime.now(timezone.utc)
    # Embedding is 1.0 followed by EMBED_DIM - 1 zeros.
    unit_embedding = [1.0, *([0.0] * (EMBED_DIM - 1))]
    # Always hand the record its own list, never the caller's object.
    own_tags = list(tags) if tags else []
    return MemoryRecord(
        id=uuid4(),
        tier=tier,
        literal_surface=text,
        aaak_index="",
        embedding=unit_embedding,
        community_id=None,
        centrality=0.0,
        detail_level=detail_level,
        pinned=False,
        stability=0.0,
        difficulty=0.0,
        last_reviewed=None,
        never_decay=False,
        never_merge=False,
        provenance=[],
        created_at=stamp,
        updated_at=stamp,
        tags=own_tags,
        language=language,
    )
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _patch_embedder(monkeypatch):
    """Swap ``iai_mcp.embed.Embedder`` for a stub in every test.

    Keeps the dedup tests from loading the real bge-m3 model; the stub
    returns the same fixed vector for any input text.
    """
    from iai_mcp import embed as embed_mod

    class _FakeEmbedder:
        # Class-level attributes that the stub exposes in place of the
        # real embedder's.
        DIM = EMBED_DIM
        DEFAULT_DIM = EMBED_DIM
        DEFAULT_MODEL_KEY = "fake"

        def __init__(self, *args, **kwargs):
            # Accept and ignore any constructor arguments.
            self.DIM = EMBED_DIM

        def embed(self, text: str) -> list[float]:
            # 1.0 followed by EMBED_DIM - 1 zeros, regardless of `text`.
            vector = [0.0] * (EMBED_DIM - 1)
            return [1.0] + vector

        def embed_batch(self, texts):
            vectors = []
            for item in texts:
                vectors.append(self.embed(item))
            return vectors

    monkeypatch.setattr(embed_mod, "Embedder", _FakeEmbedder)
    yield
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- Task 1: events taxonomy + write-event smoke
|
||||
|
||||
|
||||
def test_events_module_docstring_lists_schema_reinforced():
    """The `iai_mcp.events` module docstring must mention `schema_reinforced`."""
    import iai_mcp.events as events_mod

    module_doc = events_mod.__doc__
    if not module_doc:
        # A missing docstring is treated as empty so the assertion below
        # fails with the explanatory message instead of a TypeError.
        module_doc = ""

    failure_hint = (
        "events.py module docstring missing `schema_reinforced` taxonomy entry "
        "(Plan 06-01 D-10). Add a additions block after the "
        "section listing the new event kind, payload schema, and source_ids note."
    )
    assert "schema_reinforced" in module_doc, failure_hint
|
||||
|
||||
|
||||
def test_write_event_accepts_schema_reinforced_kind(tmp_path):
    """A `schema_reinforced` event written via write_event is returned by query_events."""
    from iai_mcp.events import write_event

    store = MemoryStore(path=tmp_path)
    keeper_id = uuid4()
    ev_id = uuid4()
    event_kind = "schema_reinforced"
    sent = {
        "schema_id": str(keeper_id),
        "pattern": "tags:capture+role:user",
        "evidence_added": 1,
        "total_evidence": 5,
    }

    # Pass a copy so the expectations in `sent` cannot be altered by the call.
    write_event(
        store,
        kind=event_kind,
        data=dict(sent),
        severity="info",
        source_ids=[keeper_id, ev_id],
    )

    rows = query_events(store, kind=event_kind)
    assert len(rows) == 1

    row = rows[0]
    assert row["kind"] == event_kind
    assert row["severity"] == "info"

    # Every payload field must come back exactly as it was written.
    payload = row["data"]
    for field in ("pattern", "evidence_added", "total_evidence", "schema_id"):
        assert payload[field] == sent[field]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- Task 2: persist_schema dedup branch (R1)
|
||||
|
||||
|
||||
def _seed_evidence(store: MemoryStore, n: int) -> list[MemoryRecord]:
    """Create and insert ``n`` evidence records, returning them in insertion order.

    Records are built with ``_rec`` (default episodic tier), named
    ``ev0 .. ev{n-1}``, and tagged ``capture`` / ``role:user`` so they match
    the `tags:capture+role:user` pattern used by the schema tests.
    """
    batch: list[MemoryRecord] = []
    for idx in range(n):
        batch.append(_rec(text=f"ev{idx}", tags=["capture", "role:user"]))

    # Insert only after the whole batch has been built.
    for record in batch:
        store.insert(record)
    return batch
|
||||
|
||||
|
||||
def test_persist_schema_dedups_same_pattern(tmp_path):
    """R1: ten persist_schema calls with one pattern leave a single schema record."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)
    pattern = "tags:capture+role:user"
    pattern_tag = "pattern:" + pattern

    # Each call brings one freshly seeded evidence record for the same pattern.
    for _ in range(10):
        seeded = _seed_evidence(store, 1)
        candidate = SchemaCandidate(
            pattern=pattern,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[seeded[0].id],
            status="auto",
        )
        persist_schema(store, candidate)

    # Collect every semantic-tier record carrying this pattern's tag.
    survivors = []
    for record in store.all_records():
        if record.tier != "semantic":
            continue
        if pattern_tag in (record.tags or []):
            survivors.append(record)

    found = len(survivors)
    assert found == 1, (
        f"expected exactly one schema for pattern {pattern!r}, got {found}"
    )
|
||||
|
||||
|
||||
def test_persist_schema_reinforces_edges_on_dedup(tmp_path):
    """R1: the keeper has one schema_instance_of edge per evidence record supplied."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)
    pattern = "tags:capture+role:user"
    pattern_tag = "pattern:" + pattern

    keeper_id = None
    cumulative_evidence = 0
    for _ in range(10):
        seeded = _seed_evidence(store, 1)
        candidate = SchemaCandidate(
            pattern=pattern,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[seeded[0].id],
            status="auto",
        )
        sid = persist_schema(store, candidate)
        # The id returned by the first call identifies the keeper.
        if not keeper_id:
            keeper_id = sid
        cumulative_evidence += 1

    # Per the store's edge handling, boost_edges stores (src, dst) as a
    # sorted pair, so the keeper may land in either column depending on how
    # its UUID string orders against the evidence UUID. Matching on either
    # column counts each incident edge once, since a given row holds the
    # keeper in only one of the two columns.
    edges_df = store.db.open_table(EDGES_TABLE).to_pandas()
    keeper_str = str(keeper_id)
    is_instance_edge = edges_df["edge_type"] == "schema_instance_of"
    keeper_as_dst = edges_df["dst"] == keeper_str
    keeper_as_src = edges_df["src"] == keeper_str
    incident = edges_df[is_instance_edge & (keeper_as_dst | keeper_as_src)]

    edge_count = len(incident)
    assert edge_count == cumulative_evidence, (
        f"expected {cumulative_evidence} schema_instance_of edges incident on keeper, "
        f"got {edge_count}"
    )

    # Sanity: exactly one keeper survives.
    keepers = []
    for record in store.all_records():
        if record.tier == "semantic" and pattern_tag in (record.tags or []):
            keepers.append(record)
    assert len(keepers) == 1
|
||||
|
||||
|
||||
def test_persist_schema_emits_schema_reinforced_event(tmp_path):
    """R1: ten calls for one pattern yield 1 induction event and 9 reinforced events."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)
    pattern = "tags:capture+role:user"

    for _ in range(10):
        seeded = _seed_evidence(store, 1)
        candidate = SchemaCandidate(
            pattern=pattern,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[seeded[0].id],
            status="auto",
        )
        persist_schema(store, candidate)

    def _for_pattern(events):
        # Keep only the events whose payload names the pattern under test.
        return [ev for ev in events if ev["data"].get("pattern") == pattern]

    induction_events = query_events(store, kind="schema_induction_run")
    reinforced_events = query_events(store, kind="schema_reinforced", limit=100)
    matching_inductions = _for_pattern(induction_events)
    matching_reinforcements = _for_pattern(reinforced_events)

    induction_count = len(matching_inductions)
    assert induction_count == 1, (
        f"expected 1 schema_induction_run event, got {induction_count}"
    )
    reinforcement_count = len(matching_reinforcements)
    assert reinforcement_count == 9, (
        f"expected 9 schema_reinforced events, got {reinforcement_count}"
    )

    # Every reinforcement payload must carry the full documented shape.
    totals = []
    for event in matching_reinforcements:
        body = event["data"]
        assert "schema_id" in body
        assert body["pattern"] == pattern
        assert isinstance(body["evidence_added"], int)
        assert isinstance(body["total_evidence"], int)
        totals.append(body["total_evidence"])

    # This test relies on query_events returning newest first, so the running
    # total must never increase while walking the list front to back.
    descending = sorted(totals, reverse=True)
    assert totals == descending, (
        f"total_evidence should grow over time; saw {totals}"
    )
|
||||
|
||||
|
||||
def test_persist_schema_returns_keeper_id(tmp_path):
    """R1: every persist_schema call for one pattern returns the same id."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)
    pattern = "tags:capture+role:user"

    returned_ids = []
    for _ in range(10):
        seeded = _seed_evidence(store, 1)
        candidate = SchemaCandidate(
            pattern=pattern,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[seeded[0].id],
            status="auto",
        )
        sid = persist_schema(store, candidate)
        returned_ids.append(sid)

    # The id from the first call is the reference all later ids must equal.
    first = returned_ids[0]
    every_call_matches = True
    for rid in returned_ids:
        if not (rid == first):
            every_call_matches = False
            break
    assert every_call_matches, (
        f"persist_schema should return the keeper id on every call; got {returned_ids}"
    )
|
||||
|
||||
|
||||
def test_persist_schema_does_not_collapse_distinct_patterns(tmp_path):
    """R1 negative: two different patterns must end up as two schema records."""
    from iai_mcp.schema import SchemaCandidate, persist_schema

    store = MemoryStore(path=tmp_path)

    def _induce(pattern_name):
        # Seed one evidence record, then persist a candidate that cites it.
        seeded = _seed_evidence(store, 1)
        candidate = SchemaCandidate(
            pattern=pattern_name,
            confidence=0.9,
            evidence_count=1,
            evidence_ids=[seeded[0].id],
            status="auto",
        )
        return persist_schema(store, candidate)

    sid_a = _induce("A")
    sid_b = _induce("B")
    assert sid_a != sid_b

    # Gather the semantic records tagged with either pattern.
    wanted_tags = ("pattern:A", "pattern:B")
    schemas = []
    for record in store.all_records():
        if record.tier != "semantic":
            continue
        if any(tag in wanted_tags for tag in (record.tags or [])):
            schemas.append(record)
    assert len(schemas) == 2

    # Strip the "pattern:" prefix from each matching tag and compare names.
    names = []
    for record in schemas:
        for tag in record.tags:
            if tag.startswith("pattern:"):
                names.append(tag.partition(":")[2])
    names.sort()
    assert names == ["A", "B"]
|
||||
Loading…
Add table
Add a link
Reference in a new issue