Initial release: iai-mcp v0.1.0

Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
2026-05-06 01:04:47 -07:00 · 2026-05-06 01:04:47 -07:00 · f6b876fbe7
commit f6b876fbe7
332 changed files with 97258 additions and 0 deletions
--- a/tests/test_bench.py
+++ b/tests/test_bench.py
@ -0,0 +1,133 @@
+"""Tests for the Phase-1 benchmark harnesses (D-15, OPS-01/02/04).
+
+All tests inject `count_tokens_fn` where applicable so no live Anthropic API
+calls happen in CI. The actual Anthropic integration is exercised only when
+`ANTHROPIC_API_KEY` is set and the CLIs are run directly by hand.
+"""
+from __future__ import annotations
+
+from bench.tokens import FRESH_LIMIT, STEADY_LIMIT, run_token_bench
+from bench.verbatim import ACCURACY_FLOOR, run_verbatim_bench
+from iai_mcp.store import MemoryStore
+
+
+# ---------------------------------------------------------- bench/tokens.py
+
+
+def test_tokens_steady_pass(tmp_path):
+    """A constant 2500-token counter satisfies both the steady and fresh checks.
+
+    Also verifies that every warm count equals the injected value, that the
+    reported mode is "injected", and that the result echoes the module limits.
+    """
+    injected_count = 2500
+    result = run_token_bench(
+        store=MemoryStore(path=tmp_path),
+        n_runs=3,
+        count_tokens_fn=lambda _text: injected_count,
+    )
+    assert result["steady_ok"] is True
+    assert result["fresh_ok"] is True
+    assert all(sample == injected_count for sample in result["warm"])
+    assert result["mode"] == "injected"
+    reported_limits = result["limits"]
+    assert reported_limits["steady"] == STEADY_LIMIT
+    assert reported_limits["fresh"] == FRESH_LIMIT
+
+
+def test_tokens_steady_fail(tmp_path):
+    """A constant 3500-token counter (> STEADY_LIMIT) leaves steady_ok False."""
+
+    def _always_3500(_text: str) -> int:
+        # Every prompt, fresh or warm, is reported as 3500 tokens.
+        return 3500
+
+    outcome = run_token_bench(
+        store=MemoryStore(path=tmp_path),
+        n_runs=3,
+        count_tokens_fn=_always_3500,
+    )
+    assert outcome["steady_ok"] is False
+
+
+def test_tokens_fresh_fail(tmp_path):
+    """An oversized fresh prompt (9000 > FRESH_LIMIT) yields fresh_ok=False.
+
+    The injected counter replays a scripted sequence: 9000 for the first
+    (fresh) call, then 2500 for each of the three warm calls, so only the
+    fresh check crosses its boundary.
+    """
+    scripted_counts = iter([9000] + [2500] * 3)
+
+    result = run_token_bench(
+        store=MemoryStore(path=tmp_path),
+        n_runs=3,
+        count_tokens_fn=lambda _text: next(scripted_counts),
+    )
+    assert result["fresh_ok"] is False   # 9000 > 8000
+    assert result["steady_ok"] is True   # warm still under 3000
+
+
+def test_tokens_tiktoken_fallback_mode(tmp_path, monkeypatch):
+    """With ANTHROPIC_API_KEY unset, the mode is tiktoken-cl100k-proxy.
+
+    Assumes tiktoken is installed in the environment running the tests.
+    """
+    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+    result = run_token_bench(store=MemoryStore(path=tmp_path), n_runs=3)
+    # An empty store has no L0/L1/L2/rich_club content, so the warm prompt is
+    # literally ".", which tiktoken counts as a single token. The fresh prompt
+    # only adds the 1k-chars tail, so it stays well under FRESH_LIMIT.
+    assert result["mode"] == "tiktoken-cl100k-proxy"
+    assert result["steady_ok"] is True
+    assert result["fresh_ok"] is True
+
+
+def test_tokens_char4_fallback_mode(tmp_path, monkeypatch):
+    """With no API key and tiktoken unimportable, the mode is heuristic-char4."""
+    import builtins
+
+    # Keep a handle on the genuine importer so all other imports still work.
+    original_import = builtins.__import__
+
+    def _import_without_tiktoken(name, *args, **kwargs):
+        if name != "tiktoken":
+            return original_import(name, *args, **kwargs)
+        raise ImportError("tiktoken not available in this scenario")
+
+    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+    monkeypatch.setattr(builtins, "__import__", _import_without_tiktoken)
+
+    result = run_token_bench(store=MemoryStore(path=tmp_path), n_runs=3)
+    assert result["mode"] == "heuristic-char4"
+    assert result["steady_ok"] is True
+
+
+def test_tokens_fresh_prompt_is_larger_than_warm(tmp_path):
+    """Sanity: with a single run, the fresh prompt is longer than the warm one."""
+    captured_prompts: list[str] = []
+
+    def _record(text: str) -> int:
+        # Remember each prompt handed to the counter; the count is irrelevant.
+        captured_prompts.append(text)
+        return 100
+
+    run_token_bench(
+        store=MemoryStore(path=tmp_path),
+        n_runs=1,
+        count_tokens_fn=_record,
+    )
+    # Exactly two counter calls are expected: the fresh prompt, then the warm.
+    assert len(captured_prompts) == 2
+    fresh_prompt, warm_prompt = captured_prompts
+    assert len(fresh_prompt) > len(warm_prompt)
+
+
+# -------------------------------------------------------- bench/verbatim.py
+
+
+def test_verbatim_passes_small_n(tmp_path):
+    """Small-N smoke test: all 10 records hit exactly and the run passes."""
+    record_count = 10
+    result = run_verbatim_bench(
+        store=MemoryStore(path=tmp_path),
+        n_records=record_count,
+        session_gap=2,
+        noise_per_session=2,
+    )
+    assert result["accuracy"] >= ACCURACY_FLOOR
+    assert result["passed"] is True
+    assert result["hits_exact"] == record_count
+
+
+def test_verbatim_returns_floor_constant(tmp_path):
+    """The result carries the pass/fail floor, equal to ACCURACY_FLOOR and 0.99."""
+    result = run_verbatim_bench(
+        store=MemoryStore(path=tmp_path),
+        n_records=5,
+        session_gap=1,
+        noise_per_session=1,
+    )
+    reported_floor = result["floor"]
+    assert reported_floor == ACCURACY_FLOOR
+    assert reported_floor == 0.99
+
+
+def test_verbatim_counts_exact_matches(tmp_path):
+    """hits_exact never exceeds n_records, and accuracy is their exact ratio."""
+    result = run_verbatim_bench(
+        store=MemoryStore(path=tmp_path),
+        n_records=5,
+        session_gap=1,
+        noise_per_session=1,
+    )
+    exact_hits, total_records = result["hits_exact"], result["n_records"]
+    assert exact_hits <= total_records
+    assert result["accuracy"] == exact_hits / total_records