"""Tests for the Phase-1 benchmark harnesses (D-15, OPS-01/02/04).

Wherever the harness accepts it, the tests inject `count_tokens_fn`, so CI
never makes a live Anthropic API call. The real Anthropic integration is
exercised only when `ANTHROPIC_API_KEY` is set and the CLIs are run directly
by hand.
"""
from __future__ import annotations

from bench.tokens import FRESH_LIMIT, STEADY_LIMIT, run_token_bench
from bench.verbatim import ACCURACY_FLOOR, run_verbatim_bench
from iai_mcp.store import MemoryStore


# ---------------------------------------------------------- bench/tokens.py


def test_tokens_steady_pass(tmp_path):
    """A constant injected count of 2500 passes both the steady and fresh checks."""
    memory = MemoryStore(path=tmp_path)
    report = run_token_bench(
        store=memory,
        n_runs=3,
        count_tokens_fn=lambda _text: 2500,
    )

    assert report["steady_ok"] is True
    assert report["fresh_ok"] is True
    # Every warm sample must be exactly what the injected counter returned.
    assert all(sample == 2500 for sample in report["warm"])
    assert report["mode"] == "injected"

    # The harness reports the limits it judged against.
    limits = report["limits"]
    assert limits["steady"] == STEADY_LIMIT
    assert limits["fresh"] == FRESH_LIMIT


def test_tokens_steady_fail(tmp_path):
    """A constant count of 3500 (> STEADY_LIMIT) makes steady_ok False."""
    memory = MemoryStore(path=tmp_path)
    report = run_token_bench(
        store=memory,
        n_runs=3,
        count_tokens_fn=lambda _text: 3500,
    )

    assert report["steady_ok"] is False


def test_tokens_fresh_fail(tmp_path):
    """A fresh prompt counted at 9000 (> FRESH_LIMIT) yields fresh_ok=False.

    The counter is scripted through an iterator: the first call (fresh)
    returns 9000 and the following warm calls return 2500, so only the
    fresh boundary is crossed.
    """
    memory = MemoryStore(path=tmp_path)
    # One fresh measurement followed by three warm ones.
    scripted_counts = iter([9000] + [2500] * 3)

    report = run_token_bench(
        store=memory,
        n_runs=3,
        count_tokens_fn=lambda _text: next(scripted_counts),
    )

    assert report["fresh_ok"] is False  # fresh count exceeds FRESH_LIMIT
    assert report["steady_ok"] is True  # warm counts stay within the steady limit


def test_tokens_tiktoken_fallback_mode(tmp_path, monkeypatch):
    """Without ANTHROPIC_API_KEY but with tiktoken installed, mode is tiktoken-cl100k-proxy."""
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)

    report = run_token_bench(store=MemoryStore(path=tmp_path), n_runs=3)

    assert report["mode"] == "tiktoken-cl100k-proxy"
    # An empty store contributes no L0/L1/L2/rich_club content, so the warm
    # prompt is literally ".", which tiktoken counts as a single token.
    # The fresh prompt adds the 1k-chars tail and so stays well under
    # FRESH_LIMIT.
    assert report["steady_ok"] is True
    assert report["fresh_ok"] is True


def test_tokens_char4_fallback_mode(tmp_path, monkeypatch):
    """Without ANTHROPIC_API_KEY and without tiktoken, mode is heuristic-char4."""
    import builtins

    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)

    original_import = builtins.__import__

    def _import_without_tiktoken(name, *args, **kwargs):
        # Delegate every import to the real machinery except tiktoken,
        # which is made to look as if it is not installed.
        if name != "tiktoken":
            return original_import(name, *args, **kwargs)
        raise ImportError("tiktoken not available in this scenario")

    monkeypatch.setattr(builtins, "__import__", _import_without_tiktoken)

    report = run_token_bench(store=MemoryStore(path=tmp_path), n_runs=3)

    assert report["mode"] == "heuristic-char4"
    assert report["steady_ok"] is True


def test_tokens_fresh_prompt_is_larger_than_warm(tmp_path):
    """Sanity check: the fresh prompt is longer than the warm one (it has the 1k tail)."""
    memory = MemoryStore(path=tmp_path)
    prompts: list[str] = []

    def _record(prompt: str) -> int:
        # Remember each prompt handed to the counter, in call order.
        prompts.append(prompt)
        return 100

    run_token_bench(store=memory, n_runs=1, count_tokens_fn=_record)

    # Exactly two calls are expected: the fresh prompt, then the warm prompt.
    assert len(prompts) == 2
    fresh_prompt, warm_prompt = prompts
    assert len(fresh_prompt) > len(warm_prompt)


# -------------------------------------------------------- bench/verbatim.py


def test_verbatim_passes_small_n(tmp_path):
    """Small-N smoke test: pinned records are recalled at >= 0.99 accuracy."""
    outcome = run_verbatim_bench(
        store=MemoryStore(path=tmp_path),
        n_records=10,
        session_gap=2,
        noise_per_session=2,
    )

    assert outcome["accuracy"] >= ACCURACY_FLOOR
    assert outcome["passed"] is True
    assert outcome["hits_exact"] == 10


def test_verbatim_returns_floor_constant(tmp_path):
    """The result exposes the pass/fail threshold so verifiers can assert on it."""
    outcome = run_verbatim_bench(
        store=MemoryStore(path=tmp_path),
        n_records=5,
        session_gap=1,
        noise_per_session=1,
    )

    reported_floor = outcome["floor"]
    assert reported_floor == ACCURACY_FLOOR
    assert reported_floor == 0.99


def test_verbatim_counts_exact_matches(tmp_path):
    """hits_exact never exceeds n_records, and accuracy equals hits_exact / n_records."""
    outcome = run_verbatim_bench(
        store=MemoryStore(path=tmp_path),
        n_records=5,
        session_gap=1,
        noise_per_session=1,
    )

    assert outcome["hits_exact"] <= outcome["n_records"]
    assert outcome["accuracy"] == outcome["hits_exact"] / outcome["n_records"]