Initial release: iai-mcp v0.1.0

Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
2026-05-06 01:04:47 -07:00 · 2026-05-06 01:04:47 -07:00 · f6b876fbe7
commit f6b876fbe7
332 changed files with 97258 additions and 0 deletions
--- a/tests/test_pipeline_recall_perf_gate.py
+++ b/tests/test_pipeline_recall_perf_gate.py
@ -0,0 +1,143 @@
+"""Plan 06-02 Task 3 — perf gate for normalize + max_degree cache.
+
+The lock at N=1k warm p95 ≤ 83.6 ms is enforced via
+``bench/neural_map.py`` for reproducibility on the reference host. This
+pytest gate runs at N=200 with a CI-generous ceiling so it can catch
+egregious hot-path regressions without flapping on slower runners.
+
+Plan 06-02 added per-recall work:
+  - one ``getattr(graph, "_max_degree", 0)`` (dict lookup) before the loop
+  - one ``log(1.0 + max_deg)`` once per call
+  - one float division per candidate
+
+The combined cost is sub-millisecond at N=200; the gate ceiling at 200 ms
+absorbs CI jitter and gives the reference-host bench room to land the
+strict 83.6 ms read.
+"""
+from __future__ import annotations
+
+import time
+
+import pytest
+
+# Reuse the perf fixtures from the existing pipeline-perf suite. Importing
+# at the module top so failures surface immediately at collection time.
+from tests.test_pipeline_perf import _seed_store
+
+
+CI_GENEROUS_P95_S: float = 0.200  # 200 ms — see module docstring
+
+
+# --------------------------------------------------------- p95 ceiling
+
+
+def test_pipeline_recall_p95_under_ci_ceiling_after_normalize(tmp_path):
+    """Seed N=200, warm the cache, then time 20 recall calls.
+
+    p95 ≤ 200 ms (CI-generous). The reference host bench enforces the
+    strict 83.6 ms invariant separately.
+    """
+    from iai_mcp.pipeline import recall_for_response
+
+    store, embedder, graph, assignment, rich_club = _seed_store(
+        tmp_path, n=200, seed=0,
+    )
+
+    cues = [
+        "what did we cover about auth yesterday?",
+        "explain the db migration plan",
+        "how does the web cache invalidation work",
+        "summary of the cli subcommand changes",
+        "recent network stack bug report",
+    ]
+
+    # One throwaway warm call so the records_cache + community gate
+    # data structures are hot before timing.
+    recall_for_response(
+        store=store, graph=graph, assignment=assignment,
+        rich_club=rich_club, embedder=embedder,
+        cue=cues[0], session_id="warm", budget_tokens=1500,
+    )
+
+    latencies: list[float] = []
+    for i in range(20):
+        cue = cues[i % len(cues)]
+        t0 = time.perf_counter()
+        recall_for_response(
+            store=store, graph=graph, assignment=assignment,
+            rich_club=rich_club, embedder=embedder,
+            cue=cue, session_id="perf_gate", budget_tokens=1500,
+        )
+        latencies.append(time.perf_counter() - t0)
+
+    latencies.sort()
+    # p95 index for 20 samples = int(0.95 * 20) = 19 (the slowest).
+    p95 = latencies[int(0.95 * len(latencies))]
+    p95_ms = p95 * 1000.0
+    print(
+        f"\n[perf-gate] recall_for_response N=200 warm p95 = {p95_ms:.2f} ms "
+        f"(CI ceiling: {CI_GENEROUS_P95_S * 1000:.0f} ms; "
+        f"reference-host strict: 83.6 ms via bench/neural_map.py)"
+    )
+
+    assert p95 < CI_GENEROUS_P95_S, (
+        f"Plan 06-02 normalize regression: recall_for_response N=200 warm "
+        f"p95 = {p95_ms:.2f} ms exceeds CI ceiling "
+        f"{CI_GENEROUS_P95_S * 1000:.0f} ms. "
+        f"All latencies (ms): {[f'{x*1000:.1f}' for x in latencies]}"
+    )
+
+
+def test_normalize_overhead_is_submillisecond(tmp_path, capsys):
+    """Sanity: surface the normalize-stage timing as a printed trend so
+    CI logs show whether Plan 06-02's per-call additions stay sub-ms.
+
+    Implementation note: a clean A/B against the OLD formula is hard to
+    do without a feature flag (the change is unconditional in the rank
+    stage). Instead we measure absolute p95 at N=100 and assert it sits
+    well under the same 200 ms CI ceiling — a sub-100 ms read is the
+    informal sanity check that normalize-overhead did not regress.
+    """
+    from iai_mcp.pipeline import recall_for_response
+
+    store, embedder, graph, assignment, rich_club = _seed_store(
+        tmp_path, n=100, seed=1,
+    )
+
+    cues = [
+        "auth verbatim cue",
+        "db schema rebuild",
+        "web cache invalidation",
+    ]
+
+    # Warm cache.
+    recall_for_response(
+        store=store, graph=graph, assignment=assignment,
+        rich_club=rich_club, embedder=embedder,
+        cue=cues[0], session_id="warm", budget_tokens=1500,
+    )
+
+    latencies: list[float] = []
+    for i in range(10):
+        cue = cues[i % len(cues)]
+        t0 = time.perf_counter()
+        recall_for_response(
+            store=store, graph=graph, assignment=assignment,
+            rich_club=rich_club, embedder=embedder,
+            cue=cue, session_id="overhead_check", budget_tokens=1500,
+        )
+        latencies.append(time.perf_counter() - t0)
+
+    latencies.sort()
+    p95 = latencies[int(0.95 * len(latencies))]
+    p95_ms = p95 * 1000.0
+    # Surface to test log; CI log captures the trend even on pass.
+    print(
+        f"\n[perf-gate] recall_for_response N=100 warm p95 = {p95_ms:.2f} ms "
+        f"(normalize overhead: one division + one getattr per call)"
+    )
+
+    assert p95 < CI_GENEROUS_P95_S, (
+        f"normalize-overhead sanity: p95 = {p95_ms:.2f} ms > "
+        f"CI ceiling {CI_GENEROUS_P95_S * 1000:.0f} ms"
+    )