iai-mcp-opencode/tests/test_pipeline_recall_perf_gate.py

"""Plan 06-02 Task 3 — perf gate for normalize + max_degree cache.

The lock at N=1k warm p95 ≤ 83.6 ms is enforced via
``bench/neural_map.py`` for reproducibility on the reference host. This
pytest gate runs at N=200 (plus a smaller N=100 sanity check) with a
CI-generous ceiling so it can catch egregious hot-path regressions
without flapping on slower runners.

Plan 06-02 added per-recall work:
  - one ``getattr(graph, "_max_degree", 0)`` (dict lookup) before the loop
  - one ``log(1.0 + max_deg)`` once per call
  - one float division per candidate

The combined cost is sub-millisecond at N=200; the gate ceiling at 200 ms
absorbs CI jitter and gives the reference-host bench room to land the
strict 83.6 ms read.
"""
from __future__ import annotations

import time

import pytest

# Reuse the perf fixtures from the existing pipeline-perf suite. Importing
# at the module top so failures surface immediately at collection time.
from tests.test_pipeline_perf import _seed_store


# Latency ceiling in seconds (not milliseconds): the tests below compare
# their measured p95, taken from ``time.perf_counter()`` deltas, against it.
CI_GENEROUS_P95_S: float = 0.200  # 200 ms — see module docstring


# --------------------------------------------------------- p95 ceiling


def test_pipeline_recall_p95_under_ci_ceiling_after_normalize(tmp_path):
    """Gate warm ``recall_for_response`` latency on a store seeded at N=200.

    One untimed warm-up recall is issued first, then 20 timed recalls
    rotate through five cues. The p95 of those timings must stay below
    ``CI_GENEROUS_P95_S`` (200 ms, CI-generous); the strict 83.6 ms
    invariant is enforced separately by the reference-host bench.
    """
    from iai_mcp.pipeline import recall_for_response

    store, embedder, graph, assignment, rich_club = _seed_store(
        tmp_path, n=200, seed=0,
    )

    def run_recall(cue_text, session):
        # Single call site shared by the warm-up and the timed runs so
        # both exercise recall with exactly the same fixed arguments.
        recall_for_response(
            store=store, graph=graph, assignment=assignment,
            rich_club=rich_club, embedder=embedder,
            cue=cue_text, session_id=session, budget_tokens=1500,
        )

    cues = [
        "what did we cover about auth yesterday?",
        "explain the db migration plan",
        "how does the web cache invalidation work",
        "summary of the cli subcommand changes",
        "recent network stack bug report",
    ]

    # Throwaway call: gets the records_cache + community gate data
    # structures hot so the timed loop measures the warm path only.
    run_recall(cues[0], "warm")

    # Round-robin over the cues for 20 timed calls.
    timed_cues = [cues[k % len(cues)] for k in range(20)]

    samples: list[float] = []
    for cue in timed_cues:
        started = time.perf_counter()
        run_recall(cue, "perf_gate")
        samples.append(time.perf_counter() - started)

    latencies = sorted(samples)
    # With 20 samples int(0.95 * 20) == 19, i.e. the slowest reading.
    slot = int(0.95 * len(latencies))
    p95 = latencies[slot]
    p95_ms = 1000.0 * p95

    summary = (
        f"\n[perf-gate] recall_for_response N=200 warm p95 = {p95_ms:.2f} ms "
        f"(CI ceiling: {CI_GENEROUS_P95_S * 1000:.0f} ms; "
        f"reference-host strict: 83.6 ms via bench/neural_map.py)"
    )
    print(summary)

    assert p95 < CI_GENEROUS_P95_S, (
        f"Plan 06-02 normalize regression: recall_for_response N=200 warm "
        f"p95 = {p95_ms:.2f} ms exceeds CI ceiling "
        f"{CI_GENEROUS_P95_S * 1000:.0f} ms. "
        f"All latencies (ms): {[f'{x*1000:.1f}' for x in latencies]}"
    )


def test_normalize_overhead_is_submillisecond(tmp_path, capsys):
    """Sanity: surface the normalize-stage timing as a printed trend so
    CI logs show whether Plan 06-02's per-call additions stay sub-ms.

    Seeds a store at N=100, issues one untimed warm-up recall, then times
    10 recalls that rotate through three cues and asserts the p95 stays
    under ``CI_GENEROUS_P95_S``.

    Implementation note: a clean A/B against the OLD formula is hard to
    do without a feature flag (the change is unconditional in the rank
    stage). Instead we measure absolute p95 at N=100 and assert it sits
    well under the same 200 ms CI ceiling — a sub-100 ms read is the
    informal sanity check that normalize-overhead did not regress.

    The trend line is printed inside ``capsys.disabled()`` so that it is
    written to the real stdout on passing runs as well, rather than being
    held back by pytest's output capture.
    """
    from iai_mcp.pipeline import recall_for_response

    store, embedder, graph, assignment, rich_club = _seed_store(
        tmp_path, n=100, seed=1,
    )

    cues = [
        "auth verbatim cue",
        "db schema rebuild",
        "web cache invalidation",
    ]

    # Warm cache: one untimed call before any measurement.
    recall_for_response(
        store=store, graph=graph, assignment=assignment,
        rich_club=rich_club, embedder=embedder,
        cue=cues[0], session_id="warm", budget_tokens=1500,
    )

    latencies: list[float] = []
    for i in range(10):
        cue = cues[i % len(cues)]
        t0 = time.perf_counter()
        recall_for_response(
            store=store, graph=graph, assignment=assignment,
            rich_club=rich_club, embedder=embedder,
            cue=cue, session_id="overhead_check", budget_tokens=1500,
        )
        latencies.append(time.perf_counter() - t0)

    latencies.sort()
    # With 10 samples int(0.95 * 10) == 9, i.e. the slowest reading.
    p95 = latencies[int(0.95 * len(latencies))]
    p95_ms = p95 * 1000.0

    # Surface to test log; bypass capture so the CI log shows the trend
    # even on pass (captured stdout is otherwise only reported on failure
    # unless pytest runs with -s / -rP).
    with capsys.disabled():
        print(
            f"\n[perf-gate] recall_for_response N=100 warm p95 = {p95_ms:.2f} ms "
            f"(normalize overhead: one division + one getattr per call)"
        )

    assert p95 < CI_GENEROUS_P95_S, (
        f"normalize-overhead sanity: p95 = {p95_ms:.2f} ms > "
        f"CI ceiling {CI_GENEROUS_P95_S * 1000:.0f} ms"
    )