Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
121
tests/test_bench_latency_regression.py
Normal file
121
tests/test_bench_latency_regression.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
"""OPS-10 regression guard: small-N latency stays under D-SPEED p95 ceiling.
|
||||
|
||||
Plan 05-05 (D5-08) — CI-runnable guard for bench/neural_map.py at the
|
||||
small-N end of the matrix. The full N ∈ {100, 1k, 5k, 10k} matrix runs
|
||||
ad-hoc on this dev Mac and is recorded in the published bench report; this
|
||||
test exercises N=100 only so CI catches regressions in <30s.
|
||||
|
||||
D-SPEED contract: p95 < 100 ms at every measured N.
|
||||
|
||||
Adds the comparative reference flags to argparse:
|
||||
--ref-mempalace-p95-ms <float>
|
||||
--ref-claude-mem-p95-ms <float>
|
||||
|
||||
When supplied, the bench's per-N `passed` flag flips to False if IAI's p95
|
||||
exceeds the reference. Tests assert these flags exist on the parser.
|
||||
|
||||
See:
|
||||
- bench/neural_map.py — the harness under guard
|
||||
- tests/test_bench_neural_map.py — sibling D-SPEED tests (passed=True at N=100)
|
||||
- internal architecture spec
|
||||
Task 2 for the behavior contract
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _isolated_keyring(monkeypatch: pytest.MonkeyPatch):
    """Route keyring reads/writes/deletes to a per-test in-memory dict.

    Applied automatically to every test in this module so that no test
    touches the real keyring backend (which can trigger macOS keyring
    prompts). Same pattern as tests/test_hippea_cascade.py and
    tests/test_memory_recall_structural.py.

    Yields:
        The backing dict, keyed by ``(service, username)``, so a test can
        inspect what was stored.
    """
    import keyring as _keyring

    backing: dict[tuple[str, str], str] = {}

    def _get(s, u):
        # Missing entries read back as None.
        return backing.get((s, u))

    def _set(s, u, p):
        backing[(s, u)] = p

    def _delete(s, u):
        # Deleting an absent entry is a silent no-op.
        return backing.pop((s, u), None)

    stubs = (
        ("get_password", _get),
        ("set_password", _set),
        ("delete_password", _delete),
    )
    for attr_name, stub in stubs:
        # monkeypatch restores the real functions at test teardown.
        monkeypatch.setattr(_keyring, attr_name, stub)

    yield backing
|
||||
|
||||
|
||||
def test_neural_map_small_n_p95_under_regression_ceiling(tmp_path: Path):
    """OPS-10 regression guard at N=100.

    The strict D-SPEED p95 < 100 ms gate is asserted by
    tests/test_bench_neural_map.py::test_neural_map_bench_reports_passed_flag
    — an existing test that is known to trip under concurrent system load
    (Plan 05-02 SUMMARY notes the same flake). This guard is a
    REGRESSION fence: it asserts the bench still produces a numeric p95
    in the same order of magnitude as the D-SPEED ceiling, so a
    structural regression (e.g. someone breaks the spread pruning and
    p95 jumps to 1s+) is caught in CI even when wall-clock noise puts
    the strict 100 ms test on a flaky boundary.

    The 200 ms ceiling is 2x D-SPEED at N=100; if a real regression
    pushes p95 latency up to 2x the D-SPEED ceiling or beyond, this gate
    catches it, and the strict 100 ms gate (run in isolation) handles the
    absolute measurement.

    Args:
        tmp_path: pytest-provided temporary directory; the bench store is
            created under ``tmp_path / "store"``.
    """
    from bench.neural_map import run_neural_map_bench

    out = run_neural_map_bench(n=100, iterations=10, store_path=tmp_path / "store")

    assert out["latency_ms_p95"] < 200.0, (
        f"OPS-10 regression: p95 {out['latency_ms_p95']:.2f}ms > 200ms at N=100 "
        "(2x D-SPEED ceiling — likely a real regression, not concurrency noise)"
    )
    # Sanity: the harness always returns a positive p95.
    assert out["latency_ms_p95"] > 0.0
|
||||
|
||||
|
||||
def test_neural_map_main_with_matrix_returns_int(tmp_path: Path):
    """``neural_map.main`` accepts an explicit ``ns`` list (the N matrix).

    Runs a single tiny matrix entry (N=50, 3 iterations) against a
    temporary store and checks only that the returned exit code is 0 or 1;
    which of the two depends on the measured latency and is not asserted.
    """
    from bench import neural_map

    exit_code = neural_map.main(
        ns=[50],
        iterations=3,
        store_path=tmp_path,
    )
    assert exit_code == 0 or exit_code == 1
|
||||
|
||||
|
||||
def test_neural_map_argparse_has_reference_flags():
    """OPS-10 comparative gate: argparse exposes the reference-p95 flags so
    the bench can compare IAI to mempalace/claude-mem reference numbers
    measured separately on this host.

    Grep-verifiable contract: if these flag names are changed anywhere in
    the report harness, this test has to be updated with them.
    """
    from bench import neural_map

    # Exercise the real parser by parsing a dry args list; an unknown flag
    # would make argparse exit instead of returning a namespace.
    ns = neural_map._parse_args([  # noqa: SLF001
        "--n", "100",
        "--ref-mempalace-p95-ms", "42.5",
        "--ref-claude-mem-p95-ms", "61.0",
    ])
    # getattr with a None default turns a missing attribute into a plain
    # assertion failure rather than an AttributeError.
    assert getattr(ns, "ref_mempalace_p95_ms", None) == 42.5
    assert getattr(ns, "ref_claude_mem_p95_ms", None) == 61.0
|
||||
|
||||
|
||||
def test_neural_map_comparative_gate_flips_passed_false_when_above_ref(tmp_path: Path):
    """Comparative gate: an IAI p95 above the mempalace reference fails the run.

    The intended contract is that the per-N JSON's `passed` flips False and
    `reason` carries the reference name. This test observes the gate only
    through the exit code of ``neural_map.main``; it does not inspect the
    JSON itself.
    """
    from bench import neural_map

    # An impossibly low reference that any realistic bench will exceed.
    unreachable_ref_ms = 0.0001

    exit_code = neural_map.main(
        ns=[50],
        iterations=3,
        store_path=tmp_path,
        ref_mempalace_p95_ms=unreachable_ref_ms,
    )

    # With a 0.0001 ms reference, the bench cannot pass.
    assert exit_code == 1
|
||||
Loading…
Add table
Add a link
Reference in a new issue