132 lines
5 KiB
Python
132 lines
5 KiB
Python
|
|
"""Plan 07-02 Wave 2 R3 acceptance: per-connection multiplexing without HOL blocking.

10 concurrent clients × 5 sequential calls each must complete within 2× the
latency of a single client doing the same workload alone (SPEC R3 invariant).

The R3 acceptance asserts the dispatch-via-asyncio.to_thread pattern is in
place: if a future regression were to inline `await dispatch(...)` instead of
`await asyncio.to_thread(dispatch, ...)`, every connection would head-of-line
block on the GIL-held sync dispatch, the 10-client wall-clock would slide
toward 10× baseline, and this test would fail loudly.

Reuses _send_jsonrpc + _with_socket_server + short_socket_paths fixture
from the sibling test_socket_server_dispatch module (same package).
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import asyncio
|
|||
|
|
import time
|
|||
|
|
|
|||
|
|
# Import the fixture into this module's namespace so pytest can resolve it for
# the tests below without requiring a conftest.py change.
|
|||
|
|
from .test_socket_server_dispatch import short_socket_paths # noqa: F401
|
|||
|
|
|
|||
|
|
|
|||
|
|
def test_10_concurrent_clients_no_hol_blocking(short_socket_paths):
    """R3: 10 clients × 5 sequential calls each, total ≤ 2× single-client baseline.

    A single client's five-call workload is timed first, then ten such
    workloads are run concurrently via ``asyncio.gather`` and timed as a
    group. The concurrent wall-clock must not exceed twice the single-client
    wall-clock plus a fixed 0.5s allowance.

    Args:
        short_socket_paths: fixture yielding a 3-item value; only the middle
            item (the socket path) is used here.
    """
    _unused_head, sock_path, _unused_tail = short_socket_paths
    from iai_mcp.store import MemoryStore

    from .test_socket_server_dispatch import _send_jsonrpc, _with_socket_server

    store = MemoryStore()

    async def _client_workload(path, client_no, n_calls=5):
        # Each request is awaited before the next is issued, so one client's
        # calls are strictly sequential; responses are collected in order.
        return [
            await _send_jsonrpc(
                path,
                "memory_recall",
                {"cue": f"client-{client_no}-call-{seq}", "budget_tokens": 100},
                req_id=seq + 1,
            )
            for seq in range(n_calls)
        ]

    async def _runner(sock_path, store):
        # Untimed warm-up so one-off start-up cost (the embedder load, per the
        # test's intent) does not leak into either measurement.
        await _client_workload(sock_path, -1, n_calls=2)

        # Reference measurement: one client, five back-to-back calls.
        solo_start = time.monotonic()
        await _client_workload(sock_path, 0)
        solo_elapsed = time.monotonic() - solo_start

        # Fan-out measurement: ten clients at once, 50 calls overall.
        fanout_start = time.monotonic()
        workloads = (_client_workload(sock_path, n) for n in range(10))
        await asyncio.gather(*workloads)
        fanout_elapsed = time.monotonic() - fanout_start

        return solo_elapsed, fanout_elapsed

    server_run = _with_socket_server(sock_path, store, _runner)
    baseline, concurrent_total = asyncio.run(server_run)

    # SPEC R3 bound: twice the single-client time, plus 0.5s to absorb OS
    # scheduling jitter on a small (50-call) workload.
    # NOTE(review): when baseline is far below 0.5s the additive slack
    # dominates the bound (e.g. baseline 0.05s permits 0.6s, i.e. more than
    # 10× baseline) — confirm the workload is slow enough for this check to
    # actually catch fully serialized dispatch.
    allowed = 2 * baseline + 0.5
    assert concurrent_total <= allowed, (
        f"HOL blocking detected: 10 concurrent clients took "
        f"{concurrent_total:.3f}s vs {baseline:.3f}s baseline (>2× ratio + 0.5s slack). "
        f"Probable cause: dispatch is not running via asyncio.to_thread."
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def test_3_clients_serialize_per_connection_but_parallel_across(short_socket_paths):
    """R3 sanity: calls issued on different connections run in parallel.

    One call is timed on its own, then three calls are launched together
    with ``asyncio.gather`` and timed as a group. The group must finish
    within 1.5× the single-call time plus 0.3s, i.e. close to one call's
    wall-clock rather than three.

    NOTE(review): the test name also mentions per-connection serialization,
    but only the cross-connection case is exercised here; this presumes
    ``_send_jsonrpc`` opens a separate connection per call — confirm in the
    sibling module.

    Args:
        short_socket_paths: fixture yielding a 3-item value; only the middle
            item (the socket path) is used here.
    """
    _unused_head, sock_path, _unused_tail = short_socket_paths
    from iai_mcp.store import MemoryStore

    from .test_socket_server_dispatch import _send_jsonrpc, _with_socket_server

    store = MemoryStore()

    async def _single_call(path, call_no):
        # One memory_recall request; call_no doubles as the JSON-RPC id and
        # makes the cue unique per call.
        request_params = {"cue": f"parallel-test-{call_no}", "budget_tokens": 100}
        return await _send_jsonrpc(
            path,
            "memory_recall",
            request_params,
            req_id=call_no,
        )

    async def _runner(sock_path, store):
        # Untimed warm-up so one-off start-up cost (the embedder load, per the
        # test's intent) is paid before any measurement.
        await _single_call(sock_path, 0)

        # Reference measurement: a single call on its own.
        solo_start = time.monotonic()
        await _single_call(sock_path, 1)
        solo_elapsed = time.monotonic() - solo_start

        # Three calls launched together.
        trio_start = time.monotonic()
        trio = (_single_call(sock_path, call_no) for call_no in (2, 3, 4))
        await asyncio.gather(*trio)
        trio_elapsed = time.monotonic() - trio_start

        return solo_elapsed, trio_elapsed

    server_run = _with_socket_server(sock_path, store, _runner)
    baseline, parallel_total = asyncio.run(server_run)

    # Three parallel calls get 1.5× one call's wall-clock plus 0.3s slack; the
    # intent is that the second and third connections are not queued behind
    # the first connection's dispatch.
    # NOTE(review): for a fast call the 0.3s slack exceeds 3× baseline, so a
    # fully serialized server could still pass — confirm the threshold.
    allowed = 1.5 * baseline + 0.3
    assert parallel_total <= allowed, (
        f"3 parallel connections took {parallel_total:.3f}s vs "
        f"{baseline:.3f}s single-call baseline (>1.5× + 0.3s slack)."
    )
|