Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
131 lines
5 KiB
Python
131 lines
5 KiB
Python
"""Plan 07-02 Wave 2 R3 acceptance: per-connection multiplexing without HOL blocking.

10 concurrent clients × 5 sequential calls each must complete within 2× the
latency of a single client doing the same workload alone, plus a small fixed
slack for scheduling jitter (SPEC R3 invariant).

The R3 acceptance asserts the dispatch-via-asyncio.to_thread pattern is in
place: if a future regression were to inline `await dispatch(...)` instead of
`await asyncio.to_thread(dispatch, ...)`, every connection would head-of-line
block on the GIL-held sync dispatch, the 10-client wall-clock would slide
toward 10× baseline, and this test would fail loudly.

Reuses _send_jsonrpc + _with_socket_server + short_socket_paths fixture
from the sibling test_socket_server_dispatch module (same package).
"""
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import time
|
||
|
||
# Re-export the fixture so pytest finds it for tests in this module without
|
||
# requiring a conftest.py change.
|
||
from .test_socket_server_dispatch import short_socket_paths # noqa: F401
|
||
|
||
|
||
def test_10_concurrent_clients_no_hol_blocking(short_socket_paths):
    """R3: ten concurrent clients must finish within 2x one client's time.

    Measures one client issuing five sequential ``memory_recall`` requests,
    then ten such clients running at once, and requires the concurrent
    wall-clock to stay within twice the single-client figure plus 0.5s slack.
    """
    # The fixture yields a 3-tuple; only the middle element (socket path) is used.
    _, sock_path, _ = short_socket_paths
    from iai_mcp.store import MemoryStore

    from .test_socket_server_dispatch import _send_jsonrpc, _with_socket_server

    store = MemoryStore()

    async def _sequential_calls(sock_path, client_idx, n_calls=5):
        """Send ``n_calls`` recall requests one after another; return the replies."""
        # Each request is awaited before the next is sent, so calls from a
        # single client never overlap.
        return [
            await _send_jsonrpc(
                sock_path,
                "memory_recall",
                {"cue": f"client-{client_idx}-call-{call_idx}", "budget_tokens": 100},
                req_id=call_idx + 1,
            )
            for call_idx in range(n_calls)
        ]

    async def _measure(sock_path, store):
        """Return ``(single_client_seconds, ten_client_seconds)``."""
        # Untimed warm-up so one-time initialisation cost is excluded from
        # both measurements.
        await _sequential_calls(sock_path, -1, n_calls=2)

        # Reference figure: one client, five sequential calls.
        solo_started = time.monotonic()
        await _sequential_calls(sock_path, 0)
        solo_elapsed = time.monotonic() - solo_started

        # Ten clients at once, five calls each (50 calls overall).
        fanout_started = time.monotonic()
        await asyncio.gather(
            *(_sequential_calls(sock_path, client_idx) for client_idx in range(10))
        )
        fanout_elapsed = time.monotonic() - fanout_started

        return solo_elapsed, fanout_elapsed

    baseline, concurrent_total = asyncio.run(
        _with_socket_server(sock_path, store, _measure)
    )

    # SPEC R3 bound: at most 2x the single-client wall-clock. The constant
    # 0.5s term absorbs scheduler jitter, which matters because the absolute
    # durations involved are expected to be small.
    allowed = 2 * baseline + 0.5
    assert concurrent_total <= allowed, (
        f"HOL blocking detected: 10 concurrent clients took "
        f"{concurrent_total:.3f}s vs {baseline:.3f}s baseline (>2× ratio + 0.5s slack). "
        f"Probable cause: dispatch is not running via asyncio.to_thread."
    )
|
||
|
||
|
||
def test_3_clients_serialize_per_connection_but_parallel_across(short_socket_paths):
    """R3 sanity: separate connections must be served in parallel.

    Times a single ``memory_recall`` call, then three calls fired at once via
    ``asyncio.gather``, and requires the parallel wall-clock to stay within
    1.5x the single-call time plus 0.3s slack (i.e. well short of 3x).
    """
    # The fixture yields a 3-tuple; only the middle element (socket path) is used.
    _, sock_path, _ = short_socket_paths
    from iai_mcp.store import MemoryStore

    from .test_socket_server_dispatch import _send_jsonrpc, _with_socket_server

    store = MemoryStore()

    async def _recall_once(sock_path, idx):
        """Issue one recall request tagged with ``idx`` and return the reply."""
        params = {"cue": f"parallel-test-{idx}", "budget_tokens": 100}
        return await _send_jsonrpc(sock_path, "memory_recall", params, req_id=idx)

    async def _measure(sock_path, store):
        """Return ``(single_call_seconds, three_parallel_calls_seconds)``."""
        # Untimed warm-up so one-time initialisation cost is not measured.
        await _recall_once(sock_path, 0)

        # Reference figure: exactly one call.
        solo_started = time.monotonic()
        await _recall_once(sock_path, 1)
        solo_elapsed = time.monotonic() - solo_started

        # Three calls launched together (request ids 2, 3 and 4).
        trio_started = time.monotonic()
        await asyncio.gather(*(_recall_once(sock_path, idx) for idx in (2, 3, 4)))
        trio_elapsed = time.monotonic() - trio_started

        return solo_elapsed, trio_elapsed

    baseline, parallel_total = asyncio.run(
        _with_socket_server(sock_path, store, _measure)
    )

    # Three parallel calls may cost at most 1.5x one call plus 0.3s slack;
    # a larger figure suggests the later calls waited behind the first one.
    allowed = 1.5 * baseline + 0.3
    assert parallel_total <= allowed, (
        f"3 parallel connections took {parallel_total:.3f}s vs "
        f"{baseline:.3f}s single-call baseline (>1.5× + 0.3s slack)."
    )
|