iai-mcp-opencode/tests/test_socket_concurrent_clients.py

"""Plan 07-02 Wave 2 R3 acceptance: per-connection multiplexing without HOL blocking.

10 concurrent clients × 5 sequential calls each must complete within 2× the
latency of a single client doing the same workload alone (SPEC R3 invariant).

The R3 acceptance asserts the dispatch-via-asyncio.to_thread pattern is in
place: if a future regression were to inline `await dispatch(...)` instead of
`await asyncio.to_thread(dispatch, ...)`, every connection would head-of-line
block behind the synchronous dispatch running on the event-loop thread, the
10-client wall-clock would slide toward 10× baseline, and this test would
fail loudly.

Reuses _send_jsonrpc + _with_socket_server + short_socket_paths fixture
from the sibling test_socket_server_dispatch module (same package).
"""
from __future__ import annotations

import asyncio
import time

# Re-export the fixture so pytest finds it for tests in this module without
# requiring a conftest.py change.
from .test_socket_server_dispatch import short_socket_paths  # noqa: F401


def test_10_concurrent_clients_no_hol_blocking(short_socket_paths):
    """R3 acceptance: ten concurrent clients finish within 2× one client's time.

    Every client issues five ``memory_recall`` calls back to back. The
    wall-clock of ten such clients started together through
    ``asyncio.gather`` is compared with one client running alone; the
    concurrent run may take at most twice the baseline plus 0.5s of slack.
    """
    _head, socket_path, _tail = short_socket_paths
    from iai_mcp.store import MemoryStore

    from .test_socket_server_dispatch import _send_jsonrpc, _with_socket_server

    memory_store = MemoryStore()

    async def _issue_calls(path, client_no, n_calls=5):
        # One client's workload: each call is awaited before the next is sent,
        # and the responses are returned in call order.
        return [
            await _send_jsonrpc(
                path,
                "memory_recall",
                {"cue": f"client-{client_no}-call-{seq}", "budget_tokens": 100},
                req_id=seq + 1,
            )
            for seq in range(n_calls)
        ]

    async def _elapsed(make_awaitable):
        # Takes a factory rather than an awaitable so the clock starts before
        # the work is even created, matching a plain "t0 = ...; await ..." pair.
        began = time.monotonic()
        await make_awaitable()
        return time.monotonic() - began

    async def _runner(sock_path, store):
        # Untimed warm-up (two calls from a throwaway client index) so that
        # one-off start-up cost stays out of both measurements.
        await _issue_calls(sock_path, -1, n_calls=2)

        # One client alone: 5 sequential calls.
        solo = await _elapsed(lambda: _issue_calls(sock_path, 0))

        # Ten clients at once: 10 × 5 = 50 calls in total.
        fanned_out = await _elapsed(
            lambda: asyncio.gather(
                *(_issue_calls(sock_path, client_no) for client_no in range(10))
            )
        )

        return solo, fanned_out

    baseline, concurrent_total = asyncio.run(
        _with_socket_server(socket_path, memory_store, _runner)
    )

    # SPEC R3 ceiling: twice the single-client wall-clock, plus a fixed 0.5s
    # allowance for scheduling jitter on what is a short (50-call) workload.
    ceiling = 2 * baseline + 0.5
    assert concurrent_total <= ceiling, (
        f"HOL blocking detected: 10 concurrent clients took "
        f"{concurrent_total:.3f}s vs {baseline:.3f}s baseline (>2× ratio + 0.5s slack). "
        f"Probable cause: dispatch is not running via asyncio.to_thread."
    )


def test_3_clients_serialize_per_connection_but_parallel_across(short_socket_paths):
    """R3 sanity: three simultaneous single-call clients cost about one call.

    After a warm-up, one ``memory_recall`` call is timed alone, then three
    such calls are launched together with ``asyncio.gather``. The three-way
    run must stay within 1.5× the single-call time plus 0.3s of slack, i.e.
    it must not degrade towards 3×. Only the cross-connection overlap is
    measured here; per-connection ordering is not checked by this test.
    """
    _head, socket_path, _tail = short_socket_paths
    from iai_mcp.store import MemoryStore

    from .test_socket_server_dispatch import _send_jsonrpc, _with_socket_server

    memory_store = MemoryStore()

    async def _recall_once(path, tag):
        # A single request; ``tag`` doubles as the cue suffix and the JSON-RPC id.
        response = await _send_jsonrpc(
            path,
            "memory_recall",
            {"cue": f"parallel-test-{tag}", "budget_tokens": 100},
            req_id=tag,
        )
        return response

    async def _elapsed(make_awaitable):
        # Takes a factory rather than an awaitable so the clock starts before
        # the work is even created, matching a plain "t0 = ...; await ..." pair.
        began = time.monotonic()
        await make_awaitable()
        return time.monotonic() - began

    async def _runner(sock_path, store):
        # Untimed warm-up call so one-off start-up cost is excluded.
        await _recall_once(sock_path, 0)

        # Reference measurement: exactly one call on its own.
        single = await _elapsed(lambda: _recall_once(sock_path, 1))

        # Three calls launched together.
        trio = await _elapsed(
            lambda: asyncio.gather(
                *(_recall_once(sock_path, tag) for tag in (2, 3, 4))
            )
        )

        return single, trio

    baseline, parallel_total = asyncio.run(
        _with_socket_server(socket_path, memory_store, _runner)
    )

    # Budget: 1.5× one call plus 0.3s slack. If the second and third calls
    # were queued behind the first, the total would approach 3× instead.
    ceiling = 1.5 * baseline + 0.3
    assert parallel_total <= ceiling, (
        f"3 parallel connections took {parallel_total:.3f}s vs "
        f"{baseline:.3f}s single-call baseline (>1.5× + 0.3s slack)."
    )