fix: Lightweight health/introspection probes no longer compete with long-lived streaming completions for the proxy pool's per-host connection slots

2026-05-28 09:54:53 +02:00 · 2026-05-28 09:54:53 +02:00 · 820e217da6
commit 820e217da6
parent 13d796817f
5 changed files with 45 additions and 7 deletions
--- a/backends/probe.py
+++ b/backends/probe.py
@@ -39,7 +39,7 @@ from state import (
    _bg_refresh_lock,
    default_headers,
 )
-from backends.sessions import get_session
+from backends.sessions import get_probe_session
 from backends.health import (
    _is_fresh,
    _ensure_success,
@@ -71,7 +71,7 @@ class fetch:
            endpoint_url = f"{ep_base}/api/tags"
            key = "models"

-        client: aiohttp.ClientSession = get_session(endpoint)
+        client: aiohttp.ClientSession = get_probe_session(endpoint)
        try:
            async with client.get(endpoint_url, headers=headers) as resp:
                await _ensure_success(resp)
@@ -191,7 +191,7 @@ class fetch:
        For Ollama endpoints: queries /api/ps and returns model names
        For llama-server endpoints: queries /v1/models and filters for status.value == "loaded"
        """
-        client: aiohttp.ClientSession = get_session(endpoint)
+        client: aiohttp.ClientSession = get_probe_session(endpoint)
        cfg = get_config()

        # Check if this is a llama-server endpoint
@@ -360,7 +360,7 @@ class fetch:
            headers["Authorization"] = "Bearer " + api_key

        request_url = f"{endpoint.rstrip('/')}/{route.lstrip('/')}"
-        client: aiohttp.ClientSession = get_session(endpoint)
+        client: aiohttp.ClientSession = get_probe_session(endpoint)
        req_kwargs = {}
        if timeout is not None:
            req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
@@ -401,7 +401,7 @@ async def _raw_probe(
    if timeout is not None:
        req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
    try:
-        client: aiohttp.ClientSession = get_session(ep)
+        client: aiohttp.ClientSession = get_probe_session(ep)
        async with client.get(url, headers=headers, **req_kwargs) as resp:
            await _ensure_success(resp)
            data = await resp.json()
--- a/backends/sessions.py
+++ b/backends/sessions.py
@@ -50,6 +50,26 @@ def get_session(endpoint: str) -> aiohttp.ClientSession:
    return app_state["session"]


+def get_probe_session(endpoint: str) -> aiohttp.ClientSession:
+    """Return the session used for lightweight health/introspection probes.
+
+    Probes (available/loaded models, endpoint health) run on a connection
+    pool kept separate from the proxy/streaming session, so a burst of
+    long-lived completion requests cannot starve them — otherwise a probe
+    would queue waiting for a connection, hit its deadline, and mark a
+    perfectly healthy endpoint as unavailable under load.
+
+    Unix socket endpoints keep their dedicated per-endpoint session. TCP
+    endpoints use the shared probe session, falling back to the main
+    session when the probe pool has not been initialised (e.g. in tests).
+    """
+    if _is_unix_socket_endpoint(endpoint):
+        sess = app_state["socket_sessions"].get(endpoint)
+        if sess is not None:
+            return sess
+    return app_state.get("probe_session") or app_state["session"]
+
+
 def _make_openai_client(
    endpoint: str,
    default_headers: dict | None = None,