fix: Lightweight health/introspection probes no longer compete with long-lived streaming completions for the proxy pool's per-host connection slots

2026-05-28 09:54:53 +02:00 · 2026-05-28 09:54:53 +02:00 · 820e217da6
commit 820e217da6
parent 13d796817f
5 changed files with 45 additions and 7 deletions
--- a/api/ollama.py
+++ b/api/ollama.py
@@ -44,7 +44,7 @@ from backends.normalize import (
    _extract_llama_quant,
 )
 from backends.probe import fetch
-from backends.sessions import _make_openai_client, get_session
+from backends.sessions import _make_openai_client, get_probe_session
 from requests.chat import _make_moe_requests
 from requests.messages import (
    transform_images_to_data_urls,
@@ -1055,7 +1055,7 @@ async def ps_details_proxy(request: Request):
        # Fetch /props for each llama-server model to get context length (n_ctx)
        # and unload sleeping models automatically
        async def _fetch_llama_props(endpoint: str, model_id: str) -> tuple[int | None, bool, bool]:
-            client: aiohttp.ClientSession = get_session(endpoint)
+            client: aiohttp.ClientSession = get_probe_session(endpoint)
            base_url = endpoint.rstrip("/").removesuffix("/v1")
            props_url = f"{base_url}/props?model={model_id}"
            headers = None