fix: Lightweight health/introspection probes no longer compete with long-lived streaming completions for the proxy pool's per-host connection slots
This commit is contained in:
parent
13d796817f
commit
820e217da6
5 changed files with 45 additions and 7 deletions
|
|
@ -39,7 +39,7 @@ from state import (
|
|||
_bg_refresh_lock,
|
||||
default_headers,
|
||||
)
|
||||
from backends.sessions import get_session
|
||||
from backends.sessions import get_probe_session
|
||||
from backends.health import (
|
||||
_is_fresh,
|
||||
_ensure_success,
|
||||
|
|
@ -71,7 +71,7 @@ class fetch:
|
|||
endpoint_url = f"{ep_base}/api/tags"
|
||||
key = "models"
|
||||
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
client: aiohttp.ClientSession = get_probe_session(endpoint)
|
||||
try:
|
||||
async with client.get(endpoint_url, headers=headers) as resp:
|
||||
await _ensure_success(resp)
|
||||
|
|
@ -191,7 +191,7 @@ class fetch:
|
|||
For Ollama endpoints: queries /api/ps and returns model names
|
||||
For llama-server endpoints: queries /v1/models and filters for status.value == "loaded"
|
||||
"""
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
client: aiohttp.ClientSession = get_probe_session(endpoint)
|
||||
cfg = get_config()
|
||||
|
||||
# Check if this is a llama-server endpoint
|
||||
|
|
@ -360,7 +360,7 @@ class fetch:
|
|||
headers["Authorization"] = "Bearer " + api_key
|
||||
|
||||
request_url = f"{endpoint.rstrip('/')}/{route.lstrip('/')}"
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
client: aiohttp.ClientSession = get_probe_session(endpoint)
|
||||
req_kwargs = {}
|
||||
if timeout is not None:
|
||||
req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
|
||||
|
|
@ -401,7 +401,7 @@ async def _raw_probe(
|
|||
if timeout is not None:
|
||||
req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
|
||||
try:
|
||||
client: aiohttp.ClientSession = get_session(ep)
|
||||
client: aiohttp.ClientSession = get_probe_session(ep)
|
||||
async with client.get(url, headers=headers, **req_kwargs) as resp:
|
||||
await _ensure_success(resp)
|
||||
data = await resp.json()
|
||||
|
|
|
|||
|
|
@ -50,6 +50,26 @@ def get_session(endpoint: str) -> aiohttp.ClientSession:
|
|||
return app_state["session"]
|
||||
|
||||
|
||||
def get_probe_session(endpoint: str) -> aiohttp.ClientSession:
    """Select the client session for a lightweight health/introspection probe.

    Probe traffic (listing available or loaded models, checking endpoint
    health) is routed to a connection pool of its own rather than the one
    serving proxied/streaming requests. Without that separation, a burst of
    long-lived completions could occupy the pool, leaving a probe queued for
    a connection until its deadline expires and a healthy endpoint is
    reported as unavailable.

    Resolution order:

    1. For a Unix socket endpoint, the per-endpoint session registered in
       ``app_state["socket_sessions"]``, when one exists.
    2. The shared ``app_state["probe_session"]``, when it is set.
    3. The main ``app_state["session"]`` as a fallback, covering the case
       where the probe pool has not been initialised (e.g. in tests).
    """
    # Only consult the socket-session registry for Unix socket endpoints;
    # TCP endpoints never use a dedicated per-endpoint session here.
    dedicated = (
        app_state["socket_sessions"].get(endpoint)
        if _is_unix_socket_endpoint(endpoint)
        else None
    )
    if dedicated is not None:
        return dedicated

    # A missing (or otherwise falsy) probe session falls back to the main one.
    shared_probe = app_state.get("probe_session")
    if shared_probe:
        return shared_probe
    return app_state["session"]
|
||||
|
||||
|
||||
def _make_openai_client(
|
||||
endpoint: str,
|
||||
default_headers: dict | None = None,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue