fix: Lightweight health/introspection probes no longer compete with long-lived streaming completions for the proxy pool's per-host connection slots

This commit is contained in:
Alpha Nerd 2026-05-28 09:54:53 +02:00
parent 13d796817f
commit 820e217da6
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
5 changed files with 45 additions and 7 deletions

View file

@@ -44,7 +44,7 @@ from backends.normalize import (
_extract_llama_quant,
)
from backends.probe import fetch
-from backends.sessions import _make_openai_client, get_session
+from backends.sessions import _make_openai_client, get_probe_session
from requests.chat import _make_moe_requests
from requests.messages import (
transform_images_to_data_urls,
@@ -1055,7 +1055,7 @@ async def ps_details_proxy(request: Request):
# Fetch /props for each llama-server model to get context length (n_ctx)
# and unload sleeping models automatically
async def _fetch_llama_props(endpoint: str, model_id: str) -> tuple[int | None, bool, bool]:
-client: aiohttp.ClientSession = get_session(endpoint)
+client: aiohttp.ClientSession = get_probe_session(endpoint)
base_url = endpoint.rstrip("/").removesuffix("/v1")
props_url = f"{base_url}/props?model={model_id}"
headers = None