fix: Lightweight health/introspection probes no longer compete with long-lived streaming completions for the proxy pool's per-host connection slots
This commit is contained in:
parent
13d796817f
commit
820e217da6
5 changed files with 45 additions and 7 deletions
|
|
@@ -44,7 +44,7 @@ from backends.normalize import (
|
|||
_extract_llama_quant,
|
||||
)
|
||||
from backends.probe import fetch
|
||||
from backends.sessions import _make_openai_client, get_session
|
||||
from backends.sessions import _make_openai_client, get_probe_session
|
||||
from requests.chat import _make_moe_requests
|
||||
from requests.messages import (
|
||||
transform_images_to_data_urls,
|
||||
|
|
@@ -1055,7 +1055,7 @@ async def ps_details_proxy(request: Request):
|
|||
# Fetch /props for each llama-server model to get context length (n_ctx)
|
||||
# and unload sleeping models automatically
|
||||
async def _fetch_llama_props(endpoint: str, model_id: str) -> tuple[int | None, bool, bool]:
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
client: aiohttp.ClientSession = get_probe_session(endpoint)
|
||||
base_url = endpoint.rstrip("/").removesuffix("/v1")
|
||||
props_url = f"{base_url}/props?model={model_id}"
|
||||
headers = None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue