fix: Lightweight health/introspection probes no longer compete with long-lived streaming completions for the proxy pool's per-host connection slots
This commit is contained in:
parent
13d796817f
commit
820e217da6
5 changed files with 45 additions and 7 deletions
|
|
@ -44,7 +44,7 @@ from backends.normalize import (
|
|||
_extract_llama_quant,
|
||||
)
|
||||
from backends.probe import fetch
|
||||
from backends.sessions import _make_openai_client, get_session
|
||||
from backends.sessions import _make_openai_client, get_probe_session
|
||||
from requests.chat import _make_moe_requests
|
||||
from requests.messages import (
|
||||
transform_images_to_data_urls,
|
||||
|
|
@ -1055,7 +1055,7 @@ async def ps_details_proxy(request: Request):
|
|||
# Fetch /props for each llama-server model to get context length (n_ctx)
|
||||
# and unload sleeping models automatically
|
||||
async def _fetch_llama_props(endpoint: str, model_id: str) -> tuple[int | None, bool, bool]:
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
client: aiohttp.ClientSession = get_probe_session(endpoint)
|
||||
base_url = endpoint.rstrip("/").removesuffix("/v1")
|
||||
props_url = f"{base_url}/props?model={model_id}"
|
||||
headers = None
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ from state import (
|
|||
_bg_refresh_lock,
|
||||
default_headers,
|
||||
)
|
||||
from backends.sessions import get_session
|
||||
from backends.sessions import get_probe_session
|
||||
from backends.health import (
|
||||
_is_fresh,
|
||||
_ensure_success,
|
||||
|
|
@ -71,7 +71,7 @@ class fetch:
|
|||
endpoint_url = f"{ep_base}/api/tags"
|
||||
key = "models"
|
||||
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
client: aiohttp.ClientSession = get_probe_session(endpoint)
|
||||
try:
|
||||
async with client.get(endpoint_url, headers=headers) as resp:
|
||||
await _ensure_success(resp)
|
||||
|
|
@ -191,7 +191,7 @@ class fetch:
|
|||
For Ollama endpoints: queries /api/ps and returns model names
|
||||
For llama-server endpoints: queries /v1/models and filters for status.value == "loaded"
|
||||
"""
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
client: aiohttp.ClientSession = get_probe_session(endpoint)
|
||||
cfg = get_config()
|
||||
|
||||
# Check if this is a llama-server endpoint
|
||||
|
|
@ -360,7 +360,7 @@ class fetch:
|
|||
headers["Authorization"] = "Bearer " + api_key
|
||||
|
||||
request_url = f"{endpoint.rstrip('/')}/{route.lstrip('/')}"
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
client: aiohttp.ClientSession = get_probe_session(endpoint)
|
||||
req_kwargs = {}
|
||||
if timeout is not None:
|
||||
req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
|
||||
|
|
@ -401,7 +401,7 @@ async def _raw_probe(
|
|||
if timeout is not None:
|
||||
req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
|
||||
try:
|
||||
client: aiohttp.ClientSession = get_session(ep)
|
||||
client: aiohttp.ClientSession = get_probe_session(ep)
|
||||
async with client.get(url, headers=headers, **req_kwargs) as resp:
|
||||
await _ensure_success(resp)
|
||||
data = await resp.json()
|
||||
|
|
|
|||
|
|
@ -50,6 +50,26 @@ def get_session(endpoint: str) -> aiohttp.ClientSession:
|
|||
return app_state["session"]
|
||||
|
||||
|
||||
def get_probe_session(endpoint: str) -> aiohttp.ClientSession:
    """Select the aiohttp session that lightweight probes should use.

    Health and introspection probes (available/loaded models, endpoint
    health) are routed through a connection pool that is separate from the
    proxy/streaming session. Long-lived completion requests therefore cannot
    occupy the connections a probe needs; without that separation a probe
    could queue for a connection, run past its deadline, and cause a healthy
    endpoint to be reported as unavailable under load.

    Resolution order:

    1. For a Unix socket endpoint, the dedicated per-endpoint session
       registered in ``app_state["socket_sessions"]``, if one exists.
    2. The shared probe session in ``app_state["probe_session"]``.
    3. The main session in ``app_state["session"]``, used when the probe
       pool has not been initialised (e.g. in tests).
    """
    # Unix socket endpoints keep their own session; only fall through to the
    # shared pools when no such session has been registered.
    if _is_unix_socket_endpoint(endpoint):
        socket_session = app_state["socket_sessions"].get(endpoint)
        if socket_session is not None:
            return socket_session

    probe_session = app_state.get("probe_session")
    if probe_session:
        return probe_session

    # Probe pool missing or unset: reuse the main session.
    return app_state["session"]
|
||||
|
||||
|
||||
def _make_openai_client(
|
||||
endpoint: str,
|
||||
default_headers: dict | None = None,
|
||||
|
|
|
|||
16
router.py
16
router.py
|
|
@ -343,6 +343,20 @@ async def startup_event() -> None:
|
|||
app_state["connector"] = connector
|
||||
app_state["session"] = session
|
||||
|
||||
# Dedicated pool for health/introspection probes, isolated from the proxy
|
||||
# session above. Streaming completions can hold the proxy pool's per-host
|
||||
# slots open for a long time; without a separate pool the lightweight
|
||||
# probes queue behind them, hit their deadline, and mark healthy endpoints
|
||||
# as unavailable under load.
|
||||
probe_connector = aiohttp.TCPConnector(limit=0, limit_per_host=64, ssl=ssl_context)
|
||||
probe_session = aiohttp.ClientSession(
|
||||
connector=probe_connector,
|
||||
timeout=timeout,
|
||||
headers={"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")},
|
||||
)
|
||||
app_state["probe_connector"] = probe_connector
|
||||
app_state["probe_session"] = probe_session
|
||||
|
||||
# Create httpx clients for external OpenAI endpoints (Google, etc.)
|
||||
# aiohttp strips Referer headers for cross-origin requests, so we use httpx
|
||||
for ep in config.endpoints:
|
||||
|
|
@ -380,6 +394,8 @@ async def shutdown_event() -> None:
|
|||
|
||||
await flush_remaining_buffers()
|
||||
await app_state["session"].close()
|
||||
if app_state.get("probe_session") is not None:
|
||||
await app_state["probe_session"].close()
|
||||
|
||||
# Close Unix socket sessions
|
||||
for ep, sess in list(app_state.get("socket_sessions", {}).items()):
|
||||
|
|
|
|||
2
state.py
2
state.py
|
|
@ -65,6 +65,8 @@ token_queue: asyncio.Queue[tuple[str, str, int, int]] = asyncio.Queue()
|
|||
app_state = {
|
||||
"session": None,  # main aiohttp session (proxy/streaming traffic); assigned in startup_event
|
||||
"connector": None,  # connector stored alongside the main session at startup
|
||||
"probe_session": None, # dedicated session for health/introspection probes
|
||||
"probe_connector": None, # connection pool isolated from proxy traffic
|
||||
"socket_sessions": {}, # endpoint -> aiohttp.ClientSession(UnixConnector) for .sock endpoints
|
||||
"httpx_clients": {}, # endpoint -> httpx.AsyncClient(UDS transport) for .sock endpoints
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue