136 lines
5.3 KiB
Python
136 lines
5.3 KiB
Python
"""Backend health probes and error classification helpers.
|
|
|
|
Contains:
|
|
* cache-freshness check (``_is_fresh``)
|
|
* aiohttp response success assertion (``_ensure_success``)
|
|
* human-readable connection-issue formatter (``_format_connection_issue``)
|
|
* upstream-error detection that distinguishes connection failures from
|
|
legitimate 4xx responses (``_is_backend_connection_error``)
|
|
* per-(endpoint, model) unhealthy marker that feeds ``choose_endpoint`` (``_mark_backend_unhealthy``)
|
|
* llama-server status interpretation (``_is_llama_model_loaded`` etc.)
|
|
"""
|
|
import asyncio
|
|
import time
|
|
from urllib.parse import urlparse
|
|
|
|
import aiohttp
|
|
import openai
|
|
from fastapi import HTTPException
|
|
|
|
from security import _mask_secrets
|
|
from state import _completion_error_cache, _completion_error_cache_lock
|
|
|
|
|
|
def _is_fresh(cached_at: float, ttl: int) -> bool:
|
|
return (time.time() - cached_at) < ttl
|
|
|
|
|
|
async def _ensure_success(resp: aiohttp.ClientResponse) -> None:
    """Raise if an upstream aiohttp response carries an error status.

    Args:
        resp: Response whose ``status`` is inspected.

    Raises:
        HTTPException: When ``resp.status`` is 400 or above. The exception
            reuses the upstream status code, and its detail is the response
            body text after being passed through ``_mask_secrets``.
    """
    # Guard clause: anything below 400 counts as success and needs no body read.
    if resp.status < 400:
        return

    body = await resp.text()
    raise HTTPException(status_code=resp.status, detail=_mask_secrets(body))
|
|
|
|
|
|
def _format_connection_issue(url: str, error: Exception) -> str:
|
|
"""
|
|
Provide a human-friendly error string for connection failures so operators
|
|
know which endpoint and address failed from inside the container.
|
|
"""
|
|
parsed = urlparse(url)
|
|
host_hint = parsed.hostname or ""
|
|
port_hint = parsed.port or ""
|
|
|
|
if isinstance(error, aiohttp.ClientConnectorError):
|
|
resolved_host = getattr(error, "host", host_hint) or host_hint or "?"
|
|
resolved_port = getattr(error, "port", port_hint) or port_hint or "?"
|
|
parts = [
|
|
f"Failed to connect to {url} (resolved: {resolved_host}:{resolved_port}).",
|
|
"Ensure the endpoint address is reachable from within the container.",
|
|
]
|
|
if resolved_host in {"localhost", "127.0.0.1"}:
|
|
parts.append(
|
|
"Inside Docker, 'localhost' refers to the container itself; use "
|
|
"'host.docker.internal' or a Docker network alias if the service "
|
|
"runs on the host machine."
|
|
)
|
|
os_error = getattr(error, "os_error", None)
|
|
if isinstance(os_error, OSError):
|
|
errno = getattr(os_error, "errno", None)
|
|
strerror = os_error.strerror or str(os_error)
|
|
if errno is not None or strerror:
|
|
parts.append(f"OS error [{errno}]: {strerror}.")
|
|
elif os_error:
|
|
parts.append(f"OS error: {os_error}.")
|
|
parts.append(f"Original error: {error}.")
|
|
return " ".join(parts)
|
|
|
|
if isinstance(error, asyncio.TimeoutError):
|
|
return (
|
|
f"Timed out waiting for {url}. "
|
|
"The remote endpoint may be offline or slow to respond."
|
|
)
|
|
|
|
return f"Error while contacting {url}: {error}"
|
|
|
|
|
|
def _is_backend_connection_error(exc: Exception) -> bool:
    """Decide whether an OpenAI-client exception is a connection-class failure.

    The motivating case is a llama-server in router mode that still answers
    /v1/models while the worker it delegates a specific model to is dead:
    chat/completions then comes back as a 5xx carrying 'proxy error: Could
    not establish connection', or the SDK raises APIConnectionError directly.

    BadRequestError with exceed_context_size_error is deliberately not
    matched; it has to remain on the reactive-trim path.

    Args:
        exc: Exception raised by the OpenAI client.

    Returns:
        True for ``openai.APIConnectionError``, and for
        ``openai.InternalServerError`` whose text contains one of the known
        connection-failure phrases (case-insensitive); False otherwise.
    """
    if isinstance(exc, openai.APIConnectionError):
        return True
    if not isinstance(exc, openai.InternalServerError):
        return False

    lowered = str(exc).lower()
    markers = (
        "proxy error",
        "could not establish connection",
        "connection refused",
    )
    return any(marker in lowered for marker in markers)
|
|
|
|
|
|
async def _mark_backend_unhealthy(endpoint: str, model: str, reason: str = "") -> None:
    """Flag an (endpoint, model) pair as broken so choose_endpoint skips it.

    The entry is only ever cleared by TTL. The dead-worker failure mode is
    invisible to the /v1/models and /api/ps probes that clear
    _loaded_error_cache, so a successful probe cannot be treated as a
    recovery signal.

    Args:
        endpoint: Backend endpoint the failure was observed on.
        model: Model name that failed on that endpoint.
        reason: Optional free-text cause; only its first 120 characters are logged.
    """
    cache_key = (endpoint, model)
    async with _completion_error_cache_lock:
        # The stored value is the wall-clock time of the failure.
        _completion_error_cache[cache_key] = time.time()

    short_reason = reason[:120]
    print(f"[health] marked unhealthy ep={endpoint} model={model} reason={short_reason}", flush=True)
|
|
|
|
|
|
def _is_llama_model_loaded(item: dict) -> bool:
|
|
"""Return True if a llama-server /v1/models item has status 'loaded'.
|
|
Handles both dict format ({"value": "loaded"}) and plain string ("loaded").
|
|
If no status field is present, the model is always-loaded (not dynamically managed)."""
|
|
status = item.get("status")
|
|
if status is None:
|
|
return True # No status field: model is always loaded (e.g. single-model servers)
|
|
if isinstance(status, dict):
|
|
return status.get("value") == "loaded"
|
|
if isinstance(status, str):
|
|
return status == "loaded"
|
|
return False
|
|
|
|
|
|
def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
|
|
"""Return True if status is 'loaded' or 'sleeping'.
|
|
Newer llama-server versions report 'sleeping' in /v1/models when a model is idle;
|
|
ps_details needs to include these so _fetch_llama_props can detect and unload them."""
|
|
status = item.get("status")
|
|
if status is None:
|
|
return True
|
|
if isinstance(status, dict):
|
|
return status.get("value") in ("loaded", "sleeping")
|
|
if isinstance(status, str):
|
|
return status in ("loaded", "sleeping")
|
|
return False
|