# nomyo-router/backends/health.py
# (repository viewer header: 136 lines, 5.3 KiB, Python)

"""Backend health probes and error classification helpers.
Contains:
* cache-freshness check (``_is_fresh``)
* aiohttp response success assertion (``_ensure_success``)
* human-readable connection-issue formatter
* upstream-error detection that distinguishes connection failures from
legitimate 4xx responses (``_is_backend_connection_error``)
* per-(endpoint, model) unhealthy marker that feeds ``choose_endpoint``
* llama-server status interpretation (``_is_llama_model_loaded`` etc.)
"""
import asyncio
import time
from urllib.parse import urlparse
import aiohttp
import openai
from fastapi import HTTPException
from security import _mask_secrets
from state import _completion_error_cache, _completion_error_cache_lock
def _is_fresh(cached_at: float, ttl: int) -> bool:
return (time.time() - cached_at) < ttl
async def _ensure_success(resp: aiohttp.ClientResponse) -> None:
    """Raise ``HTTPException`` when the upstream response status is 400 or above.

    Responses with a status below 400 pass through untouched. Otherwise the
    response body is read, passed through ``_mask_secrets``, and used as the
    ``detail`` of an ``HTTPException`` that carries the upstream status code.
    """
    if resp.status < 400:
        return
    body = await resp.text()
    raise HTTPException(status_code=resp.status, detail=_mask_secrets(body))
def _format_connection_issue(url: str, error: Exception) -> str:
    """Build a human-friendly description of a failed request to ``url``.

    The message is meant for operators, so it names the endpoint and, for
    connection failures, the address that failed from inside the container.

    Args:
        url: Endpoint URL the request was sent to.
        error: Exception raised while contacting ``url``.

    Returns:
        A message string. ``aiohttp.ClientConnectorError`` yields a detailed
        connection report, ``asyncio.TimeoutError`` a timeout notice, and any
        other exception a generic "Error while contacting" message.
    """
    if isinstance(error, aiohttp.ClientConnectorError):
        # The parsed URL is only a fallback for the host/port reported by the
        # error itself. urlparse() and the .port accessor raise ValueError for
        # malformed URLs (e.g. a non-numeric port); formatting an error must
        # never raise a second one, so fall back to empty hints instead.
        try:
            parsed = urlparse(url)
            host_hint = parsed.hostname or ""
            port_hint = parsed.port or ""
        except ValueError:
            host_hint = port_hint = ""

        resolved_host = getattr(error, "host", host_hint) or host_hint or "?"
        resolved_port = getattr(error, "port", port_hint) or port_hint or "?"
        parts = [
            f"Failed to connect to {url} (resolved: {resolved_host}:{resolved_port}).",
            "Ensure the endpoint address is reachable from within the container.",
        ]

        if resolved_host in {"localhost", "127.0.0.1"}:
            parts.append(
                "Inside Docker, 'localhost' refers to the container itself; use "
                "'host.docker.internal' or a Docker network alias if the service "
                "runs on the host machine."
            )

        # Surface the underlying OS-level failure when the error carries one.
        os_error = getattr(error, "os_error", None)
        if isinstance(os_error, OSError):
            errno = getattr(os_error, "errno", None)
            strerror = os_error.strerror or str(os_error)
            if errno is not None or strerror:
                parts.append(f"OS error [{errno}]: {strerror}.")
        elif os_error:
            parts.append(f"OS error: {os_error}.")

        parts.append(f"Original error: {error}.")
        return " ".join(parts)

    if isinstance(error, asyncio.TimeoutError):
        return (
            f"Timed out waiting for {url}. "
            "The remote endpoint may be offline or slow to respond."
        )

    return f"Error while contacting {url}: {error}"
def _is_backend_connection_error(exc: Exception) -> bool:
    """Tell whether an OpenAI-client exception is a connection-class failure.

    The motivating case is a llama-server in router mode that still answers
    /v1/models while the worker it delegates a specific model to is dead:
    chat/completions then fails with a 5xx such as 'proxy error: Could not
    establish connection', or the SDK raises APIConnectionError directly.

    BadRequestError carrying exceed_context_size_error is deliberately not
    matched here; those errors must stay on the reactive-trim path.

    Returns:
        True for ``openai.APIConnectionError``, or for
        ``openai.InternalServerError`` whose lower-cased text contains one of
        the known connection-failure phrases; False otherwise.
    """
    if isinstance(exc, openai.APIConnectionError):
        return True
    if not isinstance(exc, openai.InternalServerError):
        return False

    text = str(exc).lower()
    markers = (
        "proxy error",
        "could not establish connection",
        "connection refused",
    )
    return any(marker in text for marker in markers)
async def _mark_backend_unhealthy(endpoint: str, model: str, reason: str = "") -> None:
    """Flag the (endpoint, model) pair as broken so choose_endpoint skips it.

    The entry is a timestamp stored in ``_completion_error_cache`` under
    ``_completion_error_cache_lock``. It is cleared only by TTL: the
    dead-worker failure mode is invisible to the /v1/models and /api/ps probes
    that clear _loaded_error_cache, so a successful probe cannot serve as a
    recovery signal.

    Args:
        endpoint: Backend endpoint the failure was observed on.
        model: Model name that failed on that endpoint.
        reason: Optional failure description; only its first 120 characters
            are logged.
    """
    cache_key = (endpoint, model)
    async with _completion_error_cache_lock:
        _completion_error_cache[cache_key] = time.time()

    log_line = f"[health] marked unhealthy ep={endpoint} model={model} reason={reason[:120]}"
    print(log_line, flush=True)
def _is_llama_model_loaded(item: dict) -> bool:
"""Return True if a llama-server /v1/models item has status 'loaded'.
Handles both dict format ({"value": "loaded"}) and plain string ("loaded").
If no status field is present, the model is always-loaded (not dynamically managed)."""
status = item.get("status")
if status is None:
return True # No status field: model is always loaded (e.g. single-model servers)
if isinstance(status, dict):
return status.get("value") == "loaded"
if isinstance(status, str):
return status == "loaded"
return False
def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
"""Return True if status is 'loaded' or 'sleeping'.
Newer llama-server versions report 'sleeping' in /v1/models when a model is idle;
ps_details needs to include these so _fetch_llama_props can detect and unload them."""
status = item.get("status")
if status is None:
return True
if isinstance(status, dict):
return status.get("value") in ("loaded", "sleeping")
if isinstance(status, str):
return status in ("loaded", "sleeping")
return False