"""Backend health probes and error classification helpers.

Contains:

* cache-freshness check (``_is_fresh``)
* aiohttp response success assertion (``_ensure_success``)
* human-readable connection-issue formatter
* upstream-error detection that distinguishes connection failures from
  legitimate 4xx responses (``_is_backend_connection_error``)
* per-(endpoint, model) unhealthy marker that feeds ``choose_endpoint``
* llama-server status interpretation (``_is_llama_model_loaded`` etc.)
"""

import asyncio
import time
from urllib.parse import urlparse

import aiohttp
import openai
from fastapi import HTTPException

from security import _mask_secrets
from state import _completion_error_cache, _completion_error_cache_lock


def _is_fresh(cached_at: float, ttl: int) -> bool:
    """Return True while a cache entry is younger than ``ttl`` seconds.

    Args:
        cached_at: Timestamp on the ``time.time()`` clock at which the entry
            was stored.
        ttl: Maximum accepted age, in seconds.
    """
    return (time.time() - cached_at) < ttl


async def _ensure_success(resp: aiohttp.ClientResponse) -> None:
    """Raise ``HTTPException`` when ``resp`` carries a status of 400 or above.

    The response body is read and passed through ``_mask_secrets`` before it
    is used as the exception detail.

    Raises:
        HTTPException: With the upstream status code, when
            ``resp.status >= 400``.
    """
    if resp.status >= 400:
        text = await resp.text()
        raise HTTPException(status_code=resp.status, detail=_mask_secrets(text))


def _url_host_port_hints(url: str):
    """Return best-effort ``(host, port)`` hints parsed from ``url``.

    Each hint falls back to ``""`` when it is absent or cannot be parsed.
    This helper never raises ``ValueError``: it is used while formatting an
    error message, where a secondary exception would hide the real failure.
    """
    host_hint = ""
    port_hint = ""
    try:
        parsed = urlparse(url)
        host_hint = parsed.hostname or ""
        port_hint = parsed.port or ""
    except ValueError:
        # urlparse() rejects malformed bracketed hosts, and ``.port`` rejects
        # non-numeric or out-of-range ports. Keep whatever was parsed so far.
        pass
    return host_hint, port_hint


def _format_connection_issue(url: str, error: Exception) -> str:
    """Build a human-friendly description of a failed request to ``url``.

    The message is meant for operators, so they can tell which endpoint and
    address failed from inside the container.

    Args:
        url: The URL that was being contacted.
        error: The exception raised by the request.

    Returns:
        A message tailored to ``aiohttp.ClientConnectorError`` (resolved
        host/port, Docker ``localhost`` hint, OS error details),
        ``asyncio.TimeoutError``, or a generic fallback for anything else.
    """
    if isinstance(error, aiohttp.ClientConnectorError):
        # Hints are only needed here; parsing is guarded so a malformed URL
        # cannot raise while we are describing the original failure.
        host_hint, port_hint = _url_host_port_hints(url)
        resolved_host = getattr(error, "host", host_hint) or host_hint or "?"
        resolved_port = getattr(error, "port", port_hint) or port_hint or "?"

        parts = [
            f"Failed to connect to {url} (resolved: {resolved_host}:{resolved_port}).",
            "Ensure the endpoint address is reachable from within the container.",
        ]
        if resolved_host in {"localhost", "127.0.0.1"}:
            parts.append(
                "Inside Docker, 'localhost' refers to the container itself; use "
                "'host.docker.internal' or a Docker network alias if the service "
                "runs on the host machine."
            )

        os_error = getattr(error, "os_error", None)
        if isinstance(os_error, OSError):
            err_no = getattr(os_error, "errno", None)
            strerror = os_error.strerror or str(os_error)
            if err_no is not None or strerror:
                parts.append(f"OS error [{err_no}]: {strerror}.")
        elif os_error:
            # Not an OSError instance, but still something worth surfacing.
            parts.append(f"OS error: {os_error}.")

        parts.append(f"Original error: {error}.")
        return " ".join(parts)

    if isinstance(error, asyncio.TimeoutError):
        return (
            f"Timed out waiting for {url}. "
            "The remote endpoint may be offline or slow to respond."
        )

    return f"Error while contacting {url}: {error}"


def _is_backend_connection_error(exc: Exception) -> bool:
    """True for upstream connection-class failures observed via the OpenAI client.

    Targets the case where a llama-server in router mode keeps answering
    /v1/models but its delegated worker for a specific model is dead, so
    chat/completions calls return 5xx with 'proxy error: Could not establish
    connection' (or the SDK raises APIConnectionError outright).

    Excludes BadRequestError with exceed_context_size_error by design — those
    must stay on the reactive-trim path.
    """
    if isinstance(exc, openai.APIConnectionError):
        return True
    if isinstance(exc, openai.InternalServerError):
        # Only 5xx responses whose text points at a broken upstream connection
        # count; the comparison is case-insensitive.
        msg = str(exc).lower()
        markers = (
            "proxy error",
            "could not establish connection",
            "connection refused",
        )
        return any(marker in msg for marker in markers)
    return False


async def _mark_backend_unhealthy(endpoint: str, model: str, reason: str = "") -> None:
    """Record (endpoint, model) as broken so choose_endpoint avoids it.

    Cleared only by TTL — the dead-worker failure mode is invisible to the
    /v1/models / /api/ps probes that clear _loaded_error_cache, so we cannot
    rely on a successful probe as a recovery signal.

    Args:
        endpoint: Endpoint identifier used as the first half of the cache key.
        model: Model name used as the second half of the cache key.
        reason: Free-form explanation; only its first 120 characters are logged.
    """
    async with _completion_error_cache_lock:
        _completion_error_cache[(endpoint, model)] = time.time()
    print(f"[health] marked unhealthy ep={endpoint} model={model} reason={reason[:120]}", flush=True)


def _llama_status_in(item: dict, accepted: tuple) -> bool:
    """Return True if the ``status`` of a /v1/models item is one of ``accepted``.

    Handles both dict format ({"value": "loaded"}) and plain string
    ("loaded"). A missing status field counts as a match; any other status
    type does not.
    """
    status = item.get("status")
    if status is None:
        # No status field: model is always loaded (e.g. single-model servers)
        return True
    if isinstance(status, dict):
        return status.get("value") in accepted
    if isinstance(status, str):
        return status in accepted
    return False


def _is_llama_model_loaded(item: dict) -> bool:
    """Return True if a llama-server /v1/models item has status 'loaded'.

    Handles both dict format ({"value": "loaded"}) and plain string
    ("loaded"). If no status field is present, the model is always-loaded
    (not dynamically managed).
    """
    return _llama_status_in(item, ("loaded",))


def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
    """Return True if status is 'loaded' or 'sleeping'.

    Newer llama-server versions report 'sleeping' in /v1/models when a model
    is idle; ps_details needs to include these so _fetch_llama_props can
    detect and unload them.
    """
    return _llama_status_in(item, ("loaded", "sleeping"))