# nomyo-router/backends/health.py

"""Backend health probes and error classification helpers.

Contains:
  * cache-freshness check (``_is_fresh``)
  * aiohttp response success assertion (``_ensure_success``)
  * human-readable connection-issue formatter
  * upstream-error detection that distinguishes connection failures from
    legitimate 4xx responses (``_is_backend_connection_error``)
  * per-(endpoint, model) unhealthy marker that feeds ``choose_endpoint``
  * llama-server status interpretation (``_is_llama_model_loaded`` etc.)
"""
import asyncio
import time
from urllib.parse import urlparse

import aiohttp
import openai
from fastapi import HTTPException

from security import _mask_secrets
from state import _completion_error_cache, _completion_error_cache_lock


def _is_fresh(cached_at: float, ttl: int) -> bool:
    """Return True while a cache entry is younger than ``ttl`` seconds.

    ``cached_at`` is compared against the current ``time.time()`` value; the
    entry counts as fresh only when its age is strictly less than ``ttl``.
    """
    age = time.time() - cached_at
    return age < ttl


async def _ensure_success(resp: aiohttp.ClientResponse) -> None:
    """Raise ``HTTPException`` when the upstream response status is 400 or above.

    The response body is read as text and passed through ``_mask_secrets``
    before it becomes the exception detail; the upstream status code is reused
    as the exception's status code. Responses with a status below 400 return
    without reading the body.
    """
    if resp.status < 400:
        return
    body = await resp.text()
    raise HTTPException(status_code=resp.status, detail=_mask_secrets(body))


def _format_connection_issue(url: str, error: Exception) -> str:
    """Build a human-friendly description of a failed request to ``url``.

    Intended for operators, so the message names the endpoint and, for
    connection failures, the address that was tried from inside the container.

    Args:
        url: Endpoint URL that was being contacted. It is echoed verbatim and
            may be malformed; a URL that ``urlparse`` rejects (bad port,
            unbalanced IPv6 bracket) only costs the host/port hints.
        error: Exception raised while contacting ``url``.

    Returns:
        A single-line message. ``asyncio.TimeoutError`` yields a timeout note,
        ``aiohttp.ClientConnectorError`` yields a detailed connection report
        (resolved host/port, Docker localhost hint, OS error), and any other
        exception yields a generic "Error while contacting ..." string.
    """
    # Timeouts need nothing from the URL, so handle them before any parsing.
    if isinstance(error, asyncio.TimeoutError):
        return (
            f"Timed out waiting for {url}. "
            "The remote endpoint may be offline or slow to respond."
        )

    if not isinstance(error, aiohttp.ClientConnectorError):
        return f"Error while contacting {url}: {error}"

    # This helper runs while a failure is being reported, so it must not raise
    # itself: urlparse() and the .port property raise ValueError on malformed
    # URLs. Fall back to empty hints instead of masking the real error.
    try:
        parsed = urlparse(url)
        host_hint = parsed.hostname or ""
        port_hint = parsed.port or ""
    except ValueError:
        host_hint = port_hint = ""

    # Prefer the address reported by the connector error; fall back to the
    # URL-derived hints, then to a "?" placeholder.
    resolved_host = getattr(error, "host", host_hint) or host_hint or "?"
    resolved_port = getattr(error, "port", port_hint) or port_hint or "?"
    parts = [
        f"Failed to connect to {url} (resolved: {resolved_host}:{resolved_port}).",
        "Ensure the endpoint address is reachable from within the container.",
    ]
    if resolved_host in {"localhost", "127.0.0.1"}:
        parts.append(
            "Inside Docker, 'localhost' refers to the container itself; use "
            "'host.docker.internal' or a Docker network alias if the service "
            "runs on the host machine."
        )
    os_error = getattr(error, "os_error", None)
    if isinstance(os_error, OSError):
        errno = getattr(os_error, "errno", None)
        strerror = os_error.strerror or str(os_error)
        if errno is not None or strerror:
            parts.append(f"OS error [{errno}]: {strerror}.")
    elif os_error:
        parts.append(f"OS error: {os_error}.")
    parts.append(f"Original error: {error}.")
    return " ".join(parts)


def _is_backend_connection_error(exc: Exception) -> bool:
    """Report whether ``exc`` is an upstream connection-class failure.

    Covers the situation where a llama-server in router mode still answers
    /v1/models while the worker it delegates to for one model is dead:
    chat/completions then fails with a 5xx carrying 'proxy error: Could not
    establish connection', or the SDK raises APIConnectionError directly.

    BadRequestError with exceed_context_size_error is deliberately not
    matched — it has to stay on the reactive-trim path.

    Returns:
        True for any ``openai.APIConnectionError``, or for an
        ``openai.InternalServerError`` whose lower-cased message contains one
        of the known connection-failure phrases; False otherwise.
    """
    if isinstance(exc, openai.APIConnectionError):
        return True
    if not isinstance(exc, openai.InternalServerError):
        return False

    # Case-insensitive substring match against the known failure phrases.
    text = str(exc).lower()
    markers = (
        "proxy error",
        "could not establish connection",
        "connection refused",
    )
    return any(marker in text for marker in markers)


async def _mark_backend_unhealthy(endpoint: str, model: str, reason: str = "") -> None:
    """Flag the (endpoint, model) pair as broken so choose_endpoint skips it.

    The current ``time.time()`` is stored in ``_completion_error_cache`` under
    the ``(endpoint, model)`` key while holding
    ``_completion_error_cache_lock``; a log line with the first 120 characters
    of ``reason`` is then printed.

    The entry is cleared only by TTL: the dead-worker failure mode cannot be
    seen by the /v1/models / /api/ps probes that clear _loaded_error_cache, so
    a successful probe is not a usable recovery signal.
    """
    key = (endpoint, model)
    async with _completion_error_cache_lock:
        _completion_error_cache[key] = time.time()

    short_reason = reason[:120]
    print(
        f"[health] marked unhealthy ep={endpoint} model={model} reason={short_reason}",
        flush=True,
    )


def _is_llama_model_loaded(item: dict) -> bool:
    """Tell whether a llama-server /v1/models item reports status 'loaded'.

    The status may be a dict ({"value": "loaded"}) or a plain string
    ("loaded"). An item without a status is treated as always loaded, i.e.
    not dynamically managed (e.g. single-model servers). Any other status
    type yields False.
    """
    state = item.get("status")
    if state is None:
        # No status field: nothing manages this model dynamically.
        return True
    if isinstance(state, dict):
        return state.get("value") == "loaded"
    return isinstance(state, str) and state == "loaded"


def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
    """Tell whether a /v1/models item reports status 'loaded' or 'sleeping'.

    Newer llama-server versions report 'sleeping' in /v1/models for an idle
    model; ps_details has to include those so _fetch_llama_props can detect
    and unload them. The status may be a dict ({"value": ...}) or a plain
    string; an item without a status counts as loaded, and any other status
    type yields False.
    """
    state = item.get("status")
    if state is None:
        return True

    if isinstance(state, dict):
        label = state.get("value")
    elif isinstance(state, str):
        label = state
    else:
        return False
    return label in ("loaded", "sleeping")