refac: modularize backend IV

2026-05-19 12:05:51 +02:00 · 2026-05-19 12:05:51 +02:00 · 3a9854c5db
commit 3a9854c5db
parent c88ba1e5a4
8 changed files with 822 additions and 666 deletions
--- a/backends/init.py
+++ b/backends/init.py
--- a/backends/health.py
+++ b/backends/health.py
@ -0,0 +1,136 @@
 """Backend health probes and error classification helpers.
 Contains:
  * cache-freshness check (``_is_fresh``)
  * aiohttp response success assertion (``_ensure_success``)
  * human-readable connection-issue formatter
  * upstream-error detection that distinguishes connection failures from
    legitimate 4xx responses (``_is_backend_connection_error``)
  * per-(endpoint, model) unhealthy marker that feeds ``choose_endpoint``
  * llama-server status interpretation (``_is_llama_model_loaded`` etc.)
 """
 import asyncio
 import time
 from urllib.parse import urlparse
 import aiohttp
 import openai
 from fastapi import HTTPException
 from security import _mask_secrets
 from state import _completion_error_cache, _completion_error_cache_lock
 def _is_fresh(cached_at: float, ttl: int) -> bool:
    """Return True while a timestamp is younger than ``ttl`` seconds.

    ``cached_at`` is compared against ``time.time()``; an age of exactly
    ``ttl`` seconds already counts as not fresh.
    """
    age_seconds = time.time() - cached_at
    return age_seconds < ttl
 async def _ensure_success(resp: aiohttp.ClientResponse) -> None:
    """Raise ``HTTPException`` when the upstream answered with status >= 400.

    The response body is read and run through ``_mask_secrets`` before it is
    used as the exception detail. Responses below 400 return ``None`` without
    the body being read here.
    """
    if resp.status < 400:
        return
    body = await resp.text()
    raise HTTPException(status_code=resp.status, detail=_mask_secrets(body))
 def _format_connection_issue(url: str, error: Exception) -> str:
    """
    Build an operator-facing description of a failed request to ``url``.

    Three shapes are produced, checked in this order:
      * ``aiohttp.ClientConnectorError`` - names the resolved host/port, adds
        a Docker hint when the target is localhost, and appends the OS-level
        error when the exception carries one.
      * ``asyncio.TimeoutError`` - a short "timed out" message.
      * anything else - a generic message embedding ``str(error)``.
    """
    target = urlparse(url)
    fallback_host = target.hostname or ""
    fallback_port = target.port or ""

    if not isinstance(error, aiohttp.ClientConnectorError):
        if isinstance(error, asyncio.TimeoutError):
            return (
                f"Timed out waiting for {url}. "
                "The remote endpoint may be offline or slow to respond."
            )
        return f"Error while contacting {url}: {error}"

    # Prefer what the connector error reports; fall back to the URL, then "?".
    host = getattr(error, "host", fallback_host) or fallback_host or "?"
    port = getattr(error, "port", fallback_port) or fallback_port or "?"
    sentences = [
        f"Failed to connect to {url} (resolved: {host}:{port}).",
        "Ensure the endpoint address is reachable from within the container.",
    ]
    if host in {"localhost", "127.0.0.1"}:
        sentences.append(
            "Inside Docker, 'localhost' refers to the container itself; use "
            "'host.docker.internal' or a Docker network alias if the service "
            "runs on the host machine."
        )

    underlying = getattr(error, "os_error", None)
    if isinstance(underlying, OSError):
        code = getattr(underlying, "errno", None)
        text = underlying.strerror or str(underlying)
        if code is not None or text:
            sentences.append(f"OS error [{code}]: {text}.")
    elif underlying:
        # Not an OSError instance but still truthy: report its string form.
        sentences.append(f"OS error: {underlying}.")

    sentences.append(f"Original error: {error}.")
    return " ".join(sentences)
 def _is_backend_connection_error(exc: Exception) -> bool:
    """Tell whether an OpenAI-client exception is a connection-class failure.

    Returns True for ``openai.APIConnectionError``, and for
    ``openai.InternalServerError`` whose message (case-insensitively) contains
    'proxy error', 'could not establish connection' or 'connection refused'.
    The motivating case is a llama-server in router mode that keeps answering
    /v1/models while the delegated worker for one model is dead.

    Every other exception yields False. In particular BadRequestError with
    exceed_context_size_error is excluded by design, so it stays on the
    reactive-trim path.
    """
    if isinstance(exc, openai.APIConnectionError):
        return True
    if not isinstance(exc, openai.InternalServerError):
        return False
    lowered = str(exc).lower()
    markers = (
        "proxy error",
        "could not establish connection",
        "connection refused",
    )
    return any(marker in lowered for marker in markers)
 async def _mark_backend_unhealthy(endpoint: str, model: str, reason: str = "") -> None:
    """Stamp (endpoint, model) as broken so choose_endpoint can avoid it.

    The current time is stored in ``_completion_error_cache`` under the
    ``(endpoint, model)`` key while holding its lock, then a log line with the
    first 120 characters of ``reason`` is printed.

    Nothing here removes the entry. By design it is cleared only by TTL: the
    dead-worker failure mode is invisible to the /v1/models and /api/ps probes
    that clear _loaded_error_cache, so a successful probe is not a usable
    recovery signal.
    """
    cache_key = (endpoint, model)
    async with _completion_error_cache_lock:
        _completion_error_cache[cache_key] = time.time()
    short_reason = reason[:120]
    print(f"[health] marked unhealthy ep={endpoint} model={model} reason={short_reason}", flush=True)
 def _is_llama_model_loaded(item: dict) -> bool:
    """Return True if a llama-server /v1/models item reports status 'loaded'.

    The status may be a dict ({"value": "loaded"}) or a plain string
    ("loaded"). An item without a status field is treated as always loaded
    (e.g. single-model servers that are not dynamically managed). Any other
    status type yields False.
    """
    state = item.get("status")
    if state is None:
        return True
    if isinstance(state, dict):
        # Dict form: the actual state lives under "value"
        state = state.get("value")
    elif not isinstance(state, str):
        return False
    return state == "loaded"
 def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
    """Return True if the item's status is 'loaded' or 'sleeping'.

    Newer llama-server versions report 'sleeping' in /v1/models when a model
    is idle; ps_details needs these included so _fetch_llama_props can detect
    and unload them. The status may be a dict ({"value": ...}) or a plain
    string; a missing status counts as loaded, any other type as not loaded.
    """
    state = item.get("status")
    if state is None:
        return True
    if isinstance(state, dict):
        state = state.get("value")
    elif not isinstance(state, str):
        return False
    return state in ("loaded", "sleeping")
--- a/backends/normalize.py
+++ b/backends/normalize.py
@ -0,0 +1,113 @@
 """Endpoint URL, model-name, and endpoint-classification helpers.
 The endpoint classifiers read live config via ``get_config()`` so that the
 startup-time rebind of ``config`` in router.py is picked up at call time.
 """
 from config import get_config
 def _normalize_llama_model_name(name: str) -> str:
    """Reduce a huggingface-style identifier to its bare model name.

    Everything up to and including the last "/" is dropped, then everything
    from the first ":" onwards,
    e.g. 'unsloth/gpt-oss-20b-GGUF:F16' -> 'gpt-oss-20b-GGUF'.
    """
    without_org = name.rpartition("/")[2]
    return without_org.partition(":")[0]
 def _extract_llama_quant(name: str) -> str:
    """Return the quantization suffix of a huggingface-style identifier.

    The suffix is whatever follows the last ":",
    e.g. 'unsloth/gpt-oss-20b-GGUF:Q8_0' -> 'Q8_0'.
    An identifier without ":" yields the empty string.
    """
    _, colon, quant = name.rpartition(":")
    return quant if colon else ""
 def ep2base(ep: str) -> str:
    """Return the OpenAI-style base URL (ending in ``/v1``) for an endpoint.

    Endpoints that already contain ``/v1`` are returned unchanged. Otherwise
    ``/v1`` is appended after dropping any trailing slashes, so that
    ``http://host:11434/`` becomes ``http://host:11434/v1`` instead of
    ``http://host:11434//v1``.
    """
    if "/v1" in ep:
        return ep
    return ep.rstrip("/") + "/v1"
 def dedupe_on_keys(dicts, key_fields):
    """
    Drop dicts whose values for ``key_fields`` repeat an earlier entry.

    The first dict seen for each combination of values is kept and the
    survivors stay in input order. A key missing from a dict counts as None.
    """
    first_by_key = {}
    for entry in dicts:
        identity = tuple(entry.get(field) for field in key_fields)
        # setdefault stores the entry only when this identity is new
        first_by_key.setdefault(identity, entry)
    return list(first_by_key.values())
 def is_ext_openai_endpoint(endpoint: str) -> bool:
    """
    Decide whether an endpoint is an external OpenAI-compatible service
    (e.g. OpenAI.com, Groq) rather than a local Ollama or llama-server.

    False is returned when any of these holds:
    - the endpoint is listed in ``llama_server_endpoints``
    - the endpoint does not contain "/v1" (native Ollama)
    - the endpoint with "/v1" removed is a configured Ollama endpoint
    - the endpoint contains the default Ollama port ":11434"
    Anything else is considered external and yields True.
    """
    cfg = get_config()
    if endpoint in cfg.llama_server_endpoints or "/v1" not in endpoint:
        return False
    # Ollama also exposes /v1; recognise it by config entry or default port
    ollama_base = endpoint.replace("/v1", "")
    looks_like_ollama = ollama_base in cfg.endpoints or ":11434" in endpoint
    return not looks_like_ollama
 def is_openai_compatible(endpoint: str) -> bool:
    """
    Tell whether the endpoint speaks the OpenAI API instead of native Ollama.

    True for any endpoint containing "/v1" and for endpoints listed in
    ``llama_server_endpoints``; the config is only consulted when "/v1" is
    absent.
    """
    if "/v1" in endpoint:
        return True
    return endpoint in get_config().llama_server_endpoints
 def get_tracking_model(endpoint: str, model: str) -> str:
    """
    Map a model name to the key used for usage tracking / the PS table.

    - external OpenAI endpoints: name is returned untouched (not shown in PS)
    - llama-server endpoints: HF prefix and quantization suffix are stripped
    - Ollama endpoints: ":latest" is appended when the name has no ":" tag
    """
    if is_ext_openai_endpoint(endpoint):
        tracked = model
    elif endpoint in get_config().llama_server_endpoints:
        tracked = _normalize_llama_model_name(model)
    elif ":" in model:
        # Ollama name that already carries a tag
        tracked = model
    else:
        tracked = model + ":latest"
    return tracked
--- a/backends/probe.py
+++ b/backends/probe.py
@ -0,0 +1,449 @@
 """Backend probe / discovery primitives.
 The ``fetch`` class wraps the three discovery paths the router uses:
  * ``available_models`` — what the endpoint advertises (Ollama ``/api/tags``
    or OpenAI-style ``/v1/models``)
  * ``loaded_models``    — what is currently resident (Ollama ``/api/ps`` or
    llama-server ``/v1/models`` filtered on ``status == "loaded"``)
  * ``endpoint_details`` — arbitrary detail fetch used by management routes
 The two model-listing paths go through three layers of cache: success cache,
 error cache, and an in-flight request map; ``endpoint_details`` only consults
 the error cache. Stale-while-revalidate refreshes happen in background tasks
 tracked by the ``_bg_refresh_*`` maps in ``state``.
 ``_raw_probe`` and ``_endpoint_health`` are the lower-level dual probes
 used by ``/health`` and ``/api/config`` to distinguish a healthy daemon
 with a broken model-introspection path from a dead daemon.
 """
 import asyncio
 import time
 from typing import List, Optional, Set
 import aiohttp
 from config import get_config
 from state import (
    _models_cache,
    _models_cache_lock,
    _loaded_models_cache,
    _loaded_models_cache_lock,
    _available_error_cache,
    _available_error_cache_lock,
    _loaded_error_cache,
    _loaded_error_cache_lock,
    _inflight_available_models,
    _inflight_loaded_models,
    _inflight_lock,
    _bg_refresh_available,
    _bg_refresh_loaded,
    _bg_refresh_lock,
    default_headers,
 )
 from backends.sessions import get_session
 from backends.health import (
    _is_fresh,
    _ensure_success,
    _format_connection_issue,
    _is_llama_model_loaded,
 )
 from backends.normalize import is_ext_openai_endpoint, is_openai_compatible
 class fetch:
    async def _fetch_available_models_internal(endpoint: str, api_key: Optional[str] = None) -> Set[str]:
        """
        Perform the HTTP request that lists the models an endpoint advertises.
        Called by available_models() once caches and in-flight requests have
        been checked.

        URL selection:
          * endpoint contains "/v1"                      -> <endpoint>/models
          * llama-server endpoint configured without /v1 -> <endpoint>/v1/models
          * anything else (native Ollama)                -> <endpoint>/api/tags
        On success the result is stored in ``_models_cache``. On any exception
        the failure time is stored in ``_available_error_cache`` and an empty
        set is returned.
        """
        cfg = get_config()
        request_headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
        if api_key is not None:
            request_headers["Authorization"] = "Bearer " + api_key

        is_llama_server = endpoint in cfg.llama_server_endpoints
        stripped = endpoint.rstrip("/")
        if "/v1" in endpoint:
            endpoint_url, list_key = f"{stripped}/models", "data"
        elif is_llama_server:
            endpoint_url, list_key = f"{stripped}/v1/models", "data"
        else:
            endpoint_url, list_key = f"{stripped}/api/tags", "models"

        session = get_session(endpoint)
        try:
            async with session.get(endpoint_url, headers=request_headers) as resp:
                await _ensure_success(resp)
                payload = await resp.json()
            # OpenAI-style items carry "id", Ollama items carry "name"
            advertised = set()
            for item in payload.get(list_key, []):
                label = item.get("id") or item.get("name")
                if label:
                    advertised.add(label)
            async with _models_cache_lock:
                _models_cache[endpoint] = (advertised, time.time())
            return advertised
        except Exception as err:
            # Any failure is reported as "endpoint offers no models"
            message = _format_connection_issue(endpoint_url, err)
            print(f"[fetch.available_models] {message}")
            async with _available_error_cache_lock:
                _available_error_cache[endpoint] = time.time()
            return set()
    async def _refresh_available_models(endpoint: str, api_key: Optional[str] = None) -> None:
        """
        Refresh the available-models cache for ``endpoint`` in the background.

        Used by the stale-while-revalidate paths of ``available_models``.
        Deduplicates: if ``_bg_refresh_available`` already holds an unfinished
        task for this endpoint, the call returns without doing anything.
        Otherwise a task running ``_fetch_available_models_internal`` is
        registered, awaited, and unregistered again.
        """
        # Check-and-register happens under one lock so two callers cannot both
        # start a refresh for the same endpoint.
        async with _bg_refresh_lock:
            if endpoint in _bg_refresh_available and not _bg_refresh_available[endpoint].done():
                return  # A refresh is already running for this endpoint
            task = asyncio.create_task(fetch._fetch_available_models_internal(endpoint, api_key))
            _bg_refresh_available[endpoint] = task
        try:
            await task
        except Exception as e:
            # Log and swallow: a failed refresh must not propagate out of a
            # fire-and-forget task. The cached data simply stays stale.
            print(f"[fetch._refresh_available_models] Background refresh failed for {endpoint}: {e}")
        finally:
            # Unregister only our own task; a newer refresh may have replaced it.
            async with _bg_refresh_lock:
                if _bg_refresh_available.get(endpoint) is task:
                    _bg_refresh_available.pop(endpoint, None)
    async def available_models(endpoint: str, api_key: Optional[str] = None) -> Set[str]:
        """
        Return the set of model names that ``endpoint`` *advertises* (i.e. is
        capable of serving), regardless of whether a model is currently loaded
        into memory. The HTTP request itself is made by
        ``_fetch_available_models_internal`` (Ollama ``/api/tags`` or an
        OpenAI-style ``/models`` listing).

        Success cache (``_models_cache``):
          * younger than 300s: returned as-is
          * 300-600s old: returned as-is while a background refresh is started
            (stale-while-revalidate, avoids model blackouts on transient
            timeouts)
          * older: dropped, a fresh fetch follows
        Error cache (``_available_error_cache``):
          * error younger than 30s: empty set, no request
          * 30-300s old: empty set, but a background probe is started
          * older: dropped, a fresh fetch follows
        Request coalescing: concurrent callers that reach the fetch stage share
        one task per endpoint, so only one HTTP request is made.

        If the request fails (e.g. timeout, 5xx, or malformed response), an
        empty set is returned.
        """
        # Check models cache with lock protection
        async with _models_cache_lock:
            if endpoint in _models_cache:
                models, cached_at = _models_cache[endpoint]
                # FRESH: younger than 300s - return immediately
                if _is_fresh(cached_at, 300):
                    return models
                # STALE: 300-600s old - return stale data and refresh in background
                if _is_fresh(cached_at, 600):
                    # NOTE(review): the task returned by create_task is not
                    # stored anywhere; the asyncio docs recommend keeping a
                    # reference to fire-and-forget tasks - confirm this is fine.
                    asyncio.create_task(fetch._refresh_available_models(endpoint, api_key))
                    return models  # Return stale data immediately
                # EXPIRED: 600s or older - too stale, must refresh synchronously
                del _models_cache[endpoint]
        # Check error cache with lock protection
        async with _available_error_cache_lock:
            if endpoint in _available_error_cache:
                err_age = time.time() - _available_error_cache[endpoint]
                if err_age < 30:
                    # Very fresh error (<30s) - endpoint likely still down, bail fast
                    return set()
                elif err_age < 300:
                    # Older error (30-300s) - endpoint may have recovered, probe in background
                    asyncio.create_task(fetch._refresh_available_models(endpoint, api_key))
                    return set()
                # Error expired (300s or older) - remove and fall through to fresh fetch
                del _available_error_cache[endpoint]
        # Request coalescing: check if another request is already fetching this endpoint
        async with _inflight_lock:
            if endpoint in _inflight_available_models:
                # Another request is already fetching - wait for it
                task = _inflight_available_models[endpoint]
            else:
                # Create new fetch task
                task = asyncio.create_task(fetch._fetch_available_models_internal(endpoint, api_key))
                _inflight_available_models[endpoint] = task
        try:
            # Wait for the fetch to complete (either ours or another request's)
            result = await task
            return result
        finally:
            # Drop the in-flight entry if it still refers to the task we
            # awaited. All coalesced waiters hold the same task object, so
            # whichever waiter gets here first removes it.
            async with _inflight_lock:
                if _inflight_available_models.get(endpoint) == task:
                    _inflight_available_models.pop(endpoint, None)
    async def _fetch_loaded_models_internal(endpoint: str) -> Set[str]:
        """
        Perform the HTTP request that lists the models currently resident on
        an endpoint. Called by loaded_models() once caches and in-flight
        requests have been checked.

        * llama-server endpoints: GET <endpoint>/models, keeping the ids of
          items for which ``_is_llama_model_loaded`` is true.
        * everything else (Ollama): GET <endpoint>/api/ps, keeping every
          non-empty "name" from the "models" list.
        A successful probe stores the set in ``_loaded_models_cache`` and
        clears the endpoint's entry in ``_loaded_error_cache`` so it becomes
        routable again. Any exception records the failure time in
        ``_loaded_error_cache`` and yields an empty set.
        """
        session = get_session(endpoint)
        is_llama_server = endpoint in get_config().llama_server_endpoints
        if is_llama_server:
            probe_url, list_key, name_key = f"{endpoint}/models", "data", "id"
        else:
            probe_url, list_key, name_key = f"{endpoint}/api/ps", "models", "name"

        try:
            async with session.get(probe_url) as resp:
                await _ensure_success(resp)
                payload = await resp.json()
            resident = set()
            for entry in payload.get(list_key, []):
                label = entry.get(name_key)
                if not label:
                    continue
                # llama-server lists every known model; keep only loaded ones
                if is_llama_server and not _is_llama_model_loaded(entry):
                    continue
                resident.add(label)
            async with _loaded_models_cache_lock:
                _loaded_models_cache[endpoint] = (resident, time.time())
            async with _loaded_error_cache_lock:
                _loaded_error_cache.pop(endpoint, None)
            return resident
        except Exception as err:
            # Any failure is treated as "nothing loaded". The timestamp lets
            # `choose_endpoint` avoid the backend and short-circuits repeated
            # probes.
            message = _format_connection_issue(probe_url, err)
            print(f"[fetch.loaded_models] {message}")
            async with _loaded_error_cache_lock:
                _loaded_error_cache[endpoint] = time.time()
            return set()
    async def _refresh_loaded_models(endpoint: str) -> None:
        """
        Refresh the loaded-models cache for ``endpoint`` in the background.

        Used by the stale-while-revalidate path of ``loaded_models``.
        Deduplicates: if ``_bg_refresh_loaded`` already holds an unfinished
        task for this endpoint, the call returns without doing anything.
        Otherwise a task running ``_fetch_loaded_models_internal`` is
        registered, awaited, and unregistered again.
        """
        # Check-and-register happens under one lock so two callers cannot both
        # start a refresh for the same endpoint.
        async with _bg_refresh_lock:
            if endpoint in _bg_refresh_loaded and not _bg_refresh_loaded[endpoint].done():
                return  # A refresh is already running for this endpoint
            task = asyncio.create_task(fetch._fetch_loaded_models_internal(endpoint))
            _bg_refresh_loaded[endpoint] = task
        try:
            await task
        except Exception as e:
            # Log and swallow: a failed refresh must not propagate out of a
            # fire-and-forget task. The cached data simply stays stale.
            print(f"[fetch._refresh_loaded_models] Background refresh failed for {endpoint}: {e}")
        finally:
            # Unregister only our own task; a newer refresh may have replaced it.
            async with _bg_refresh_lock:
                if _bg_refresh_loaded.get(endpoint) is task:
                    _bg_refresh_loaded.pop(endpoint, None)
    async def loaded_models(endpoint: str) -> Set[str]:
        """
        Return the set of model names currently loaded on ``endpoint``.

        External OpenAI endpoints always yield an empty set without any
        request. For the rest, the HTTP request is made by
        ``_fetch_loaded_models_internal`` (Ollama ``/api/ps`` or a
        llama-server model listing).

        Success cache (``_loaded_models_cache``):
          * younger than 10s: returned as-is
          * 10-60s old: returned as-is while a background refresh is started
          * older: dropped, a fresh fetch follows
        Error cache (``_loaded_error_cache``): a failure younger than 300s
        short-circuits to an empty set; older entries are dropped.
        Request coalescing: concurrent callers that reach the fetch stage
        share one task per endpoint.

        If the request fails (e.g. timeout, 5xx), an empty set is returned.
        """
        if is_ext_openai_endpoint(endpoint):
            return set()
        # Check loaded models cache with lock protection
        async with _loaded_models_cache_lock:
            if endpoint in _loaded_models_cache:
                models, cached_at = _loaded_models_cache[endpoint]
                # FRESH: < 10s old - return immediately
                if _is_fresh(cached_at, 10):
                    return models
                # STALE: 10-60s old - return stale data and refresh in background
                if _is_fresh(cached_at, 60):
                    # Kick off background refresh (fire-and-forget).
                    # NOTE(review): the task returned by create_task is not
                    # stored anywhere; the asyncio docs recommend keeping a
                    # reference to fire-and-forget tasks - confirm this is fine.
                    asyncio.create_task(fetch._refresh_loaded_models(endpoint))
                    return models  # Return stale data immediately
                # EXPIRED: 60s or older - too stale, must refresh synchronously
                del _loaded_models_cache[endpoint]
        # Check error cache with lock protection
        async with _loaded_error_cache_lock:
            if endpoint in _loaded_error_cache:
                if _is_fresh(_loaded_error_cache[endpoint], 300):
                    return set()
                # Error expired - remove it
                del _loaded_error_cache[endpoint]
        # Request coalescing: check if another request is already fetching this endpoint
        async with _inflight_lock:
            if endpoint in _inflight_loaded_models:
                # Another request is already fetching - wait for it
                task = _inflight_loaded_models[endpoint]
            else:
                # Create new fetch task
                task = asyncio.create_task(fetch._fetch_loaded_models_internal(endpoint))
                _inflight_loaded_models[endpoint] = task
        try:
            # Wait for the fetch to complete (either ours or another request's)
            result = await task
            return result
        finally:
            # Drop the in-flight entry if it still refers to the task we
            # awaited. All coalesced waiters hold the same task object, so
            # whichever waiter gets here first removes it.
            async with _inflight_lock:
                if _inflight_loaded_models.get(endpoint) == task:
                    _inflight_loaded_models.pop(endpoint, None)
    async def endpoint_details(endpoint: str, route: str, detail: str, api_key: Optional[str] = None, skip_error_cache: bool = False, timeout: float = None) -> List[dict]:
        """
        GET <endpoint>/<route> and return the value stored under the
        ``detail`` key of the JSON response (an empty list when the key is
        missing). Any failure also yields an empty list.

        When ``skip_error_cache`` is False (the default), the call returns []
        without a request if ``_available_error_cache`` holds a failure
        younger than 300s for this endpoint, and a new failure is recorded
        there. Pass ``skip_error_cache=True`` from health-check routes that
        must always probe; in that mode the error cache is neither read nor
        written.

        ``timeout`` overrides the session default for this single request
        (seconds, total).
        """
        # Fast-fail if the endpoint is known to be down (unless caller opts out)
        if not skip_error_cache:
            async with _available_error_cache_lock:
                if endpoint in _available_error_cache and _is_fresh(_available_error_cache[endpoint], 300):
                    return []

        request_headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
        if api_key is not None:
            request_headers["Authorization"] = "Bearer " + api_key
        request_url = f"{endpoint.rstrip('/')}/{route.lstrip('/')}"
        session = get_session(endpoint)
        extra = {} if timeout is None else {"timeout": aiohttp.ClientTimeout(total=timeout)}

        try:
            async with session.get(request_url, headers=request_headers, **extra) as resp:
                await _ensure_success(resp)
                payload = await resp.json()
            return payload.get(detail, [])
        except Exception as err:
            # No details can be reported for a failed request
            message = _format_connection_issue(request_url, err)
            print(f"[fetch.endpoint_details] {message}")
            if not skip_error_cache:
                async with _available_error_cache_lock:
                    _available_error_cache[endpoint] = time.time()
            return []
 # -------------------------------------------------------------
 # Endpoint health probes (shared by /api/config and /health)
 # -------------------------------------------------------------
 async def _raw_probe(
    ep: str,
    route: str,
    api_key: Optional[str] = None,
    timeout: Optional[float] = None,
 ) -> tuple[bool, object]:
    """GET ``<ep>/<route>`` and report success and failure separately.

    Unlike `fetch.endpoint_details`, which returns [] in both cases, this
    returns `(True, parsed_json)` on success and `(False, error_message)` on
    any exception. The message comes from `_format_connection_issue`.
    ``timeout`` (seconds, total) overrides the session default when given.
    """
    request_headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
    if api_key is not None:
        request_headers["Authorization"] = "Bearer " + api_key
    url = f"{ep.rstrip('/')}/{route.lstrip('/')}"
    extra = {} if timeout is None else {"timeout": aiohttp.ClientTimeout(total=timeout)}
    try:
        # Session lookup stays inside the try so its failures are reported too
        session = get_session(ep)
        async with session.get(url, headers=request_headers, **extra) as resp:
            await _ensure_success(resp)
            payload = await resp.json()
    except Exception as exc:
        return False, _format_connection_issue(url, exc)
    return True, payload
 async def _endpoint_health(ep: str, *, timeout: Optional[float] = None) -> dict:
    """Probe an endpoint and return `{status, version?, detail?}`.

    OpenAI-compatible endpoints get a single `/models` probe (with the
    endpoint's configured API key) and report version "latest" when it works.

    Ollama endpoints get `/api/version` and `/api/ps` probed concurrently, so
    that a daemon which is reachable but has a broken model-introspection
    path (issue #83) is reported as `error` rather than `ok`:
      * both succeed      -> ok, with the reported version
      * only /api/ps fails -> error, version kept so the operator can see the
        daemon itself is alive
      * only /api/version fails -> error
      * both fail         -> error, detail taken from the version probe
    """
    if is_openai_compatible(ep):
        key = get_config().api_keys.get(ep)
        reachable, outcome = await _raw_probe(ep, "/models", key, timeout=timeout)
        if not reachable:
            return {"status": "error", "detail": str(outcome)}
        return {"status": "ok", "version": "latest"}

    version_result, ps_result = await asyncio.gather(
        _raw_probe(ep, "/api/version", timeout=timeout),
        _raw_probe(ep, "/api/ps", timeout=timeout),
    )
    version_ok, version_payload = version_result
    ps_ok, ps_payload = ps_result

    daemon_version = None
    if version_ok and isinstance(version_payload, dict):
        daemon_version = version_payload.get("version")

    if version_ok:
        if ps_ok:
            return {"status": "ok", "version": daemon_version}
        return {
            "status": "error",
            "version": daemon_version,
            "detail": f"/api/ps: {ps_payload}",
        }
    if ps_ok:
        return {
            "status": "error",
            "detail": f"/api/version: {version_payload}",
        }
    return {"status": "error", "detail": str(version_payload)}
--- a/backends/sessions.py
+++ b/backends/sessions.py
@ -0,0 +1,72 @@
 """aiohttp / OpenAI client factories aware of Unix-socket endpoints.
 Unix socket endpoints follow the ``.sock`` hostname convention (e.g.
 ``http://192.168.0.52.sock/v1``) and resolve to ``/run/user/<uid>/<host>``.
 Their sessions/clients live in ``state.app_state`` so that startup can
 populate them once and routes can reuse them.
 """
 import os
 import aiohttp
 import openai
 from state import app_state
 from backends.normalize import ep2base
 def _is_unix_socket_endpoint(endpoint: str) -> bool:
    """Return True if the endpoint follows the ``.sock`` hostname convention.

    The host is the text between "//" and the next "/" or ":", so a URL like
    http://192.168.0.52.sock/v1 qualifies. Such endpoints are meant to be
    reached over a Unix domain socket instead of TCP (the socket path is
    derived by ``_get_socket_path``). A string without "//" yields False.
    """
    _, scheme_sep, remainder = endpoint.partition("//")
    if not scheme_sep:
        return False
    host = remainder.partition("/")[0].partition(":")[0]
    return host.endswith(".sock")
 def _get_socket_path(endpoint: str) -> str:
    """Map a .sock endpoint URL to its Unix socket file path.

    The host part of the URL becomes the file name inside the current user's
    runtime directory:
    http://192.168.0.52.sock/v1 -> /run/user/<uid>/192.168.0.52.sock
    Raises IndexError when the endpoint contains no "//".
    """
    after_scheme = endpoint.split("//", 1)[1]
    host = after_scheme.partition("/")[0].partition(":")[0]
    uid = os.getuid()
    return f"/run/user/{uid}/{host}"
 def get_session(endpoint: str) -> aiohttp.ClientSession:
    """Pick the aiohttp session to use for the given endpoint.

    A Unix socket endpoint (.sock) uses the session registered for it in
    ``app_state["socket_sessions"]``. Every other endpoint, and a socket
    endpoint without a registered session, uses the shared
    ``app_state["session"]``.
    """
    dedicated = None
    if _is_unix_socket_endpoint(endpoint):
        dedicated = app_state["socket_sessions"].get(endpoint)
    return app_state["session"] if dedicated is None else dedicated
 def _make_openai_client(
    endpoint: str,
    default_headers: dict | None = None,
    api_key: str = "no-key",
 ) -> openai.AsyncOpenAI:
    """Build an AsyncOpenAI client for the given endpoint.

    The base URL comes from ``ep2base(endpoint)``. For a Unix socket endpoint
    that has a client registered in ``app_state["httpx_clients"]``, that
    client is passed as ``http_client`` and the base URL is replaced by
    "http://localhost/v1", so the SDK connects via the socket instead of TCP.
    ``default_headers`` is only forwarded when it is not None.
    """
    client_kwargs: dict = {"api_key": api_key, "base_url": ep2base(endpoint)}
    if default_headers is not None:
        client_kwargs["default_headers"] = default_headers
    if _is_unix_socket_endpoint(endpoint):
        uds_client = app_state["httpx_clients"].get(endpoint)
        if uds_client is not None:
            client_kwargs["http_client"] = uds_client
            # Placeholder host; the registered client decides where to connect
            client_kwargs["base_url"] = "http://localhost/v1"
    return openai.AsyncOpenAI(**client_kwargs)
--- a/config.py
+++ b/config.py
@ -124,3 +124,16 @@ def _config_path_from_env() -> Path:
    if candidate:
        return Path(candidate).expanduser()
    return Path("config.yaml")
 # ------------------------------------------------------------------
 # Shared config accessor
 # ------------------------------------------------------------------
 # Submodules read config at call time via get_config() instead of importing
 # a bound name. The single source of truth is ``router.config`` — the lazy
 # import below resolves it after router.py has finished loading, and lets
 # tests that ``patch.object(router, "config", cfg)`` flow through.
 def get_config() -> "Config":
    """Return the currently active Config from router.py.

    The value is read from ``router.config`` on every call, so the caller
    receives whatever object is bound to that attribute at call time.
    """
    import router  # lazy to avoid module-load circular import
    return router.config
--- a/router.py
+++ b/router.py
@ -75,7 +75,8 @@ from db import TokenDatabase
 from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse
-# Create the global config object – it will be overwritten on startup
+# Create the global config object – it will be overwritten on startup.
 # Submodules read it lazily via config.get_config().
 config = Config.from_yaml(_config_path_from_env())
 # -------------------------------------------------------------
@ -90,11 +91,7 @@ app.add_middleware(
    allow_methods=["GET", "POST", "DELETE"],
    allow_headers=["Authorization", "Content-Type"],
 )
-default_headers={
+from state import default_headers
    "HTTP-Referer": "https://nomyo.ai",
    "Referer": "https://nomyo.ai",
    "X-Title": "NOMYO Router",
    }
 # -------------------------------------------------------------
 # Router-level authentication (optional)
@ -205,254 +202,36 @@ from fingerprint import _conversation_fingerprint
 db: "TokenDatabase" = None
 # -------------------------------------------------------------
-# 4. Helperfunctions 
+# 4. Helperfunctions
 # -------------------------------------------------------------
-def _is_fresh(cached_at: float, ttl: int) -> bool:
+from backends.normalize import (
-    return (time.time() - cached_at) < ttl
+    _normalize_llama_model_name,
-
+    _extract_llama_quant,
-async def _ensure_success(resp: aiohttp.ClientResponse) -> None:
+    ep2base,
-    if resp.status >= 400:
+    dedupe_on_keys,
-        text = await resp.text()
+)
-        raise HTTPException(status_code=resp.status, detail=_mask_secrets(text))
+from backends.sessions import (
-    
+    _is_unix_socket_endpoint,
-def _format_connection_issue(url: str, error: Exception) -> str:
+    _get_socket_path,
-    """
+    get_session,
-    Provide a human-friendly error string for connection failures so operators
+    _make_openai_client,
-    know which endpoint and address failed from inside the container.
+)
-    """
+from backends.health import (
-    parsed = urlparse(url)
+    _is_fresh,
-    host_hint = parsed.hostname or ""
+    _ensure_success,
-    port_hint = parsed.port or ""
+    _format_connection_issue,
-
+    _is_backend_connection_error,
-    if isinstance(error, aiohttp.ClientConnectorError):
+    _mark_backend_unhealthy,
-        resolved_host = getattr(error, "host", host_hint) or host_hint or "?"
+    _is_llama_model_loaded,
-        resolved_port = getattr(error, "port", port_hint) or port_hint or "?"
+    _is_llama_model_loaded_or_sleeping,
-        parts = [
+)
            f"Failed to connect to {url} (resolved: {resolved_host}:{resolved_port}).",
            "Ensure the endpoint address is reachable from within the container.",
        ]
        if resolved_host in {"localhost", "127.0.0.1"}:
            parts.append(
                "Inside Docker, 'localhost' refers to the container itself; use "
                "'host.docker.internal' or a Docker network alias if the service "
                "runs on the host machine."
            )
        os_error = getattr(error, "os_error", None)
        if isinstance(os_error, OSError):
            errno = getattr(os_error, "errno", None)
            strerror = os_error.strerror or str(os_error)
            if errno is not None or strerror:
                parts.append(f"OS error [{errno}]: {strerror}.")
        elif os_error:
            parts.append(f"OS error: {os_error}.")
        parts.append(f"Original error: {error}.")
        return " ".join(parts)
    if isinstance(error, asyncio.TimeoutError):
        return (
            f"Timed out waiting for {url}. "
            "The remote endpoint may be offline or slow to respond."
        )
    return f"Error while contacting {url}: {error}"
 def _normalize_llama_model_name(name: str) -> str:
    """Reduce a huggingface-style identifier to its bare model name.

    Everything up to the last "/" is dropped, then everything from the
    first remaining ":" onwards,
    e.g. 'unsloth/gpt-oss-20b-GGUF:F16' -> 'gpt-oss-20b-GGUF'
    """
    # rpartition/partition return the whole string in the selected slot
    # when the separator is absent, so no membership checks are needed.
    basename = name.rpartition("/")[2]
    return basename.partition(":")[0]
 def _extract_llama_quant(name: str) -> str:
    """Return the quantization suffix of a huggingface-style identifier.

    The suffix is whatever follows the last ":",
    e.g. 'unsloth/gpt-oss-20b-GGUF:Q8_0' -> 'Q8_0'
    An identifier without any ":" yields the empty string.
    """
    _, colon, quant = name.rpartition(":")
    return quant if colon else ""
-def _is_unix_socket_endpoint(endpoint: str) -> bool:
+from backends.normalize import (
-    """Return True if endpoint uses Unix socket (.sock hostname convention).
+    is_ext_openai_endpoint,
-
+    is_openai_compatible,
-    Detects URLs like http://192.168.0.52.sock/v1 where the host ends with
+    get_tracking_model,
-    .sock, indicating the connection should use a Unix domain socket at
+)
    /run/user/<uid>/<host> instead of TCP.
    """
    try:
        host = endpoint.split("//", 1)[1].split("/")[0].split(":")[0]
        return host.endswith(".sock")
    except IndexError:
        return False
 def _get_socket_path(endpoint: str) -> str:
    """Map a ``.sock`` endpoint URL to its socket file under ``/run/user/<uid>/``.

    Example: ``http://192.168.0.52.sock/v1`` -> ``/run/user/<uid>/192.168.0.52.sock``
    """
    # Drop the scheme, then cut the remainder at the first "/" (path)
    # and the first ":" (port) to isolate the bare host name.
    after_scheme = endpoint.split("//", 1)[1]
    authority = after_scheme.split("/")[0]
    host = authority.split(":")[0]
    return f"/run/user/{os.getuid()}/{host}"
 def get_session(endpoint: str) -> aiohttp.ClientSession:
    """Pick the aiohttp session that should carry requests to ``endpoint``.

    A ``.sock`` endpoint uses its dedicated session from
    ``app_state["socket_sessions"]`` when one is registered there. Every
    other endpoint, and a ``.sock`` endpoint without a registered session,
    gets the shared ``app_state["session"]``.
    """
    if not _is_unix_socket_endpoint(endpoint):
        return app_state["session"]
    dedicated = app_state["socket_sessions"].get(endpoint)
    return app_state["session"] if dedicated is None else dedicated
 def _make_openai_client(
    endpoint: str,
    default_headers: dict | None = None,
    api_key: str = "no-key",
 ) -> openai.AsyncOpenAI:
    """Build an AsyncOpenAI client that targets ``endpoint``.

    When ``endpoint`` is a ``.sock`` endpoint and an httpx client is
    registered for it in ``app_state["httpx_clients"]``, that client is
    handed to the SDK and the base URL becomes the placeholder
    ``http://localhost/v1``. Otherwise the base URL is ``ep2base(endpoint)``.
    ``default_headers`` is forwarded only when it is not None.
    """
    target_url = ep2base(endpoint)
    client_options: dict = {"api_key": api_key}
    if default_headers is not None:
        client_options["default_headers"] = default_headers
    # Only .sock endpoints are looked up in the httpx client registry.
    uds_client = (
        app_state["httpx_clients"].get(endpoint)
        if _is_unix_socket_endpoint(endpoint)
        else None
    )
    if uds_client is not None:
        client_options["http_client"] = uds_client
        target_url = "http://localhost/v1"
    return openai.AsyncOpenAI(base_url=target_url, **client_options)
 def _is_backend_connection_error(exc: Exception) -> bool:
    """Tell whether ``exc`` is an upstream connection-class failure.

    Covers the situation where a llama-server in router mode still answers
    /v1/models while the worker it delegates a specific model to is dead:
    chat/completions then fails with a 5xx carrying 'proxy error: Could not
    establish connection', or the SDK raises APIConnectionError directly.

    BadRequestError with exceed_context_size_error is deliberately not
    matched; those must stay on the reactive-trim path.
    """
    if isinstance(exc, openai.APIConnectionError):
        return True
    if not isinstance(exc, openai.InternalServerError):
        return False
    # A 5xx only counts when its message names a connection problem.
    lowered = str(exc).lower()
    markers = (
        "proxy error",
        "could not establish connection",
        "connection refused",
    )
    return any(marker in lowered for marker in markers)
 async def _mark_backend_unhealthy(endpoint: str, model: str, reason: str = "") -> None:
    """Stamp the (endpoint, model) pair as broken so choose_endpoint avoids it.

    The entry is cleared only by TTL. The dead-worker failure mode is
    invisible to the /v1/models and /api/ps probes that clear
    _loaded_error_cache, so a successful probe is not a usable recovery
    signal. At most the first 120 characters of ``reason`` are logged.
    """
    pair = (endpoint, model)
    async with _completion_error_cache_lock:
        _completion_error_cache[pair] = time.time()
    print(f"[health] marked unhealthy ep={endpoint} model={model} reason={reason[:120]}", flush=True)
 def _is_llama_model_loaded(item: dict) -> bool:
    """Report whether a llama-server /v1/models item has status 'loaded'.

    The status may be a dict ({"value": "loaded"}) or a plain string
    ("loaded"). An item without a status field counts as loaded, since the
    model is then not dynamically managed (e.g. single-model servers).
    Any other status type yields False.
    """
    state = item.get("status")
    if state is None:
        return True
    if isinstance(state, dict):
        return state.get("value") == "loaded"
    return isinstance(state, str) and state == "loaded"
 def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
    """Report whether the item's status is 'loaded' or 'sleeping'.

    Newer llama-server versions report 'sleeping' in /v1/models for an idle
    model; ps_details has to include those so _fetch_llama_props can detect
    and unload them. The status may be a dict ({"value": ...}) or a plain
    string; a missing status counts as True, any other type as False.
    """
    accepted = ("loaded", "sleeping")
    state = item.get("status")
    if state is None:
        return True
    if isinstance(state, dict):
        return state.get("value") in accepted
    return isinstance(state, str) and state in accepted
 def is_ext_openai_endpoint(endpoint: str) -> bool:
    """
    Decide whether ``endpoint`` is an external OpenAI-compatible service,
    i.e. neither Ollama nor llama-server.

    Returns False for:
    - endpoints listed in ``config.llama_server_endpoints``
    - endpoints without "/v1"
    - Ollama's own /v1 (the endpoint minus "/v1" is in ``config.endpoints``)
    - anything on the default Ollama port 11434

    Returns True for everything else (OpenAI.com, Groq, etc.).
    """
    if endpoint in config.llama_server_endpoints:
        return False
    if "/v1" not in endpoint:
        return False
    # Either signal marks the endpoint as Ollama behind its /v1 facade.
    ollama_base = endpoint.replace('/v1', '')
    looks_like_ollama = ollama_base in config.endpoints or ':11434' in endpoint
    return not looks_like_ollama
 def is_openai_compatible(endpoint: str) -> bool:
    """
    Tell whether the endpoint speaks the OpenAI API rather than native Ollama.

    True when the endpoint contains "/v1" or is listed in
    ``config.llama_server_endpoints``.
    """
    if "/v1" in endpoint:
        return True
    return endpoint in config.llama_server_endpoints
 def get_tracking_model(endpoint: str, model: str) -> str:
    """
    Normalize a model name for usage tracking so it matches the PS table key.

    - External OpenAI endpoints: returned unchanged (not shown in PS)
    - llama-server endpoints: HF prefix and quantization suffix stripped
    - Ollama endpoints: ":latest" appended when the name has no ":" suffix
    """
    if is_ext_openai_endpoint(endpoint):
        return model
    if endpoint in config.llama_server_endpoints:
        return _normalize_llama_model_name(model)
    # Remaining case is Ollama, whose PS entries always carry a tag.
    return model if ":" in model else model + ":latest"
 async def token_worker() -> None:
    try:
@ -601,348 +380,8 @@ async def flush_remaining_buffers() -> None:
        # Do not raise during shutdown – log and continue teardown
        print(f"[shutdown] Error flushing remaining buffers: {e}")
-class fetch:
+from backends.probe import fetch
    async def _fetch_available_models_internal(endpoint: str, api_key: Optional[str] = None) -> Set[str]:
        """
        Perform the HTTP request that lists the models an endpoint offers.

        Called by available_models() once its cache and in-flight checks have
        been passed. On success the result is stored in ``_models_cache``; on
        any error the endpoint is stamped in ``_available_error_cache`` and an
        empty set is returned.
        """
        request_headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
        if api_key is not None:
            request_headers["Authorization"] = "Bearer " + api_key
        ep_base = endpoint.rstrip("/")
        is_llama_server = endpoint in config.llama_server_endpoints
        if "/v1" in endpoint:
            # Endpoint already carries the OpenAI prefix.
            endpoint_url, list_key = f"{ep_base}/models", "data"
        elif is_llama_server:
            # llama-server configured without /v1: add the prefix ourselves.
            endpoint_url, list_key = f"{ep_base}/v1/models", "data"
        else:
            # Native Ollama listing.
            endpoint_url, list_key = f"{ep_base}/api/tags", "models"
        session: aiohttp.ClientSession = get_session(endpoint)
        try:
            async with session.get(endpoint_url, headers=request_headers) as resp:
                await _ensure_success(resp)
                payload = await resp.json()
            # Prefer "id", fall back to "name"; entries with neither are skipped.
            names = {
                label
                for entry in payload.get(list_key, [])
                if (label := entry.get("id") or entry.get("name"))
            }
            async with _models_cache_lock:
                _models_cache[endpoint] = (names, time.time())
            return names
        except Exception as e:
            # Any failure is treated as "this endpoint offers no models".
            message = _format_connection_issue(endpoint_url, e)
            print(f"[fetch.available_models] {message}")
            async with _available_error_cache_lock:
                _available_error_cache[endpoint] = time.time()
            return set()
    async def _refresh_available_models(endpoint: str, api_key: Optional[str] = None) -> None:
        """
        Refresh the available-models cache for ``endpoint`` without blocking
        the caller (stale-while-revalidate).

        One refresh per endpoint is tracked in ``_bg_refresh_available``; a
        call made while the tracked refresh is still running returns at once.
        A failing refresh is printed and otherwise ignored.
        """
        async with _bg_refresh_lock:
            already_running = (
                endpoint in _bg_refresh_available
                and not _bg_refresh_available[endpoint].done()
            )
            if already_running:
                return
            refresh = asyncio.create_task(fetch._fetch_available_models_internal(endpoint, api_key))
            _bg_refresh_available[endpoint] = refresh
        try:
            await refresh
        except Exception as e:
            # The cache simply keeps its previous content; only report it.
            print(f"[fetch._refresh_available_models] Background refresh failed for {endpoint}: {e}")
        finally:
            async with _bg_refresh_lock:
                # Unregister only if the slot still points at our own task.
                if _bg_refresh_available.get(endpoint) is refresh:
                    _bg_refresh_available.pop(endpoint, None)
    async def available_models(endpoint: str, api_key: Optional[str] = None) -> Set[str]:
        """
        Return the set of model names that ``endpoint`` *advertises* (i.e. is
        capable of serving), regardless of whether a model is currently loaded
        into memory. The actual HTTP request is made by
        ``fetch._fetch_available_models_internal``.

        Cache policy (ages measured against ``_models_cache``):
          * younger than 300s: the cached set is returned as is
          * 300-600s: the cached set is returned and a background refresh
            is scheduled (stale-while-revalidate)
          * older: the entry is dropped and a fetch is awaited

        Error policy (ages measured against ``_available_error_cache``):
          * younger than 30s: an empty set is returned immediately
          * 30-300s: an empty set is returned and a background probe is scheduled
          * older: the entry is dropped and a fetch is awaited

        Uses request coalescing to prevent cache stampede: concurrent callers
        for the same endpoint await one shared fetch task.

        If the request fails (e.g. timeout, 5xx, or malformed response), an empty
        set is returned.
        """
        # Check models cache with lock protection
        async with _models_cache_lock:
            if endpoint in _models_cache:
                models, cached_at = _models_cache[endpoint]
                # FRESH: < 300s old - return immediately
                if _is_fresh(cached_at, 300):
                    return models
                # STALE: 300-600s old - return stale data and refresh in background
                if _is_fresh(cached_at, 600):
                    # NOTE(review): the task returned by create_task is not stored
                    # here; asyncio recommends keeping a reference to
                    # fire-and-forget tasks - confirm this is acceptable.
                    asyncio.create_task(fetch._refresh_available_models(endpoint, api_key))
                    return models  # Return stale data immediately
                # EXPIRED: >= 600s old - too stale, must refresh synchronously
                del _models_cache[endpoint]
        # Check error cache with lock protection
        async with _available_error_cache_lock:
            if endpoint in _available_error_cache:
                err_age = time.time() - _available_error_cache[endpoint]
                if err_age < 30:
                    # Very fresh error (<30s) – endpoint likely still down, bail fast
                    return set()
                elif err_age < 300:
                    # Stale error (30-300s) – endpoint may have recovered, probe in background
                    asyncio.create_task(fetch._refresh_available_models(endpoint, api_key))
                    return set()
                # Error expired (>=300s) – remove and fall through to fresh fetch
                del _available_error_cache[endpoint]
        # Request coalescing: check if another request is already fetching this endpoint
        async with _inflight_lock:
            if endpoint in _inflight_available_models:
                # Another request is already fetching - wait for it
                task = _inflight_available_models[endpoint]
            else:
                # Create new fetch task
                task = asyncio.create_task(fetch._fetch_available_models_internal(endpoint, api_key))
                _inflight_available_models[endpoint] = task
        try:
            # Wait for the fetch to complete (either ours or another request's)
            result = await task
            return result
        finally:
            # Clean up in-flight tracking: whichever waiter gets here while the
            # registered task is still the one it awaited removes the entry.
            async with _inflight_lock:
                if _inflight_available_models.get(endpoint) == task:
                    _inflight_available_models.pop(endpoint, None)
    async def _fetch_loaded_models_internal(endpoint: str) -> Set[str]:
        """
        Internal function that performs the actual HTTP request to fetch loaded models.
        This is called by loaded_models() after checking caches and in-flight requests.

        For endpoints listed in ``config.llama_server_endpoints``: queries
        ``<endpoint>/models`` and keeps the ids of items that
        ``_is_llama_model_loaded`` accepts.
        For all other endpoints (Ollama): queries ``<endpoint>/api/ps`` and
        returns the model names.

        On success the result is stored in ``_loaded_models_cache`` and any entry
        for the endpoint in ``_loaded_error_cache`` is removed. On any exception
        the endpoint is stamped in ``_loaded_error_cache`` and an empty set is
        returned.
        """
        client: aiohttp.ClientSession = get_session(endpoint)
        # Check if this is a llama-server endpoint
        if endpoint in config.llama_server_endpoints:
            # Query the models listing for llama-server.
            # NOTE(review): the URL is "<endpoint>/models", which is /v1/models only
            # when the configured endpoint already ends in /v1 - confirm that
            # llama-server endpoints are always configured that way.
            try:
                async with client.get(f"{endpoint}/models") as resp:
                    await _ensure_success(resp)
                    data = await resp.json()
                # Filter for loaded models only
                items = data.get("data", [])
                models = {
                    item.get("id")
                    for item in items
                    if item.get("id") and _is_llama_model_loaded(item)
                }
                # Update cache with lock protection
                async with _loaded_models_cache_lock:
                    _loaded_models_cache[endpoint] = (models, time.time())
                # Probe succeeded — clear any stale error so the endpoint
                # becomes routable again.
                async with _loaded_error_cache_lock:
                    _loaded_error_cache.pop(endpoint, None)
                return models
            except Exception as e:
                # If anything goes wrong we simply assume the endpoint has no models
                message = _format_connection_issue(f"{endpoint}/models", e)
                print(f"[fetch.loaded_models] {message}")
                # Record the failure so `choose_endpoint` can avoid routing
                # to an unhealthy backend and repeated probes short-circuit.
                async with _loaded_error_cache_lock:
                    _loaded_error_cache[endpoint] = time.time()
                return set()
        else:
            # Original Ollama /api/ps logic
            try:
                async with client.get(f"{endpoint}/api/ps") as resp:
                    await _ensure_success(resp)
                    data = await resp.json()
                # The response format is:
                #   {"models": [{"name": "model1"}, {"name": "model2"}]}
                models = {m.get("name") for m in data.get("models", []) if m.get("name")}
                # Update cache with lock protection
                async with _loaded_models_cache_lock:
                    _loaded_models_cache[endpoint] = (models, time.time())
                # Probe succeeded: drop any recorded failure for this endpoint.
                async with _loaded_error_cache_lock:
                    _loaded_error_cache.pop(endpoint, None)
                return models
            except Exception as e:
                # If anything goes wrong we simply assume the endpoint has no models
                message = _format_connection_issue(f"{endpoint}/api/ps", e)
                print(f"[fetch.loaded_models] {message}")
                # Record the failure time, mirroring the llama-server branch.
                async with _loaded_error_cache_lock:
                    _loaded_error_cache[endpoint] = time.time()
                return set()
    async def _refresh_loaded_models(endpoint: str) -> None:
        """
        Refresh the loaded-models cache for ``endpoint`` without blocking the
        caller (stale-while-revalidate).

        One refresh per endpoint is tracked in ``_bg_refresh_loaded``; a call
        made while the tracked refresh is still running returns at once.
        A failing refresh is printed and otherwise ignored.
        """
        async with _bg_refresh_lock:
            already_running = (
                endpoint in _bg_refresh_loaded
                and not _bg_refresh_loaded[endpoint].done()
            )
            if already_running:
                return
            refresh = asyncio.create_task(fetch._fetch_loaded_models_internal(endpoint))
            _bg_refresh_loaded[endpoint] = refresh
        try:
            await refresh
        except Exception as e:
            # The cache simply keeps its previous content; only report it.
            print(f"[fetch._refresh_loaded_models] Background refresh failed for {endpoint}: {e}")
        finally:
            async with _bg_refresh_lock:
                # Unregister only if the slot still points at our own task.
                if _bg_refresh_loaded.get(endpoint) is refresh:
                    _bg_refresh_loaded.pop(endpoint, None)
    async def loaded_models(endpoint: str) -> Set[str]:
        """
        Return the set of model names currently loaded on ``endpoint``. The
        actual HTTP request is made by ``fetch._fetch_loaded_models_internal``.
        If the request fails (e.g. timeout, 5xx), an empty set is returned.

        External OpenAI endpoints (``is_ext_openai_endpoint``) are never probed
        and always yield an empty set.

        Cache policy (ages measured against ``_loaded_models_cache``):
          * younger than 10s: the cached set is returned as is
          * 10-60s: the cached set is returned and a background refresh is scheduled
          * older: the entry is dropped and a fetch is awaited

        An endpoint with a failure recorded in ``_loaded_error_cache`` less than
        300s ago yields an empty set without probing.

        Uses request coalescing to prevent cache stampede: concurrent callers
        for the same endpoint await one shared fetch task.
        """
        if is_ext_openai_endpoint(endpoint):
            return set()
        # Check loaded models cache with lock protection
        async with _loaded_models_cache_lock:
            if endpoint in _loaded_models_cache:
                models, cached_at = _loaded_models_cache[endpoint]
                # FRESH: < 10s old - return immediately
                if _is_fresh(cached_at, 10):
                    return models
                # STALE: 10-60s old - return stale data and refresh in background
                if _is_fresh(cached_at, 60):
                    # Kick off background refresh (fire-and-forget)
                    # NOTE(review): the task returned by create_task is not stored
                    # here; asyncio recommends keeping a reference to
                    # fire-and-forget tasks - confirm this is acceptable.
                    asyncio.create_task(fetch._refresh_loaded_models(endpoint))
                    return models  # Return stale data immediately
                # EXPIRED: >= 60s old - too stale, must refresh synchronously
                del _loaded_models_cache[endpoint]
        # Check error cache with lock protection
        async with _loaded_error_cache_lock:
            if endpoint in _loaded_error_cache:
                # Recent failure (< 300s): skip the probe entirely
                if _is_fresh(_loaded_error_cache[endpoint], 300):
                    return set()
                # Error expired - remove it
                del _loaded_error_cache[endpoint]
        # Request coalescing: check if another request is already fetching this endpoint
        async with _inflight_lock:
            if endpoint in _inflight_loaded_models:
                # Another request is already fetching - wait for it
                task = _inflight_loaded_models[endpoint]
            else:
                # Create new fetch task
                task = asyncio.create_task(fetch._fetch_loaded_models_internal(endpoint))
                _inflight_loaded_models[endpoint] = task
        try:
            # Wait for the fetch to complete (either ours or another request's)
            result = await task
            return result
        finally:
            # Clean up in-flight tracking: whichever waiter gets here while the
            # registered task is still the one it awaited removes the entry.
            async with _inflight_lock:
                if _inflight_loaded_models.get(endpoint) == task:
                    _inflight_loaded_models.pop(endpoint, None)
    async def endpoint_details(endpoint: str, route: str, detail: str, api_key: Optional[str] = None, skip_error_cache: bool = False, timeout: Optional[float] = None) -> List[dict]:
        """
        Query <endpoint>/<route> and return the value stored under the key
        ``detail`` in the JSON response (an empty list when the key is absent).
        If the request fails for any reason, an empty list is returned.

        When ``skip_error_cache`` is False (the default), the call is short-circuited
        with an empty list if the endpoint failed within the last 300s (recorded in
        ``_available_error_cache``), and a failure of this call is recorded there.
        Pass ``skip_error_cache=True`` from health-check routes that must always probe;
        in that mode the error cache is neither read nor written.

        ``api_key``, when given, is sent as a Bearer token.
        ``timeout`` overrides the session default for this single request (seconds, total).
        """
        # Fast-fail if the endpoint is known to be down (unless caller opts out)
        if not skip_error_cache:
            async with _available_error_cache_lock:
                if endpoint in _available_error_cache:
                    if _is_fresh(_available_error_cache[endpoint], 300):
                        return []
        headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
        if api_key is not None:
            headers["Authorization"] = "Bearer " + api_key
        # Join endpoint and route with exactly one "/" between them
        request_url = f"{endpoint.rstrip('/')}/{route.lstrip('/')}"
        client: aiohttp.ClientSession = get_session(endpoint)
        # Only pass a timeout when the caller asked for one
        req_kwargs = {}
        if timeout is not None:
            req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
        try:
            async with client.get(request_url, headers=headers, **req_kwargs) as resp:
                await _ensure_success(resp)
                data = await resp.json()
            # `detail` is rebound from the key name to the value found under it
            detail = data.get(detail, [])
            return detail
        except Exception as e:
            # If anything goes wrong we cannot reply details
            message = _format_connection_issue(request_url, e)
            print(f"[fetch.endpoint_details] {message}")
            if not skip_error_cache:
                async with _available_error_cache_lock:
                    _available_error_cache[endpoint] = time.time()
            return []
 def ep2base(ep):
    """Return ``ep`` as an OpenAI-style base URL, appending "/v1" when the
    endpoint does not already contain it."""
    return ep if "/v1" in ep else ep + "/v1"
 def dedupe_on_keys(dicts, key_fields):
    """
    Deduplicate endpoint details: keep the first dict for every distinct
    combination of values under ``key_fields``, preserving input order.
    """
    first_by_signature = {}
    for entry in dicts:
        # Missing keys contribute None to the signature.
        signature = tuple(entry.get(field) for field in key_fields)
        first_by_signature.setdefault(signature, entry)
    return list(first_by_signature.values())
 async def increment_usage(endpoint: str, model: str) -> None:
    async with usage_lock:
@ -2910,80 +2349,7 @@ async def usage_proxy(request: Request):
    return {"usage_counts": usage_counts,
            "token_usage_counts": token_usage_counts}
-# -------------------------------------------------------------
+from backends.probe import _raw_probe, _endpoint_health
 # 20. Endpoint health probes (shared by /api/config and /health)
 # -------------------------------------------------------------
 async def _raw_probe(
    ep: str,
    route: str,
    api_key: Optional[str] = None,
    timeout: Optional[float] = None,
 ) -> tuple[bool, object]:
    """GET ``<ep>/<route>`` and report success separately from failure
    (`fetch.endpoint_details` returns [] in both cases).

    Returns `(True, parsed_json)` on success and
    `(False, formatted_error_message)` on any exception.
    ``api_key`` is sent as a Bearer token when given; ``timeout`` is the
    total request timeout in seconds when given.
    """
    url = f"{ep.rstrip('/')}/{route.lstrip('/')}"
    headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
    if api_key is not None:
        headers["Authorization"] = "Bearer " + api_key
    # Leave the session default in place unless a timeout was requested.
    req_kwargs = (
        {} if timeout is None else {"timeout": aiohttp.ClientTimeout(total=timeout)}
    )
    try:
        session: aiohttp.ClientSession = get_session(ep)
        async with session.get(url, headers=headers, **req_kwargs) as resp:
            await _ensure_success(resp)
            payload = await resp.json()
    except Exception as exc:
        return False, _format_connection_issue(url, exc)
    return True, payload
 async def _endpoint_health(ep: str, *, timeout: Optional[float] = None) -> dict:
    """Probe an endpoint and return `{status, version?, detail?}`.

    OpenAI-compatible endpoints get a single `/models` probe.

    Ollama endpoints get `/api/version` and `/api/ps` probed concurrently,
    so a daemon that is reachable but whose model-introspection path is
    broken (issue #83) is reported as `error` rather than `ok`.
    """
    if is_openai_compatible(ep):
        ok, payload = await _raw_probe(
            ep, "/models", config.api_keys.get(ep), timeout=timeout,
        )
        if not ok:
            return {"status": "error", "detail": str(payload)}
        return {"status": "ok", "version": "latest"}

    (version_ok, version_payload), (ps_ok, ps_payload) = await asyncio.gather(
        _raw_probe(ep, "/api/version", timeout=timeout),
        _raw_probe(ep, "/api/ps", timeout=timeout),
    )

    if version_ok:
        version_value = (
            version_payload.get("version")
            if isinstance(version_payload, dict)
            else None
        )
        if ps_ok:
            return {"status": "ok", "version": version_value}
        # Daemon is alive but /api/ps failed: report the error and keep
        # `version` so the operator can see the daemon itself responds.
        return {
            "status": "error",
            "version": version_value,
            "detail": f"/api/ps: {ps_payload}",
        }

    if ps_ok:
        # Only the version probe failed.
        return {
            "status": "error",
            "detail": f"/api/version: {version_payload}",
        }
    # Both probes failed: surface the version probe's error message.
    return {"status": "error", "detail": str(version_payload)}
 # -------------------------------------------------------------
--- a/state.py
+++ b/state.py
@ -69,6 +69,13 @@ app_state = {
    "httpx_clients": {},    # endpoint -> httpx.AsyncClient(UDS transport) for .sock endpoints
 }
 # Default outbound HTTP headers attached to every backend request.
 default_headers = {
    "HTTP-Referer": "https://nomyo.ai",
    "Referer": "https://nomyo.ai",
    "X-Title": "NOMYO Router",
 }
 # ------------------------------------------------------------------
 # Token Count Buffer (for write-behind pattern)
 # ------------------------------------------------------------------