"""Backend control operations (model unload).

llama-server and llama-swap evict a resident model through different routes:

  * llama-server → ``POST {base}/models/unload`` with body ``{"model": id}``
  * llama-swap   → ``POST {base}/api/models/unload/{id}`` (path parameter)

``unload_model`` dispatches on the configured backend type so callers don't
have to know which one they are talking to. Both routes live at the endpoint
root, so any ``/v1`` suffix is stripped first.
"""

from typing import Optional
from urllib.parse import quote

import aiohttp

from config import get_config
from state import default_headers
from backends.sessions import get_probe_session
from backends.normalize import is_llama_swap
from backends.health import _format_connection_issue


async def unload_model(endpoint: str, model_id: str) -> bool:
    """Ask ``endpoint`` to unload ``model_id``.

    ``model_id`` must be the backend's native model identifier (the raw HF id
    for llama-server / llama-swap), not the router-normalized display name.

    Args:
        endpoint: Backend URL as configured; it is also the key used to look
            up the API key and the probe session. A trailing ``/`` and a
            ``/v1`` suffix are stripped before the unload route is appended.
        model_id: Native model identifier to evict. An empty value is rejected
            without contacting the backend.

    Returns:
        True when the backend answers with a 2xx status. False for any other
        status, for an empty ``model_id``, or when the request raises (the
        failure is printed, not propagated).
    """
    if not model_id:
        # With no id the llama-swap URL would end in ".../unload/" and carry
        # no model component at all, so don't send anything.
        print(f"[unload_model] empty model id for {endpoint}; nothing to unload")
        return False

    cfg = get_config()
    base_url = endpoint.rstrip("/").removesuffix("/v1")

    headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
    api_key: Optional[str] = cfg.api_keys.get(endpoint)
    if api_key is not None:
        headers["Authorization"] = "Bearer " + api_key

    if is_llama_swap(endpoint):
        # The id travels in the URL path here, so escape anything that would
        # otherwise be parsed as URL syntax ("?", "#", "%", ...). "/" and ":"
        # stay literal so ordinary "org/name:quant" ids are sent unchanged.
        encoded_id = quote(model_id, safe="/:")
        url = f"{base_url}/api/models/unload/{encoded_id}"
        json_body = None
    else:
        # The id travels in the JSON body; no escaping needed.
        url = f"{base_url}/models/unload"
        json_body = {"model": model_id}

    client: aiohttp.ClientSession = get_probe_session(endpoint)
    try:
        async with client.post(url, json=json_body, headers=headers) as resp:
            # Only 2xx counts as success, matching the documented contract;
            # a 1xx/3xx final status does not confirm the model was evicted.
            ok = 200 <= resp.status < 300
            print(f"[unload_model] {model_id} on {endpoint}: {resp.status}")
            return ok
    except Exception as e:
        # Best-effort operation: report the failure and signal it via the
        # return value instead of raising into the caller.
        print(f"[unload_model] {_format_connection_issue(url, e)}")
        return False