feat: add llama-swap as a backend

2026-06-14 16:34:31 +02:00 · 2026-06-14 16:34:31 +02:00 · aa8baebac5
commit aa8baebac5
parent c8da58430a
17 changed files with 544 additions and 52 deletions
--- a/backends/control.py
+++ b/backends/control.py
@ -0,0 +1,50 @@
+"""Backend control operations (model unload).
+
+llama-server and llama-swap evict a resident model through different routes:
+  * llama-server → ``POST {base}/models/unload`` with body ``{"model": id}``
+  * llama-swap   → ``POST {base}/api/models/unload/{id}`` (path parameter)
+
+``unload_model`` dispatches on the configured backend type so callers don't
+have to know which one they are talking to. Both routes live at the endpoint
+root, so any ``/v1`` suffix is stripped first.
+"""
+from typing import Optional
+
+import aiohttp
+
+from config import get_config
+from state import default_headers
+from backends.sessions import get_probe_session
+from backends.normalize import is_llama_swap
+from backends.health import _format_connection_issue
+
+
+async def unload_model(endpoint: str, model_id: str) -> bool:
+    """Ask ``endpoint`` to unload ``model_id``.
+
+    The request is routed by backend type: llama-swap takes the model id as a
+    path parameter (``POST {base}/api/models/unload/{id}``); every other
+    backend gets ``POST {base}/models/unload`` with a ``{"model": id}`` JSON
+    body. ``{base}`` is ``endpoint`` without a trailing ``/`` or ``/v1``.
+
+    Args:
+        endpoint: Backend URL as configured; also the lookup key for the API
+            key and for the probe session.
+        model_id: The backend's native model identifier (the raw HF id for
+            llama-server / llama-swap), not the router-normalized display
+            name.
+
+    Returns:
+        True on a 2xx response. False for any other status, for an empty
+        ``model_id``, or when the request itself fails; failures are
+        printed, never raised.
+    """
+    # Function-scope import keeps this block self-contained.
+    from urllib.parse import quote
+
+    if not model_id:
+        # An empty id would collapse the llama-swap URL to the bare
+        # ``/api/models/unload/`` prefix, which no longer names one model.
+        print(f"[unload_model] empty model id for {endpoint}; nothing to unload")
+        return False
+
+    cfg = get_config()
+    base_url = endpoint.rstrip("/").removesuffix("/v1")
+    headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
+    api_key: Optional[str] = cfg.api_keys.get(endpoint)
+    if api_key is not None:
+        headers["Authorization"] = "Bearer " + api_key
+
+    if is_llama_swap(endpoint):
+        # The id becomes part of the URL path: escape characters such as
+        # "?", "#" or spaces that would otherwise be parsed as URL syntax.
+        # "/" and ":" stay literal so ids like "org/name:tag" are unchanged.
+        encoded_id = quote(model_id, safe="/:")
+        url = f"{base_url}/api/models/unload/{encoded_id}"
+        json_body = None
+    else:
+        url = f"{base_url}/models/unload"
+        json_body = {"model": model_id}
+
+    client: aiohttp.ClientSession = get_probe_session(endpoint)
+    try:
+        async with client.post(url, json=json_body, headers=headers) as resp:
+            # Strict 2xx check, as documented: a 3xx that reaches this point
+            # is not a confirmed unload.
+            ok = 200 <= resp.status < 300
+            print(f"[unload_model] {model_id} on {endpoint}: {resp.status}")
+            return ok
+    except Exception as e:
+        # Best-effort: report the failure and let the caller decide.
+        print(f"[unload_model] {_format_connection_issue(url, e)}")
+        return False