feat: add llama-swap as a backend

2026-06-14 16:34:31 +02:00 · 2026-06-14 16:34:31 +02:00 · aa8baebac5
commit aa8baebac5
parent c8da58430a
17 changed files with 544 additions and 52 deletions
--- a/api/management.py
+++ b/api/management.py
@@ -27,7 +27,7 @@ from state import (
    _affinity_lock,
 )
 from sse import subscribe, unsubscribe
-from backends.normalize import _normalize_llama_model_name
+from backends.normalize import _normalize_llama_model_name, is_llama_server, llama_endpoints
 from backends.probe import _endpoint_health


@@ -127,7 +127,6 @@ async def affinity_stats(request: Request):

    now = time.monotonic()
    entries: list[dict] = []
-    llama_eps = set(config.llama_server_endpoints)
    async with _affinity_lock:
        for fp, (ep, mdl, expires_at) in list(_affinity_map.items()):
            remaining = expires_at - now
@@ -136,7 +135,7 @@ async def affinity_stats(request: Request):
                continue
            # Mirror the normalisation used by /api/ps_details so the dashboard
            # can join affinity entries to PS rows by (endpoint, model).
-            display_model = _normalize_llama_model_name(mdl) if ep in llama_eps else mdl
+            display_model = _normalize_llama_model_name(mdl) if is_llama_server(ep) else mdl
            entries.append({
                "endpoint": ep,
                "model": display_model,
@@ -175,9 +174,12 @@ async def config_proxy(request: Request):

    ollama_results = await asyncio.gather(*[check(ep) for ep in config.endpoints])
    llama_results = []
-    if config.llama_server_endpoints:
+    # llama-server and llama-swap render identically in the dashboard ("llama" rows),
+    # so health-check both and merge them into one list.
+    llama_eps = llama_endpoints(config)
+    if llama_eps:
        llama_results = await asyncio.gather(
-            *[check(ep) for ep in config.llama_server_endpoints]
+            *[check(ep) for ep in llama_eps]
        )

    return {
@@ -227,7 +229,7 @@ async def health_proxy(request: Request):
    # purposes. Probing /api/version alone would miss the case where the
    # Ollama process is up but /api/ps is failing — see issue #83.
    all_endpoints = list(config.endpoints)
-    llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
+    llama_eps_extra = [ep for ep in llama_endpoints(config) if ep not in config.endpoints]
    all_endpoints += llama_eps_extra

    probe_results = await asyncio.gather(