feat: add llama-swap as a backend
All checks were successful
PR Tests / test (pull_request) Successful in 1m18s
NYX Security Scan / nyx-scan (pull_request) Successful in 6m19s

This commit is contained in:
Alpha Nerd 2026-06-14 16:34:31 +02:00
parent c8da58430a
commit aa8baebac5
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
17 changed files with 544 additions and 52 deletions

View file

@ -20,10 +20,11 @@ MOCK_OLLAMA_EP = "http://mock-ollama:11434"
MOCK_LLAMA_EP = "http://mock-llama:8080/v1"
def _make_cfg(ollama_eps=None, llama_eps=None, api_keys=None):
def _make_cfg(ollama_eps=None, llama_eps=None, swap_eps=None, api_keys=None):
cfg = MagicMock()
cfg.endpoints = ollama_eps or [MOCK_OLLAMA_EP]
cfg.llama_server_endpoints = llama_eps or [MOCK_LLAMA_EP]
cfg.llama_swap_endpoints = swap_eps or []
cfg.api_keys = api_keys or {}
cfg.max_concurrent_connections = 2
cfg.router_api_key = None
@ -228,6 +229,30 @@ class TestFetchLoadedModels:
models = await router.fetch.loaded_models(MOCK_LLAMA_EP)
assert "always-on-model" in models
async def test_llama_swap_reads_running_state_ready(self):
    """Only workers that /running reports as "ready" count as loaded.

    llama-swap omits the status field from /v1/models, so the loaded set is
    taken from /running instead. That is a root route, which is why the
    mocked URL below carries no /v1 suffix even though the endpoint does.
    """
    endpoint = "http://mock-swap:8080/v1"
    # One worker that is ready and one that is still starting up; only the
    # first is expected in the result.
    running_payload = {"running": [
        {"model": "org/ready-model:Q4_K_M", "state": "ready"},
        {"model": "org/starting-model:Q8_0", "state": "starting"},
    ]}
    swap_only_cfg = _make_cfg(llama_eps=[], swap_eps=[endpoint])
    with patch.object(router, "config", swap_only_cfg), mock_probe() as probe:
        probe.add_get("http://mock-swap:8080/running", payload=running_payload)
        loaded = await router.fetch.loaded_models(endpoint)
        assert loaded == {"org/ready-model:Q4_K_M"}
async def test_llama_swap_records_error_on_failure(self):
    """After a failed /running probe the endpoint appears in the error cache."""
    endpoint = "http://mock-swap:8080/v1"
    swap_only_cfg = _make_cfg(llama_eps=[], swap_eps=[endpoint])
    with patch.object(router, "config", swap_only_cfg):
        with mock_probe() as probe:
            # The root-level /running route answers with a 502.
            probe.add_get(
                "http://mock-swap:8080/running",
                status=502,
                payload={},
            )
            await router.fetch.loaded_models(endpoint)
            assert endpoint in router._loaded_error_cache
async def test_returns_empty_on_error(self):
cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
with patch.object(router, "config", cfg), mock_probe() as m: