feat: add llama-swap as a backend
All checks were successful
PR Tests / test (pull_request) Successful in 1m18s
NYX Security Scan / nyx-scan (pull_request) Successful in 6m19s

This commit is contained in:
Alpha Nerd 2026-06-14 16:34:31 +02:00
parent c8da58430a
commit aa8baebac5
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
17 changed files with 544 additions and 52 deletions

View file

@ -27,7 +27,7 @@ from state import (
_affinity_lock,
)
from sse import subscribe, unsubscribe
from backends.normalize import _normalize_llama_model_name
from backends.normalize import _normalize_llama_model_name, is_llama_server, llama_endpoints
from backends.probe import _endpoint_health
@ -127,7 +127,6 @@ async def affinity_stats(request: Request):
now = time.monotonic()
entries: list[dict] = []
llama_eps = set(config.llama_server_endpoints)
async with _affinity_lock:
for fp, (ep, mdl, expires_at) in list(_affinity_map.items()):
remaining = expires_at - now
@ -136,7 +135,7 @@ async def affinity_stats(request: Request):
continue
# Mirror the normalisation used by /api/ps_details so the dashboard
# can join affinity entries to PS rows by (endpoint, model).
display_model = _normalize_llama_model_name(mdl) if ep in llama_eps else mdl
display_model = _normalize_llama_model_name(mdl) if is_llama_server(ep) else mdl
entries.append({
"endpoint": ep,
"model": display_model,
@ -175,9 +174,12 @@ async def config_proxy(request: Request):
ollama_results = await asyncio.gather(*[check(ep) for ep in config.endpoints])
llama_results = []
if config.llama_server_endpoints:
# llama-server and llama-swap render identically in the dashboard ("llama" rows),
# so health-check both and merge them into one list.
llama_eps = llama_endpoints(config)
if llama_eps:
llama_results = await asyncio.gather(
*[check(ep) for ep in config.llama_server_endpoints]
*[check(ep) for ep in llama_eps]
)
return {
@ -227,7 +229,7 @@ async def health_proxy(request: Request):
# purposes. Probing /api/version alone would miss the case where the
# Ollama process is up but /api/ps is failing — see issue #83.
all_endpoints = list(config.endpoints)
llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
llama_eps_extra = [ep for ep in llama_endpoints(config) if ep not in config.endpoints]
all_endpoints += llama_eps_extra
probe_results = await asyncio.gather(

View file

@ -40,9 +40,12 @@ from backends.health import (
from backends.normalize import (
dedupe_on_keys,
is_openai_compatible,
is_llama_server,
llama_endpoints,
_normalize_llama_model_name,
_extract_llama_quant,
)
from backends.control import unload_model
from backends.probe import fetch
from backends.sessions import _make_openai_client, get_ollama_client, get_probe_session
from requests.chat import _make_moe_requests
@ -372,7 +375,7 @@ async def chat_proxy(request: Request):
if use_openai:
start_ts = time.perf_counter()
# Proactive trim: only for small-ctx models we've already seen run out of space
_lookup_model = _normalize_llama_model_name(model) if endpoint in config.llama_server_endpoints else model
_lookup_model = _normalize_llama_model_name(model) if is_llama_server(endpoint) else model
_known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
_pre_target = int((_known_nctx - _known_nctx // 4) / 1.2)
@ -935,8 +938,8 @@ async def tags_proxy(request: Request):
# 1. Query all endpoints for models
tasks = [fetch.endpoint_details(ep, "/api/tags", "models", skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" not in ep]
tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys[ep], skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" in ep]
# Also query llama-server endpoints not already covered by config.endpoints
llama_eps_for_tags = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
# Also query llama-server / llama-swap endpoints not already covered by config.endpoints
llama_eps_for_tags = [ep for ep in llama_endpoints(config) if ep not in config.endpoints]
tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8) for ep in llama_eps_for_tags]
all_models = await asyncio.gather(*tasks)
@ -960,27 +963,42 @@ async def tags_proxy(request: Request):
)
async def _fetch_llama_swap_running(endpoint: str) -> list[dict]:
"""Return the list of ready (`state == "ready"`) workers from a llama-swap
endpoint's `/running` route. llama-swap omits the per-model `status` field on
`/v1/models`, so running workers must be read here instead.
"""
config = get_config()
base_url = endpoint.rstrip("/").removesuffix("/v1")
return await fetch.endpoint_details(
base_url, "/running", "running", config.api_keys.get(endpoint),
skip_error_cache=True, timeout=8,
)
@router.get("/api/ps")
async def ps_proxy(request: Request):
"""
Proxy a ps request to all Ollama and llama-server endpoints and reply a unique list of all running models.
Proxy a ps request to all Ollama, llama-server and llama-swap endpoints and reply a unique list of all running models.
For Ollama endpoints: queries /api/ps
For llama-server endpoints: queries /v1/models with status.value == "loaded"
For llama-swap endpoints: queries /running (state == "ready")
"""
config = get_config()
# 1. Query Ollama endpoints for running models via /api/ps
ollama_tasks = [fetch.endpoint_details(ep, "/api/ps", "models", skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" not in ep]
# 2. Query llama-server endpoints for loaded models via /v1/models
# Also query endpoints from llama_server_endpoints that may not be in config.endpoints
all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
llama_tasks = [
fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
for ep in all_llama_endpoints
for ep in config.llama_server_endpoints
]
# 3. Query llama-swap endpoints for running workers via /running
swap_tasks = [_fetch_llama_swap_running(ep) for ep in config.llama_swap_endpoints]
ollama_loaded = await asyncio.gather(*ollama_tasks) if ollama_tasks else []
llama_loaded = await asyncio.gather(*llama_tasks) if llama_tasks else []
swap_running = await asyncio.gather(*swap_tasks) if swap_tasks else []
models = {'models': []}
# Add Ollama models (if any)
@ -1003,6 +1021,21 @@ async def ps_proxy(request: Request):
"status": item.get("status"),
"details": {"quantization_level": quant} if quant else {}
})
# Add llama-swap running workers (already filtered on state == "ready")
if swap_running:
for runlist in swap_running:
for item in runlist:
if item.get("state") != "ready":
continue
raw_id = item.get("model", "")
normalized = _normalize_llama_model_name(raw_id)
quant = _extract_llama_quant(raw_id)
models['models'].append({
"name": normalized,
"id": normalized,
"digest": "",
"details": {"quantization_level": quant} if quant else {}
})
# 3. Return a JSONResponse with deduplicated currently deployed models
# Deduplicate on 'name' rather than 'digest': llama-server models always
@ -1101,16 +1134,7 @@ async def ps_details_proxy(request: Request):
is_generation = "temperature" in dgs
if is_sleeping:
unload_url = f"{base_url}/models/unload"
try:
async with client.post(
unload_url,
json={"model": model_id},
headers=headers,
) as unload_resp:
print(f"[ps_details] Unloaded sleeping model {model_id} from {endpoint}: {unload_resp.status}")
except Exception as ue:
print(f"[ps_details] Failed to unload sleeping model {model_id} from {endpoint}: {ue}")
await unload_model(endpoint, model_id)
return n_ctx, is_sleeping, is_generation
except Exception as e:
@ -1131,4 +1155,31 @@ async def ps_details_proxy(request: Request):
if not is_sleeping:
models.append(model_dict)
# Add llama-swap running workers (read from /running; no status/props/auto-unload —
# llama-swap omits the status field on /v1/models and manages its own TTL eviction).
if config.llama_swap_endpoints:
swap_running = await asyncio.gather(
*[_fetch_llama_swap_running(ep) for ep in config.llama_swap_endpoints]
)
for endpoint, runlist in zip(config.llama_swap_endpoints, swap_running):
for item in runlist:
if not isinstance(item, dict) or item.get("state") != "ready":
continue
raw_id = item.get("model", "")
if not raw_id:
continue
normalized = _normalize_llama_model_name(raw_id)
quant = _extract_llama_quant(raw_id)
models.append({
"name": normalized,
"id": normalized,
"original_name": raw_id,
"digest": "",
"details": {"quantization_level": quant} if quant else {},
"endpoint": endpoint,
"state": item.get("state"),
"ttl": item.get("ttl"),
"proxy": item.get("proxy"),
})
return JSONResponse(content={"models": models}, status_code=200)

View file

@ -34,6 +34,8 @@ from backends.normalize import (
ep2base,
is_ext_openai_endpoint,
is_openai_compatible,
is_llama_server,
llama_endpoints,
_normalize_llama_model_name,
)
from backends.probe import fetch
@ -353,7 +355,7 @@ async def openai_chat_completions_proxy(request: Request):
resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
send_params = {**params, "messages": resolved_msgs}
# Proactive trim: only for small-ctx models we've already seen run out of space
_lookup_model = _normalize_llama_model_name(model) if endpoint in config.llama_server_endpoints else model
_lookup_model = _normalize_llama_model_name(model) if is_llama_server(endpoint) else model
_known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
_pre_target = int(((_known_nctx - _known_nctx // 4)) / 1.2)
@ -658,9 +660,9 @@ async def openai_models_proxy(request: Request):
ollama_tasks = [fetch.endpoint_details(ep, "/api/tags", "models", skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" not in ep]
# 2. Query external OpenAI endpoints (Groq, OpenAI, etc.) via /models
ext_openai_tasks = [fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8) for ep in config.endpoints if is_ext_openai_endpoint(ep)]
# 3. Query llama-server endpoints for loaded models via /v1/models
# Also query endpoints from llama_server_endpoints that may not be in config.endpoints
all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
# 3. Query llama-server / llama-swap endpoints for advertised models via /v1/models
# Also query endpoints that may not be in config.endpoints
all_llama_endpoints = llama_endpoints(config)
llama_tasks = [
fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
for ep in all_llama_endpoints
@ -783,10 +785,10 @@ async def rerank_proxy(request: Request):
upstream_payload[optional_key] = payload[optional_key]
# Determine upstream URL:
# llama-server exposes /v1/rerank (base already contains /v1 for llama_server_endpoints)
# llama-server / llama-swap expose /v1/rerank (base already contains /v1)
# External OpenAI endpoints expose /rerank under their /v1 base
if endpoint in config.llama_server_endpoints:
# llama-server: endpoint may or may not already contain /v1
if is_llama_server(endpoint):
# llama-server / llama-swap: endpoint may or may not already contain /v1
if "/v1" in endpoint:
rerank_url = f"{endpoint}/rerank"
else:
@ -823,3 +825,82 @@ async def rerank_proxy(request: Request):
return JSONResponse(content=data)
finally:
await decrement_usage(endpoint, tracking_model)
async def _resolve_llama_swap_endpoint(model_id: str) -> str | None:
"""Pick the llama-swap endpoint that serves ``model_id``.
Prefers an endpoint that already has the worker running; falls back to any
that advertises the model. Returns None if none do.
"""
config = get_config()
swap_eps = config.llama_swap_endpoints
if not swap_eps:
return None
advertised = await asyncio.gather(
*[fetch.available_models(ep, config.api_keys.get(ep)) for ep in swap_eps]
)
candidates = [ep for ep, models in zip(swap_eps, advertised) if model_id in models]
if not candidates:
return None
if len(candidates) == 1:
return candidates[0]
loaded = await asyncio.gather(*[fetch.loaded_models(ep) for ep in candidates])
for ep, lm in zip(candidates, loaded):
if model_id in lm:
return ep
return candidates[0]
@router.api_route("/upstream/{model_id}/{path:path}", methods=["GET", "POST"])
async def llama_swap_upstream(model_id: str, path: str, request: Request):
"""Bypass llama-swap and reach a model's underlying llama-server worker directly
via llama-swap's ``/upstream/:model_id`` route.
Lets clients use llama-server features that llama-swap itself does not forward
(e.g. token-array prompts), while still letting the router pick the backend that
actually hosts the model. ``/upstream`` is a root route, so the ``/v1`` suffix is
stripped from the configured endpoint.
"""
config = get_config()
endpoint = await _resolve_llama_swap_endpoint(model_id)
if endpoint is None:
raise HTTPException(
status_code=404,
detail=f"No configured llama-swap endpoint serves model '{model_id}'.",
)
base_url = endpoint.rstrip("/").removesuffix("/v1")
url = f"{base_url}/upstream/{model_id}/{path}"
if request.url.query:
url = f"{url}?{request.url.query}"
headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
content_type = request.headers.get("content-type")
if content_type:
headers["Content-Type"] = content_type
api_key = config.api_keys.get(endpoint)
if api_key is not None:
headers["Authorization"] = "Bearer " + api_key
body = await request.body()
client: aiohttp.ClientSession = get_session(endpoint)
try:
resp = await client.request(request.method, url, data=body or None, headers=headers)
except Exception as e:
raise HTTPException(status_code=502, detail=f"Upstream request to {url} failed: {e}")
async def _iter():
try:
async for chunk in resp.content.iter_any():
yield chunk
finally:
resp.release()
return StreamingResponse(
_iter(),
status_code=resp.status,
media_type=resp.headers.get("Content-Type"),
)