feat: add llama-swap as a backend
This commit is contained in:
parent
c8da58430a
commit
aa8baebac5
17 changed files with 544 additions and 52 deletions
|
|
@ -40,9 +40,12 @@ from backends.health import (
|
|||
from backends.normalize import (
|
||||
dedupe_on_keys,
|
||||
is_openai_compatible,
|
||||
is_llama_server,
|
||||
llama_endpoints,
|
||||
_normalize_llama_model_name,
|
||||
_extract_llama_quant,
|
||||
)
|
||||
from backends.control import unload_model
|
||||
from backends.probe import fetch
|
||||
from backends.sessions import _make_openai_client, get_ollama_client, get_probe_session
|
||||
from requests.chat import _make_moe_requests
|
||||
|
|
@ -372,7 +375,7 @@ async def chat_proxy(request: Request):
|
|||
if use_openai:
|
||||
start_ts = time.perf_counter()
|
||||
# Proactive trim: only for small-ctx models we've already seen run out of space
|
||||
_lookup_model = _normalize_llama_model_name(model) if endpoint in config.llama_server_endpoints else model
|
||||
_lookup_model = _normalize_llama_model_name(model) if is_llama_server(endpoint) else model
|
||||
_known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
|
||||
if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
|
||||
_pre_target = int((_known_nctx - _known_nctx // 4) / 1.2)
|
||||
|
|
@ -935,8 +938,8 @@ async def tags_proxy(request: Request):
|
|||
# 1. Query all endpoints for models
|
||||
tasks = [fetch.endpoint_details(ep, "/api/tags", "models", skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" not in ep]
|
||||
tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys[ep], skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" in ep]
|
||||
# Also query llama-server endpoints not already covered by config.endpoints
|
||||
llama_eps_for_tags = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
|
||||
# Also query llama-server / llama-swap endpoints not already covered by config.endpoints
|
||||
llama_eps_for_tags = [ep for ep in llama_endpoints(config) if ep not in config.endpoints]
|
||||
tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8) for ep in llama_eps_for_tags]
|
||||
all_models = await asyncio.gather(*tasks)
|
||||
|
||||
|
|
@ -960,27 +963,42 @@ async def tags_proxy(request: Request):
|
|||
)
|
||||
|
||||
|
||||
async def _fetch_llama_swap_running(endpoint: str) -> list[dict]:
    """Return the ready (`state == "ready"`) workers of a llama-swap endpoint.

    llama-swap omits the per-model `status` field on `/v1/models`, so running
    workers must be read from the `/running` route instead.

    Args:
        endpoint: Configured llama-swap endpoint URL. A trailing slash and a
            trailing `/v1` are stripped before `/running` is requested.

    Returns:
        The dict entries obtained from `fetch.endpoint_details` for the
        `running` key whose `state` is `"ready"`. An empty list is returned
        when the fetch yields nothing.
    """
    config = get_config()
    # /running is requested on the base URL, i.e. without the /v1 suffix.
    base_url = endpoint.rstrip("/").removesuffix("/v1")
    # The API key is looked up under the endpoint exactly as configured,
    # not under the stripped base URL.
    workers = await fetch.endpoint_details(
        base_url, "/running", "running", config.api_keys.get(endpoint),
        skip_error_cache=True, timeout=8,
    )
    # Enforce the documented contract here: drop non-dict entries and workers
    # in any state other than "ready", so every caller gets the same view.
    return [
        worker for worker in (workers or [])
        if isinstance(worker, dict) and worker.get("state") == "ready"
    ]
|
||||
|
||||
|
||||
@router.get("/api/ps")
|
||||
async def ps_proxy(request: Request):
|
||||
"""
|
||||
Proxy a ps request to all Ollama and llama-server endpoints and reply a unique list of all running models.
|
||||
Proxy a ps request to all Ollama, llama-server and llama-swap endpoints and reply a unique list of all running models.
|
||||
|
||||
For Ollama endpoints: queries /api/ps
|
||||
For llama-server endpoints: queries /v1/models with status.value == "loaded"
|
||||
For llama-swap endpoints: queries /running (state == "ready")
|
||||
"""
|
||||
config = get_config()
|
||||
# 1. Query Ollama endpoints for running models via /api/ps
|
||||
ollama_tasks = [fetch.endpoint_details(ep, "/api/ps", "models", skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" not in ep]
|
||||
# 2. Query llama-server endpoints for loaded models via /v1/models
|
||||
# Also query endpoints from llama_server_endpoints that may not be in config.endpoints
|
||||
all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
|
||||
llama_tasks = [
|
||||
fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
|
||||
for ep in all_llama_endpoints
|
||||
for ep in config.llama_server_endpoints
|
||||
]
|
||||
# 3. Query llama-swap endpoints for running workers via /running
|
||||
swap_tasks = [_fetch_llama_swap_running(ep) for ep in config.llama_swap_endpoints]
|
||||
|
||||
ollama_loaded = await asyncio.gather(*ollama_tasks) if ollama_tasks else []
|
||||
llama_loaded = await asyncio.gather(*llama_tasks) if llama_tasks else []
|
||||
swap_running = await asyncio.gather(*swap_tasks) if swap_tasks else []
|
||||
|
||||
models = {'models': []}
|
||||
# Add Ollama models (if any)
|
||||
|
|
@ -1003,6 +1021,21 @@ async def ps_proxy(request: Request):
|
|||
"status": item.get("status"),
|
||||
"details": {"quantization_level": quant} if quant else {}
|
||||
})
|
||||
# Add llama-swap running workers (only those with state == "ready")
|
||||
if swap_running:
|
||||
for runlist in swap_running:
|
||||
for item in runlist:
|
||||
if item.get("state") != "ready":
|
||||
continue
|
||||
raw_id = item.get("model", "")
|
||||
normalized = _normalize_llama_model_name(raw_id)
|
||||
quant = _extract_llama_quant(raw_id)
|
||||
models['models'].append({
|
||||
"name": normalized,
|
||||
"id": normalized,
|
||||
"digest": "",
|
||||
"details": {"quantization_level": quant} if quant else {}
|
||||
})
|
||||
|
||||
# 4. Return a JSONResponse with deduplicated currently deployed models
|
||||
# Deduplicate on 'name' rather than 'digest': llama-server models always
|
||||
|
|
@ -1101,16 +1134,7 @@ async def ps_details_proxy(request: Request):
|
|||
is_generation = "temperature" in dgs
|
||||
|
||||
if is_sleeping:
|
||||
unload_url = f"{base_url}/models/unload"
|
||||
try:
|
||||
async with client.post(
|
||||
unload_url,
|
||||
json={"model": model_id},
|
||||
headers=headers,
|
||||
) as unload_resp:
|
||||
print(f"[ps_details] Unloaded sleeping model {model_id} from {endpoint}: {unload_resp.status}")
|
||||
except Exception as ue:
|
||||
print(f"[ps_details] Failed to unload sleeping model {model_id} from {endpoint}: {ue}")
|
||||
await unload_model(endpoint, model_id)
|
||||
|
||||
return n_ctx, is_sleeping, is_generation
|
||||
except Exception as e:
|
||||
|
|
@ -1131,4 +1155,31 @@ async def ps_details_proxy(request: Request):
|
|||
if not is_sleeping:
|
||||
models.append(model_dict)
|
||||
|
||||
# Add llama-swap running workers (read from /running; no status/props/auto-unload —
|
||||
# llama-swap omits the status field on /v1/models and manages its own TTL eviction).
|
||||
if config.llama_swap_endpoints:
|
||||
swap_running = await asyncio.gather(
|
||||
*[_fetch_llama_swap_running(ep) for ep in config.llama_swap_endpoints]
|
||||
)
|
||||
for endpoint, runlist in zip(config.llama_swap_endpoints, swap_running):
|
||||
for item in runlist:
|
||||
if not isinstance(item, dict) or item.get("state") != "ready":
|
||||
continue
|
||||
raw_id = item.get("model", "")
|
||||
if not raw_id:
|
||||
continue
|
||||
normalized = _normalize_llama_model_name(raw_id)
|
||||
quant = _extract_llama_quant(raw_id)
|
||||
models.append({
|
||||
"name": normalized,
|
||||
"id": normalized,
|
||||
"original_name": raw_id,
|
||||
"digest": "",
|
||||
"details": {"quantization_level": quant} if quant else {},
|
||||
"endpoint": endpoint,
|
||||
"state": item.get("state"),
|
||||
"ttl": item.get("ttl"),
|
||||
"proxy": item.get("proxy"),
|
||||
})
|
||||
|
||||
return JSONResponse(content={"models": models}, status_code=200)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue