feat: add ctx-size for llama-swap models to dashboard
This commit is contained in:
parent
aa8baebac5
commit
cef71df3df
2 changed files with 86 additions and 2 deletions
|
|
@ -13,6 +13,7 @@ import asyncio
|
|||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
import aiohttp
|
||||
import ollama
|
||||
|
|
@ -976,6 +977,43 @@ async def _fetch_llama_swap_running(endpoint: str) -> list[dict]:
|
|||
)
|
||||
|
||||
|
||||
# Match the context size in a llama-swap worker's `cmd` string, e.g.
|
||||
# "llama-server --port 5818 -hf ... --ctx-size 131072 ...". llama.cpp accepts
|
||||
# both --ctx-size and the short -c alias.
|
||||
_CTX_SIZE_CMD_RE = re.compile(r"(?:--ctx-size|-c)[=\s]+(\d+)")
|
||||
|
||||
|
||||
def _ctx_size_from_cmd(cmd: str) -> int | None:
|
||||
"""Extract n_ctx from a llama-swap worker `cmd` string, or None if absent."""
|
||||
if not cmd:
|
||||
return None
|
||||
m = _CTX_SIZE_CMD_RE.search(cmd)
|
||||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
async def _fetch_llama_swap_nctx(endpoint: str, model_id: str) -> int | None:
    """Probe a llama-swap worker's context size via its upstream /props route.

    Fallback when a worker's `cmd` lacks --ctx-size: ask the underlying
    llama-server via llama-swap's /upstream/<model>/props route (plain
    /props?model= is not routed by llama-swap and 404s).

    Args:
        endpoint: Configured llama-swap endpoint URL; a trailing "/" and a
            trailing "/v1" are stripped before the probe URL is built.
        model_id: Model id as reported by llama-swap; percent-encoded in full
            (including "/") so it occupies a single path segment.

    Returns:
        The integer `default_generation_settings.n_ctx` from the response, or
        None on any failure: non-200 status, request or decode error, or a
        payload where that field is missing or not an integer.
    """
    config = get_config()
    base_url = endpoint.rstrip("/").removesuffix("/v1")
    props_url = f"{base_url}/upstream/{quote(model_id, safe='')}/props"
    api_key = config.api_keys.get(endpoint)
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else None
    try:
        client: aiohttp.ClientSession = get_probe_session(endpoint)
        async with client.get(props_url, headers=headers, timeout=aiohttp.ClientTimeout(total=5)) as resp:
            if resp.status != 200:
                return None
            data = await resp.json()
    except Exception as e:
        # Deliberately broad: this is a best-effort probe and must never
        # break the caller, whatever the transport or decoder raises.
        print(f"[ps_details] Failed to fetch props from {props_url}: {e}")
        return None

    # Validate the payload instead of trusting it: callers compare the result
    # numerically, so a string/null/odd-shaped value must become None here.
    settings = data.get("default_generation_settings") if isinstance(data, dict) else None
    n_ctx = settings.get("n_ctx") if isinstance(settings, dict) else None
    if isinstance(n_ctx, bool) or not isinstance(n_ctx, int):
        return None
    return n_ctx
|
||||
|
||||
|
||||
@router.get("/api/ps")
|
||||
async def ps_proxy(request: Request):
|
||||
"""
|
||||
|
|
@ -1161,6 +1199,7 @@ async def ps_details_proxy(request: Request):
|
|||
swap_running = await asyncio.gather(
|
||||
*[_fetch_llama_swap_running(ep) for ep in config.llama_swap_endpoints]
|
||||
)
|
||||
swap_nctx_fallbacks: list[tuple[str, str, dict]] = []
|
||||
for endpoint, runlist in zip(config.llama_swap_endpoints, swap_running):
|
||||
for item in runlist:
|
||||
if not isinstance(item, dict) or item.get("state") != "ready":
|
||||
|
|
@ -1170,7 +1209,7 @@ async def ps_details_proxy(request: Request):
|
|||
continue
|
||||
normalized = _normalize_llama_model_name(raw_id)
|
||||
quant = _extract_llama_quant(raw_id)
|
||||
models.append({
|
||||
swap_model = {
|
||||
"name": normalized,
|
||||
"id": normalized,
|
||||
"original_name": raw_id,
|
||||
|
|
@ -1180,6 +1219,29 @@ async def ps_details_proxy(request: Request):
|
|||
"state": item.get("state"),
|
||||
"ttl": item.get("ttl"),
|
||||
"proxy": item.get("proxy"),
|
||||
})
|
||||
}
|
||||
# llama-swap omits n_ctx from /running, but the worker's launch
|
||||
# command carries --ctx-size, so parse it from there (no extra
|
||||
# request). Workers whose cmd lacks the flag fall back to an
|
||||
# /upstream/<model>/props probe below.
|
||||
n_ctx = _ctx_size_from_cmd(item.get("cmd", ""))
|
||||
if n_ctx is not None:
|
||||
swap_model["context_length"] = n_ctx
|
||||
if 0 < n_ctx <= _CTX_TRIM_SMALL_LIMIT:
|
||||
_endpoint_nctx[(endpoint, normalized)] = n_ctx
|
||||
else:
|
||||
swap_nctx_fallbacks.append((endpoint, raw_id, swap_model))
|
||||
models.append(swap_model)
|
||||
|
||||
# Resolve ctx for workers whose cmd lacked --ctx-size via /upstream props.
|
||||
if swap_nctx_fallbacks:
|
||||
fallback_results = await asyncio.gather(
|
||||
*[_fetch_llama_swap_nctx(ep, rid) for ep, rid, _ in swap_nctx_fallbacks]
|
||||
)
|
||||
for (ep, _rid, swap_model), n_ctx in zip(swap_nctx_fallbacks, fallback_results):
|
||||
if n_ctx is not None:
|
||||
swap_model["context_length"] = n_ctx
|
||||
if 0 < n_ctx <= _CTX_TRIM_SMALL_LIMIT:
|
||||
_endpoint_nctx[(ep, swap_model["id"])] = n_ctx
|
||||
|
||||
return JSONResponse(content={"models": models}, status_code=200)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue