feat: add ctx-size for llama-swap models to dashboard

2026-06-15 19:09:55 +02:00 · 2026-06-15 19:09:55 +02:00 · cef71df3df
commit cef71df3df
parent aa8baebac5
2 changed files with 86 additions and 2 deletions
--- a/api/ollama.py
+++ b/api/ollama.py
@ -13,6 +13,7 @@ import asyncio
 import re
 import time
 from typing import Optional
+from urllib.parse import quote

 import aiohttp
 import ollama
@ -976,6 +977,43 @@ async def _fetch_llama_swap_running(endpoint: str) -> list[dict]:
    )


+# Match the context size in a llama-swap worker's `cmd` string, e.g.
+# "llama-server --port 5818 -hf ... --ctx-size 131072 ...". llama.cpp accepts
+# both --ctx-size and the short -c alias.
+_CTX_SIZE_CMD_RE = re.compile(r"(?:--ctx-size|-c)[=\s]+(\d+)")
+
+
+def _ctx_size_from_cmd(cmd: str) -> int | None:
+    """Extract n_ctx from a llama-swap worker `cmd` string, or None if absent."""
+    if not cmd:
+        return None
+    m = _CTX_SIZE_CMD_RE.search(cmd)
+    return int(m.group(1)) if m else None
+
+
+async def _fetch_llama_swap_nctx(endpoint: str, model_id: str) -> int | None:
+    """Fallback when a worker's `cmd` lacks --ctx-size: ask the underlying
+    llama-server via llama-swap's /upstream/<model>/props route (plain /props?model=
+    is not routed by llama-swap and 404s). Returns n_ctx or None on any failure.
+    """
+    config = get_config()
+    base_url = endpoint.rstrip("/").removesuffix("/v1")
+    props_url = f"{base_url}/upstream/{quote(model_id, safe='')}/props"
+    headers = None
+    api_key = config.api_keys.get(endpoint)
+    if api_key:
+        headers = {"Authorization": f"Bearer {api_key}"}
+    try:
+        client: aiohttp.ClientSession = get_probe_session(endpoint)
+        async with client.get(props_url, headers=headers, timeout=aiohttp.ClientTimeout(total=5)) as resp:
+            if resp.status == 200:
+                data = await resp.json()
+                return data.get("default_generation_settings", {}).get("n_ctx")
+    except Exception as e:
+        print(f"[ps_details] Failed to fetch props from {props_url}: {e}")
+    return None
+
+
@router.get("/api/ps")
 async def ps_proxy(request: Request):
    """
@ -1161,6 +1199,7 @@ async def ps_details_proxy(request: Request):
        swap_running = await asyncio.gather(
            *[_fetch_llama_swap_running(ep) for ep in config.llama_swap_endpoints]
        )
+        swap_nctx_fallbacks: list[tuple[str, str, dict]] = []
        for endpoint, runlist in zip(config.llama_swap_endpoints, swap_running):
            for item in runlist:
                if not isinstance(item, dict) or item.get("state") != "ready":
@ -1170,7 +1209,7 @@ async def ps_details_proxy(request: Request):
                    continue
                normalized = _normalize_llama_model_name(raw_id)
                quant = _extract_llama_quant(raw_id)
-                models.append({
+                swap_model = {
                    "name": normalized,
                    "id": normalized,
                    "original_name": raw_id,
@ -1180,6 +1219,29 @@ async def ps_details_proxy(request: Request):
                    "state": item.get("state"),
                    "ttl": item.get("ttl"),
                    "proxy": item.get("proxy"),
-                })
+                }
+                # llama-swap omits n_ctx from /running, but the worker's launch
+                # command carries --ctx-size, so parse it from there (no extra
+                # request). Workers whose cmd lacks the flag fall back to an
+                # /upstream/<model>/props probe below.
+                n_ctx = _ctx_size_from_cmd(item.get("cmd", ""))
+                if n_ctx is not None:
+                    swap_model["context_length"] = n_ctx
+                    if 0 < n_ctx <= _CTX_TRIM_SMALL_LIMIT:
+                        _endpoint_nctx[(endpoint, normalized)] = n_ctx
+                else:
+                    swap_nctx_fallbacks.append((endpoint, raw_id, swap_model))
+                models.append(swap_model)
+
+        # Resolve ctx for workers whose cmd lacked --ctx-size via /upstream props.
+        if swap_nctx_fallbacks:
+            fallback_results = await asyncio.gather(
+                *[_fetch_llama_swap_nctx(ep, rid) for ep, rid, _ in swap_nctx_fallbacks]
+            )
+            for (ep, _rid, swap_model), n_ctx in zip(swap_nctx_fallbacks, fallback_results):
+                if n_ctx is not None:
+                    swap_model["context_length"] = n_ctx
+                    if 0 < n_ctx <= _CTX_TRIM_SMALL_LIMIT:
+                        _endpoint_nctx[(ep, swap_model["id"])] = n_ctx

    return JSONResponse(content={"models": models}, status_code=200)