From e296ac19badeeb2258de42375d66325a2cdfa803 Mon Sep 17 00:00:00 2001
From: alpha nerd
Date: Thu, 7 May 2026 11:34:09 +0200
Subject: [PATCH] feat: new helper to bridge llama.cpp's change of behaviour
 in /v1/models status - now correctly reporting "sleeping" or "loaded" for
 auto-unload

---
 router.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/router.py b/router.py
index 326ec33..7cf3ada 100644
--- a/router.py
+++ b/router.py
@@ -572,6 +572,19 @@ def _is_llama_model_loaded(item: dict) -> bool:
         return status == "loaded"
     return False
 
+def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
+    """Return True if status is 'loaded' or 'sleeping'.
+    Newer llama-server versions report 'sleeping' in /v1/models when a model is idle;
+    ps_details needs to include these so _fetch_llama_props can detect and unload them."""
+    status = item.get("status")
+    if status is None:
+        return True
+    if isinstance(status, dict):
+        return status.get("value") in ("loaded", "sleeping")
+    if isinstance(status, str):
+        return status in ("loaded", "sleeping")
+    return False
+
 def is_ext_openai_endpoint(endpoint: str) -> bool:
     """
     Determine if an endpoint is an external OpenAI-compatible endpoint (not Ollama or llama-server).
@@ -2908,8 +2921,8 @@ async def ps_details_proxy(request: Request):
     llama_models_pending: list[dict] = []
 
     for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
-        # Filter for loaded models only
-        loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]
+        # Include sleeping models too so _fetch_llama_props can unload them
+        loaded_models = [item for item in modellist if _is_llama_model_loaded_or_sleeping(item)]
         for item in loaded_models:
             if isinstance(item, dict) and item.get("id"):
                 raw_id = item["id"]
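
Note for reviewers: a minimal sketch of how the new helper is expected to
treat the status shapes described in its docstring. The payloads below are
invented for illustration and are not captured llama-server /v1/models
output.

    # Hypothetical /v1/models entries, for illustration only.
    assert _is_llama_model_loaded_or_sleeping({"id": "m", "status": "loaded"})
    assert _is_llama_model_loaded_or_sleeping({"id": "m", "status": "sleeping"})
    assert _is_llama_model_loaded_or_sleeping({"id": "m", "status": {"value": "sleeping"}})
    assert _is_llama_model_loaded_or_sleeping({"id": "m"})  # no status field: treated as loaded (older servers)
    assert not _is_llama_model_loaded_or_sleeping({"id": "m", "status": "unloading"})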