From e296ac19badeeb2258de42375d66325a2cdfa803 Mon Sep 17 00:00:00 2001
From: alpha nerd
Date: Thu, 7 May 2026 11:34:09 +0200
Subject: [PATCH] feat: new helper to bridge llama.cpp's change of behaviour
 in /v1/models status - now correctly reporting "sleeping" or "loaded" for
 auto-unload

---
 router.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/router.py b/router.py
index 326ec33..7cf3ada 100644
--- a/router.py
+++ b/router.py
@@ -572,6 +572,19 @@ def _is_llama_model_loaded(item: dict) -> bool:
         return status == "loaded"
     return False
 
+def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
+    """Return True if status is 'loaded' or 'sleeping'.
+    Newer llama-server versions report 'sleeping' in /v1/models when a model is idle;
+    ps_details needs to include these so _fetch_llama_props can detect and unload them."""
+    status = item.get("status")
+    if status is None:
+        return True
+    if isinstance(status, dict):
+        return status.get("value") in ("loaded", "sleeping")
+    if isinstance(status, str):
+        return status in ("loaded", "sleeping")
+    return False
+
 def is_ext_openai_endpoint(endpoint: str) -> bool:
     """
     Determine if an endpoint is an external OpenAI-compatible endpoint (not Ollama or llama-server).
@@ -2908,8 +2921,8 @@ async def ps_details_proxy(request: Request):
     llama_models_pending: list[dict] = []
 
     for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
-        # Filter for loaded models only
-        loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]
+        # Include sleeping models too so _fetch_llama_props can unload them
+        loaded_models = [item for item in modellist if _is_llama_model_loaded_or_sleeping(item)]
         for item in loaded_models:
             if isinstance(item, dict) and item.get("id"):
                 raw_id = item["id"]
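
Note for reviewers: a minimal sketch of how the new helper is expected to
treat the status shapes described in its docstring. The payloads below are
invented for illustration and are not captured llama-server /v1/models
output.

    # Hypothetical /v1/models entries, for illustration only.
    assert _is_llama_model_loaded_or_sleeping({"id": "m", "status": "loaded"})
    assert _is_llama_model_loaded_or_sleeping({"id": "m", "status": "sleeping"})
    assert _is_llama_model_loaded_or_sleeping({"id": "m", "status": {"value": "sleeping"}})
    assert _is_llama_model_loaded_or_sleeping({"id": "m"})  # no status field: treated as loaded (older servers)
    assert not _is_llama_model_loaded_or_sleeping({"id": "m", "status": "unloading"})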