fix:

- _fetch_loaded_models_internal now writes _loaded_error_cache[endpoint] = time.time() on /api/ps or /v1/models failure, and clears the entry on success.
- choose_endpoint now filters out candidates with a fresh (<300s) loaded-models error.
- /health now probes both /api/version and /api/ps for Ollama endpoints.
- dashboard adaptation

relates to #83
2026-05-18 13:45:06 +02:00 · 2026-05-18 13:45:06 +02:00 · db6aa73903
commit db6aa73903
parent 0b64a84e96
4 changed files with 251 additions and 90 deletions
--- a/test/test_choose_endpoint.py
+++ b/test/test_choose_endpoint.py
@@ -1,4 +1,5 @@
 """Tests for choose_endpoint routing logic with mocked fetch calls."""
+import time
 from unittest.mock import AsyncMock, MagicMock, patch

 import pytest
@@ -25,10 +26,12 @@ def _make_cfg(endpoints, llama_eps=None, max_conn=2, endpoint_config=None, prior

@pytest.fixture(autouse=True)
 def reset_usage():
-    """Clear usage_counts between tests to prevent bleed."""
+    """Clear usage_counts and error caches between tests to prevent bleed."""
    router.usage_counts.clear()
+    router._loaded_error_cache.clear()
    yield
    router.usage_counts.clear()
+    router._loaded_error_cache.clear()


 class TestChooseEndpointBasic:
@@ -102,6 +105,57 @@ class TestChooseEndpointBasic:
        # Least-busy is EP2
        assert ep == EP2

+    async def test_excludes_endpoint_with_recent_loaded_error(self):
+        # Regression: issue #83 — when /api/ps fails for EP1 but EP1
+        # still advertises the model via /api/tags, routing must not
+        # fall back to EP1 just because it has a free slot.
+        cfg = _make_cfg([EP1, EP2])
+
+        async def available(ep, *_):
+            return {"llama3.2:latest"}
+
+        # EP1's /api/ps probe failed recently; EP2 is fine but the model
+        # is not loaded there. Without the health filter, EP1 would be
+        # picked by the free-slot fallback (step 4 in choose_endpoint).
+        router._loaded_error_cache[EP1] = time.time()
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", side_effect=available),
+            patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
+        ):
+            ep, _ = await router.choose_endpoint("llama3.2:latest")
+        assert ep == EP2
+
+    async def test_stale_loaded_error_does_not_exclude(self):
+        # Errors older than the 300s window must not keep an endpoint
+        # excluded forever.
+        cfg = _make_cfg([EP1])
+        router._loaded_error_cache[EP1] = time.time() - 301
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", AsyncMock(return_value={"m:latest"})),
+            patch.object(router.fetch, "loaded_models", AsyncMock(return_value={"m:latest"})),
+        ):
+            ep, _ = await router.choose_endpoint("m:latest")
+        assert ep == EP1
+
+    async def test_all_unhealthy_still_routes(self):
+        # If every candidate has a fresh loaded-error we still try one
+        # (it may have recovered between the cache write and now) rather
+        # than refusing to route.
+        cfg = _make_cfg([EP1])
+        router._loaded_error_cache[EP1] = time.time()
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", AsyncMock(return_value={"m:latest"})),
+            patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
+        ):
+            ep, _ = await router.choose_endpoint("m:latest")
+        assert ep == EP1
+
    async def test_reserve_increments_usage(self):
        cfg = _make_cfg([EP1])
        with (