fix:
All checks were successful
PR Tests / test (pull_request) Successful in 58s
NYX Security Scan / nyx-scan (pull_request) Successful in 6m59s

-  _fetch_loaded_models_internal now writes _loaded_error_cache[endpoint] = time.time() on /api/ps or /v1/models failure, and clears the entry on success
- choose_endpoint now filters out candidates with a fresh (<300s) loaded-models error.
-  /health now probes both /api/version and /api/ps for Ollama endpoints
-  dashboard adaption

relates to #83
This commit is contained in:
Alpha Nerd 2026-05-18 13:45:06 +02:00
parent 0b64a84e96
commit db6aa73903
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
4 changed files with 251 additions and 90 deletions

View file

@ -1,4 +1,5 @@
"""Tests for choose_endpoint routing logic with mocked fetch calls."""
import time
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
@ -25,10 +26,12 @@ def _make_cfg(endpoints, llama_eps=None, max_conn=2, endpoint_config=None, prior
@pytest.fixture(autouse=True)
def reset_usage():
"""Clear usage_counts between tests to prevent bleed."""
"""Clear usage_counts and error caches between tests to prevent bleed."""
router.usage_counts.clear()
router._loaded_error_cache.clear()
yield
router.usage_counts.clear()
router._loaded_error_cache.clear()
class TestChooseEndpointBasic:
@ -102,6 +105,57 @@ class TestChooseEndpointBasic:
# Least-busy is EP2
assert ep == EP2
async def test_excludes_endpoint_with_recent_loaded_error(self):
# Regression: issue #83 — when /api/ps fails for EP1 but EP1
# still advertises the model via /api/tags, routing must not
# fall back to EP1 just because it has a free slot.
cfg = _make_cfg([EP1, EP2])
async def available(ep, *_):
return {"llama3.2:latest"}
# EP1's /api/ps probe failed recently; EP2 is fine but the model
# is not loaded there. Without the health filter, EP1 would be
# picked by the free-slot fallback (step 4 in choose_endpoint).
router._loaded_error_cache[EP1] = time.time()
with (
patch.object(router, "config", cfg),
patch.object(router.fetch, "available_models", side_effect=available),
patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
):
ep, _ = await router.choose_endpoint("llama3.2:latest")
assert ep == EP2
async def test_stale_loaded_error_does_not_exclude(self):
# Errors older than the 300s window must not keep an endpoint
# excluded forever.
cfg = _make_cfg([EP1])
router._loaded_error_cache[EP1] = time.time() - 301
with (
patch.object(router, "config", cfg),
patch.object(router.fetch, "available_models", AsyncMock(return_value={"m:latest"})),
patch.object(router.fetch, "loaded_models", AsyncMock(return_value={"m:latest"})),
):
ep, _ = await router.choose_endpoint("m:latest")
assert ep == EP1
async def test_all_unhealthy_still_routes(self):
# If every candidate has a fresh loaded-error we still try one
# (it may have recovered between the cache write and now) rather
# than refusing to route.
cfg = _make_cfg([EP1])
router._loaded_error_cache[EP1] = time.time()
with (
patch.object(router, "config", cfg),
patch.object(router.fetch, "available_models", AsyncMock(return_value={"m:latest"})),
patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
):
ep, _ = await router.choose_endpoint("m:latest")
assert ep == EP1
async def test_reserve_increments_usage(self):
cfg = _make_cfg([EP1])
with (