fix:
- _fetch_loaded_models_internal now writes _loaded_error_cache[endpoint] = time.time() when the /api/ps or /v1/models probe fails, and clears the entry on success.
- choose_endpoint now filters out candidates with a fresh (<300s) loaded-models error.
- /health now probes both /api/version and /api/ps for Ollama endpoints.
- Dashboard adaptation.

Relates to #83.
This commit is contained in:
parent
0b64a84e96
commit
db6aa73903
4 changed files with 251 additions and 90 deletions
|
|
@ -1,4 +1,5 @@
|
|||
"""Tests for choose_endpoint routing logic with mocked fetch calls."""
|
||||
import time
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
|
@ -25,10 +26,12 @@ def _make_cfg(endpoints, llama_eps=None, max_conn=2, endpoint_config=None, prior
|
|||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_usage():
|
||||
"""Clear usage_counts between tests to prevent bleed."""
|
||||
"""Clear usage_counts and error caches between tests to prevent bleed."""
|
||||
router.usage_counts.clear()
|
||||
router._loaded_error_cache.clear()
|
||||
yield
|
||||
router.usage_counts.clear()
|
||||
router._loaded_error_cache.clear()
|
||||
|
||||
|
||||
class TestChooseEndpointBasic:
|
||||
|
|
@ -102,6 +105,57 @@ class TestChooseEndpointBasic:
|
|||
# Least-busy is EP2
|
||||
assert ep == EP2
|
||||
|
||||
async def test_excludes_endpoint_with_recent_loaded_error(self):
|
||||
# Regression: issue #83 — when /api/ps fails for EP1 but EP1
|
||||
# still advertises the model via /api/tags, routing must not
|
||||
# fall back to EP1 just because it has a free slot.
|
||||
cfg = _make_cfg([EP1, EP2])
|
||||
|
||||
async def available(ep, *_):
|
||||
return {"llama3.2:latest"}
|
||||
|
||||
# EP1's /api/ps probe failed recently; EP2 is fine but the model
|
||||
# is not loaded there. Without the health filter, EP1 would be
|
||||
# picked by the free-slot fallback (step 4 in choose_endpoint).
|
||||
router._loaded_error_cache[EP1] = time.time()
|
||||
|
||||
with (
|
||||
patch.object(router, "config", cfg),
|
||||
patch.object(router.fetch, "available_models", side_effect=available),
|
||||
patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
|
||||
):
|
||||
ep, _ = await router.choose_endpoint("llama3.2:latest")
|
||||
assert ep == EP2
|
||||
|
||||
async def test_stale_loaded_error_does_not_exclude(self):
|
||||
# Errors older than the 300s window must not keep an endpoint
|
||||
# excluded forever.
|
||||
cfg = _make_cfg([EP1])
|
||||
router._loaded_error_cache[EP1] = time.time() - 301
|
||||
|
||||
with (
|
||||
patch.object(router, "config", cfg),
|
||||
patch.object(router.fetch, "available_models", AsyncMock(return_value={"m:latest"})),
|
||||
patch.object(router.fetch, "loaded_models", AsyncMock(return_value={"m:latest"})),
|
||||
):
|
||||
ep, _ = await router.choose_endpoint("m:latest")
|
||||
assert ep == EP1
|
||||
|
||||
async def test_all_unhealthy_still_routes(self):
|
||||
# If every candidate has a fresh loaded-error we still try one
|
||||
# (it may have recovered between the cache write and now) rather
|
||||
# than refusing to route.
|
||||
cfg = _make_cfg([EP1])
|
||||
router._loaded_error_cache[EP1] = time.time()
|
||||
|
||||
with (
|
||||
patch.object(router, "config", cfg),
|
||||
patch.object(router.fetch, "available_models", AsyncMock(return_value={"m:latest"})),
|
||||
patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
|
||||
):
|
||||
ep, _ = await router.choose_endpoint("m:latest")
|
||||
assert ep == EP1
|
||||
|
||||
async def test_reserve_increments_usage(self):
|
||||
cfg = _make_cfg([EP1])
|
||||
with (
|
||||
|
|
|
|||
|
|
@ -178,3 +178,33 @@ class TestFetchLoadedModels:
|
|||
first = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
||||
second = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
||||
assert first == second
|
||||
|
||||
async def test_records_error_in_loaded_error_cache_on_failure(self):
|
||||
# Regression: issue #83 — /api/ps failures must be recorded so
|
||||
# `choose_endpoint` can exclude unhealthy backends from routing.
|
||||
cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
|
||||
with patch.object(router, "config", cfg), aioresponses() as m:
|
||||
m.get(f"{MOCK_OLLAMA_EP}/api/ps", status=502, payload={})
|
||||
await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
||||
assert MOCK_OLLAMA_EP in router._loaded_error_cache
|
||||
|
||||
async def test_records_error_for_llama_server_on_failure(self):
|
||||
cfg = _make_cfg(ollama_eps=[], llama_eps=[MOCK_LLAMA_EP])
|
||||
with patch.object(router, "config", cfg), aioresponses() as m:
|
||||
m.get(f"{MOCK_LLAMA_EP}/models", status=502, payload={})
|
||||
await router.fetch.loaded_models(MOCK_LLAMA_EP)
|
||||
assert MOCK_LLAMA_EP in router._loaded_error_cache
|
||||
|
||||
async def test_clears_error_cache_on_subsequent_success(self):
|
||||
cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
|
||||
# Pre-seed an old error so loaded_models() falls through to the
|
||||
# network probe instead of short-circuiting on the error cache.
|
||||
async with router._loaded_error_cache_lock:
|
||||
router._loaded_error_cache[MOCK_OLLAMA_EP] = time.time() - 301
|
||||
with patch.object(router, "config", cfg), aioresponses() as m:
|
||||
m.get(
|
||||
f"{MOCK_OLLAMA_EP}/api/ps",
|
||||
payload={"models": [{"name": "qwen:7b"}]},
|
||||
)
|
||||
await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
||||
assert MOCK_OLLAMA_EP not in router._loaded_error_cache
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue