fix: always treat the advertised models of external OpenAI-compatible endpoints as loaded #128
All checks were successful
PR Tests / test (pull_request) Successful in 1m24s
NYX Security Scan / nyx-scan (pull_request) Successful in 5m39s

This commit is contained in:
Alpha Nerd 2026-06-25 14:22:54 +02:00
parent 1b3ad057a6
commit c9e495876b
Signed by: alpha-nerd
SSH key fingerprint: SHA256:QkkAgVoYi9TQ0UKPkiKSfnerZy2h4qhi3SVPXJmBN+M
2 changed files with 21 additions and 4 deletions

View file

@@ -321,7 +321,16 @@ class fetch:
to serve requests immediately even when cache is stale (refreshing in background).
"""
if is_ext_openai_endpoint(endpoint):
return set()
# External OpenAI-compatible backends (vLLM, OpenAI, Groq, …) keep
# every advertised model permanently resident — there is no
# /api/ps-style "loaded" subset to probe. Report the advertised set
# as the loaded set so choose_endpoint's "loaded & free" preference
# treats them on par with Ollama/llama-server backends that have the
# model hot, instead of relegating them to the free-slot fallback and
# never using them when an Ollama box advertises the same model.
# See issue #128. (llama-server / llama-swap are NOT ext-openai — they
# keep their real resident-subset detection below.)
return await fetch.available_models(endpoint, get_config().api_keys.get(endpoint))
# Check loaded models cache with lock protection
async with _loaded_models_cache_lock:

View file

@@ -260,12 +260,20 @@ class TestFetchLoadedModels:
models = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
assert models == set()
async def test_ext_openai_always_empty(self):
async def test_ext_openai_reports_advertised_as_loaded(self):
# Issue #128: external OpenAI-compatible backends (vLLM, OpenAI, …) keep
# every advertised model permanently resident, so loaded_models mirrors
# the advertised set rather than returning empty (which made them lose
# choose_endpoint's "loaded & free" preference to Ollama boxes).
ext_ep = "https://api.openai.com/v1"
cfg = _make_cfg(ollama_eps=[ext_ep], llama_eps=[])
with patch.object(router, "config", cfg):
with patch.object(router, "config", cfg), mock_probe() as m:
m.add_get(
f"{ext_ep}/models",
payload={"data": [{"id": "gpt-4o"}, {"id": "gpt-4o-mini"}]},
)
models = await router.fetch.loaded_models(ext_ep)
assert models == set()
assert models == {"gpt-4o", "gpt-4o-mini"}
async def test_caches_result(self):
cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])