diff --git a/backends/probe.py b/backends/probe.py
index f59e65e..2e53f01 100644
--- a/backends/probe.py
+++ b/backends/probe.py
@@ -321,7 +321,16 @@ class fetch:
         to serve requests immediately even when cache is stale (refreshing in background).
         """
         if is_ext_openai_endpoint(endpoint):
-            return set()
+            # External OpenAI-compatible backends (vLLM, OpenAI, Groq, …) keep
+            # every advertised model permanently resident — there is no
+            # /api/ps-style "loaded" subset to probe. Report the advertised set
+            # as the loaded set so choose_endpoint's "loaded & free" preference
+            # treats them on par with Ollama/llama-server backends that have the
+            # model hot, instead of relegating them to the free-slot fallback and
+            # never using them when an Ollama box advertises the same model.
+            # See issue #128. (llama-server / llama-swap are NOT ext-openai — they
+            # keep their real resident-subset detection below.)
+            return await fetch.available_models(endpoint, get_config().api_keys.get(endpoint))
 
         # Check loaded models cache with lock protection
         async with _loaded_models_cache_lock:
diff --git a/test/test_fetch.py b/test/test_fetch.py
index dae51e4..ef7e30e 100644
--- a/test/test_fetch.py
+++ b/test/test_fetch.py
@@ -260,12 +260,20 @@ class TestFetchLoadedModels:
             models = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
         assert models == set()
 
-    async def test_ext_openai_always_empty(self):
+    async def test_ext_openai_reports_advertised_as_loaded(self):
+        # Issue #128: external OpenAI-compatible backends (vLLM, OpenAI, …) keep
+        # every advertised model permanently resident, so loaded_models mirrors
+        # the advertised set rather than returning empty (which made them lose
+        # choose_endpoint's "loaded & free" preference to Ollama boxes).
         ext_ep = "https://api.openai.com/v1"
         cfg = _make_cfg(ollama_eps=[ext_ep], llama_eps=[])
-        with patch.object(router, "config", cfg):
+        with patch.object(router, "config", cfg), mock_probe() as m:
+            m.add_get(
+                f"{ext_ep}/models",
+                payload={"data": [{"id": "gpt-4o"}, {"id": "gpt-4o-mini"}]},
+            )
             models = await router.fetch.loaded_models(ext_ep)
-        assert models == set()
+        assert models == {"gpt-4o", "gpt-4o-mini"}
 
     async def test_caches_result(self):
         cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])