fix: always treat the advertised models of external OpenAI-compatible endpoints as loaded #128

2026-06-25 14:22:54 +02:00 · 2026-06-25 14:22:54 +02:00 · c9e495876b
commit c9e495876b
parent 1b3ad057a6
2 changed files with 21 additions and 4 deletions
--- a/backends/probe.py
+++ b/backends/probe.py
@@ -321,7 +321,16 @@ class fetch:
        to serve requests immediately even when cache is stale (refreshing in background).
        """
        if is_ext_openai_endpoint(endpoint):
-            return set()
+            # External OpenAI-compatible backends (vLLM, OpenAI, Groq, …) keep
+            # every advertised model permanently resident — there is no
+            # /api/ps-style "loaded" subset to probe. Report the advertised set
+            # as the loaded set so choose_endpoint's "loaded & free" preference
+            # treats them on par with Ollama/llama-server backends that have the
+            # model hot, instead of relegating them to the free-slot fallback and
+            # never using them when an Ollama box advertises the same model.
+            # See issue #128. (llama-server / llama-swap are NOT ext-openai — they
+            # keep their real resident-subset detection below.)
+            return await fetch.available_models(endpoint, get_config().api_keys.get(endpoint))

        # Check loaded models cache with lock protection
        async with _loaded_models_cache_lock:
--- a/test/test_fetch.py
+++ b/test/test_fetch.py
@@ -260,12 +260,20 @@ class TestFetchLoadedModels:
            models = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
        assert models == set()

-    async def test_ext_openai_always_empty(self):
+    async def test_ext_openai_reports_advertised_as_loaded(self):
+        # Issue #128: external OpenAI-compatible backends (vLLM, OpenAI, …) keep
+        # every advertised model permanently resident, so loaded_models mirrors
+        # the advertised set rather than returning empty (which made them lose
+        # choose_endpoint's "loaded & free" preference to Ollama boxes).
        ext_ep = "https://api.openai.com/v1"
        cfg = _make_cfg(ollama_eps=[ext_ep], llama_eps=[])
-        with patch.object(router, "config", cfg):
+        with patch.object(router, "config", cfg), mock_probe() as m:
+            m.add_get(
+                f"{ext_ep}/models",
+                payload={"data": [{"id": "gpt-4o"}, {"id": "gpt-4o-mini"}]},
+            )
            models = await router.fetch.loaded_models(ext_ep)
-        assert models == set()
+        assert models == {"gpt-4o", "gpt-4o-mini"}

    async def test_caches_result(self):
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])