fix: improve routing logic to favour unloaded backends; instead of looking at per-model load, it now looks at each backend's total load

2026-06-13 10:22:20 +02:00 · 2026-06-13 10:22:20 +02:00 · 5184123fd2
commit 5184123fd2
parent b28f175b61
2 changed files with 52 additions and 17 deletions
--- a/test/test_choose_endpoint.py
+++ b/test/test_choose_endpoint.py
@@ -85,6 +85,33 @@ class TestChooseEndpointBasic:
            ep, _ = await router.choose_endpoint("llama3.2:latest")
        assert ep in (EP1, EP2)

+    async def test_cold_model_avoids_backend_busy_with_other_model(self):
+        # Regression: heterogeneous cluster. A cold model B (loaded nowhere)
+        # must not be routed to a backend already serving a *different* model
+        # while other backends sit idle. The step-4 idle check used to look at
+        # per-model usage (zero everywhere for B) and discard the total-load
+        # ranking, so B could land on the busy backend at random.
+        cfg = _make_cfg([EP1, EP2, EP3], max_conn=4)
+
+        async def available(ep, *_):
+            return {"model-a:latest", "model-b:latest"}  # same set for every backend
+
+        # EP3 is busy with model A; EP1 and EP2 are completely idle. Model B
+        # is loaded nowhere.
+        router.usage_counts[EP3]["model-a:latest"] = 1
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", side_effect=available),
+            patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
+        ):
+            # Run repeatedly: the busy backend must be excluded every time,
+            # while the two idle backends share the load at random.
+            for _ in range(50):
+                ep, _ = await router.choose_endpoint("model-b:latest", reserve=False)
+                assert ep in (EP1, EP2)
+                assert ep != EP3  # already implied by the membership check above
+
    async def test_saturated_picks_least_busy(self):
        cfg = _make_cfg([EP1, EP2])
        cfg.max_concurrent_connections = 1