fix: improve routing logic to favour unloaded backends instead of looking at per-model load now looking at backend total load

2026-06-13 10:22:20 +02:00 · 2026-06-13 10:22:20 +02:00 · 5184123fd2
commit 5184123fd2
parent b28f175b61
2 changed files with 52 additions and 17 deletions
--- a/routing.py
+++ b/routing.py
@ -202,6 +202,22 @@ async def choose_endpoint(model: str, reserve: bool = True,
        def utilization_ratio(ep: str) -> float:
            return tracking_usage(ep) / get_max_connections(ep)

+        def total_load(ep: str) -> int:
+            """Sum of in-flight requests across *all* models on the endpoint."""
+            return sum(usage_counts.get(ep, {}).values())
+
+        def pick_least_loaded(eps: list[str]) -> str:
+            """Pick the endpoint with the lowest total load, breaking ties at
+            random. Using total load (not per-model usage) for both the ranking
+            *and* the tie-break is what keeps a request off a backend already
+            busy with a *different* model — otherwise the per-model count reads
+            zero everywhere and the ranking gets discarded. See issue: a cold
+            model B would land on the backend already serving model A while
+            other backends sat idle."""
+            min_load = min(total_load(ep) for ep in eps)
+            tied = [ep for ep in eps if total_load(ep) == min_load]
+            return random.choice(tied)
+
        # Priority map: position in all_endpoints list (lower = higher priority)
        ep_priority = {ep: i for i, ep in enumerate(all_endpoints)}

@ -235,15 +251,11 @@ async def choose_endpoint(model: str, reserve: bool = True,
                    loaded_and_free.sort(key=utilization_ratio)
                    selected = loaded_and_free[0]
                else:
-                    # Sort ascending for load balancing — all endpoints here already have the
-                    # model loaded, so there is no model-switching cost to optimise for.
-                    loaded_and_free.sort(key=tracking_usage)
-                    # When all candidates are equally idle, randomise to avoid always picking
-                    # the first entry in a stable sort.
-                    if all(tracking_usage(ep) == 0 for ep in loaded_and_free):
-                        selected = random.choice(loaded_and_free)
-                    else:
-                        selected = loaded_and_free[0]
+                    # All endpoints here already have the model loaded, so there
+                    # is no model-switching cost to optimise for. Pick the least
+                    # *total*-loaded one (tie broken at random) so we steer away
+                    # from a backend busy serving other models.
+                    selected = pick_least_loaded(loaded_and_free)
            else:
                # 4️⃣ Endpoints among the candidates that simply have a free slot
                endpoints_with_free_slot = [
@ -257,14 +269,10 @@ async def choose_endpoint(model: str, reserve: bool = True,
                        endpoints_with_free_slot.sort(key=utilization_ratio)
                        selected = endpoints_with_free_slot[0]
                    else:
-                        # Sort by total endpoint load (ascending) to prefer idle endpoints.
-                        endpoints_with_free_slot.sort(
-                            key=lambda ep: sum(usage_counts.get(ep, {}).values())
-                        )
-                        if all(tracking_usage(ep) == 0 for ep in endpoints_with_free_slot):
-                            selected = random.choice(endpoints_with_free_slot)
-                        else:
-                            selected = endpoints_with_free_slot[0]
+                        # Prefer the endpoint with the lowest *total* load so the
+                        # cold-start cost lands on genuinely idle hardware rather
+                        # than a backend already busy with a different model.
+                        selected = pick_least_loaded(endpoints_with_free_slot)
                else:
                    # 5️⃣ All candidate endpoints are saturated – pick the least-busy one (will queue)
                    if config.priority_routing:
--- a/test/test_choose_endpoint.py
+++ b/test/test_choose_endpoint.py
@ -85,6 +85,33 @@ class TestChooseEndpointBasic:
            ep, _ = await router.choose_endpoint("llama3.2:latest")
        assert ep in (EP1, EP2)

+    async def test_cold_model_avoids_backend_busy_with_other_model(self):
+        # Regression: heterogeneous cluster. A cold model B (loaded nowhere)
+        # must not be routed to a backend already serving a *different* model
+        # while other backends sit idle. The step-4 idle check used to look at
+        # per-model usage (zero everywhere for B) and discard the total-load
+        # ranking, so B could land on the busy backend at random.
+        cfg = _make_cfg([EP1, EP2, EP3], max_conn=4)
+
+        async def available(ep, *_):
+            return {"model-a:latest", "model-b:latest"}
+
+        # EP3 is busy with model A; EP1 and EP2 are completely idle. Model B
+        # is loaded nowhere.
+        router.usage_counts[EP3]["model-a:latest"] = 1
+
+        with (
+            patch.object(router, "config", cfg),
+            patch.object(router.fetch, "available_models", side_effect=available),
+            patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
+        ):
+            # Run repeatedly: the busy backend must be excluded every time,
+            # the idle two share the load at random.
+            for _ in range(50):
+                ep, _ = await router.choose_endpoint("model-b:latest", reserve=False)
+                assert ep in (EP1, EP2)
+                assert ep != EP3
+
    async def test_saturated_picks_least_busy(self):
        cfg = _make_cfg([EP1, EP2])
        cfg.max_concurrent_connections = 1