diff --git a/routing.py b/routing.py index 059cf9a..6a0e205 100644 --- a/routing.py +++ b/routing.py @@ -202,6 +202,22 @@ async def choose_endpoint(model: str, reserve: bool = True, def utilization_ratio(ep: str) -> float: return tracking_usage(ep) / get_max_connections(ep) + def total_load(ep: str) -> int: + """Sum of in-flight requests across *all* models on the endpoint.""" + return sum(usage_counts.get(ep, {}).values()) + + def pick_least_loaded(eps: list[str]) -> str: + """Pick the endpoint with the lowest total load, breaking ties at + random. Using total load (not per-model usage) for both the ranking + *and* the tie-break is what keeps a request off a backend already + busy with a *different* model — otherwise the per-model count reads + zero everywhere and the ranking gets discarded. See issue: a cold + model B would land on the backend already serving model A while + other backends sat idle.""" + min_load = min(total_load(ep) for ep in eps) + tied = [ep for ep in eps if total_load(ep) == min_load] + return random.choice(tied) + # Priority map: position in all_endpoints list (lower = higher priority) ep_priority = {ep: i for i, ep in enumerate(all_endpoints)} @@ -235,15 +251,11 @@ async def choose_endpoint(model: str, reserve: bool = True, loaded_and_free.sort(key=utilization_ratio) selected = loaded_and_free[0] else: - # Sort ascending for load balancing — all endpoints here already have the - # model loaded, so there is no model-switching cost to optimise for. - loaded_and_free.sort(key=tracking_usage) - # When all candidates are equally idle, randomise to avoid always picking - # the first entry in a stable sort. - if all(tracking_usage(ep) == 0 for ep in loaded_and_free): - selected = random.choice(loaded_and_free) - else: - selected = loaded_and_free[0] + # All endpoints here already have the model loaded, so there + # is no model-switching cost to optimise for. Pick the least + # *total*-loaded one (tie broken at random) so we steer away + # from a backend busy serving other models. + selected = pick_least_loaded(loaded_and_free) else: # 4️⃣ Endpoints among the candidates that simply have a free slot endpoints_with_free_slot = [ @@ -257,14 +269,10 @@ async def choose_endpoint(model: str, reserve: bool = True, endpoints_with_free_slot.sort(key=utilization_ratio) selected = endpoints_with_free_slot[0] else: - # Sort by total endpoint load (ascending) to prefer idle endpoints. - endpoints_with_free_slot.sort( - key=lambda ep: sum(usage_counts.get(ep, {}).values()) - ) - if all(tracking_usage(ep) == 0 for ep in endpoints_with_free_slot): - selected = random.choice(endpoints_with_free_slot) - else: - selected = endpoints_with_free_slot[0] + # Prefer the endpoint with the lowest *total* load so the + # cold-start cost lands on genuinely idle hardware rather + # than a backend already busy with a different model. + selected = pick_least_loaded(endpoints_with_free_slot) else: # 5️⃣ All candidate endpoints are saturated – pick the least-busy one (will queue) if config.priority_routing: diff --git a/test/test_choose_endpoint.py b/test/test_choose_endpoint.py index ece609a..a6a7905 100644 --- a/test/test_choose_endpoint.py +++ b/test/test_choose_endpoint.py @@ -85,6 +85,33 @@ class TestChooseEndpointBasic: ep, _ = await router.choose_endpoint("llama3.2:latest") assert ep in (EP1, EP2) + async def test_cold_model_avoids_backend_busy_with_other_model(self): + # Regression: heterogeneous cluster. A cold model B (loaded nowhere) + # must not be routed to a backend already serving a *different* model + # while other backends sit idle. The step-4 idle check used to look at + # per-model usage (zero everywhere for B) and discard the total-load + # ranking, so B could land on the busy backend at random. + cfg = _make_cfg([EP1, EP2, EP3], max_conn=4) + + async def available(ep, *_): + return {"model-a:latest", "model-b:latest"} + + # EP3 is busy with model A; EP1 and EP2 are completely idle. Model B + # is loaded nowhere. + router.usage_counts[EP3]["model-a:latest"] = 1 + + with ( + patch.object(router, "config", cfg), + patch.object(router.fetch, "available_models", side_effect=available), + patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())), + ): + # Run repeatedly: the busy backend must be excluded every time, + # the idle two share the load at random. + for _ in range(50): + ep, _ = await router.choose_endpoint("model-b:latest", reserve=False) + assert ep in (EP1, EP2) + assert ep != EP3 + async def test_saturated_picks_least_busy(self): cfg = _make_cfg([EP1, EP2]) cfg.max_concurrent_connections = 1