fix: extend endpoint selection to rank on total_load AND loaded_count
This commit is contained in:
parent
5184123fd2
commit
c8da58430a
2 changed files with 61 additions and 9 deletions
39
routing.py
39
routing.py
|
|
@ -206,16 +206,37 @@ async def choose_endpoint(model: str, reserve: bool = True,
|
|||
"""Sum of in-flight requests across *all* models on the endpoint."""
|
||||
return sum(usage_counts.get(ep, {}).values())
|
||||
|
||||
# Resident-model count per candidate, taken from the /api/ps probe results.
# With infinite keep-alive a model stays loaded long after its in-flight
# count has dropped to zero, so this count is the signal used to spread
# *distinct* models across backends.
ep_loaded_counts = dict(zip(candidate_endpoints, map(len, loaded_sets)))
|
||||
|
||||
def loaded_count(ep: str) -> int:
    """Return the number of models recorded as resident on *ep*.

    Endpoints missing from ``ep_loaded_counts`` count as 0.
    """
    resident = ep_loaded_counts.get(ep, 0)
    return resident
|
||||
|
||||
def pick_least_loaded(eps: list[str]) -> str:
    """Pick the least-committed endpoint, breaking ties at random.

    Endpoints are ranked by the tuple ``(total_load, loaded_count)``:

    * ``total_load`` (in-flight requests across *all* models) keeps a
      request off a backend already busy with a *different* model --
      otherwise the per-model count reads zero everywhere and the
      ranking is discarded (cold model B landing on the box serving A).
    * ``loaded_count`` (number of *resident* models) then spreads
      distinct models across backends. Two different cold models (27b,
      35b) requested back-to-back must not pile onto the same box: once
      27b is resident there, that box has loaded_count 1 while the idle
      backends have 0, so the next cold model prefers an empty backend
      even though every backend reports zero in-flight load.

    ``random.choice`` only breaks genuine ties on both keys, so a single
    idle cluster still distributes the very first cold model evenly.

    Args:
        eps: Candidate endpoints; must be non-empty.

    Returns:
        One endpoint from *eps* whose ranking key is minimal.

    Raises:
        ValueError: If *eps* is empty (propagated from ``min``).
    """
    # Evaluate each endpoint's key exactly once. A list of pairs (not a
    # dict) preserves both the order and any duplicates in ``eps``, so the
    # list handed to ``random.choice`` is the same as before.
    ranked = [((total_load(ep), loaded_count(ep)), ep) for ep in eps]
    best = min(key for key, _ in ranked)
    tied = [ep for key, ep in ranked if key == best]
    return random.choice(tied)
|
||||
|
||||
# Priority map: position in all_endpoints list (lower = higher priority)
|
||||
|
|
|
|||
|
|
@ -112,6 +112,37 @@ class TestChooseEndpointBasic:
|
|||
assert ep in (EP1, EP2)
|
||||
assert ep != EP3
|
||||
|
||||
async def test_two_cold_models_spread_across_backends(self):
    """A cold model must avoid a backend that already holds another model.

    Regression: 3 backends all advertise all models. Two *different* cold
    models requested back-to-back must land on *different* backends. Once
    model-a is resident on the chosen backend (infinite keep-alive), its
    in-flight count drops back to 0, so only the resident-model count
    distinguishes the backends. Without it, the second cold model could
    randomly collide with model-a on the same backend.
    """
    cfg = _make_cfg([EP1, EP2, EP3], max_conn=4)

    # model-a finished loading on EP1 and stays resident; its request has
    # completed, so EP1 has zero in-flight load, same as EP2/EP3.
    resident_by_ep = {EP1: {"model-a:latest"}, EP2: set(), EP3: set()}

    async def fake_available(ep, *_):
        # Every backend advertises both models.
        return {"model-a:latest", "model-b:latest"}

    async def fake_loaded(ep):
        return resident_by_ep[ep]

    with (
        patch.object(router, "config", cfg),
        patch.object(router.fetch, "available_models", side_effect=fake_available),
        patch.object(router.fetch, "loaded_models", side_effect=fake_loaded),
    ):
        # The choice among tied backends is random, so repeat the pick: a
        # cold model-b must go to one of the empty backends every time.
        for _attempt in range(50):
            chosen, _ = await router.choose_endpoint("model-b:latest", reserve=False)
            assert chosen in (EP2, EP3)
            assert chosen != EP1
|
||||
|
||||
async def test_saturated_picks_least_busy(self):
|
||||
cfg = _make_cfg([EP1, EP2])
|
||||
cfg.max_concurrent_connections = 1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue