diff --git a/router.py b/router.py index 5676bb9..1ef096a 100644 --- a/router.py +++ b/router.py @@ -1525,27 +1525,26 @@ async def choose_endpoint(model: str) -> str: # Protect all reads of usage_counts with the lock async with usage_lock: - # Helper: get current usage count for (endpoint, model) - def current_usage(ep: str) -> int: - return usage_counts.get(ep, {}).get(model, 0) + # Helper: current usage for (endpoint, model) using the same normalized key + # that increment_usage/decrement_usage store — raw model names differ from + # tracking names for llama-server (HF prefix / quant suffix stripped). + def tracking_usage(ep: str) -> int: + return usage_counts.get(ep, {}).get(get_tracking_model(ep, model), 0) # 3️⃣ Endpoints that have the model loaded *and* a free slot loaded_and_free = [ ep for ep, models in zip(candidate_endpoints, loaded_sets) - if model in models and usage_counts.get(ep, {}).get(model, 0) < config.max_concurrent_connections + if model in models and tracking_usage(ep) < config.max_concurrent_connections ] if loaded_and_free: - # Sort by per-model usage in ASCENDING order for load balancing. - # All endpoints in this set already have the model loaded, so there is - # no model-switching cost to avoid — prefer the least-busy endpoint. - loaded_and_free.sort( - key=lambda ep: usage_counts.get(ep, {}).get(model, 0) - ) + # Sort ascending for load balancing — all endpoints here already have the + # model loaded, so there is no model-switching cost to optimise for. + loaded_and_free.sort(key=tracking_usage) - # If all endpoints have zero usage for this model, randomize to distribute - # different models across different endpoints for better resource utilization - if all(usage_counts.get(ep, {}).get(model, 0) == 0 for ep in loaded_and_free): + # When all candidates are equally idle, randomise to avoid always picking + # the first entry in a stable sort. + if all(tracking_usage(ep) == 0 for ep in loaded_and_free): return random.choice(loaded_and_free) return loaded_and_free[0] @@ -1553,30 +1552,22 @@ async def choose_endpoint(model: str) -> str: # 4️⃣ Endpoints among the candidates that simply have a free slot endpoints_with_free_slot = [ ep for ep in candidate_endpoints - if usage_counts.get(ep, {}).get(model, 0) < config.max_concurrent_connections + if tracking_usage(ep) < config.max_concurrent_connections ] if endpoints_with_free_slot: - # Sort by per-model usage (descending) first to ensure model affinity - # Even if the model isn't showing as "loaded" in /api/ps yet (e.g., during initial loading), - # we want to send subsequent requests to the endpoint that already has connections for this model - # Then by total endpoint usage (ascending) to balance idle endpoints + # Sort by total endpoint load (ascending) to prefer idle endpoints. endpoints_with_free_slot.sort( - key=lambda ep: ( - #-usage_counts.get(ep, {}).get(model, 0), # Primary: per-model usage (descending - prefer endpoints with connections) - sum(usage_counts.get(ep, {}).values()) # Secondary: total endpoint usage (ascending - prefer idle endpoints) - ) + key=lambda ep: sum(usage_counts.get(ep, {}).values()) ) - # If all endpoints have zero usage for this specific model, randomize to distribute - # different models across different endpoints for better resource utilization - if all(usage_counts.get(ep, {}).get(model, 0) == 0 for ep in endpoints_with_free_slot): + if all(tracking_usage(ep) == 0 for ep in endpoints_with_free_slot): return random.choice(endpoints_with_free_slot) return endpoints_with_free_slot[0] - # 5️⃣ All candidate endpoints are saturated – pick one with lowest usages count (will queue) - ep = min(candidate_endpoints, key=current_usage) + # 5️⃣ All candidate endpoints are saturated – pick the least-busy one (will queue) + ep = min(candidate_endpoints, key=tracking_usage) return ep # -------------------------------------------------------------