diff --git a/router.py b/router.py index 0a2ba57..5bb1eaa 100644 --- a/router.py +++ b/router.py @@ -980,13 +980,11 @@ async def choose_endpoint(model: str) -> str: ] if loaded_and_free: - # Sort by total endpoint usage first (prefer idle endpoints) - # Then by per-model usage (balance load for this specific model) + # Sort by per-model usage in DESCENDING order to ensure model affinity + # Endpoints with higher usage (already handling this model) should be preferred + # until they reach max_concurrent_connections loaded_and_free.sort( - key=lambda ep: ( - sum(usage_counts.get(ep, {}).values()), # Primary: total endpoint usage - usage_counts.get(ep, {}).get(model, 0) # Secondary: per-model usage - ) + key=lambda ep: -usage_counts.get(ep, {}).get(model, 0) # Negative for descending order ) return loaded_and_free[0] @@ -997,12 +995,14 @@ async def choose_endpoint(model: str) -> str: ] if endpoints_with_free_slot: - # Sort by total endpoint usage first (prefer idle endpoints) - # Then by per-model usage (balance load for this specific model) + # Sort by per-model usage (descending) first to ensure model affinity + # Even if the model isn't showing as "loaded" in /api/ps yet (e.g., during initial loading), + # we want to send subsequent requests to the endpoint that already has connections for this model + # Then by total endpoint usage (ascending) to balance idle endpoints endpoints_with_free_slot.sort( key=lambda ep: ( - sum(usage_counts.get(ep, {}).values()), # Primary: total endpoint usage - usage_counts.get(ep, {}).get(model, 0) # Secondary: per-model usage + -usage_counts.get(ep, {}).get(model, 0), # Primary: per-model usage (descending - prefer endpoints with connections) + sum(usage_counts.get(ep, {}).values()) # Secondary: total endpoint usage (ascending - prefer idle endpoints) ) ) return endpoints_with_free_slot[0]