fix: endpoint selection logic
This commit is contained in:
parent
5ad5bfe66e
commit
3e3f0dd383
1 changed file with 10 additions and 10 deletions
20
router.py
20
router.py
|
|
@@ -980,13 +980,11 @@ async def choose_endpoint(model: str) -> str:
|
||||||
]
|
]
|
||||||
|
|
||||||
if loaded_and_free:
|
if loaded_and_free:
|
||||||
# Sort by total endpoint usage first (prefer idle endpoints)
|
# Sort by per-model usage in DESCENDING order to ensure model affinity
|
||||||
# Then by per-model usage (balance load for this specific model)
|
# Endpoints with higher usage (already handling this model) should be preferred
|
||||||
|
# until they reach max_concurrent_connections
|
||||||
loaded_and_free.sort(
|
loaded_and_free.sort(
|
||||||
key=lambda ep: (
|
key=lambda ep: -usage_counts.get(ep, {}).get(model, 0) # Negative for descending order
|
||||||
sum(usage_counts.get(ep, {}).values()), # Primary: total endpoint usage
|
|
||||||
usage_counts.get(ep, {}).get(model, 0) # Secondary: per-model usage
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
return loaded_and_free[0]
|
return loaded_and_free[0]
|
||||||
|
|
||||||
|
|
@@ -997,12 +995,14 @@ async def choose_endpoint(model: str) -> str:
|
||||||
]
|
]
|
||||||
|
|
||||||
if endpoints_with_free_slot:
|
if endpoints_with_free_slot:
|
||||||
# Sort by total endpoint usage first (prefer idle endpoints)
|
# Sort by per-model usage (descending) first to ensure model affinity
|
||||||
# Then by per-model usage (balance load for this specific model)
|
# Even if the model isn't showing as "loaded" in /api/ps yet (e.g., during initial loading),
|
||||||
|
# we want to send subsequent requests to the endpoint that already has connections for this model
|
||||||
|
# Then by total endpoint usage (ascending) to balance idle endpoints
|
||||||
endpoints_with_free_slot.sort(
|
endpoints_with_free_slot.sort(
|
||||||
key=lambda ep: (
|
key=lambda ep: (
|
||||||
sum(usage_counts.get(ep, {}).values()), # Primary: total endpoint usage
|
-usage_counts.get(ep, {}).get(model, 0), # Primary: per-model usage (descending - prefer endpoints with connections)
|
||||||
usage_counts.get(ep, {}).get(model, 0) # Secondary: per-model usage
|
sum(usage_counts.get(ep, {}).values()) # Secondary: total endpoint usage (ascending - prefer idle endpoints)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return endpoints_with_free_slot[0]
|
return endpoints_with_free_slot[0]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue