diff --git a/router.py b/router.py
index e15efcc..990974f 100644
--- a/router.py
+++ b/router.py
@@ -997,11 +997,11 @@ class rechunk:
             )
             with_thinking = chunk.choices[0] if chunk.choices[0] else None
             if stream == True:
-                thinking = getattr(with_thinking.delta, "reasoning", None) if with_thinking else None
+                thinking = (getattr(with_thinking.delta, "reasoning_content", None) or getattr(with_thinking.delta, "reasoning", None)) if with_thinking else None
                 role = chunk.choices[0].delta.role or "assistant"
                 content = chunk.choices[0].delta.content or ''
             else:
-                thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
+                thinking = (getattr(with_thinking.message, "reasoning_content", None) or getattr(with_thinking.message, "reasoning", None)) if with_thinking else None
                 role = chunk.choices[0].message.role or "assistant"
                 content = chunk.choices[0].message.content or ''
             assistant_msg = ollama.Message(
@@ -1211,7 +1211,7 @@ async def choose_endpoint(model: str) -> str:
     # Then by total endpoint usage (ascending) to balance idle endpoints
     endpoints_with_free_slot.sort(
         key=lambda ep: (
-            -usage_counts.get(ep, {}).get(model, 0), # Primary: per-model usage (descending - prefer endpoints with connections)
+            #-usage_counts.get(ep, {}).get(model, 0), # Primary: per-model usage (descending - prefer endpoints with connections)
             sum(usage_counts.get(ep, {}).values()) # Secondary: total endpoint usage (ascending - prefer idle endpoints)
         )
     )