feat(router): parallelize llama-server props fetch and add reasoning/tool call support
- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming OpenAI chat/completions responses
parent 4d40048fd2
commit 372fe9fb72
1 changed file with 66 additions and 3 deletions

router.py (69 changed lines)

@@ -2430,6 +2430,10 @@ async def ps_details_proxy(request: Request):
     # Add llama-server models with endpoint info and full status metadata (if any)
     if llama_loaded:
+        # Collect (endpoint, raw_id) pairs to fetch /props in parallel
+        props_requests: list[tuple[str, str]] = []
+        llama_models_pending: list[dict] = []
+
         for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
             # Filter for loaded models only
             loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]

@@ -2454,7 +2458,53 @@ async def ps_details_proxy(request: Request):
                 if isinstance(status_info, dict):
                     model_with_endpoint["llama_status_args"] = status_info.get("args")
                     model_with_endpoint["llama_status_preset"] = status_info.get("preset")
-                models.append(model_with_endpoint)
+                llama_models_pending.append(model_with_endpoint)
+                props_requests.append((endpoint, raw_id))
+
+        # Fetch /props for each llama-server model to get context length (n_ctx)
+        # and unload sleeping models automatically
+        async def _fetch_llama_props(endpoint: str, model_id: str) -> tuple[int | None, bool]:
+            client: aiohttp.ClientSession = app_state["session"]
+            base_url = endpoint.rstrip("/").removesuffix("/v1")
+            props_url = f"{base_url}/props?model={model_id}"
+            headers = None
+            api_key = config.api_keys.get(endpoint)
+            if api_key:
+                headers = {"Authorization": f"Bearer {api_key}"}
+            try:
+                async with client.get(props_url, headers=headers) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        dgs = data.get("default_generation_settings", {})
+                        n_ctx = dgs.get("n_ctx")
+                        is_sleeping = data.get("is_sleeping", False)
+
+                        if is_sleeping:
+                            unload_url = f"{base_url}/models/unload"
+                            try:
+                                async with client.post(
+                                    unload_url,
+                                    json={"model": model_id},
+                                    headers=headers,
+                                ) as unload_resp:
+                                    print(f"[ps_details] Unloaded sleeping model {model_id} from {endpoint}: {unload_resp.status}")
+                            except Exception as ue:
+                                print(f"[ps_details] Failed to unload sleeping model {model_id} from {endpoint}: {ue}")
+
+                        return n_ctx, is_sleeping
+            except Exception as e:
+                print(f"[ps_details] Failed to fetch props from {props_url}: {e}")
+            return None, False
+
+        props_results = await asyncio.gather(
+            *[_fetch_llama_props(ep, mid) for ep, mid in props_requests]
+        )
+
+        for model_dict, (n_ctx, is_sleeping) in zip(llama_models_pending, props_results):
+            if n_ctx is not None:
+                model_dict["context_length"] = n_ctx
+            if not is_sleeping:
+                models.append(model_dict)
+
     return JSONResponse(content={"models": models}, status_code=200)

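Note on the hunk above: `_fetch_llama_props` only reads two fields of the llama-server `/props` response, `default_generation_settings.n_ctx` and `is_sleeping`. A minimal sketch of that access pattern, with an illustrative payload (the real response carries many more keys, and the values here are made up):

    # Illustrative /props payload; only the two fields the helper reads are shown.
    props = {
        "default_generation_settings": {"n_ctx": 32768},  # copied into model_dict["context_length"]
        "is_sleeping": False,                              # sleeping models get unloaded and hidden from the listing
    }
    n_ctx = props.get("default_generation_settings", {}).get("n_ctx")
    is_sleeping = props.get("is_sleeping", False)

Because asyncio.gather returns results in the order of its awaitables, props_results[i] corresponds to llama_models_pending[i], which is what the final zip relies on.
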
@@ -2659,7 +2709,14 @@ async def openai_chat_completions_proxy(request: Request):
                     else orjson.dumps(chunk)
                 )
                 if chunk.choices:
-                    if chunk.choices[0].delta.content is not None:
+                    delta = chunk.choices[0].delta
+                    has_content = delta.content is not None
+                    has_reasoning = (
+                        getattr(delta, "reasoning_content", None) is not None
+                        or getattr(delta, "reasoning", None) is not None
+                    )
+                    has_tool_calls = getattr(delta, "tool_calls", None) is not None
+                    if has_content or has_reasoning or has_tool_calls:
                         yield f"data: {data}\n\n".encode("utf-8")
                 elif chunk.usage is not None:
                     # Forward the usage-only final chunk (e.g. from llama-server)

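Before this hunk, a streamed chat chunk whose delta carried only reasoning text or a tool-call fragment (with `content` still None) was silently dropped; the new predicate forwards it. A small illustrative check, using SimpleNamespace in place of the SDK's delta object (attribute names match the diff; the tool-call value is made up):

    from types import SimpleNamespace

    # A delta as it might arrive mid tool call: no visible content yet.
    delta = SimpleNamespace(
        content=None,
        reasoning_content=None,
        reasoning=None,
        tool_calls=[{"index": 0, "function": {"name": "lookup", "arguments": "{"}}],
    )

    has_content = delta.content is not None
    has_reasoning = (
        getattr(delta, "reasoning_content", None) is not None
        or getattr(delta, "reasoning", None) is not None
    )
    has_tool_calls = getattr(delta, "tool_calls", None) is not None

    # Old check (delta.content is not None) -> False: the chunk would have been dropped.
    # New check -> True: the tool-call fragment is forwarded to the client.
    assert has_content or has_reasoning or has_tool_calls

The completions hunk below applies the same idea to the legacy endpoint, checking choice.text, the reasoning fields, and finish_reason instead of the chat delta.
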
@@ -2792,7 +2849,13 @@ async def openai_completions_proxy(request: Request):
                     else orjson.dumps(chunk)
                 )
                 if chunk.choices:
-                    if chunk.choices[0].finish_reason == None:
+                    choice = chunk.choices[0]
+                    has_text = getattr(choice, "text", None) is not None
+                    has_reasoning = (
+                        getattr(choice, "reasoning_content", None) is not None
+                        or getattr(choice, "reasoning", None) is not None
+                    )
+                    if has_text or has_reasoning or choice.finish_reason is not None:
                         yield f"data: {data}\n\n".encode("utf-8")
                 elif chunk.usage is not None:
                     # Forward the usage-only final chunk (e.g. from llama-server)