feat(router): parallelize llama-server props fetch and add reasoning/tool call support
- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming OpenAI chat/completions responses
parent 4d40048fd2
commit 372fe9fb72
1 changed file with 66 additions and 3 deletions

router.py (69 changed lines)

@@ -2430,6 +2430,10 @@ async def ps_details_proxy(request: Request):
     # Add llama-server models with endpoint info and full status metadata (if any)
     if llama_loaded:
+        # Collect (endpoint, raw_id) pairs to fetch /props in parallel
+        props_requests: list[tuple[str, str]] = []
+        llama_models_pending: list[dict] = []
+
         for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
             # Filter for loaded models only
             loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]

@@ -2454,7 +2458,53 @@ async def ps_details_proxy(request: Request):
                 if isinstance(status_info, dict):
                     model_with_endpoint["llama_status_args"] = status_info.get("args")
                     model_with_endpoint["llama_status_preset"] = status_info.get("preset")
-                models.append(model_with_endpoint)
+                llama_models_pending.append(model_with_endpoint)
+                props_requests.append((endpoint, raw_id))
+
+        # Fetch /props for each llama-server model to get context length (n_ctx)
+        # and unload sleeping models automatically
+        async def _fetch_llama_props(endpoint: str, model_id: str) -> tuple[int | None, bool]:
+            client: aiohttp.ClientSession = app_state["session"]
+            base_url = endpoint.rstrip("/").removesuffix("/v1")
+            props_url = f"{base_url}/props?model={model_id}"
+            headers = None
+            api_key = config.api_keys.get(endpoint)
+            if api_key:
+                headers = {"Authorization": f"Bearer {api_key}"}
+            try:
+                async with client.get(props_url, headers=headers) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        dgs = data.get("default_generation_settings", {})
+                        n_ctx = dgs.get("n_ctx")
+                        is_sleeping = data.get("is_sleeping", False)
+
+                        if is_sleeping:
+                            unload_url = f"{base_url}/models/unload"
+                            try:
+                                async with client.post(
+                                    unload_url,
+                                    json={"model": model_id},
+                                    headers=headers,
+                                ) as unload_resp:
+                                    print(f"[ps_details] Unloaded sleeping model {model_id} from {endpoint}: {unload_resp.status}")
+                            except Exception as ue:
+                                print(f"[ps_details] Failed to unload sleeping model {model_id} from {endpoint}: {ue}")
+
+                        return n_ctx, is_sleeping
+            except Exception as e:
+                print(f"[ps_details] Failed to fetch props from {props_url}: {e}")
+            return None, False
+
+        props_results = await asyncio.gather(
+            *[_fetch_llama_props(ep, mid) for ep, mid in props_requests]
+        )
+
+        for model_dict, (n_ctx, is_sleeping) in zip(llama_models_pending, props_results):
+            if n_ctx is not None:
+                model_dict["context_length"] = n_ctx
+            if not is_sleeping:
+                models.append(model_dict)
+
     return JSONResponse(content={"models": models}, status_code=200)

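Note on the hunk above: `_fetch_llama_props` only reads two fields of the llama-server `/props` response, `default_generation_settings.n_ctx` and `is_sleeping`. A minimal sketch of that access pattern, with an illustrative payload (the real response carries many more keys, and the values here are made up):

    # Illustrative /props payload; only the two fields the helper reads are shown.
    props = {
        "default_generation_settings": {"n_ctx": 32768},  # copied into model_dict["context_length"]
        "is_sleeping": False,                              # sleeping models get unloaded and hidden from the listing
    }
    n_ctx = props.get("default_generation_settings", {}).get("n_ctx")
    is_sleeping = props.get("is_sleeping", False)

Because asyncio.gather returns results in the order of its awaitables, props_results[i] corresponds to llama_models_pending[i], which is what the final zip relies on.
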
@@ -2659,7 +2709,14 @@ async def openai_chat_completions_proxy(request: Request):
                     else orjson.dumps(chunk)
                 )
                 if chunk.choices:
-                    if chunk.choices[0].delta.content is not None:
+                    delta = chunk.choices[0].delta
+                    has_content = delta.content is not None
+                    has_reasoning = (
+                        getattr(delta, "reasoning_content", None) is not None
+                        or getattr(delta, "reasoning", None) is not None
+                    )
+                    has_tool_calls = getattr(delta, "tool_calls", None) is not None
+                    if has_content or has_reasoning or has_tool_calls:
                         yield f"data: {data}\n\n".encode("utf-8")
                 elif chunk.usage is not None:
                     # Forward the usage-only final chunk (e.g. from llama-server)

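Before this hunk, a streamed chat chunk whose delta carried only reasoning text or a tool-call fragment (with `content` still None) was silently dropped; the new predicate forwards it. A small illustrative check, using SimpleNamespace in place of the SDK's delta object (attribute names match the diff; the tool-call value is made up):

    from types import SimpleNamespace

    # A delta as it might arrive mid tool call: no visible content yet.
    delta = SimpleNamespace(
        content=None,
        reasoning_content=None,
        reasoning=None,
        tool_calls=[{"index": 0, "function": {"name": "lookup", "arguments": "{"}}],
    )

    has_content = delta.content is not None
    has_reasoning = (
        getattr(delta, "reasoning_content", None) is not None
        or getattr(delta, "reasoning", None) is not None
    )
    has_tool_calls = getattr(delta, "tool_calls", None) is not None

    # Old check (delta.content is not None) -> False: the chunk would have been dropped.
    # New check -> True: the tool-call fragment is forwarded to the client.
    assert has_content or has_reasoning or has_tool_calls

The completions hunk below applies the same idea to the legacy endpoint, checking choice.text, the reasoning fields, and finish_reason instead of the chat delta.
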
@@ -2792,7 +2849,13 @@ async def openai_completions_proxy(request: Request):
                     else orjson.dumps(chunk)
                 )
                 if chunk.choices:
-                    if chunk.choices[0].finish_reason == None:
+                    choice = chunk.choices[0]
+                    has_text = getattr(choice, "text", None) is not None
+                    has_reasoning = (
+                        getattr(choice, "reasoning_content", None) is not None
+                        or getattr(choice, "reasoning", None) is not None
+                    )
+                    if has_text or has_reasoning or choice.finish_reason is not None:
                         yield f"data: {data}\n\n".encode("utf-8")
                 elif chunk.usage is not None:
                     # Forward the usage-only final chunk (e.g. from llama-server)