From a74cc5be0f1413eb707e9e592b5342e4f8ceb6e2 Mon Sep 17 00:00:00 2001
From: alpha-nerd-nomyo
Date: Tue, 23 Sep 2025 12:51:37 +0200
Subject: [PATCH] fixing endpoint usage metrics

---
 router.py | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/router.py b/router.py
index d509ed8..94f84ca 100644
--- a/router.py
+++ b/router.py
@@ -276,15 +276,29 @@ def iso8601_ns():
 
 class rechunk:
     def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse:
+        if chunk.choices == [] and chunk.usage is not None:
+            return ollama.ChatResponse(
+                model=chunk.model,
+                created_at=iso8601_ns(),
+                done=True,
+                done_reason='stop',
+                total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000),
+                load_duration=100000,
+                prompt_eval_count=int(chunk.usage.prompt_tokens),
+                prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)),
+                eval_count=int(chunk.usage.completion_tokens),
+                eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000),
+                message={"role": "assistant"}
+            )
         with_thinking = chunk.choices[0] if chunk.choices[0] else None
         if stream == True:
             thinking = getattr(with_thinking.delta, "reasoning", None) if with_thinking else None
             role = chunk.choices[0].delta.role or "assistant"
-            content = chunk.choices[0].delta.content or ""
+            content = chunk.choices[0].delta.content or ''
         else:
             thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
             role = chunk.choices[0].message.role or "assistant"
-            content = chunk.choices[0].message.content or ""
+            content = chunk.choices[0].message.content or ''
         assistant_msg = ollama.Message(
             role=role,
             content=content,
@@ -295,8 +309,8 @@ class rechunk:
         rechunk = ollama.ChatResponse(
             model=chunk.model,
             created_at=iso8601_ns(),
-            done=True if chunk.choices[0].finish_reason is not None else False,
-            done_reason=chunk.choices[0].finish_reason,
+            done=True if chunk.usage is not None else False,
+            done_reason=chunk.choices[0].finish_reason, #if chunk.choices[0].finish_reason is not None else None,
             total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
             load_duration=100000,
             prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
@@ -312,15 +326,15 @@ class rechunk:
         rechunk = ollama.GenerateResponse(
             model=chunk.model,
             created_at=iso8601_ns(),
-            done=False, #True if chunk.choices[0].finish_reason is not None else False,
+            done=True if chunk.usage is not None else False,
             done_reason=chunk.choices[0].finish_reason,
             total_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0,
             load_duration=10000,
-            prompt_eval_count=0,
+            prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
             prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0,
-            eval_count=0,
+            eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
             eval_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0,
-            response=chunk.choices[0].text or "",
+            response=chunk.choices[0].text or '',
             thinking=thinking)
         return rechunk
 
@@ -624,6 +638,7 @@ async def chat_proxy(request: Request):
         optional_params = {
             "tools": tools,
             "stream": stream,
+            "stream_options": {"include_usage": True} if stream is not None else None,
             "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
             "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
             "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
@@ -634,7 +649,6 @@ async def chat_proxy(request: Request):
             "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
         }
         params.update({k: v for k, v in optional_params.items() if v is not None})
-        print(params)
         oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
     else:
         client = ollama.AsyncClient(host=endpoint)
@@ -650,11 +664,9 @@ async def chat_proxy(request: Request):
         async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive)
         if stream == True:
             async for chunk in async_gen:
-                print(chunk)
                 if is_openai_endpoint:
                     chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts)
                 # `chunk` can be a dict or a pydantic model – dump to JSON safely
-                print(chunk)
                 if hasattr(chunk, "model_dump_json"):
                     json_line = chunk.model_dump_json()
                 else:
@@ -1315,9 +1327,10 @@ async def openai_chat_completions_proxy(request: Request):
                     if hasattr(chunk, "model_dump_json")
                     else json.dumps(chunk)
                 )
-                yield f"data: {data}\n\n".encode("utf-8")
+                if chunk.choices[0].delta.content is not None:
+                    yield f"data: {data}\n\n".encode("utf-8")
             # Final DONE event
-            yield b"data: [DONE]\n\n"
+            #yield b"data: [DONE]\n\n"
         else:
             json_line = (
                 async_gen.model_dump_json()
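
Reviewer note: with stream_options={"include_usage": True} set on a streamed
chat completion, the OpenAI API emits one final chunk whose choices list is
empty and whose usage field carries the token counts; that is the chunk the
new early-return branch at the top of openai_chat_completion2ollama converts
into a terminal Ollama response. Below is a minimal sketch of that upstream
behavior using the stock openai client (the model name and the
environment-supplied API key are placeholders, not part of this patch):

    import asyncio
    import openai

    async def main():
        # Assumes OPENAI_API_KEY is set in the environment.
        client = openai.AsyncOpenAI()
        stream = await client.chat.completions.create(
            model="gpt-4o-mini",  # placeholder model name
            messages=[{"role": "user", "content": "Say hi"}],
            stream=True,
            stream_options={"include_usage": True},
        )
        async for chunk in stream:
            if not chunk.choices and chunk.usage is not None:
                # Usage-only terminator chunk: no delta, just token counts.
                print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)

    asyncio.run(main())

OpenAI-compatible servers vary in whether they honor stream_options; a server
that ignores it never produces the usage-only chunk, and the converted stream
then falls back to the zeroed-metrics branches (the "else 0" arms above).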