From aeca77c1a161dd0cbdaefd5c4a87368ea8ba66ec Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Sun, 21 Sep 2025 16:33:43 +0200 Subject: [PATCH 1/6] formatting, condensing rechunk --- router.py | 92 ++++++++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/router.py b/router.py index d67a93b..01e5b9a 100644 --- a/router.py +++ b/router.py @@ -277,48 +277,44 @@ def iso8601_ns(): class rechunk: def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float): if stream == True: - assistant_msg = ollama.Message( - role=chunk.choices[0].delta.role or "assistant", - content=chunk.choices[0].delta.content, - thinking=None, - images=None, - tool_name=None, - tool_calls=None - ) + role = chunk.choices[0].delta.role or "assistant" + content = chunk.choices[0].delta.content else: - assistant_msg = ollama.Message( - role=chunk.choices[0].message.role or "assistant", - content=chunk.choices[0].message.content, - thinking=None, - images=None, - tool_name=None, - tool_calls=None - ) - rechunk = ollama.ChatResponse(model=chunk.model, - created_at=iso8601_ns(), - done_reason=chunk.choices[0].finish_reason, - load_duration=100000, - prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), - eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), - prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), - eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), - total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), - message=assistant_msg) + role = chunk.choices[0].message.role or "assistant" + content = chunk.choices[0].message.content + assistant_msg = ollama.Message( + role=role, + content=content, + thinking=None, + images=None, + tool_name=None, + tool_calls=None) + rechunk = ollama.ChatResponse( + model=chunk.model, + created_at=iso8601_ns(), + done_reason=chunk.choices[0].finish_reason, + load_duration=100000, + prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), + eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), + prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), + eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), + total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), + message=assistant_msg) return rechunk def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float): with_thinking = chunk.choices[0] if chunk.choices[0] else None thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None - rechunk = ollama.GenerateResponse(model=chunk.model, - created_at=iso8601_ns(), - load_duration=10000, - done_reason=chunk.choices[0].finish_reason, - done=None, #True if chunk.choices[0].finish_reason is not None else False, - total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - thinking=thinking, - response=chunk.choices[0].text - ) + rechunk = ollama.GenerateResponse( 
+ model=chunk.model, + created_at=iso8601_ns(), + load_duration=10000, + done_reason=chunk.choices[0].finish_reason, + done=None, #True if chunk.choices[0].finish_reason is not None else False, + total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + thinking=thinking, + response=chunk.choices[0].text) return rechunk def openai_embeddings2ollama(chunk: dict): @@ -326,18 +322,18 @@ class rechunk: return rechunk def openai_embed2ollama(chunk: dict, model: str): - rechunk = ollama.EmbedResponse(model=model, - created_at=iso8601_ns(), - done=None, - done_reason=None, - total_duration=None, - load_duration=None, - prompt_eval_count=None, - prompt_eval_duration=None, - eval_count=None, - eval_duration=None, - embeddings=[chunk.data[0].embedding] - ) + rechunk = ollama.EmbedResponse( + model=model, + created_at=iso8601_ns(), + done=None, + done_reason=None, + total_duration=None, + load_duration=None, + prompt_eval_count=None, + prompt_eval_duration=None, + eval_count=None, + eval_duration=None, + embeddings=[chunk.data[0].embedding]) return rechunk # ------------------------------------------------------------------ # SSE Helpser From 18d2fca0276324cdcade9e4a8b9de56a55124a82 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Mon, 22 Sep 2025 09:30:27 +0200 Subject: [PATCH 2/6] formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate --- router.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/router.py b/router.py index 01e5b9a..36bb05d 100644 --- a/router.py +++ b/router.py @@ -275,7 +275,9 @@ def iso8601_ns(): return iso8601_with_ns class rechunk: - def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float): + def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse: + with_thinking = chunk.choices[0] if chunk.choices[0] else None + thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None if stream == True: role = chunk.choices[0].delta.role or "assistant" content = chunk.choices[0].delta.content @@ -285,43 +287,47 @@ class rechunk: assistant_msg = ollama.Message( role=role, content=content, - thinking=None, + thinking=thinking, images=None, tool_name=None, tool_calls=None) rechunk = ollama.ChatResponse( model=chunk.model, created_at=iso8601_ns(), + done=False, #True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, + total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), load_duration=100000, + prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), - prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), - total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), message=assistant_msg) return rechunk - def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float): + def 
openai_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.GenerateResponse: with_thinking = chunk.choices[0] if chunk.choices[0] else None thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None rechunk = ollama.GenerateResponse( model=chunk.model, created_at=iso8601_ns(), - load_duration=10000, + done=False, #True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, - done=None, #True if chunk.choices[0].finish_reason is not None else False, total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + load_duration=10000, + prompt_eval_count=None, + prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), + eval_count=None, eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - thinking=thinking, - response=chunk.choices[0].text) + response=chunk.choices[0].text, + thinking=thinking) return rechunk - def openai_embeddings2ollama(chunk: dict): + def openai_embeddings2ollama(chunk: dict) -> ollama.EmbeddingsResponse: rechunk = ollama.EmbeddingsResponse(embedding=chunk.data[0].embedding) return rechunk - def openai_embed2ollama(chunk: dict, model: str): + def openai_embed2ollama(chunk: dict, model: str) -> ollama.EmbedResponse: rechunk = ollama.EmbedResponse( model=model, created_at=iso8601_ns(), @@ -538,7 +544,7 @@ async def proxy(request: Request): else: if is_openai_endpoint: response = rechunk.openai_completion2ollama(async_gen, stream, start_ts) - response = json.dumps(response) + response = response.model_dump_json() else: response = async_gen.model_dump_json() json_line = ( @@ -570,7 +576,7 @@ async def chat_proxy(request: Request): try: body_bytes = await request.body() payload = json.loads(body_bytes.decode("utf-8")) - + print(payload) model = payload.get("model") messages = payload.get("messages") tools = payload.get("tools") @@ -628,6 +634,7 @@ async def chat_proxy(request: Request): async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive) if stream == True: async for chunk in async_gen: + print(chunk) if is_openai_endpoint: chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts) # `chunk` can be a dict or a pydantic model – dump to JSON safely @@ -639,13 +646,13 @@ async def chat_proxy(request: Request): else: if is_openai_endpoint: response = rechunk.openai_chat_completion2ollama(async_gen, stream, start_ts) - response = json.dumps(response) + response = response.model_dump_json() else: response = async_gen.model_dump_json() json_line = ( response - if hasattr(async_gen, "model_dump_json") - else json.dumps(async_gen) + if hasattr(response, "model_dump_json") + else json.dumps(response) ) yield json_line.encode("utf-8") + b"\n" From c43dc4139feab0a968482a95dd9a5c0047a2347b Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Mon, 22 Sep 2025 14:04:19 +0200 Subject: [PATCH 3/6] adding optional parameters in ollama to openai translation --- router.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/router.py b/router.py index 36bb05d..f8c7796 100644 --- a/router.py +++ b/router.py @@ -294,7 +294,7 @@ class rechunk: rechunk = ollama.ChatResponse( model=chunk.model, created_at=iso8601_ns(), - done=False, #True if 
chunk.choices[0].finish_reason is not None else False,
+            done=True if chunk.choices[0].finish_reason is not None else False,
             done_reason=chunk.choices[0].finish_reason,
             total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None),
             load_duration=100000,
@@ -341,6 +341,7 @@ class rechunk:
             eval_duration=None,
             embeddings=[chunk.data[0].embedding])
         return rechunk
+
 # ------------------------------------------------------------------
 # SSE Helpser
 # ------------------------------------------------------------------
@@ -490,7 +491,7 @@ async def proxy(request: Request):
         images = payload.get("images")
         options = payload.get("options")
         keep_alive = payload.get("keep_alive")
-        
+
         if not model:
             raise HTTPException(
                 status_code=400, detail="Missing required field 'model'"
@@ -516,8 +517,15 @@ async def proxy(request: Request):
 
         optional_params = {
             "stream": stream,
-            }
-        
+            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
+            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
+            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
+            "seed": options.get("seed") if options and "seed" in options else None,
+            "stop": options.get("stop") if options and "stop" in options else None,
+            "top_p": options.get("top_p") if options and "top_p" in options else None,
+            "temperature": options.get("temperature") if options and "temperature" in options else None,
+            "suffix": suffix,
+            }
         params.update({k: v for k, v in optional_params.items() if v is not None})
         oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
     else:
@@ -576,7 +584,7 @@ async def chat_proxy(request: Request):
     try:
         body_bytes = await request.body()
         payload = json.loads(body_bytes.decode("utf-8"))
-        print(payload)
+
         model = payload.get("model")
         messages = payload.get("messages")
         tools = payload.get("tools")
@@ -586,7 +594,7 @@ async def chat_proxy(request: Request):
         options = payload.get("options")
         keep_alive = payload.get("keep_alive")
         options = payload.get("options")
-        
+
         if not model:
             raise HTTPException(
                 status_code=400, detail="Missing required field 'model'"
@@ -612,12 +620,19 @@ async def chat_proxy(request: Request):
         params = {
             "messages": messages,
             "model": model,
-            }
-        
+            }
         optional_params = {
             "tools": tools,
             "stream": stream,
-            }
+            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
+            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
+            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
+            "seed": options.get("seed") if options and "seed" in options else None,
+            "stop": options.get("stop") if options and "stop" in options else None,
+            "top_p": options.get("top_p") if options and "top_p" in options else None,
+            "temperature": options.get("temperature") if options and "temperature" in options else None,
+            "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
+            }
         params.update({k: v for k, v in optional_params.items() if v is not None})
         oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
     else:
@@ -638,6 +653,7 @@ async def chat_proxy(request: Request):
             if is_openai_endpoint:
                 chunk = rechunk.openai_chat_completion2ollama(chunk, 
stream, start_ts) # `chunk` can be a dict or a pydantic model – dump to JSON safely + print(chunk) if hasattr(chunk, "model_dump_json"): json_line = chunk.model_dump_json() else: From 19df75afa90528a0ca334d440c848285d7ba1848 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Mon, 22 Sep 2025 19:01:14 +0200 Subject: [PATCH 4/6] fixing types and params --- router.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/router.py b/router.py index f8c7796..d509ed8 100644 --- a/router.py +++ b/router.py @@ -277,13 +277,14 @@ def iso8601_ns(): class rechunk: def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse: with_thinking = chunk.choices[0] if chunk.choices[0] else None - thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None if stream == True: + thinking = getattr(with_thinking.delta, "reasoning", None) if with_thinking else None role = chunk.choices[0].delta.role or "assistant" - content = chunk.choices[0].delta.content + content = chunk.choices[0].delta.content or "" else: + thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None role = chunk.choices[0].message.role or "assistant" - content = chunk.choices[0].message.content + content = chunk.choices[0].message.content or "" assistant_msg = ollama.Message( role=role, content=content, @@ -296,12 +297,12 @@ class rechunk: created_at=iso8601_ns(), done=True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, - total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), + total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0, load_duration=100000, - prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), - prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), - eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), - eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), + prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0, + prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0, + eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0, + eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0, message=assistant_msg) return rechunk @@ -313,13 +314,13 @@ class rechunk: created_at=iso8601_ns(), done=False, #True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, - total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + total_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0, load_duration=10000, - prompt_eval_count=None, - prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), - eval_count=None, - eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - response=chunk.choices[0].text, + 
prompt_eval_count=0, + prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0, + eval_count=0, + eval_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0, + response=chunk.choices[0].text or "", thinking=thinking) return rechunk @@ -591,7 +592,6 @@ async def chat_proxy(request: Request): stream = payload.get("stream") think = payload.get("think") _format = payload.get("format") - options = payload.get("options") keep_alive = payload.get("keep_alive") options = payload.get("options") @@ -634,6 +634,7 @@ async def chat_proxy(request: Request): "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None } params.update({k: v for k, v in optional_params.items() if v is not None}) + print(params) oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint]) else: client = ollama.AsyncClient(host=endpoint) @@ -667,8 +668,8 @@ async def chat_proxy(request: Request): response = async_gen.model_dump_json() json_line = ( response - if hasattr(response, "model_dump_json") - else json.dumps(response) + if hasattr(async_gen, "model_dump_json") + else json.dumps(async_gen) ) yield json_line.encode("utf-8") + b"\n" @@ -677,9 +678,10 @@ async def chat_proxy(request: Request): await decrement_usage(endpoint, model) # 4. Return a StreamingResponse backed by the generator + media_type = "application/x-ndjson" if stream else "application/json" return StreamingResponse( stream_chat_response(), - media_type="application/json", + media_type=media_type, ) # ------------------------------------------------------------- From a74cc5be0f1413eb707e9e592b5342e4f8ceb6e2 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Tue, 23 Sep 2025 12:51:37 +0200 Subject: [PATCH 5/6] fixing endpoint usage metrics --- router.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/router.py b/router.py index d509ed8..94f84ca 100644 --- a/router.py +++ b/router.py @@ -276,15 +276,29 @@ def iso8601_ns(): class rechunk: def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse: + if chunk.choices == [] and chunk.usage is not None: + return ollama.ChatResponse( + model=chunk.model, + created_at=iso8601_ns(), + done=True, + done_reason='stop', + total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000), + load_duration=100000, + prompt_eval_count=int(chunk.usage.prompt_tokens), + prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)), + eval_count=int(chunk.usage.completion_tokens), + eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000), + message={"role": "assistant"} + ) with_thinking = chunk.choices[0] if chunk.choices[0] else None if stream == True: thinking = getattr(with_thinking.delta, "reasoning", None) if with_thinking else None role = chunk.choices[0].delta.role or "assistant" - content = chunk.choices[0].delta.content or "" + content = chunk.choices[0].delta.content or '' else: thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None role = chunk.choices[0].message.role or "assistant" - content = chunk.choices[0].message.content or "" + content = chunk.choices[0].message.content or '' assistant_msg = ollama.Message( role=role, content=content, @@ 
-295,8 +309,8 @@ class rechunk: rechunk = ollama.ChatResponse( model=chunk.model, created_at=iso8601_ns(), - done=True if chunk.choices[0].finish_reason is not None else False, - done_reason=chunk.choices[0].finish_reason, + done=True if chunk.usage is not None else False, + done_reason=chunk.choices[0].finish_reason, #if chunk.choices[0].finish_reason is not None else None, total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0, load_duration=100000, prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0, @@ -312,15 +326,15 @@ class rechunk: rechunk = ollama.GenerateResponse( model=chunk.model, created_at=iso8601_ns(), - done=False, #True if chunk.choices[0].finish_reason is not None else False, + done=True if chunk.usage is not None else False, done_reason=chunk.choices[0].finish_reason, total_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0, load_duration=10000, - prompt_eval_count=0, + prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0, prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0, - eval_count=0, + eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0, eval_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0, - response=chunk.choices[0].text or "", + response=chunk.choices[0].text or '', thinking=thinking) return rechunk @@ -624,6 +638,7 @@ async def chat_proxy(request: Request): optional_params = { "tools": tools, "stream": stream, + "stream_options": {"include_usage": True} if stream is not None else None, "max_tokens": options.get("num_predict") if options and "num_predict" in options else None, "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None, "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None, @@ -634,7 +649,6 @@ async def chat_proxy(request: Request): "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None } params.update({k: v for k, v in optional_params.items() if v is not None}) - print(params) oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint]) else: client = ollama.AsyncClient(host=endpoint) @@ -650,11 +664,9 @@ async def chat_proxy(request: Request): async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive) if stream == True: async for chunk in async_gen: - print(chunk) if is_openai_endpoint: chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts) # `chunk` can be a dict or a pydantic model – dump to JSON safely - print(chunk) if hasattr(chunk, "model_dump_json"): json_line = chunk.model_dump_json() else: @@ -1315,9 +1327,10 @@ async def openai_chat_completions_proxy(request: Request): if hasattr(chunk, "model_dump_json") else json.dumps(chunk) ) - yield f"data: {data}\n\n".encode("utf-8") + if chunk.choices[0].delta.content is not None: + yield f"data: {data}\n\n".encode("utf-8") # Final DONE event - yield b"data: [DONE]\n\n" + #yield b"data: [DONE]\n\n" else: json_line = ( async_gen.model_dump_json() From fcfabbe9262336e1a6e2eb72b7ea30119f932e15 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: 
Tue, 23 Sep 2025 13:08:17 +0200
Subject: [PATCH 6/6] mitigating div by zero due to google genai sending completion_tokens=0 in the first chunk

---
 router.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/router.py b/router.py
index 94f84ca..2fa59ef 100644
--- a/router.py
+++ b/router.py
@@ -314,7 +314,7 @@ class rechunk:
             total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
             load_duration=100000,
             prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
-            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0,
+            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
             eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
             eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
             message=assistant_msg)
@@ -331,7 +331,7 @@ class rechunk:
             total_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0,
             load_duration=10000,
             prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
-            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0,
+            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
             eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
             eval_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0,
             response=chunk.choices[0].text or '',
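
A note on PATCH 6/6: Google GenAI streams its first chunk with
usage.completion_tokens == 0, so the (prompt_tokens / completion_tokens / 100)
factor in the prompt_eval_duration estimate divided by zero. Below is a minimal
sketch of the guarded estimate, factored into a standalone helper for testing;
the helper name and the SimpleNamespace stand-ins for chunk.usage are
illustrative, not part of router.py:

    import time
    from types import SimpleNamespace

    def estimate_prompt_eval_duration_ns(usage, start_ts: float) -> int:
        # Mirrors the patched expression: elapsed wall time in nanoseconds,
        # scaled by (prompt_tokens / completion_tokens / 100). Returns 0 when
        # usage is missing or completion_tokens is 0 (the first GenAI chunk).
        if usage is None or usage.completion_tokens == 0:
            return 0
        elapsed_ns = int((time.perf_counter() - start_ts) * 1_000_000_000)
        return int(elapsed_ns * (usage.prompt_tokens / usage.completion_tokens / 100))

    start_ts = time.perf_counter()
    first = SimpleNamespace(prompt_tokens=42, completion_tokens=0)
    later = SimpleNamespace(prompt_tokens=42, completion_tokens=128)
    assert estimate_prompt_eval_duration_ns(first, start_ts) == 0  # no ZeroDivisionError
    assert estimate_prompt_eval_duration_ns(later, start_ts) >= 0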
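
A note on PATCH 3/6: the translation layer collects Ollama options into a dict
of OpenAI parameter candidates and strips unset keys before the client call,
so only caller-supplied options reach the OpenAI SDK. A condensed sketch of
that pattern (translate_options is a hypothetical helper, not a function in
router.py; the key mapping follows the patch):

    def translate_options(options: dict | None, stream: bool | None) -> dict:
        # Map Ollama option names onto OpenAI chat-completions parameters,
        # dropping anything the caller did not supply.
        options = options or {}
        candidates = {
            "stream": stream,
            "max_tokens": options.get("num_predict"),
            "frequency_penalty": options.get("frequency_penalty"),
            "presence_penalty": options.get("presence_penalty"),
            "seed": options.get("seed"),
            "stop": options.get("stop"),
            "top_p": options.get("top_p"),
            "temperature": options.get("temperature"),
        }
        return {k: v for k, v in candidates.items() if v is not None}

    params = {"model": "my-model", "messages": [{"role": "user", "content": "hi"}]}
    params.update(translate_options({"num_predict": 64, "temperature": 0.2}, stream=True))
    # -> params now carries stream=True, max_tokens=64, temperature=0.2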