From aeca77c1a161dd0cbdaefd5c4a87368ea8ba66ec Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Sun, 21 Sep 2025 16:33:43 +0200 Subject: [PATCH 1/6] formatting, condensing rechunk --- router.py | 92 ++++++++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/router.py b/router.py index d67a93b..01e5b9a 100644 --- a/router.py +++ b/router.py @@ -277,48 +277,44 @@ def iso8601_ns(): class rechunk: def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float): if stream == True: - assistant_msg = ollama.Message( - role=chunk.choices[0].delta.role or "assistant", - content=chunk.choices[0].delta.content, - thinking=None, - images=None, - tool_name=None, - tool_calls=None - ) + role = chunk.choices[0].delta.role or "assistant" + content = chunk.choices[0].delta.content else: - assistant_msg = ollama.Message( - role=chunk.choices[0].message.role or "assistant", - content=chunk.choices[0].message.content, - thinking=None, - images=None, - tool_name=None, - tool_calls=None - ) - rechunk = ollama.ChatResponse(model=chunk.model, - created_at=iso8601_ns(), - done_reason=chunk.choices[0].finish_reason, - load_duration=100000, - prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), - eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), - prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), - eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), - total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), - message=assistant_msg) + role = chunk.choices[0].message.role or "assistant" + content = chunk.choices[0].message.content + assistant_msg = ollama.Message( + role=role, + content=content, + thinking=None, + images=None, + tool_name=None, + tool_calls=None) + rechunk = ollama.ChatResponse( + model=chunk.model, + created_at=iso8601_ns(), + done_reason=chunk.choices[0].finish_reason, + load_duration=100000, + prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), + eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), + prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), + eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), + total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), + message=assistant_msg) return rechunk def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float): with_thinking = chunk.choices[0] if chunk.choices[0] else None thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None - rechunk = ollama.GenerateResponse(model=chunk.model, - created_at=iso8601_ns(), - load_duration=10000, - done_reason=chunk.choices[0].finish_reason, - done=None, #True if chunk.choices[0].finish_reason is not None else False, - total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - thinking=thinking, - response=chunk.choices[0].text - ) + rechunk = ollama.GenerateResponse( 
+ model=chunk.model, + created_at=iso8601_ns(), + load_duration=10000, + done_reason=chunk.choices[0].finish_reason, + done=None, #True if chunk.choices[0].finish_reason is not None else False, + total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + thinking=thinking, + response=chunk.choices[0].text) return rechunk def openai_embeddings2ollama(chunk: dict): @@ -326,18 +322,18 @@ class rechunk: return rechunk def openai_embed2ollama(chunk: dict, model: str): - rechunk = ollama.EmbedResponse(model=model, - created_at=iso8601_ns(), - done=None, - done_reason=None, - total_duration=None, - load_duration=None, - prompt_eval_count=None, - prompt_eval_duration=None, - eval_count=None, - eval_duration=None, - embeddings=[chunk.data[0].embedding] - ) + rechunk = ollama.EmbedResponse( + model=model, + created_at=iso8601_ns(), + done=None, + done_reason=None, + total_duration=None, + load_duration=None, + prompt_eval_count=None, + prompt_eval_duration=None, + eval_count=None, + eval_duration=None, + embeddings=[chunk.data[0].embedding]) return rechunk # ------------------------------------------------------------------ # SSE Helpser From 18d2fca0276324cdcade9e4a8b9de56a55124a82 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Mon, 22 Sep 2025 09:30:27 +0200 Subject: [PATCH 2/6] formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate --- router.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/router.py b/router.py index 01e5b9a..36bb05d 100644 --- a/router.py +++ b/router.py @@ -275,7 +275,9 @@ def iso8601_ns(): return iso8601_with_ns class rechunk: - def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float): + def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse: + with_thinking = chunk.choices[0] if chunk.choices[0] else None + thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None if stream == True: role = chunk.choices[0].delta.role or "assistant" content = chunk.choices[0].delta.content @@ -285,43 +287,47 @@ class rechunk: assistant_msg = ollama.Message( role=role, content=content, - thinking=None, + thinking=thinking, images=None, tool_name=None, tool_calls=None) rechunk = ollama.ChatResponse( model=chunk.model, created_at=iso8601_ns(), + done=False, #True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, + total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), load_duration=100000, + prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), - prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), - total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), message=assistant_msg) return rechunk - def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float): + def 
openai_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.GenerateResponse: with_thinking = chunk.choices[0] if chunk.choices[0] else None thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None rechunk = ollama.GenerateResponse( model=chunk.model, created_at=iso8601_ns(), - load_duration=10000, + done=False, #True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, - done=None, #True if chunk.choices[0].finish_reason is not None else False, total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + load_duration=10000, + prompt_eval_count=None, + prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), + eval_count=None, eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - thinking=thinking, - response=chunk.choices[0].text) + response=chunk.choices[0].text, + thinking=thinking) return rechunk - def openai_embeddings2ollama(chunk: dict): + def openai_embeddings2ollama(chunk: dict) -> ollama.EmbeddingsResponse: rechunk = ollama.EmbeddingsResponse(embedding=chunk.data[0].embedding) return rechunk - def openai_embed2ollama(chunk: dict, model: str): + def openai_embed2ollama(chunk: dict, model: str) -> ollama.EmbedResponse: rechunk = ollama.EmbedResponse( model=model, created_at=iso8601_ns(), @@ -538,7 +544,7 @@ async def proxy(request: Request): else: if is_openai_endpoint: response = rechunk.openai_completion2ollama(async_gen, stream, start_ts) - response = json.dumps(response) + response = response.model_dump_json() else: response = async_gen.model_dump_json() json_line = ( @@ -570,7 +576,7 @@ async def chat_proxy(request: Request): try: body_bytes = await request.body() payload = json.loads(body_bytes.decode("utf-8")) - + print(payload) model = payload.get("model") messages = payload.get("messages") tools = payload.get("tools") @@ -628,6 +634,7 @@ async def chat_proxy(request: Request): async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive) if stream == True: async for chunk in async_gen: + print(chunk) if is_openai_endpoint: chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts) # `chunk` can be a dict or a pydantic model – dump to JSON safely @@ -639,13 +646,13 @@ async def chat_proxy(request: Request): else: if is_openai_endpoint: response = rechunk.openai_chat_completion2ollama(async_gen, stream, start_ts) - response = json.dumps(response) + response = response.model_dump_json() else: response = async_gen.model_dump_json() json_line = ( response - if hasattr(async_gen, "model_dump_json") - else json.dumps(async_gen) + if hasattr(response, "model_dump_json") + else json.dumps(response) ) yield json_line.encode("utf-8") + b"\n" From c43dc4139feab0a968482a95dd9a5c0047a2347b Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Mon, 22 Sep 2025 14:04:19 +0200 Subject: [PATCH 3/6] adding optional parameters in ollama to openai translation --- router.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/router.py b/router.py index 36bb05d..f8c7796 100644 --- a/router.py +++ b/router.py @@ -294,7 +294,7 @@ class rechunk: rechunk = ollama.ChatResponse( model=chunk.model, created_at=iso8601_ns(), - done=False, #True if 
chunk.choices[0].finish_reason is not None else False,
+            done=True if chunk.choices[0].finish_reason is not None else False,
             done_reason=chunk.choices[0].finish_reason,
             total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None),
             load_duration=100000,
@@ -341,6 +341,7 @@ class rechunk:
             eval_duration=None,
             embeddings=[chunk.data[0].embedding])
         return rechunk
+
 # ------------------------------------------------------------------
 # SSE Helpser
 # ------------------------------------------------------------------
@@ -490,7 +491,7 @@ async def proxy(request: Request):
         images = payload.get("images")
         options = payload.get("options")
         keep_alive = payload.get("keep_alive")
-        
+
         if not model:
             raise HTTPException(
                 status_code=400, detail="Missing required field 'model'"
@@ -516,8 +517,15 @@ async def proxy(request: Request):
 
         optional_params = {
             "stream": stream,
-            }
-        
+            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
+            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
+            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
+            "seed": options.get("seed") if options and "seed" in options else None,
+            "stop": options.get("stop") if options and "stop" in options else None,
+            "top_p": options.get("top_p") if options and "top_p" in options else None,
+            "temperature": options.get("temperature") if options and "temperature" in options else None,
+            "suffix": suffix,
+            }
         params.update({k: v for k, v in optional_params.items() if v is not None})
         oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
     else:
@@ -576,7 +584,7 @@ async def chat_proxy(request: Request):
     try:
         body_bytes = await request.body()
         payload = json.loads(body_bytes.decode("utf-8"))
-        print(payload)
+
         model = payload.get("model")
         messages = payload.get("messages")
         tools = payload.get("tools")
@@ -586,7 +594,7 @@ async def chat_proxy(request: Request):
         options = payload.get("options")
         keep_alive = payload.get("keep_alive")
         options = payload.get("options")
-        
+
         if not model:
             raise HTTPException(
                 status_code=400, detail="Missing required field 'model'"
@@ -612,12 +620,19 @@ async def chat_proxy(request: Request):
         params = {
             "messages": messages,
             "model": model,
-            }
-        
+            }
         optional_params = {
             "tools": tools,
             "stream": stream,
-            }
+            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
+            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
+            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
+            "seed": options.get("seed") if options and "seed" in options else None,
+            "stop": options.get("stop") if options and "stop" in options else None,
+            "top_p": options.get("top_p") if options and "top_p" in options else None,
+            "temperature": options.get("temperature") if options and "temperature" in options else None,
+            "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
+            }
         params.update({k: v for k, v in optional_params.items() if v is not None})
         oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
     else:
@@ -638,6 +653,7 @@ async def chat_proxy(request: Request):
             if is_openai_endpoint:
                 chunk = rechunk.openai_chat_completion2ollama(chunk, 
stream, start_ts) # `chunk` can be a dict or a pydantic model – dump to JSON safely + print(chunk) if hasattr(chunk, "model_dump_json"): json_line = chunk.model_dump_json() else: From 19df75afa90528a0ca334d440c848285d7ba1848 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Mon, 22 Sep 2025 19:01:14 +0200 Subject: [PATCH 4/6] fixing types and params --- router.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/router.py b/router.py index f8c7796..d509ed8 100644 --- a/router.py +++ b/router.py @@ -277,13 +277,14 @@ def iso8601_ns(): class rechunk: def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse: with_thinking = chunk.choices[0] if chunk.choices[0] else None - thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None if stream == True: + thinking = getattr(with_thinking.delta, "reasoning", None) if with_thinking else None role = chunk.choices[0].delta.role or "assistant" - content = chunk.choices[0].delta.content + content = chunk.choices[0].delta.content or "" else: + thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None role = chunk.choices[0].message.role or "assistant" - content = chunk.choices[0].message.content + content = chunk.choices[0].message.content or "" assistant_msg = ollama.Message( role=role, content=content, @@ -296,12 +297,12 @@ class rechunk: created_at=iso8601_ns(), done=True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, - total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), + total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0, load_duration=100000, - prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), - prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), - eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), - eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), + prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0, + prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0, + eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0, + eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0, message=assistant_msg) return rechunk @@ -313,13 +314,13 @@ class rechunk: created_at=iso8601_ns(), done=False, #True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, - total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + total_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0, load_duration=10000, - prompt_eval_count=None, - prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), - eval_count=None, - eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - response=chunk.choices[0].text, + 
prompt_eval_count=0, + prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0, + eval_count=0, + eval_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0, + response=chunk.choices[0].text or "", thinking=thinking) return rechunk @@ -591,7 +592,6 @@ async def chat_proxy(request: Request): stream = payload.get("stream") think = payload.get("think") _format = payload.get("format") - options = payload.get("options") keep_alive = payload.get("keep_alive") options = payload.get("options") @@ -634,6 +634,7 @@ async def chat_proxy(request: Request): "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None } params.update({k: v for k, v in optional_params.items() if v is not None}) + print(params) oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint]) else: client = ollama.AsyncClient(host=endpoint) @@ -667,8 +668,8 @@ async def chat_proxy(request: Request): response = async_gen.model_dump_json() json_line = ( response - if hasattr(response, "model_dump_json") - else json.dumps(response) + if hasattr(async_gen, "model_dump_json") + else json.dumps(async_gen) ) yield json_line.encode("utf-8") + b"\n" @@ -677,9 +678,10 @@ async def chat_proxy(request: Request): await decrement_usage(endpoint, model) # 4. Return a StreamingResponse backed by the generator + media_type = "application/x-ndjson" if stream else "application/json" return StreamingResponse( stream_chat_response(), - media_type="application/json", + media_type=media_type, ) # ------------------------------------------------------------- From a74cc5be0f1413eb707e9e592b5342e4f8ceb6e2 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Tue, 23 Sep 2025 12:51:37 +0200 Subject: [PATCH 5/6] fixing endpoint usage metrics --- router.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/router.py b/router.py index d509ed8..94f84ca 100644 --- a/router.py +++ b/router.py @@ -276,15 +276,29 @@ def iso8601_ns(): class rechunk: def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse: + if chunk.choices == [] and chunk.usage is not None: + return ollama.ChatResponse( + model=chunk.model, + created_at=iso8601_ns(), + done=True, + done_reason='stop', + total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000), + load_duration=100000, + prompt_eval_count=int(chunk.usage.prompt_tokens), + prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)), + eval_count=int(chunk.usage.completion_tokens), + eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000), + message={"role": "assistant"} + ) with_thinking = chunk.choices[0] if chunk.choices[0] else None if stream == True: thinking = getattr(with_thinking.delta, "reasoning", None) if with_thinking else None role = chunk.choices[0].delta.role or "assistant" - content = chunk.choices[0].delta.content or "" + content = chunk.choices[0].delta.content or '' else: thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None role = chunk.choices[0].message.role or "assistant" - content = chunk.choices[0].message.content or "" + content = chunk.choices[0].message.content or '' assistant_msg = ollama.Message( role=role, content=content, @@ 
-295,8 +309,8 @@ class rechunk: rechunk = ollama.ChatResponse( model=chunk.model, created_at=iso8601_ns(), - done=True if chunk.choices[0].finish_reason is not None else False, - done_reason=chunk.choices[0].finish_reason, + done=True if chunk.usage is not None else False, + done_reason=chunk.choices[0].finish_reason, #if chunk.choices[0].finish_reason is not None else None, total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0, load_duration=100000, prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0, @@ -312,15 +326,15 @@ class rechunk: rechunk = ollama.GenerateResponse( model=chunk.model, created_at=iso8601_ns(), - done=False, #True if chunk.choices[0].finish_reason is not None else False, + done=True if chunk.usage is not None else False, done_reason=chunk.choices[0].finish_reason, total_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0, load_duration=10000, - prompt_eval_count=0, + prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0, prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0, - eval_count=0, + eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0, eval_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0, - response=chunk.choices[0].text or "", + response=chunk.choices[0].text or '', thinking=thinking) return rechunk @@ -624,6 +638,7 @@ async def chat_proxy(request: Request): optional_params = { "tools": tools, "stream": stream, + "stream_options": {"include_usage": True} if stream is not None else None, "max_tokens": options.get("num_predict") if options and "num_predict" in options else None, "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None, "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None, @@ -634,7 +649,6 @@ async def chat_proxy(request: Request): "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None } params.update({k: v for k, v in optional_params.items() if v is not None}) - print(params) oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint]) else: client = ollama.AsyncClient(host=endpoint) @@ -650,11 +664,9 @@ async def chat_proxy(request: Request): async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive) if stream == True: async for chunk in async_gen: - print(chunk) if is_openai_endpoint: chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts) # `chunk` can be a dict or a pydantic model – dump to JSON safely - print(chunk) if hasattr(chunk, "model_dump_json"): json_line = chunk.model_dump_json() else: @@ -1315,9 +1327,10 @@ async def openai_chat_completions_proxy(request: Request): if hasattr(chunk, "model_dump_json") else json.dumps(chunk) ) - yield f"data: {data}\n\n".encode("utf-8") + if chunk.choices[0].delta.content is not None: + yield f"data: {data}\n\n".encode("utf-8") # Final DONE event - yield b"data: [DONE]\n\n" + #yield b"data: [DONE]\n\n" else: json_line = ( async_gen.model_dump_json() From fcfabbe9262336e1a6e2eb72b7ea30119f932e15 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: 
Tue, 23 Sep 2025 13:08:17 +0200
Subject: [PATCH 6/6] mitigating div by zero due to google genai sending completion_tokens=0 in the first chunk

---
 router.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/router.py b/router.py
index 94f84ca..2fa59ef 100644
--- a/router.py
+++ b/router.py
@@ -314,7 +314,7 @@ class rechunk:
             total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
             load_duration=100000,
             prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
-            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0,
+            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
             eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
             eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
             message=assistant_msg)
@@ -331,7 +331,7 @@ class rechunk:
             total_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0,
             load_duration=10000,
             prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
-            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0,
+            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
             eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
             eval_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0,
             response=chunk.choices[0].text or '',
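
A note on PATCH 6/6: Google GenAI streams its first chunk with
usage.completion_tokens == 0, so the (prompt_tokens / completion_tokens / 100)
factor in the prompt_eval_duration estimate divided by zero. Below is a minimal
sketch of the guarded estimate, factored into a standalone helper for testing;
the helper name and the SimpleNamespace stand-ins for chunk.usage are
illustrative, not part of router.py:

    import time
    from types import SimpleNamespace

    def estimate_prompt_eval_duration_ns(usage, start_ts: float) -> int:
        # Mirrors the patched expression: elapsed wall time in nanoseconds,
        # scaled by (prompt_tokens / completion_tokens / 100). Returns 0 when
        # usage is missing or completion_tokens is 0 (the first GenAI chunk).
        if usage is None or usage.completion_tokens == 0:
            return 0
        elapsed_ns = int((time.perf_counter() - start_ts) * 1_000_000_000)
        return int(elapsed_ns * (usage.prompt_tokens / usage.completion_tokens / 100))

    start_ts = time.perf_counter()
    first = SimpleNamespace(prompt_tokens=42, completion_tokens=0)
    later = SimpleNamespace(prompt_tokens=42, completion_tokens=128)
    assert estimate_prompt_eval_duration_ns(first, start_ts) == 0  # no ZeroDivisionError
    assert estimate_prompt_eval_duration_ns(later, start_ts) >= 0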
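
A note on PATCH 3/6: the translation layer collects Ollama options into a dict
of OpenAI parameter candidates and strips unset keys before the client call,
so only caller-supplied options reach the OpenAI SDK. A condensed sketch of
that pattern (translate_options is a hypothetical helper, not a function in
router.py; the key mapping follows the patch):

    def translate_options(options: dict | None, stream: bool | None) -> dict:
        # Map Ollama option names onto OpenAI chat-completions parameters,
        # dropping anything the caller did not supply.
        options = options or {}
        candidates = {
            "stream": stream,
            "max_tokens": options.get("num_predict"),
            "frequency_penalty": options.get("frequency_penalty"),
            "presence_penalty": options.get("presence_penalty"),
            "seed": options.get("seed"),
            "stop": options.get("stop"),
            "top_p": options.get("top_p"),
            "temperature": options.get("temperature"),
        }
        return {k: v for k, v in candidates.items() if v is not None}

    params = {"model": "my-model", "messages": [{"role": "user", "content": "hi"}]}
    params.update(translate_options({"num_predict": 64, "temperature": 0.2}, stream=True))
    # -> params now carries stream=True, max_tokens=64, temperature=0.2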