From 19df75afa90528a0ca334d440c848285d7ba1848 Mon Sep 17 00:00:00 2001
From: alpha-nerd-nomyo
Date: Mon, 22 Sep 2025 19:01:14 +0200
Subject: [PATCH] fixing types and params

---
 router.py | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/router.py b/router.py
index f8c7796..d509ed8 100644
--- a/router.py
+++ b/router.py
@@ -277,13 +277,14 @@ def iso8601_ns():
 class rechunk:
     def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse:
         with_thinking = chunk.choices[0] if chunk.choices[0] else None
-        thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
         if stream == True:
+            thinking = getattr(with_thinking.delta, "reasoning", None) if with_thinking else None
             role = chunk.choices[0].delta.role or "assistant"
-            content = chunk.choices[0].delta.content
+            content = chunk.choices[0].delta.content or ""
         else:
+            thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
             role = chunk.choices[0].message.role or "assistant"
-            content = chunk.choices[0].message.content
+            content = chunk.choices[0].message.content or ""
         assistant_msg = ollama.Message(
             role=role,
             content=content,
@@ -296,12 +297,12 @@ class rechunk:
             created_at=iso8601_ns(),
             done=True if chunk.choices[0].finish_reason is not None else False,
             done_reason=chunk.choices[0].finish_reason,
-            total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None),
+            total_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
             load_duration=100000,
-            prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None),
-            prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None),
-            eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None),
-            eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None),
+            prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
+            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0,
+            eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
+            eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
             message=assistant_msg)
         return rechunk
 
@@ -313,13 +314,13 @@ class rechunk:
             created_at=iso8601_ns(),
             done=False, #True if chunk.choices[0].finish_reason is not None else False,
             done_reason=chunk.choices[0].finish_reason,
-            total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None),
+            total_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0,
             load_duration=10000,
-            prompt_eval_count=None,
-            prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None),
-            eval_count=None,
-            eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None),
-            response=chunk.choices[0].text,
+            prompt_eval_count=0,
+            prompt_eval_duration=int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else 0,
+            eval_count=0,
+            eval_duration=int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else 0,
+            response=chunk.choices[0].text or "",
             thinking=thinking)
         return rechunk
 
@@ -591,7 +592,6 @@ async def chat_proxy(request: Request):
     stream = payload.get("stream")
     think = payload.get("think")
     _format = payload.get("format")
-    options = payload.get("options")
     keep_alive = payload.get("keep_alive")
     options = payload.get("options")
 
@@ -634,6 +634,7 @@ async def chat_proxy(request: Request):
             "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
         }
         params.update({k: v for k, v in optional_params.items() if v is not None})
+        print(params)
         oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
     else:
         client = ollama.AsyncClient(host=endpoint)
@@ -667,8 +668,8 @@ async def chat_proxy(request: Request):
             response = async_gen.model_dump_json()
             json_line = (
                 response
-                if hasattr(response, "model_dump_json")
-                else json.dumps(response)
+                if hasattr(async_gen, "model_dump_json")
+                else json.dumps(async_gen)
             )
             yield json_line.encode("utf-8") + b"\n"
 
@@ -677,9 +678,10 @@ async def chat_proxy(request: Request):
     await decrement_usage(endpoint, model)
 
     # 4. Return a StreamingResponse backed by the generator
+    media_type = "application/x-ndjson" if stream else "application/json"
    return StreamingResponse(
         stream_chat_response(),
-        media_type="application/json",
+        media_type=media_type,
     )
 
 # -------------------------------------------------------------
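
Note on the first hunk: the thinking extraction moves inside the branches because streamed OpenAI chunks carry the reasoning text on choices[0].delta, while non-streamed responses carry it on the choice object itself, and the `or ""` fallbacks keep content a str instead of None. A minimal, self-contained sketch of that branch logic; the SimpleNamespace objects are hypothetical stand-ins for the OpenAI SDK types, not the real classes:

from types import SimpleNamespace

def extract(chunk, stream: bool):
    choice = chunk.choices[0] if chunk.choices else None
    if stream:
        # streamed chunks: reasoning, role, and content live on the delta
        thinking = getattr(choice.delta, "reasoning", None) if choice else None
        role = choice.delta.role or "assistant"
        content = choice.delta.content or ""  # never None after the patch
    else:
        # non-streamed responses: reasoning sits on the choice, content on message
        thinking = getattr(choice, "reasoning", None) if choice else None
        role = choice.message.role or "assistant"
        content = choice.message.content or ""
    return role, content, thinking

# a streamed chunk whose delta has no role or content yet, only reasoning
chunk = SimpleNamespace(choices=[SimpleNamespace(
    delta=SimpleNamespace(role=None, content=None, reasoning="thinking..."))])
print(extract(chunk, stream=True))  # ('assistant', '', 'thinking...')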
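
Note on the last hunk: declaring streamed responses as application/x-ndjson matches the content type the stock Ollama server uses for its streaming endpoints, so line-oriented clients keep working unchanged. A usage sketch for consuming the stream, assuming the proxy is reachable at http://localhost:8000 and exposes an Ollama-style /api/chat route (neither the address nor the route is stated in the patch):

import json
import httpx

payload = {"model": "llama3", "messages": [{"role": "user", "content": "hi"}], "stream": True}
with httpx.stream("POST", "http://localhost:8000/api/chat", json=payload, timeout=None) as r:
    for line in r.iter_lines():
        if not line:
            continue
        obj = json.loads(line)  # each NDJSON line is one complete JSON object
        print(obj.get("message", {}).get("content", ""), end="", flush=True)
        if obj.get("done"):
            break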