From 18d2fca0276324cdcade9e4a8b9de56a55124a82 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Mon, 22 Sep 2025 09:30:27 +0200 Subject: [PATCH] formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate --- router.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/router.py b/router.py index 01e5b9a..36bb05d 100644 --- a/router.py +++ b/router.py @@ -275,7 +275,9 @@ def iso8601_ns(): return iso8601_with_ns class rechunk: - def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float): + def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse: + with_thinking = chunk.choices[0] if chunk.choices[0] else None + thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None if stream == True: role = chunk.choices[0].delta.role or "assistant" content = chunk.choices[0].delta.content @@ -285,43 +287,47 @@ class rechunk: assistant_msg = ollama.Message( role=role, content=content, - thinking=None, + thinking=thinking, images=None, tool_name=None, tool_calls=None) rechunk = ollama.ChatResponse( model=chunk.model, created_at=iso8601_ns(), + done=False, #True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, + total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), load_duration=100000, + prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None), - prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None), eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), - total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None), message=assistant_msg) return rechunk - def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float): + def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.GenerateResponse: with_thinking = chunk.choices[0] if chunk.choices[0] else None thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None rechunk = ollama.GenerateResponse( model=chunk.model, created_at=iso8601_ns(), - load_duration=10000, + done=False, #True if chunk.choices[0].finish_reason is not None else False, done_reason=chunk.choices[0].finish_reason, - done=None, #True if chunk.choices[0].finish_reason is not None else False, total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), + load_duration=10000, + prompt_eval_count=None, + prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None), + eval_count=None, eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None), - thinking=thinking, - response=chunk.choices[0].text) + response=chunk.choices[0].text, + thinking=thinking) return rechunk - def openai_embeddings2ollama(chunk: dict): + def openai_embeddings2ollama(chunk: dict) -> ollama.EmbeddingsResponse: rechunk = ollama.EmbeddingsResponse(embedding=chunk.data[0].embedding) return rechunk - def openai_embed2ollama(chunk: dict, model: str): + def openai_embed2ollama(chunk: dict, model: str) -> ollama.EmbedResponse: rechunk = ollama.EmbedResponse( model=model, created_at=iso8601_ns(), @@ -538,7 +544,7 @@ async def proxy(request: Request): else: if is_openai_endpoint: response = rechunk.openai_completion2ollama(async_gen, stream, start_ts) - response = json.dumps(response) + response = response.model_dump_json() else: response = async_gen.model_dump_json() json_line = ( @@ -570,7 +576,7 @@ async def chat_proxy(request: Request): try: body_bytes = await request.body() payload = json.loads(body_bytes.decode("utf-8")) - + print(payload) model = payload.get("model") messages = payload.get("messages") tools = payload.get("tools") @@ -628,6 +634,7 @@ async def chat_proxy(request: Request): async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive) if stream == True: async for chunk in async_gen: + print(chunk) if is_openai_endpoint: chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts) # `chunk` can be a dict or a pydantic model – dump to JSON safely @@ -639,13 +646,13 @@ async def chat_proxy(request: Request): else: if is_openai_endpoint: response = rechunk.openai_chat_completion2ollama(async_gen, stream, start_ts) - response = json.dumps(response) + response = response.model_dump_json() else: response = async_gen.model_dump_json() json_line = ( response - if hasattr(async_gen, "model_dump_json") - else json.dumps(async_gen) + if hasattr(response, "model_dump_json") + else json.dumps(response) ) yield json_line.encode("utf-8") + b"\n"