"""OpenAI → Ollama response shape converters.

Methods on the ``rechunk`` class are called as bare functions
(``rechunk.openai_chat_completion2ollama(...)``) — there is no instance
state. The class is just a namespace.

``extract_usage_from_llama_timings`` reads the ``timings`` field that
llama-server returns in place of OpenAI's ``usage`` so the router can still
count tokens for those backends.
"""
import time

import ollama
import orjson

from images import iso8601_ns
from requests.messages import _convert_openai_logprobs

_NS_PER_SECOND = 1_000_000_000


def _usage_metrics(usage, elapsed_ns: float) -> dict:
    """Build the duration/count keyword arguments shared by chat and generate.

    Args:
        usage: The ``usage`` attribute of the OpenAI response, or ``None``
            when the chunk carries no usage information.
        elapsed_ns: Time since the request started, in nanoseconds.

    Returns:
        A dict with ``total_duration``, ``prompt_eval_count``,
        ``prompt_eval_duration``, ``eval_count`` and ``eval_duration``.
        Every value is ``0`` when ``usage`` is ``None``.
    """
    if usage is None:
        return {
            "total_duration": 0,
            "prompt_eval_count": 0,
            "prompt_eval_duration": 0,
            "eval_count": 0,
            "eval_duration": 0,
        }

    prompt_tokens = int(usage.prompt_tokens or 0)
    completion_tokens = int(usage.completion_tokens or 0)
    total_ns = int(elapsed_ns)

    # The backend reports no prompt timing, so it is estimated by scaling the
    # elapsed time with prompt_tokens / completion_tokens / 100. The guard
    # avoids a ZeroDivisionError when nothing was generated.
    if completion_tokens:
        prompt_eval_ns = int(elapsed_ns * (prompt_tokens / completion_tokens / 100))
    else:
        prompt_eval_ns = 0

    return {
        "total_duration": total_ns,
        "prompt_eval_count": prompt_tokens,
        "prompt_eval_duration": prompt_eval_ns,
        "eval_count": completion_tokens,
        "eval_duration": total_ns,
    }


def _parse_tool_arguments(raw_arguments) -> dict:
    """Return tool-call arguments as a dict, falling back to ``{}``.

    String arguments are decoded as JSON. Undecodable strings and JSON that
    does not decode to an object yield ``{}``; non-string values are passed
    through unchanged unless they are falsy.
    """
    if not isinstance(raw_arguments, str):
        return raw_arguments or {}
    try:
        decoded = orjson.loads(raw_arguments)
    except (orjson.JSONDecodeError, TypeError):
        return {}
    return decoded if isinstance(decoded, dict) else {}


def _to_ollama_tool_call(tool_call) -> "ollama.Message.ToolCall":
    """Convert one complete OpenAI tool call into an Ollama ``ToolCall``."""
    return ollama.Message.ToolCall(
        function=ollama.Message.ToolCall.Function(
            name=tool_call.function.name,
            arguments=_parse_tool_arguments(tool_call.function.arguments),
        )
    )


class rechunk:
    """Namespace of stateless OpenAI → Ollama response converters."""

    @staticmethod
    def openai_chat_completion2ollama(
        chunk: dict, stream: bool, start_ts: float
    ) -> ollama.ChatResponse:
        """Convert an OpenAI chat completion (or stream chunk) to Ollama form.

        Args:
            chunk: OpenAI response object; read by attribute access
                (``chunk.choices``, ``chunk.usage``, ``chunk.model``).
            stream: When true the first choice is read from ``.delta``,
                otherwise from ``.message``.
            start_ts: ``time.perf_counter()`` value taken when the request
                started; used to derive the reported durations.

        Returns:
            An ``ollama.ChatResponse``. ``done`` is true exactly when the
            chunk carries ``usage``. A chunk with no choices produces an
            empty assistant message instead of raising ``IndexError``.
        """
        elapsed_ns = (time.perf_counter() - start_ts) * _NS_PER_SECOND
        usage = chunk.usage
        metrics = _usage_metrics(usage, elapsed_ns)
        choice = chunk.choices[0] if chunk.choices else None

        # Usage-only chunk (no choices): emit the closing "done" message.
        if choice is None and usage is not None:
            return ollama.ChatResponse(
                model=chunk.model,
                created_at=iso8601_ns(),
                done=True,
                done_reason='stop',
                load_duration=100000,  # hard-coded; no load time is measured
                message=ollama.Message(role="assistant", content=""),
                **metrics,
            )

        payload = None
        if choice is not None:
            payload = choice.delta if stream else choice.message

        thinking = None
        role = "assistant"
        content = ''
        raw_tool_calls = None
        if payload is not None:
            thinking = (
                getattr(payload, "reasoning_content", None)
                or getattr(payload, "reasoning", None)
            )
            role = payload.role or "assistant"
            content = payload.content or ''
            # Convert OpenAI tool_calls to Ollama format.
            # In streaming mode, tool_calls arrive as partial deltas across
            # multiple chunks (name only in first delta, arguments as
            # incremental JSON fragments). Callers must accumulate deltas and
            # inject the final result; skip here.
            if not stream:
                raw_tool_calls = getattr(payload, "tool_calls", None)

        ollama_tool_calls = None
        if raw_tool_calls:
            ollama_tool_calls = [_to_ollama_tool_call(tc) for tc in raw_tool_calls]

        # Convert OpenAI logprobs to Ollama format.
        ollama_logprobs = _convert_openai_logprobs(choice) if choice else None

        assistant_msg = ollama.Message(
            role=role,
            content=content,
            thinking=thinking,
            images=None,
            tool_name=None,
            tool_calls=ollama_tool_calls,
        )
        return ollama.ChatResponse(
            model=chunk.model,
            created_at=iso8601_ns(),
            done=usage is not None,
            done_reason=choice.finish_reason if choice is not None else None,
            load_duration=100000,  # hard-coded; no load time is measured
            message=assistant_msg,
            logprobs=ollama_logprobs,
            **metrics,
        )

    @staticmethod
    def openai_completion2ollama(
        chunk: dict, stream: bool, start_ts: float
    ) -> ollama.GenerateResponse:
        """Convert an OpenAI text completion (or stream chunk) to Ollama form.

        Args:
            chunk: OpenAI response object; read by attribute access.
            stream: Accepted for signature parity with the chat converter;
                not used by this function.
            start_ts: ``time.perf_counter()`` value taken when the request
                started; used to derive the reported durations.

        Returns:
            An ``ollama.GenerateResponse``. ``done`` is true exactly when the
            chunk carries ``usage``. A chunk with no choices yields an empty
            ``response`` (and ``done_reason='stop'`` if usage is present)
            instead of raising ``IndexError``.
        """
        elapsed_ns = (time.perf_counter() - start_ts) * _NS_PER_SECOND
        usage = chunk.usage
        choice = chunk.choices[0] if chunk.choices else None

        if choice is not None:
            thinking = getattr(choice, "reasoning", None)
            done_reason = choice.finish_reason
            text = choice.text or ''
        else:
            # Mirrors the chat converter's handling of a usage-only chunk.
            thinking = None
            done_reason = 'stop' if usage is not None else None
            text = ''

        return ollama.GenerateResponse(
            model=chunk.model,
            created_at=iso8601_ns(),
            done=usage is not None,
            done_reason=done_reason,
            load_duration=10000,  # hard-coded; no load time is measured
            response=text,
            thinking=thinking,
            **_usage_metrics(usage, elapsed_ns),
        )

    @staticmethod
    def openai_embeddings2ollama(chunk: dict) -> ollama.EmbeddingsResponse:
        """Wrap the first OpenAI embedding in an ``ollama.EmbeddingsResponse``."""
        return ollama.EmbeddingsResponse(embedding=chunk.data[0].embedding)

    @staticmethod
    def openai_embed2ollama(chunk: dict, model: str) -> ollama.EmbedResponse:
        """Convert an OpenAI embeddings response to an ``ollama.EmbedResponse``.

        Every entry of ``chunk.data`` is converted, so a batched request keeps
        all of its embeddings rather than only the first. Duration and count
        fields are not available from this response and are left as ``None``.
        """
        return ollama.EmbedResponse(
            model=model,
            created_at=iso8601_ns(),
            done=None,
            done_reason=None,
            total_duration=None,
            load_duration=None,
            prompt_eval_count=None,
            prompt_eval_duration=None,
            eval_count=None,
            eval_duration=None,
            embeddings=[item.embedding for item in chunk.data],
        )


def extract_usage_from_llama_timings(obj) -> tuple[int, int] | None:
    """Extract (prompt_tokens, completion_tokens) from llama-server's timings object.

    llama-server returns a ``timings`` dict instead of the standard OpenAI
    ``usage`` field::

        "timings": {
            "cache_n": 236,      // prompt tokens reused from cache
            "prompt_n": 1,       // prompt tokens processed
            "predicted_n": 35    // predicted (completion) tokens
        }

    prompt_tokens = prompt_n + cache_n
    completion_tokens = predicted_n

    Returns ``(prompt_tokens, completion_tokens)`` or ``None`` when ``obj``
    has no ``timings`` attribute or that attribute is not a dict.
    """
    timings = getattr(obj, "timings", None)
    if not isinstance(timings, dict):
        return None

    # Missing or null counters are treated as zero.
    prompt_n = timings.get("prompt_n", 0) or 0
    cache_n = timings.get("cache_n", 0) or 0
    predicted_n = timings.get("predicted_n", 0) or 0
    return (prompt_n + cache_n, predicted_n)