151 lines
7.4 KiB
Python
151 lines
7.4 KiB
Python
"""OpenAI → Ollama response shape converters.
|
|
|
|
Methods on the ``rechunk`` class are called as bare functions
|
|
(``rechunk.openai_chat_completion2ollama(...)``) — there is no instance
|
|
state. The class is just a namespace.
|
|
|
|
``extract_usage_from_llama_timings`` reads the ``timings`` field that
|
|
llama-server returns in place of OpenAI's ``usage`` so the router can still
|
|
count tokens for those backends.
|
|
"""
|
|
import time
|
|
|
|
import ollama
|
|
import orjson
|
|
|
|
from images import iso8601_ns
|
|
from requests.messages import _convert_openai_logprobs
|
|
|
|
|
|
class rechunk:
    """Namespace of stateless OpenAI → Ollama response converters.

    The public methods are static and are meant to be called on the class
    itself, e.g. ``rechunk.openai_chat_completion2ollama(chunk, stream, ts)``;
    no instance state is used.

    NOTE: the ``chunk`` parameters are annotated ``dict`` but are read through
    attribute access (``chunk.choices``, ``chunk.usage``, ``chunk.data`` ...),
    so callers must pass attribute-style response objects.
    """

    @staticmethod
    def _usage_stats(usage, elapsed_s: float) -> dict:
        """Build the usage-derived keyword arguments shared by chat and generate responses.

        Args:
            usage: Object exposing ``prompt_tokens`` and ``completion_tokens``,
                or ``None`` when the chunk carries no usage information.
            elapsed_s: Seconds elapsed since the request started.

        Returns:
            A dict with ``done``, the token counts and the durations (in
            nanoseconds). ``done`` is ``False`` and every number is 0 when
            ``usage`` is ``None``.
        """
        if usage is None:
            return {
                "done": False,
                "total_duration": 0,
                "prompt_eval_count": 0,
                "prompt_eval_duration": 0,
                "eval_count": 0,
                "eval_duration": 0,
            }

        elapsed_ns = elapsed_s * 1_000_000_000
        # Treat a missing (None) count like zero instead of failing in int().
        prompt_tokens = usage.prompt_tokens or 0
        completion_tokens = usage.completion_tokens or 0

        # The prompt evaluation time is not reported, so it is estimated from
        # the prompt/completion token ratio. The zero check avoids a
        # ZeroDivisionError for responses without completion tokens.
        if completion_tokens:
            prompt_eval_duration = int(elapsed_ns * (prompt_tokens / completion_tokens / 100))
        else:
            prompt_eval_duration = 0

        return {
            "done": True,
            "total_duration": int(elapsed_ns),
            "prompt_eval_count": int(prompt_tokens),
            "prompt_eval_duration": prompt_eval_duration,
            "eval_count": int(completion_tokens),
            # The whole elapsed time is reported as evaluation time.
            "eval_duration": int(elapsed_ns),
        }

    @staticmethod
    def _convert_tool_call(tool_call) -> ollama.Message.ToolCall:
        """Convert one complete OpenAI tool call into an Ollama ``ToolCall``.

        Arguments given as a JSON string are decoded; undecodable input, or
        JSON that is not an object, falls back to an empty dict.
        """
        raw_args = tool_call.function.arguments
        if isinstance(raw_args, str):
            try:
                args = orjson.loads(raw_args)
            except (orjson.JSONDecodeError, TypeError):
                args = {}
            if not isinstance(args, dict):
                # e.g. "null" or "[]": not usable as keyword arguments.
                args = {}
        else:
            args = raw_args or {}
        return ollama.Message.ToolCall(
            function=ollama.Message.ToolCall.Function(name=tool_call.function.name, arguments=args)
        )

    @staticmethod
    def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse:
        """Convert an OpenAI chat completion (or stream chunk) into an Ollama ``ChatResponse``.

        Args:
            chunk: OpenAI chat completion object or streaming chunk
                (attribute-style access).
            stream: Truthy when ``chunk`` is a streaming delta; selects
                ``choices[0].delta`` instead of ``choices[0].message``.
            start_ts: ``time.perf_counter()`` value taken when the request started.

        Returns:
            The converted response. ``done`` is ``True`` exactly when the
            chunk carries usage information. A chunk without choices yields an
            empty assistant message.
        """
        elapsed_s = time.perf_counter() - start_ts
        usage = chunk.usage
        stats = rechunk._usage_stats(usage, elapsed_s)

        if not chunk.choices:
            # Usage-only chunk (or a chunk with no choices at all): there is
            # nothing to convert, so emit an empty assistant message.
            return ollama.ChatResponse(
                model=chunk.model,
                created_at=iso8601_ns(),
                done_reason='stop' if usage is not None else None,
                load_duration=100000,  # fixed placeholder, not taken from the response
                message=ollama.Message(role="assistant", content=""),
                **stats,
            )

        choice = chunk.choices[0]
        payload = choice.delta if stream else choice.message

        thinking = getattr(payload, "reasoning_content", None) or getattr(payload, "reasoning", None)
        role = payload.role or "assistant"
        content = payload.content or ''

        # Convert OpenAI tool_calls to Ollama format.
        # In streaming mode, tool_calls arrive as partial deltas across multiple
        # chunks (name only in first delta, arguments as incremental JSON
        # fragments). Callers must accumulate deltas and inject the final
        # result; skip here.
        ollama_tool_calls = None
        if not stream:
            raw_tool_calls = getattr(payload, "tool_calls", None)
            if raw_tool_calls:
                ollama_tool_calls = [rechunk._convert_tool_call(tc) for tc in raw_tool_calls]

        # Convert OpenAI logprobs to Ollama format.
        ollama_logprobs = _convert_openai_logprobs(choice)

        assistant_msg = ollama.Message(
            role=role,
            content=content,
            thinking=thinking,
            images=None,
            tool_name=None,
            tool_calls=ollama_tool_calls,
        )
        return ollama.ChatResponse(
            model=chunk.model,
            created_at=iso8601_ns(),
            done_reason=choice.finish_reason,
            load_duration=100000,  # fixed placeholder, not taken from the response
            message=assistant_msg,
            logprobs=ollama_logprobs,
            **stats,
        )

    @staticmethod
    def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.GenerateResponse:
        """Convert an OpenAI text completion (or stream chunk) into an Ollama ``GenerateResponse``.

        Args:
            chunk: OpenAI completion object or streaming chunk
                (attribute-style access).
            stream: Unused; accepted so the signature matches the chat converter.
            start_ts: ``time.perf_counter()`` value taken when the request started.

        Returns:
            The converted response. ``done`` is ``True`` exactly when the
            chunk carries usage information. A chunk without choices yields an
            empty ``response`` text.
        """
        elapsed_s = time.perf_counter() - start_ts
        usage = chunk.usage
        stats = rechunk._usage_stats(usage, elapsed_s)

        if chunk.choices:
            choice = chunk.choices[0]
            done_reason = choice.finish_reason
            text = choice.text or ''
            thinking = getattr(choice, "reasoning", None)
        else:
            # Usage-only chunk: same treatment as in the chat converter.
            done_reason = 'stop' if usage is not None else None
            text = ''
            thinking = None

        return ollama.GenerateResponse(
            model=chunk.model,
            created_at=iso8601_ns(),
            done_reason=done_reason,
            load_duration=10000,  # fixed placeholder, not taken from the response
            response=text,
            thinking=thinking,
            **stats,
        )

    @staticmethod
    def openai_embeddings2ollama(chunk: dict) -> ollama.EmbeddingsResponse:
        """Convert an OpenAI embeddings response into a single-vector ``EmbeddingsResponse``.

        Only the first entry of ``chunk.data`` is used, because the target
        type holds exactly one embedding.
        """
        return ollama.EmbeddingsResponse(embedding=chunk.data[0].embedding)

    @staticmethod
    def openai_embed2ollama(chunk: dict, model: str) -> ollama.EmbedResponse:
        """Convert an OpenAI embeddings response into an Ollama ``EmbedResponse``.

        Args:
            chunk: OpenAI embeddings response (attribute-style access).
            model: Model name to report in the response.

        Returns:
            A response holding one embedding per entry of ``chunk.data``, in
            order. Timing and token-count fields are left as ``None``.
        """
        return ollama.EmbedResponse(
            model=model,
            created_at=iso8601_ns(),
            done=None,
            done_reason=None,
            total_duration=None,
            load_duration=None,
            prompt_eval_count=None,
            prompt_eval_duration=None,
            eval_count=None,
            eval_duration=None,
            # Keep every vector, not just the first, so multi-input requests
            # get all of their embeddings back.
            embeddings=[item.embedding for item in chunk.data],
        )
|
|
|
|
def extract_usage_from_llama_timings(obj) -> tuple[int, int] | None:
|
|
"""Extract (prompt_tokens, completion_tokens) from llama-server's timings object.
|
|
|
|
llama-server returns a ``timings`` dict instead of the standard OpenAI
|
|
``usage`` field::
|
|
|
|
"timings": {
|
|
"cache_n": 236, // prompt tokens reused from cache
|
|
"prompt_n": 1, // prompt tokens processed
|
|
"predicted_n": 35 // predicted (completion) tokens
|
|
}
|
|
|
|
prompt_tokens = prompt_n + cache_n
|
|
completion_tokens = predicted_n
|
|
|
|
Returns ``(prompt_tokens, completion_tokens)`` or ``None`` when no
|
|
timings are found.
|
|
"""
|
|
timings = getattr(obj, "timings", None)
|
|
if timings is None:
|
|
return None
|
|
if isinstance(timings, dict):
|
|
prompt_n = timings.get("prompt_n", 0) or 0
|
|
cache_n = timings.get("cache_n", 0) or 0
|
|
predicted_n = timings.get("predicted_n", 0) or 0
|
|
return (prompt_n + cache_n, predicted_n)
|
|
return None
|