feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.
This commit is contained in:
Alpha Nerd 2026-02-13 13:29:45 +01:00
parent 07af6e2e36
commit eda48562da
3 changed files with 39 additions and 6 deletions

View file

@ -5,6 +5,9 @@ endpoints:
- http://192.168.0.52:11434
- https://api.openai.com/v1
llama_server_endpoints:
- http://192.168.0.33:8889/v1
# Maximum concurrent connections *per endpoint/model pair* (equals OLLAMA_NUM_PARALLEL)
max_concurrent_connections: 2
@ -19,3 +22,4 @@ api_keys:
"http://192.168.0.51:11434": "ollama"
"http://192.168.0.52:11434": "ollama"
"https://api.openai.com/v1": "${OPENAI_KEY}"
"http://192.168.0.33:8889/v1": "llama"

View file

@ -19,7 +19,7 @@ httpx==0.28.1
idna==3.10
jiter==0.10.0
multidict==6.6.4
ollama==0.6.0
ollama==0.6.1
openai==1.102.0
orjson>=3.11.5
pillow==12.1.1

View file

@ -183,6 +183,7 @@ def _config_path_from_env() -> Path:
return Path(candidate).expanduser()
return Path("config.yaml")
from ollama._types import TokenLogprob, Logprob
from db import TokenDatabase
@ -1191,6 +1192,27 @@ def _build_ollama_tool_calls(accumulator: dict) -> list | None:
))
return result
def _convert_openai_logprobs(choice) -> list | None:
    """Map the OpenAI ``logprobs`` payload of a chat choice onto Ollama types.

    Returns a list of ``Logprob`` objects, one per generated token, or
    ``None`` when the choice carries no usable logprob data (attribute
    missing, ``None``, or an empty ``content`` list).
    """
    logprobs_obj = getattr(choice, "logprobs", None)
    token_entries = None if logprobs_obj is None else getattr(logprobs_obj, "content", None)
    if not token_entries:
        return None

    converted = []
    for token_entry in token_entries:
        # Each entry may carry alternative candidates under top_logprobs;
        # an absent/empty list is normalized to None on the Ollama side.
        alternatives = [
            TokenLogprob(token=alt.token, logprob=alt.logprob)
            for alt in (token_entry.top_logprobs or [])
        ]
        converted.append(
            Logprob(
                token=token_entry.token,
                logprob=token_entry.logprob,
                top_logprobs=alternatives if alternatives else None,
            )
        )
    return converted
class rechunk:
def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse:
now = time.perf_counter()
@ -1234,6 +1256,8 @@ class rechunk:
ollama_tool_calls.append(ollama.Message.ToolCall(
function=ollama.Message.ToolCall.Function(name=tc.function.name, arguments=args)
))
# Convert OpenAI logprobs to Ollama format
ollama_logprobs = _convert_openai_logprobs(with_thinking) if with_thinking else None
assistant_msg = ollama.Message(
role=role,
content=content,
@ -1242,17 +1266,18 @@ class rechunk:
tool_name=None,
tool_calls=ollama_tool_calls)
rechunk = ollama.ChatResponse(
model=chunk.model,
model=chunk.model,
created_at=iso8601_ns(),
done=True if chunk.usage is not None else False,
done_reason=chunk.choices[0].finish_reason, #if chunk.choices[0].finish_reason is not None else None,
total_duration=int((now - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
load_duration=100000,
load_duration=100000,
prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
prompt_eval_duration=int((now - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
prompt_eval_duration=int((now - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
eval_duration=int((now - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
message=assistant_msg)
message=assistant_msg,
logprobs=ollama_logprobs)
return rechunk
def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.GenerateResponse:
@ -1598,6 +1623,8 @@ async def chat_proxy(request: Request):
_format = payload.get("format")
keep_alive = payload.get("keep_alive")
options = payload.get("options")
logprobs = payload.get("logprobs")
top_logprobs = payload.get("top_logprobs")
if not model:
raise HTTPException(
@ -1644,6 +1671,8 @@ async def chat_proxy(request: Request):
"stop": options.get("stop") if options and "stop" in options else None,
"top_p": options.get("top_p") if options and "top_p" in options else None,
"temperature": options.get("temperature") if options and "temperature" in options else None,
"logprobs": logprobs if logprobs is not None else (options.get("logprobs") if options and "logprobs" in options else None),
"top_logprobs": top_logprobs if top_logprobs is not None else (options.get("top_logprobs") if options and "top_logprobs" in options else None),
"response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
}
params.update({k: v for k, v in optional_params.items() if v is not None})
@ -1663,7 +1692,7 @@ async def chat_proxy(request: Request):
# Use the dedicated MOE helper function
async_gen = await _make_moe_requests(model, messages, tools, think, _format, options, keep_alive)
else:
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive)
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
if stream == True:
tc_acc = {} # accumulate OpenAI tool-call deltas across chunks
async for chunk in async_gen: