From eda48562da4fad52ec66421a8ef61b576a375bf1 Mon Sep 17 00:00:00 2001
From: alpha-nerd-nomyo
Date: Fri, 13 Feb 2026 13:29:45 +0100
Subject: [PATCH 1/3] feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI
logprob formats to Ollama types. Also update the ollama dependency.
---
 config.yaml      |  4 ++++
 requirements.txt |  2 +-
 router.py        | 39 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/config.yaml b/config.yaml
index 6657e78..752ccec 100644
--- a/config.yaml
+++ b/config.yaml
@@ -5,6 +5,9 @@ endpoints:
   - http://192.168.0.52:11434
   - https://api.openai.com/v1
 
+llama_server_endpoints:
+  - http://192.168.0.33:8889/v1
+
 # Maximum concurrent connections *per endpoint‑model pair* (equals to OLLAMA_NUM_PARALLEL)
 max_concurrent_connections: 2
 
@@ -19,3 +22,4 @@ api_keys:
   "http://192.168.0.51:11434": "ollama"
   "http://192.168.0.52:11434": "ollama"
   "https://api.openai.com/v1": "${OPENAI_KEY}"
+  "http://192.168.0.33:8889/v1": "llama"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 314345e..aa51a0f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,7 +19,7 @@ httpx==0.28.1
 idna==3.10
 jiter==0.10.0
 multidict==6.6.4
-ollama==0.6.0
+ollama==0.6.1
 openai==1.102.0
 orjson>=3.11.5
 pillow==12.1.1
diff --git a/router.py b/router.py
index 51f5054..d9b54f4 100644
--- a/router.py
+++ b/router.py
@@ -183,6 +183,7 @@ def _config_path_from_env() -> Path:
         return Path(candidate).expanduser()
     return Path("config.yaml")
 
+from ollama._types import TokenLogprob, Logprob
 
 from db import TokenDatabase
 
@@ -1191,6 +1192,27 @@ def _build_ollama_tool_calls(accumulator: dict) -> list | None:
         ))
     return result
 
+def _convert_openai_logprobs(choice) -> list | None:
+    """Convert OpenAI logprobs from a choice into Ollama Logprob objects."""
+    lp = getattr(choice, "logprobs", None)
+    if lp is None:
+        return None
+    content = getattr(lp, "content", None)
+    if not content:
+        return None
+    result = []
+    for entry in content:
+        top = [
+            TokenLogprob(token=alt.token, logprob=alt.logprob)
+            for alt in (entry.top_logprobs or [])
+        ]
+        result.append(Logprob(
+            token=entry.token,
+            logprob=entry.logprob,
+            top_logprobs=top or None,
+        ))
+    return result
+
 class rechunk:
     def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse:
         now = time.perf_counter()
@@ -1234,6 +1256,8 @@ class rechunk:
                 ollama_tool_calls.append(ollama.Message.ToolCall(
                     function=ollama.Message.ToolCall.Function(name=tc.function.name, arguments=args)
                 ))
+        # Convert OpenAI logprobs to Ollama format
+        ollama_logprobs = _convert_openai_logprobs(with_thinking) if with_thinking else None
         assistant_msg = ollama.Message(
             role=role,
             content=content,
@@ -1242,17 +1266,18 @@ class rechunk:
             tool_name=None,
             tool_calls=ollama_tool_calls)
         rechunk = ollama.ChatResponse(
-            model=chunk.model, 
+            model=chunk.model,
             created_at=iso8601_ns(),
             done=True if chunk.usage is not None else False,
             done_reason=chunk.choices[0].finish_reason, #if chunk.choices[0].finish_reason is not None else None,
             total_duration=int((now - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
-            load_duration=100000, 
+            load_duration=100000,
             prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
-            prompt_eval_duration=int((now - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0, 
+            prompt_eval_duration=int((now - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
             eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
             eval_duration=int((now - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
-            message=assistant_msg)
+            message=assistant_msg,
+            logprobs=ollama_logprobs)
         return rechunk
 
     def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.GenerateResponse:
@@ -1598,6 +1623,8 @@ async def chat_proxy(request: Request):
     _format = payload.get("format")
     keep_alive = payload.get("keep_alive")
     options = payload.get("options")
+    logprobs = payload.get("logprobs")
+    top_logprobs = payload.get("top_logprobs")
 
     if not model:
         raise HTTPException(
@@ -1644,6 +1671,8 @@ async def chat_proxy(request: Request):
         "stop": options.get("stop") if options and "stop" in options else None,
         "top_p": options.get("top_p") if options and "top_p" in options else None,
         "temperature": options.get("temperature") if options and "temperature" in options else None,
+        "logprobs": logprobs if logprobs is not None else (options.get("logprobs") if options and "logprobs" in options else None),
+        "top_logprobs": top_logprobs if top_logprobs is not None else (options.get("top_logprobs") if options and "top_logprobs" in options else None),
         "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
     }
     params.update({k: v for k, v in optional_params.items() if v is not None})
@@ -1663,7 +1692,7 @@ async def chat_proxy(request: Request):
             # Use the dedicated MOE helper function
            async_gen = await _make_moe_requests(model, messages, tools, think, _format, options, keep_alive)
         else:
-            async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive)
+            async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
         if stream == True:
             tc_acc = {} # accumulate OpenAI tool-call deltas across chunks
             async for chunk in async_gen:

From 4d80dc5e7c1c3814592a89ea98aaa5f81dd2da9d Mon Sep 17 00:00:00 2001
From: alpha-nerd-nomyo
Date: Fri, 13 Feb 2026 14:43:10 +0100
Subject: [PATCH 2/3] feat: add logprobs to /v1/chat/completions

---
 router.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/router.py b/router.py
index d9b54f4..55a669c 100644
--- a/router.py
+++ b/router.py
@@ -2524,6 +2524,8 @@ async def openai_chat_completions_proxy(request: Request):
     max_tokens = payload.get("max_tokens")
     max_completion_tokens = payload.get("max_completion_tokens")
     tools = payload.get("tools")
+    logprobs = payload.get("logprobs")
+    top_logprobs = payload.get("top_logprobs")
 
     if ":latest" in model:
         model = model.split(":latest")
@@ -2547,6 +2549,8 @@ async def openai_chat_completions_proxy(request: Request):
         "frequency_penalty": frequency_penalty,
         "stop": stop,
         "stream": stream,
+        "logprobs": logprobs,
+        "top_logprobs": top_logprobs,
     }
     params.update({k: v for k, v in optional_params.items() if v is not None})
 

From c9ff384bb2973a0ed7f372a52dea861a3f9bf55b Mon Sep 17 00:00:00 2001
From: alpha-nerd-nomyo
Date: Fri, 13 Feb 2026 16:27:06 +0100
Subject: [PATCH 3/3] fix(router): /v1/models endpoint

Now shows all available models: external OpenAI endpoints are queried as
well, and llama-server models are no longer filtered to loaded ones.
---
 router.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/router.py b/router.py
index 55a669c..8e5c2ed 100644
--- a/router.py
+++ b/router.py
@@ -2758,19 +2758,22 @@ async def openai_models_proxy(request: Request):
     """
     # 1. Query Ollama endpoints for all models via /api/tags
     ollama_tasks = [fetch.endpoint_details(ep, "/api/tags", "models") for ep in config.endpoints if "/v1" not in ep]
-    # 2. Query llama-server endpoints for loaded models via /v1/models
+    # 2. Query external OpenAI endpoints (Groq, OpenAI, etc.) via /models
+    ext_openai_tasks = [fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep)) for ep in config.endpoints if is_ext_openai_endpoint(ep)]
+    # 3. Query llama-server endpoints for loaded models via /v1/models
     # Also query endpoints from llama_server_endpoints that may not be in config.endpoints
     all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
     llama_tasks = [
         fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep))
         for ep in all_llama_endpoints
     ]
-    
+
     ollama_models = await asyncio.gather(*ollama_tasks) if ollama_tasks else []
+    ext_openai_models = await asyncio.gather(*ext_openai_tasks) if ext_openai_tasks else []
     llama_models = await asyncio.gather(*llama_tasks) if llama_tasks else []
-    
+
     models = {'data': []}
-    
+
     # Add Ollama models (if any)
     if ollama_models:
         for modellist in ollama_models:
@@ -2780,12 +2783,21 @@ async def openai_models_proxy(request: Request):
             else:
                 model['name'] = model['id']
                 models['data'].append(model)
-    
-    # Add llama-server models (filter for loaded only, if any)
+
+    # Add external OpenAI models (if any)
+    if ext_openai_models:
+        for modellist in ext_openai_models:
+            for model in modellist:
+                if not "id" in model.keys():
+                    model['id'] = model.get('name', model.get('id', ''))
+                else:
+                    model['name'] = model['id']
+                models['data'].append(model)
+
+    # Add llama-server models (all available, not just loaded)
     if llama_models:
         for modellist in llama_models:
-            loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]
-            for model in loaded_models:
+            for model in modellist:
                 if not "id" in model.keys():
                     model['id'] = model.get('name', model.get('id', ''))
                 else:
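
The following usage sketches are illustrations only, not part of the patch series. First, the /api/chat route from PATCH 1: a minimal non-streaming request with logprobs enabled. The host, port, and model name are placeholders; the JSON keys mirror what chat_proxy reads via payload.get("logprobs") and payload.get("top_logprobs"), and the backing model and server must actually support logprobs.

import requests

# Hypothetical router address and model -- adjust to the actual deployment.
resp = requests.post(
    "http://localhost:8000/api/chat",
    json={
        "model": "llama3.2",
        "messages": [{"role": "user", "content": "Say hi"}],
        "stream": False,
        "logprobs": True,     # forwarded by chat_proxy
        "top_logprobs": 3,    # alternatives per token
    },
    timeout=60,
)
# If the upstream returns logprobs, the converted ChatResponse carries them
# in a top-level "logprobs" field.
print(resp.json().get("logprobs"))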
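
For the OpenAI-compatible route from PATCH 2, the standard openai client can be pointed at the router. A sketch, assuming the router serves /v1 on localhost:8000; the API key and model id are placeholders, and logprobs/top_logprobs are passed through optional_params to the upstream endpoint.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")  # placeholder key
resp = client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder; use any id returned by /v1/models
    messages=[{"role": "user", "content": "Say hi"}],
    logprobs=True,
    top_logprobs=3,
)
choice = resp.choices[0]
if choice.logprobs and choice.logprobs.content:
    for entry in choice.logprobs.content:
        print(entry.token, entry.logprob)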
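
Finally, the PATCH 3 fix can be checked by listing models through the same client. Under the same placeholder address, the result should now include Ollama models, external OpenAI models, and all llama-server models rather than only the loaded ones.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")  # placeholder
for m in client.models.list():
    print(m.id)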