nomyo-router/router.py

"""
title: NOMYO Router - an (O)llama and OpenAI API v1 Proxy with Endpoint:Model aware routing
author: alpha-nerd-nomyo
author_url: https://github.com/nomyo-ai
version: 0.9
license: AGPL
"""
# -------------------------------------------------------------
import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket, httpx, hashlib
try:
    import truststore; truststore.inject_into_ssl()
except ImportError:
    pass
from datetime import datetime, timezone
from pathlib import Path

# Directory containing static files (relative to this script)
STATIC_DIR = Path(__file__).parent / "static"
from typing import Dict, Set, List, Optional
from urllib.parse import urlparse, parse_qsl, urlencode
from fastapi import FastAPI, Request, HTTPException
from fastapi_sse import sse_handler
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from starlette.responses import StreamingResponse, JSONResponse, Response, HTMLResponse, RedirectResponse
from pydantic import Field
from pydantic_settings import BaseSettings
from collections import defaultdict
from PIL import Image

from security import _mask_secrets
from context_window import (
    _count_message_tokens,
    _trim_messages_for_context,
    _calibrated_trim_target,
    _endpoint_nctx,
    _CTX_TRIM_SMALL_LIMIT,
)
from state import (
    _models_cache,
    _loaded_models_cache,
    _available_error_cache,
    _loaded_error_cache,
    _completion_error_cache,
    _COMPLETION_ERROR_TTL,
    _models_cache_lock,
    _loaded_models_cache_lock,
    _available_error_cache_lock,
    _loaded_error_cache_lock,
    _completion_error_cache_lock,
    _inflight_available_models,
    _inflight_loaded_models,
    _inflight_lock,
    _bg_refresh_available,
    _bg_refresh_loaded,
    _bg_refresh_lock,
    _subscribers,
    _subscribers_lock,
    token_queue,
    app_state,
    token_buffer,
    time_series_buffer,
    buffer_lock,
    FLUSH_INTERVAL,
)

# Module-level slots for two background asyncio tasks; both are None until
# something assigns them.
# Rebound on startup — must stay in router.py module namespace.
# NOTE(review): presumably these hold the token_worker() and flush_buffer()
# tasks — the assignment is not visible in this part of the file; confirm.
token_worker_task: asyncio.Task | None = None
flush_task: asyncio.Task | None = None

from config import Config, _config_path_from_env

from ollama._types import TokenLogprob, Logprob
from db import TokenDatabase
from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse


# Create the global config object – it will be overwritten on startup.
# Submodules read it lazily via config.get_config().
# This call runs at import time: the path comes from _config_path_from_env()
# and is handed to Config.from_yaml(), so importing this module fails if
# either of them raises.
config = Config.from_yaml(_config_path_from_env())

# -------------------------------------------------------------
# 2. FastAPI application
# -------------------------------------------------------------
# The single FastAPI application; the middleware and exception handlers
# defined below in this module are registered on it.
app = FastAPI()
# Hand the application to fastapi_sse's handler object.
# NOTE(review): what fastapi_sse does with this attribute is not visible
# here — confirm against the fastapi_sse documentation.
sse_handler.app = app
# CORS policy: any origin, credentials allowed, restricted to the methods and
# request headers listed below.
# NOTE(review): wildcard origins together with allow_credentials=True is a very
# permissive combination — confirm it is intended for this deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST", "DELETE"],
    allow_headers=["Authorization", "Content-Type"],
)
from state import default_headers
        
# -------------------------------------------------------------
# Router-level authentication (optional)
# -------------------------------------------------------------
def _extract_router_api_key(request: Request) -> Optional[str]:
    """
    Return the router API key supplied with ``request``, or None.

    Lookup order:
      1. ``Authorization: Bearer <key>`` header (scheme matched
         case-insensitively, surrounding whitespace of the key removed).
      2. ``api_key`` query parameter.

    An empty bearer token or an empty query value counts as "not provided".
    """
    authorization = request.headers.get("Authorization")
    if authorization and authorization.lower().startswith("bearer "):
        # The prefix check guarantees at least one space, so the split always
        # yields two parts.
        _, token = authorization.split(" ", 1)
        token = token.strip()
        if token:
            return token
    # Fall back to the query string; a blank value is treated like a missing one.
    return request.query_params.get("api_key") or None


def _strip_api_key_from_scope(request: Request) -> None:
    """
    Remove api_key from the ASGI scope query string to avoid leaking it in logs.

    The query string is parsed, every ``api_key`` pair is dropped and the
    remaining pairs are re-encoded and written back to
    ``request.scope["query_string"]`` (bytes). Blank values are kept. Nothing
    happens when the query string is empty or missing.

    The raw bytes are decoded with the ``surrogateescape`` error handler and
    re-encoded with the same handler, so a query string containing bytes that
    are not valid UTF-8 no longer raises ``UnicodeDecodeError``; such bytes come
    back percent-encoded. For valid UTF-8 input the result is the same as a
    strict decode would produce.
    """
    scope = request.scope
    raw_qs = scope.get("query_string", b"")
    if not raw_qs:
        return
    # surrogateescape maps undecodable bytes to lone surrogates instead of raising.
    decoded = raw_qs.decode("utf-8", errors="surrogateescape")
    params = parse_qsl(decoded, keep_blank_values=True)
    filtered = [(k, v) for (k, v) in params if k != "api_key"]
    # The same handler turns those surrogates back into the original bytes
    # before they are percent-encoded, so the output is plain ASCII.
    scope["query_string"] = urlencode(filtered, errors="surrogateescape").encode("utf-8")


@app.middleware("http")
async def enforce_router_api_key(request: Request, call_next):
    """
    Enforce the optional NOMYO Router API key for all non-static requests.
    When `config.router_api_key` is set, clients must supply the key either in
    the Authorization header (`Bearer <key>`) or as `api_key` query parameter.

    Requests pass through unchecked when no key is configured, when the method
    is OPTIONS, when the path is "/" or "/favicon.ico", or when the path is a
    static asset (under /static with a whitelisted extension).

    Responses:
      * 401 JSON ``{"detail": "Missing NOMYO Router API key"}`` – no key sent.
      * 403 JSON ``{"detail": "Invalid NOMYO Router API key"}`` – wrong key.
      * otherwise whatever ``call_next`` returns.

    For paths containing "/api/" (except "/api/usage-stream") the CORS headers
    below are attached to the 401, the 403 and the successful response alike.
    """
    expected_key = config.router_api_key
    if not expected_key or request.method == "OPTIONS":
        return await call_next(request)

    path = request.url.path
    # Allow static assets (CSS, JS, images, fonts) but NOT HTML pages,
    # which would bypass auth by accessing /static/index.html directly.
    _STATIC_ASSET_EXTS = {".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".svg", ".woff", ".woff2", ".ttf", ".map"}
    is_static_asset = path.startswith("/static") and Path(path).suffix.lower() in _STATIC_ASSET_EXTS
    if is_static_asset or path in {"/", "/favicon.ico"}:
        return await call_next(request)

    # One header set shared by every response this middleware produces for API
    # paths, so rejected requests are handled the same way as accepted ones.
    cors_headers = {}
    if "/api/" in path and path != "/api/usage-stream":
        cors_headers = {
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Allow-Headers": "Authorization, Content-Type",
            "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
        }

    provided_key = _extract_router_api_key(request)
    # Strip the api_key query param from scope so access logs do not leak it
    _strip_api_key_from_scope(request)
    if provided_key is None:
        # No key provided but authentication is required - return 401
        return JSONResponse(
            content={"detail": "Missing NOMYO Router API key"},
            status_code=401,
            headers=cors_headers,
        )

    # compare_digest() accepts str only when both values are pure ASCII and
    # raises TypeError otherwise; comparing the UTF-8 bytes keeps the
    # constant-time comparison and turns a non-ASCII key into a plain mismatch.
    keys_match = secrets.compare_digest(
        str(provided_key).encode("utf-8"),
        str(expected_key).encode("utf-8"),
    )
    if not keys_match:
        return JSONResponse(
            content={"detail": "Invalid NOMYO Router API key"},
            status_code=403,
            headers=cors_headers,
        )

    response = await call_next(request)
    # Add CORS headers for authenticated API requests
    for header_name, header_value in cors_headers.items():
        response.headers[header_name] = header_value
    return response


@app.exception_handler(openai.APIStatusError)
async def _openai_api_status_error_handler(request: Request, exc: openai.APIStatusError):
    """Relay an upstream OpenAI-SDK status error to the client.

    The response keeps the upstream status code. Its JSON body is the upstream
    body when one is present; otherwise a minimal ``{"error": {...}}`` object is
    built from the exception text and status code. Without this handler the
    exception would bubble up as a 500.
    """
    payload = exc.body
    if payload is None:
        payload = {"error": {"message": str(exc), "code": exc.status_code}}
    return JSONResponse(content=payload, status_code=exc.status_code)


from state import (
    usage_counts,
    token_usage_counts,
    usage_lock,
    token_usage_lock,
    _affinity_map,
    _affinity_lock,
    _AFFINITY_MAX_ENTRIES,
)

from fingerprint import _conversation_fingerprint

# Database instance
# Starts out as None; flush_buffer() and flush_remaining_buffers() call methods
# on it, so it has to be assigned before they run.
# NOTE(review): presumably set to a TokenDatabase during startup — the
# assignment is not visible in this part of the file; confirm.
db: "TokenDatabase" = None

# -------------------------------------------------------------
# 4. Helper functions
# -------------------------------------------------------------
from backends.normalize import (
    _normalize_llama_model_name,
    _extract_llama_quant,
    ep2base,
    dedupe_on_keys,
)
from backends.sessions import (
    _is_unix_socket_endpoint,
    _get_socket_path,
    get_session,
    _make_openai_client,
)
from backends.health import (
    _is_fresh,
    _ensure_success,
    _format_connection_issue,
    _is_backend_connection_error,
    _mark_backend_unhealthy,
    _is_llama_model_loaded,
    _is_llama_model_loaded_or_sleeping,
)


from backends.normalize import (
    is_ext_openai_endpoint,
    is_openai_compatible,
    get_tracking_model,
)

async def token_worker() -> None:
    """Consume token-usage records from ``token_queue`` until cancelled.

    Each queue item is an ``(endpoint, model, prompt, comp)`` tuple. For every
    item the worker
      * adds the counts to ``token_buffer`` and appends a minute-resolution
        (UTC) entry to ``time_series_buffer`` while holding ``buffer_lock``,
      * adds ``prompt + comp`` to ``token_usage_counts`` while holding
        ``token_usage_lock`` and captures a snapshot,
      * distributes that snapshot.

    On cancellation the items still sitting in the queue are processed the same
    way before ``CancelledError`` is re-raised.
    """

    async def _record(endpoint, model, prompt, comp) -> None:
        # Truncate "now" to the start of the current minute before taking any lock.
        minute_start = datetime.now(tz=timezone.utc).replace(second=0, microsecond=0)
        timestamp = int(minute_start.timestamp())

        async with buffer_lock:
            previous = token_buffer[endpoint].get(model, (0, 0))
            token_buffer[endpoint][model] = (previous[0] + prompt, previous[1] + comp)
            time_series_buffer.append({
                'endpoint': endpoint,
                'model': model,
                'input_tokens': prompt,
                'output_tokens': comp,
                'total_tokens': prompt + comp,
                'timestamp': timestamp
            })

        # In-memory totals feed the immediate usage reporting.
        async with token_usage_lock:
            token_usage_counts[endpoint][model] += (prompt + comp)
            snapshot = _capture_snapshot()
        await _distribute_snapshot(snapshot)

    try:
        while True:
            endpoint, model, prompt, comp = await token_queue.get()
            await _record(endpoint, model, prompt, comp)
    except asyncio.CancelledError:
        # Shutdown path: drain what is already queued, then propagate.
        print("[token_worker] Task cancelled, processing remaining queue items...")
        while not token_queue.empty():
            try:
                endpoint, model, prompt, comp = token_queue.get_nowait()
                await _record(endpoint, model, prompt, comp)
            except asyncio.QueueEmpty:
                break
        print("[token_worker] Task cancelled, remaining items processed.")
        raise

async def flush_buffer() -> None:
    """Periodically flush accumulated token counts to the database.

    Every ``FLUSH_INTERVAL`` seconds the contents of ``token_buffer`` and
    ``time_series_buffer`` are copied and cleared under ``buffer_lock`` and then
    written through ``db`` outside the lock. On cancellation one more flush is
    attempted (errors are printed, not raised) and ``CancelledError`` is
    re-raised.
    """

    async def _take_pending():
        # Snapshot and empty both buffers atomically; None means "nothing to write".
        async with buffer_lock:
            counts = None
            if token_buffer:
                counts = {ep: dict(models) for ep, models in token_buffer.items()}
                token_buffer.clear()

            series = None
            if time_series_buffer:
                series = list(time_series_buffer)
                time_series_buffer.clear()
        return counts, series

    async def _persist(counts, series) -> None:
        # DB work happens without the lock so producers are not blocked.
        if counts:
            await db.update_batched_counts(counts)
        if series:
            await db.add_batched_time_series(series)

    try:
        while True:
            await asyncio.sleep(FLUSH_INTERVAL)
            counts, series = await _take_pending()
            await _persist(counts, series)
    except asyncio.CancelledError:
        # Shutdown path: try to write whatever is still buffered.
        print("[flush_buffer] Task cancelled, flushing remaining buffers...")
        try:
            counts, series = await _take_pending()
            await _persist(counts, series)
            print("[flush_buffer] Task cancelled, remaining buffers flushed.")
        except Exception as e:
            print(f"[flush_buffer] Error during shutdown flush: {e}")
        raise

async def flush_remaining_buffers() -> None:
    """
    Write whatever is left in the in-memory buffers to the database.

    Meant for shutdown: any exception is caught and printed so teardown can
    continue. Prints how many entries (per-model counters plus time-series rows)
    were flushed, or that there was nothing to flush.
    """
    try:
        pending_counts = None
        pending_series = None
        entry_total = 0
        async with buffer_lock:
            if token_buffer:
                pending_counts = {ep: dict(models) for ep, models in token_buffer.items()}
                entry_total += sum(len(models) for models in pending_counts.values())
                token_buffer.clear()
            if time_series_buffer:
                pending_series = list(time_series_buffer)
                entry_total += len(pending_series)
                time_series_buffer.clear()

        # Database writes are done after the lock has been released.
        if pending_counts:
            await db.update_batched_counts(pending_counts)
        if pending_series:
            await db.add_batched_time_series(pending_series)

        if entry_total:
            print(f"[shutdown] Flushed {entry_total} in-memory entries to DB on shutdown.")
        else:
            print("[shutdown] No in-memory entries to flush on shutdown.")
    except Exception as e:
        # Never propagate during shutdown – report and move on.
        print(f"[shutdown] Error flushing remaining buffers: {e}")

from backends.probe import fetch


async def increment_usage(endpoint: str, model: str) -> None:
    """Add one to the usage counter of ``model`` on ``endpoint``.

    The counter update and the snapshot capture happen under ``usage_lock``;
    the snapshot is distributed after the lock is released.
    """
    async with usage_lock:
        per_model = usage_counts[endpoint]
        per_model[model] += 1
        snapshot = _capture_snapshot()
    await _distribute_snapshot(snapshot)

async def decrement_usage(endpoint: str, model: str) -> None:
    """Subtract one from the usage counter of ``model`` on ``endpoint``.

    The counter never goes below zero, and a counter that reaches (or already
    is) zero is removed from the endpoint's mapping. The endpoint's own mapping
    is kept even when it ends up empty. A snapshot is captured under
    ``usage_lock`` and distributed after the lock is released.
    """
    async with usage_lock:
        per_model = usage_counts[endpoint]
        remaining = per_model.get(model, 0)
        if remaining > 0:
            remaining -= 1
            per_model[model] = remaining
        if remaining == 0:
            # Drop zero entries so they do not linger in the mapping.
            per_model.pop(model, None)
        snapshot = _capture_snapshot()
    await _distribute_snapshot(snapshot)

async def _make_chat_request(model: str, messages: list, tools=None, stream: bool = False, think: bool = False, format=None, options=None, keep_alive: str = None) -> ollama.ChatResponse:
    """
    Helper function to make a chat request to a specific endpoint.
    Handles endpoint selection, client creation, usage tracking, and request execution.

    Flow:
      * ``choose_endpoint(model)`` picks the backend and reserves a usage slot;
        the slot is released with ``decrement_usage`` in a ``finally`` that
        covers everything after the reservation, including request preparation.
      * OpenAI-compatible backends: ``:latest`` is cut from the model name,
        messages are converted (images -> data URLs, tool calls -> OpenAI
        format, trailing assistant prefill removed), a subset of ``options`` is
        mapped to OpenAI parameters, and the reply is converted back with
        ``rechunk.openai_chat_completion2ollama``.
      * Other backends: the request is passed to ``ollama.AsyncClient.chat``.

    Retries (OpenAI-compatible path only):
      * context-size errors: messages are trimmed and the request is retried;
        if the context is still exceeded, ``tools``/``tool_choice`` are dropped
        for one more attempt. The error is re-raised when no ``n_ctx`` limit can
        be extracted from it.
      * "image input is not supported": retried with image parts removed.

    With ``stream=True`` the stream is consumed completely inside this function
    and only the last chunk is returned (for OpenAI backends converted, with the
    accumulated tool calls injected). Token usage found on chunks/responses is
    pushed to ``token_queue``.

    Args:
        model: Model name as requested by the client.
        messages: Chat messages in Ollama format.
        tools: Optional tool definitions, passed through to the backend.
        stream: Whether to request a streamed reply from the backend.
        think: Passed to the Ollama client only.
        format: Response format; wrapped as a ``json_schema`` response_format
            for OpenAI-compatible backends.
        options: Optional mapping of Ollama options.
        keep_alive: Passed to the Ollama client only.

    Returns:
        The final response object as described above.
    """
    endpoint, tracking_model = await choose_endpoint(model)  # selects and atomically reserves
    # From here on everything runs under try/finally: request preparation can
    # fail too (e.g. transform_images_to_data_urls raises ValueError on a
    # malformed image) and the reserved slot must be released in that case.
    try:
        use_openai = is_openai_compatible(endpoint)
        if use_openai:
            if ":latest" in model:
                model = model.split(":latest")[0]
            if messages:
                if any("images" in m for m in messages):
                    # Image conversion is run in a worker thread via asyncio.to_thread.
                    messages = await asyncio.to_thread(transform_images_to_data_urls, messages)
                messages = transform_tool_calls_to_openai(messages)
                messages = _strip_assistant_prefill(messages)
            params = {
                "messages": messages,
                "model": model,
            }
            optional_params = {
                "tools": tools,
                "stream": stream,
                "stream_options": {"include_usage": True} if stream else None,
                "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
                "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
                "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
                "seed": options.get("seed") if options and "seed" in options else None,
                "stop": options.get("stop") if options and "stop" in options else None,
                "top_p": options.get("top_p") if options and "top_p" in options else None,
                "temperature": options.get("temperature") if options and "temperature" in options else None,
                "response_format": {"type": "json_schema", "json_schema": format} if format is not None else None
            }
            # Only parameters that were actually supplied are sent upstream.
            params.update({k: v for k, v in optional_params.items() if v is not None})
            oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
        else:
            client = ollama.AsyncClient(host=endpoint)

        if use_openai:
            start_ts = time.perf_counter()
            try:
                response = await oclient.chat.completions.create(**params)
            except Exception as e:
                _e_str = str(e)
                print(f"[_make_chat_request] caught {type(e).__name__}: {_e_str[:200]}")
                if "exceed_context_size_error" in _e_str or "exceeds the available context size" in _e_str:
                    # Prefer the structured error body; fall back to parsing the
                    # numbers out of the error text.
                    err_body = getattr(e, "body", {}) or {}
                    err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
                    n_ctx_limit = err_detail.get("n_ctx", 0)
                    actual_tokens = err_detail.get("n_prompt_tokens", 0)
                    if not n_ctx_limit:
                        _m = re.search(r"'n_ctx':\s*(\d+)", _e_str)
                        if _m:
                            n_ctx_limit = int(_m.group(1))
                        _m = re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
                        if _m:
                            actual_tokens = int(_m.group(1))
                    if not n_ctx_limit:
                        # Without a known limit there is nothing to trim against.
                        raise
                    msgs_to_trim = params.get("messages", [])
                    cal_target = _calibrated_trim_target(msgs_to_trim, n_ctx_limit, actual_tokens)
                    trimmed = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
                    print(f"[_make_chat_request] Context exceeded ({actual_tokens}/{n_ctx_limit} tokens, tiktoken_target={cal_target}), dropped {len(msgs_to_trim) - len(trimmed)} oldest message(s) and retrying")
                    try:
                        response = await oclient.chat.completions.create(**{**params, "messages": trimmed})
                    except Exception as e2:
                        if "exceed_context_size_error" in str(e2) or "exceeds the available context size" in str(e2):
                            print(f"[_make_chat_request] Context still exceeded after trimming, also stripping tools")
                            params_no_tools = {k: v for k, v in params.items() if k not in ("tools", "tool_choice")}
                            response = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed})
                        else:
                            raise
                elif "image input is not supported" in _e_str:
                    print(f"[_make_chat_request] Model {model} doesn't support images, retrying with text-only messages")
                    params = {**params, "messages": _strip_images_from_messages(params.get("messages", []))}
                    response = await oclient.chat.completions.create(**params)
                else:
                    raise
            if stream:
                # For streaming, we need to collect all chunks
                chunks = []
                tc_acc = {}  # accumulate tool-call deltas
                async for chunk in response:
                    chunks.append(chunk)
                    _accumulate_openai_tc_delta(chunk, tc_acc)
                    prompt_tok = 0
                    comp_tok = 0
                    if chunk.usage is not None:
                        prompt_tok = chunk.usage.prompt_tokens or 0
                        comp_tok = chunk.usage.completion_tokens or 0
                    else:
                        llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
                        if llama_usage:
                            prompt_tok, comp_tok = llama_usage
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                # Convert to Ollama format
                if chunks:
                    response = rechunk.openai_chat_completion2ollama(chunks[-1], stream, start_ts)
                    # Inject fully-accumulated tool calls into the final response
                    if tc_acc and response.message:
                        response.message.tool_calls = _build_ollama_tool_calls(tc_acc)
            else:
                prompt_tok = 0
                comp_tok = 0
                if response.usage is not None:
                    prompt_tok = response.usage.prompt_tokens or 0
                    comp_tok = response.usage.completion_tokens or 0
                else:
                    llama_usage = rechunk.extract_usage_from_llama_timings(response)
                    if llama_usage:
                        prompt_tok, comp_tok = llama_usage
                if prompt_tok != 0 or comp_tok != 0:
                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                response = rechunk.openai_chat_completion2ollama(response, stream, start_ts)
        else:
            response = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=format, options=options, keep_alive=keep_alive)
            if stream:
                # For streaming, collect all chunks
                chunks = []
                async for chunk in response:
                    chunks.append(chunk)
                    prompt_tok = chunk.prompt_eval_count or 0
                    comp_tok = chunk.eval_count or 0
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                if chunks:
                    response = chunks[-1]
            else:
                prompt_tok = response.prompt_eval_count or 0
                comp_tok = response.eval_count or 0
                if prompt_tok != 0 or comp_tok != 0:
                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))

        return response
    finally:
        # Release the slot reserved by choose_endpoint().
        await decrement_usage(endpoint, tracking_model)

def get_last_user_content(messages):
    """
    Return the ``content`` of the most recent message whose ``role`` is
    ``"user"``.

    ``messages`` is a sequence of dicts. The search runs from the end and stops
    at the first user message; the result is None when there is no user
    message, or when that message has no ``content`` key.
    """
    user_contents = (
        entry.get("content")
        for entry in reversed(messages)
        if entry.get("role") == "user"
    )
    return next(user_contents, None)

async def _make_moe_requests(model: str, messages: list, tools=None, think: bool = False, format=None, options=None, keep_alive: str = None) -> ollama.ChatResponse:
    """
    Helper function to make MOE (Multiple Opinions Ensemble) requests.
    Generates 3 responses, 3 critiques, and returns the final selected response.

    Steps:
      1. Three concurrent chat requests with the original ``messages``.
      2. Each reply is turned into a critique prompt via ``enhance.moe`` and
         the three critique requests run concurrently.
      3. ``enhance.moe_select_candidate`` builds the final prompt from the
         critiques; the reply to that prompt is returned.

    All requests are non-streaming and use ``temperature = 1``. The override is
    applied to a shallow copy of ``options``, so the caller's object is left
    untouched.

    Raises:
        ValueError: if ``messages`` contains no user message with content.
    """
    import copy  # local import: only needed for the defensive copy below

    query = get_last_user_content(messages)
    if not query:
        raise ValueError("No user query found in messages")

    # Work on a copy so the temperature override does not leak to the caller.
    options = {} if options is None else copy.copy(options)
    options["temperature"] = 1

    request_kwargs = {
        "stream": False,
        "think": think,
        "format": format,
        "options": options,
        "keep_alive": keep_alive,
    }

    def _ask(chat_messages):
        # choose_endpoint is called inside _make_chat_request and atomically
        # reserves a slot, so concurrent tasks see each other's load immediately.
        return asyncio.create_task(_make_chat_request(model, chat_messages, tools, **request_kwargs))

    # Generate 3 responses
    responses = await asyncio.gather(*[_ask(messages) for _ in range(3)])

    moe_reqs = [enhance.moe(query, n, r.message.content) for n, r in enumerate(responses)]

    # Generate 3 critiques
    critiques = await asyncio.gather(
        *[_ask([{"role": "user", "content": moe_req}]) for moe_req in moe_reqs]
    )

    # Select final response
    m = enhance.moe_select_candidate(query, critiques)

    # Generate final response
    return await _make_chat_request(model, [{"role": "user", "content": m}], tools, **request_kwargs)

from images import iso8601_ns, is_base64, resize_image_if_needed

def _strip_assistant_prefill(messages: list) -> list:
    """Remove a trailing assistant message used as prefill.
    OpenAI-compatible endpoints (including Claude) do not support prefill and
    will reject requests where the last message has role 'assistant'."""
    if messages and messages[-1].get("role") == "assistant":
        return messages[:-1]
    return messages

def transform_tool_calls_to_openai(message_list):
    """
    Ensure tool_calls in assistant messages conform to the OpenAI format:
    - Each tool call must have "type": "function"
    - Each tool call must have an "id"
    - arguments must be a JSON string, not a dict
    Also ensure tool-role messages have a tool_call_id.

    Messages are modified in place and the same list is returned.

    A tool-role message without ``tool_call_id`` is matched by function name
    (``name`` or ``tool_name``) against the preceding assistant tool calls. When
    one assistant message calls the same function several times, the ids are
    handed out in call order, one per tool message. A later assistant message
    calling the same function replaces the ids still pending for that name.
    Tool messages that cannot be matched are left without an id. An assistant
    message whose ``tool_calls`` value is None is skipped.
    """
    # function name -> ids (in call order) still waiting for a tool-role reply
    pending_ids = {}
    for msg in message_list:
        role = msg.get("role")
        if role == "assistant" and "tool_calls" in msg:
            ids_this_message = {}
            for tc in msg["tool_calls"] or []:
                if "type" not in tc:
                    tc["type"] = "function"
                if "id" not in tc:
                    tc["id"] = f"call_{secrets.token_hex(16)}"
                func = tc.get("function", {})
                if isinstance(func.get("arguments"), dict):
                    func["arguments"] = orjson.dumps(func["arguments"]).decode("utf-8")
                # Remember the id for the following tool-role message(s)
                name = func.get("name")
                if name:
                    ids_this_message.setdefault(name, []).append(tc["id"])
            # Newer calls supersede ids of the same name left over from earlier messages.
            pending_ids.update(ids_this_message)
        elif role == "tool" and "tool_call_id" not in msg:
            # Try to match by name from a preceding assistant tool_call
            name = msg.get("name") or msg.get("tool_name")
            queue = pending_ids.get(name) if name else None
            if queue:
                msg["tool_call_id"] = queue.pop(0)
    return message_list

def transform_images_to_data_urls(message_list):
    """
    Convert Ollama-style ``images`` lists into OpenAI-style content parts.

    For every message that has an ``images`` key, the key is removed. When its
    value is a list, ``content`` is replaced by a list of parts:
      * the message's existing text as a leading ``{"type": "text"}`` part
        (or the existing parts when ``content`` is already a list), so the text
        that accompanies the images is not lost,
      * one ``{"type": "image_url"}`` part per image for which
        ``resize_image_if_needed`` returns a value, as a
        ``data:image/png;base64,...`` URL.
    When ``images`` is not a list the message content is left as it is.

    Messages are modified in place and the same list is returned.

    Raises:
        ValueError: if an image string is not valid base64 (per ``is_base64``).
    """
    for message in message_list:
        if "images" not in message:
            continue
        images = message.pop("images")
        if not isinstance(images, list):
            continue

        # Keep whatever the message already said; the image parts are appended
        # after it.
        existing = message.get("content")
        if isinstance(existing, list):
            new_content = list(existing)
        elif isinstance(existing, str) and existing:
            new_content = [{"type": "text", "text": existing}]
        else:
            new_content = []

        for image in images:            #TODO: quality downsize if images are too big to fit into model context window size
            if not is_base64(image):
                raise ValueError("Image string is not a valid base64 encoded string.")
            resized_image = resize_image_if_needed(image)
            if resized_image:
                data_url = f"data:image/png;base64,{resized_image}"
                new_content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": data_url
                    }
                })
        message["content"] = new_content

    return message_list

def _strip_images_from_messages(messages: list) -> list:
    """Remove image_url parts from message content, keeping only text."""
    result = []
    for msg in messages:
        content = msg.get("content")
        if isinstance(content, list):
            text_only = [p for p in content if p.get("type") != "image_url"]
            if len(text_only) == 1 and text_only[0].get("type") == "text":
                content = text_only[0]["text"]
            else:
                content = text_only
            result.append({**msg, "content": content})
        else:
            result.append(msg)
    return result

def _accumulate_openai_tc_delta(chunk, accumulator: dict) -> None:
    """Accumulate tool_call deltas from a single OpenAI streaming chunk.

    ``accumulator`` is a dict mapping tool-call *index* to
    ``{"id": str, "name": str, "arguments": str}`` where ``arguments``
    is the concatenation of all JSON fragments seen so far.
    """
    if not chunk.choices:
        return
    delta = chunk.choices[0].delta
    tc_deltas = getattr(delta, "tool_calls", None)
    if not tc_deltas:
        return
    for tc in tc_deltas:
        idx = tc.index
        if idx not in accumulator:
            accumulator[idx] = {
                "id": getattr(tc, "id", None) or f"call_{secrets.token_hex(16)}",
                "name": tc.function.name if tc.function else None,
                "arguments": "",
            }
        else:
            if getattr(tc, "id", None):
                accumulator[idx]["id"] = tc.id
            if tc.function and tc.function.name:
                accumulator[idx]["name"] = tc.function.name
        if tc.function and tc.function.arguments:
            accumulator[idx]["arguments"] += tc.function.arguments

def _build_ollama_tool_calls(accumulator: dict) -> list | None:
    """Convert accumulated tool-call data into Ollama-format tool_calls list."""
    if not accumulator:
        return None
    result = []
    for idx in sorted(accumulator.keys()):
        tc = accumulator[idx]
        try:
            args = orjson.loads(tc["arguments"]) if tc["arguments"] else {}
        except (orjson.JSONDecodeError, TypeError):
            args = {}
        result.append(ollama.Message.ToolCall(
            function=ollama.Message.ToolCall.Function(name=tc["name"], arguments=args)
        ))
    return result

def _convert_openai_logprobs(choice) -> list | None:
    """Convert OpenAI logprobs from a choice into Ollama Logprob objects."""
    lp = getattr(choice, "logprobs", None)
    if lp is None:
        return None
    content = getattr(lp, "content", None)
    if not content:
        return None
    result = []
    for entry in content:
        top = [
            TokenLogprob(token=alt.token, logprob=alt.logprob)
            for alt in (entry.top_logprobs or [])
        ]
        result.append(Logprob(
            token=entry.token,
            logprob=entry.logprob,
            top_logprobs=top or None,
        ))
    return result

class rechunk:
    """Namespace of converters from OpenAI-style responses to Ollama response objects.

    The functions are called on the class itself
    (``rechunk.openai_completion2ollama(...)``); no instance state is used.
    """

    @staticmethod
    def _usage_timing_fields(usage, now: float, start_ts: float) -> dict:
        """Build the token-count / duration keyword arguments shared by the converters.

        Args:
            usage: The ``usage`` attribute of an OpenAI response, or ``None``
                when the chunk carries no usage information.
            now: ``time.perf_counter()`` reading taken when conversion started.
            start_ts: ``time.perf_counter()`` reading taken when the request started.

        Returns:
            A dict with ``total_duration``, ``prompt_eval_count``,
            ``prompt_eval_duration``, ``eval_count`` and ``eval_duration``.
            Every value is 0 when ``usage`` is ``None``. Durations are
            nanoseconds derived from ``now - start_ts``.
        """
        if usage is None:
            return {
                "total_duration": 0,
                "prompt_eval_count": 0,
                "prompt_eval_duration": 0,
                "eval_count": 0,
                "eval_duration": 0,
            }
        elapsed_ns = (now - start_ts) * 1_000_000_000
        if usage.completion_tokens != 0:
            # Estimate only: elapsed time scaled by prompt/completion ratio / 100.
            prompt_eval_ns = int(elapsed_ns * (usage.prompt_tokens / usage.completion_tokens / 100))
        else:
            # No completion tokens: the ratio is undefined, report 0 instead of
            # raising ZeroDivisionError.
            prompt_eval_ns = 0
        return {
            "total_duration": int(elapsed_ns),
            "prompt_eval_count": int(usage.prompt_tokens),
            "prompt_eval_duration": prompt_eval_ns,
            "eval_count": int(usage.completion_tokens),
            "eval_duration": int(elapsed_ns),
        }

    @staticmethod
    def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse:
        """Convert an OpenAI chat completion (or streaming chunk) into an ``ollama.ChatResponse``.

        Args:
            chunk: OpenAI response object; accessed via attributes
                (``choices``, ``usage``, ``model``).
            stream: ``True`` when ``chunk`` is a streaming delta, otherwise the
                full ``message`` of the first choice is read.
            start_ts: ``time.perf_counter()`` reading taken when the request started.

        Returns:
            The converted response. ``done`` is ``True`` exactly when the chunk
            carries usage information. In streaming mode ``tool_calls`` is
            always ``None``; callers accumulate the deltas themselves.
        """
        now = time.perf_counter()
        # Usage-only chunk: no choices, just the token accounting.
        if chunk.choices == [] and chunk.usage is not None:
            return ollama.ChatResponse(
                model=chunk.model,
                created_at=iso8601_ns(),
                done=True,
                done_reason='stop',
                load_duration=100000,
                message=ollama.Message(role="assistant", content=""),
                **rechunk._usage_timing_fields(chunk.usage, now, start_ts),
            )
        with_thinking = chunk.choices[0] if chunk.choices[0] else None
        if stream == True:
            source = chunk.choices[0].delta
            thinking = (getattr(with_thinking.delta, "reasoning_content", None) or getattr(with_thinking.delta, "reasoning", None)) if with_thinking else None
        else:
            source = chunk.choices[0].message
            thinking = (getattr(with_thinking.message, "reasoning_content", None) or getattr(with_thinking.message, "reasoning", None)) if with_thinking else None
        role = source.role or "assistant"
        content = source.content or ''
        # Convert OpenAI tool_calls to Ollama format.
        # In streaming mode, tool_calls arrive as partial deltas across multiple chunks
        # (name only in first delta, arguments as incremental JSON fragments).
        # Callers must accumulate deltas and inject the final result; skip here.
        ollama_tool_calls = None
        if not stream:
            raw_tool_calls = getattr(with_thinking.message, "tool_calls", None) if with_thinking else None
            if raw_tool_calls:
                ollama_tool_calls = []
                for tc in raw_tool_calls:
                    try:
                        args = orjson.loads(tc.function.arguments) if isinstance(tc.function.arguments, str) else (tc.function.arguments or {})
                    except (orjson.JSONDecodeError, TypeError):
                        # Unparsable argument payloads degrade to "no arguments".
                        args = {}
                    ollama_tool_calls.append(ollama.Message.ToolCall(
                        function=ollama.Message.ToolCall.Function(name=tc.function.name, arguments=args)
                    ))
        # Convert OpenAI logprobs to Ollama format
        ollama_logprobs = _convert_openai_logprobs(with_thinking) if with_thinking else None
        assistant_msg = ollama.Message(
            role=role,
            content=content,
            thinking=thinking,
            images=None,
            tool_name=None,
            tool_calls=ollama_tool_calls)
        converted = ollama.ChatResponse(
            model=chunk.model,
            created_at=iso8601_ns(),
            done=True if chunk.usage is not None else False,
            done_reason=chunk.choices[0].finish_reason,
            load_duration=100000,
            message=assistant_msg,
            logprobs=ollama_logprobs,
            **rechunk._usage_timing_fields(chunk.usage, now, start_ts))
        return converted

    @staticmethod
    def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.GenerateResponse:
        """Convert an OpenAI text completion (or streaming chunk) into an ``ollama.GenerateResponse``.

        Args:
            chunk: OpenAI completion object; accessed via attributes.
            stream: Accepted for signature parity with the chat converter; not
                read by this function.
            start_ts: ``time.perf_counter()`` reading taken when the request started.

        Returns:
            The converted response. ``done`` is ``True`` exactly when the chunk
            carries usage information.
        """
        now = time.perf_counter()
        with_thinking = chunk.choices[0] if chunk.choices[0] else None
        thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
        converted = ollama.GenerateResponse(
            model=chunk.model,
            created_at=iso8601_ns(),
            done=True if chunk.usage is not None else False,
            done_reason=chunk.choices[0].finish_reason,
            load_duration=10000,
            response=chunk.choices[0].text or '',
            thinking=thinking,
            **rechunk._usage_timing_fields(chunk.usage, now, start_ts))
        return converted

    @staticmethod
    def openai_embeddings2ollama(chunk: dict) -> ollama.EmbeddingsResponse:
        """Wrap the first embedding of an OpenAI embeddings response in an ``ollama.EmbeddingsResponse``."""
        return ollama.EmbeddingsResponse(embedding=chunk.data[0].embedding)

    @staticmethod
    def openai_embed2ollama(chunk: dict, model: str) -> ollama.EmbedResponse:
        """Wrap an OpenAI embeddings response in an ``ollama.EmbedResponse`` for ``model``.

        All timing and token-count fields are left as ``None``.
        """
        # NOTE(review): only ``chunk.data[0]`` is forwarded, so a multi-input
        # request would yield a single embedding - confirm against the caller.
        return ollama.EmbedResponse(
            model=model,
            created_at=iso8601_ns(),
            done=None,
            done_reason=None,
            total_duration=None,
            load_duration=None,
            prompt_eval_count=None,
            prompt_eval_duration=None,
            eval_count=None,
            eval_duration=None,
            embeddings=[chunk.data[0].embedding])

    @staticmethod
    def extract_usage_from_llama_timings(obj) -> tuple[int, int] | None:
        """Extract (prompt_tokens, completion_tokens) from llama-server's timings object.

        llama-server returns a ``timings`` dict instead of the standard OpenAI
        ``usage`` field::

            "timings": {
                "cache_n": 236,      // prompt tokens reused from cache
                "prompt_n": 1,       // prompt tokens processed
                "predicted_n": 35    // predicted (completion) tokens
            }

        prompt_tokens  = prompt_n + cache_n
        completion_tokens = predicted_n

        Returns ``(prompt_tokens, completion_tokens)`` or ``None`` when no
        timings are found (attribute missing, ``None``, or not a dict).
        """
        timings = getattr(obj, "timings", None)
        if not isinstance(timings, dict):
            return None
        # ``or 0`` also normalises explicit ``None``/0 values in the payload.
        prompt_n = timings.get("prompt_n", 0) or 0
        cache_n = timings.get("cache_n", 0) or 0
        predicted_n = timings.get("predicted_n", 0) or 0
        return (prompt_n + cache_n, predicted_n)
    
# ------------------------------------------------------------------
# SSE Helpers
# ------------------------------------------------------------------
def _capture_snapshot() -> str:
    """Serialise the current usage counters to a JSON string with sorted keys.

    The result has two top-level keys, ``usage_counts`` and
    ``token_usage_counts``, each a shallow dict copy of the module-level
    counter of the same name. Caller must hold at least one of
    usage_lock/token_usage_lock.
    """
    counters = {
        "usage_counts": dict(usage_counts),
        "token_usage_counts": dict(token_usage_counts),
    }
    encoded = orjson.dumps(counters, option=orjson.OPT_SORT_KEYS)
    return encoded.decode("utf-8")

async def _distribute_snapshot(snapshot: str) -> None:
    """Push a pre-captured snapshot to all SSE subscribers.

    Must be called outside any usage lock. When a subscriber's queue is full,
    its oldest pending item is discarded so the newest snapshot always fits.

    Args:
        snapshot: The JSON string to enqueue for every subscriber.
    """
    async with _subscribers_lock:
        for q in _subscribers:
            if q.full():
                # Non-blocking eviction: ``get_nowait`` is the call that can
                # actually raise QueueEmpty, and it never suspends while the
                # subscribers lock is held.
                try:
                    q.get_nowait()
                except asyncio.QueueEmpty:
                    pass
            # A slot is guaranteed free at this point, so this cannot block.
            q.put_nowait(snapshot)

async def close_all_sse_queues():
    """Enqueue the ``None`` sentinel on every subscriber queue.

    The sentinel is the value the SSE generator recognises as "stop". It is
    enqueued without blocking: when a queue is already full, its oldest item
    is dropped first, so a stalled subscriber cannot hang this call.
    """
    # Iterate over a copy so the set may change while we are looping.
    for q in list(_subscribers):
        if q.full():
            # A full queue is never empty, so this cannot raise QueueEmpty.
            q.get_nowait()
        # sentinel value that the generator will recognise
        q.put_nowait(None)

# ------------------------------------------------------------------
# Subscriber helpers
# ------------------------------------------------------------------
async def subscribe() -> asyncio.Queue:
    """
    Create a bounded queue (at most 10 pending items), register it in the
    subscriber set under the subscribers lock, and return it.
    """
    mailbox: asyncio.Queue = asyncio.Queue(maxsize=10)
    await _subscribers_lock.acquire()
    try:
        _subscribers.add(mailbox)
    finally:
        _subscribers_lock.release()
    return mailbox

async def unsubscribe(q: asyncio.Queue):
    """Remove ``q`` from the subscriber set; a queue that is not registered is ignored."""
    await _subscribers_lock.acquire()
    try:
        _subscribers.discard(q)
    finally:
        _subscribers_lock.release()

# ------------------------------------------------------------------
# Convenience wrapper – returns the current snapshot (for the proxy)
# ------------------------------------------------------------------
async def get_usage_counts() -> Dict:
    """Return a shallow ``dict`` copy of the module-level ``usage_counts`` mapping.

    Only the outer mapping is copied; the values are shared with the original.
    """
    copied: Dict = {}
    copied.update(usage_counts)
    return copied

# -------------------------------------------------------------
# 5. Endpoint selection logic (respecting the configurable limit)
# -------------------------------------------------------------
def get_max_connections(ep: str) -> int:
    """Return the connection limit for ``ep``.

    Uses the ``max_concurrent_connections`` entry of the endpoint's own
    configuration when present, otherwise the global
    ``config.max_concurrent_connections``.
    """
    per_endpoint = config.endpoint_config.get(ep, {})
    global_limit = config.max_concurrent_connections
    return per_endpoint.get("max_concurrent_connections", global_limit)

async def choose_endpoint(model: str, reserve: bool = True,
                          affinity_key: Optional[str] = None) -> tuple[str, str]:
    """
    Determine which endpoint to use for the given model while respecting
    the `max_concurrent_connections` per endpoint‑model pair **and**
    ensuring that the chosen endpoint actually *advertises* the model.

    The selection algorithm:

    1️⃣  Query every endpoint for its advertised models (`/api/tags`).
    2️⃣  Build a list of endpoints that contain the requested model.
    2️⃣.5  If conversation affinity is enabled and the caller passes
        ``affinity_key``, prefer the endpoint that previously served the
        same conversation — but only when it still has the model loaded
        and a free slot. Otherwise fall through to the standard logic.
    3️⃣  For those endpoints, find those that have the model loaded
        (`/api/ps`) *and* still have a free slot.
    4️⃣  If none are both loaded and free, fall back to any endpoint
        from the filtered list that simply has a free slot and randomly
        select one.
    5️⃣  If all are saturated, pick any endpoint from the filtered list
        (the request will queue on that endpoint).
    6️⃣  If no endpoint advertises the model at all, raise an error.

    Args:
        model: Requested model name. During fallback matching a name without
            any ``:`` gets ``:latest`` appended.
        reserve: When true, the usage counter of the selected
            (endpoint, tracking model) pair is incremented and a snapshot is
            pushed to the SSE subscribers.
        affinity_key: Optional conversation key used for endpoint affinity.

    Returns:
        ``(endpoint, tracking_model)`` where ``tracking_model`` is the
        normalised name under which usage is counted.

    Raises:
        RuntimeError: If no configured endpoint advertises the model.
    """
    # 1️⃣  Gather advertised‑model sets for all endpoints concurrently
    #     Include both config.endpoints and config.llama_server_endpoints
    llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
    all_endpoints = config.endpoints + llama_eps_extra

    # The probes MUST be created in the same order as ``all_endpoints``:
    # ``asyncio.gather`` returns results in task order and they are zipped
    # against ``all_endpoints`` below. Grouping the probes by endpoint type
    # would pair model sets with the wrong endpoint whenever an
    # OpenAI-compatible endpoint precedes an Ollama one in the config.
    tag_tasks = [
        fetch.available_models(ep, config.api_keys.get(ep))
        if is_openai_compatible(ep)
        else fetch.available_models(ep)
        for ep in config.endpoints
    ]
    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in llama_eps_extra]
    advertised_sets = await asyncio.gather(*tag_tasks)

    # 2️⃣  Filter endpoints that advertise the requested model
    candidate_endpoints = [
        ep for ep, models in zip(all_endpoints, advertised_sets)
        if model in models
    ]

    # 6️⃣
    if not candidate_endpoints:
        if ":latest" in model:  #ollama naming convention not applicable to openai/llama-server
            model_without_latest = model.split(":latest")[0]
            candidate_endpoints = [
                ep for ep, models in zip(all_endpoints, advertised_sets)
                if model_without_latest in models and (is_ext_openai_endpoint(ep) or ep in config.llama_server_endpoints)
            ]
        if not candidate_endpoints:
            # Only add :latest suffix if model doesn't already have a version suffix
            if ":" not in model:
                model = model + ":latest"
            candidate_endpoints = [
                ep for ep, models in zip(all_endpoints, advertised_sets)
                if model in models
            ]
        if not candidate_endpoints:
            raise RuntimeError(
                f"None of the configured endpoints ({', '.join(all_endpoints)}) "
                f"advertise the model '{model}'."
            )
    # 3️⃣  Among the candidates, find those that have the model *loaded*
    #      (concurrently, but only for the filtered list)
    load_tasks = [fetch.loaded_models(ep) for ep in candidate_endpoints]
    loaded_sets = await asyncio.gather(*load_tasks)

    # 3️⃣.5  Exclude endpoints whose loaded-model probe has been failing
    # recently. Without this filter, an endpoint where `/api/ps` returns 5xx
    # would appear with an empty loaded set but pass through to the
    # free-slot fallback (step 4) — sending completion calls to an
    # unhealthy backend. See issue #83.
    async with _loaded_error_cache_lock:
        unhealthy = {
            ep for ep, ts in _loaded_error_cache.items()
            if _is_fresh(ts, 300)
        }
    if unhealthy:
        filtered = [
            (ep, models) for ep, models in zip(candidate_endpoints, loaded_sets)
            if ep not in unhealthy
        ]
        if filtered:
            candidate_endpoints = [ep for ep, _ in filtered]
            loaded_sets = [models for _, models in filtered]
        # If *every* candidate is unhealthy we still fall through with the
        # original list — refusing to route is worse than retrying a
        # possibly-recovered backend.

    # 3️⃣.6  Exclude (endpoint, model) pairs whose completion path has recently
    # failed with a backend connection error (e.g. llama-server in router mode
    # whose delegated worker for *this* model died). /v1/models keeps reporting
    # OK in that case, so the probe-level filter above cannot catch it.
    async with _completion_error_cache_lock:
        completion_broken = {
            ep for (ep, m), ts in _completion_error_cache.items()
            if m == model and _is_fresh(ts, _COMPLETION_ERROR_TTL)
        }
    if completion_broken:
        filtered = [
            (ep, models) for ep, models in zip(candidate_endpoints, loaded_sets)
            if ep not in completion_broken
        ]
        if filtered:
            candidate_endpoints = [ep for ep, _ in filtered]
            loaded_sets = [models for _, models in filtered]
        # Same fallback: if every candidate is broken for this model, fall
        # through and let the upstream retry — possibly the operator restarted
        # the dead worker.

    # Look up a possible affinity hint *before* taking usage_lock. The two
    # locks are never held together to avoid lock-ordering issues.
    affine_ep: Optional[str] = None
    if config.conversation_affinity and affinity_key:
        async with _affinity_lock:
            entry = _affinity_map.get(affinity_key)
            if entry is not None:
                ep, _stored_model, expires_at = entry
                if expires_at < time.monotonic():
                    # Expired hint: forget it and use the standard algorithm.
                    _affinity_map.pop(affinity_key, None)
                else:
                    affine_ep = ep

    # Protect all reads/writes of usage_counts with the lock so that selection
    # and reservation are atomic — concurrent callers see each other's pending load.
    async with usage_lock:
        # Helper: current usage for (endpoint, model) using the same normalized key
        # that increment_usage/decrement_usage store — raw model names differ from
        # tracking names for llama-server (HF prefix / quant suffix stripped).
        def tracking_usage(ep: str) -> int:
            return usage_counts.get(ep, {}).get(get_tracking_model(ep, model), 0)

        def utilization_ratio(ep: str) -> float:
            limit = get_max_connections(ep)
            if limit <= 0:
                # An endpoint configured with no capacity counts as fully
                # utilised instead of raising ZeroDivisionError.
                return float("inf")
            return tracking_usage(ep) / limit

        # Priority map: position in all_endpoints list (lower = higher priority)
        ep_priority = {ep: i for i, ep in enumerate(all_endpoints)}

        selected: Optional[str] = None

        # 2️⃣.5  Conversation affinity preference — only honour the hint when
        # the affine endpoint still advertises the model loaded *and* has a
        # free slot. Otherwise fall back to the standard algorithm.
        if affine_ep:
            ep_loaded = {
                ep: set(models)
                for ep, models in zip(candidate_endpoints, loaded_sets)
            }
            if (affine_ep in candidate_endpoints
                    and model in ep_loaded.get(affine_ep, set())
                    and tracking_usage(affine_ep) < get_max_connections(affine_ep)):
                selected = affine_ep

        if selected is None:
            # 3️⃣ Endpoints that have the model loaded *and* a free slot
            loaded_and_free = [
                ep for ep, models in zip(candidate_endpoints, loaded_sets)
                if model in models and tracking_usage(ep) < get_max_connections(ep)
            ]

            if loaded_and_free:
                if config.priority_routing:
                    # WRR: sort by config order first (stable), then by utilization ratio.
                    # Stable sort preserves priority for equal-ratio endpoints.
                    loaded_and_free.sort(key=lambda ep: ep_priority.get(ep, 999))
                    loaded_and_free.sort(key=utilization_ratio)
                    selected = loaded_and_free[0]
                else:
                    # Sort ascending for load balancing — all endpoints here already have the
                    # model loaded, so there is no model-switching cost to optimise for.
                    loaded_and_free.sort(key=tracking_usage)
                    # When all candidates are equally idle, randomise to avoid always picking
                    # the first entry in a stable sort.
                    if all(tracking_usage(ep) == 0 for ep in loaded_and_free):
                        selected = random.choice(loaded_and_free)
                    else:
                        selected = loaded_and_free[0]
            else:
                # 4️⃣ Endpoints among the candidates that simply have a free slot
                endpoints_with_free_slot = [
                    ep for ep in candidate_endpoints
                    if tracking_usage(ep) < get_max_connections(ep)
                ]

                if endpoints_with_free_slot:
                    if config.priority_routing:
                        endpoints_with_free_slot.sort(key=lambda ep: ep_priority.get(ep, 999))
                        endpoints_with_free_slot.sort(key=utilization_ratio)
                        selected = endpoints_with_free_slot[0]
                    else:
                        # Sort by total endpoint load (ascending) to prefer idle endpoints.
                        endpoints_with_free_slot.sort(
                            key=lambda ep: sum(usage_counts.get(ep, {}).values())
                        )
                        if all(tracking_usage(ep) == 0 for ep in endpoints_with_free_slot):
                            selected = random.choice(endpoints_with_free_slot)
                        else:
                            selected = endpoints_with_free_slot[0]
                else:
                    # 5️⃣ All candidate endpoints are saturated – pick the least-busy one (will queue)
                    if config.priority_routing:
                        selected = min(
                            candidate_endpoints,
                            key=lambda ep: (utilization_ratio(ep), ep_priority.get(ep, 999)),
                        )
                    else:
                        selected = min(candidate_endpoints, key=tracking_usage)

        tracking_model = get_tracking_model(selected, model)
        snapshot = None
        if reserve:
            usage_counts[selected][tracking_model] += 1
            # Capture while usage_lock is still held; distribute after release.
            snapshot = _capture_snapshot()
    if snapshot is not None:
        await _distribute_snapshot(snapshot)
    # Record / refresh affinity *after* releasing usage_lock.
    if reserve and config.conversation_affinity and affinity_key:
        expires_at = time.monotonic() + config.conversation_affinity_ttl
        async with _affinity_lock:
            _affinity_map[affinity_key] = (selected, model, expires_at)
            if len(_affinity_map) > _AFFINITY_MAX_ENTRIES:
                # Over the size cap: purge entries that have already expired.
                now = time.monotonic()
                for k in [k for k, v in _affinity_map.items() if v[2] < now]:
                    _affinity_map.pop(k, None)
    return selected, tracking_model

# -------------------------------------------------------------
# 6. API route – Generate
# -------------------------------------------------------------
@app.post("/api/generate")
async def proxy(request: Request):
    """
    Proxy a generate request to Ollama and stream the response back to the client.

    The JSON body must be an object with non-empty ``model`` and ``prompt``
    fields. When the request opts into caching (``{"nomyo": {"cache": true}}``)
    and a cache is configured, a cached response is served before any endpoint
    slot is reserved. Otherwise an endpoint is chosen via ``choose_endpoint``
    and the request is forwarded either through the OpenAI completions API or
    the Ollama client, depending on the endpoint type. OpenAI responses are
    converted to Ollama's generate format.

    Returns:
        A ``StreamingResponse`` of newline-delimited JSON
        (``application/json``).

    Raises:
        HTTPException: 400 for invalid JSON, a non-object body, or a missing
            ``model`` / ``prompt``.
    """
    try:
        body_bytes = await request.body()
        # orjson parses bytes directly and reports invalid UTF-8 as a
        # JSONDecodeError, so bad encodings become a 400 as well.
        payload = orjson.loads(body_bytes)
        if not isinstance(payload, dict):
            raise HTTPException(
                status_code=400, detail="Request body must be a JSON object"
            )

        model = payload.get("model")
        prompt = payload.get("prompt")
        suffix = payload.get("suffix")
        system = payload.get("system")
        template = payload.get("template")
        context = payload.get("context")
        stream = payload.get("stream")
        think = payload.get("think")
        raw = payload.get("raw")
        _format = payload.get("format")
        images = payload.get("images")
        options = payload.get("options")
        keep_alive = payload.get("keep_alive")
        # Tolerate ``"nomyo": null`` or a non-object value: caching stays off.
        nomyo_opts = payload.get("nomyo")
        _cache_enabled = nomyo_opts.get("cache", False) if isinstance(nomyo_opts, dict) else False

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
        if not prompt:
            raise HTTPException(
                status_code=400, detail="Missing required field 'prompt'"
            )
    except orjson.JSONDecodeError as e:
        error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
        raise HTTPException(status_code=400, detail=error_msg) from e

    # One cache key for lookup AND store. ``model`` is rewritten further down
    # for OpenAI-compatible endpoints (":latest" stripped); keying the store on
    # the rewritten name while the lookup uses the request name would make
    # those entries unreachable.
    _cache_model = model.removesuffix(":latest") if isinstance(model, str) else model

    # Cache lookup — before endpoint selection so no slot is wasted on a hit
    _cache = get_llm_cache()
    if _cache is not None and _cache_enabled:
        _cached = await _cache.get_generate(_cache_model, prompt, system or "")
        if _cached is not None:
            async def _serve_cached_generate():
                yield _cached
            return StreamingResponse(_serve_cached_generate(), media_type="application/json")

    _affinity_key = _conversation_fingerprint(model, None, prompt)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
    # choose_endpoint has reserved a slot; release it if client setup fails,
    # because the generator's ``finally`` would never run in that case.
    try:
        use_openai = is_openai_compatible(endpoint)
        if use_openai:
            if ":latest" in model:
                model = model.split(":latest")[0]
            params = {
                "prompt": prompt,
                "model": model,
            }

            # Ollama option name -> OpenAI completions parameter name.
            opts = options if isinstance(options, dict) else {}
            optional_params = {
                "stream": stream,
                "max_tokens": opts.get("num_predict"),
                "frequency_penalty": opts.get("frequency_penalty"),
                "presence_penalty": opts.get("presence_penalty"),
                "seed": opts.get("seed"),
                "stop": opts.get("stop"),
                "top_p": opts.get("top_p"),
                "temperature": opts.get("temperature"),
                "suffix": suffix,
            }
            params.update({k: v for k, v in optional_params.items() if v is not None})
            oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
        else:
            client = ollama.AsyncClient(host=endpoint)
    except Exception:
        await decrement_usage(endpoint, tracking_model)
        raise

    def _encode_line(obj) -> bytes:
        """Serialise one response object to a newline-terminated JSON line."""
        if hasattr(obj, "model_dump_json"):
            return obj.model_dump_json().encode("utf-8") + b"\n"
        # orjson.dumps already returns bytes - it must not be encoded again.
        return orjson.dumps(obj) + b"\n"

    # 4. Async generator that streams data and decrements the counter
    async def stream_generate_response():
        try:
            if use_openai:
                start_ts = time.perf_counter()
                async_gen = await oclient.completions.create(**params)
            else:
                async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
            if stream == True:
                content_parts: list[str] = []
                async for chunk in async_gen:
                    if use_openai:
                        chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
                    prompt_tok = chunk.prompt_eval_count or 0
                    comp_tok   = chunk.eval_count or 0
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                    line_bytes = _encode_line(chunk)
                    # Accumulate and store cache on done chunk — before yield so it always runs
                    if _cache is not None and _cache_enabled:
                        if getattr(chunk, "response", None):
                            content_parts.append(chunk.response)
                        if getattr(chunk, "done", False):
                            assembled = orjson.dumps({
                                k: v for k, v in {
                                    "model": getattr(chunk, "model", model),
                                    "response": "".join(content_parts),
                                    "done": True,
                                    "done_reason": getattr(chunk, "done_reason", "stop") or "stop",
                                    "prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
                                    "eval_count": getattr(chunk, "eval_count", None),
                                    "total_duration": getattr(chunk, "total_duration", None),
                                    "eval_duration": getattr(chunk, "eval_duration", None),
                                }.items() if v is not None
                            }) + b"\n"
                            try:
                                await _cache.set_generate(_cache_model, prompt, system or "", assembled)
                            except Exception as _ce:
                                # Best effort: a cache failure must not break the stream.
                                print(f"[cache] set_generate (streaming) failed: {_ce}")
                    yield line_bytes
            else:
                # Normalise to an Ollama-style response first so that token
                # accounting below covers OpenAI backends as well.
                if use_openai:
                    result = rechunk.openai_completion2ollama(async_gen, stream, start_ts)
                else:
                    result = async_gen
                cache_bytes = _encode_line(result)
                prompt_tok = result.prompt_eval_count or 0
                comp_tok   = result.eval_count or 0
                if prompt_tok != 0 or comp_tok != 0:
                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                yield cache_bytes
                # Cache non-streaming response
                if _cache is not None and _cache_enabled:
                    try:
                        await _cache.set_generate(_cache_model, prompt, system or "", cache_bytes)
                    except Exception as _ce:
                        # Best effort: a cache failure must not break the response.
                        print(f"[cache] set_generate (non-streaming) failed: {_ce}")

        finally:
            # Ensure counter is decremented even if an exception occurs
            await decrement_usage(endpoint, tracking_model)

    # 5. Return a StreamingResponse backed by the generator
    return StreamingResponse(
        stream_generate_response(),
        media_type="application/json",
    )

# -------------------------------------------------------------
# 7. API route – Chat
# -------------------------------------------------------------
@app.post("/api/chat")
async def chat_proxy(request: Request):
    """
    Proxy an Ollama-style chat request to a selected backend and stream the reply.

    The JSON body must contain ``model`` and ``messages`` (a list). The optional
    fields read here are ``tools``, ``stream``, ``think``, ``format``,
    ``keep_alive``, ``options`` (a JSON object), ``logprobs``, ``top_logprobs``
    and ``nomyo.cache`` (opt-in response caching).

    Flow:
      1. Parse and validate the body.
      2. If caching is enabled for this request and the model name does not
         start with ``moe-``, a cache hit is returned immediately.
      3. An endpoint is picked via ``choose_endpoint``. For OpenAI-compatible
         endpoints the request is translated to chat-completions parameters and
         the upstream call is made in handler scope, with retries when the
         context size is exceeded (trim messages, then also drop tools) or when
         the backend reports that image input is not supported.
      4. An inner async generator emits newline-terminated JSON, forwards token
         counts to ``token_queue``, optionally stores the reply in the cache and
         always calls ``decrement_usage`` in its ``finally`` block.

    Raises:
        HTTPException: 400 for invalid JSON, a missing ``model``, a non-list
            ``messages`` or a non-object ``options``.
        Exception: errors from the OpenAI-compatible upstream call are re-raised
            after ``decrement_usage`` has been called.
    """
    # 1. Parse and validate request
    try:
        body_bytes = await request.body()
        payload = orjson.loads(body_bytes.decode("utf-8"))

        model = payload.get("model")
        messages = payload.get("messages")
        tools = payload.get("tools")
        stream = payload.get("stream")
        think = payload.get("think")
        _format = payload.get("format")
        keep_alive = payload.get("keep_alive")
        options = payload.get("options")
        logprobs = payload.get("logprobs")
        top_logprobs = payload.get("top_logprobs")
        # Caching is opt-in per request via {"nomyo": {"cache": true}}
        _cache_enabled = payload.get("nomyo", {}).get("cache", False)

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
        if not isinstance(messages, list):
            raise HTTPException(
                status_code=400, detail="Missing or invalid 'messages' field (must be a list)"
            )
        if options is not None and not isinstance(options, dict):
            raise HTTPException(
                status_code=400, detail="`options` must be a JSON object"
            )
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # Cache lookup — before endpoint selection, always bypassed for MOE
    _is_moe = model.startswith("moe-")
    _cache = get_llm_cache()
    # Normalise model name for cache key: strip ":latest" suffix here so that
    # get_chat and set_chat use the same model string regardless of when the
    # strip happens further down (the OpenAI-compatible branch in step 2 strips
    # it from `model` as well).
    _cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
    # Snapshot original messages before any OpenAI-format transformation so that
    # get_chat and set_chat always use the same key regardless of backend type.
    _cache_messages = messages
    if _cache is not None and not _is_moe and _cache_enabled:
        _cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
        if _cached is not None:
            # Cache hit: replay the stored payload as a single chunk; no endpoint
            # is selected, so there is no usage counter to release.
            async def _serve_cached_chat():
                yield _cached
            return StreamingResponse(
                _serve_cached_chat(),
                media_type="application/x-ndjson" if stream else "application/json",
            )

    # 2. Endpoint logic
    # A "moe-" prefix selects the MOE helper path; the prefix is removed before routing.
    if model.startswith("moe-"):
        model = model.split("moe-")[1]
        opt = True
    else:
        opt = False
    _affinity_key = _conversation_fingerprint(model, messages, None)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
    use_openai = is_openai_compatible(endpoint)
    if use_openai:
        if ":latest" in model:
            model = model.split(":latest")
            model = model[0]
        if messages:
            # Image conversion is pushed to a worker thread via asyncio.to_thread
            if any("images" in m for m in messages):
                messages = await asyncio.to_thread(transform_images_to_data_urls, messages)
            messages = transform_tool_calls_to_openai(messages)
            messages = _strip_assistant_prefill(messages)
        params = {
            "messages": messages,
            "model": model,
            }
        # Map Ollama request fields/options onto chat-completions parameters;
        # entries that resolve to None are dropped before the call.
        optional_params = {
            "tools": tools,
            "stream": stream,
            "stream_options": {"include_usage": True} if stream else None,
            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
            "seed": options.get("seed") if options and "seed" in options else None,
            "stop": options.get("stop") if options and "stop" in options else None,
            "top_p": options.get("top_p") if options and "top_p" in options else None,
            "temperature": options.get("temperature") if options and "temperature" in options else None,
            "logprobs": logprobs if logprobs is not None else (options.get("logprobs") if options and "logprobs" in options else None),
            "top_logprobs": top_logprobs if top_logprobs is not None else (options.get("top_logprobs") if options and "top_logprobs" in options else None),
            "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
            }
        params.update({k: v for k, v in optional_params.items() if v is not None})
        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
    else:
        client = ollama.AsyncClient(host=endpoint)
    # For OpenAI endpoints: make the API call in handler scope
    # (try/except inside async generators is unreliable with Starlette's streaming)
    start_ts = None
    async_gen = None
    if use_openai:
        start_ts = time.perf_counter()
        # Proactive trim: only for small-ctx models we've already seen run out of space
        _lookup_model = _normalize_llama_model_name(model) if endpoint in config.llama_server_endpoints else model
        # NOTE(review): the lookup key uses the normalized name for llama-server
        # endpoints, while the stores below use the raw `model` — confirm the
        # keys match for those endpoints.
        _known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
        if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
            # Target is 3/4 of n_ctx divided by 1.2 — presumably headroom for the
            # reply plus token-estimate error; TODO confirm the rationale.
            _pre_target = int((_known_nctx - _known_nctx // 4) / 1.2)
            _pre_est = _count_message_tokens(params.get("messages", []))
            if _pre_est > _pre_target:
                _pre_msgs = params.get("messages", [])
                _pre_trimmed = _trim_messages_for_context(_pre_msgs, _known_nctx, target_tokens=_pre_target)
                _dropped = len(_pre_msgs) - len(_pre_trimmed)
                print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
                params = {**params, "messages": _pre_trimmed}
        try:
            async_gen = await oclient.chat.completions.create(**params)
        except Exception as e:
            # Every path that re-raises from here first releases the usage slot,
            # because the streaming generator (and its `finally`) never starts.
            _e_str = str(e)
            print(f"[chat_proxy] caught {type(e).__name__}: {_e_str[:200]}")
            if "exceed_context_size_error" in _e_str or "exceeds the available context size" in _e_str:
                # Prefer the structured error body; fall back to parsing the message text.
                err_body = getattr(e, "body", {}) or {}
                err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
                n_ctx_limit = err_detail.get("n_ctx", 0)
                actual_tokens = err_detail.get("n_prompt_tokens", 0)
                if not n_ctx_limit:
                    _m = re.search(r"'n_ctx':\s*(\d+)", _e_str)
                    if _m:
                        n_ctx_limit = int(_m.group(1))
                    _m = re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
                    if _m:
                        actual_tokens = int(_m.group(1))
                if not n_ctx_limit:
                    # Context limit unknown: nothing to trim against, give up.
                    await decrement_usage(endpoint, tracking_model)
                    raise
                if n_ctx_limit <= _CTX_TRIM_SMALL_LIMIT:
                    # Remember small context sizes so later requests can trim proactively.
                    _endpoint_nctx[(endpoint, model)] = n_ctx_limit
                msgs_to_trim = params.get("messages", [])
                cal_target = _calibrated_trim_target(msgs_to_trim, n_ctx_limit, actual_tokens)
                trimmed = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
                print(f"[chat_proxy] Context exceeded ({actual_tokens}/{n_ctx_limit} tokens, tiktoken_target={cal_target}), dropped {len(msgs_to_trim) - len(trimmed)} oldest message(s) and retrying")
                try:
                    # Retry 1: same parameters, trimmed messages
                    async_gen = await oclient.chat.completions.create(**{**params, "messages": trimmed})
                except Exception as e2:
                    _e2_str = str(e2)
                    if "exceed_context_size_error" in _e2_str or "exceeds the available context size" in _e2_str:
                        print(f"[chat_proxy] Context still exceeded after trimming messages, also stripping tools")
                        # Retry 2: trimmed messages and no tool definitions
                        params_no_tools = {k: v for k, v in params.items() if k not in ("tools", "tool_choice")}
                        try:
                            async_gen = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed})
                        except Exception:
                            await decrement_usage(endpoint, tracking_model)
                            raise
                    else:
                        await decrement_usage(endpoint, tracking_model)
                        raise
            elif _is_backend_connection_error(e):
                print(f"[chat_proxy] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
                await _mark_backend_unhealthy(endpoint, model, _e_str)
                await decrement_usage(endpoint, tracking_model)
                raise
            elif "image input is not supported" in _e_str:
                print(f"[chat_proxy] Model {model} doesn't support images, retrying with text-only messages")
                try:
                    params = {**params, "messages": _strip_images_from_messages(params.get("messages", []))}
                    async_gen = await oclient.chat.completions.create(**params)
                except Exception:
                    await decrement_usage(endpoint, tracking_model)
                    raise
            else:
                await decrement_usage(endpoint, tracking_model)
                raise

    # 3. Async generator that streams chat data and decrements the counter
    async def stream_chat_response():
        try:
            # OpenAI path: reuse the response obtained in handler scope.
            # Ollama path: issue the request here (MOE helper or plain client.chat).
            if use_openai:
                _async_gen = async_gen  # established in handler scope above
            else:
                if opt == True:
                    # Use the dedicated MOE helper function
                    _async_gen = await _make_moe_requests(model, messages, tools, think, _format, options, keep_alive)
                else:
                    _async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
            if stream == True:
                tc_acc = {}  # accumulate OpenAI tool-call deltas across chunks
                content_parts: list[str] = []
                async for chunk in _async_gen:
                    if use_openai:
                        _accumulate_openai_tc_delta(chunk, tc_acc)
                        chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts)
                        # Inject fully-accumulated tool calls only into the final chunk
                        if chunk.done and tc_acc and chunk.message:
                            chunk.message.tool_calls = _build_ollama_tool_calls(tc_acc)
                    # NOTE(review): the attribute access below assumes `chunk` is an
                    # object (not a plain dict) exposing prompt_eval_count/eval_count.
                    prompt_tok = chunk.prompt_eval_count or 0
                    comp_tok   = chunk.eval_count or 0
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                    if hasattr(chunk, "model_dump_json"):
                        json_line = chunk.model_dump_json()
                    else:
                        # NOTE(review): orjson.dumps returns bytes, so the
                        # `.encode("utf-8")` at the yield below would fail on this
                        # branch — confirm it is unreachable in practice.
                        json_line = orjson.dumps(chunk)
                    # Accumulate and store cache on done chunk — before yield so it always runs
                    # Works for both Ollama-native and OpenAI-compatible backends; chunks are
                    # already converted to Ollama format by rechunk before this point.
                    if getattr(chunk, "done", False):
                        # Detect context exhaustion mid-generation for small-ctx models
                        _dr = getattr(chunk, "done_reason", None)
                        # Only record n_ctx when no max_tokens limit was set — otherwise
                        # finish_reason=length might just mean max_tokens was hit,
                        # not that the context window was exhausted.
                        _req_max_tok = (
                            params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
                            if use_openai else
                            (options.get("num_predict") if options else None)
                        )
                        if _dr == "length" and not _req_max_tok:
                            # Infer the context size as prompt tokens + generated tokens
                            _pt = getattr(chunk, "prompt_eval_count", 0) or 0
                            _ct = getattr(chunk, "eval_count", 0) or 0
                            _inferred_nctx = _pt + _ct
                            if 0 < _inferred_nctx <= _CTX_TRIM_SMALL_LIMIT:
                                _endpoint_nctx[(endpoint, model)] = _inferred_nctx
                                print(f"[ctx-cache] done_reason=length → cached n_ctx={_inferred_nctx} for ({endpoint},{model})", flush=True)
                    if _cache is not None and not _is_moe and _cache_enabled:
                        if chunk.message and getattr(chunk.message, "content", None):
                            content_parts.append(chunk.message.content)
                        if getattr(chunk, "done", False):
                            # Collapse the stream into one Ollama-style final message for
                            # the cache; fields whose value is None are omitted.
                            assembled = orjson.dumps({
                                k: v for k, v in {
                                    "model": getattr(chunk, "model", model),
                                    "created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)),
                                    "message": {"role": "assistant", "content": "".join(content_parts)},
                                    "done": True,
                                    "done_reason": getattr(chunk, "done_reason", "stop") or "stop",
                                    "prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
                                    "eval_count": getattr(chunk, "eval_count", None),
                                    "total_duration": getattr(chunk, "total_duration", None),
                                    "eval_duration": getattr(chunk, "eval_duration", None),
                                }.items() if v is not None
                            }) + b"\n"
                            try:
                                await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled)
                            except Exception as _ce:
                                # Cache failures are logged and never break the response
                                print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}")
                    yield json_line.encode("utf-8") + b"\n"
            else:
                # Non-streaming: `_async_gen` is a single response object here
                if use_openai:
                    response = rechunk.openai_chat_completion2ollama(_async_gen, stream, start_ts)
                    response = response.model_dump_json()
                else:
                    response = _async_gen.model_dump_json()
                    prompt_tok = _async_gen.prompt_eval_count or 0
                    comp_tok   = _async_gen.eval_count or 0
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                # NOTE(review): both branches above already call model_dump_json(), and
                # orjson.dumps would return bytes (breaking `.encode` below) — the
                # fallback looks effectively unreachable; confirm.
                json_line = (
                    response
                    if hasattr(_async_gen, "model_dump_json")
                    else orjson.dumps(_async_gen)
                )
                cache_bytes = json_line.encode("utf-8") + b"\n"
                yield cache_bytes
                # Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
                # NOTE(review): unlike the streaming branch, this store happens after the
                # yield, so it only runs if the generator is resumed by the consumer.
                if _cache is not None and not _is_moe and _cache_enabled:
                    try:
                        await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
                    except Exception as _ce:
                        print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}")

        finally:
            # Ensure counter is decremented even if an exception occurs
            await decrement_usage(endpoint, tracking_model)

    # 4. Return a StreamingResponse backed by the generator
    media_type = "application/x-ndjson" if stream else "application/json"
    return StreamingResponse(
        stream_chat_response(),
        media_type=media_type,
    )

# -------------------------------------------------------------
# 8. API route – Embedding - deprecated
# -------------------------------------------------------------
@app.post("/api/embeddings")
async def embedding_proxy(request: Request):
    """
    Proxy a (deprecated) ``/api/embeddings`` request and reply with embeddings.

    The JSON body must contain ``model`` and ``prompt``; ``options`` and
    ``keep_alive`` are forwarded to Ollama-native backends only. The endpoint
    is picked via ``choose_endpoint``. OpenAI-compatible endpoints are called
    through the embeddings API and the result is converted with
    ``rechunk.openai_embeddings2ollama``.

    Returns:
        StreamingResponse: a single newline-terminated JSON document.

    Raises:
        HTTPException: 400 for invalid JSON or a missing ``model``/``prompt``.
    """
    # 1. Parse and validate request
    try:
        body_bytes = await request.body()
        payload = orjson.loads(body_bytes.decode("utf-8"))

        model = payload.get("model")
        prompt = payload.get("prompt")
        options = payload.get("options")
        keep_alive = payload.get("keep_alive")

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
        if not prompt:
            raise HTTPException(
                status_code=400, detail="Missing required field 'prompt'"
            )
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # 2. Endpoint logic
    endpoint, tracking_model = await choose_endpoint(model)
    use_openai = is_openai_compatible(endpoint)
    if use_openai:
        # OpenAI-compatible backends are addressed without the ":latest" tag
        if ":latest" in model:
            model = model.split(":latest")[0]
        client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
    else:
        client = ollama.AsyncClient(host=endpoint)

    # 3. Async generator that emits the embedding reply and decrements the counter
    async def stream_embedding_response():
        try:
            if use_openai:
                result = await client.embeddings.create(input=prompt, model=model)
                result = rechunk.openai_embeddings2ollama(result)
            else:
                result = await client.embeddings(model=model, prompt=prompt, options=options, keep_alive=keep_alive)
            if hasattr(result, "model_dump_json"):
                body = result.model_dump_json().encode("utf-8")
            else:
                # orjson.dumps already returns bytes; calling .encode() on it
                # (as the previous code did) would raise AttributeError.
                body = orjson.dumps(result)
            yield body + b"\n"
        finally:
            # Ensure counter is decremented even if an exception occurs
            await decrement_usage(endpoint, tracking_model)

    # 4. Return a StreamingResponse backed by the generator
    return StreamingResponse(
        stream_embedding_response(),
        media_type="application/json",
    )

# -------------------------------------------------------------
# 9. API route – Embed
# -------------------------------------------------------------
@app.post("/api/embed")
async def embed_proxy(request: Request):
    """
    Proxy an ``/api/embed`` request and reply with embeddings.

    The JSON body must contain ``model`` and ``input``; ``truncate``,
    ``options`` and ``keep_alive`` are forwarded to Ollama-native backends
    only. The endpoint is picked via ``choose_endpoint``. OpenAI-compatible
    endpoints are called through the embeddings API and the result is
    converted with ``rechunk.openai_embed2ollama``.

    Returns:
        StreamingResponse: a single newline-terminated JSON document.

    Raises:
        HTTPException: 400 for invalid JSON or a missing ``model``/``input``.
    """
    # 1. Parse and validate request
    try:
        body_bytes = await request.body()
        payload = orjson.loads(body_bytes.decode("utf-8"))

        model = payload.get("model")
        _input = payload.get("input")
        truncate = payload.get("truncate")
        options = payload.get("options")
        keep_alive = payload.get("keep_alive")

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
        if not _input:
            raise HTTPException(
                status_code=400, detail="Missing required field 'input'"
            )
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # 2. Endpoint logic
    endpoint, tracking_model = await choose_endpoint(model)
    use_openai = is_openai_compatible(endpoint)
    if use_openai:
        # OpenAI-compatible backends are addressed without the ":latest" tag
        if ":latest" in model:
            model = model.split(":latest")[0]
        client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
    else:
        client = ollama.AsyncClient(host=endpoint)

    # 3. Async generator that emits the embed reply and decrements the counter
    async def stream_embedding_response():
        try:
            if use_openai:
                result = await client.embeddings.create(input=_input, model=model)
                result = rechunk.openai_embed2ollama(result, model)
            else:
                result = await client.embed(model=model, input=_input, truncate=truncate, options=options, keep_alive=keep_alive)
            if hasattr(result, "model_dump_json"):
                body = result.model_dump_json().encode("utf-8")
            else:
                # orjson.dumps already returns bytes; calling .encode() on it
                # (as the previous code did) would raise AttributeError.
                body = orjson.dumps(result)
            yield body + b"\n"
        finally:
            # Ensure counter is decremented even if an exception occurs
            await decrement_usage(endpoint, tracking_model)

    # 4. Return a StreamingResponse backed by the generator
    return StreamingResponse(
        stream_embedding_response(),
        media_type="application/json",
    )

# -------------------------------------------------------------
# 10. API route – Create
# -------------------------------------------------------------
@app.post("/api/create")
async def create_proxy(request: Request):
    """
    Proxy a create request to all Ollama endpoints and reply with deduplicated status.

    The JSON body must contain ``model`` and at least one of ``from`` or
    ``files``; ``quantize``, ``adapters``, ``template``, ``license``,
    ``system``, ``parameters`` and ``messages`` are forwarded unchanged.
    Endpoints are contacted one after another with ``stream=False``.

    Returns:
        dict: the merged, de-duplicated fields of the per-endpoint replies.

    Raises:
        HTTPException: 400 for invalid JSON, a missing ``model``, or when
            neither ``from`` nor ``files`` is supplied.
    """
    # 1. Parse and validate request
    try:
        body_bytes = await request.body()
        payload = orjson.loads(body_bytes.decode("utf-8"))

        model = payload.get("model")
        quantize = payload.get("quantize")
        from_ = payload.get("from")
        files = payload.get("files")
        adapters = payload.get("adapters")
        template = payload.get("template")
        license_ = payload.get("license")  # trailing underscore avoids shadowing the builtin
        system = payload.get("system")
        parameters = payload.get("parameters")
        messages = payload.get("messages")

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
        if not from_ and not files:
            raise HTTPException(
                status_code=400, detail="You need to provide either from_ or files parameter!"
            )
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # 2. Iterate over all Ollama endpoints to create the model on each
    status_lists = []

    for endpoint in config.endpoints:
        # Skip OpenAI-compatible ("/v1") endpoints, consistent with the
        # copy/delete/pull/version handlers: they are not Ollama-native.
        if "/v1" in endpoint:
            continue
        client = ollama.AsyncClient(host=endpoint)
        create = await client.create(model=model, quantize=quantize, from_=from_, files=files, adapters=adapters, template=template, license=license_, system=system, parameters=parameters, messages=messages, stream=False)
        status_lists.append(create)

    # 3. Flatten the replies into (field, value) pairs and drop duplicates
    #    while keeping first-seen order
    combined_status = []
    for status_list in status_lists:
        combined_status += status_list

    final_status = list(dict.fromkeys(combined_status))

    return dict(final_status)

# -------------------------------------------------------------
# 11. API route – Show
# -------------------------------------------------------------
@app.post("/api/show")
async def show_proxy(request: Request, model: Optional[str] = None):
    """
    Forward a model "show" request to one endpoint and hand back its reply.

    The model name comes from the ``model`` query parameter when present,
    otherwise from the ``model`` field of the JSON body. Responds with 400
    when the body is not valid JSON or no model name is supplied.
    """
    # 1. Resolve the model name (the query parameter wins over the body)
    try:
        raw_body = await request.body()

        if not model:
            model = orjson.loads(raw_body.decode("utf-8")).get("model")

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
    except orjson.JSONDecodeError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc

    # 2. Pick an endpoint without reserving a usage slot
    endpoint, _ = await choose_endpoint(model, reserve=False)

    # 3. Ask that endpoint for the model details and return them as-is
    return await ollama.AsyncClient(host=endpoint).show(model=model)

# -------------------------------------------------------------
@app.get("/api/token_counts")
async def token_counts_proxy():
    """
    Report stored token counts: a grand total plus one row per database entry.
    """
    grand_total = 0
    rows = []
    async for record in db.load_token_counts():
        grand_total += record['total_tokens']
        row = {
            "endpoint": record["endpoint"],
            "model": record["model"],
            "input_tokens": record["input_tokens"],
            "output_tokens": record["output_tokens"],
            "total_tokens": record["total_tokens"],
        }
        rows.append(row)
    return {"total_tokens": grand_total, "breakdown": rows}

@app.post("/api/aggregate_time_series_days")
async def aggregate_time_series_days_proxy(request: Request):
    """
    Roll time_series entries older than ``days`` into daily aggregates.

    Optional JSON body fields: ``days`` (default 30) and ``trim_old``
    (default False). An empty or unparsable body falls back to the defaults.
    """
    days, trim_old = 30, False
    try:
        raw_body = await request.body()
        if raw_body:
            payload = orjson.loads(raw_body.decode("utf-8"))
            # Both values are evaluated before either name is rebound
            days, trim_old = (
                int(payload.get("days", 30)),
                bool(payload.get("trim_old", False)),
            )
    except Exception:
        # Any failure while reading or parsing means "use the defaults"
        days, trim_old = 30, False
    aggregated = await db.aggregate_time_series_older_than(days, trim_old=trim_old)
    return {"status": "ok", "days": days, "trim_old": trim_old, "aggregated_groups": aggregated}

# 12. API route – Stats
# -------------------------------------------------------------
@app.post("/api/stats")
async def stats_proxy(request: Request, model: Optional[str] = None):
    """
    Report token usage for one model: totals, time series and endpoint distribution.

    The model name comes from the ``model`` query parameter when present,
    otherwise from the JSON body. Responds with 400 for invalid JSON or a
    missing model name, and 404 when the database has no token data for it.
    """
    # 1. Resolve the model name (the query parameter wins over the body)
    try:
        raw_body = await request.body()

        if not model:
            model = orjson.loads(raw_body.decode("utf-8")).get("model")

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
    except orjson.JSONDecodeError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc

    # 2. Totals first; without them there is nothing to report
    totals = await db.get_token_counts_for_model(model)
    if not totals:
        raise HTTPException(
            status_code=404, detail="No token data found for this model"
        )

    # 3. Collect the time series, then the per-endpoint distribution
    series = []
    async for point in db.get_time_series_for_model(model):
        series.append(point)
    distribution = await db.get_endpoint_distribution_for_model(model)

    return {
        'model': model,
        'input_tokens': totals['input_tokens'],
        'output_tokens': totals['output_tokens'],
        'total_tokens': totals['total_tokens'],
        'time_series': series,
        'endpoint_distribution': distribution,
    }

# -------------------------------------------------------------
# 12. API route – Copy
# -------------------------------------------------------------
@app.post("/api/copy")
async def copy_proxy(request: Request, source: Optional[str] = None, destination: Optional[str] = None):
    """
    Copy a model on every Ollama endpoint and answer with a bare status code.

    ``source`` and ``destination`` are taken from the query parameters when
    either is given; otherwise both are read from the JSON body. Responds
    with 400 for invalid JSON or when one of the two names is missing.
    """
    # 1. Work out where the two names come from and validate them
    try:
        raw_body = await request.body()

        if source or destination:
            src, dst = source, destination
        else:
            payload = orjson.loads(raw_body.decode("utf-8"))
            src = payload.get("source")
            dst = payload.get("destination")

        if not src:
            raise HTTPException(
                status_code=400, detail="Missing required field 'source'"
            )
        if not dst:
            raise HTTPException(
                status_code=400, detail="Missing required field 'destination'"
            )
    except orjson.JSONDecodeError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc

    # 2. Run the copy on each endpoint in turn, skipping "/v1" endpoints
    statuses = []
    for endpoint in config.endpoints:
        if "/v1" in endpoint:
            continue
        result = await ollama.AsyncClient(host=endpoint).copy(source=src, destination=dst)
        statuses.append(result.status)

    # 3. 404 as soon as any collected status equals 404, otherwise 200
    status_code = 404 if 404 in statuses else 200
    return Response(status_code=status_code)

# -------------------------------------------------------------
# 13. API route – Delete
# -------------------------------------------------------------
@app.delete("/api/delete")
async def delete_proxy(request: Request, model: Optional[str] = None):
    """
    Delete a model on every Ollama endpoint and answer with a bare status code.

    The model name comes from the ``model`` query parameter when present,
    otherwise from the JSON body. Responds with 400 for invalid JSON or a
    missing model name.
    """
    # 1. Resolve the model name (the query parameter wins over the body)
    try:
        raw_body = await request.body()

        if not model:
            model = orjson.loads(raw_body.decode("utf-8")).get("model")

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
    except orjson.JSONDecodeError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc

    # 2. Run the delete on each endpoint in turn, skipping "/v1" endpoints
    statuses = []
    for endpoint in config.endpoints:
        if "/v1" in endpoint:
            continue
        result = await ollama.AsyncClient(host=endpoint).delete(model=model)
        statuses.append(result.status)

    # 3. 404 as soon as any collected status equals 404, otherwise 200
    status_code = 404 if 404 in statuses else 200
    return Response(status_code=status_code)

# -------------------------------------------------------------
# 14. API route – Pull
# -------------------------------------------------------------
@app.post("/api/pull")
async def pull_proxy(request: Request, model: Optional[str] = None):
    """
    Pull a model on every Ollama endpoint and report the merged status.

    The model name comes from the ``model`` query parameter when present;
    otherwise ``model`` and ``insecure`` are read from the JSON body.
    Responds with 400 for invalid JSON or a missing model name.
    """
    # 1. Resolve the model name; `insecure` is only honoured from the body
    try:
        raw_body = await request.body()

        insecure = None
        if not model:
            payload = orjson.loads(raw_body.decode("utf-8"))
            model = payload.get("model")
            insecure = payload.get("insecure")

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
    except orjson.JSONDecodeError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc

    # 2. Pull on each endpoint in turn, skipping "/v1" endpoints
    replies = []
    for endpoint in config.endpoints:
        if "/v1" in endpoint:
            continue
        reply = await ollama.AsyncClient(host=endpoint).pull(model=model, insecure=insecure, stream=False)
        replies.append(reply)

    # 3. Flatten every reply into one list of items
    flattened = [item for reply in replies for item in reply]

    # 4. Drop duplicates (first occurrence wins) and build the response dict
    unique_items = list(dict.fromkeys(flattened))
    return dict(unique_items)

# -------------------------------------------------------------
# 15. API route – Push
# -------------------------------------------------------------
@app.post("/api/push")
async def push_proxy(request: Request):
    """
    Proxy a push request to all Ollama endpoints and reply with the deduplicated status.

    The JSON body must contain ``model``; ``insecure`` is optional. Endpoints
    whose URL contains "/v1" are skipped, matching the pull route, because
    they are not addressed through the Ollama client.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or lacks the ``model`` field.
    """
    # 1. Parse and validate request
    body_bytes = await request.body()
    try:
        # orjson validates UTF-8 itself, so undecodable bytes surface as
        # JSONDecodeError (-> 400) instead of an unhandled UnicodeDecodeError.
        payload = orjson.loads(body_bytes)
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # A JSON array/scalar has no .get(); reject it instead of crashing.
    if not isinstance(payload, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object"
        )

    model = payload.get("model")
    insecure = payload.get("insecure")

    if not model:
        raise HTTPException(
            status_code=400, detail="Missing required field 'model'"
        )

    # 2. Iterate over all Ollama endpoints
    status_list = []

    for endpoint in config.endpoints:
        if "/v1" not in endpoint:
            client = ollama.AsyncClient(host=endpoint)
            # 3. Proxy a simple push request
            push = await client.push(model=model, insecure=insecure, stream=False)
            status_list.append(push)

    # Iterating a response object is expected to yield (field, value) pairs
    # — presumably pydantic model iteration; verify against the ollama client.
    combined_status = []
    for status in status_list:
        combined_status.extend(status)

    # 4. Report a deduplicated status
    final_status = list(dict.fromkeys(combined_status))

    return dict(final_status)


# -------------------------------------------------------------
# 16. API route – Version
# -------------------------------------------------------------
@app.get("/api/version")
async def version_proxy(request: Request):
    """
    Proxy a version request to Ollama and reply with the lowest version of all endpoints.

    Only endpoints whose URL does not contain "/v1" are queried. Replies that
    are not non-empty strings are ignored.

    Raises:
        HTTPException: 503 when no endpoint returned a usable version string.
    """
    # 1. Query all endpoints for version
    tasks = [fetch.endpoint_details(ep, "/api/version", "version") for ep in config.endpoints if "/v1" not in ep]
    all_versions_raw = await asyncio.gather(*tasks)

    # Filter out non-string values (e.g., empty lists from failed/timeout responses)
    all_versions = [v for v in all_versions_raw if isinstance(v, str) and v]

    if not all_versions:
        raise HTTPException(status_code=503, detail="No valid version response from any endpoint")

    def version_key(v: str) -> tuple[tuple[int, ...], int]:
        """Sort key tolerant of suffixes such as "-rc1" or "+build".

        Returns the numeric release components plus a flag that ranks a
        pre-release ("-...") below the final release with the same numbers.
        A plain int() on each component would raise ValueError for such
        versions and turn the whole request into a 500.
        """
        release, dash, _ = v.strip().partition("-")
        release = release.split("+", 1)[0]
        numbers = []
        for part in release.split("."):
            digits = re.search(r"\d+", part)
            numbers.append(int(digits.group()) if digits else 0)
        return tuple(numbers), 0 if dash else 1

    # 2. Return a JSONResponse with the min Version of all endpoints to maintain compatibility
    return JSONResponse(
        content={"version": str(min(all_versions, key=version_key))},
        status_code=200,
    )

# -------------------------------------------------------------
# 17. API route – tags
# -------------------------------------------------------------
@app.get("/api/tags")
async def tags_proxy(request: Request):
    """
    Proxy a tags request to all endpoints and reply with a unique list of all models.

    Ollama endpoints (no "/v1" in the URL) are queried via /api/tags, "/v1"
    endpoints and additional llama-server endpoints via /models. Every entry
    is relabelled in place so that it carries ``model``, ``name`` and ``id``
    keys before the combined list is deduplicated.
    """
    # 1. Query all endpoints for models
    tasks = [fetch.endpoint_details(ep, "/api/tags", "models", skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" not in ep]
    # .get(): an endpoint without a configured API key must not raise KeyError
    # (same lookup style as the llama-server query below).
    tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" in ep]
    # Also query llama-server endpoints not already covered by config.endpoints
    llama_eps_for_tags = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
    tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8) for ep in llama_eps_for_tags]
    all_models = await asyncio.gather(*tasks)

    models = []
    for modellist in all_models:
        for model in modellist:
            if "model" not in model:
                # Relabel OpenAI-style entries: derive Model.model from Model.id
                model['model'] = model['id'] + ":latest"
            else:
                model['id'] = model['model']
            if "name" not in model:
                # Relabel OpenAI-style entries: derive Model.name from Model.model
                model['name'] = model['model']
            else:
                model['id'] = model['model']
        models.extend(modellist)

    # 2. Return a JSONResponse with a deduplicated list of unique models for inference
    return JSONResponse(
        content={"models": dedupe_on_keys(models, ['digest','name','id'])},
        status_code=200,
    )

# -------------------------------------------------------------
# 18. API route – ps
# -------------------------------------------------------------
@app.get("/api/ps")
async def ps_proxy(request: Request):
    """
    Report the models currently running across Ollama and llama-server endpoints.

    Ollama endpoints are asked via /api/ps; llama-server endpoints are asked
    via /models and only entries accepted by _is_llama_model_loaded are kept.
    The merged list is deduplicated by model name.
    """
    # 1. Prepare the /api/ps queries for every Ollama endpoint
    ollama_eps = [ep for ep in config.endpoints if "/v1" not in ep]
    ollama_tasks = [
        fetch.endpoint_details(ep, "/api/ps", "models", skip_error_cache=True, timeout=8)
        for ep in ollama_eps
    ]

    # 2. Prepare the /models queries for every llama-server endpoint,
    #    including those that are not listed in config.endpoints
    all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
    llama_tasks = []
    for ep in all_llama_endpoints:
        llama_tasks.append(
            fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
        )

    ollama_loaded = []
    if ollama_tasks:
        ollama_loaded = await asyncio.gather(*ollama_tasks)
    llama_loaded = []
    if llama_tasks:
        llama_loaded = await asyncio.gather(*llama_tasks)

    running = []

    # Ollama replies are already in the target format
    for listing in ollama_loaded:
        running += listing

    # llama-server replies: keep loaded models only and reshape them into
    # an Ollama-like record
    for listing in llama_loaded:
        for entry in list(filter(_is_llama_model_loaded, listing)):
            raw_id = entry.get("id", "")
            display_name = _normalize_llama_model_name(raw_id)
            quant = _extract_llama_quant(raw_id)
            if quant:
                details = {"quantization_level": quant}
            else:
                details = {}
            running.append({
                "name": display_name,
                "id": display_name,
                "digest": "",
                "status": entry.get("status"),
                "details": details,
            })

    # 3. Deduplicate on 'name' rather than 'digest': llama-server models always
    #    carry digest="" so a digest-based dedupe would collapse them into one.
    unique_models = dedupe_on_keys(running, ['name'])
    return JSONResponse(content={"models": unique_models}, status_code=200)

# -------------------------------------------------------------
# 18b. API route – ps details (backwards compatible)
# -------------------------------------------------------------
@app.get("/api/ps_details")
async def ps_details_proxy(request: Request):
    """
    Proxy a ps request to all Ollama and llama-server endpoints and reply with per-endpoint instances.
    This keeps /api/ps backward compatible while providing richer data.

    For Ollama endpoints: queries /api/ps
    For llama-server endpoints: queries /v1/models with status info

    Unlike /api/ps the result is not deduplicated: every returned entry
    carries an "endpoint" key naming the backend it was reported by.
    For llama-server models /props is queried as well; models that /props
    reports as sleeping are sent an unload request and left out of the reply.

    Returns:
        JSONResponse with {"models": [...]} and status 200.
    """
    # 1. Query Ollama endpoints via /api/ps
    # Each task is kept as an (endpoint, awaitable) pair so results can be
    # matched back to their endpoint after gather().
    ollama_tasks = [(ep, fetch.endpoint_details(ep, "/api/ps", "models", skip_error_cache=True, timeout=8)) for ep in config.endpoints if "/v1" not in ep]
    # 2. Query llama-server endpoints via /v1/models
    # Also query endpoints from llama_server_endpoints that may not be in config.endpoints
    all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
    llama_tasks = [
        (ep, fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8))
        for ep in all_llama_endpoints
    ]

    # gather() preserves argument order, which the zip() calls below rely on.
    ollama_loaded = await asyncio.gather(*[task for _, task in ollama_tasks]) if ollama_tasks else []
    llama_loaded = await asyncio.gather(*[task for _, task in llama_tasks]) if llama_tasks else []

    models: list[dict] = []

    # Add Ollama models with endpoint info (if any)
    if ollama_loaded:
        for (endpoint, modellist) in zip([ep for ep, _ in ollama_tasks], ollama_loaded):
            for model in modellist:
                if isinstance(model, dict):
                    # Copy before tagging so the fetched entry itself is not mutated.
                    model_with_endpoint = dict(model)
                    model_with_endpoint["endpoint"] = endpoint
                    models.append(model_with_endpoint)

    # Add llama-server models with endpoint info and full status metadata (if any)
    if llama_loaded:
        # Collect (endpoint, raw_id) pairs to fetch /props in parallel
        # props_requests and llama_models_pending are appended to in lockstep,
        # so index i of one always belongs to index i of the other.
        props_requests: list[tuple[str, str]] = []
        llama_models_pending: list[dict] = []

        for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
            # Include sleeping models too so _fetch_llama_props can unload them
            loaded_models = [item for item in modellist if _is_llama_model_loaded_or_sleeping(item)]
            for item in loaded_models:
                if isinstance(item, dict) and item.get("id"):
                    raw_id = item["id"]
                    normalized = _normalize_llama_model_name(raw_id)
                    quant = _extract_llama_quant(raw_id)
                    model_with_endpoint = {
                        "name": normalized,
                        "id": normalized,
                        "original_name": raw_id,
                        "digest": "",
                        "details": {"quantization_level": quant} if quant else {},
                        "endpoint": endpoint,
                        "status": item.get("status"),
                        "created": item.get("created"),
                        "owned_by": item.get("owned_by")
                    }
                    # Include full llama-server status details (args, preset)
                    status_info = item.get("status", {})
                    if isinstance(status_info, dict):
                        model_with_endpoint["llama_status_args"] = status_info.get("args")
                        model_with_endpoint["llama_status_preset"] = status_info.get("preset")
                    llama_models_pending.append(model_with_endpoint)
                    props_requests.append((endpoint, raw_id))

        # Fetch /props for each llama-server model to get context length (n_ctx)
        # and unload sleeping models automatically
        async def _fetch_llama_props(endpoint: str, model_id: str) -> tuple[int | None, bool, bool]:
            """Query /props for one model.

            Returns (n_ctx, is_sleeping, is_generation). Any non-200 reply or
            exception yields (None, False, False). As a side effect a model
            reported as sleeping is sent a POST to /models/unload.
            """
            client: aiohttp.ClientSession = get_session(endpoint)
            # /props lives next to, not under, the /v1 prefix.
            base_url = endpoint.rstrip("/").removesuffix("/v1")
            # NOTE(review): model_id is interpolated into the query string
            # without URL-encoding — confirm ids never contain '&', '#' or '+'.
            props_url = f"{base_url}/props?model={model_id}"
            headers = None
            api_key = config.api_keys.get(endpoint)
            if api_key:
                headers = {"Authorization": f"Bearer {api_key}"}
            try:
                async with client.get(props_url, headers=headers, timeout=aiohttp.ClientTimeout(total=5)) as resp:
                    if resp.status == 200:
                        data = await resp.json()
                        dgs = data.get("default_generation_settings", {})
                        n_ctx = dgs.get("n_ctx")
                        is_sleeping = data.get("is_sleeping", False)
                        # Embedding models have no sampling params in default_generation_settings
                        is_generation = "temperature" in dgs

                        if is_sleeping:
                            unload_url = f"{base_url}/models/unload"
                            # Best effort: an unload failure is logged but does
                            # not change the values returned for this model.
                            try:
                                async with client.post(
                                    unload_url,
                                    json={"model": model_id},
                                    headers=headers,
                                ) as unload_resp:
                                    print(f"[ps_details] Unloaded sleeping model {model_id} from {endpoint}: {unload_resp.status}")
                            except Exception as ue:
                                print(f"[ps_details] Failed to unload sleeping model {model_id} from {endpoint}: {ue}")

                        return n_ctx, is_sleeping, is_generation
            except Exception as e:
                print(f"[ps_details] Failed to fetch props from {props_url}: {e}")
            # Reached on a non-200 status or after a logged exception.
            return None, False, False

        props_results = await asyncio.gather(
            *[_fetch_llama_props(ep, mid) for ep, mid in props_requests]
        )

        for (ep, raw_id), model_dict, (n_ctx, is_sleeping, is_generation) in zip(props_requests, llama_models_pending, props_results):
            if n_ctx is not None:
                model_dict["context_length"] = n_ctx
                # Remember n_ctx in the shared cache only for generation models
                # whose context is positive and within _CTX_TRIM_SMALL_LIMIT.
                if is_generation and 0 < n_ctx <= _CTX_TRIM_SMALL_LIMIT:
                    normalized = _normalize_llama_model_name(raw_id)
                    _endpoint_nctx[(ep, normalized)] = n_ctx
                    print(f"[ctx-cache/ps] cached n_ctx={n_ctx} for ({ep},{normalized})", flush=True)
            # Sleeping models were asked to unload above, so they are not listed.
            if not is_sleeping:
                models.append(model_dict)

    return JSONResponse(content={"models": models}, status_code=200)

# -------------------------------------------------------------
# 18c. Conversation-affinity stats – feeds the PS-table dot matrix
# -------------------------------------------------------------
@app.get("/api/affinity_stats")
async def affinity_stats(request: Request):
    """
    List the live conversation-affinity pins, one record per pinned conversation.

    A record holds only the endpoint, the model and the remaining TTL in
    seconds; fingerprints and content are never exposed. Expired pins found
    while scanning are removed from the map. With conversation_affinity
    switched off, `entries` is always an empty list.
    """
    if not config.conversation_affinity:
        return {"enabled": False, "ttl": config.conversation_affinity_ttl, "entries": []}

    now = time.monotonic()
    pins: list[dict] = []
    llama_eps = set(config.llama_server_endpoints)

    async with _affinity_lock:
        # Iterate over a snapshot because expired pins are popped on the fly.
        for fingerprint, pin in list(_affinity_map.items()):
            pinned_ep, pinned_model, deadline = pin
            seconds_left = deadline - now
            if seconds_left <= 0:
                _affinity_map.pop(fingerprint, None)
            else:
                # Use the same normalisation as /api/ps_details so that the
                # dashboard can join pins to PS rows by (endpoint, model).
                if pinned_ep in llama_eps:
                    shown_model = _normalize_llama_model_name(pinned_model)
                else:
                    shown_model = pinned_model
                pins.append({
                    "endpoint": pinned_ep,
                    "model": shown_model,
                    "remaining": round(seconds_left, 2),
                })

    return {
        "enabled": True,
        "ttl": config.conversation_affinity_ttl,
        "entries": pins,
    }

# -------------------------------------------------------------
# 19. Proxy usage route – for monitoring
# -------------------------------------------------------------
@app.get("/api/usage")
async def usage_proxy(request: Request):
    """
    Expose the current usage counters and token usage counters.
    Intended for debugging and monitoring.
    """
    snapshot = {
        "usage_counts": usage_counts,
        "token_usage_counts": token_usage_counts,
    }
    return snapshot

from backends.probe import _raw_probe, _endpoint_health


# -------------------------------------------------------------
# 20b. Proxy config route – for monitoring and frontend usage
# -------------------------------------------------------------
@app.get("/api/config")
async def config_proxy(request: Request):
    """
    Report the configured endpoints and llama_server_endpoints together
    with the health information returned by _endpoint_health for each of
    them, plus whether a router API key is required. The front-end uses
    this to show which endpoints are proxied and how they are doing.
    Status is "error" when either liveness (/api/version) or routing
    health (/api/ps) fails — see issue #83.
    """
    async def _probe(url: str) -> dict:
        # Start with the URL, then overlay whatever the health check reports.
        health = await _endpoint_health(url, timeout=5)
        record = {"url": url}
        record.update(health)
        return record

    endpoint_probes = [_probe(ep) for ep in config.endpoints]
    ollama_results = await asyncio.gather(*endpoint_probes)

    if config.llama_server_endpoints:
        llama_probes = [_probe(ep) for ep in config.llama_server_endpoints]
        llama_results = await asyncio.gather(*llama_probes)
    else:
        llama_results = []

    key_required = bool(config.router_api_key)
    return {
        "endpoints": ollama_results,
        "llama_server_endpoints": llama_results,
        "require_router_api_key": key_required,
    }

# -------------------------------------------------------------
# 21. API route – OpenAI compatible Embedding
# -------------------------------------------------------------
@app.post("/v1/embeddings")
async def openai_embedding_proxy(request: Request):
    """
    Proxy an OpenAI API compatible embedding request to a backend and reply with embeddings.

    The JSON body must contain ``model`` and ``input``. Multimodal content
    parts are reduced to their text; non-text parts are dropped. Non-finite
    float values in the returned embeddings are replaced by 0.0 so the
    response can be serialised as JSON.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or lacks ``model`` / a non-empty ``input``.
    """
    # 1. Parse and validate request
    body_bytes = await request.body()
    try:
        # orjson validates UTF-8 itself, so undecodable bytes surface as
        # JSONDecodeError (-> 400) instead of an unhandled UnicodeDecodeError.
        payload = orjson.loads(body_bytes)
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # A JSON array/scalar has no .get(); reject it instead of crashing.
    if not isinstance(payload, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object"
        )

    model = payload.get("model")
    doc = payload.get("input")

    # Normalize multimodal input: extract only text parts for embedding models
    if isinstance(doc, list):
        normalized = []
        for item in doc:
            if isinstance(item, dict):
                # Multimodal content part - extract text only, skip images
                if item.get("type") == "text":
                    normalized.append(item.get("text", ""))
                # Skip image_url and other non-text types
            else:
                normalized.append(item)
        # Collapse a single-element list to its element, but only when that
        # element is itself a valid input (string or token list). A one-token
        # input such as [5] must stay a list; a bare int is not a valid input.
        if len(normalized) == 1 and isinstance(normalized[0], (str, list)):
            doc = normalized[0]
        else:
            doc = normalized
    elif isinstance(doc, dict) and doc.get("type") == "text":
        doc = doc.get("text", "")

    if not model:
        raise HTTPException(
            status_code=400, detail="Missing required field 'model'"
        )
    if not doc:
        raise HTTPException(
            status_code=400, detail="Missing required field 'input'"
        )

    # 2. Endpoint logic
    endpoint, tracking_model = await choose_endpoint(model)

    # Everything after endpoint selection runs under try/finally so that
    # decrement_usage is called even if building the client fails.
    try:
        if is_openai_compatible(endpoint):
            api_key = config.api_keys.get(endpoint, "no-key")
        else:
            api_key = "ollama"
        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=api_key)

        response = await oclient.embeddings.create(input=doc, model=model)
        result = response.model_dump()
        for item in result.get("data", []):
            emb = item.get("embedding")
            if emb:
                # NaN/Infinity cannot be represented in JSON; zero them out.
                item["embedding"] = [0.0 if isinstance(v, float) and not math.isfinite(v) else v for v in emb]
        return JSONResponse(content=result)
    finally:
        await decrement_usage(endpoint, tracking_model)

# -------------------------------------------------------------
# 22. API route – OpenAI compatible Chat Completions
# -------------------------------------------------------------
@app.post("/v1/chat/completions")
async def openai_chat_completions_proxy(request: Request):
    """
    Proxy an OpenAI API compatible chat completions request to Ollama and reply with a streaming response.

    """
    # 1. Parse and validate request
    try:
        body_bytes = await request.body()
        payload = orjson.loads(body_bytes.decode("utf-8"))

        model = payload.get("model")
        messages = payload.get("messages")
        frequency_penalty = payload.get("frequency_penalty")
        presence_penalty = payload.get("presence_penalty")
        response_format = payload.get("response_format")
        seed = payload.get("seed")
        stop = payload.get("stop")
        stream = payload.get("stream")
        stream_options = payload.get("stream_options")
        temperature = payload.get("temperature")
        top_p = payload.get("top_p")
        max_tokens = payload.get("max_tokens")
        max_completion_tokens = payload.get("max_completion_tokens")
        tools = payload.get("tools")
        logprobs = payload.get("logprobs")
        top_logprobs = payload.get("top_logprobs")
        _cache_enabled = payload.get("nomyo", {}).get("cache", False)

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
        if not isinstance(messages, list):
            raise HTTPException(
                status_code=400, detail="Missing required field 'messages' (must be a list)"
            )

        if ":latest" in model:
            model = model.split(":latest")
            model = model[0]

        messages = _strip_assistant_prefill(messages)
        params = {
            "messages": messages,
            "model": model,
        }

        optional_params = {
            "tools": tools,
            "response_format": response_format,
            "stream_options": stream_options or {"include_usage": True },
            "max_completion_tokens": max_completion_tokens,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "seed": seed,
            "presence_penalty": presence_penalty,
            "frequency_penalty": frequency_penalty,
            "stop": stop,
            "stream": stream,
            "logprobs": logprobs,
            "top_logprobs": top_logprobs,
        }

        params.update({k: v for k, v in optional_params.items() if v is not None})
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # Reject unsupported image formats (SVG) before doing any work
    for _msg in messages:
        for _item in (_msg.get("content") or []) if isinstance(_msg.get("content"), list) else []:
            if _item.get("type") == "image_url":
                _url = (_item.get("image_url") or {}).get("url", "")
                if _url.startswith("data:image/svg") or _url.lower().endswith(".svg"):
                    raise HTTPException(
                        status_code=400,
                        detail="SVG images are not supported. Please convert the image to PNG or JPEG before sending.",
                    )

    # Cache lookup — before endpoint selection
    _cache = get_llm_cache()
    if _cache is not None and _cache_enabled:
        _cached = await _cache.get_chat("openai_chat", model, messages)
        if _cached is not None:
            if stream:
                _sse = openai_nonstream_to_sse(_cached, model)
                async def _serve_cached_ochat_stream():
                    yield _sse
                return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream")
            else:
                async def _serve_cached_ochat_json():
                    yield _cached
                return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")

    # 2. Endpoint logic
    _affinity_key = _conversation_fingerprint(model, messages, None)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
    oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
    # 3. Helpers and API call — done in handler scope so try/except works reliably
    async def _normalize_images_in_messages(msgs: list) -> list:
        """Fetch remote image URLs and convert them to base64 data URLs so
        Ollama/llama-server can handle them without making outbound HTTP requests."""
        resolved = []
        for msg in msgs:
            content = msg.get("content")
            if not isinstance(content, list):
                resolved.append(msg)
                continue
            new_content = []
            for item in content:
                if item.get("type") == "image_url":
                    url = (item.get("image_url") or {}).get("url", "")
                    if url and not url.startswith("data:"):
                        try:
                            http: aiohttp.ClientSession = app_state["session"]
                            async with http.get(url) as resp:
                                ctype = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
                                img_bytes = await resp.read()
                            b64 = base64.b64encode(img_bytes).decode("utf-8")
                            new_content.append({
                                "type": "image_url",
                                "image_url": {"url": f"data:{ctype};base64,{b64}"}
                            })
                        except Exception as _ie:
                            print(f"[image] Failed to fetch image URL: {_ie}")
                            new_content.append(item)
                    else:
                        new_content.append(item)
                else:
                    new_content.append(item)
            resolved.append({**msg, "content": new_content})
        return resolved

    # Make the API call in handler scope — try/except inside async generators is unreliable
    # with Starlette's streaming machinery, so we resolve errors here before the generator starts.
    send_params = params
    if not is_ext_openai_endpoint(endpoint):
        resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
        send_params = {**params, "messages": resolved_msgs}
    # Proactive trim: only for small-ctx models we've already seen run out of space
    _lookup_model = _normalize_llama_model_name(model) if endpoint in config.llama_server_endpoints else model
    _known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
    if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
        _pre_target = int(((_known_nctx - _known_nctx // 4)) / 1.2)
        _pre_est = _count_message_tokens(send_params.get("messages", []))
        if _pre_est > _pre_target:
            _pre_msgs = send_params.get("messages", [])
            _pre_trimmed = _trim_messages_for_context(_pre_msgs, _known_nctx, target_tokens=_pre_target)
            _dropped = len(_pre_msgs) - len(_pre_trimmed)
            print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
            send_params = {**send_params, "messages": _pre_trimmed}
    try:
        async_gen = await oclient.chat.completions.create(**send_params)
    except Exception as e:
        _e_str = str(e)
        _is_ctx_err = "exceed_context_size_error" in _e_str or "exceeds the available context size" in _e_str
        print(f"[ochat] caught={type(e).__name__} ctx={_is_ctx_err} msg={_e_str[:120]}", flush=True)
        if "does not support tools" in _e_str:
            # Model doesn't support tools — retry without them
            print(f"[ochat] retry: no tools", flush=True)
            try:
                params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
                async_gen = await oclient.chat.completions.create(**params_without_tools)
            except Exception:
                await decrement_usage(endpoint, tracking_model)
                raise
        elif _is_ctx_err:
            # Backend context limit hit — apply sliding-window trim (context-shift at message level)
            err_body = getattr(e, "body", {}) or {}
            err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
            n_ctx_limit = err_detail.get("n_ctx", 0)
            actual_tokens = err_detail.get("n_prompt_tokens", 0)
            # Fallback: parse from string if body parsing yielded nothing (SDK may not parse llama-server errors)
            if not n_ctx_limit:
                import re as _re
                _m = _re.search(r"'n_ctx':\s*(\d+)", _e_str)
                if _m:
                    n_ctx_limit = int(_m.group(1))
                _m = _re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
                if _m:
                    actual_tokens = int(_m.group(1))
            print(f"[ctx-trim] n_ctx={n_ctx_limit} actual={actual_tokens}", flush=True)
            if not n_ctx_limit:
                await decrement_usage(endpoint, tracking_model)
                raise
            if n_ctx_limit <= _CTX_TRIM_SMALL_LIMIT:
                _endpoint_nctx[(endpoint, model)] = n_ctx_limit

            msgs_to_trim = send_params.get("messages", [])
            try:
                cal_target = _calibrated_trim_target(msgs_to_trim, n_ctx_limit, actual_tokens)
                trimmed_messages = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
            except Exception as _helper_exc:
                print(f"[ctx-trim] helper crash: {type(_helper_exc).__name__}: {str(_helper_exc)[:100]}", flush=True)
                await decrement_usage(endpoint, tracking_model)
                raise
            dropped = len(msgs_to_trim) - len(trimmed_messages)
            print(f"[ctx-trim] target={cal_target} dropped={dropped} remaining={len(trimmed_messages)} retrying-1", flush=True)
            try:
                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": trimmed_messages})
                print(f"[ctx-trim] retry-1 ok", flush=True)
            except Exception as e2:
                _e2_str = str(e2)
                if "exceed_context_size_error" in _e2_str or "exceeds the available context size" in _e2_str:
                    # Still too large — tool definitions likely consuming too many tokens, strip them too
                    print(f"[ctx-trim] retry-1 still exceeded, stripping tools retrying-2", flush=True)
                    params_no_tools = {k: v for k, v in send_params.items() if k not in ("tools", "tool_choice")}
                    try:
                        async_gen = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed_messages})
                        print(f"[ctx-trim] retry-2 ok", flush=True)
                    except Exception:
                        await decrement_usage(endpoint, tracking_model)
                        raise
                else:
                    await decrement_usage(endpoint, tracking_model)
                    raise
        elif _is_backend_connection_error(e):
            # Upstream connection failed (e.g. llama-server in router mode
            # whose delegated worker died). Mark (endpoint, model) so the
            # next request reroutes; the client will retry this one.
            print(f"[ochat] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
            await _mark_backend_unhealthy(endpoint, model, _e_str)
            await decrement_usage(endpoint, tracking_model)
            raise
        elif "image input is not supported" in _e_str:
            # Model doesn't support images — strip and retry
            print(f"[openai_chat_completions_proxy] Model {model} doesn't support images, retrying with text-only messages")
            try:
                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": _strip_images_from_messages(send_params.get("messages", []))})
            except Exception:
                await decrement_usage(endpoint, tracking_model)
                raise
        else:
            await decrement_usage(endpoint, tracking_model)
            raise

    # 4. Async generator — only streams the already-established async_gen
    async def stream_ochat_response():
        try:
            if stream == True:
                content_parts: list[str] = []
                usage_snapshot: dict = {}
                async for chunk in async_gen:
                    data = (
                        chunk.model_dump_json()
                        if hasattr(chunk, "model_dump_json")
                        else orjson.dumps(chunk)
                    )
                    if chunk.choices:
                        delta = chunk.choices[0].delta
                        has_content = delta.content is not None
                        has_reasoning = (
                            getattr(delta, "reasoning_content", None) is not None
                            or getattr(delta, "reasoning", None) is not None
                        )
                        has_tool_calls = getattr(delta, "tool_calls", None) is not None
                        if has_content or has_reasoning or has_tool_calls:
                            yield f"data: {data}\n\n".encode("utf-8")
                        if has_content and delta.content:
                            content_parts.append(delta.content)
                    elif chunk.usage is not None:
                        # Forward the usage-only final chunk (e.g. from llama-server)
                        yield f"data: {data}\n\n".encode("utf-8")
                    prompt_tok = 0
                    comp_tok = 0
                    if chunk.usage is not None:
                        prompt_tok = chunk.usage.prompt_tokens or 0
                        comp_tok   = chunk.usage.completion_tokens or 0
                        usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
                    else:
                        llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
                        if llama_usage:
                            prompt_tok, comp_tok = llama_usage
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                    # Detect context exhaustion mid-generation for small-ctx models.
                    # Guard: skip if max_tokens was set in the request — finish_reason=length
                    # could just mean the caller's token budget was exhausted, not the context window.
                    _req_max_tok = send_params.get("max_tokens") or send_params.get("max_completion_tokens")
                    if chunk.choices and chunk.choices[0].finish_reason == "length" and not _req_max_tok:
                        _inferred_nctx = (prompt_tok + comp_tok) or 0
                        if 0 < _inferred_nctx <= _CTX_TRIM_SMALL_LIMIT:
                            _endpoint_nctx[(endpoint, model)] = _inferred_nctx
                            print(f"[ctx-cache] finish_reason=length → cached n_ctx={_inferred_nctx} for ({endpoint},{model})", flush=True)
                # Cache assembled streaming response — before [DONE] so it always runs
                if _cache is not None and _cache_enabled and content_parts:
                    assembled = orjson.dumps({
                        "model": model,
                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
                        **({"usage": usage_snapshot} if usage_snapshot else {}),
                    }) + b"\n"
                    try:
                        await _cache.set_chat("openai_chat", model, messages, assembled)
                    except Exception as _ce:
                        print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}")
                yield b"data: [DONE]\n\n"
            else:
                prompt_tok = 0
                comp_tok = 0
                if async_gen.usage is not None:
                    prompt_tok = async_gen.usage.prompt_tokens or 0
                    comp_tok   = async_gen.usage.completion_tokens or 0
                else:
                    llama_usage = rechunk.extract_usage_from_llama_timings(async_gen)
                    if llama_usage:
                        prompt_tok, comp_tok = llama_usage
                if prompt_tok != 0 or comp_tok != 0:
                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                json_line = (
                    async_gen.model_dump_json()
                    if hasattr(async_gen, "model_dump_json")
                    else orjson.dumps(async_gen)
                )
                cache_bytes = json_line.encode("utf-8") + b"\n"
                yield cache_bytes
                # Cache non-streaming response
                if _cache is not None and _cache_enabled:
                    try:
                        await _cache.set_chat("openai_chat", model, messages, cache_bytes)
                    except Exception as _ce:
                        print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}")

        finally:
            # Ensure counter is decremented even if an exception occurs
            await decrement_usage(endpoint, tracking_model)

    # 4. Return a StreamingResponse backed by the generator
    return StreamingResponse(
        stream_ochat_response(),
        media_type="text/event-stream" if stream else "application/json",
    )

# -------------------------------------------------------------
# 23. API route – OpenAI compatible Completions
# -------------------------------------------------------------
@app.post("/v1/completions")
async def openai_completions_proxy(request: Request):
    """
    Proxy an OpenAI API compatible (legacy) text completions request to the
    chosen endpoint and reply with a streaming response.

    Request body (JSON object):
        model   (str, required) – model name; everything from ":latest" onwards is dropped
        prompt  (required)      – prompt forwarded to the upstream completions API
        frequency_penalty, presence_penalty, seed, stop, stream, stream_options,
        temperature, top_p, max_tokens, suffix (optional) – forwarded when present
        max_completion_tokens (optional) – used as `max_tokens` when that is absent
        nomyo.cache (bool, optional) – opt in to the LLM response cache

    Returns:
        StreamingResponse – `text/event-stream` when `stream` is truthy,
        otherwise a single `application/json` document.

    Raises:
        HTTPException(400) – the body is not a valid JSON object, or `model` /
        `prompt` is missing. Exceptions from the upstream call are re-raised
        after the usage counter has been released.
    """
    # 1. Parse and validate request
    try:
        body_bytes = await request.body()
        # orjson parses bytes directly and reports invalid UTF-8 as a
        # JSONDecodeError, so a malformed body yields 400 instead of an
        # unhandled UnicodeDecodeError.
        payload = orjson.loads(body_bytes)
        if not isinstance(payload, dict):
            raise HTTPException(
                status_code=400, detail="Request body must be a JSON object"
            )

        model = payload.get("model")
        prompt = payload.get("prompt")
        frequency_penalty = payload.get("frequency_penalty")
        presence_penalty = payload.get("presence_penalty")
        seed = payload.get("seed")
        stop = payload.get("stop")
        stream = payload.get("stream")
        stream_options = payload.get("stream_options")
        temperature = payload.get("temperature")
        top_p = payload.get("top_p")
        max_tokens = payload.get("max_tokens")
        max_completion_tokens = payload.get("max_completion_tokens")
        suffix = payload.get("suffix")
        # `or {}` also covers an explicit `"nomyo": null`
        _cache_enabled = (payload.get("nomyo") or {}).get("cache", False)

        if not model:
            raise HTTPException(
                status_code=400, detail="Missing required field 'model'"
            )
        if not prompt:
            raise HTTPException(
                status_code=400, detail="Missing required field 'prompt'"
            )

        if ":latest" in model:
            model = model.split(":latest")[0]

        # Usage reporting is requested by default, but only for streaming
        # calls: `stream_options` is a streaming-only request field and strict
        # backends reject it otherwise. A client-supplied value is forwarded
        # untouched when not streaming.
        if stream:
            stream_options = stream_options or {"include_usage": True}

        # The completions client call has no `max_completion_tokens` keyword
        # (that is a chat-completions field); honour it as the `max_tokens`
        # budget instead of failing with a TypeError.
        if max_tokens is None:
            max_tokens = max_completion_tokens

        params = {
            "prompt": prompt,
            "model": model,
        }

        optional_params = {
            "frequency_penalty": frequency_penalty,
            "presence_penalty": presence_penalty,
            "seed": seed,
            "stop": stop,
            "stream": stream,
            "stream_options": stream_options,
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
            "suffix": suffix,
        }

        # Only forward the options that were actually supplied
        params.update({k: v for k, v in optional_params.items() if v is not None})
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # Cache lookup — completions prompt mapped to a single-turn messages list
    _cache = get_llm_cache()
    _compl_messages = [{"role": "user", "content": prompt}]
    if _cache is not None and _cache_enabled:
        _cached = await _cache.get_chat("openai_completions", model, _compl_messages)
        if _cached is not None:
            if stream:
                _sse = openai_nonstream_to_sse(_cached, model)
                async def _serve_cached_ocompl_stream():
                    yield _sse
                return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream")
            else:
                async def _serve_cached_ocompl_json():
                    yield _cached
                return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json")

    # 2. Endpoint logic
    _affinity_key = _conversation_fingerprint(model, None, prompt)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
    oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))

    # 3. Make the API call in handler scope (try/except inside async generators is unreliable)
    try:
        async_gen = await oclient.completions.create(**params)
    except Exception as e:
        if _is_backend_connection_error(e):
            print(f"[ocompl] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
            await _mark_backend_unhealthy(endpoint, model, str(e))
        await decrement_usage(endpoint, tracking_model)
        raise

    def _to_json_text(obj) -> str:
        """Serialise an upstream object to JSON text (orjson emits bytes, so decode)."""
        if hasattr(obj, "model_dump_json"):
            return obj.model_dump_json()
        return orjson.dumps(obj).decode("utf-8")

    async def stream_ocompletions_response(model=model):
        """Relay the already-established upstream response, record token usage and fill the cache."""
        try:
            if stream == True:
                text_parts: list[str] = []
                usage_snapshot: dict = {}
                async for chunk in async_gen:
                    data = _to_json_text(chunk)
                    if chunk.choices:
                        choice = chunk.choices[0]
                        has_text = getattr(choice, "text", None) is not None
                        has_reasoning = (
                            getattr(choice, "reasoning_content", None) is not None
                            or getattr(choice, "reasoning", None) is not None
                        )
                        if has_text or has_reasoning or choice.finish_reason is not None:
                            yield f"data: {data}\n\n".encode("utf-8")
                        if has_text and choice.text:
                            text_parts.append(choice.text)
                    elif chunk.usage is not None:
                        # Forward the usage-only final chunk (e.g. from llama-server)
                        yield f"data: {data}\n\n".encode("utf-8")
                    prompt_tok = 0
                    comp_tok = 0
                    if chunk.usage is not None:
                        prompt_tok = chunk.usage.prompt_tokens or 0
                        comp_tok   = chunk.usage.completion_tokens or 0
                        usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
                    else:
                        # No usage object on this chunk: fall back to llama timings
                        llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
                        if llama_usage:
                            prompt_tok, comp_tok = llama_usage
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                # Cache assembled streaming response — before [DONE] so it always runs
                if _cache is not None and _cache_enabled and text_parts:
                    assembled = orjson.dumps({
                        "model": model,
                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
                        **({"usage": usage_snapshot} if usage_snapshot else {}),
                    }) + b"\n"
                    try:
                        await _cache.set_chat("openai_completions", model, _compl_messages, assembled)
                    except Exception as _ce:
                        # Cache failures must never break the client response
                        print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}")
                # Final DONE event
                yield b"data: [DONE]\n\n"
            else:
                prompt_tok = 0
                comp_tok = 0
                if async_gen.usage is not None:
                    prompt_tok = async_gen.usage.prompt_tokens or 0
                    comp_tok   = async_gen.usage.completion_tokens or 0
                else:
                    llama_usage = rechunk.extract_usage_from_llama_timings(async_gen)
                    if llama_usage:
                        prompt_tok, comp_tok = llama_usage
                if prompt_tok != 0 or comp_tok != 0:
                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
                json_line = _to_json_text(async_gen)
                cache_bytes = json_line.encode("utf-8") + b"\n"
                yield cache_bytes
                # Cache non-streaming response
                if _cache is not None and _cache_enabled:
                    try:
                        await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
                    except Exception as _ce:
                        print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}")

        finally:
            # Ensure counter is decremented even if an exception occurs
            await decrement_usage(endpoint, tracking_model)

    # 4. Return a StreamingResponse backed by the generator
    return StreamingResponse(
        stream_ocompletions_response(),
        media_type="text/event-stream" if stream else "application/json",
    )

# -------------------------------------------------------------
# 24. OpenAI API compatible models endpoint
# -------------------------------------------------------------
@app.get("/v1/models")
async def openai_models_proxy(request: Request):
    """
    Proxy an OpenAI API models request to all configured backends and reply
    with a de-duplicated list of models.

    Three groups of endpoints are queried through `fetch.endpoint_details`:
      * entries of `config.endpoints` without "/v1" in the URL → `/api/tags`
      * entries of `config.endpoints` accepted by `is_ext_openai_endpoint` → `/models`
      * every entry of `config.llama_server_endpoints` → `/models`

    No "loaded" filter is applied in this handler: whatever each endpoint
    lists is returned. Every entry is normalised to carry both `id` and
    `name`, and the result is de-duplicated on `name`.

    Returns:
        JSONResponse – `{"data": [...]}` with status 200.
    """
    # 1. Ollama endpoints: all models via /api/tags
    ollama_tasks = [fetch.endpoint_details(ep, "/api/tags", "models", skip_error_cache=True, timeout=8) for ep in config.endpoints if "/v1" not in ep]
    # 2. External OpenAI endpoints (Groq, OpenAI, etc.) via /models
    ext_openai_tasks = [fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8) for ep in config.endpoints if is_ext_openai_endpoint(ep)]
    # 3. llama-server endpoints via /models, including those that are not part
    #    of config.endpoints. dict.fromkeys de-duplicates while keeping the
    #    configured order, so the response order is deterministic.
    llama_endpoints = list(dict.fromkeys(config.llama_server_endpoints))
    llama_tasks = [
        fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
        for ep in llama_endpoints
    ]

    # Run every probe in a single gather so the three groups overlap instead
    # of waiting for each other; results come back in task order
    # (Ollama, external OpenAI, llama-server).
    all_tasks = [*ollama_tasks, *ext_openai_tasks, *llama_tasks]
    model_lists = await asyncio.gather(*all_tasks) if all_tasks else []

    merged = []
    for modellist in model_lists:
        for model in modellist:
            if "id" not in model:
                # Relabel Ollama-style entries: OpenAI Model.id taken from Model.name
                model['id'] = model.get('name', '')
            else:
                model['name'] = model['id']
            merged.append(model)

    # 4. Return a JSONResponse with a deduplicated list of unique models for inference
    return JSONResponse(
        content={"data": dedupe_on_keys(merged, ['name'])},
        status_code=200,
    )

# -------------------------------------------------------------
# 25. API route – OpenAI/Jina/Cohere compatible Rerank
# -------------------------------------------------------------
@app.post("/v1/rerank")
@app.post("/rerank")
async def rerank_proxy(request: Request):
    """
    Proxy a rerank request to a llama-server or external OpenAI-compatible endpoint.

    Compatible with the Jina/Cohere rerank API convention used by llama-server,
    vLLM, and services such as Cohere and Jina AI.

    Ollama does not natively support reranking; requests routed to a plain Ollama
    endpoint will receive a 501 Not Implemented response.

    Request body:
        model           (str, required)  – reranker model name
        query           (str, required)  – search query
        documents       (list[str], required) – candidate documents to rank
        top_n           (int, optional)  – limit returned results (default: all)
        return_documents (bool, optional) – include document text in results
        max_tokens_per_doc (int, optional) – truncation limit per document

    Response (Jina/Cohere-compatible):
        {
          "id": "...",
          "model": "...",
          "usage": {"prompt_tokens": N, "total_tokens": N},
          "results": [{"index": 0, "relevance_score": 0.95}, ...]
        }

    Raises:
        HTTPException(400) – invalid JSON body or missing/invalid required field
        HTTPException(404) – `choose_endpoint` raised RuntimeError for the model
        HTTPException(501) – the chosen endpoint is not OpenAI-compatible
        HTTPException(502) – the upstream answered with a body that is not JSON
        HTTPException(<upstream status>) – the upstream answered with status >= 400
    """
    try:
        body_bytes = await request.body()
        # orjson accepts bytes and reports invalid UTF-8 as JSONDecodeError,
        # so a malformed body becomes a 400 rather than an unhandled error.
        payload = orjson.loads(body_bytes)
        if not isinstance(payload, dict):
            raise HTTPException(status_code=400, detail="Request body must be a JSON object")

        model = payload.get("model")
        query = payload.get("query")
        documents = payload.get("documents")

        if not model:
            raise HTTPException(status_code=400, detail="Missing required field 'model'")
        if not query:
            raise HTTPException(status_code=400, detail="Missing required field 'query'")
        if not isinstance(documents, list) or not documents:
            raise HTTPException(status_code=400, detail="Missing or empty required field 'documents' (must be a non-empty list)")
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

    # Determine which endpoint serves this model
    try:
        endpoint, tracking_model = await choose_endpoint(model)
    except RuntimeError as e:
        raise HTTPException(status_code=404, detail=str(e)) from e

    # From here on an endpoint has been chosen; the single `finally` below
    # releases its usage counter on every exit path, including failures while
    # the upstream request is still being prepared.
    try:
        # Ollama endpoints have no native rerank support
        if not is_openai_compatible(endpoint):
            raise HTTPException(
                status_code=501,
                detail=(
                    f"Endpoint '{endpoint}' is a plain Ollama instance which does not support "
                    "reranking. Use a llama-server or OpenAI-compatible endpoint with a "
                    "dedicated reranker model."
                ),
            )

        if ":latest" in model:
            model = model.split(":latest")[0]

        # Build upstream rerank request body – forward only recognised fields
        upstream_payload: dict = {"model": model, "query": query, "documents": documents}
        for optional_key in ("top_n", "return_documents", "max_tokens_per_doc"):
            if optional_key in payload:
                upstream_payload[optional_key] = payload[optional_key]

        # Determine upstream URL:
        #   llama-server exposes /v1/rerank (endpoint may or may not already contain /v1)
        #   External OpenAI endpoints expose /rerank under their /v1 base
        if endpoint in config.llama_server_endpoints:
            if "/v1" in endpoint:
                rerank_url = f"{endpoint}/rerank"
            else:
                rerank_url = f"{endpoint}/v1/rerank"
        else:
            # External OpenAI-compatible: ep2base gives us the /v1 base
            rerank_url = f"{ep2base(endpoint)}/rerank"

        api_key = config.api_keys.get(endpoint, "no-key")
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        client: aiohttp.ClientSession = get_session(endpoint)
        async with client.post(rerank_url, json=upstream_payload, headers=headers) as resp:
            response_bytes = await resp.read()
            if resp.status >= 400:
                # Relay the upstream error, with secrets masked
                raise HTTPException(
                    status_code=resp.status,
                    detail=_mask_secrets(response_bytes.decode("utf-8", errors="replace")),
                )

        try:
            data = orjson.loads(response_bytes)
        except orjson.JSONDecodeError as e:
            # A successful status with an unparsable body is an upstream fault
            raise HTTPException(status_code=502, detail=f"Upstream returned invalid JSON: {e}") from e

        # Record token usage if the upstream returned a usage object
        usage = (data.get("usage") or {}) if isinstance(data, dict) else {}
        prompt_tok = usage.get("prompt_tokens") or 0
        total_tok = usage.get("total_tokens") or 0
        # For reranking there are no completion tokens; we record prompt tokens only
        if prompt_tok or total_tok:
            await token_queue.put((endpoint, tracking_model, prompt_tok, 0))

        return JSONResponse(content=data)
    finally:
        await decrement_usage(endpoint, tracking_model)

# -------------------------------------------------------------
# 25b. Cache management endpoints
# -------------------------------------------------------------
@app.get("/api/cache/stats")
async def cache_stats():
    """
    Report the state of the LLM response cache.

    Returns `{"enabled": False}` when no cache is configured; otherwise
    `{"enabled": True}` merged with whatever the cache's `stats()` returns.
    """
    cache = get_llm_cache()
    if cache is not None:
        return {"enabled": True, **cache.stats()}
    return {"enabled": False}


@app.post("/api/cache/invalidate")
async def cache_invalidate():
    """
    Empty the LLM response cache via its `clear()` method.

    The reply reports whether a cache is configured (`enabled`) and whether
    it was cleared (`cleared`).
    """
    cache = get_llm_cache()
    if cache is not None:
        await cache.clear()
        return {"enabled": True, "cleared": True}
    return {"enabled": False, "cleared": False}


# -------------------------------------------------------------
# 26. Serve the static front‑end
# -------------------------------------------------------------
# Serve static assets from STATIC_DIR (resolved relative to this script)
# instead of a path relative to the process working directory, so the mount
# works no matter where the router is launched from and uses the same
# directory the dashboard page is read from.
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

@app.get("/favicon.ico")
async def redirect_favicon():
    """Redirect the root-level favicon request to the copy under /static."""
    favicon_url = "/static/favicon.ico"
    return RedirectResponse(url=favicon_url)

@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """
    Serve the NOMYO Router dashboard page (`index.html` from STATIC_DIR).

    Raises:
        HTTPException(404) – the page file does not exist.
        HTTPException(500) – any other failure while reading or building the response.
    """
    page = STATIC_DIR / "index.html"
    try:
        html = page.read_text(encoding="utf-8")
        return HTMLResponse(content=html, status_code=200)
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="Page not found")
    except Exception:
        # Deliberately opaque: no internal details are exposed to the client
        raise HTTPException(status_code=500, detail="Internal server error")

# -------------------------------------------------------------
# 26b. Health endpoint
# -------------------------------------------------------------
@app.get("/health")
async def health_proxy(request: Request):
    """
    Health‑check endpoint for monitoring the proxy.

    Every entry of `config.endpoints`, plus any entry of
    `config.llama_server_endpoints` not already listed there, is probed
    concurrently through `_endpoint_health`.

    Response body:
        - `status`: "ok" if every probe result reports status "ok", otherwise "error".
        - `endpoints`: mapping of endpoint URL → that endpoint's probe result.

    The HTTP status code is 200 when everything is healthy, 503 otherwise.
    """
    # The probing itself lives in `_endpoint_health`. Per the note kept with
    # this handler: Ollama endpoints expose /api/version (liveness) and
    # /api/ps (routing health, required by `choose_endpoint`), while
    # OpenAI-compatible endpoints expose /models for both purposes. Probing
    # /api/version alone would miss an Ollama process that is up while
    # /api/ps is failing — see issue #83.
    targets = [
        *config.endpoints,
        *(ep for ep in config.llama_server_endpoints if ep not in config.endpoints),
    ]

    # Run all health checks in parallel; results keep the order of `targets`.
    results = await asyncio.gather(*map(_endpoint_health, targets))
    healthy = all(result.get("status") == "ok" for result in results)

    return JSONResponse(
        content={
            "status": "ok" if healthy else "error",
            "endpoints": dict(zip(targets, results)),
        },
        status_code=200 if healthy else 503,
    )

# -------------------------------------------------------------
# 27. Hostname endpoint
# -------------------------------------------------------------
@app.get("/api/hostname")
async def get_hostname():
    """Report the hostname of the machine the router process runs on, as `{"hostname": ...}`."""
    body = {"hostname": socket.gethostname()}
    return JSONResponse(content=body)

# -------------------------------------------------------------
# 28. SSE route for usage broadcasts
# -------------------------------------------------------------
@app.get("/api/usage-stream")
async def usage_stream(request: Request):
    """
    Server‑Sent‑Events stream of usage snapshots.

    A queue is obtained from `subscribe()`; every item received on it is
    emitted as one `data:` SSE message. The stream ends when the client is
    found disconnected or when `None` arrives on the queue.
    """
    async def event_generator():
        # Per-connection queue that receives each new snapshot
        snapshots = await subscribe()
        try:
            # Disconnection is checked before each wait on the queue
            while not await request.is_disconnected():
                snapshot = await snapshots.get()
                if snapshot is None:
                    # None ends the stream (presumably the shutdown sentinel — confirm against the broadcaster)
                    return
                # One snapshot == one SSE message
                yield f"data: {snapshot}\n\n"
        finally:
            # Always unsubscribe from the broadcast channel, however the stream ended
            await unsubscribe(snapshots)

    return StreamingResponse(event_generator(), media_type="text/event-stream")

# -------------------------------------------------------------
# 29. FastAPI startup/shutdown events
# -------------------------------------------------------------
@app.on_event("startup")
async def startup_event() -> None:
    """
    Initialise router state when the application starts.

    In order: load the YAML configuration, open the token database and seed
    `token_usage_counts` from it, create the shared aiohttp session, create
    httpx clients for external OpenAI endpoints and per-endpoint Unix socket
    sessions/clients, start the token worker and buffer flush background
    tasks, and initialise the LLM response cache.

    Rebinds the module globals `config`, `db`, `token_worker_task` and
    `flush_task`, and fills the "connector", "session", "httpx_clients" and
    "socket_sessions" entries of `app_state`.
    """
    global config, db, token_worker_task, flush_task
    # Load YAML config (or use defaults if not present)
    config_path = _config_path_from_env()
    config = Config.from_yaml(config_path)
    if config_path.exists():
        print(
            f"Loaded configuration from {config_path}:\n"
            f" endpoints={config.endpoints},\n"
            f" llama_server_endpoints={config.llama_server_endpoints},\n"
            f" max_concurrent_connections={config.max_concurrent_connections},\n"
            f" endpoint_config={config.endpoint_config},\n"
            f" priority_routing={config.priority_routing}"
        )
    else:
        print(
            f"No configuration file found at {config_path}. "
            "Falling back to default settings."
        )

    # Initialize database
    db = TokenDatabase(config.db_path)
    await db.init_db()

    # Load existing token counts from database; only the per-(endpoint, model)
    # total is kept in memory.
    async for count_entry in db.load_token_counts():
        endpoint = count_entry['endpoint']
        model = count_entry['model']
        token_usage_counts[endpoint][model] = count_entry['total_tokens']

    # Shared aiohttp session used for regular (TCP) endpoints
    ssl_context = ssl.create_default_context()
    connector = aiohttp.TCPConnector(limit=0, limit_per_host=512, ssl=ssl_context)
    # NOTE(review): sock_read (120) exceeds total (60), so the total budget
    # looks like the effective limit — confirm this is intended.
    timeout = aiohttp.ClientTimeout(total=60, connect=15, sock_read=120, sock_connect=15)
    session = aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers={"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")},
    )

    app_state["connector"] = connector
    app_state["session"] = session

    # Create httpx clients for external OpenAI endpoints (Google, etc.)
    # aiohttp strips Referer headers for cross-origin requests, so we use httpx
    for ep in config.endpoints:
        if is_ext_openai_endpoint(ep):
            app_state["httpx_clients"][ep] = httpx.AsyncClient(timeout=30.0)

    # Create per-endpoint Unix socket sessions for .sock endpoints
    for ep in config.llama_server_endpoints:
        if _is_unix_socket_endpoint(ep):
            sock_path = _get_socket_path(ep)
            sock_connector = aiohttp.UnixConnector(path=sock_path)
            sock_timeout = aiohttp.ClientTimeout(total=300, connect=5, sock_read=300)
            sock_session = aiohttp.ClientSession(connector=sock_connector, timeout=sock_timeout)
            app_state["socket_sessions"][ep] = sock_session
            # Matching httpx client that talks over the same Unix domain socket
            transport = httpx.AsyncHTTPTransport(uds=sock_path)
            app_state["httpx_clients"][ep] = httpx.AsyncClient(transport=transport, timeout=300.0)
            print(f"[startup] Unix socket session: {ep} -> {sock_path}")

    # Background workers, then the response cache
    token_worker_task = asyncio.create_task(token_worker())
    flush_task = asyncio.create_task(flush_buffer())
    await init_llm_cache(config)

@app.on_event("shutdown")
async def shutdown_event() -> None:
    """
    Release router resources when the application stops.

    Order: close SSE queues, cancel the background tasks, flush remaining
    buffers, close the shared aiohttp session, the Unix socket sessions and
    all httpx clients, and finally close the token database.

    Every step is best-effort: a failure is printed and the remaining steps
    still run, so the database connection is always closed.
    """
    try:
        await close_all_sse_queues()
    except Exception as e:
        print(f"[shutdown] Error closing SSE queues: {e}")

    # Stop background tasks first so they stop touching the DB before we close it.
    for t in (token_worker_task, flush_task):
        if t is not None:
            t.cancel()
            try:
                await t
            except (asyncio.CancelledError, Exception):
                # Expected: the task was just cancelled; its outcome is irrelevant now
                pass

    try:
        await flush_remaining_buffers()
    except Exception as e:
        print(f"[shutdown] Error flushing remaining buffers: {e}")

    # Close the shared aiohttp session (absent if startup did not get that far)
    session = app_state.get("session")
    if session is not None:
        try:
            await session.close()
        except Exception as e:
            print(f"[shutdown] Error closing shared session: {e}")

    # Close Unix socket sessions
    for ep, sess in list(app_state.get("socket_sessions", {}).items()):
        try:
            await sess.close()
            print(f"[shutdown] Closed Unix socket session: {ep}")
        except Exception as e:
            print(f"[shutdown] Error closing Unix socket session {ep}: {e}")

    # Close all httpx clients (external OpenAI endpoints and Unix socket endpoints)
    for ep, client in list(app_state.get("httpx_clients", {}).items()):
        try:
            await client.aclose()
            print(f"[shutdown] Closed httpx client: {ep}")
        except Exception as e:
            print(f"[shutdown] Error closing httpx client {ep}: {e}")

    # Close the aiosqlite connection last — its worker thread is non-daemon
    # and would otherwise keep the interpreter alive after lifespan completes.
    if db is not None:
        try:
            await db.close()
            print("[shutdown] Closed token DB connection.")
        except Exception as e:
            print(f"[shutdown] Error closing DB: {e}")
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								"""
-												bump version

											
										
										
											2026-03-05 11:09:20 +01:00
+								title: NOMYO Router - an (O)llama and OpenAI API v1 Proxy with Endpoint:Model aware routing
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								author: alpha-nerd-nomyo
 								author_url: https://github.com/nomyo-ai
-												fix: removed dead config key

											
										
										
											2026-05-13 14:59:05 +02:00
+								version: 0.9
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								license: AGPL
 								"""
 								# -------------------------------------------------------------
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket, httpx, hashlib
-												proposal: use global truststore ctx for all connections

											
										
										
											2026-02-12 16:15:39 +01:00
+								try:
 								    import truststore; truststore.inject_into_ssl()
 								except ImportError:
 								    pass
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
+								from datetime import datetime, timezone
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								from pathlib import Path
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
 								# Directory containing static files (relative to this script)
 								STATIC_DIR = Path(__file__).parent / "static"
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								from typing import Dict, Set, List, Optional
-												add: Optional router-level API key that gates router/API/web UI access

Optional router-level API key that gates router/API/web UI access (leave empty to disable)

## Supplying the router API key

If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

- HTTP header (recommended): `Authorization: Bearer <router_key>`
- Query param (fallback): `?api_key=<router_key>`

Examples:
```bash
curl -H "Authorization: Bearer $NOMYO_ROUTER_API_KEY" http://localhost:12434/api/tags
curl "http://localhost:12434/api/tags?api_key=$NOMYO_ROUTER_API_KEY"
```

											
										
										
											2026-01-14 09:28:02 +01:00
+								from urllib.parse import urlparse, parse_qsl, urlencode
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								from fastapi import FastAPI, Request, HTTPException
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								from fastapi_sse import sse_handler
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								from fastapi.staticfiles import StaticFiles
-												adding CORS middleware

											
										
										
											2025-09-11 09:46:19 +02:00
+								from fastapi.middleware.cors import CORSMiddleware
-												Add files via upload

Adding:
- health endpoint
- extended /api/config
- frontend extension of backend server viz
											
										
										
											2025-08-30 12:43:35 +02:00
+								from starlette.responses import StreamingResponse, JSONResponse, Response, HTMLResponse, RedirectResponse
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								from pydantic import Field
 								from pydantic_settings import BaseSettings
 								from collections import defaultdict
-												new requirement for image preprocessing to downsize and convert to png for faster and safer transaction

											
										
										
											2025-09-24 11:46:38 +02:00
+								from PIL import Image
-												Add files via upload

switching to custom cache logic for faster cache invalidation in faulty cache scenarios
removing aiocache dependency
											
										
										
											2025-09-01 13:38:49 +02:00
-												refac: split into modules I

											
										
										
											2026-05-19 10:05:27 +02:00
+								from security import _mask_secrets
 								from context_window import (
 								    _count_message_tokens,
 								    _trim_messages_for_context,
 								    _calibrated_trim_target,
 								    _endpoint_nctx,
 								    _CTX_TRIM_SMALL_LIMIT,
 								)
-												refac: modularize global states III

											
										
										
											2026-05-19 11:18:06 +02:00
+								from state import (
 								    _models_cache,
 								    _loaded_models_cache,
 								    _available_error_cache,
 								    _loaded_error_cache,
 								    _completion_error_cache,
 								    _COMPLETION_ERROR_TTL,
 								    _models_cache_lock,
 								    _loaded_models_cache_lock,
 								    _available_error_cache_lock,
 								    _loaded_error_cache_lock,
 								    _completion_error_cache_lock,
 								    _inflight_available_models,
 								    _inflight_loaded_models,
 								    _inflight_lock,
 								    _bg_refresh_available,
 								    _bg_refresh_loaded,
 								    _bg_refresh_lock,
 								    _subscribers,
 								    _subscribers_lock,
 								    token_queue,
 								    app_state,
 								    token_buffer,
 								    time_series_buffer,
 								    buffer_lock,
 								    FLUSH_INTERVAL,
 								)
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
-												refac: modularize global states III

											
										
										
											2026-05-19 11:18:06 +02:00
+								# Rebound on startup — must stay in router.py module namespace.
-												stopping the token_worker_task gracefully on shutdown

											
										
										
											2025-11-13 10:13:10 +01:00
+								token_worker_task: asyncio.Task | None = None
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
+								flush_task: asyncio.Task | None = None
-												refac: modularize config II

											
										
										
											2026-05-19 11:00:50 +02:00
+								from config import Config, _config_path_from_env
-												Add Docker support

Adds comprehensive docker support

											
										
										
											2025-11-07 13:59:16 +01:00
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								from ollama._types import TokenLogprob, Logprob
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
+								from db import TokenDatabase
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
-												Add Docker support

Adds comprehensive docker support

											
										
										
											2025-11-07 13:59:16 +01:00
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								# Create the global config object – it will be overwritten on startup.
 								# Submodules read it lazily via config.get_config().
-												Add Docker support

Adds comprehensive docker support

											
										
										
											2025-11-07 13:59:16 +01:00
+								config = Config.from_yaml(_config_path_from_env())
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
 								# -------------------------------------------------------------
 								# 2. FastAPI application
 								# -------------------------------------------------------------
 								app = FastAPI()
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								sse_handler.app = app
-												adding CORS middleware

											
										
										
											2025-09-11 09:46:19 +02:00
+								app.add_middleware(
 								    CORSMiddleware,
 								    allow_origins=["*"],
 								    allow_credentials=True,
 								    allow_methods=["GET", "POST", "DELETE"],
 								    allow_headers=["Authorization", "Content-Type"],
 								)
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from state import default_headers
-												fixing headers, using ollama.Responses in rechunk class, fixing reseverd words var usage, fixing embedding output, fixing model naming in frontend

											
										
										
											2025-09-21 16:20:36 +02:00
-												add: Optional router-level API key that gates router/API/web UI access

Optional router-level API key that gates router/API/web UI access (leave empty to disable)

## Supplying the router API key

If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

- HTTP header (recommended): `Authorization: Bearer <router_key>`
- Query param (fallback): `?api_key=<router_key>`

Examples:
```bash
curl -H "Authorization: Bearer $NOMYO_ROUTER_API_KEY" http://localhost:12434/api/tags
curl "http://localhost:12434/api/tags?api_key=$NOMYO_ROUTER_API_KEY"
```

											
										
										
											2026-01-14 09:28:02 +01:00
+								# -------------------------------------------------------------
 								# Router-level authentication (optional)
 								# -------------------------------------------------------------
 								def _extract_router_api_key(request: Request) -> Optional[str]:
 								    """
 								    Extract the provided router API key from the Authorization header or `api_key`
 								    query parameter. The middleware uses this to gate access to API routes when
 								    a router_api_key is configured.
 								    """
 								    auth_header = request.headers.get("Authorization")
 								    if auth_header and auth_header.lower().startswith("bearer "):
-												Empty key strings could bypass authentication in _extract_router_api_key()  when malformed Authorization headers were sent
- Added validation to check that the extracted key is not empty before returning it
- Added CORS headers to enforce_router_api_key() for proper cross-origin request handling and CORS-related error prevention

											
										
										
											2026-01-26 18:11:28 +01:00
+								        key = auth_header.split(" ", 1)[1].strip()
 								        if key:  # Ensure key is not empty
 								            return key
-												add: Optional router-level API key that gates router/API/web UI access

Optional router-level API key that gates router/API/web UI access (leave empty to disable)

## Supplying the router API key

If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

- HTTP header (recommended): `Authorization: Bearer <router_key>`
- Query param (fallback): `?api_key=<router_key>`

Examples:
```bash
curl -H "Authorization: Bearer $NOMYO_ROUTER_API_KEY" http://localhost:12434/api/tags
curl "http://localhost:12434/api/tags?api_key=$NOMYO_ROUTER_API_KEY"
```

											
										
										
											2026-01-14 09:28:02 +01:00
+								    query_key = request.query_params.get("api_key")
 								    if query_key:
 								        return query_key
 								    return None
 								def _strip_api_key_from_scope(request: Request) -> None:
 								    """
 								    Remove api_key from the ASGI scope query string to avoid leaking it in logs.
 								    """
 								    scope = request.scope
 								    raw_qs = scope.get("query_string", b"")
 								    if not raw_qs:
 								        return
 								    params = parse_qsl(raw_qs.decode("utf-8"), keep_blank_values=True)
 								    filtered = [(k, v) for (k, v) in params if k != "api_key"]
 								    scope["query_string"] = urlencode(filtered).encode("utf-8")
 								@app.middleware("http")
 								async def enforce_router_api_key(request: Request, call_next):
 								    """
 								    Enforce the optional NOMYO Router API key for all non-static requests.
 								    When `config.router_api_key` is set, clients must supply the key either in
 								    the Authorization header (`Bearer <key>`) or as `api_key` query parameter.
 								    """
 								    expected_key = config.router_api_key
 								    if not expected_key or request.method == "OPTIONS":
 								        return await call_next(request)
 								    path = request.url.path
-												fix: security, exempt files to prevent path traversal

											
										
										
											2026-04-10 17:40:44 +02:00
+								    # Allow static assets (CSS, JS, images, fonts) but NOT HTML pages,
 								    # which would bypass auth by accessing /static/index.html directly.
 								    _STATIC_ASSET_EXTS = {".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".svg", ".woff", ".woff2", ".ttf", ".map"}
 								    is_static_asset = path.startswith("/static") and Path(path).suffix.lower() in _STATIC_ASSET_EXTS
 								    if is_static_asset or path in {"/", "/favicon.ico"}:
-												add: Optional router-level API key that gates router/API/web UI access

Optional router-level API key that gates router/API/web UI access (leave empty to disable)

## Supplying the router API key

If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

- HTTP header (recommended): `Authorization: Bearer <router_key>`
- Query param (fallback): `?api_key=<router_key>`

Examples:
```bash
curl -H "Authorization: Bearer $NOMYO_ROUTER_API_KEY" http://localhost:12434/api/tags
curl "http://localhost:12434/api/tags?api_key=$NOMYO_ROUTER_API_KEY"
```

											
										
										
											2026-01-14 09:28:02 +01:00
+								        return await call_next(request)
 								    provided_key = _extract_router_api_key(request)
 								    # Strip the api_key query param from scope so access logs do not leak it
 								    _strip_api_key_from_scope(request)
 								    if provided_key is None:
-												feat: enforce api key authentication and update table header

- Added proper API key validation in router.py with 401 response when key is missing
- Implemented CORS headers for authentication requests
- Updated table header from "Until" to "Unload" in static/index.html
- Improved security by preventing API key leakage in access logs

											
										
										
											2026-02-01 10:05:46 +01:00
+								        # No key provided but authentication is required - return 401
 								        headers = {}
-												Empty key strings could bypass authentication in _extract_router_api_key()  when malformed Authorization headers were sent
- Added validation to check that the extracted key is not empty before returning it
- Added CORS headers to enforce_router_api_key() for proper cross-origin request handling and CORS-related error prevention

											
										
										
											2026-01-26 18:11:28 +01:00
+								        if "/api/" in path and path != "/api/usage-stream":
-												feat: enforce api key authentication and update table header

- Added proper API key validation in router.py with 401 response when key is missing
- Implemented CORS headers for authentication requests
- Updated table header from "Until" to "Unload" in static/index.html
- Improved security by preventing API key leakage in access logs

											
										
										
											2026-02-01 10:05:46 +01:00
+								            headers = {
 								                "Access-Control-Allow-Origin": "*",
 								                "Access-Control-Allow-Headers": "Authorization, Content-Type",
 								                "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
 								            }
 								        return JSONResponse(
 								            content={"detail": "Missing NOMYO Router API key"},
 								            status_code=401,
 								            headers=headers,
 								        )
-												add: Optional router-level API key that gates router/API/web UI access

Optional router-level API key that gates router/API/web UI access (leave empty to disable)

## Supplying the router API key

If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

- HTTP header (recommended): `Authorization: Bearer <router_key>`
- Query param (fallback): `?api_key=<router_key>`

Examples:
```bash
curl -H "Authorization: Bearer $NOMYO_ROUTER_API_KEY" http://localhost:12434/api/tags
curl "http://localhost:12434/api/tags?api_key=$NOMYO_ROUTER_API_KEY"
```

											
										
										
											2026-01-14 09:28:02 +01:00
 								    if not secrets.compare_digest(str(provided_key), str(expected_key)):
 								        return JSONResponse(
 								            content={"detail": "Invalid NOMYO Router API key"},
 								            status_code=403,
 								        )
-												Empty key strings could bypass authentication in _extract_router_api_key()  when malformed Authorization headers were sent
- Added validation to check that the extracted key is not empty before returning it
- Added CORS headers to enforce_router_api_key() for proper cross-origin request handling and CORS-related error prevention

											
										
										
											2026-01-26 18:11:28 +01:00
+								    response = await call_next(request)
 								    # Add CORS headers for authenticated API requests
 								    if "/api/" in path and path != "/api/usage-stream":
 								        response.headers["Access-Control-Allow-Origin"] = "*"
 								        response.headers["Access-Control-Allow-Headers"] = "Authorization, Content-Type"
 								        response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
 								    return response
-												feat: correct pass through of openai.APIStatusErrors

											
										
										
											2026-05-08 12:19:03 +02:00
 								@app.exception_handler(openai.APIStatusError)
 								async def _openai_api_status_error_handler(request: Request, exc: openai.APIStatusError):
 								    """Forward upstream OpenAI-SDK status errors with their original status code and body
 								    instead of letting them bubble up as 500s."""
 								    body = exc.body if exc.body is not None else {"error": {"message": str(exc), "code": exc.status_code}}
 								    return JSONResponse(status_code=exc.status_code, content=body)
-												refac: modularize global states III

											
										
										
											2026-05-19 11:18:06 +02:00
+								from state import (
 								    usage_counts,
 								    token_usage_counts,
 								    usage_lock,
 								    token_usage_lock,
 								    _affinity_map,
 								    _affinity_lock,
 								    _AFFINITY_MAX_ENTRIES,
 								)
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
-												refac: split into modules I

											
										
										
											2026-05-19 10:05:27 +02:00
+								from fingerprint import _conversation_fingerprint
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
+								# Database instance
 								db: "TokenDatabase" = None
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								# 4. Helperfunctions
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from backends.normalize import (
 								    _normalize_llama_model_name,
 								    _extract_llama_quant,
 								    ep2base,
 								    dedupe_on_keys,
 								)
 								from backends.sessions import (
 								    _is_unix_socket_endpoint,
 								    _get_socket_path,
 								    get_session,
 								    _make_openai_client,
 								)
 								from backends.health import (
 								    _is_fresh,
 								    _ensure_success,
 								    _format_connection_issue,
 								    _is_backend_connection_error,
 								    _mark_backend_unhealthy,
 								    _is_llama_model_loaded,
 								    _is_llama_model_loaded_or_sleeping,
 								)
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from backends.normalize import (
 								    is_ext_openai_endpoint,
 								    is_openai_compatible,
 								    get_tracking_model,
 								)
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								async def token_worker() -> None:
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
+								    try:
 								        while True:
 								            endpoint, model, prompt, comp = await token_queue.get()
-												feat: add timestamp index and improve cache concurrency

- Added index on token_time_series timestamp for faster queries
- Introduced cache locks to prevent race conditions

											
										
										
											2026-01-16 16:47:24 +01:00
+								            # Calculate timestamp once before acquiring lock
 								            now = datetime.now(tz=timezone.utc)
 								            timestamp = int(datetime(now.year, now.month, now.day, now.hour, now.minute, tzinfo=timezone.utc).timestamp())
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
+								            # Accumulate counts in memory buffer (protected by lock)
 								            async with buffer_lock:
 								                token_buffer[endpoint][model] = (
 								                    token_buffer[endpoint].get(model, (0, 0))[0] + prompt,
 								                    token_buffer[endpoint].get(model, (0, 0))[1] + comp
 								                )
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
+								                # Add to time series buffer with timestamp (UTC)
 								                time_series_buffer.append({
 								                    'endpoint': endpoint,
 								                    'model': model,
 								                    'input_tokens': prompt,
 								                    'output_tokens': comp,
 								                    'total_tokens': prompt + comp,
 								                    'timestamp': timestamp
 								                })
 								            # Update in-memory counts for immediate reporting
 								            async with token_usage_lock:
 								                token_usage_counts[endpoint][model] += (prompt + comp)
-												fix: usage locks now release before the subscriber queue awaits

											
										
										
											2026-04-07 15:30:52 +02:00
+								                snapshot = _capture_snapshot()
 								            await _distribute_snapshot(snapshot)
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
+								    except asyncio.CancelledError:
 								        # Gracefully handle task cancellation during shutdown
 								        print("[token_worker] Task cancelled, processing remaining queue items...")
 								        # Process any remaining items in the queue before exiting
 								        while not token_queue.empty():
 								            try:
 								                endpoint, model, prompt, comp = token_queue.get_nowait()
-												feat: add timestamp index and improve cache concurrency

- Added index on token_time_series timestamp for faster queries
- Introduced cache locks to prevent race conditions

											
										
										
											2026-01-16 16:47:24 +01:00
+								                # Calculate timestamp once before acquiring lock
 								                now = datetime.now(tz=timezone.utc)
 								                timestamp = int(datetime(now.year, now.month, now.day, now.hour, now.minute, tzinfo=timezone.utc).timestamp())
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
+								                async with buffer_lock:
 								                    token_buffer[endpoint][model] = (
 								                        token_buffer[endpoint].get(model, (0, 0))[0] + prompt,
 								                        token_buffer[endpoint].get(model, (0, 0))[1] + comp
 								                    )
 								                    time_series_buffer.append({
 								                        'endpoint': endpoint,
 								                        'model': model,
 								                        'input_tokens': prompt,
 								                        'output_tokens': comp,
 								                        'total_tokens': prompt + comp,
 								                        'timestamp': timestamp
 								                    })
 								                async with token_usage_lock:
 								                    token_usage_counts[endpoint][model] += (prompt + comp)
-												fix: usage locks now release before the subscriber queue awaits

											
										
										
											2026-04-07 15:30:52 +02:00
+								                    snapshot = _capture_snapshot()
 								                await _distribute_snapshot(snapshot)
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
+								            except asyncio.QueueEmpty:
 								                break
 								        print("[token_worker] Task cancelled, remaining items processed.")
 								        raise
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
 								async def flush_buffer() -> None:
 								    """
 								    Background task: every FLUSH_INTERVAL seconds move the in-memory token
 								    counts and time-series entries into the database.
 								    On cancellation one last flush is attempted (errors are only logged)
 								    before the CancelledError is re-raised.
 								    """
 								    async def _drain_and_persist() -> None:
 								        # Snapshot and reset both buffers while holding the lock ...
 								        async with buffer_lock:
 								            pending_counts = None
 								            if token_buffer:
 								                pending_counts = {ep: dict(per_model) for ep, per_model in token_buffer.items()}
 								                token_buffer.clear()
 								            pending_series = None
 								            if time_series_buffer:
 								                pending_series = list(time_series_buffer)
 								                time_series_buffer.clear()
 								        # ... and write to the DB only after the lock has been released.
 								        if pending_counts:
 								            await db.update_batched_counts(pending_counts)
 								        if pending_series:
 								            await db.add_batched_time_series(pending_series)
 								    try:
 								        while True:
 								            await asyncio.sleep(FLUSH_INTERVAL)
 								            await _drain_and_persist()
 								    except asyncio.CancelledError:
 								        # Shutdown path: persist whatever is still buffered, then propagate.
 								        print("[flush_buffer] Task cancelled, flushing remaining buffers...")
 								        try:
 								            await _drain_and_persist()
 								            print("[flush_buffer] Task cancelled, remaining buffers flushed.")
 								        except Exception as e:
 								            print(f"[flush_buffer] Error during shutdown flush: {e}")
 								        raise
-												record and display total token usage on ollama endpoints using ollama client

											
										
										
											2025-11-04 17:55:19 +01:00
-												refactor: use a persistent WAL-enabled connection with async locks

- Introduce a lazily initialized, shared aiosqlite connection stored in self._db and two asyncio locks (_db_lock, _operation_lock) for safe concurrent access
- Ensure the database directory exists before connecting and enable WAL journaling and foreign keys on first connect
- Add close method to gracefully close the persistent connection
- Guard initialization and write operations with _operation_lock to ensure single-threaded schema setup
- Switch to ON CONFLICT UPSERT for token_counts updates and initialize token_time_series table
- Add typing for _db (Optional[aiosqlite.Connection]) and adjust imports accordingly

addition: Frontend button with total stats aggregation task and feedback span element to keep user informed and a small database footprint

											
										
										
											2025-12-02 12:18:23 +01:00
+								async def flush_remaining_buffers() -> None:
 								    """
 								    Flush any in-memory buffers to the database on shutdown.
 								    This is designed to be safely invoked during shutdown and should not raise.
 								    """
 								    try:
 								        flushed_entries = 0
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
+								        async with buffer_lock:
 								            if token_buffer:
 								                buffer_copy = {ep: dict(models) for ep, models in token_buffer.items()}
 								                flushed_entries += sum(len(v) for v in token_buffer.values())
 								                token_buffer.clear()
 								            else:
 								                buffer_copy = None
 								            if time_series_buffer:
 								                ts_copy = list(time_series_buffer)
 								                flushed_entries += len(time_series_buffer)
 								                time_series_buffer.clear()
 								            else:
 								                ts_copy = None
 								        # Perform DB operations outside the lock
 								        if buffer_copy:
 								            await db.update_batched_counts(buffer_copy)
 								        if ts_copy:
 								            await db.add_batched_time_series(ts_copy)
-												refactor: use a persistent WAL-enabled connection with async locks

- Introduce a lazily initialized, shared aiosqlite connection stored in self._db and two asyncio locks (_db_lock, _operation_lock) for safe concurrent access
- Ensure the database directory exists before connecting and enable WAL journaling and foreign keys on first connect
- Add close method to gracefully close the persistent connection
- Guard initialization and write operations with _operation_lock to ensure single-threaded schema setup
- Switch to ON CONFLICT UPSERT for token_counts updates and initialize token_time_series table
- Add typing for _db (Optional[aiosqlite.Connection]) and adjust imports accordingly

addition: Frontend button with total stats aggregation task and feedback span element to keep user informed and a small database footprint

											
										
										
											2025-12-02 12:18:23 +01:00
+								        if flushed_entries:
 								            print(f"[shutdown] Flushed {flushed_entries} in-memory entries to DB on shutdown.")
 								        else:
 								            print("[shutdown] No in-memory entries to flush on shutdown.")
 								    except Exception as e:
 								        # Do not raise during shutdown – log and continue teardown
 								        print(f"[shutdown] Error flushing remaining buffers: {e}")
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from backends.probe import fetch
-												feat: add timestamp index and improve cache concurrency

- Added index on token_time_series timestamp for faster queries
- Introduced cache locks to prevent race conditions

											
										
										
											2026-01-16 16:47:24 +01:00
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								async def increment_usage(endpoint: str, model: str) -> None:
 								    async with usage_lock:
 								        usage_counts[endpoint][model] += 1
-												fix: usage locks now release before the subscriber queue awaits

											
										
										
											2026-04-07 15:30:52 +02:00
+								        snapshot = _capture_snapshot()
 								    await _distribute_snapshot(snapshot)
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
 								async def decrement_usage(endpoint: str, model: str) -> None:
 								    """
 								    Subtract one from the usage counter of ``model`` on ``endpoint``.
 								    The counter never goes below zero, and a counter that is zero is removed
 								    from the per-endpoint mapping. The endpoint entry itself is left in place.
 								    The snapshot is captured while ``usage_lock`` is held and distributed
 								    after the lock has been released.
 								    """
 								    async with usage_lock:
 								        per_model = usage_counts[endpoint]
 								        remaining = per_model.get(model, 0)
 								        if remaining > 0:
 								            # Only positive counters are decremented (no negative counts).
 								            remaining -= 1
 								            per_model[model] = remaining
 								        if remaining == 0:
 								            # Drop zero entries so idle models do not linger in the table.
 								            per_model.pop(model, None)
 								        snapshot = _capture_snapshot()
 								    await _distribute_snapshot(snapshot)
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
-												refactor: make choose_endpoint use cache incrementer for atomic updates

											
										
										
											2026-03-03 14:57:37 +01:00
+								async def _make_chat_request(model: str, messages: list, tools=None, stream: bool = False, think: bool = False, format=None, options=None, keep_alive: str = None) -> ollama.ChatResponse:
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								    """
 								    Helper function to make a chat request to a specific endpoint.
 								    Handles endpoint selection, client creation, usage tracking, and request execution.
 								    """
-												refactor: make choose_endpoint use cache incrementer for atomic updates

											
										
										
											2026-03-03 14:57:37 +01:00
+								    endpoint, tracking_model = await choose_endpoint(model)  # selects and atomically reserves
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								    use_openai = is_openai_compatible(endpoint)
 								    if use_openai:
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								        if ":latest" in model:
 								            model = model.split(":latest")[0]
 								        if messages:
-												fix: conditional to_thread for the image_transform to relieve threadpool pressure

											
										
										
											2026-04-07 13:28:34 +02:00
+								            if any("images" in m for m in messages):
 								                messages = await asyncio.to_thread(transform_images_to_data_urls, messages)
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								            messages = transform_tool_calls_to_openai(messages)
-												fix: missing requirement

fix: strip assistant prefill when ollama -> openai translaton + openai guard

											
										
										
											2026-04-06 11:32:47 +02:00
+								            messages = _strip_assistant_prefill(messages)
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								        params = {
 								            "messages": messages,
 								            "model": model,
 								        }
 								        optional_params = {
 								            "tools": tools,
 								            "stream": stream,
 								            "stream_options": {"include_usage": True} if stream else None,
 								            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
 								            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
 								            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
 								            "seed": options.get("seed") if options and "seed" in options else None,
 								            "stop": options.get("stop") if options and "stop" in options else None,
 								            "top_p": options.get("top_p") if options and "top_p" in options else None,
 								            "temperature": options.get("temperature") if options and "temperature" in options else None,
 								            "response_format": {"type": "json_schema", "json_schema": format} if format is not None else None
 								        }
 								        params.update({k: v for k, v in optional_params.items() if v is not None})
-												feat: support localhost llama_server access via unix sockets

											
										
										
											2026-04-17 12:41:57 +02:00
+								        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								    else:
 								        client = ollama.AsyncClient(host=endpoint)
 								    try:
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								        if use_openai:
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								            start_ts = time.perf_counter()
-												feat: add an openai retry if request with image is send to a pure text model

											
										
										
											2026-03-12 10:06:18 +01:00
+								            try:
 								                response = await oclient.chat.completions.create(**params)
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								            except Exception as e:
 								                _e_str = str(e)
 								                print(f"[_make_chat_request] caught {type(e).__name__}: {_e_str[:200]}")
 								                if "exceed_context_size_error" in _e_str or "exceeds the available context size" in _e_str:
-												feat: add reactive auto context-shift in openai endpoints to prevent recover from out of context errors

											
										
										
											2026-03-12 10:15:52 +01:00
+								                    err_body = getattr(e, "body", {}) or {}
 								                    err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    n_ctx_limit = err_detail.get("n_ctx", 0)
 								                    actual_tokens = err_detail.get("n_prompt_tokens", 0)
-												fix: catch non-standard openai sdk error bodies for parsing

											
										
										
											2026-03-12 19:08:01 +01:00
+								                    if not n_ctx_limit:
 								                        _m = re.search(r"'n_ctx':\s*(\d+)", _e_str)
 								                        if _m:
 								                            n_ctx_limit = int(_m.group(1))
 								                        _m = re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
 								                        if _m:
 								                            actual_tokens = int(_m.group(1))
-												feat: add reactive auto context-shift in openai endpoints to prevent recover from out of context errors

											
										
										
											2026-03-12 10:15:52 +01:00
+								                    if not n_ctx_limit:
 								                        raise
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    msgs_to_trim = params.get("messages", [])
 								                    cal_target = _calibrated_trim_target(msgs_to_trim, n_ctx_limit, actual_tokens)
 								                    trimmed = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
 								                    print(f"[_make_chat_request] Context exceeded ({actual_tokens}/{n_ctx_limit} tokens, tiktoken_target={cal_target}), dropped {len(msgs_to_trim) - len(trimmed)} oldest message(s) and retrying")
 								                    try:
 								                        response = await oclient.chat.completions.create(**{**params, "messages": trimmed})
 								                    except Exception as e2:
 								                        if "exceed_context_size_error" in str(e2) or "exceeds the available context size" in str(e2):
 								                            print(f"[_make_chat_request] Context still exceeded after trimming, also stripping tools")
 								                            params_no_tools = {k: v for k, v in params.items() if k not in ("tools", "tool_choice")}
 								                            response = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed})
 								                        else:
 								                            raise
 								                elif "image input is not supported" in _e_str:
-												feat: add an openai retry if request with image is send to a pure text model

											
										
										
											2026-03-12 10:06:18 +01:00
+								                    print(f"[_make_chat_request] Model {model} doesn't support images, retrying with text-only messages")
 								                    params = {**params, "messages": _strip_images_from_messages(params.get("messages", []))}
 								                    response = await oclient.chat.completions.create(**params)
 								                else:
 								                    raise
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								            if stream:
 								                # For streaming, we need to collect all chunks
 								                chunks = []
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                tc_acc = {}  # accumulate tool-call deltas
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								                async for chunk in response:
 								                    chunks.append(chunk)
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                    _accumulate_openai_tc_delta(chunk, tc_acc)
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
+								                    prompt_tok = 0
 								                    comp_tok = 0
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								                    if chunk.usage is not None:
 								                        prompt_tok = chunk.usage.prompt_tokens or 0
 								                        comp_tok = chunk.usage.completion_tokens or 0
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
+								                    else:
 								                        llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
 								                        if llama_usage:
 								                            prompt_tok, comp_tok = llama_usage
 								                    if prompt_tok != 0 or comp_tok != 0:
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
+								                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								                # Convert to Ollama format
 								                if chunks:
 								                    response = rechunk.openai_chat_completion2ollama(chunks[-1], stream, start_ts)
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                    # Inject fully-accumulated tool calls into the final response
 								                    if tc_acc and response.message:
 								                        response.message.tool_calls = _build_ollama_tool_calls(tc_acc)
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								            else:
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
+								                prompt_tok = 0
 								                comp_tok = 0
 								                if response.usage is not None:
 								                    prompt_tok = response.usage.prompt_tokens or 0
 								                    comp_tok = response.usage.completion_tokens or 0
 								                else:
 								                    llama_usage = rechunk.extract_usage_from_llama_timings(response)
 								                    if llama_usage:
 								                        prompt_tok, comp_tok = llama_usage
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								                if prompt_tok != 0 or comp_tok != 0:
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
+								                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								                response = rechunk.openai_chat_completion2ollama(response, stream, start_ts)
 								        else:
 								            response = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=format, options=options, keep_alive=keep_alive)
 								            if stream:
 								                # For streaming, collect all chunks
 								                chunks = []
 								                async for chunk in response:
 								                    chunks.append(chunk)
 								                    prompt_tok = chunk.prompt_eval_count or 0
 								                    comp_tok = chunk.eval_count or 0
 								                    if prompt_tok != 0 or comp_tok != 0:
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
+								                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								                if chunks:
 								                    response = chunks[-1]
 								            else:
 								                prompt_tok = response.prompt_eval_count or 0
 								                comp_tok = response.eval_count or 0
 								                if prompt_tok != 0 or comp_tok != 0:
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
+								                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
 								        return response
 								    finally:
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
+								        await decrement_usage(endpoint, tracking_model)
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
 								def get_last_user_content(messages):
 								    """
 								    Return the 'content' value of the most recent message dict whose
 								    'role' is 'user', or None when the list holds no user message
 								    (or that message has no 'content' key).
 								    """
 								    # Walk from the end so the first hit is the newest user message.
 								    newest_user = next((m for m in reversed(messages) if m.get("role") == "user"), None)
 								    if newest_user is None:
 								        return None
 								    return newest_user.get("content")
 								async def _make_moe_requests(model: str, messages: list, tools=None, think: bool = False, format=None, options=None, keep_alive: str = None) -> ollama.ChatResponse:
 								    """
 								    Helper function to make MOE (Multiple Opinions Ensemble) requests.
 								    Generates 3 responses, 3 critiques, and returns the final selected response.
 								    Steps:
 								      1. Send the original ``messages`` three times concurrently.
 								      2. Build one critique prompt per response via ``enhance.moe`` and send
 								         those three prompts concurrently.
 								      3. Build the final prompt via ``enhance.moe_select_candidate`` and
 								         return the response to it.
 								    Every request is sent with ``temperature`` forced to 1. The caller's
 								    ``options`` dict is copied first, so it is never modified.
 								    Raises ``ValueError`` when ``messages`` contains no user query.
 								    """
 								    query = get_last_user_content(messages)
 								    if not query:
 								        raise ValueError("No user query found in messages")
 								    # Copy dict options before overriding the temperature so the caller's
 								    # request payload is left untouched.
 								    if options is None:
 								        options = {}
 								    elif isinstance(options, dict):
 								        options = dict(options)
 								    options["temperature"] = 1
 								    async def _ask(chat_messages):
 								        # choose_endpoint is called inside _make_chat_request and atomically
 								        # reserves a slot, so concurrent tasks see each other's load immediately.
 								        return await _make_chat_request(model, chat_messages, tools, stream=False, think=think, format=format, options=options, keep_alive=keep_alive)
 								    # Generate 3 responses
 								    response_tasks = [asyncio.create_task(_ask(messages)) for _ in range(3)]
 								    responses = await asyncio.gather(*response_tasks)
 								    moe_reqs = [enhance.moe(query, n, r.message.content) for n, r in enumerate(responses)]
 								    # Generate 3 critiques
 								    critique_tasks = [
 								        asyncio.create_task(_ask([{"role": "user", "content": moe_req}]))
 								        for moe_req in moe_reqs
 								    ]
 								    critiques = await asyncio.gather(*critique_tasks)
 								    # Select final response
 								    m = enhance.moe_select_candidate(query, critiques)
 								    # Generate final response
 								    return await _ask([{"role": "user", "content": m}])
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
-												refac: split into modules I

											
										
										
											2026-05-19 10:05:27 +02:00
+								from images import iso8601_ns, is_base64, resize_image_if_needed
-												new requirement for image preprocessing to downsize and convert to png for faster and safer transaction

											
										
										
											2025-09-24 11:46:38 +02:00
-												fix: missing requirement

fix: strip assistant prefill when ollama -> openai translation + openai guard

											
										
										
											2026-04-06 11:32:47 +02:00
+								def _strip_assistant_prefill(messages: list) -> list:
 								    """Remove a trailing assistant message used as prefill.
 								    OpenAI-compatible endpoints (including Claude) do not support prefill and
 								    will reject requests where the last message has role 'assistant'."""
 								    if messages and messages[-1].get("role") == "assistant":
 								        return messages[:-1]
 								    return messages
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
def transform_tool_calls_to_openai(message_list):
    """
    Ensure tool_calls in assistant messages conform to the OpenAI format:
    - Each tool call must have "type": "function"
    - Each tool call must have an "id"
    - arguments must be a JSON string, not a dict
    Also ensure tool-role messages have a tool_call_id.

    Messages are modified in place and the same list is returned.

    Tool-role messages without a ``tool_call_id`` are matched by tool name
    (``name`` or ``tool_name``) to the calls of the most recent assistant
    message that called that tool, in call order. This keeps several calls
    to the same tool within one assistant message paired with their results
    one-to-one. A missing or ``None`` ``tool_calls`` / ``function`` value is
    tolerated.
    """
    # tool name -> ids still waiting for a tool-role result, oldest first
    pending_ids = {}
    for msg in message_list:
        role = msg.get("role")
        if role == "assistant" and "tool_calls" in msg:
            issued = {}
            for tc in msg["tool_calls"] or []:
                if "type" not in tc:
                    tc["type"] = "function"
                if "id" not in tc:
                    tc["id"] = f"call_{secrets.token_hex(16)}"
                func = tc.get("function") or {}
                if isinstance(func.get("arguments"), dict):
                    func["arguments"] = orjson.dumps(func["arguments"]).decode("utf-8")
                # Remember the id for the following tool-role message
                name = func.get("name")
                if name:
                    issued.setdefault(name, []).append(tc["id"])
            # Calls from a newer assistant turn supersede stale ids of the same name.
            pending_ids.update(issued)
        elif role == "tool":
            if "tool_call_id" in msg:
                # An explicitly referenced id is consumed so it is not handed
                # out again to a later result of the same tool.
                for queue in pending_ids.values():
                    if msg["tool_call_id"] in queue:
                        queue.remove(msg["tool_call_id"])
                        break
                continue
            # Try to match by name from a preceding assistant tool_call
            name = msg.get("name") or msg.get("tool_name")
            queue = pending_ids.get(name) if name else None
            if queue:
                msg["tool_call_id"] = queue.pop(0)
    return message_list
-												poc: message translation with images

											
										
										
											2025-09-23 17:33:15 +02:00
def transform_images_to_data_urls(message_list):
    """Convert Ollama-style ``images`` lists into OpenAI ``image_url`` content parts.

    For every message carrying an ``images`` key the key is removed. When the
    value is a list, each entry must be a base64 string (checked with
    ``is_base64``); it is passed through ``resize_image_if_needed`` and, if
    that returns a value, embedded as a ``data:image/png;base64,...`` URL.

    The message's existing content is preserved: string content becomes a
    leading ``{"type": "text"}`` part and list content is kept as the leading
    parts, followed by the image parts. If no image part was produced, the
    content is left untouched.

    Messages are modified in place and the same list is returned.

    Raises:
        ValueError: if an image entry is not valid base64.
    """
    for message in message_list:
        if "images" not in message:
            continue
        images = message.pop("images")
        if not isinstance(images, list):
            continue
        image_parts = []
        for image in images:  # TODO: quality downsize if images are too big to fit into model context window size
            if not is_base64(image):
                raise ValueError("Image string is not a valid base64 encoded string.")
            resized_image = resize_image_if_needed(image)
            if resized_image:
                image_parts.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{resized_image}"
                    }
                })
        if not image_parts:
            # Nothing to attach: keep the original content instead of replacing
            # it with an empty parts list.
            continue
        existing = message.get("content")
        if isinstance(existing, list):
            leading_parts = list(existing)
        elif isinstance(existing, str) and existing:
            # Keep the user's prompt text alongside the images.
            leading_parts = [{"type": "text", "text": existing}]
        else:
            leading_parts = []
        message["content"] = leading_parts + image_parts
    return message_list
-												feat: add an openai retry if a request with an image is sent to a pure text model

											
										
										
											2026-03-12 10:06:18 +01:00
+								def _strip_images_from_messages(messages: list) -> list:
 								    """Remove image_url parts from message content, keeping only text."""
 								    result = []
 								    for msg in messages:
 								        content = msg.get("content")
 								        if isinstance(content, list):
 								            text_only = [p for p in content if p.get("type") != "image_url"]
 								            if len(text_only) == 1 and text_only[0].get("type") == "text":
 								                content = text_only[0]["text"]
 								            else:
 								                content = text_only
 								            result.append({**msg, "content": content})
 								        else:
 								            result.append(msg)
 								    return result
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								def _accumulate_openai_tc_delta(chunk, accumulator: dict) -> None:
 								    """Accumulate tool_call deltas from a single OpenAI streaming chunk.
 								    ``accumulator`` is a dict mapping tool-call *index* to
 								    ``{"id": str, "name": str, "arguments": str}`` where ``arguments``
 								    is the concatenation of all JSON fragments seen so far.
 								    """
 								    if not chunk.choices:
 								        return
 								    delta = chunk.choices[0].delta
 								    tc_deltas = getattr(delta, "tool_calls", None)
 								    if not tc_deltas:
 								        return
 								    for tc in tc_deltas:
 								        idx = tc.index
 								        if idx not in accumulator:
 								            accumulator[idx] = {
 								                "id": getattr(tc, "id", None) or f"call_{secrets.token_hex(16)}",
 								                "name": tc.function.name if tc.function else None,
 								                "arguments": "",
 								            }
 								        else:
 								            if getattr(tc, "id", None):
 								                accumulator[idx]["id"] = tc.id
 								            if tc.function and tc.function.name:
 								                accumulator[idx]["name"] = tc.function.name
 								        if tc.function and tc.function.arguments:
 								            accumulator[idx]["arguments"] += tc.function.arguments
 								def _build_ollama_tool_calls(accumulator: dict) -> list | None:
 								    """Convert accumulated tool-call data into Ollama-format tool_calls list."""
 								    if not accumulator:
 								        return None
 								    result = []
 								    for idx in sorted(accumulator.keys()):
 								        tc = accumulator[idx]
 								        try:
 								            args = orjson.loads(tc["arguments"]) if tc["arguments"] else {}
 								        except (orjson.JSONDecodeError, TypeError):
 								            args = {}
 								        result.append(ollama.Message.ToolCall(
 								            function=ollama.Message.ToolCall.Function(name=tc["name"], arguments=args)
 								        ))
 								    return result
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								def _convert_openai_logprobs(choice) -> list | None:
 								    """Convert OpenAI logprobs from a choice into Ollama Logprob objects."""
 								    lp = getattr(choice, "logprobs", None)
 								    if lp is None:
 								        return None
 								    content = getattr(lp, "content", None)
 								    if not content:
 								        return None
 								    result = []
 								    for entry in content:
 								        top = [
 								            TokenLogprob(token=alt.token, logprob=alt.logprob)
 								            for alt in (entry.top_logprobs or [])
 								        ]
 								        result.append(Logprob(
 								            token=entry.token,
 								            logprob=entry.logprob,
 								            top_logprobs=top or None,
 								        ))
 								    return result
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
+								class rechunk:
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								    def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse:
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								        now = time.perf_counter()
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								        if chunk.choices == [] and chunk.usage is not None:
 								            return ollama.ChatResponse(
 								                model=chunk.model,
 								                created_at=iso8601_ns(),
 								                done=True,
 								                done_reason='stop',
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								                total_duration=int((now - start_ts) * 1_000_000_000),
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                load_duration=100000,
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								                prompt_eval_count=int(chunk.usage.prompt_tokens),
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                prompt_eval_duration=int((now - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)),
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								                eval_count=int(chunk.usage.completion_tokens),
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								                eval_duration=int((now - start_ts) * 1_000_000_000),
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                message=ollama.Message(role="assistant", content=""),
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								                )
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								        with_thinking = chunk.choices[0] if chunk.choices[0] else None
-												adding stream == False options to ollama 2 openai translation in /api/chat

											
										
										
											2025-09-13 12:28:42 +02:00
+								        if stream == True:
-												feat: update reasoning handling

Updated reasoning content handling in router.py to check for both "reasoning_content" and "reasoning" attributes.

											
										
										
											2026-02-08 11:29:47 +01:00
+								            thinking = (getattr(with_thinking.delta, "reasoning_content", None) or getattr(with_thinking.delta, "reasoning", None)) if with_thinking else None
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								            role = chunk.choices[0].delta.role or "assistant"
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								            content = chunk.choices[0].delta.content or ''
-												adding stream == False options to ollama 2 openai translation in /api/chat

											
										
										
											2025-09-13 12:28:42 +02:00
+								        else:
-												feat: update reasoning handling

Updated reasoning content handling in router.py to check for both "reasoning_content" and "reasoning" attributes.

											
										
										
											2026-02-08 11:29:47 +01:00
+								            thinking = (getattr(with_thinking.message, "reasoning_content", None) or getattr(with_thinking.message, "reasoning", None)) if with_thinking else None
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								            role = chunk.choices[0].message.role or "assistant"
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								            content = chunk.choices[0].message.content or ''
-												refactor(router.py): correctly implement OpenAI tool_calls to Ollama format conversion

											
										
										
											2026-02-09 11:04:14 +01:00
+								        # Convert OpenAI tool_calls to Ollama format
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								        # In streaming mode, tool_calls arrive as partial deltas across multiple chunks
 								        # (name only in first delta, arguments as incremental JSON fragments).
 								        # Callers must accumulate deltas and inject the final result; skip here.
-												refactor(router.py): correctly implement OpenAI tool_calls to Ollama format conversion

											
										
										
											2026-02-09 11:04:14 +01:00
+								        ollama_tool_calls = None
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								        if not stream:
-												refactor(router.py): correctly implement OpenAI tool_calls to Ollama format conversion

											
										
										
											2026-02-09 11:04:14 +01:00
+								            raw_tool_calls = getattr(with_thinking.message, "tool_calls", None) if with_thinking else None
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								            if raw_tool_calls:
 								                ollama_tool_calls = []
 								                for tc in raw_tool_calls:
 								                    try:
 								                        args = orjson.loads(tc.function.arguments) if isinstance(tc.function.arguments, str) else (tc.function.arguments or {})
 								                    except (orjson.JSONDecodeError, TypeError):
 								                        args = {}
 								                    ollama_tool_calls.append(ollama.Message.ToolCall(
 								                        function=ollama.Message.ToolCall.Function(name=tc.function.name, arguments=args)
 								                    ))
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								        # Convert OpenAI logprobs to Ollama format
 								        ollama_logprobs = _convert_openai_logprobs(with_thinking) if with_thinking else None
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								        assistant_msg = ollama.Message(
 								            role=role,
 								            content=content,
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								            thinking=thinking,
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								            images=None,
 								            tool_name=None,
-												refactor(router.py): correctly implement OpenAI tool_calls to Ollama format conversion

											
										
										
											2026-02-09 11:04:14 +01:00
+								            tool_calls=ollama_tool_calls)
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								        rechunk = ollama.ChatResponse(
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								            model=chunk.model,
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								            created_at=iso8601_ns(),
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								            done=True if chunk.usage is not None else False,
 								            done_reason=chunk.choices[0].finish_reason, #if chunk.choices[0].finish_reason is not None else None,
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								            total_duration=int((now - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								            load_duration=100000,
-												fixing types and params

											
										
										
											2025-09-22 19:01:14 +02:00
+								            prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								            prompt_eval_duration=int((now - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
-												fixing types and params

											
										
										
											2025-09-22 19:01:14 +02:00
+								            eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								            eval_duration=int((now - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								            message=assistant_msg,
 								            logprobs=ollama_logprobs)
-												simplification in rechunk

											
										
										
											2025-09-13 12:38:13 +02:00
+								        return rechunk
-												adding fetch class and ollama client completions on openai endpoints

											
										
										
											2025-09-13 16:57:09 +02:00
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								    def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.GenerateResponse:
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								        now = time.perf_counter()
-												fixing model re-naming in /v1 endpoints and thinking in rechunk

											
										
										
											2025-09-17 11:40:48 +02:00
+								        with_thinking = chunk.choices[0] if chunk.choices[0] else None
 								        thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								        rechunk = ollama.GenerateResponse(
 								            model=chunk.model,
 								            created_at=iso8601_ns(),
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								            done=True if chunk.usage is not None else False,
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								            done_reason=chunk.choices[0].finish_reason,
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								            total_duration=int((now - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								            load_duration=10000,
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								            prompt_eval_count=int(chunk.usage.prompt_tokens) if chunk.usage is not None else 0,
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								            prompt_eval_duration=int((now - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None and chunk.usage.completion_tokens != 0 else 0,
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								            eval_count=int(chunk.usage.completion_tokens) if chunk.usage is not None else 0,
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								            eval_duration=int((now - start_ts) * 1_000_000_000) if chunk.usage is not None else 0,
-												fixing endpoint usage metrics

											
										
										
											2025-09-23 12:51:37 +02:00
+								            response=chunk.choices[0].text or '',
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								            thinking=thinking)
-												adding fetch class and ollama client completions on openai endpoints

											
										
										
											2025-09-13 16:57:09 +02:00
+								        return rechunk
-												adding ollama embeddings conversion calls to openai endpoint

											
										
										
											2025-09-15 11:47:55 +02:00
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								    def openai_embeddings2ollama(chunk: dict) -> ollama.EmbeddingsResponse:
-												fixing headers, using ollama.Responses in rechunk class, fixing reseverd words var usage, fixing embedding output, fixing model naming in frontend

											
										
										
											2025-09-21 16:20:36 +02:00
+								        rechunk = ollama.EmbeddingsResponse(embedding=chunk.data[0].embedding)
-												adding ollama embeddings conversion calls to openai endpoint

											
										
										
											2025-09-15 11:47:55 +02:00
+								        return rechunk
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								    def openai_embed2ollama(chunk: dict, model: str) -> ollama.EmbedResponse:
-												formatting, condensing rechunk

											
										
										
											2025-09-21 16:33:43 +02:00
+								        rechunk = ollama.EmbedResponse(
 								            model=model,
 								            created_at=iso8601_ns(),
 								            done=None,
 								            done_reason=None,
 								            total_duration=None,
 								            load_duration=None,
 								            prompt_eval_count=None,
 								            prompt_eval_duration=None,
 								            eval_count=None,
 								            eval_duration=None,
 								            embeddings=[chunk.data[0].embedding])
-												adding ollama embeddings conversion calls to openai endpoint

											
										
										
											2025-09-15 11:47:55 +02:00
+								        return rechunk
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
 								    def extract_usage_from_llama_timings(obj) -> tuple[int, int] | None:
 								        """Extract (prompt_tokens, completion_tokens) from llama-server's timings object.
 								        llama-server returns a ``timings`` dict instead of the standard OpenAI
 								        ``usage`` field::
 								            "timings": {
 								                "cache_n": 236,      // prompt tokens reused from cache
 								                "prompt_n": 1,       // prompt tokens processed
 								                "predicted_n": 35    // predicted (completion) tokens
 								            }
 								        prompt_tokens  = prompt_n + cache_n
 								        completion_tokens = predicted_n
 								        Returns ``(prompt_tokens, completion_tokens)`` or ``None`` when no
 								        timings are found.
 								        """
 								        timings = getattr(obj, "timings", None)
 								        if timings is None:
 								            return None
 								        if isinstance(timings, dict):
 								            prompt_n = timings.get("prompt_n", 0) or 0
 								            cache_n = timings.get("cache_n", 0) or 0
 								            predicted_n = timings.get("predicted_n", 0) or 0
 								            return (prompt_n + cache_n, predicted_n)
 								        return None
-												adding optional parameters in ollama to openai translation

											
										
										
											2025-09-22 14:04:19 +02:00
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								# ------------------------------------------------------------------
 								# SSE Helpers
 								# ------------------------------------------------------------------
-												fix: usage locks now release before the subscriber queue awaits

											
										
										
											2026-04-07 15:30:52 +02:00
+								def _capture_snapshot() -> str:
 								    """Capture current usage counts as a JSON string. Caller must hold at least one of usage_lock/token_usage_lock."""
 								    return orjson.dumps({
 								        "usage_counts": dict(usage_counts),
-												feat(router): implement in-flight request tracking to prevent cache stampede in high concurrency scenarios

Added in-flight request tracking mechanism to prevent cache stampede when multiple concurrent requests arrive for the same endpoint. This introduces new dictionaries to track ongoing requests and a lock to coordinate access. The available_models method was refactored to use an internal helper function and includes request coalescing logic to ensure only one HTTP request is made per endpoint when cache entries expire. The loaded_models method was also updated to use the new caching and coalescing pattern.

											
										
										
											2026-01-29 18:00:33 +01:00
+								        "token_usage_counts": dict(token_usage_counts)
 								    }, option=orjson.OPT_SORT_KEYS).decode("utf-8")
-												refactor: improve snapshot safety and usage tracking

Create atomic snapshots by deep copying usage data structures to prevent race conditions.
Protect concurrent reads of usage counts with explicit locking in endpoint selection.
Replace README screenshot with a video link.

											
										
										
											2026-01-26 17:18:57 +01:00
-												fix: usage locks now release before the subscriber queue awaits

											
										
										
											2026-04-07 15:30:52 +02:00
+								async def _distribute_snapshot(snapshot: str) -> None:
 								    """Push a pre-captured snapshot to all SSE subscribers. Must be called outside any usage lock."""
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								    async with _subscribers_lock:
 								        for q in _subscribers:
 								            if q.full():
-												improving queue logic for high load scenarios

											
										
										
											2025-09-19 16:38:48 +02:00
+								                try:
 								                    await q.get()
 								                except asyncio.QueueEmpty:
 								                    pass
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								            await q.put(snapshot)
-												improved SSE queue handling on shutdown

											
										
										
											2025-09-12 09:44:56 +02:00
+								async def close_all_sse_queues():
 								    for q in list(_subscribers):
 								        # sentinel value that the generator will recognise
 								        await q.put(None)
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								# ------------------------------------------------------------------
 								# Subscriber helpers
 								# ------------------------------------------------------------------
 								async def subscribe() -> asyncio.Queue:
 								    """Create a bounded queue (``maxsize=10``), register it in
 								    ``_subscribers`` under ``_subscribers_lock`` and return it.
 								    The returned queue will receive every snapshot.
 								    """
 								    queue: asyncio.Queue = asyncio.Queue(maxsize=10)
 								    async with _subscribers_lock:
 								        _subscribers.add(queue)
 								    return queue
 								async def unsubscribe(q: asyncio.Queue):
 								    """Remove ``q`` from ``_subscribers`` while holding ``_subscribers_lock``.
 								    Discarding a queue that is not registered is a no-op.
 								    """
 								    async with _subscribers_lock:
 								        _subscribers.discard(q)
 								# ------------------------------------------------------------------
 								# Convenience wrapper – returns the current snapshot (for the proxy)
 								# ------------------------------------------------------------------
 								async def get_usage_counts() -> Dict:
 								    """Return a shallow copy of ``usage_counts``.
 								    Only the top level is copied; the values are the same objects that the
 								    live mapping holds.
 								    """
 								    snapshot: Dict = {}
 								    snapshot.update(usage_counts)
 								    return snapshot
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								# 5. Endpoint selection logic (respecting the configurable limit)
 								# -------------------------------------------------------------
-												feat: enhance load balancing #23

											
										
										
											2026-04-22 17:27:34 +02:00
+								def get_max_connections(ep: str) -> int:
 								    """Per-endpoint max_concurrent_connections, falling back to the global value."""
 								    return config.endpoint_config.get(ep, {}).get(
 								        "max_concurrent_connections", config.max_concurrent_connections
 								    )
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								async def choose_endpoint(model: str, reserve: bool = True,
 								                          affinity_key: Optional[str] = None) -> tuple[str, str]:
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								    """
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    Determine which endpoint to use for the given model while respecting
 								    the `max_concurrent_connections` per endpoint‑model pair **and**
 								    ensuring that the chosen endpoint actually *advertises* the model.
 								    The selection algorithm:
 								    1️⃣  Query every endpoint for its advertised models (`/api/tags`).
 								    2️⃣  Build a list of endpoints that contain the requested model.
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								    2️⃣.5  If conversation affinity is enabled and the caller passes
 								        ``affinity_key``, prefer the endpoint that previously served the
 								        same conversation — but only when it still has the model loaded
 								        and a free slot. Otherwise fall through to the standard logic.
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    3️⃣  For those endpoints, find those that have the model loaded
-												randomize endpoint selection for bootstrapping ollamas

											
										
										
											2025-09-18 18:49:11 +02:00
+								        (`/api/ps`) *and* still have a free slot.
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    4️⃣  If none are both loaded and free, fall back to any endpoint
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								        from the filtered list that simply has a free slot and randomly
-												randomize endpoint selection for bootstrapping ollamas

											
										
										
											2025-09-18 18:49:11 +02:00
+								        select one.
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    5️⃣  If all are saturated, pick any endpoint from the filtered list
-												randomize endpoint selection for bootstrapping ollamas

											
										
										
											2025-09-18 18:49:11 +02:00
+								        (the request will queue on that endpoint).
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    6️⃣  If no endpoint advertises the model at all, raise an error.
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								    """
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    # 1️⃣  Gather advertised‑model sets for all endpoints concurrently
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								    #     Include both config.endpoints and config.llama_server_endpoints
 								    llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
 								    all_endpoints = config.endpoints + llama_eps_extra
 								    tag_tasks = [fetch.available_models(ep) for ep in config.endpoints if not is_openai_compatible(ep)]
 								    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in config.endpoints if is_openai_compatible(ep)]
 								    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in llama_eps_extra]
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    advertised_sets = await asyncio.gather(*tag_tasks)
 								    # 2️⃣  Filter endpoints that advertise the requested model
 								    candidate_endpoints = [
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								        ep for ep, models in zip(all_endpoints, advertised_sets)
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								        if model in models
 								    ]
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
-												feat: improve model version handling in endpoint selection

Add logic to only append ":latest" suffix to models without existing version suffixes, preventing duplicate version tags and ensuring correct endpoint selection for models following Ollama naming conventions.

											
										
										
											2025-12-14 17:58:45 +01:00
+								    # 6️⃣
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    if not candidate_endpoints:
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								        if ":latest" in model:  #ollama naming convention not applicable to openai/llama-server
-												fixing app logic and eventListeners in  frontend

											
										
										
											2025-10-30 09:06:21 +01:00
+								            model_without_latest = model.split(":latest")[0]
 								            candidate_endpoints = [
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								                ep for ep, models in zip(all_endpoints, advertised_sets)
 								                if model_without_latest in models and (is_ext_openai_endpoint(ep) or ep in config.llama_server_endpoints)
-												fixing app logic and eventListeners in  frontend

											
										
										
											2025-10-30 09:06:21 +01:00
+								            ]
 								        if not candidate_endpoints:
-												feat: improve model version handling in endpoint selection

Add logic to only append ":latest" suffix to models without existing version suffixes, preventing duplicate version tags and ensuring correct endpoint selection for models following Ollama naming conventions.

											
										
										
											2025-12-14 17:58:45 +01:00
+								            # Only add :latest suffix if model doesn't already have a version suffix
 								            if ":" not in model:
 								                model = model + ":latest"
-												comliance with ollama naming conventions and openai model['id']

											
										
										
											2025-09-15 17:39:15 +02:00
+								            candidate_endpoints = [
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								                ep for ep, models in zip(all_endpoints, advertised_sets)
-												comliance with ollama naming conventions and openai model['id']

											
										
										
											2025-09-15 17:39:15 +02:00
+								                if model in models
 								            ]
 								        if not candidate_endpoints:
 								            raise RuntimeError(
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								                f"None of the configured endpoints ({', '.join(all_endpoints)}) "
-												comliance with ollama naming conventions and openai model['id']

											
										
										
											2025-09-15 17:39:15 +02:00
+								                f"advertise the model '{model}'."
 								            )
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    # 3️⃣  Among the candidates, find those that have the model *loaded*
 								    #      (concurrently, but only for the filtered list)
-												adding fetch class and ollama client completions on openai endpoints

											
										
										
											2025-09-13 16:57:09 +02:00
+								    load_tasks = [fetch.loaded_models(ep) for ep in candidate_endpoints]
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
+								    loaded_sets = await asyncio.gather(*load_tasks)
-												refactor: improve snapshot safety and usage tracking

Create atomic snapshots by deep copying usage data structures to prevent race conditions.
Protect concurrent reads of usage counts with explicit locking in endpoint selection.
Replace README screenshot with a video link.

											
										
										
											2026-01-26 17:18:57 +01:00
-												fix:
-  _fetch_loaded_models_internal now writes _loaded_error_cache[endpoint] = time.time() on /api/ps or /v1/models failure, and clears the entry on success
- choose_endpoint now filters out candidates with a fresh (<300s) loaded-models error.
-  /health now probes both /api/version and /api/ps for Ollama endpoints
-  dashboard adaption

relates to #83

											
										
										
											2026-05-18 13:45:06 +02:00
+								    # 3️⃣.5  Exclude endpoints whose loaded-model probe has been failing
 								    # recently. Without this filter, an endpoint where `/api/ps` returns 5xx
 								    # would appear with an empty loaded set but pass through to the
 								    # free-slot fallback (step 4) — sending completion calls to an
 								    # unhealthy backend. See issue #83.
 								    async with _loaded_error_cache_lock:
 								        unhealthy = {
 								            ep for ep, ts in _loaded_error_cache.items()
 								            if _is_fresh(ts, 300)
 								        }
 								    if unhealthy:
 								        filtered = [
 								            (ep, models) for ep, models in zip(candidate_endpoints, loaded_sets)
 								            if ep not in unhealthy
 								        ]
 								        if filtered:
 								            candidate_endpoints = [ep for ep, _ in filtered]
 								            loaded_sets = [models for _, models in filtered]
 								        # If *every* candidate is unhealthy we still fall through with the
 								        # original list — refusing to route is worse than retrying a
 								        # possibly-recovered backend.
-												feat: completion errors on an endpoint:model key a caught, cached and rerouted (openai compatible endpoints)

											
										
										
											2026-05-18 18:14:28 +02:00
+								    # 3️⃣.6  Exclude (endpoint, model) pairs whose completion path has recently
 								    # failed with a backend connection error (e.g. llama-server in router mode
 								    # whose delegated worker for *this* model died). /v1/models keeps reporting
 								    # OK in that case, so the probe-level filter above cannot catch it.
 								    async with _completion_error_cache_lock:
 								        completion_broken = {
 								            ep for (ep, m), ts in _completion_error_cache.items()
 								            if m == model and _is_fresh(ts, _COMPLETION_ERROR_TTL)
 								        }
 								    if completion_broken:
 								        filtered = [
 								            (ep, models) for ep, models in zip(candidate_endpoints, loaded_sets)
 								            if ep not in completion_broken
 								        ]
 								        if filtered:
 								            candidate_endpoints = [ep for ep, _ in filtered]
 								            loaded_sets = [models for _, models in filtered]
 								        # Same fallback: if every candidate is broken for this model, fall
 								        # through and let the upstream retry — possibly the operator restarted
 								        # the dead worker.
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								    # Look up a possible affinity hint *before* taking usage_lock. The two
 								    # locks are never held together to avoid lock-ordering issues.
 								    affine_ep: Optional[str] = None
 								    if config.conversation_affinity and affinity_key:
 								        async with _affinity_lock:
 								            entry = _affinity_map.get(affinity_key)
 								            if entry is not None:
-												feat: visualization of conversation affinity in dashboard

											
										
										
											2026-05-13 13:38:37 +02:00
+								                ep, _stored_model, expires_at = entry
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								                if expires_at < time.monotonic():
 								                    _affinity_map.pop(affinity_key, None)
 								                else:
 								                    affine_ep = ep
-												refactor: make choose_endpoint use cache incrementer for atomic updates

											
										
										
											2026-03-03 14:57:37 +01:00
+								    # Protect all reads/writes of usage_counts with the lock so that selection
 								    # and reservation are atomic — concurrent callers see each other's pending load.
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								    async with usage_lock:
-												fix(router): use normalized model keys for endpoint selection

Refactor endpoint selection logic to consistently use tracking model keys (normalized via `get_tracking_model`) instead of raw model names, ensuring usage counts are accurately compared with how increment/decrement operations store them. This fixes inconsistent load balancing and model affinity behavior caused by mismatches between raw and tracked model identifiers.

											
										
										
											2026-02-19 17:32:54 +01:00
+								        # Helper: current usage for (endpoint, model) using the same normalized key
 								        # that increment_usage/decrement_usage store — raw model names differ from
 								        # tracking names for llama-server (HF prefix / quant suffix stripped).
 								        def tracking_usage(ep: str) -> int:
 								            return usage_counts.get(ep, {}).get(get_tracking_model(ep, model), 0)
-												refactor: improve snapshot safety and usage tracking

Create atomic snapshots by deep copying usage data structures to prevent race conditions.
Protect concurrent reads of usage counts with explicit locking in endpoint selection.
Replace README screenshot with a video link.

											
										
										
											2026-01-26 17:18:57 +01:00
-												feat: enhance load balancing #23

											
										
										
											2026-04-22 17:27:34 +02:00
+								        def utilization_ratio(ep: str) -> float:
 								            return tracking_usage(ep) / get_max_connections(ep)
 								        # Priority map: position in all_endpoints list (lower = higher priority)
 								        ep_priority = {ep: i for i, ep in enumerate(all_endpoints)}
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								        selected: Optional[str] = None
-												refactor: improve snapshot safety and usage tracking

Create atomic snapshots by deep copying usage data structures to prevent race conditions.
Protect concurrent reads of usage counts with explicit locking in endpoint selection.
Replace README screenshot with a video link.

											
										
										
											2026-01-26 17:18:57 +01:00
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								        # 2️⃣.5  Conversation affinity preference — only honour the hint when
 								        # the affine endpoint still advertises the model loaded *and* has a
 								        # free slot. Otherwise fall back to the standard algorithm.
 								        if affine_ep:
 								            ep_loaded = {
 								                ep: set(models)
 								                for ep, models in zip(candidate_endpoints, loaded_sets)
 								            }
 								            if (affine_ep in candidate_endpoints
 								                    and model in ep_loaded.get(affine_ep, set())
 								                    and tracking_usage(affine_ep) < get_max_connections(affine_ep)):
 								                selected = affine_ep
 								        if selected is None:
 								            # 3️⃣ Endpoints that have the model loaded *and* a free slot
 								            loaded_and_free = [
 								                ep for ep, models in zip(candidate_endpoints, loaded_sets)
 								                if model in models and tracking_usage(ep) < get_max_connections(ep)
-												refactor: make choose_endpoint use cache incrementer for atomic updates

											
										
										
											2026-03-03 14:57:37 +01:00
+								            ]
-												feat(router): implement in-flight request tracking to prevent cache stampede in high concurrency scenarios

Added in-flight request tracking mechanism to prevent cache stampede when multiple concurrent requests arrive for the same endpoint. This introduces new dictionaries to track ongoing requests and a lock to coordinate access. The available_models method was refactored to use an internal helper function and includes request coalescing logic to ensure only one HTTP request is made per endpoint when cache entries expire. The loaded_models method was also updated to use the new caching and coalescing pattern.

											
										
										
											2026-01-29 18:00:33 +01:00
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								            if loaded_and_free:
-												feat: enhance load balancing #23

											
										
										
											2026-04-22 17:27:34 +02:00
+								                if config.priority_routing:
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								                    # WRR: sort by config order first (stable), then by utilization ratio.
 								                    # Stable sort preserves priority for equal-ratio endpoints.
 								                    loaded_and_free.sort(key=lambda ep: ep_priority.get(ep, 999))
 								                    loaded_and_free.sort(key=utilization_ratio)
 								                    selected = loaded_and_free[0]
-												feat: enhance load balancing #23

											
										
										
											2026-04-22 17:27:34 +02:00
+								                else:
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								                    # Sort ascending for load balancing — all endpoints here already have the
 								                    # model loaded, so there is no model-switching cost to optimise for.
 								                    loaded_and_free.sort(key=tracking_usage)
 								                    # When all candidates are equally idle, randomise to avoid always picking
 								                    # the first entry in a stable sort.
 								                    if all(tracking_usage(ep) == 0 for ep in loaded_and_free):
 								                        selected = random.choice(loaded_and_free)
-												feat: enhance load balancing #23

											
										
										
											2026-04-22 17:27:34 +02:00
+								                    else:
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								                        selected = loaded_and_free[0]
-												refactor: make choose_endpoint use cache incrementer for atomic updates

											
										
										
											2026-03-03 14:57:37 +01:00
+								            else:
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								                # 4️⃣ Endpoints among the candidates that simply have a free slot
 								                endpoints_with_free_slot = [
 								                    ep for ep in candidate_endpoints
 								                    if tracking_usage(ep) < get_max_connections(ep)
 								                ]
 								                if endpoints_with_free_slot:
 								                    if config.priority_routing:
 								                        endpoints_with_free_slot.sort(key=lambda ep: ep_priority.get(ep, 999))
 								                        endpoints_with_free_slot.sort(key=utilization_ratio)
 								                        selected = endpoints_with_free_slot[0]
 								                    else:
 								                        # Sort by total endpoint load (ascending) to prefer idle endpoints.
 								                        endpoints_with_free_slot.sort(
 								                            key=lambda ep: sum(usage_counts.get(ep, {}).values())
 								                        )
 								                        if all(tracking_usage(ep) == 0 for ep in endpoints_with_free_slot):
 								                            selected = random.choice(endpoints_with_free_slot)
 								                        else:
 								                            selected = endpoints_with_free_slot[0]
-												feat: enhance load balancing #23

											
										
										
											2026-04-22 17:27:34 +02:00
+								                else:
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								                    # 5️⃣ All candidate endpoints are saturated – pick the least-busy one (will queue)
 								                    if config.priority_routing:
 								                        selected = min(
 								                            candidate_endpoints,
 								                            key=lambda ep: (utilization_ratio(ep), ep_priority.get(ep, 999)),
 								                        )
 								                    else:
 								                        selected = min(candidate_endpoints, key=tracking_usage)
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
-												refactor: make choose_endpoint use cache incrementer for atomic updates

											
										
										
											2026-03-03 14:57:37 +01:00
+								        tracking_model = get_tracking_model(selected, model)
-												fix: usage locks now release before the subscriber queue awaits

											
										
										
											2026-04-07 15:30:52 +02:00
+								        snapshot = None
-												refactor: make choose_endpoint use cache incrementer for atomic updates

											
										
										
											2026-03-03 14:57:37 +01:00
+								        if reserve:
 								            usage_counts[selected][tracking_model] += 1
-												fix: usage locks now release before the subscriber queue awaits

											
										
										
											2026-04-07 15:30:52 +02:00
+								            snapshot = _capture_snapshot()
 								    if snapshot is not None:
 								        await _distribute_snapshot(snapshot)
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								    # Record / refresh affinity *after* releasing usage_lock.
 								    if reserve and config.conversation_affinity and affinity_key:
 								        expires_at = time.monotonic() + config.conversation_affinity_ttl
 								        async with _affinity_lock:
-												feat: visualization of conversation affinity in dashboard

											
										
										
											2026-05-13 13:38:37 +02:00
+								            _affinity_map[affinity_key] = (selected, model, expires_at)
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								            if len(_affinity_map) > _AFFINITY_MAX_ENTRIES:
 								                now = time.monotonic()
-												feat: visualization of conversation affinity in dashboard

											
										
										
											2026-05-13 13:38:37 +02:00
+								                for k in [k for k, v in _affinity_map.items() if v[2] < now]:
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								                    _affinity_map.pop(k, None)
-												fix: usage locks now release before the subscriber queue awaits

											
										
										
											2026-04-07 15:30:52 +02:00
+								    return selected, tracking_model
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
 								# -------------------------------------------------------------
 								# 6. API route – Generate
 								# -------------------------------------------------------------
 								@app.post("/api/generate")
 								async def proxy(request: Request):
 								    """
 								    Proxy a generate request to Ollama and stream the response back to the client.

 								    The JSON body must carry a non-empty ``model`` and ``prompt``; the remaining
 								    Ollama generate fields are forwarded when present. ``{"nomyo": {"cache": true}}``
 								    opts the request into the LLM cache.

 								    Returns:
 								        StreamingResponse yielding newline-terminated JSON (one line per chunk
 								        when ``stream`` is true, a single line otherwise).

 								    Raises:
 								        HTTPException(400): invalid JSON, body is not a JSON object, a required
 								            field is missing, or ``options`` is not a JSON object.
 								    """
 								    # 1. Parse and validate request
 								    try:
 								        # orjson accepts bytes directly and reports invalid UTF-8 as a
 								        # JSONDecodeError, so a malformed body becomes a 400 instead of a 500.
 								        payload = orjson.loads(await request.body())
 								    except orjson.JSONDecodeError as e:
 								        error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
 								        raise HTTPException(status_code=400, detail=error_msg) from e
 								    if not isinstance(payload, dict):
 								        raise HTTPException(
 								            status_code=400, detail="Request body must be a JSON object"
 								        )

 								    model = payload.get("model")
 								    prompt = payload.get("prompt")
 								    suffix = payload.get("suffix")
 								    system = payload.get("system")
 								    template = payload.get("template")
 								    context = payload.get("context")
 								    stream = payload.get("stream")
 								    think = payload.get("think")
 								    raw = payload.get("raw")
 								    _format = payload.get("format")
 								    images = payload.get("images")
 								    options = payload.get("options")
 								    keep_alive = payload.get("keep_alive")
 								    # "nomyo" may be absent or null; only a dict can opt into caching.
 								    _nomyo = payload.get("nomyo")
 								    _cache_enabled = _nomyo.get("cache", False) if isinstance(_nomyo, dict) else False

 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    if not isinstance(model, str):
 								        raise HTTPException(
 								            status_code=400, detail="Field 'model' must be a string"
 								        )
 								    if not prompt:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'prompt'"
 								        )
 								    # Validate before an endpoint slot is reserved: options.get() is used below.
 								    if options is not None and not isinstance(options, dict):
 								        raise HTTPException(
 								            status_code=400, detail="`options` must be a JSON object"
 								        )

 								    # 2. Cache lookup — before endpoint selection so no slot is wasted on a hit
 								    _cache = get_llm_cache()
 								    _use_cache = _cache is not None and bool(_cache_enabled)
 								    # Normalise the model name used as cache key: `model` is rewritten below for
 								    # OpenAI-compatible endpoints, so lookup and store must not depend on it.
 								    _cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
 								    if _use_cache:
 								        _cached = await _cache.get_generate(_cache_model, prompt, system or "")
 								        if _cached is not None:
 								            async def _serve_cached_generate():
 								                yield _cached
 								            return StreamingResponse(_serve_cached_generate(), media_type="application/json")

 								    # 3. Endpoint selection and client setup
 								    _affinity_key = _conversation_fingerprint(model, None, prompt)
 								    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
 								    try:
 								        use_openai = is_openai_compatible(endpoint)
 								        if use_openai:
 								            # OpenAI-style backends do not know the ":latest" tag.
 								            model = model.split(":latest")[0]
 								            params = {
 								                "prompt": prompt,
 								                "model": model,
 								            }
 								            opts = options or {}
 								            optional_params = {
 								                "stream": stream,
 								                "max_tokens": opts.get("num_predict"),
 								                "frequency_penalty": opts.get("frequency_penalty"),
 								                "presence_penalty": opts.get("presence_penalty"),
 								                "seed": opts.get("seed"),
 								                "stop": opts.get("stop"),
 								                "top_p": opts.get("top_p"),
 								                "temperature": opts.get("temperature"),
 								                "suffix": suffix,
 								            }
 								            params.update({k: v for k, v in optional_params.items() if v is not None})
 								            oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
 								        else:
 								            client = ollama.AsyncClient(host=endpoint)
 								    except Exception:
 								        # The generator's `finally` never runs if setup fails, so release the
 								        # usage slot here (assumes choose_endpoint reserved one — it is paired
 								        # with decrement_usage in the generator below).
 								        await decrement_usage(endpoint, tracking_model)
 								        raise

 								    def _encode_json(obj) -> bytes:
 								        """Serialise a response object to UTF-8 JSON bytes (no trailing newline)."""
 								        if hasattr(obj, "model_dump_json"):
 								            return obj.model_dump_json().encode("utf-8")
 								        # orjson.dumps already returns bytes; it must not be encoded again.
 								        return orjson.dumps(obj)

 								    # 4. Async generator that streams data and decrements the counter
 								    async def stream_generate_response():
 								        try:
 								            if use_openai:
 								                start_ts = time.perf_counter()
 								                async_gen = await oclient.completions.create(**params)
 								            else:
 								                async_gen = await client.generate(
 								                    model=model,
 								                    prompt=prompt,
 								                    suffix=suffix,
 								                    system=system,
 								                    template=template,
 								                    context=context,
 								                    stream=stream,
 								                    think=think,
 								                    raw=raw,
 								                    format=_format,
 								                    images=images,
 								                    options=options,
 								                    keep_alive=keep_alive,
 								                )
 								            # Equality (not truthiness) is kept from the original request handling.
 								            if stream == True:
 								                content_parts: list[str] = []
 								                async for chunk in async_gen:
 								                    if use_openai:
 								                        chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
 								                    prompt_tok = chunk.prompt_eval_count or 0
 								                    comp_tok = chunk.eval_count or 0
 								                    if prompt_tok != 0 or comp_tok != 0:
 								                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
 								                    line_bytes = _encode_json(chunk) + b"\n"
 								                    # Accumulate and store cache on done chunk — before yield so it always runs
 								                    if _use_cache:
 								                        if getattr(chunk, "response", None):
 								                            content_parts.append(chunk.response)
 								                        if getattr(chunk, "done", False):
 								                            assembled = orjson.dumps({
 								                                k: v for k, v in {
 								                                    "model": getattr(chunk, "model", model),
 								                                    "response": "".join(content_parts),
 								                                    "done": True,
 								                                    "done_reason": getattr(chunk, "done_reason", "stop") or "stop",
 								                                    "prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
 								                                    "eval_count": getattr(chunk, "eval_count", None),
 								                                    "total_duration": getattr(chunk, "total_duration", None),
 								                                    "eval_duration": getattr(chunk, "eval_duration", None),
 								                                }.items() if v is not None
 								                            }) + b"\n"
 								                            try:
 								                                await _cache.set_generate(_cache_model, prompt, system or "", assembled)
 								                            except Exception as _ce:
 								                                print(f"[cache] set_generate (streaming) failed: {_ce}")
 								                    yield line_bytes
 								            else:
 								                if use_openai:
 								                    result = rechunk.openai_completion2ollama(async_gen, stream, start_ts)
 								                else:
 								                    result = async_gen
 								                # Record token usage for both backend types, as the streaming path does.
 								                prompt_tok = getattr(result, "prompt_eval_count", None) or 0
 								                comp_tok = getattr(result, "eval_count", None) or 0
 								                if prompt_tok != 0 or comp_tok != 0:
 								                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
 								                cache_bytes = _encode_json(result) + b"\n"
 								                # Cache non-streaming response — before yield, because code after
 								                # the yield is skipped when the client disconnects.
 								                if _use_cache:
 								                    try:
 								                        await _cache.set_generate(_cache_model, prompt, system or "", cache_bytes)
 								                    except Exception as _ce:
 								                        print(f"[cache] set_generate (non-streaming) failed: {_ce}")
 								                yield cache_bytes
 								        finally:
 								            # Ensure counter is decremented even if an exception occurs
 								            await decrement_usage(endpoint, tracking_model)

 								    # 5. Return a StreamingResponse backed by the generator
 								    return StreamingResponse(
 								        stream_generate_response(),
 								        media_type="application/json",
 								    )
 								# -------------------------------------------------------------
 								# 7. API route – Chat
 								# -------------------------------------------------------------
 								@app.post("/api/chat")
 								async def chat_proxy(request: Request):
 								    """
 								    Proxy a chat request to Ollama and stream the endpoint reply.
 								    """
 								    # 1. Parse and validate request
 								    try:
 								        body_bytes = await request.body()
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								        payload = orjson.loads(body_bytes.decode("utf-8"))
-												adding optional parameters in ollama to openai translation

											
										
										
											2025-09-22 14:04:19 +02:00
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								        model = payload.get("model")
 								        messages = payload.get("messages")
 								        tools = payload.get("tools")
 								        stream = payload.get("stream")
 								        think = payload.get("think")
-												fixing headers, using ollama.Responses in rechunk class, fixing reseverd words var usage, fixing embedding output, fixing model naming in frontend

											
										
										
											2025-09-21 16:20:36 +02:00
+								        _format = payload.get("format")
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								        keep_alive = payload.get("keep_alive")
-												fixing headers, using ollama.Responses in rechunk class, fixing reseverd words var usage, fixing embedding output, fixing model naming in frontend

											
										
										
											2025-09-21 16:20:36 +02:00
+								        options = payload.get("options")
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								        logprobs = payload.get("logprobs")
 								        top_logprobs = payload.get("top_logprobs")
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								        _cache_enabled = payload.get("nomyo", {}).get("cache", False)
-												poc: messsage translation with images

											
										
										
											2025-09-23 17:33:15 +02:00
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								        if not model:
 								            raise HTTPException(
 								                status_code=400, detail="Missing required field 'model'"
 								            )
 								        if not isinstance(messages, list):
 								            raise HTTPException(
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
+								                status_code=400, detail="Missing or invalid 'messages' field (must be a list)"
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								            )
-												fixing headers, using ollama.Responses in rechunk class, fixing reseverd words var usage, fixing embedding output, fixing model naming in frontend

											
										
										
											2025-09-21 16:20:36 +02:00
+								        if options is not None and not isinstance(options, dict):
 								            raise HTTPException(
 								                status_code=400, detail="`options` must be a JSON object"
 								            )
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								    except orjson.JSONDecodeError as e:
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								    # Cache lookup — before endpoint selection, always bypassed for MOE
 								    _is_moe = model.startswith("moe-")
 								    _cache = get_llm_cache()
 								    # Normalise model name for cache key: strip ":latest" suffix here so that
 								    # get_chat and set_chat use the same model string regardless of when the
 								    # strip happens further down (line ~1793 strips it for OpenAI endpoints).
 								    _cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
 								    # Snapshot original messages before any OpenAI-format transformation so that
 								    # get_chat and set_chat always use the same key regardless of backend type.
 								    _cache_messages = messages
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								    if _cache is not None and not _is_moe and _cache_enabled:
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								        _cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
 								        if _cached is not None:
 								            async def _serve_cached_chat():
 								                yield _cached
 								            return StreamingResponse(
 								                _serve_cached_chat(),
 								                media_type="application/x-ndjson" if stream else "application/json",
 								            )
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								    # 2. Endpoint logic
-												refactor: optimize token aggregation query and enhance chat proxy

- Refactored token aggregation query in db.py to use a single SQL query with SUM() instead of iterating through rows, improving performance
- Combined import statements in db.py and router.py to reduce lines of code
- Enhanced chat proxy in router.py to handle "moe-" prefixed models with multiple query execution and critique generation
- Added last_user_content() helper function to extract user content from messages
- Improved code readability and maintainability through these structural changes

											
										
										
											2025-12-13 11:58:49 +01:00
+								    if model.startswith("moe-"):
 								        model = model.split("moe-")[1]
 								        opt = True
 								    else:
 								        opt = False
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								    _affinity_key = _conversation_fingerprint(model, messages, None)
 								    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								    use_openai = is_openai_compatible(endpoint)
 								    if use_openai:
-												compliance with ollama naming conventions and openai model['id']

											
										
										
											2025-09-15 17:39:15 +02:00
+								        if ":latest" in model:
-												finalizing compliance tasks

											
										
										
											2025-09-15 19:12:00 +02:00
+								            model = model.split(":latest")
-												compliance with ollama naming conventions and openai model['id']

											
										
										
											2025-09-15 17:39:15 +02:00
+								            model = model[0]
-												poc: message translation with images

											
										
										
											2025-09-23 17:33:15 +02:00
+								        if messages:
-												fix: conditional to_thread for the image_transform to relieve threadpool pressure

											
										
										
											2026-04-07 13:28:34 +02:00
+								            if any("images" in m for m in messages):
 								                messages = await asyncio.to_thread(transform_images_to_data_urls, messages)
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								            messages = transform_tool_calls_to_openai(messages)
-												fix: missing requirement

fix: strip assistant prefill in ollama -> openai translation + openai guard

											
										
										
											2026-04-06 11:32:47 +02:00
+								            messages = _strip_assistant_prefill(messages)
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
+								        params = {
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								            "messages": messages,
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
+								            "model": model,
-												adding optional parameters in ollama to openai translation

											
										
										
											2025-09-22 14:04:19 +02:00
+								            }
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
+								        optional_params = {
 								            "tools": tools,
 								            "stream": stream,
-												record and display total token usage on ollama endpoints using ollama client

											
										
										
											2025-11-04 17:55:19 +01:00
+								            "stream_options": {"include_usage": True} if stream else None,
-												adding optional parameters in ollama to openai translation

											
										
										
											2025-09-22 14:04:19 +02:00
+								            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
 								            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
 								            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
 								            "seed": options.get("seed") if options and "seed" in options else None,
 								            "stop": options.get("stop") if options and "stop" in options else None,
 								            "top_p": options.get("top_p") if options and "top_p" in options else None,
 								            "temperature": options.get("temperature") if options and "temperature" in options else None,
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								            "logprobs": logprobs if logprobs is not None else (options.get("logprobs") if options and "logprobs" in options else None),
 								            "top_logprobs": top_logprobs if top_logprobs is not None else (options.get("top_logprobs") if options and "top_logprobs" in options else None),
-												adding optional parameters in ollama to openai translation

											
										
										
											2025-09-22 14:04:19 +02:00
+								            "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
 								            }
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
+								        params.update({k: v for k, v in optional_params.items() if v is not None})
-												feat: support localhost llama_server access via unix sockets

											
										
										
											2026-04-17 12:41:57 +02:00
+								        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
+								    else:
 								        client = ollama.AsyncClient(host=endpoint)
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								    # For OpenAI endpoints: make the API call in handler scope
 								    # (try/except inside async generators is unreliable with Starlette's streaming)
 								    start_ts = None
 								    async_gen = None
 								    if use_openai:
 								        start_ts = time.perf_counter()
-												fix: model name normalization for the context cache; preemptive context-shifting for smaller context windows that previously failed

											
										
										
											2026-03-12 16:08:01 +01:00
+								        # Proactive trim: only for small-ctx models we've already seen run out of space
 								        _lookup_model = _normalize_llama_model_name(model) if endpoint in config.llama_server_endpoints else model
 								        _known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
 								        if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
 								            _pre_target = int((_known_nctx - _known_nctx // 4) / 1.2)
 								            _pre_est = _count_message_tokens(params.get("messages", []))
 								            if _pre_est > _pre_target:
 								                _pre_msgs = params.get("messages", [])
 								                _pre_trimmed = _trim_messages_for_context(_pre_msgs, _known_nctx, target_tokens=_pre_target)
 								                _dropped = len(_pre_msgs) - len(_pre_trimmed)
 								                print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
 								                params = {**params, "messages": _pre_trimmed}
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								        try:
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								            async_gen = await oclient.chat.completions.create(**params)
 								        except Exception as e:
 								            _e_str = str(e)
 								            print(f"[chat_proxy] caught {type(e).__name__}: {_e_str[:200]}")
 								            if "exceed_context_size_error" in _e_str or "exceeds the available context size" in _e_str:
 								                err_body = getattr(e, "body", {}) or {}
 								                err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
 								                n_ctx_limit = err_detail.get("n_ctx", 0)
 								                actual_tokens = err_detail.get("n_prompt_tokens", 0)
-												fix: catch non-standard openai sdk error bodies for parsing

											
										
										
											2026-03-12 19:08:01 +01:00
+								                if not n_ctx_limit:
 								                    _m = re.search(r"'n_ctx':\s*(\d+)", _e_str)
 								                    if _m:
 								                        n_ctx_limit = int(_m.group(1))
 								                    _m = re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
 								                    if _m:
 								                        actual_tokens = int(_m.group(1))
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                if not n_ctx_limit:
 								                    await decrement_usage(endpoint, tracking_model)
 								                    raise
-												fix: model name normalization for the context cache; preemptive context-shifting for smaller context windows that previously failed

											
										
										
											2026-03-12 16:08:01 +01:00
+								                if n_ctx_limit <= _CTX_TRIM_SMALL_LIMIT:
 								                    _endpoint_nctx[(endpoint, model)] = n_ctx_limit
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                msgs_to_trim = params.get("messages", [])
 								                cal_target = _calibrated_trim_target(msgs_to_trim, n_ctx_limit, actual_tokens)
 								                trimmed = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
 								                print(f"[chat_proxy] Context exceeded ({actual_tokens}/{n_ctx_limit} tokens, tiktoken_target={cal_target}), dropped {len(msgs_to_trim) - len(trimmed)} oldest message(s) and retrying")
-												feat: add an openai retry if a request with an image is sent to a pure text model

											
										
										
											2026-03-12 10:06:18 +01:00
+								                try:
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    async_gen = await oclient.chat.completions.create(**{**params, "messages": trimmed})
 								                except Exception as e2:
 								                    _e2_str = str(e2)
 								                    if "exceed_context_size_error" in _e2_str or "exceeds the available context size" in _e2_str:
 								                        print(f"[chat_proxy] Context still exceeded after trimming messages, also stripping tools")
 								                        params_no_tools = {k: v for k, v in params.items() if k not in ("tools", "tool_choice")}
 								                        try:
 								                            async_gen = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed})
 								                        except Exception:
 								                            await decrement_usage(endpoint, tracking_model)
-												feat: add reactive auto context-shift in openai endpoints to recover from out-of-context errors

											
										
										
											2026-03-12 10:15:52 +01:00
+								                            raise
-												feat: add an openai retry if a request with an image is sent to a pure text model

											
										
										
											2026-03-12 10:06:18 +01:00
+								                    else:
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                        await decrement_usage(endpoint, tracking_model)
-												feat: add an openai retry if a request with an image is sent to a pure text model

											
										
										
											2026-03-12 10:06:18 +01:00
+								                        raise
-												feat: completion errors on an endpoint:model key are caught, cached and rerouted (openai compatible endpoints)

											
										
										
											2026-05-18 18:14:28 +02:00
+								            elif _is_backend_connection_error(e):
 								                print(f"[chat_proxy] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
 								                await _mark_backend_unhealthy(endpoint, model, _e_str)
 								                await decrement_usage(endpoint, tracking_model)
 								                raise
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								            elif "image input is not supported" in _e_str:
 								                print(f"[chat_proxy] Model {model} doesn't support images, retrying with text-only messages")
 								                try:
 								                    params = {**params, "messages": _strip_images_from_messages(params.get("messages", []))}
 								                    async_gen = await oclient.chat.completions.create(**params)
 								                except Exception:
 								                    await decrement_usage(endpoint, tracking_model)
 								                    raise
 								            else:
 								                await decrement_usage(endpoint, tracking_model)
 								                raise
 								    # 3. Async generator that streams chat data and decrements the counter
 								    async def stream_chat_response():
 								        try:
 								            # The chat method returns a generator of dicts (or GenerateResponse)
 								            if use_openai:
 								                _async_gen = async_gen  # established in handler scope above
-												starting an openai2ollama client translation layer with rechunking class

											
										
										
											2025-09-13 11:24:28 +02:00
+								            else:
-												refactor: optimize token aggregation query and enhance chat proxy

- Refactored token aggregation query in db.py to use a single SQL query with SUM() instead of iterating through rows, improving performance
- Combined import statements in db.py and router.py to reduce lines of code
- Enhanced chat proxy in router.py to handle "moe-" prefixed models with multiple query execution and critique generation
- Added last_user_content() helper function to extract user content from messages
- Improved code readability and maintainability through these structural changes

											
										
										
											2025-12-13 11:58:49 +01:00
+								                if opt == True:
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
+								                    # Use the dedicated MOE helper function
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    _async_gen = await _make_moe_requests(model, messages, tools, think, _format, options, keep_alive)
-												refactor: optimize token aggregation query and enhance chat proxy

- Refactored token aggregation query in db.py to use a single SQL query with SUM() instead of iterating through rows, improving performance
- Combined import statements in db.py and router.py to reduce lines of code
- Enhanced chat proxy in router.py to handle "moe-" prefixed models with multiple query execution and critique generation
- Added last_user_content() helper function to extract user content from messages
- Improved code readability and maintainability through these structural changes

											
										
										
											2025-12-13 11:58:49 +01:00
+								                else:
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    _async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								            if stream == True:
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                tc_acc = {}  # accumulate OpenAI tool-call deltas across chunks
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                content_parts: list[str] = []
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                async for chunk in _async_gen:
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								                    if use_openai:
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                        _accumulate_openai_tc_delta(chunk, tc_acc)
-												adding stream == False options to ollama 2 openai translation in /api/chat

											
										
										
											2025-09-13 12:28:42 +02:00
+								                        chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts)
-												feat: Add tool call normalization and streaming delta accumulation

Adds support for correctly handling tool calls in chat requests. Normalizes tool call data (ensuring IDs, types, and JSON arguments) in non-streaming mode and accumulates OpenAI-style deltas during streaming to build the final Ollama response.

											
										
										
											2026-02-10 20:21:46 +01:00
+								                        # Inject fully-accumulated tool calls only into the final chunk
 								                        if chunk.done and tc_acc and chunk.message:
 								                            chunk.message.tool_calls = _build_ollama_tool_calls(tc_acc)
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								                    # `chunk` can be a dict or a pydantic model – dump to JSON safely
-												record and display total token usage on ollama endpoints using ollama client

											
										
										
											2025-11-04 17:55:19 +01:00
+								                    prompt_tok = chunk.prompt_eval_count or 0
 								                    comp_tok   = chunk.eval_count or 0
-												fixing token_queue, prepping chart view

											
										
										
											2025-11-18 19:02:36 +01:00
+								                    if prompt_tok != 0 or comp_tok != 0:
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
+								                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								                    if hasattr(chunk, "model_dump_json"):
 								                        json_line = chunk.model_dump_json()
 								                    else:
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								                        json_line = orjson.dumps(chunk)
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                    # Accumulate and store cache on done chunk — before yield so it always runs
 								                    # Works for both Ollama-native and OpenAI-compatible backends; chunks are
 								                    # already converted to Ollama format by rechunk before this point.
-												fix: model name normalization for the context cache; preemptive context-shifting for smaller context windows that previously failed

											
										
										
											2026-03-12 16:08:01 +01:00
+								                    if getattr(chunk, "done", False):
 								                        # Detect context exhaustion mid-generation for small-ctx models
 								                        _dr = getattr(chunk, "done_reason", None)
 								                        # Only cache when no max_tokens limit was set — otherwise
 								                        # finish_reason=length might just mean max_tokens was hit,
 								                        # not that the context window was exhausted.
-												fix: params is never defined in ollama native backend

											
										
										
											2026-04-08 13:01:56 +02:00
+								                        _req_max_tok = (
 								                            params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
 								                            if use_openai else
 								                            (options.get("num_predict") if options else None)
 								                        )
-												fix: model name normalization for the context cache; preemptive context-shifting for smaller context windows that previously failed

											
										
										
											2026-03-12 16:08:01 +01:00
+								                        if _dr == "length" and not _req_max_tok:
 								                            _pt = getattr(chunk, "prompt_eval_count", 0) or 0
 								                            _ct = getattr(chunk, "eval_count", 0) or 0
 								                            _inferred_nctx = _pt + _ct
 								                            if 0 < _inferred_nctx <= _CTX_TRIM_SMALL_LIMIT:
 								                                _endpoint_nctx[(endpoint, model)] = _inferred_nctx
 								                                print(f"[ctx-cache] done_reason=length → cached n_ctx={_inferred_nctx} for ({endpoint},{model})", flush=True)
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								                    if _cache is not None and not _is_moe and _cache_enabled:
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                        if chunk.message and getattr(chunk.message, "content", None):
 								                            content_parts.append(chunk.message.content)
 								                        if getattr(chunk, "done", False):
 								                            assembled = orjson.dumps({
 								                                k: v for k, v in {
 								                                    "model": getattr(chunk, "model", model),
 								                                    "created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)),
 								                                    "message": {"role": "assistant", "content": "".join(content_parts)},
 								                                    "done": True,
 								                                    "done_reason": getattr(chunk, "done_reason", "stop") or "stop",
 								                                    "prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
 								                                    "eval_count": getattr(chunk, "eval_count", None),
 								                                    "total_duration": getattr(chunk, "total_duration", None),
 								                                    "eval_duration": getattr(chunk, "eval_duration", None),
 								                                }.items() if v is not None
 								                            }) + b"\n"
 								                            try:
 								                                await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled)
 								                            except Exception as _ce:
 								                                print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}")
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								                    yield json_line.encode("utf-8") + b"\n"
 								            else:
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								                if use_openai:
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    response = rechunk.openai_chat_completion2ollama(_async_gen, stream, start_ts)
-												formatting Response Objects in rechunk and fixing TypeErrors in /api/chat and /api/generate

											
										
										
											2025-09-22 09:30:27 +02:00
+								                    response = response.model_dump_json()
-												adding stream == False options to ollama 2 openai translation in /api/chat

											
										
										
											2025-09-13 12:28:42 +02:00
+								                else:
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    response = _async_gen.model_dump_json()
 								                    prompt_tok = _async_gen.prompt_eval_count or 0
 								                    comp_tok   = _async_gen.eval_count or 0
-												fixing token_queue, prepping chart view

											
										
										
											2025-11-18 19:02:36 +01:00
+								                    if prompt_tok != 0 or comp_tok != 0:
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
+								                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								                json_line = (
-												adding stream == False options to ollama 2 openai translation in /api/chat

											
										
										
											2025-09-13 12:28:42 +02:00
+								                    response
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    if hasattr(_async_gen, "model_dump_json")
 								                    else orjson.dumps(_async_gen)
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								                )
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                cache_bytes = json_line.encode("utf-8") + b"\n"
 								                yield cache_bytes
 								                # Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								                if _cache is not None and not _is_moe and _cache_enabled:
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                    try:
 								                        await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
 								                    except Exception as _ce:
 								                        print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}")
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
 								        finally:
 								            # Ensure counter is decremented even if an exception occurs
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
+								            await decrement_usage(endpoint, tracking_model)
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
 								    # 4. Return a StreamingResponse backed by the generator
-												fixing types and params

											
										
										
											2025-09-22 19:01:14 +02:00
+								    media_type = "application/x-ndjson" if stream else "application/json"
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								    return StreamingResponse(
 								        stream_chat_response(),
-												fixing types and params

											
										
										
											2025-09-22 19:01:14 +02:00
+								        media_type=media_type,
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								    )
 								# -------------------------------------------------------------
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								# 8. API route – Embedding - deprecated
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.post("/api/embeddings")
 								async def embedding_proxy(request: Request):
 								    """
 								    Proxy a (deprecated) embeddings request and reply with the embedding.
 								    The JSON body must be an object containing 'model' and 'prompt';
 								    'options' and 'keep_alive' are forwarded to native Ollama backends only.
 								    Raises:
 								        HTTPException: 400 when the body is not a JSON object or when
 								            'model' or 'prompt' is missing.
 								    """
 								    # 1. Parse and validate request
 								    try:
 								        # orjson accepts bytes directly and reports invalid UTF-8 as a
 								        # JSONDecodeError, so a malformed body becomes a 400 instead of an
 								        # unhandled UnicodeDecodeError.
 								        payload = orjson.loads(await request.body())
 								    except orjson.JSONDecodeError as e:
 								        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								    if not isinstance(payload, dict):
 								        raise HTTPException(
 								            status_code=400, detail="Request body must be a JSON object"
 								        )
 								    model = payload.get("model")
 								    prompt = payload.get("prompt")
 								    options = payload.get("options")
 								    keep_alive = payload.get("keep_alive")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    if not prompt:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'prompt'"
 								        )
 								    # 2. Endpoint logic
 								    endpoint, tracking_model = await choose_endpoint(model)
 								    use_openai = is_openai_compatible(endpoint)
 								    try:
 								        if use_openai:
 								            # The ':latest' tag is stripped before calling an OpenAI-compatible backend
 								            model = model.split(":latest")[0]
 								            client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
 								        else:
 								            client = ollama.AsyncClient(host=endpoint)
 								    except Exception:
 								        # The generator below never runs in this case, so perform the
 								        # decrement its finally block would otherwise have done.
 								        await decrement_usage(endpoint, tracking_model)
 								        raise
 								    # 3. Async generator that emits the embedding and decrements the counter
 								    async def stream_embedding_response():
 								        try:
 								            if use_openai:
 								                result = await client.embeddings.create(input=prompt, model=model)
 								                result = rechunk.openai_embeddings2ollama(result)
 								            else:
 								                result = await client.embeddings(model=model, prompt=prompt, options=options, keep_alive=keep_alive)
 								            if hasattr(result, "model_dump_json"):
 								                body = result.model_dump_json().encode("utf-8")
 								            else:
 								                # orjson.dumps already returns bytes; calling .encode() on it would fail
 								                body = orjson.dumps(result)
 								            yield body + b"\n"
 								        finally:
 								            # Ensure counter is decremented even if an exception occurs
 								            await decrement_usage(endpoint, tracking_model)
 								    # 4. Return a StreamingResponse backed by the generator
 								    return StreamingResponse(
 								        stream_embedding_response(),
 								        media_type="application/json",
 								    )
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 9. API route – Embed
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.post("/api/embed")
 								async def embed_proxy(request: Request):
 								    """
 								    Proxy an embed request and reply with embeddings.
 								    The JSON body must be an object containing 'model' and 'input';
 								    'truncate', 'options' and 'keep_alive' are forwarded to native Ollama
 								    backends only.
 								    Raises:
 								        HTTPException: 400 when the body is not a JSON object or when
 								            'model' or 'input' is missing.
 								    """
 								    # 1. Parse and validate request
 								    try:
 								        # orjson accepts bytes directly and reports invalid UTF-8 as a
 								        # JSONDecodeError, so a malformed body becomes a 400 instead of an
 								        # unhandled UnicodeDecodeError.
 								        payload = orjson.loads(await request.body())
 								    except orjson.JSONDecodeError as e:
 								        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								    if not isinstance(payload, dict):
 								        raise HTTPException(
 								            status_code=400, detail="Request body must be a JSON object"
 								        )
 								    model = payload.get("model")
 								    _input = payload.get("input")
 								    truncate = payload.get("truncate")
 								    options = payload.get("options")
 								    keep_alive = payload.get("keep_alive")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    if not _input:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'input'"
 								        )
 								    # 2. Endpoint logic
 								    endpoint, tracking_model = await choose_endpoint(model)
 								    use_openai = is_openai_compatible(endpoint)
 								    try:
 								        if use_openai:
 								            # The ':latest' tag is stripped before calling an OpenAI-compatible backend
 								            model = model.split(":latest")[0]
 								            client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
 								        else:
 								            client = ollama.AsyncClient(host=endpoint)
 								    except Exception:
 								        # The generator below never runs in this case, so perform the
 								        # decrement its finally block would otherwise have done.
 								        await decrement_usage(endpoint, tracking_model)
 								        raise
 								    # 3. Async generator that emits the embed data and decrements the counter
 								    async def stream_embedding_response():
 								        try:
 								            if use_openai:
 								                result = await client.embeddings.create(input=_input, model=model)
 								                result = rechunk.openai_embed2ollama(result, model)
 								            else:
 								                result = await client.embed(model=model, input=_input, truncate=truncate, options=options, keep_alive=keep_alive)
 								            if hasattr(result, "model_dump_json"):
 								                body = result.model_dump_json().encode("utf-8")
 								            else:
 								                # orjson.dumps already returns bytes; calling .encode() on it would fail
 								                body = orjson.dumps(result)
 								            yield body + b"\n"
 								        finally:
 								            # Ensure counter is decremented even if an exception occurs
 								            await decrement_usage(endpoint, tracking_model)
 								    # 4. Return a StreamingResponse backed by the generator
 								    return StreamingResponse(
 								        stream_embedding_response(),
 								        media_type="application/json",
 								    )
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 10. API route – Create
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.post("/api/create")
 								async def create_proxy(request: Request):
 								    """
 								    Proxy a create request to all configured endpoints and reply with deduplicated status.
 								    The JSON body must be an object containing 'model' and at least one of
 								    'from' or 'files'; the remaining recognised keys are forwarded unchanged.
 								    Raises:
 								        HTTPException: 400 when the body is not a JSON object, when 'model'
 								            is missing, or when neither 'from' nor 'files' is given.
 								    """
 								    try:
 								        # orjson accepts bytes directly and reports invalid UTF-8 as a
 								        # JSONDecodeError, so a malformed body becomes a 400 instead of an
 								        # unhandled UnicodeDecodeError.
 								        payload = orjson.loads(await request.body())
 								    except orjson.JSONDecodeError as e:
 								        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								    if not isinstance(payload, dict):
 								        raise HTTPException(
 								            status_code=400, detail="Request body must be a JSON object"
 								        )
 								    model = payload.get("model")
 								    from_ = payload.get("from")
 								    files = payload.get("files")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    if not from_ and not files:
 								        raise HTTPException(
 								            status_code=400, detail="You need to provide either from_ or files parameter!"
 								        )
 								    # Collected as keyword arguments so no local has to shadow the builtin `license`
 								    create_kwargs = {
 								        "model": model,
 								        "quantize": payload.get("quantize"),
 								        "from_": from_,
 								        "files": files,
 								        "adapters": payload.get("adapters"),
 								        "template": payload.get("template"),
 								        "license": payload.get("license"),
 								        "system": payload.get("system"),
 								        "parameters": payload.get("parameters"),
 								        "messages": payload.get("messages"),
 								        "stream": False,
 								    }
 								    status_lists = []
 								    for endpoint in config.endpoints:
 								        client = ollama.AsyncClient(host=endpoint)
 								        status_lists.append(await client.create(**create_kwargs))
 								    # Each response is iterated item by item (presumably (field, value)
 								    # pairs - TODO confirm); dict.fromkeys drops duplicates while keeping
 								    # first-seen order, and the surviving pairs form the reply.
 								    combined_status = [item for status_list in status_lists for item in status_list]
 								    final_status = list(dict.fromkeys(combined_status))
 								    return dict(final_status)
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 11. API route – Show
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.post("/api/show")
 								async def show_proxy(request: Request, model: Optional[str] = None):
 								    """
 								    Proxy a model show request to Ollama and reply with ShowResponse.
 								    The model name is taken from the `model` argument when given,
 								    otherwise from the 'model' key of the JSON body.
 								    Raises:
 								        HTTPException: 400 when the body is needed but is not a JSON
 								            object, or when no model name could be determined.
 								    """
 								    # 1. Parse and validate request
 								    if not model:
 								        try:
 								            # orjson accepts bytes directly and reports invalid UTF-8 as a
 								            # JSONDecodeError, so a malformed body becomes a 400 instead of
 								            # an unhandled UnicodeDecodeError.
 								            payload = orjson.loads(await request.body())
 								        except orjson.JSONDecodeError as e:
 								            raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								        if not isinstance(payload, dict):
 								            raise HTTPException(
 								                status_code=400, detail="Request body must be a JSON object"
 								            )
 								        model = payload.get("model")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    # 2. Endpoint logic
 								    endpoint, _ = await choose_endpoint(model, reserve=False)
 								    client = ollama.AsyncClient(host=endpoint)
 								    # 3. Proxy a simple show request and return the ShowResponse
 								    return await client.show(model=model)
-												fixing token_queue, prepping chart view

											
										
										
											2025-11-18 19:02:36 +01:00
+								# -------------------------------------------------------------
-												fixing total stats model, button, labels and code clean up

											
										
										
											2025-11-28 14:59:29 +01:00
+								@app.get("/api/token_counts")
 								async def token_counts_proxy():
 								    breakdown = []
 								    total = 0
 								    async for entry in db.load_token_counts():
 								        total += entry['total_tokens']
 								        breakdown.append({
 								            "endpoint": entry["endpoint"],
 								            "model": entry["model"],
 								            "input_tokens": entry["input_tokens"],
 								            "output_tokens": entry["output_tokens"],
 								            "total_tokens": entry["total_tokens"],
 								        })
 								    return {"total_tokens": total, "breakdown": breakdown}
-												refactor: use a persistent WAL-enabled connection with async locks

- Introduce a lazily initialized, shared aiosqlite connection stored in self._db and two asyncio locks (_db_lock, _operation_lock) for safe concurrent access
- Ensure the database directory exists before connecting and enable WAL journaling and foreign keys on first connect
- Add close method to gracefully close the persistent connection
- Guard initialization and write operations with _operation_lock to ensure single-threaded schema setup
- Switch to ON CONFLICT UPSERT for token_counts updates and initialize token_time_series table
- Add typing for _db (Optional[aiosqlite.Connection]) and adjust imports accordingly

addition: Frontend button with total stats aggregation task and feedback span element to keep user informed and a small database footprint

											
										
										
											2025-12-02 12:18:23 +01:00
+								@app.post("/api/aggregate_time_series_days")
 								async def aggregate_time_series_days_proxy(request: Request):
 								    """
 								    Aggregate time_series entries older than days into daily aggregates by endpoint/model/date.
 								    """
 								    try:
 								        body_bytes = await request.body()
 								        if not body_bytes:
 								            days = 30
 								            trim_old = False
 								        else:
 								            payload = orjson.loads(body_bytes.decode("utf-8"))
 								            days = int(payload.get("days", 30))
 								            trim_old = bool(payload.get("trim_old", False))
 								    except Exception:
 								        days = 30
 								        trim_old = False
 								    aggregated = await db.aggregate_time_series_older_than(days, trim_old=trim_old)
 								    return {"status": "ok", "days": days, "trim_old": trim_old, "aggregated_groups": aggregated}
-												fixing token_queue, prepping chart view

											
										
										
											2025-11-18 19:02:36 +01:00
+								# 12. API route – Stats
 								# -------------------------------------------------------------
 								@app.post("/api/stats")
 								async def stats_proxy(request: Request, model: Optional[str] = None):
 								    """
 								    Return token usage statistics for a specific model.
 								    The model name is taken from the `model` argument when given,
 								    otherwise from the 'model' key of the JSON body.
 								    Raises:
 								        HTTPException: 400 when the body is needed but is not a JSON
 								            object or no model name could be determined; 404 when no
 								            token data exists for the model.
 								    """
 								    if not model:
 								        try:
 								            # orjson accepts bytes directly and reports invalid UTF-8 as a
 								            # JSONDecodeError, so a malformed body becomes a 400 instead of
 								            # an unhandled UnicodeDecodeError.
 								            payload = orjson.loads(await request.body())
 								        except orjson.JSONDecodeError as e:
 								            raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								        if not isinstance(payload, dict):
 								            raise HTTPException(
 								                status_code=400, detail="Request body must be a JSON object"
 								            )
 								        model = payload.get("model")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    # Get token counts from database
 								    token_data = await db.get_token_counts_for_model(model)
 								    if not token_data:
 								        raise HTTPException(
 								            status_code=404, detail="No token data found for this model"
 								        )
 								    time_series = [
 								        entry async for entry in db.get_time_series_for_model(model)
 								    ]
 								    endpoint_distribution = await db.get_endpoint_distribution_for_model(model)
 								    return {
 								        'model': model,
 								        'input_tokens': token_data['input_tokens'],
 								        'output_tokens': token_data['output_tokens'],
 								        'total_tokens': token_data['total_tokens'],
 								        'time_series': time_series,
 								        'endpoint_distribution': endpoint_distribution,
 								    }
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 12. API route – Copy
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.post("/api/copy")
 								async def copy_proxy(request: Request, source: Optional[str] = None, destination: Optional[str] = None):
 								    """
 								    Proxy a model copy request to each Ollama endpoint and reply with Status Code.
 								    `source` and `destination` come from the arguments when either is
 								    given, otherwise from the 'source' / 'destination' keys of the JSON body.
 								    Endpoints whose URL contains "/v1" are skipped.
 								    Returns:
 								        Response with status 404 if any endpoint answered 404, else 200.
 								    Raises:
 								        HTTPException: 400 when the body is needed but is not a JSON
 								            object, or when 'source' or 'destination' is missing.
 								    """
 								    # 1. Parse and validate request
 								    if not source and not destination:
 								        try:
 								            # orjson accepts bytes directly and reports invalid UTF-8 as a
 								            # JSONDecodeError, so a malformed body becomes a 400 instead of
 								            # an unhandled UnicodeDecodeError.
 								            payload = orjson.loads(await request.body())
 								        except orjson.JSONDecodeError as e:
 								            raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								        if not isinstance(payload, dict):
 								            raise HTTPException(
 								                status_code=400, detail="Request body must be a JSON object"
 								            )
 								        src = payload.get("source")
 								        dst = payload.get("destination")
 								    else:
 								        src = source
 								        dst = destination
 								    if not src:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'source'"
 								        )
 								    if not dst:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'destination'"
 								        )
 								    # 2. Iterate over all endpoints to copy the model on each endpoint
 								    status_list = []
 								    for endpoint in config.endpoints:
 								        if "/v1" in endpoint:
 								            continue
 								        client = ollama.AsyncClient(host=endpoint)
 								        try:
 								            # 3. Proxy a simple copy request
 								            copy_result = await client.copy(source=src, destination=dst)
 								        except ollama.ResponseError as exc:
 								            # NOTE(review): the ollama client is understood to raise on
 								            # HTTP errors rather than return a numeric status - confirm.
 								            # Record a 404 so the remaining endpoints are still tried;
 								            # every other failure propagates as before.
 								            if exc.status_code != 404:
 								                raise
 								            status_list.append(404)
 								        else:
 								            status_list.append(copy_result.status)
 								    # 4. Return with 200 OK if all went well, 404 if a single endpoint failed
 								    return Response(status_code=404 if 404 in status_list else 200)
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 13. API route – Delete
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.delete("/api/delete")
 								async def delete_proxy(request: Request, model: Optional[str] = None):
 								    """
 								    Proxy a model delete request to each Ollama endpoint and reply with Status Code.

 								    The model name is taken from the ``model`` query parameter or, when that
 								    is not supplied, from the ``model`` field of the JSON request body.

 								    Returns:
 								        An empty ``Response`` with status 404 if at least one endpoint
 								        answered 404 for the model, otherwise 200.

 								    Raises:
 								        HTTPException: 400 if the body is not a valid JSON object or if
 								            no model name was supplied.
 								        ollama.ResponseError: for any endpoint failure other than 404.
 								    """
 								    # 1. Parse and validate request
 								    if not model:
 								        body_bytes = await request.body()
 								        try:
 								            # orjson parses bytes directly, so malformed UTF-8 is reported
 								            # as a JSONDecodeError (-> 400) instead of an unhandled
 								            # UnicodeDecodeError.
 								            payload = orjson.loads(body_bytes)
 								        except orjson.JSONDecodeError as e:
 								            raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								        if not isinstance(payload, dict):
 								            # Valid JSON that is not an object (e.g. a list) has no .get()
 								            raise HTTPException(
 								                status_code=400, detail="Request body must be a JSON object"
 								            )
 								        model = payload.get("model")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    # 2. Iterate over all endpoints to delete the model on each endpoint
 								    status_list = []
 								    for endpoint in config.endpoints:
 								        # Endpoints whose URL contains "/v1" are not addressed through the
 								        # Ollama client here (same filter as the other model-management routes).
 								        if "/v1" in endpoint:
 								            continue
 								        client = ollama.AsyncClient(host=endpoint)
 								        # 3. Proxy a simple delete request
 								        try:
 								            result = await client.delete(model=model)
 								        except ollama.ResponseError as e:
 								            # The client raises on HTTP errors rather than returning them.
 								            # Record a 404 so the remaining endpoints are still processed;
 								            # anything else keeps propagating as before.
 								            if e.status_code != 404:
 								                raise
 								            status_list.append(404)
 								        else:
 								            status_list.append(result.status)
 								    # 4. Return 200 OK; if a single endpoint fails with 404, respond with 404
 								    return Response(status_code=404 if 404 in status_list else 200)
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 14. API route – Pull
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.post("/api/pull")
 								async def pull_proxy(request: Request, model: Optional[str] = None):
 								    """
 								    Proxy a pull request to all Ollama endpoints and report status back.

 								    The model name is taken from the ``model`` query parameter or, when that
 								    is not supplied, from the JSON request body, which may also carry an
 								    ``insecure`` flag that is forwarded to the client.

 								    Returns:
 								        A dict built from the deduplicated items of all endpoint replies.

 								    Raises:
 								        HTTPException: 400 if the body is not a valid JSON object or if
 								            no model name was supplied.
 								    """
 								    # 1. Parse and validate request
 								    insecure = None
 								    if not model:
 								        body_bytes = await request.body()
 								        try:
 								            # orjson parses bytes directly, so malformed UTF-8 is reported
 								            # as a JSONDecodeError (-> 400) instead of an unhandled
 								            # UnicodeDecodeError.
 								            payload = orjson.loads(body_bytes)
 								        except orjson.JSONDecodeError as e:
 								            raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								        if not isinstance(payload, dict):
 								            # Valid JSON that is not an object (e.g. a list) has no .get()
 								            raise HTTPException(
 								                status_code=400, detail="Request body must be a JSON object"
 								            )
 								        model = payload.get("model")
 								        insecure = payload.get("insecure")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    # 2. Iterate over all endpoints to pull the model
 								    status_list = []
 								    for endpoint in config.endpoints:
 								        if "/v1" not in endpoint:
 								            client = ollama.AsyncClient(host=endpoint)
 								            # 3. Proxy a simple pull request
 								            pull = await client.pull(model=model, insecure=insecure, stream=False)
 								            status_list.append(pull)
 								    # Flatten every reply into one list of its items.
 								    # NOTE(review): presumably iterating a reply yields (field, value)
 								    # pairs - dict() below requires 2-item entries; confirm against the client.
 								    combined_status = []
 								    for status in status_list:
 								        combined_status.extend(status)
 								    # 4. Report back a deduplicated status message
 								    final_status = list(dict.fromkeys(combined_status))
 								    return dict(final_status)
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 15. API route – Push
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.post("/api/push")
 								async def push_proxy(request: Request):
 								    """
 								    Proxy a push request to Ollama and respond the deduplicated Ollama endpoint replies.

 								    The JSON request body must contain ``model`` and may contain ``insecure``,
 								    which is forwarded to the client.

 								    Returns:
 								        A dict built from the deduplicated items of all endpoint replies.

 								    Raises:
 								        HTTPException: 400 if the body is not a valid JSON object or if
 								            the ``model`` field is missing.
 								    """
 								    # 1. Parse and validate request
 								    body_bytes = await request.body()
 								    try:
 								        # orjson parses bytes directly, so malformed UTF-8 is reported as a
 								        # JSONDecodeError (-> 400) instead of an unhandled UnicodeDecodeError.
 								        payload = orjson.loads(body_bytes)
 								    except orjson.JSONDecodeError as e:
 								        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								    if not isinstance(payload, dict):
 								        # Valid JSON that is not an object (e.g. a list) has no .get()
 								        raise HTTPException(
 								            status_code=400, detail="Request body must be a JSON object"
 								        )
 								    model = payload.get("model")
 								    insecure = payload.get("insecure")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    # 2. Iterate over all endpoints
 								    status_list = []
 								    for endpoint in config.endpoints:
 								        # Skip endpoints whose URL contains "/v1", matching the copy, delete
 								        # and pull routes: those are not addressed through the Ollama client.
 								        if "/v1" in endpoint:
 								            continue
 								        client = ollama.AsyncClient(host=endpoint)
 								        # 3. Proxy a simple push request
 								        push = await client.push(model=model, insecure=insecure, stream=False)
 								        status_list.append(push)
 								    # Flatten every reply into one list of its items.
 								    # NOTE(review): presumably iterating a reply yields (field, value)
 								    # pairs - dict() below requires 2-item entries; confirm against the client.
 								    combined_status = []
 								    for status in status_list:
 								        combined_status.extend(status)
 								    # 4. Report a deduplicated status
 								    final_status = list(dict.fromkeys(combined_status))
 								    return dict(final_status)
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 16. API route – Version
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.get("/api/version")
 								async def version_proxy(request: Request):
 								    """
 								    Proxy a version request to Ollama and reply lowest version of all endpoints.

 								    Only endpoints whose URL does not contain "/v1" are queried.

 								    Returns:
 								        JSONResponse ``{"version": <lowest version string>}`` with status 200.

 								    Raises:
 								        HTTPException: 503 if no endpoint returned a usable version string.
 								    """
 								    # 1. Query all endpoints for version
 								    tasks = [
 								        fetch.endpoint_details(ep, "/api/version", "version")
 								        for ep in config.endpoints
 								        if "/v1" not in ep
 								    ]
 								    all_versions_raw = await asyncio.gather(*tasks)
 								    # Filter out non-string values (e.g., empty lists from failed/timeout responses)
 								    all_versions = [v for v in all_versions_raw if isinstance(v, str) and v]
 								    if not all_versions:
 								        raise HTTPException(status_code=503, detail="No valid version response from any endpoint")

 								    def version_key(v):
 								        # Compare on the leading digits of each dotted component so that
 								        # suffixed versions such as "0.5.0-rc1" sort as (0, 5, 0) instead
 								        # of raising ValueError; a component without digits counts as 0.
 								        parts = []
 								        for piece in v.split('.'):
 								            digits = re.match(r"\d+", piece)
 								            parts.append(int(digits.group()) if digits else 0)
 								        return tuple(parts)

 								    # 2. Return a JSONResponse with the min Version of all endpoints to maintain compatibility
 								    return JSONResponse(
 								        content={"version": str(min(all_versions, key=version_key))},
 								        status_code=200,
 								    )
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 17. API route – tags
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.get("/api/tags")
 								async def tags_proxy(request: Request):
 								    """
 								    Proxy a tags request to Ollama endpoints and reply with a unique list of all models.

 								    Endpoints without "/v1" in their URL are queried via /api/tags; "/v1"
 								    endpoints and llama-server endpoints not already listed in
 								    ``config.endpoints`` are queried via /models. Every entry is relabelled
 								    so that it carries ``model``, ``name`` and ``id`` keys.

 								    Returns:
 								        JSONResponse ``{"models": [...]}`` with status 200, deduplicated
 								        on the ``digest``, ``name`` and ``id`` keys.
 								    """
 								    # 1. Query all endpoints for models
 								    tasks = [
 								        fetch.endpoint_details(ep, "/api/tags", "models", skip_error_cache=True, timeout=8)
 								        for ep in config.endpoints
 								        if "/v1" not in ep
 								    ]
 								    # .get() instead of [] so an endpoint without a configured API key does
 								    # not raise KeyError (same lookup as for the llama-server endpoints below).
 								    tasks += [
 								        fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
 								        for ep in config.endpoints
 								        if "/v1" in ep
 								    ]
 								    # Also query llama-server endpoints not already covered by config.endpoints
 								    llama_eps_for_tags = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
 								    tasks += [
 								        fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
 								        for ep in llama_eps_for_tags
 								    ]
 								    all_models = await asyncio.gather(*tasks)

 								    collected = []
 								    for modellist in all_models:
 								        for model in modellist:
 								            if "model" not in model:  # Relabel OpenAI models with Ollama Model.model from Model.id
 								                model['model'] = model['id'] + ":latest"
 								            else:
 								                model['id'] = model['model']
 								            if "name" not in model:  # Relabel OpenAI models with Ollama Model.name from Model.model to have model,name keys
 								                model['name'] = model['model']
 								            else:
 								                model['id'] = model['model']
 								        collected += modellist
 								    # 2. Return a JSONResponse with a deduplicated list of unique models for inference
 								    return JSONResponse(
 								        content={"models": dedupe_on_keys(collected, ['digest', 'name', 'id'])},
 								        status_code=200,
 								    )
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 18. API route – ps
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.get("/api/ps")
 								async def ps_proxy(request: Request):
 								    """
 								    Proxy a ps request to all Ollama and llama-server endpoints and reply a unique list of all running models.

 								    For Ollama endpoints: queries /api/ps
 								    For llama-server endpoints: queries /v1/models with status.value == "loaded"

 								    Returns:
 								        JSONResponse ``{"models": [...]}`` with status 200, deduplicated
 								        on the ``name`` key.
 								    """
 								    # 1. Query Ollama endpoints for running models via /api/ps
 								    ollama_tasks = [
 								        fetch.endpoint_details(ep, "/api/ps", "models", skip_error_cache=True, timeout=8)
 								        for ep in config.endpoints
 								        if "/v1" not in ep
 								    ]
 								    # 2. Query llama-server endpoints for loaded models via /v1/models.
 								    # Every configured llama-server endpoint is queried, whether or not it
 								    # also appears in config.endpoints; the set removes duplicates.
 								    all_llama_endpoints = set(config.llama_server_endpoints)
 								    llama_tasks = [
 								        fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
 								        for ep in all_llama_endpoints
 								    ]
 								    # Run both groups in one gather so a slow or unreachable endpoint in one
 								    # group does not delay the start of the other group's requests.
 								    results = await asyncio.gather(*ollama_tasks, *llama_tasks)
 								    ollama_loaded = results[:len(ollama_tasks)]
 								    llama_loaded = results[len(ollama_tasks):]

 								    running = []
 								    # Add Ollama models (if any)
 								    for modellist in ollama_loaded:
 								        running += modellist
 								    # Add llama-server models (filter for loaded only, if any)
 								    for modellist in llama_loaded:
 								        for item in modellist:
 								            if not _is_llama_model_loaded(item):
 								                continue
 								            # Convert llama-server format to Ollama-like format for consistency
 								            raw_id = item.get("id", "")
 								            normalized = _normalize_llama_model_name(raw_id)
 								            quant = _extract_llama_quant(raw_id)
 								            running.append({
 								                "name": normalized,
 								                "id": normalized,
 								                "digest": "",
 								                "status": item.get("status"),
 								                "details": {"quantization_level": quant} if quant else {},
 								            })
 								    # 3. Return a JSONResponse with deduplicated currently deployed models
 								    # Deduplicate on 'name' rather than 'digest': llama-server models always
 								    # have digest="" so deduping on digest collapses all of them to one entry.
 								    return JSONResponse(
 								        content={"models": dedupe_on_keys(running, ['name'])},
 								        status_code=200,
 								    )
-												Added endpoint differentiation for models ps board

Added endpoint differentiation for models PS board to see where which model is loaded and for how long to ease the viewing of multiple same models deployed for load balancing

											
										
										
											2026-01-27 13:29:54 +01:00
+								# -------------------------------------------------------------
 								# 18b. API route – ps details (backwards compatible)
 								# -------------------------------------------------------------
 								@app.get("/api/ps_details")
 								async def ps_details_proxy(request: Request):
 								    """
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								    Proxy a ps request to all Ollama and llama-server endpoints and reply with per-endpoint instances.
-												Added endpoint differentiation for models ps board

Added endpoint differentiation for models PS board to see where which model is loaded and for how long to ease the viewing of multiple same models deployed for load balancing

											
										
										
											2026-01-27 13:29:54 +01:00
+								    This keeps /api/ps backward compatible while providing richer data.
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
 								    For Ollama endpoints: queries /api/ps
 								    For llama-server endpoints: queries /v1/models with status info
 								    """
 								    # 1. Query Ollama endpoints via /api/ps
-												fix: prevent dashboard and route hangs when endpoints are down by calling skip_error_cache also with reduced timeout

											
										
										
											2026-05-01 13:49:34 +02:00
+								    ollama_tasks = [(ep, fetch.endpoint_details(ep, "/api/ps", "models", skip_error_cache=True, timeout=8)) for ep in config.endpoints if "/v1" not in ep]
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								    # 2. Query llama-server endpoints via /v1/models
 								    # Also query endpoints from llama_server_endpoints that may not be in config.endpoints
 								    all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
 								    llama_tasks = [
-												fix: prevent dashboard and route hangs when endpoints are down by calling skip_error_cache also with reduced timeout

											
										
										
											2026-05-01 13:49:34 +02:00
+								        (ep, fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8))
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								        for ep in all_llama_endpoints
 								    ]
 								    ollama_loaded = await asyncio.gather(*[task for _, task in ollama_tasks]) if ollama_tasks else []
 								    llama_loaded = await asyncio.gather(*[task for _, task in llama_tasks]) if llama_tasks else []
-												Added endpoint differentiation for models ps board

Added endpoint differentiation for models PS board to see where which model is loaded and for how long to ease the viewing of multiple same models deployed for load balancing

											
										
										
											2026-01-27 13:29:54 +01:00
 								    models: list[dict] = []
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
 								    # Add Ollama models with endpoint info (if any)
 								    if ollama_loaded:
 								        for (endpoint, modellist) in zip([ep for ep, _ in ollama_tasks], ollama_loaded):
 								            for model in modellist:
 								                if isinstance(model, dict):
 								                    model_with_endpoint = dict(model)
 								                    model_with_endpoint["endpoint"] = endpoint
 								                    models.append(model_with_endpoint)
 								    # Add llama-server models with endpoint info and full status metadata (if any)
 								    if llama_loaded:
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
+								        # Collect (endpoint, raw_id) pairs to fetch /props in parallel
 								        props_requests: list[tuple[str, str]] = []
 								        llama_models_pending: list[dict] = []
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								        for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
-												feat: new helper to bridge change of behaviour in llama.cpp v1/models status  - now correctly reporting "sleeping" or "loaded" for auto-unload

											
										
										
											2026-05-07 11:34:09 +02:00
+								            # Include sleeping models too so _fetch_llama_props can unload them
 								            loaded_models = [item for item in modellist if _is_llama_model_loaded_or_sleeping(item)]
-												feat(router): Add llama-server endpoints support and model parsing

Add `llama_server_endpoints` configuration field to support llama_server OpenAI-compatible endpoints for status checks. Implement helper functions to parse model names and quantization levels from llama-server responses (best effort). Update `is_ext_openai_endpoint` to properly distinguish these endpoints from external OpenAI services. Update sample configuration documentation.

											
										
										
											2026-02-10 16:46:51 +01:00
+								            for item in loaded_models:
 								                if isinstance(item, dict) and item.get("id"):
 								                    raw_id = item["id"]
 								                    normalized = _normalize_llama_model_name(raw_id)
 								                    quant = _extract_llama_quant(raw_id)
 								                    model_with_endpoint = {
 								                        "name": normalized,
 								                        "id": normalized,
 								                        "original_name": raw_id,
 								                        "digest": "",
 								                        "details": {"quantization_level": quant} if quant else {},
 								                        "endpoint": endpoint,
 								                        "status": item.get("status"),
 								                        "created": item.get("created"),
 								                        "owned_by": item.get("owned_by")
 								                    }
 								                    # Include full llama-server status details (args, preset)
 								                    status_info = item.get("status", {})
 								                    if isinstance(status_info, dict):
 								                        model_with_endpoint["llama_status_args"] = status_info.get("args")
 								                        model_with_endpoint["llama_status_preset"] = status_info.get("preset")
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
+								                    llama_models_pending.append(model_with_endpoint)
 								                    props_requests.append((endpoint, raw_id))
 								        # Fetch /props for each llama-server model to get context length (n_ctx)
 								        # and unload sleeping models automatically
-												fix: exclude embedding models from preemptive context shift caches

											
										
										
											2026-03-12 18:56:51 +01:00
+								        async def _fetch_llama_props(endpoint: str, model_id: str) -> tuple[int | None, bool, bool]:
-												feat: support localhost llama_server access via unix sockets

											
										
										
											2026-04-17 12:41:57 +02:00
+								            client: aiohttp.ClientSession = get_session(endpoint)
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
+								            base_url = endpoint.rstrip("/").removesuffix("/v1")
 								            props_url = f"{base_url}/props?model={model_id}"
 								            headers = None
 								            api_key = config.api_keys.get(endpoint)
 								            if api_key:
 								                headers = {"Authorization": f"Bearer {api_key}"}
 								            try:
-												fix: prevent dashboard and route hangs when endpoints are down by calling skip_error_cache also with reduced timeout

											
										
										
											2026-05-01 13:49:34 +02:00
+								                async with client.get(props_url, headers=headers, timeout=aiohttp.ClientTimeout(total=5)) as resp:
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
+								                    if resp.status == 200:
 								                        data = await resp.json()
 								                        dgs = data.get("default_generation_settings", {})
 								                        n_ctx = dgs.get("n_ctx")
 								                        is_sleeping = data.get("is_sleeping", False)
-												fix: exclude embedding models from preemptive context shift caches

											
										
										
											2026-03-12 18:56:51 +01:00
+								                        # Embedding models have no sampling params in default_generation_settings
 								                        is_generation = "temperature" in dgs
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
 								                        if is_sleeping:
 								                            unload_url = f"{base_url}/models/unload"
 								                            try:
 								                                async with client.post(
 								                                    unload_url,
 								                                    json={"model": model_id},
 								                                    headers=headers,
 								                                ) as unload_resp:
 								                                    print(f"[ps_details] Unloaded sleeping model {model_id} from {endpoint}: {unload_resp.status}")
 								                            except Exception as ue:
 								                                print(f"[ps_details] Failed to unload sleeping model {model_id} from {endpoint}: {ue}")
-												fix: exclude embedding models from preemptive context shift caches

											
										
										
											2026-03-12 18:56:51 +01:00
+								                        return n_ctx, is_sleeping, is_generation
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
+								            except Exception as e:
 								                print(f"[ps_details] Failed to fetch props from {props_url}: {e}")
-												fix: exclude embedding models from preemptive context shift caches

											
										
										
											2026-03-12 18:56:51 +01:00
+								            return None, False, False
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
 								        props_results = await asyncio.gather(
 								            *[_fetch_llama_props(ep, mid) for ep, mid in props_requests]
 								        )
-												fix: exclude embedding models from preemptive context shift caches

											
										
										
											2026-03-12 18:56:51 +01:00
+								        for (ep, raw_id), model_dict, (n_ctx, is_sleeping, is_generation) in zip(props_requests, llama_models_pending, props_results):
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
+								            if n_ctx is not None:
 								                model_dict["context_length"] = n_ctx
-												fix: exclude embedding models from preemptive context shift caches

											
										
										
											2026-03-12 18:56:51 +01:00
+								                if is_generation and 0 < n_ctx <= _CTX_TRIM_SMALL_LIMIT:
-												fix: model name normalization for the context cache used in preemptive context-shifting for smaller context windows with a previous failure

											
										
										
											2026-03-12 16:08:01 +01:00
+								                    normalized = _normalize_llama_model_name(raw_id)
 								                    _endpoint_nctx[(ep, normalized)] = n_ctx
 								                    print(f"[ctx-cache/ps] cached n_ctx={n_ctx} for ({ep},{normalized})", flush=True)
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
+								            if not is_sleeping:
 								                models.append(model_dict)
-												Added endpoint differentiation for models ps board

Added endpoint differentiation to the models PS board, showing which model is loaded on which endpoint and for how long. This eases viewing multiple instances of the same model deployed for load balancing.

											
										
										
											2026-01-27 13:29:54 +01:00
 								    return JSONResponse(content={"models": models}, status_code=200)
-												feat: visualization of conversation affinity in dashboard

											
										
										
											2026-05-13 13:38:37 +02:00
+								# -------------------------------------------------------------
 								# 18b. Conversation-affinity stats – feeds the PS-table dot matrix
 								# -------------------------------------------------------------
 								@app.get("/api/affinity_stats")
 								async def affinity_stats(request: Request):
 								    """
 								    Report the live conversation-affinity pins for the dashboard.

 								    Returns a dict with ``enabled``, ``ttl`` (the configured
 								    ``conversation_affinity_ttl``) and ``entries``. Every entry holds only
 								    the endpoint, the model name and the remaining lifetime in seconds
 								    (rounded to two decimals); the fingerprint key is never included.
 								    Pins found to be expired during the scan are removed from the map.
 								    When conversation_affinity is disabled, ``entries`` is always empty.
 								    """
 								    if not config.conversation_affinity:
 								        return {"enabled": False, "ttl": config.conversation_affinity_ttl, "entries": []}
 								    now = time.monotonic()
 								    pins: list[dict] = []
 								    llama_eps = set(config.llama_server_endpoints)
 								    async with _affinity_lock:
 								        # Walk a snapshot of the map so expired pins can be dropped while scanning.
 								        for fingerprint, (pinned_ep, pinned_model, deadline) in list(_affinity_map.items()):
 								            seconds_left = deadline - now
 								            if seconds_left > 0:
 								                # Use the same model-name normalisation as /api/ps_details for
 								                # llama-server endpoints, so the dashboard can match an affinity
 								                # entry to its PS row via (endpoint, model).
 								                if pinned_ep in llama_eps:
 								                    shown_model = _normalize_llama_model_name(pinned_model)
 								                else:
 								                    shown_model = pinned_model
 								                pins.append({
 								                    "endpoint": pinned_ep,
 								                    "model": shown_model,
 								                    "remaining": round(seconds_left, 2),
 								                })
 								            else:
 								                _affinity_map.pop(fingerprint, None)
 								    return {
 								        "enabled": True,
 								        "ttl": config.conversation_affinity_ttl,
 								        "entries": pins,
 								    }
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 19. Proxy usage route – for monitoring
 								# -------------------------------------------------------------
 								@app.get("/api/usage")
 								async def usage_proxy(request: Request):
 								    """
 								    Expose the router's current usage counters for debugging and monitoring.

 								    The response maps ``usage_counts`` and ``token_usage_counts`` to the
 								    module-level objects of the same name.
 								    """
 								    snapshot = {}
 								    snapshot["usage_counts"] = usage_counts
 								    snapshot["token_usage_counts"] = token_usage_counts
 								    return snapshot
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from backends.probe import _raw_probe, _endpoint_health
-												fix:
-  _fetch_loaded_models_internal now writes _loaded_error_cache[endpoint] = time.time() on /api/ps or /v1/models failure, and clears the entry on success
- choose_endpoint now filters out candidates with a fresh (<300s) loaded-models error.
-  /health now probes both /api/version and /api/ps for Ollama endpoints
-  dashboard adaption

relates to #83

											
										
										
											2026-05-18 13:45:06 +02:00
 								# -------------------------------------------------------------
 								# 20b. Proxy config route – for monitoring and frontend usage
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# -------------------------------------------------------------
 								@app.get("/api/config")
 								async def config_proxy(request: Request):
 								    """
 								    Describe the configured backends and their health for the front-end.

 								    The response lists the Ollama endpoints under ``endpoints`` and the
 								    llama-server endpoints under ``llama_server_endpoints``; each item is
 								    the endpoint URL merged with the result of ``_endpoint_health`` (5 s
 								    timeout). Status is "error" when either liveness (/api/version) or
 								    routing health (/api/ps) fails — see issue #83.
 								    ``require_router_api_key`` tells the UI whether a router API key is set.
 								    """
 								    async def _probe(target: str) -> dict:
 								        # Health fields come after "url", exactly as in a dict-unpacking merge.
 								        health = await _endpoint_health(target, timeout=5)
 								        return {"url": target, **health}

 								    ollama_status = await asyncio.gather(*(_probe(ep) for ep in config.endpoints))
 								    # llama-server endpoints are optional; skip the probe round when none are configured.
 								    if config.llama_server_endpoints:
 								        llama_status = await asyncio.gather(
 								            *(_probe(ep) for ep in config.llama_server_endpoints)
 								        )
 								    else:
 								        llama_status = []
 								    return {
 								        "endpoints": ollama_status,
 								        "llama_server_endpoints": llama_status,
 								        "require_router_api_key": bool(config.router_api_key),
 								    }
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
 								# -------------------------------------------------------------
 								# 21. API route – OpenAI compatible Embedding
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								# -------------------------------------------------------------
 								@app.post("/v1/embeddings")
 								async def openai_embedding_proxy(request: Request):
 								    """
 								    Proxy an OpenAI API compatible embedding request to a backend and reply with embeddings.

 								    The JSON body must be an object with a ``model`` and an ``input``.
 								    ``input`` may be a string, a list, or multimodal content part(s); only
 								    text parts are forwarded, image and other non-text parts are dropped.
 								    Non-finite floats in the returned vectors are replaced by 0.0 so the
 								    response can be serialised as JSON.

 								    Raises:
 								        HTTPException(400): the body is not valid JSON, is not a JSON object,
 								            or lacks ``model`` / ``input``.
 								    """
 								    # 1. Parse and validate request
 								    try:
 								        # orjson takes the raw bytes directly and reports invalid UTF-8 as a
 								        # JSONDecodeError, so a malformed body yields a 400 rather than an
 								        # unhandled UnicodeDecodeError.
 								        payload = orjson.loads(await request.body())
 								    except orjson.JSONDecodeError as e:
 								        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								    if not isinstance(payload, dict):
 								        # Valid JSON that is not an object (e.g. a list) has no fields to read.
 								        raise HTTPException(
 								            status_code=400, detail="Invalid JSON: request body must be an object"
 								        )
 								    model = payload.get("model")
 								    doc = payload.get("input")
 								    # Normalize multimodal input: extract only text parts for embedding models
 								    if isinstance(doc, list):
 								        normalized = []
 								        for item in doc:
 								            if isinstance(item, dict):
 								                # Multimodal content part - keep text, skip image_url and other types
 								                if item.get("type") == "text":
 								                    normalized.append(item.get("text", ""))
 								            else:
 								                normalized.append(item)
 								        # A single remaining element is sent as a bare value instead of a one-item list
 								        doc = normalized if len(normalized) != 1 else normalized[0]
 								    elif isinstance(doc, dict) and doc.get("type") == "text":
 								        doc = doc.get("text", "")
 								    if not model:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'model'"
 								        )
 								    if not doc:
 								        raise HTTPException(
 								            status_code=400, detail="Missing required field 'input'"
 								        )
 								    # 2. Endpoint logic
 								    endpoint, tracking_model = await choose_endpoint(model)
 								    # Everything after choose_endpoint runs inside the try so that
 								    # decrement_usage is always called, even if building the client fails.
 								    # NOTE(review): presumably choose_endpoint reserves the usage slot that
 								    # decrement_usage releases — confirm against its implementation.
 								    try:
 								        if is_openai_compatible(endpoint):
 								            api_key = config.api_keys.get(endpoint, "no-key")
 								        else:
 								            api_key = "ollama"
 								        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=api_key)
 								        async_gen = await oclient.embeddings.create(input=doc, model=model)
 								        result = async_gen.model_dump()
 								        for item in result.get("data", []):
 								            emb = item.get("embedding")
 								            # Only sanitise real vectors; iterating a string would split it into characters.
 								            if emb and isinstance(emb, list):
 								                item["embedding"] = [0.0 if isinstance(v, float) and not math.isfinite(v) else v for v in emb]
 								        return JSONResponse(content=result)
 								    finally:
 								        await decrement_usage(endpoint, tracking_model)
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 22. API route – OpenAI compatible Chat Completions
-												Update router.py

added t.b.d. OpenAI API compatible endpoints
											
										
										
											2025-08-27 09:23:59 +02:00
+								# -------------------------------------------------------------
 								@app.post("/v1/chat/completions")
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								async def openai_chat_completions_proxy(request: Request):
 								    """
 								    Proxy an OpenAI API compatible chat completions request to Ollama and reply with a streaming response.
 								    """
 								    # 1. Parse and validate request
 								    try:
 								        body_bytes = await request.body()
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								        payload = orjson.loads(body_bytes.decode("utf-8"))
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
 								        model = payload.get("model")
 								        messages = payload.get("messages")
 								        frequency_penalty = payload.get("frequency_penalty")
 								        presence_penalty = payload.get("presence_penalty")
 								        response_format = payload.get("response_format")
 								        seed = payload.get("seed")
 								        stop = payload.get("stop")
 								        stream = payload.get("stream")
 								        stream_options = payload.get("stream_options")
 								        temperature = payload.get("temperature")
 								        top_p = payload.get("top_p")
 								        max_tokens = payload.get("max_tokens")
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								        max_completion_tokens = payload.get("max_completion_tokens")
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								        tools = payload.get("tools")
-												feat: adding logprobs to /v1/chat/completion

											
										
										
											2026-02-13 14:43:10 +01:00
+								        logprobs = payload.get("logprobs")
 								        top_logprobs = payload.get("top_logprobs")
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								        _cache_enabled = payload.get("nomyo", {}).get("cache", False)
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
-												fix(router.py):
- added global for orphaned token_worker_task and flush_task
- fixed a regex to effectively _mask_secrets
- fixed several Type and KeyErrors
- fixed model deduplication for llama_server_endpoints

											
										
										
											2026-03-03 16:34:16 +01:00
+								        if not model:
 								            raise HTTPException(
 								                status_code=400, detail="Missing required field 'model'"
 								            )
 								        if not isinstance(messages, list):
 								            raise HTTPException(
 								                status_code=400, detail="Missing required field 'messages' (must be a list)"
 								            )
-												fixing model re-naming in /v1 endpoints and thinking in rechunk

											
										
										
											2025-09-17 11:40:48 +02:00
+								        if ":latest" in model:
 								            model = model.split(":latest")
 								            model = model[0]
-												fix: missing requirement

fix: strip assistant prefill when ollama -> openai translation + openai guard

											
										
										
											2026-04-06 11:32:47 +02:00
+								        messages = _strip_assistant_prefill(messages)
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								        params = {
-												fix(router.py):
- added global for orphaned token_worker_task and flush_task
- fixed a regex to effectively _mask_secrets
- fixed several Type and KeyErrors
- fixed model deduplication for llama_server_endpoints

											
										
										
											2026-03-03 16:34:16 +01:00
+								            "messages": messages,
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								            "model": model,
 								        }
-												better v1 endpoint parameter handling

											
										
										
											2025-09-11 13:56:51 +02:00
+								        optional_params = {
 								            "tools": tools,
 								            "response_format": response_format,
-												adding usage metrics to /v1 endpoints if stream == True

											
										
										
											2025-11-21 09:56:42 +01:00
+								            "stream_options": stream_options or {"include_usage": True },
-												better v1 endpoint parameter handling

											
										
										
											2025-09-11 13:56:51 +02:00
+								            "max_completion_tokens": max_completion_tokens,
 								            "max_tokens": max_tokens,
 								            "temperature": temperature,
 								            "top_p": top_p,
 								            "seed": seed,
 								            "presence_penalty": presence_penalty,
 								            "frequency_penalty": frequency_penalty,
 								            "stop": stop,
 								            "stream": stream,
-												feat: adding logprobs to /v1/chat/completion

											
										
										
											2026-02-13 14:43:10 +01:00
+								            "logprobs": logprobs,
 								            "top_logprobs": top_logprobs,
-												better v1 endpoint parameter handling

											
										
										
											2025-09-11 13:56:51 +02:00
+								        }
 								        params.update({k: v for k, v in optional_params.items() if v is not None})
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								    except orjson.JSONDecodeError as e:
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								    # Reject unsupported image formats (SVG) before doing any work
 								    for _msg in messages:
 								        for _item in (_msg.get("content") or []) if isinstance(_msg.get("content"), list) else []:
 								            if _item.get("type") == "image_url":
 								                _url = (_item.get("image_url") or {}).get("url", "")
 								                if _url.startswith("data:image/svg") or _url.lower().endswith(".svg"):
 								                    raise HTTPException(
 								                        status_code=400,
 								                        detail="SVG images are not supported. Please convert the image to PNG or JPEG before sending.",
 								                    )
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								    # Cache lookup — before endpoint selection
 								    _cache = get_llm_cache()
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								    if _cache is not None and _cache_enabled:
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								        _cached = await _cache.get_chat("openai_chat", model, messages)
 								        if _cached is not None:
 								            if stream:
 								                _sse = openai_nonstream_to_sse(_cached, model)
 								                async def _serve_cached_ochat_stream():
 								                    yield _sse
 								                return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream")
 								            else:
 								                async def _serve_cached_ochat_json():
 								                    yield _cached
 								                return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								    # 2. Endpoint logic
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								    _affinity_key = _conversation_fingerprint(model, messages, None)
 								    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
-												feat: support localhost llama_server access via unix sockets

											
										
										
											2026-04-17 12:41:57 +02:00
+								    oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								    # 3. Helpers and API call — done in handler scope so try/except works reliably
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
async def _normalize_images_in_messages(msgs: list) -> list:
    """Inline remote ``image_url`` parts as base64 ``data:`` URLs.

    Every message whose ``content`` is a list of parts is scanned. Each
    ``image_url`` part that points at a non-``data:`` URL is downloaded
    through the shared session stored in ``app_state["session"]`` and replaced
    by a part carrying ``data:<content-type>;base64,<payload>``, so that
    Ollama/llama-server can handle the image without making outbound HTTP
    requests.

    Args:
        msgs: Chat messages (dicts). The input is not mutated; rewritten
            messages are shallow copies with a fresh ``content`` list.

    Returns:
        A new list with one entry per input message, in the original order.
        Messages whose ``content`` is not a list are passed through as the
        same object.

    Download problems never raise: on any error (including a non-2xx HTTP
    status) the failure is printed and the original part is kept unchanged.
    """
    resolved = []
    for msg in msgs:
        content = msg.get("content")
        if not isinstance(content, list):
            # Plain-string (or missing) content has no image parts to resolve.
            resolved.append(msg)
            continue
        new_content = []
        for item in content:
            url = ""
            # Non-dict parts cannot be image parts; pass them through untouched.
            if isinstance(item, dict) and item.get("type") == "image_url":
                url = (item.get("image_url") or {}).get("url", "")
            if not url or url.startswith("data:"):
                # Not an image, no URL, or already inlined — nothing to fetch.
                new_content.append(item)
                continue
            try:
                http: aiohttp.ClientSession = app_state["session"]
                async with http.get(url) as resp:
                    # Without this check a 404/500 error page would be
                    # base64-encoded and forwarded as if it were image data.
                    resp.raise_for_status()
                    # Drop parameters such as "; charset=..." from the MIME type.
                    ctype = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
                    img_bytes = await resp.read()
                b64 = base64.b64encode(img_bytes).decode("utf-8")
                new_content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:{ctype};base64,{b64}"}
                })
            except Exception as _ie:
                # Best effort: keep the original part so the backend can still
                # try (or reject) the URL itself.
                print(f"[image] Failed to fetch image URL: {_ie}")
                new_content.append(item)
        resolved.append({**msg, "content": new_content})
    return resolved
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								    # Make the API call in handler scope — try/except inside async generators is unreliable
 								    # with Starlette's streaming machinery, so we resolve errors here before the generator starts.
 								    send_params = params
 								    if not is_ext_openai_endpoint(endpoint):
 								        resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
 								        send_params = {**params, "messages": resolved_msgs}
-												fix: model name normalization for context_cash preemptive context-shifting for smaller context-windows with previous failure

											
										
										
											2026-03-12 16:08:01 +01:00
+								    # Proactive trim: only for small-ctx models we've already seen run out of space
 								    _lookup_model = _normalize_llama_model_name(model) if endpoint in config.llama_server_endpoints else model
 								    _known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
 								    if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
 								        _pre_target = int(((_known_nctx - _known_nctx // 4)) / 1.2)
 								        _pre_est = _count_message_tokens(send_params.get("messages", []))
 								        if _pre_est > _pre_target:
 								            _pre_msgs = send_params.get("messages", [])
 								            _pre_trimmed = _trim_messages_for_context(_pre_msgs, _known_nctx, target_tokens=_pre_target)
 								            _dropped = len(_pre_msgs) - len(_pre_trimmed)
 								            print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
 								            send_params = {**send_params, "messages": _pre_trimmed}
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								    try:
 								        async_gen = await oclient.chat.completions.create(**send_params)
 								    except Exception as e:
 								        _e_str = str(e)
 								        _is_ctx_err = "exceed_context_size_error" in _e_str or "exceeds the available context size" in _e_str
 								        print(f"[ochat] caught={type(e).__name__} ctx={_is_ctx_err} msg={_e_str[:120]}", flush=True)
 								        if "does not support tools" in _e_str:
 								            # Model doesn't support tools — retry without them
 								            print(f"[ochat] retry: no tools", flush=True)
-												feat(router): implement in-flight request tracking to prevent cache stampede in high concurrency scenarios

Added in-flight request tracking mechanism to prevent cache stampede when multiple concurrent requests arrive for the same endpoint. This introduces new dictionaries to track ongoing requests and a lock to coordinate access. The available_models method was refactored to use an internal helper function and includes request coalescing logic to ensure only one HTTP request is made per endpoint when cache entries expire. The loaded_models method was also updated to use the new caching and coalescing pattern.

											
										
										
											2026-01-29 18:00:33 +01:00
+								            try:
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
 								                async_gen = await oclient.chat.completions.create(**params_without_tools)
 								            except Exception:
 								                await decrement_usage(endpoint, tracking_model)
 								                raise
 								        elif _is_ctx_err:
 								            # Backend context limit hit — apply sliding-window trim (context-shift at message level)
 								            err_body = getattr(e, "body", {}) or {}
 								            err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
 								            n_ctx_limit = err_detail.get("n_ctx", 0)
 								            actual_tokens = err_detail.get("n_prompt_tokens", 0)
-												fix: catch non-standard openai sdk error bodies for parsing

											
										
										
											2026-03-12 19:08:01 +01:00
+								            # Fallback: parse from string if body parsing yielded nothing (SDK may not parse llama-server errors)
 								            if not n_ctx_limit:
 								                import re as _re
 								                _m = _re.search(r"'n_ctx':\s*(\d+)", _e_str)
 								                if _m:
 								                    n_ctx_limit = int(_m.group(1))
 								                _m = _re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
 								                if _m:
 								                    actual_tokens = int(_m.group(1))
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								            print(f"[ctx-trim] n_ctx={n_ctx_limit} actual={actual_tokens}", flush=True)
 								            if not n_ctx_limit:
 								                await decrement_usage(endpoint, tracking_model)
 								                raise
-												fix: model name normalization for context_cash preemptive context-shifting for smaller context-windows with previous failure

											
										
										
											2026-03-12 16:08:01 +01:00
+								            if n_ctx_limit <= _CTX_TRIM_SMALL_LIMIT:
 								                _endpoint_nctx[(endpoint, model)] = n_ctx_limit
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
 								            msgs_to_trim = send_params.get("messages", [])
 								            try:
 								                cal_target = _calibrated_trim_target(msgs_to_trim, n_ctx_limit, actual_tokens)
 								                trimmed_messages = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
 								            except Exception as _helper_exc:
 								                print(f"[ctx-trim] helper crash: {type(_helper_exc).__name__}: {str(_helper_exc)[:100]}", flush=True)
 								                await decrement_usage(endpoint, tracking_model)
 								                raise
 								            dropped = len(msgs_to_trim) - len(trimmed_messages)
 								            print(f"[ctx-trim] target={cal_target} dropped={dropped} remaining={len(trimmed_messages)} retrying-1", flush=True)
 								            try:
 								                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": trimmed_messages})
 								                print(f"[ctx-trim] retry-1 ok", flush=True)
 								            except Exception as e2:
 								                _e2_str = str(e2)
 								                if "exceed_context_size_error" in _e2_str or "exceeds the available context size" in _e2_str:
 								                    # Still too large — tool definitions likely consuming too many tokens, strip them too
 								                    print(f"[ctx-trim] retry-1 still exceeded, stripping tools retrying-2", flush=True)
 								                    params_no_tools = {k: v for k, v in send_params.items() if k not in ("tools", "tool_choice")}
 								                    try:
 								                        async_gen = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed_messages})
 								                        print(f"[ctx-trim] retry-2 ok", flush=True)
 								                    except Exception:
 								                        await decrement_usage(endpoint, tracking_model)
-												feat: add reactive auto context-shift in openai endpoints to recover from out-of-context errors

											
										
										
											2026-03-12 10:15:52 +01:00
+								                        raise
-												feat: add an openai retry if a request with an image is sent to a pure text model

											
										
										
											2026-03-12 10:06:18 +01:00
+								                else:
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								                    await decrement_usage(endpoint, tracking_model)
-												feat: add an openai retry if a request with an image is sent to a pure text model

											
										
										
											2026-03-12 10:06:18 +01:00
+								                    raise
-												feat: completion errors on an endpoint:model key are caught, cached and rerouted (openai compatible endpoints)

											
										
										
											2026-05-18 18:14:28 +02:00
+								        elif _is_backend_connection_error(e):
 								            # Upstream connection failed (e.g. llama-server in router mode
 								            # whose delegated worker died). Mark (endpoint, model) so the
 								            # next request reroutes; the client will retry this one.
 								            print(f"[ochat] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
 								            await _mark_backend_unhealthy(endpoint, model, _e_str)
 								            await decrement_usage(endpoint, tracking_model)
 								            raise
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
+								        elif "image input is not supported" in _e_str:
 								            # Model doesn't support images — strip and retry
 								            print(f"[openai_chat_completions_proxy] Model {model} doesn't support images, retrying with text-only messages")
 								            try:
 								                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": _strip_images_from_messages(send_params.get("messages", []))})
 								            except Exception:
 								                await decrement_usage(endpoint, tracking_model)
 								                raise
 								        else:
 								            await decrement_usage(endpoint, tracking_model)
 								            raise
 								    # 4. Async generator — only streams the already-established async_gen
 								    async def stream_ochat_response():
 								        try:
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								            if stream == True:
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                content_parts: list[str] = []
 								                usage_snapshot: dict = {}
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								                async for chunk in async_gen:
 								                    data = (
 								                        chunk.model_dump_json()
 								                        if hasattr(chunk, "model_dump_json")
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								                        else orjson.dumps(chunk)
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								                    )
-												adding usage metrics to /v1 endpoints if stream == True

											
										
										
											2025-11-21 09:56:42 +01:00
+								                    if chunk.choices:
-												feat(router): parallelize llama-server props fetch and add reasoning/tool call support

- Fetch `/props` endpoints in parallel to get context length and auto-unload sleeping models
- Add support for reasoning content and tool calls in streaming openai chat/completions responses

											
										
										
											2026-02-15 17:05:35 +01:00
+								                        delta = chunk.choices[0].delta
 								                        has_content = delta.content is not None
 								                        has_reasoning = (
 								                            getattr(delta, "reasoning_content", None) is not None
 								                            or getattr(delta, "reasoning", None) is not None
 								                        )
 								                        has_tool_calls = getattr(delta, "tool_calls", None) is not None
 								                        if has_content or has_reasoning or has_tool_calls:
-												adding usage metrics to /v1 endpoints if stream == True

											
										
										
											2025-11-21 09:56:42 +01:00
+								                            yield f"data: {data}\n\n".encode("utf-8")
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                        if has_content and delta.content:
 								                            content_parts.append(delta.content)
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
+								                    elif chunk.usage is not None:
 								                        # Forward the usage-only final chunk (e.g. from llama-server)
 								                        yield f"data: {data}\n\n".encode("utf-8")
 								                    prompt_tok = 0
 								                    comp_tok = 0
-												adding usage metrics to /v1 endpoints if stream == True

											
										
										
											2025-11-21 09:56:42 +01:00
+								                    if chunk.usage is not None:
 								                        prompt_tok = chunk.usage.prompt_tokens or 0
 								                        comp_tok   = chunk.usage.completion_tokens or 0
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                        usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
+								                    else:
 								                        llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
 								                        if llama_usage:
 								                            prompt_tok, comp_tok = llama_usage
 								                    if prompt_tok != 0 or comp_tok != 0:
-												fix(router): normalize model names for usage tracking across endpoints

											
										
										
											2026-02-17 11:35:53 +01:00
+								                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
-												fix: model name normalization for context_cash preemptive context-shifting for smaller context-windows with previous failure

											
										
										
											2026-03-12 16:08:01 +01:00
+								                    # Detect context exhaustion mid-generation for small-ctx models.
 								                    # Guard: skip if max_tokens was set in the request — finish_reason=length
 								                    # could just mean the caller's token budget was exhausted, not the context window.
 								                    _req_max_tok = send_params.get("max_tokens") or send_params.get("max_completion_tokens")
 								                    if chunk.choices and chunk.choices[0].finish_reason == "length" and not _req_max_tok:
 								                        _inferred_nctx = (prompt_tok + comp_tok) or 0
 								                        if 0 < _inferred_nctx <= _CTX_TRIM_SMALL_LIMIT:
 								                            _endpoint_nctx[(endpoint, model)] = _inferred_nctx
 								                            print(f"[ctx-cache] finish_reason=length → cached n_ctx={_inferred_nctx} for ({endpoint},{model})", flush=True)
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                # Cache assembled streaming response — before [DONE] so it always runs
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								                if _cache is not None and _cache_enabled and content_parts:
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                    assembled = orjson.dumps({
 								                        "model": model,
 								                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
 								                        **({"usage": usage_snapshot} if usage_snapshot else {}),
 								                    }) + b"\n"
 								                    try:
 								                        await _cache.set_chat("openai_chat", model, messages, assembled)
 								                    except Exception as _ce:
 								                        print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}")
-												poc: message translation with images

											
										
										
											2025-09-23 17:33:15 +02:00
+								                yield b"data: [DONE]\n\n"
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								            else:
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
+								                prompt_tok = 0
 								                comp_tok = 0
 								                if async_gen.usage is not None:
 								                    prompt_tok = async_gen.usage.prompt_tokens or 0
 								                    comp_tok   = async_gen.usage.completion_tokens or 0
 								                else:
 								                    llama_usage = rechunk.extract_usage_from_llama_timings(async_gen)
 								                    if llama_usage:
 								                        prompt_tok, comp_tok = llama_usage
-												fixing token_queue, prepping chart view

											
										
										
											2025-11-18 19:02:36 +01:00
+								                if prompt_tok != 0 or comp_tok != 0:
-												fix(router): normalize model names for usage tracking across endpoints

											
										
										
											2026-02-17 11:35:53 +01:00
+								                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								                json_line = (
 								                    async_gen.model_dump_json()
 								                    if hasattr(async_gen, "model_dump_json")
-												various performance improvements and json replacement orjson

											
										
										
											2025-11-10 15:37:46 +01:00
+								                    else orjson.dumps(async_gen)
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								                )
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                cache_bytes = json_line.encode("utf-8") + b"\n"
 								                yield cache_bytes
 								                # Cache non-streaming response
-												fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up

											
										
										
											2026-03-10 15:19:37 +01:00
+								                if _cache is not None and _cache_enabled:
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								                    try:
 								                        await _cache.set_chat("openai_chat", model, messages, cache_bytes)
 								                    except Exception as _ce:
 								                        print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}")
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
 								        finally:
 								            # Ensure counter is decremented even if an exception occurs
-												fix(router): normalize model names for usage tracking across endpoints

											
										
										
											2026-02-17 11:35:53 +01:00
+								            await decrement_usage(endpoint, tracking_model)
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
 								    # 4. Return a StreamingResponse backed by the generator
 								    return StreamingResponse(
 								        stream_ochat_response(),
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
+								        media_type="text/event-stream" if stream else "application/json",
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								    )
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 23. API route – OpenAI compatible Completions
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								# -------------------------------------------------------------
-												Update router.py

added t.b.d. OpenAI API compatible endpoints
											
										
										
											2025-08-27 09:23:59 +02:00
@app.post("/v1/completions")
async def openai_completions_proxy(request: Request):
    """
    Proxy an OpenAI API compatible (legacy) text completions request to a
    backend endpoint and reply with a StreamingResponse.

    Request body (JSON object):
        model  (required)  – model name; a ":latest" tag is stripped
        prompt (required)  – prompt forwarded to the backend
        frequency_penalty, presence_penalty, seed, stop, stream,
        stream_options, temperature, top_p, max_tokens, suffix (optional)
                           – forwarded to the backend when not null
        max_completion_tokens (optional)
                           – used as ``max_tokens`` when ``max_tokens`` is absent
        nomyo.cache (optional, default False)
                           – opt in to the LLM cache for this request

    Returns:
        StreamingResponse with media type "text/event-stream" when ``stream``
        is truthy, otherwise "application/json".

    Raises:
        HTTPException(400): invalid JSON, a body that is not a JSON object, or
            a missing/invalid 'model' or missing 'prompt'.
        Exception: whatever the backend call raises is re-raised after
            ``decrement_usage`` has been called for the chosen endpoint.
    """
    # 1. Parse and validate request
    try:
        body_bytes = await request.body()
        # orjson accepts raw bytes and reports invalid UTF-8 as a
        # JSONDecodeError, so a malformed body becomes a 400 instead of an
        # unhandled UnicodeDecodeError.
        payload = orjson.loads(body_bytes)
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
    if not isinstance(payload, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object"
        )

    model = payload.get("model")
    prompt = payload.get("prompt")
    stream = payload.get("stream")
    stream_options = payload.get("stream_options")
    max_tokens = payload.get("max_tokens")
    max_completion_tokens = payload.get("max_completion_tokens")

    # "nomyo" may be absent, null or a non-object; only a dict can opt in.
    _nomyo_opts = payload.get("nomyo")
    _cache_enabled = (
        _nomyo_opts.get("cache", False) if isinstance(_nomyo_opts, dict) else False
    )

    if not model:
        raise HTTPException(
            status_code=400, detail="Missing required field 'model'"
        )
    if not isinstance(model, str):
        raise HTTPException(
            status_code=400, detail="Field 'model' must be a string"
        )
    if not prompt:
        raise HTTPException(
            status_code=400, detail="Missing required field 'prompt'"
        )
    if ":latest" in model:
        model = model.split(":latest")[0]

    # The completions call takes ``max_tokens``; ``max_completion_tokens`` is
    # the chat-completions spelling, so fold it into ``max_tokens`` instead of
    # passing an unsupported keyword argument to the client.
    if max_tokens is None:
        max_tokens = max_completion_tokens

    # Usage reporting is requested by default, but only for streaming calls:
    # ``stream_options`` is documented as valid only together with stream=true.
    # An explicit client value on a non-streaming call is forwarded untouched.
    if stream:
        stream_options = stream_options or {"include_usage": True}

    params = {
        "prompt": prompt,
        "model": model,
    }
    optional_params = {
        "frequency_penalty": payload.get("frequency_penalty"),
        "presence_penalty": payload.get("presence_penalty"),
        "seed": payload.get("seed"),
        "stop": payload.get("stop"),
        "stream": stream,
        "stream_options": stream_options,
        "temperature": payload.get("temperature"),
        "top_p": payload.get("top_p"),
        "max_tokens": max_tokens,
        "suffix": payload.get("suffix"),
    }
    params.update({k: v for k, v in optional_params.items() if v is not None})

    # Cache lookup — completions prompt mapped to a single-turn messages list
    _cache = get_llm_cache()
    _compl_messages = [{"role": "user", "content": prompt}]
    if _cache is not None and _cache_enabled:
        _cached = await _cache.get_chat("openai_completions", model, _compl_messages)
        if _cached is not None:
            if stream:
                _sse = openai_nonstream_to_sse(_cached, model)

                async def _serve_cached_ocompl_stream():
                    yield _sse

                return StreamingResponse(
                    _serve_cached_ocompl_stream(), media_type="text/event-stream"
                )

            async def _serve_cached_ocompl_json():
                yield _cached

            return StreamingResponse(
                _serve_cached_ocompl_json(), media_type="application/json"
            )

    # 2. Endpoint logic
    _affinity_key = _conversation_fingerprint(model, None, prompt)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
    oclient = _make_openai_client(
        endpoint,
        default_headers=default_headers,
        api_key=config.api_keys.get(endpoint, "no-key"),
    )

    # 3. Make the API call in handler scope (try/except inside async generators
    #    is unreliable), releasing the usage counter if the call fails.
    try:
        async_gen = await oclient.completions.create(**params)
    except Exception as e:
        if _is_backend_connection_error(e):
            print(f"[ocompl] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
            await _mark_backend_unhealthy(endpoint, model, str(e))
        await decrement_usage(endpoint, tracking_model)
        raise

    async def stream_ocompletions_response(model=model):
        """Yield the backend response as bytes and always decrement the usage counter."""
        try:
            # Truthiness (not ``== True``) keeps this branch in step with the
            # cache path above and the media type chosen below.
            if stream:
                text_parts: list[str] = []
                usage_snapshot: dict = {}
                async for chunk in async_gen:
                    data = (
                        chunk.model_dump_json()
                        if hasattr(chunk, "model_dump_json")
                        # orjson.dumps returns bytes; decode so the SSE line
                        # does not contain a Python bytes repr.
                        else orjson.dumps(chunk).decode("utf-8")
                    )
                    if chunk.choices:
                        choice = chunk.choices[0]
                        has_text = getattr(choice, "text", None) is not None
                        has_reasoning = (
                            getattr(choice, "reasoning_content", None) is not None
                            or getattr(choice, "reasoning", None) is not None
                        )
                        if has_text or has_reasoning or choice.finish_reason is not None:
                            yield f"data: {data}\n\n".encode("utf-8")
                        if has_text and choice.text:
                            text_parts.append(choice.text)
                    elif chunk.usage is not None:
                        # Forward the usage-only final chunk (e.g. from llama-server)
                        yield f"data: {data}\n\n".encode("utf-8")

                    prompt_tok = 0
                    comp_tok = 0
                    if chunk.usage is not None:
                        prompt_tok = chunk.usage.prompt_tokens or 0
                        comp_tok = chunk.usage.completion_tokens or 0
                        usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
                    else:
                        llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
                        if llama_usage:
                            prompt_tok, comp_tok = llama_usage
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))

                # Cache assembled streaming response — before [DONE] so it always runs
                # NOTE(review): the cached entry uses the chat "message" shape rather
                # than the completions "text" shape; presumably openai_nonstream_to_sse
                # expects this — confirm before changing.
                if _cache is not None and _cache_enabled and text_parts:
                    assembled = orjson.dumps({
                        "model": model,
                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
                        **({"usage": usage_snapshot} if usage_snapshot else {}),
                    }) + b"\n"
                    try:
                        await _cache.set_chat("openai_completions", model, _compl_messages, assembled)
                    except Exception as _ce:
                        print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}")
                # Final DONE event
                yield b"data: [DONE]\n\n"
            else:
                prompt_tok = 0
                comp_tok = 0
                if async_gen.usage is not None:
                    prompt_tok = async_gen.usage.prompt_tokens or 0
                    comp_tok = async_gen.usage.completion_tokens or 0
                else:
                    llama_usage = rechunk.extract_usage_from_llama_timings(async_gen)
                    if llama_usage:
                        prompt_tok, comp_tok = llama_usage
                if prompt_tok != 0 or comp_tok != 0:
                    await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))

                if hasattr(async_gen, "model_dump_json"):
                    cache_bytes = async_gen.model_dump_json().encode("utf-8") + b"\n"
                else:
                    # orjson.dumps already returns bytes; no encode step needed.
                    cache_bytes = orjson.dumps(async_gen) + b"\n"
                yield cache_bytes
                # Cache non-streaming response
                if _cache is not None and _cache_enabled:
                    try:
                        await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
                    except Exception as _ce:
                        print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}")
        finally:
            # Ensure counter is decremented even if an exception occurs
            await decrement_usage(endpoint, tracking_model)

    # 4. Return a StreamingResponse backed by the generator
    return StreamingResponse(
        stream_ocompletions_response(),
        media_type="text/event-stream" if stream else "application/json",
    )
 								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# 24. OpenAI API compatible models endpoint
-												Adding OpenAI compatibility

New Endpoints
New Requirements
											
										
										
											2025-08-28 09:40:33 +02:00
+								# -------------------------------------------------------------
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
@app.get("/v1/models")
async def openai_models_proxy(request: Request):
    """
    Proxy an OpenAI API models request to all configured backends and reply
    with a unique list of models.

    Sources, queried concurrently via ``fetch.endpoint_details``:
        - Ollama endpoints (entries of ``config.endpoints`` without "/v1"):
          path "/api/tags", key "models"
        - external OpenAI endpoints (``is_ext_openai_endpoint``):
          path "/models", key "data"
        - llama-server endpoints (``config.llama_server_endpoints``):
          path "/models", key "data"; no status filtering is applied here

    Every entry is given both an ``id`` and a ``name`` (``id`` wins when
    present) and the merged list is deduplicated on ``name``.

    Returns:
        JSONResponse with body ``{"data": [...]}`` and status 200.
    """
    # 1. Build the fetch coroutines for each backend family. Each request
    #    bypasses the error cache and uses a short timeout so a dead endpoint
    #    cannot stall the route.
    ollama_tasks = [
        fetch.endpoint_details(ep, "/api/tags", "models", skip_error_cache=True, timeout=8)
        for ep in config.endpoints
        if "/v1" not in ep
    ]
    ext_openai_tasks = [
        fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
        for ep in config.endpoints
        if is_ext_openai_endpoint(ep)
    ]
    # llama-server endpoints may or may not also appear in config.endpoints;
    # dict.fromkeys removes duplicates while keeping the configured order.
    all_llama_endpoints = dict.fromkeys(config.llama_server_endpoints)
    llama_tasks = [
        fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True, timeout=8)
        for ep in all_llama_endpoints
    ]

    # 2. Run every fetch in one gather so the worst-case latency is a single
    #    timeout rather than the sum of three sequential groups. gather keeps
    #    argument order, so precedence stays Ollama → external OpenAI →
    #    llama-server for the deduplication below.
    tasks = [*ollama_tasks, *ext_openai_tasks, *llama_tasks]
    model_lists = await asyncio.gather(*tasks) if tasks else []

    # 3. Normalise entries so each carries both an OpenAI-style 'id' and a 'name'
    models = []
    for modellist in model_lists:
        for model in modellist:
            if "id" not in model:
                # Relabel Ollama-style entries with OpenAI Model.id from Model.name
                model["id"] = model.get("name", "")
            else:
                model["name"] = model["id"]
            models.append(model)

    # 4. Return a JSONResponse with a deduplicated list of unique models for inference
    return JSONResponse(
        content={"data": dedupe_on_keys(models, ['name'])},
        status_code=200,
    )
 								# -------------------------------------------------------------
-												feat: adding /v1/rerank endpoint with cohere,jina,llama.cpp compatibility

											
										
										
											2026-02-28 09:31:25 +01:00
+								# 25. API route – OpenAI/Jina/Cohere compatible Rerank
 								# -------------------------------------------------------------
 								@app.post("/v1/rerank")
 								@app.post("/rerank")
 								async def rerank_proxy(request: Request):
 								    """
 								    Proxy a rerank request to a llama-server or external OpenAI-compatible endpoint.
 								    Compatible with the Jina/Cohere rerank API convention used by llama-server,
 								    vLLM, and services such as Cohere and Jina AI.
 								    Ollama does not natively support reranking; requests routed to a plain Ollama
 								    endpoint will receive a 501 Not Implemented response.
 								    Request body:
 								        model           (str, required)  – reranker model name
 								        query           (str, required)  – search query
 								        documents       (list[str], required) – candidate documents to rank
 								        top_n           (int, optional)  – limit returned results (default: all)
 								        return_documents (bool, optional) – include document text in results
 								        max_tokens_per_doc (int, optional) – truncation limit per document
 								    Response (Jina/Cohere-compatible):
 								        {
 								          "id": "...",
 								          "model": "...",
 								          "usage": {"prompt_tokens": N, "total_tokens": N},
 								          "results": [{"index": 0, "relevance_score": 0.95}, ...]
 								        }
 								    Raises:
 								        HTTPException 400 – body is not a JSON object or a required field is missing
 								        HTTPException 404 – `choose_endpoint` raised RuntimeError for the model
 								        HTTPException 501 – the chosen endpoint is not OpenAI-compatible
 								        HTTPException 502 – the upstream body is not valid JSON
 								        HTTPException <upstream status> – the upstream replied with status >= 400
 								    """
 								    body_bytes = await request.body()
 								    try:
 								        # orjson parses bytes directly and reports invalid UTF-8 as a
 								        # JSONDecodeError, so a badly encoded body becomes a 400 instead of
 								        # an unhandled UnicodeDecodeError.
 								        payload = orjson.loads(body_bytes)
 								    except orjson.JSONDecodeError as e:
 								        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
 								    if not isinstance(payload, dict):
 								        # Valid JSON that is not an object (list, string, ...) has no fields to read.
 								        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
 								    model = payload.get("model")
 								    query = payload.get("query")
 								    documents = payload.get("documents")
 								    if not model:
 								        raise HTTPException(status_code=400, detail="Missing required field 'model'")
 								    if not query:
 								        raise HTTPException(status_code=400, detail="Missing required field 'query'")
 								    if not isinstance(documents, list) or not documents:
 								        raise HTTPException(status_code=400, detail="Missing or empty required field 'documents' (must be a non-empty list)")
 								    # Determine which endpoint serves this model
 								    try:
 								        endpoint, tracking_model = await choose_endpoint(model)
 								    except RuntimeError as e:
 								        raise HTTPException(status_code=404, detail=str(e)) from e
 								    # Every exit path after a successful choose_endpoint() must be paired with
 								    # exactly one decrement_usage(); a single try/finally guarantees that even
 								    # when URL/session preparation below raises.
 								    try:
 								        # Ollama endpoints have no native rerank support
 								        if not is_openai_compatible(endpoint):
 								            raise HTTPException(
 								                status_code=501,
 								                detail=(
 								                    f"Endpoint '{endpoint}' is a plain Ollama instance which does not support "
 								                    "reranking. Use a llama-server or OpenAI-compatible endpoint with a "
 								                    "dedicated reranker model."
 								                ),
 								            )
 								        if ":latest" in model:
 								            model = model.split(":latest")[0]
 								        # Build upstream rerank request body – forward only recognised fields
 								        upstream_payload: dict = {"model": model, "query": query, "documents": documents}
 								        for optional_key in ("top_n", "return_documents", "max_tokens_per_doc"):
 								            if optional_key in payload:
 								                upstream_payload[optional_key] = payload[optional_key]
 								        # Determine upstream URL:
 								        #   llama-server exposes /v1/rerank (the configured base may or may not contain /v1)
 								        #   External OpenAI endpoints expose /rerank under their /v1 base
 								        if endpoint in config.llama_server_endpoints:
 								            if "/v1" in endpoint:
 								                rerank_url = f"{endpoint}/rerank"
 								            else:
 								                rerank_url = f"{endpoint}/v1/rerank"
 								        else:
 								            # External OpenAI-compatible: ep2base gives us the /v1 base
 								            rerank_url = f"{ep2base(endpoint)}/rerank"
 								        api_key = config.api_keys.get(endpoint, "no-key")
 								        headers = {
 								            "Content-Type": "application/json",
 								            "Authorization": f"Bearer {api_key}",
 								        }
 								        client: aiohttp.ClientSession = get_session(endpoint)
 								        async with client.post(rerank_url, json=upstream_payload, headers=headers) as resp:
 								            response_bytes = await resp.read()
 								            if resp.status >= 400:
 								                raise HTTPException(
 								                    status_code=resp.status,
 								                    detail=_mask_secrets(response_bytes.decode("utf-8", errors="replace")),
 								                )
 								            try:
 								                data = orjson.loads(response_bytes)
 								            except orjson.JSONDecodeError as e:
 								                # A 2xx reply that is not JSON is an upstream fault, not ours.
 								                raise HTTPException(
 								                    status_code=502,
 								                    detail=f"Upstream returned invalid JSON: {e}",
 								                ) from e
 								        # Record token usage if the upstream returned a usage object
 								        usage = data.get("usage") if isinstance(data, dict) else None
 								        if not isinstance(usage, dict):
 								            usage = {}
 								        prompt_tok = usage.get("prompt_tokens") or 0
 								        total_tok = usage.get("total_tokens") or 0
 								        # For reranking there are no completion tokens; we record prompt tokens only
 								        if prompt_tok or total_tok:
 								            await token_queue.put((endpoint, tracking_model, prompt_tok, 0))
 								        return JSONResponse(content=data)
 								    finally:
 								        await decrement_usage(endpoint, tracking_model)
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								# -------------------------------------------------------------
 								# 25b. Cache management endpoints
 								# -------------------------------------------------------------
 								@app.get("/api/cache/stats")
 								async def cache_stats():
 								    """
 								    Report the state of the LLM response cache.
 								    Returns `{"enabled": False}` when `get_llm_cache()` yields no cache,
 								    otherwise `{"enabled": True}` merged with the cache's `stats()` mapping.
 								    """
 								    cache = get_llm_cache()
 								    if cache is not None:
 								        return {"enabled": True, **cache.stats()}
 								    return {"enabled": False}
 								@app.post("/api/cache/invalidate")
 								async def cache_invalidate():
 								    """
 								    Empty the LLM response cache by awaiting its `clear()` coroutine.
 								    Returns `{"enabled": False, "cleared": False}` when no cache is configured,
 								    otherwise `{"enabled": True, "cleared": True}` once `clear()` has completed.
 								    """
 								    cache = get_llm_cache()
 								    if cache is not None:
 								        await cache.clear()
 								        return {"enabled": True, "cleared": True}
 								    return {"enabled": False, "cleared": False}
-												feat: adding /v1/rerank endpoint with cohere,jina,llama.cpp compatibility

											
										
										
											2026-02-28 09:31:25 +01:00
+								# -------------------------------------------------------------
 								# 26. Serve the static front‑end
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# -------------------------------------------------------------
 								# Mount the directory that sits next to this script (STATIC_DIR) rather than a
 								# CWD-relative "static" path, so static assets resolve the same way as
 								# index.html regardless of where the router process is started from.
 								app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
-												Add files via upload

Adding:
- health endpoint
- extended /api/config
- frontend extension of backend server viz
											
										
										
											2025-08-30 12:43:35 +02:00
+								@app.get("/favicon.ico")
 								async def redirect_favicon():
 								    return RedirectResponse(url="/static/favicon.ico")
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								@app.get("/", response_class=HTMLResponse)
 								async def index(request: Request):
 								    """
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								    Render the dynamic NOMYO Router dashboard listing the configured endpoints
 								    and the models details, availability & task status.
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								    """
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
+								    index_path = STATIC_DIR / "index.html"
 								    try:
 								        return HTMLResponse(content=index_path.read_text(encoding="utf-8"), status_code=200)
 								    except FileNotFoundError:
 								        raise HTTPException(status_code=404, detail="Page not found")
 								    except Exception:
 								        raise HTTPException(status_code=500, detail="Internal server error")
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
 								# -------------------------------------------------------------
-												Add files via upload

Adding:
- health endpoint
- extended /api/config
- frontend extension of backend server viz
											
										
										
											2025-08-30 12:43:35 +02:00
+								# 26b. Health endpoint
 								# -------------------------------------------------------------
 								@app.get("/health")
 								async def health_proxy(request: Request):
 								    """
 								    Health‑check endpoint for monitoring the proxy.
 								    * Probes every configured endpoint (`config.endpoints` plus any
 								      `config.llama_server_endpoints` not already listed) concurrently via
 								      `_endpoint_health`.
 								      Ollama endpoints are probed at `/api/version` AND `/api/ps`,
 								      OpenAI-compatible endpoints at `/models`.
 								    * Returns a JSON object containing:
 								        - `status`: "ok" if every endpoint replied to every probe, otherwise "error".
 								        - `endpoints`: a mapping of endpoint URL → `{status, version|detail}`.
 								    * The HTTP status code is 200 when everything is healthy, 503 otherwise.
 								    """
 								    # Ollama endpoints expose /api/version (liveness) and /api/ps (routing
 								    # health — required by `choose_endpoint`). OpenAI-compatible endpoints
 								    # (vLLM, llama-server, external) expose /models, which serves both
 								    # purposes. Probing /api/version alone would miss the case where the
 								    # Ollama process is up but /api/ps is failing — see issue #83.
 								    targets = [*config.endpoints]
 								    extra_llama = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
 								    targets.extend(extra_llama)
 								    # Run all health checks in parallel; results come back in `targets` order.
 								    reports = await asyncio.gather(*[_endpoint_health(ep) for ep in targets])
 								    per_endpoint = {ep: report for ep, report in zip(targets, reports)}
 								    healthy = True
 								    for report in reports:
 								        if report.get("status") != "ok":
 								            healthy = False
 								            break
 								    if healthy:
 								        overall, code = "ok", 200
 								    else:
 								        overall, code = "error", 503
 								    return JSONResponse(
 								        content={"status": overall, "endpoints": per_endpoint},
 								        status_code=code,
 								    )
 								# -------------------------------------------------------------
-												feat: add hostname to dashboard

											
										
										
											2026-04-10 17:29:43 +02:00
+								# 27. Hostname endpoint
 								# -------------------------------------------------------------
 								@app.get("/api/hostname")
 								async def get_hostname():
 								    """Report the hostname of the machine running the router as `{"hostname": ...}`."""
 								    host = socket.gethostname()
 								    return JSONResponse(content={"hostname": host})
 								# -------------------------------------------------------------
 								# 28. SSE route for usage broadcasts
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								# -------------------------------------------------------------
 								@app.get("/api/usage-stream")
 								async def usage_stream(request: Request):
 								    """
 								    Server‑Sent‑Events stream of usage broadcasts.
 								    Each item received from this client's subscription queue is forwarded as
 								    one `data:` frame. The stream ends when the client disconnects or when a
 								    `None` item arrives (presumably the shutdown sentinel — confirm against
 								    `close_all_sse_queues`).
 								    """
 								    async def event_generator():
 								        # Per-client queue obtained from the broadcast channel
 								        inbox = await subscribe()
 								        try:
 								            # The disconnect check runs once per loop turn, before waiting for the next item
 								            while not await request.is_disconnected():
 								                snapshot = await inbox.get()
 								                if snapshot is None:
 								                    break
 								                # One SSE message per snapshot
 								                yield f"data: {snapshot}\n\n"
 								        finally:
 								            # Always drop the subscription, whichever way the loop ended
 								            await unsubscribe(inbox)
 								    return StreamingResponse(event_generator(), media_type="text/event-stream")
 								# -------------------------------------------------------------
-												using global aiohttp sessionpool for improved performance

											
										
										
											2025-09-10 10:21:49 +02:00
+								# 29. FastAPI startup/shutdown events
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.on_event("startup")
 								async def startup_event() -> None:
 								    """
 								    Initialise module-level state when the FastAPI app starts.
 								    In order: load the YAML config, open the token database and restore the
 								    per-endpoint/model totals, create the shared aiohttp session, create httpx
 								    clients for external OpenAI endpoints and Unix-socket llama-server
 								    endpoints, start the background tasks and initialise the LLM cache.
 								    """
 								    global config, db, token_worker_task, flush_task
 								    # Load YAML config (or use defaults if not present)
 								    config_path = _config_path_from_env()
 								    config = Config.from_yaml(config_path)
 								    if not config_path.exists():
 								        startup_msg = (
 								            f"No configuration file found at {config_path}. "
 								            "Falling back to default settings."
 								        )
 								    else:
 								        startup_msg = (
 								            f"Loaded configuration from {config_path}:\n"
 								            f" endpoints={config.endpoints},\n"
 								            f" llama_server_endpoints={config.llama_server_endpoints},\n"
 								            f" max_concurrent_connections={config.max_concurrent_connections},\n"
 								            f" endpoint_config={config.endpoint_config},\n"
 								            f" priority_routing={config.priority_routing}"
 								        )
 								    print(startup_msg)
 								    # Initialize database
 								    db = TokenDatabase(config.db_path)
 								    await db.init_db()
 								    # Restore token counts persisted by a previous run
 								    async for row in db.load_token_counts():
 								        ep_name, model_name = row['endpoint'], row['model']
 								        # Input/output counts are read from the row, but only the total is kept in memory
 								        _in_tok, _out_tok, total = row['input_tokens'], row['output_tokens'], row['total_tokens']
 								        token_usage_counts[ep_name][model_name] = total
 								    # Shared aiohttp session used for TCP endpoints
 								    tcp_connector = aiohttp.TCPConnector(
 								        limit=0,
 								        limit_per_host=512,
 								        ssl=ssl.create_default_context(),
 								    )
 								    shared_session = aiohttp.ClientSession(
 								        connector=tcp_connector,
 								        timeout=aiohttp.ClientTimeout(total=60, connect=15, sock_read=120, sock_connect=15),
 								        headers={"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")},
 								    )
 								    app_state["connector"] = tcp_connector
 								    app_state["session"] = shared_session
 								    # Create httpx clients for external OpenAI endpoints (Google, etc.)
 								    # aiohttp strips Referer headers for cross-origin requests, so we use httpx
 								    for ext_ep in config.endpoints:
 								        if not is_ext_openai_endpoint(ext_ep):
 								            continue
 								        app_state["httpx_clients"][ext_ep] = httpx.AsyncClient(timeout=30.0)
 								    # Create per-endpoint Unix socket sessions for .sock endpoints
 								    for llama_ep in config.llama_server_endpoints:
 								        if not _is_unix_socket_endpoint(llama_ep):
 								            continue
 								        sock_path = _get_socket_path(llama_ep)
 								        app_state["socket_sessions"][llama_ep] = aiohttp.ClientSession(
 								            connector=aiohttp.UnixConnector(path=sock_path),
 								            timeout=aiohttp.ClientTimeout(total=300, connect=5, sock_read=300),
 								        )
 								        app_state["httpx_clients"][llama_ep] = httpx.AsyncClient(
 								            transport=httpx.AsyncHTTPTransport(uds=sock_path),
 								            timeout=300.0,
 								        )
 								        print(f"[startup] Unix socket session: {llama_ep} -> {sock_path}")
 								    # Background workers; the handles are kept in globals so shutdown can cancel them
 								    token_worker_task = asyncio.create_task(token_worker())
 								    flush_task = asyncio.create_task(flush_buffer())
 								    await init_llm_cache(config)
-												using global aiohttp sessionpool for improved performance

											
										
										
											2025-09-10 10:21:49 +02:00
 								@app.on_event("shutdown")
 								async def shutdown_event() -> None:
 								    """
 								    Tear down runtime state when the FastAPI app stops.
 								    In order: close the SSE queues, cancel the background tasks, flush the
 								    remaining buffers, close the shared aiohttp session, the Unix socket
 								    sessions and every httpx client, and finally close the token database.
 								    """
 								    await close_all_sse_queues()
 								    # Stop background tasks first so they stop touching the DB before we close it.
 								    for task in (token_worker_task, flush_task):
 								        if task is None:
 								            continue
 								        task.cancel()
 								        try:
 								            await task
 								        except (asyncio.CancelledError, Exception):
 								            # Best effort: a task that fails while being cancelled must not block shutdown
 								            pass
 								    await flush_remaining_buffers()
 								    await app_state["session"].close()
 								    # Close Unix socket sessions
 								    for sock_ep, sock_session in list(app_state.get("socket_sessions", {}).items()):
 								        try:
 								            await sock_session.close()
 								            print(f"[shutdown] Closed Unix socket session: {sock_ep}")
 								        except Exception as err:
 								            print(f"[shutdown] Error closing Unix socket session {sock_ep}: {err}")
 								    # Close every registered httpx client
 								    for client_ep, http_client in list(app_state.get("httpx_clients", {}).items()):
 								        try:
 								            await http_client.aclose()
 								            print(f"[shutdown] Closed httpx client: {client_ep}")
 								        except Exception as err:
 								            print(f"[shutdown] Error closing httpx client {client_ep}: {err}")
 								    # Close the aiosqlite connection last — its worker thread is non-daemon
 								    # and would otherwise keep the interpreter alive after lifespan completes.
 								    if db is not None:
 								        try:
 								            await db.close()
 								            print("[shutdown] Closed token DB connection.")
 								        except Exception as err:
 								            print(f"[shutdown] Error closing DB: {err}")