nomyo-router/router.py

"""
title: NOMYO Router - an (O)llama and OpenAI API v1 Proxy with Endpoint:Model aware routing
author: alpha-nerd-nomyo
author_url: https://github.com/nomyo-ai
version: 0.9
license: AGPL
"""
# -------------------------------------------------------------
import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket, httpx, hashlib
try:
    import truststore; truststore.inject_into_ssl()
except ImportError:
    pass
from datetime import datetime, timezone
from pathlib import Path

# Directory containing static files (relative to this script)
STATIC_DIR = Path(__file__).parent / "static"
from typing import Dict, Set, List, Optional
from urllib.parse import urlparse, parse_qsl, urlencode
from fastapi import FastAPI, Request, HTTPException
from fastapi_sse import sse_handler
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from starlette.responses import StreamingResponse, JSONResponse, Response, HTMLResponse, RedirectResponse
from pydantic import Field
from pydantic_settings import BaseSettings
from collections import defaultdict
from PIL import Image

from security import _mask_secrets
from context_window import (
    _count_message_tokens,
    _trim_messages_for_context,
    _calibrated_trim_target,
    _endpoint_nctx,
    _CTX_TRIM_SMALL_LIMIT,
)
from state import (
    _models_cache,
    _loaded_models_cache,
    _available_error_cache,
    _loaded_error_cache,
    _completion_error_cache,
    _COMPLETION_ERROR_TTL,
    _models_cache_lock,
    _loaded_models_cache_lock,
    _available_error_cache_lock,
    _loaded_error_cache_lock,
    _completion_error_cache_lock,
    _inflight_available_models,
    _inflight_loaded_models,
    _inflight_lock,
    _bg_refresh_available,
    _bg_refresh_loaded,
    _bg_refresh_lock,
    _subscribers,
    _subscribers_lock,
    token_queue,
    app_state,
    token_buffer,
    time_series_buffer,
    buffer_lock,
    FLUSH_INTERVAL,
)

# Rebound on startup — must stay in router.py module namespace.
token_worker_task: asyncio.Task | None = None
flush_task: asyncio.Task | None = None

from config import Config, _config_path_from_env

from ollama._types import TokenLogprob, Logprob
from db import TokenDatabase
from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse


# Create the global config object - it will be overwritten on startup.
# Submodules read it lazily via config.get_config().
# It is parsed here at import time so module-level code (e.g. the auth
# middleware below) always has a value to read; startup_event() re-parses the
# same path and rebinds this name.
config = Config.from_yaml(_config_path_from_env())

# -------------------------------------------------------------
# 2. FastAPI application
# -------------------------------------------------------------
app = FastAPI()
# Hand the application instance to the fastapi_sse handler.
# NOTE(review): how fastapi_sse uses this attribute is not visible here.
sse_handler.app = app
# NOTE(review): a wildcard origin together with allow_credentials=True is very
# permissive - confirm this is intended for deployments that set a router key.
# NOTE(review): allow_methods omits PUT/OPTIONS, while the CORS headers written
# by hand in enforce_router_api_key() advertise them - confirm which is right.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST", "DELETE"],
    allow_headers=["Authorization", "Content-Type"],
)
# Imported after the app is created; used when building backend sessions and
# clients in startup_event().
from state import default_headers
        
# -------------------------------------------------------------
# Router-level authentication (optional)
# -------------------------------------------------------------
def _extract_router_api_key(request: Request) -> Optional[str]:
    """
    Return the router API key supplied by the client, or None if there is none.

    Lookup order:
      1. `Authorization: Bearer <key>` header (scheme matched case-insensitively,
         surrounding whitespace trimmed from the key).
      2. `api_key` query parameter.

    An empty bearer token or an empty query value counts as "not supplied".
    """
    authorization = request.headers.get("Authorization")
    if authorization and authorization.lower().startswith("bearer "):
        # Everything after the first space is the credential.
        _, _, remainder = authorization.partition(" ")
        token = remainder.strip()
        if token:
            return token
    # Fall back to the query string; normalise "" to None.
    return request.query_params.get("api_key") or None


def _strip_api_key_from_scope(request: Request) -> None:
    """
    Remove api_key from the ASGI scope query string to avoid leaking it in logs.

    The scope is mutated in place. If the query string is empty or contains no
    `api_key` parameter it is left exactly as received, so unrelated parameters
    are never re-encoded needlessly.

    Args:
        request: The incoming request whose `scope["query_string"]` (bytes)
            should be sanitised.
    """
    scope = request.scope
    raw_qs = scope.get("query_string", b"")
    if not raw_qs:
        return
    # latin-1 maps every byte to a code point, so decoding cannot raise on a
    # malformed (non-UTF-8) query string sent by a client.
    params = parse_qsl(raw_qs.decode("latin-1"), keep_blank_values=True)
    filtered = [(k, v) for (k, v) in params if k != "api_key"]
    if len(filtered) == len(params):
        # Nothing to strip - keep the original bytes untouched.
        return
    # urlencode() emits pure ASCII (percent-escaped), so the codec is moot here.
    scope["query_string"] = urlencode(filtered).encode("utf-8")


# Static-asset extensions that may be fetched without a router API key.
# HTML is deliberately absent: allowing it would let clients bypass auth by
# requesting /static/index.html directly.
_STATIC_ASSET_EXTS = frozenset({
    ".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".svg",
    ".woff", ".woff2", ".ttf", ".map",
})

# CORS headers attached by hand to /api/ responses produced or passed through
# by the auth middleware (responses it short-circuits never reach the routes).
_ROUTER_API_CORS_HEADERS = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Headers": "Authorization, Content-Type",
    "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
}


@app.middleware("http")
async def enforce_router_api_key(request: Request, call_next):
    """
    Enforce the optional NOMYO Router API key for all non-static requests.
    When `config.router_api_key` is set, clients must supply the key either in
    the Authorization header (`Bearer <key>`) or as `api_key` query parameter.

    Requests that are always let through:
      * everything, when no router key is configured;
      * OPTIONS requests;
      * `/`, `/favicon.ico`, and files under `/static/` whose extension is in
        `_STATIC_ASSET_EXTS`.

    Returns:
        401 JSON response when no key was supplied, 403 JSON response when the
        key does not match, otherwise the downstream response. For paths
        containing `/api/` (except `/api/usage-stream`) all three carry the
        CORS headers from `_ROUTER_API_CORS_HEADERS`, so browser clients can
        read the error body instead of hitting an opaque CORS failure.
    """
    expected_key = config.router_api_key
    if not expected_key or request.method == "OPTIONS":
        return await call_next(request)

    path = request.url.path
    # Require the "/static/" prefix (with slash) so look-alike paths such as
    # "/staticfoo.js" are not exempted from authentication.
    is_static_asset = (
        path.startswith("/static/")
        and Path(path).suffix.lower() in _STATIC_ASSET_EXTS
    )
    if is_static_asset or path in {"/", "/favicon.ico"}:
        return await call_next(request)

    wants_cors = "/api/" in path and path != "/api/usage-stream"
    cors_headers = dict(_ROUTER_API_CORS_HEADERS) if wants_cors else {}

    provided_key = _extract_router_api_key(request)
    # Strip the api_key query param from scope so access logs do not leak it
    _strip_api_key_from_scope(request)
    if provided_key is None:
        # No key provided but authentication is required - return 401
        return JSONResponse(
            content={"detail": "Missing NOMYO Router API key"},
            status_code=401,
            headers=cors_headers,
        )

    # Compare as bytes: compare_digest() raises TypeError for str arguments
    # containing non-ASCII characters, which would turn a bad key into a 500.
    keys_match = secrets.compare_digest(
        str(provided_key).encode("utf-8"),
        str(expected_key).encode("utf-8"),
    )
    if not keys_match:
        return JSONResponse(
            content={"detail": "Invalid NOMYO Router API key"},
            status_code=403,
            headers=cors_headers,
        )

    response = await call_next(request)
    # Add CORS headers for authenticated API requests
    for header_name, header_value in cors_headers.items():
        response.headers[header_name] = header_value
    return response


@app.exception_handler(openai.APIStatusError)
async def _openai_api_status_error_handler(request: Request, exc: openai.APIStatusError):
    """Relay an upstream OpenAI-SDK status error to the client unchanged.

    The response reuses the upstream status code. Its body is the upstream
    body when one is present; otherwise a minimal `{"error": {...}}` object is
    synthesised from the exception text and status code. Without this handler
    such errors would surface as generic 500s.
    """
    payload = exc.body
    if payload is None:
        payload = {"error": {"message": str(exc), "code": exc.status_code}}
    return JSONResponse(status_code=exc.status_code, content=payload)


from state import (
    usage_counts,
    token_usage_counts,
    usage_lock,
    token_usage_lock,
    _affinity_map,
    _affinity_lock,
    _AFFINITY_MAX_ENTRIES,
)

from fingerprint import _conversation_fingerprint

# Database instance
db: "TokenDatabase" = None

# -------------------------------------------------------------
# 4. Helperfunctions
# -------------------------------------------------------------
from backends.normalize import (
    _normalize_llama_model_name,
    _extract_llama_quant,
    ep2base,
    dedupe_on_keys,
)
from backends.sessions import (
    _is_unix_socket_endpoint,
    _get_socket_path,
    get_session,
    _make_openai_client,
    get_ollama_client,
)
from backends.health import (
    _is_fresh,
    _ensure_success,
    _format_connection_issue,
    _is_backend_connection_error,
    _mark_backend_unhealthy,
    _is_llama_model_loaded,
    _is_llama_model_loaded_or_sleeping,
)


from backends.normalize import (
    is_ext_openai_endpoint,
    is_openai_compatible,
    llama_endpoints,
    get_tracking_model,
)

from tokens import token_worker, flush_buffer, flush_remaining_buffers

from backends.probe import fetch


from routing import increment_usage, decrement_usage


from requests.chat import _make_chat_request, _make_moe_requests

from images import iso8601_ns, is_base64, resize_image_if_needed

from requests.messages import (
    _strip_assistant_prefill,
    transform_tool_calls_to_openai,
    transform_images_to_data_urls,
    _strip_images_from_messages,
    _accumulate_openai_tc_delta,
    _build_ollama_tool_calls,
    _convert_openai_logprobs,
    get_last_user_content,
)
from requests.rechunk import rechunk

from sse import (
    _capture_snapshot,
    _distribute_snapshot,
    close_all_sse_queues,
    subscribe,
    unsubscribe,
    get_usage_counts,
)

# -------------------------------------------------------------
# 5. Endpoint selection logic (respecting the configurable limit)
# -------------------------------------------------------------
from routing import get_max_connections, choose_endpoint

# (Ollama /api/* routes — moved to api/ollama.py)
# -------------------------------------------------------------
# 18b. Conversation-affinity stats – feeds the PS-table dot matrix
# -------------------------------------------------------------
# (affinity_stats, usage, config — moved to api/management.py)
# (v1/* routes — moved to api/openai.py)
# (cache routes — moved to api/management.py)
# -------------------------------------------------------------
# 26. Serve the static front‑end
# -------------------------------------------------------------
# Serve assets from STATIC_DIR (resolved relative to this file) rather than a
# bare "static" path, which is resolved against the current working directory
# and fails when the router is launched from elsewhere.
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

# Route modules are imported here, after `app` and the middleware exist, and
# registered in this order.
from api.static import router as static_router
app.include_router(static_router)
from api.management import router as management_router
app.include_router(management_router)
from api.openai import router as openai_router
app.include_router(openai_router)
from api.responses import router as responses_router
app.include_router(responses_router)
from api.ollama import router as ollama_router
app.include_router(ollama_router)

# (health, hostname, usage-stream — moved to api/management.py)
# -------------------------------------------------------------
# 28. FastAPI startup/shutdown events
# -------------------------------------------------------------
@app.on_event("startup")
async def startup_event() -> None:
    """
    Initialise all process-wide resources when the application starts.

    In order, this:
      1. Re-reads the YAML config and rebinds the module-level `config`.
      2. Opens the token database, fails orphaned background responses, and
         seeds `token_usage_counts` from the stored totals.
      3. Creates the shared aiohttp proxy session and a separate probe session,
         storing both (and their connectors) in `app_state`.
      4. Creates httpx clients for external OpenAI endpoints and per-endpoint
         aiohttp/httpx clients for Unix-socket llama endpoints.
      5. Pre-warms the cached Ollama / OpenAI backend clients.
      6. Starts the token worker and buffer flush tasks and initialises the
         LLM cache.

    Rebinds the module globals `config`, `db`, `token_worker_task` and
    `flush_task`.
    """
    global config, db, token_worker_task, flush_task
    # Load YAML config (or use defaults if not present)
    config_path = _config_path_from_env()
    config = Config.from_yaml(config_path)
    if config_path.exists():
        print(
            f"Loaded configuration from {config_path}:\n"
            f" endpoints={config.endpoints},\n"
            f" llama_server_endpoints={config.llama_server_endpoints},\n"
            f" llama_swap_endpoints={config.llama_swap_endpoints},\n"
            f" max_concurrent_connections={config.max_concurrent_connections},\n"
            f" endpoint_config={config.endpoint_config},\n"
            f" priority_routing={config.priority_routing}"
        )
    else:
        print(
            f"No configuration file found at {config_path}. "
            "Falling back to default settings."
        )

    # Initialize database
    db = TokenDatabase(config.db_path)
    await db.init_db()

    # Reconcile Responses-API background tasks lost across a restart: their
    # in-memory asyncio task is gone but the DB row may still read queued /
    # in_progress, so mark those failed to give polling clients a terminal state.
    _orphaned = await db.fail_orphaned_responses()
    if _orphaned:
        print(f"[startup] Marked {_orphaned} orphaned background response(s) as failed.")

    # Load existing token counts from database. Only the per-endpoint/model
    # total is kept in memory.
    async for count_entry in db.load_token_counts():
        endpoint = count_entry['endpoint']
        model = count_entry['model']
        token_usage_counts[endpoint][model] = count_entry['total_tokens']

    ssl_context = ssl.create_default_context()
    # Both aiohttp sessions send the same Referer header.
    referer_headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
    connector = aiohttp.TCPConnector(limit=0, limit_per_host=512, ssl=ssl_context)
    # NOTE(review): total=60 bounds the whole request including the body read,
    # so sock_read=120 can never trigger - confirm 60 s is the intended cap.
    timeout = aiohttp.ClientTimeout(total=60, connect=15, sock_read=120, sock_connect=15)
    session = aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers=referer_headers,
    )

    app_state["connector"] = connector
    app_state["session"] = session

    # Dedicated pool for health/introspection probes, isolated from the proxy
    # session above. Streaming completions can hold the proxy pool's per-host
    # slots open for a long time; without a separate pool the lightweight
    # probes queue behind them, hit their deadline, and mark healthy endpoints
    # as unavailable under load.
    probe_connector = aiohttp.TCPConnector(limit=0, limit_per_host=64, ssl=ssl_context)
    probe_session = aiohttp.ClientSession(
        connector=probe_connector,
        timeout=timeout,
        headers=referer_headers,
    )
    app_state["probe_connector"] = probe_connector
    app_state["probe_session"] = probe_session

    # Create httpx clients for external OpenAI endpoints (Google, etc.)
    # aiohttp strips Referer headers for cross-origin requests, so we use httpx
    for ep in config.endpoints:
        if is_ext_openai_endpoint(ep):
            app_state["httpx_clients"][ep] = httpx.AsyncClient(timeout=30.0)

    # Create per-endpoint Unix socket sessions for .sock endpoints
    for ep in llama_endpoints(config):
        if _is_unix_socket_endpoint(ep):
            sock_path = _get_socket_path(ep)
            sock_connector = aiohttp.UnixConnector(path=sock_path)
            sock_timeout = aiohttp.ClientTimeout(total=300, connect=5, sock_read=300)
            sock_session = aiohttp.ClientSession(connector=sock_connector, timeout=sock_timeout)
            app_state["socket_sessions"][ep] = sock_session
            transport = httpx.AsyncHTTPTransport(uds=sock_path)
            app_state["httpx_clients"][ep] = httpx.AsyncClient(transport=transport, timeout=300.0)
            print(f"[startup] Unix socket session: {ep} -> {sock_path}")

    # Pre-create long-lived backend clients so the expensive SSL-context /
    # trust-store construction (~40 ms each) happens once here instead of on the
    # request path. Ollama endpoints are reached via both the native ollama
    # client (/api/chat, /api/generate) and the OpenAI client (/v1/* routes),
    # so warm both; OpenAI-compatible endpoints only need the OpenAI client.
    _warm_endpoints = config.endpoints + [
        ep for ep in llama_endpoints(config) if ep not in config.endpoints
    ]
    for ep in _warm_endpoints:
        try:
            if not is_openai_compatible(ep):
                get_ollama_client(ep)
            _make_openai_client(
                ep, default_headers=default_headers,
                api_key=config.api_keys.get(ep, "no-key"),
            )
        except Exception as e:
            # Pre-warming is best-effort: a failure here only costs latency on
            # the first request, so log it and continue.
            print(f"[startup] Backend client pre-warm failed for {ep}: {e}")

    token_worker_task = asyncio.create_task(token_worker())
    flush_task = asyncio.create_task(flush_buffer())
    await init_llm_cache(config)

@app.on_event("shutdown")
async def shutdown_event() -> None:
    """
    Release every resource acquired in `startup_event`.

    Order matters: SSE queues are closed, background tasks are cancelled and
    awaited, remaining buffers are flushed, then HTTP sessions/clients are
    closed, and the database connection is closed last.

    Each cleanup step after task cancellation is individually guarded so that
    one failing step is logged and does not prevent the later ones - in
    particular the DB close - from running.
    """
    await close_all_sse_queues()

    # Stop background tasks first so they stop touching the DB before we close it.
    for t in (token_worker_task, flush_task):
        if t is not None:
            t.cancel()
            try:
                await t
            except (asyncio.CancelledError, Exception):
                # Cancellation is expected; any other error from a task that is
                # being torn down is deliberately ignored.
                pass

    try:
        await flush_remaining_buffers()
    except Exception as e:
        print(f"[shutdown] Error flushing remaining buffers: {e}")

    # Close the shared proxy session.
    session = app_state.get("session")
    if session is not None:
        try:
            await session.close()
        except Exception as e:
            print(f"[shutdown] Error closing proxy session: {e}")

    # Close the dedicated probe session and drop the references.
    probe_session = app_state.get("probe_session")
    if probe_session is not None:
        try:
            await probe_session.close()
        except Exception as e:
            print(f"[shutdown] Error closing probe session: {e}")
        app_state["probe_session"] = None
        app_state["probe_connector"] = None

    # Close Unix socket sessions
    for ep, sess in list(app_state.get("socket_sessions", {}).items()):
        try:
            await sess.close()
            print(f"[shutdown] Closed Unix socket session: {ep}")
        except Exception as e:
            print(f"[shutdown] Error closing Unix socket session {ep}: {e}")

    # Close httpx clients (external OpenAI endpoints and Unix socket endpoints)
    for ep, client in list(app_state.get("httpx_clients", {}).items()):
        try:
            await client.aclose()
            print(f"[shutdown] Closed httpx client: {ep}")
        except Exception as e:
            print(f"[shutdown] Error closing httpx client {ep}: {e}")

    # Close cached backend clients (reused across requests; see startup pre-warm).
    for key, client in list(app_state.get("ollama_clients", {}).items()):
        try:
            await client._client.aclose()
        except Exception as e:
            print(f"[shutdown] Error closing ollama client {key}: {e}")
    app_state.get("ollama_clients", {}).clear()
    for key, client in list(app_state.get("openai_clients", {}).items()):
        try:
            await client.close()
        except Exception as e:
            print(f"[shutdown] Error closing openai client {key}: {e}")
    app_state.get("openai_clients", {}).clear()

    # Close the aiosqlite connection last — its worker thread is non-daemon
    # and would otherwise keep the interpreter alive after lifespan completes.
    if db is not None:
        try:
            await db.close()
            print("[shutdown] Closed token DB connection.")
        except Exception as e:
            print(f"[shutdown] Error closing DB: {e}")
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								"""
-												bump version

											
										
										
											2026-03-05 11:09:20 +01:00
+								title: NOMYO Router - an (O)llama and OpenAI API v1 Proxy with Endpoint:Model aware routing
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								author: alpha-nerd-nomyo
 								author_url: https://github.com/nomyo-ai
-												fix: removed dead config key

											
										
										
											2026-05-13 14:59:05 +02:00
+								version: 0.9
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								license: AGPL
 								"""
 								# -------------------------------------------------------------
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
+								import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket, httpx, hashlib
-												proposal: use global truststore ctx for all connections

											
										
										
											2026-02-12 16:15:39 +01:00
+								try:
 								    import truststore; truststore.inject_into_ssl()
 								except ImportError:
 								    pass
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
+								from datetime import datetime, timezone
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								from pathlib import Path
-												feat:
added buffer_lock to prevent race condition in high concurrency scenarios
added documentation

											
										
										
											2026-01-05 17:16:31 +01:00
 								# Directory containing static files (relative to this script)
 								STATIC_DIR = Path(__file__).parent / "static"
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								from typing import Dict, Set, List, Optional
-												add: Optional router-level API key that gates router/API/web UI access

Optional router-level API key that gates router/API/web UI access (leave empty to disable)

## Supplying the router API key

If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

- HTTP header (recommended): `Authorization: Bearer <router_key>`
- Query param (fallback): `?api_key=<router_key>`

Examples:
```bash
curl -H "Authorization: Bearer $NOMYO_ROUTER_API_KEY" http://localhost:12434/api/tags
curl "http://localhost:12434/api/tags?api_key=$NOMYO_ROUTER_API_KEY"
```

											
										
										
											2026-01-14 09:28:02 +01:00
+								from urllib.parse import urlparse, parse_qsl, urlencode
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								from fastapi import FastAPI, Request, HTTPException
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								from fastapi_sse import sse_handler
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								from fastapi.staticfiles import StaticFiles
-												adding CORS middleware

											
										
										
											2025-09-11 09:46:19 +02:00
+								from fastapi.middleware.cors import CORSMiddleware
-												Add files via upload

Adding:
- health endpoint
- extended /api/config
- frontend extension of backend server viz
											
										
										
											2025-08-30 12:43:35 +02:00
+								from starlette.responses import StreamingResponse, JSONResponse, Response, HTMLResponse, RedirectResponse
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								from pydantic import Field
 								from pydantic_settings import BaseSettings
 								from collections import defaultdict
-												new requirement for image preprocessing to downsize and convert to png for faster and safer transaction

											
										
										
											2025-09-24 11:46:38 +02:00
+								from PIL import Image
-												Add files via upload

switching to custom cache logic for faster cache invalidation in faulty cache scenarios
removing aiocache dependency
											
										
										
											2025-09-01 13:38:49 +02:00
-												refac: split into modules I

											
										
										
											2026-05-19 10:05:27 +02:00
+								from security import _mask_secrets
 								from context_window import (
 								    _count_message_tokens,
 								    _trim_messages_for_context,
 								    _calibrated_trim_target,
 								    _endpoint_nctx,
 								    _CTX_TRIM_SMALL_LIMIT,
 								)
-												refac: modularize global states III

											
										
										
											2026-05-19 11:18:06 +02:00
+								from state import (
 								    _models_cache,
 								    _loaded_models_cache,
 								    _available_error_cache,
 								    _loaded_error_cache,
 								    _completion_error_cache,
 								    _COMPLETION_ERROR_TTL,
 								    _models_cache_lock,
 								    _loaded_models_cache_lock,
 								    _available_error_cache_lock,
 								    _loaded_error_cache_lock,
 								    _completion_error_cache_lock,
 								    _inflight_available_models,
 								    _inflight_loaded_models,
 								    _inflight_lock,
 								    _bg_refresh_available,
 								    _bg_refresh_loaded,
 								    _bg_refresh_lock,
 								    _subscribers,
 								    _subscribers_lock,
 								    token_queue,
 								    app_state,
 								    token_buffer,
 								    time_series_buffer,
 								    buffer_lock,
 								    FLUSH_INTERVAL,
 								)
-												fix: changing error_cache to stale-while-revalidate same as available_models_cache

											
										
										
											2026-03-12 14:47:54 +01:00
-												refac: modularize global states III

											
										
										
											2026-05-19 11:18:06 +02:00
+								# Rebound on startup — must stay in router.py module namespace.
-												stopping the token_worker_task gracefully on shutdown

											
										
										
											2025-11-13 10:13:10 +01:00
+								token_worker_task: asyncio.Task | None = None
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
+								flush_task: asyncio.Task | None = None
-												refac: modularize config II

											
										
										
											2026-05-19 11:00:50 +02:00
+								from config import Config, _config_path_from_env
-												Add Docker support

Adds comprehensive docker support

											
										
										
											2025-11-07 13:59:16 +01:00
-												feat(router): add logprob support in /api/chat

Add logprob support to the OpenAI-to-Ollama proxy by converting OpenAI logprob formats to Ollama types. Also update the ollama dependency.

											
										
										
											2026-02-13 13:29:45 +01:00
+								from ollama._types import TokenLogprob, Logprob
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
+								from db import TokenDatabase
-												feat: adding a semantic cache layer

											
										
										
											2026-03-08 09:12:09 +01:00
+								from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
-												Add Docker support

Adds comprehensive docker support

											
										
										
											2025-11-07 13:59:16 +01:00
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								# Create the global config object – it will be overwritten on startup.
 								# Submodules read it lazily via config.get_config().
-												Add Docker support

Adds comprehensive docker support

											
										
										
											2025-11-07 13:59:16 +01:00
+								config = Config.from_yaml(_config_path_from_env())
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
 								# -------------------------------------------------------------
 								# 2. FastAPI application
 								# -------------------------------------------------------------
 								app = FastAPI()
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								sse_handler.app = app
-												adding CORS middleware

											
										
										
											2025-09-11 09:46:19 +02:00
+								app.add_middleware(
 								    CORSMiddleware,
 								    allow_origins=["*"],
 								    allow_credentials=True,
 								    allow_methods=["GET", "POST", "DELETE"],
 								    allow_headers=["Authorization", "Content-Type"],
 								)
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from state import default_headers
-												fixing headers, using ollama.Responses in rechunk class, fixing reseverd words var usage, fixing embedding output, fixing model naming in frontend

											
										
										
											2025-09-21 16:20:36 +02:00
-												add: Optional router-level API key that gates router/API/web UI access

Optional router-level API key that gates router/API/web UI access (leave empty to disable)

## Supplying the router API key

If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

- HTTP header (recommended): `Authorization: Bearer <router_key>`
- Query param (fallback): `?api_key=<router_key>`

Examples:
```bash
curl -H "Authorization: Bearer $NOMYO_ROUTER_API_KEY" http://localhost:12434/api/tags
curl "http://localhost:12434/api/tags?api_key=$NOMYO_ROUTER_API_KEY"
```

											
										
										
											2026-01-14 09:28:02 +01:00
+								# -------------------------------------------------------------
 								# Router-level authentication (optional)
 								# -------------------------------------------------------------
 								def _extract_router_api_key(request: Request) -> Optional[str]:
 								    """
 								    Extract the provided router API key from the Authorization header or `api_key`
 								    query parameter. The middleware uses this to gate access to API routes when
 								    a router_api_key is configured.
 								    """
 								    auth_header = request.headers.get("Authorization")
 								    if auth_header and auth_header.lower().startswith("bearer "):
-												Empty key strings could bypass authentication in _extract_router_api_key()  when malformed Authorization headers were sent
- Added validation to check that the extracted key is not empty before returning it
- Added CORS headers to enforce_router_api_key() for proper cross-origin request handling and CORS-related error prevention

											
										
										
											2026-01-26 18:11:28 +01:00
+								        key = auth_header.split(" ", 1)[1].strip()
 								        if key:  # Ensure key is not empty
 								            return key
-												add: Optional router-level API key that gates router/API/web UI access

Optional router-level API key that gates router/API/web UI access (leave empty to disable)

## Supplying the router API key

If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

- HTTP header (recommended): `Authorization: Bearer <router_key>`
- Query param (fallback): `?api_key=<router_key>`

Examples:
```bash
curl -H "Authorization: Bearer $NOMYO_ROUTER_API_KEY" http://localhost:12434/api/tags
curl "http://localhost:12434/api/tags?api_key=$NOMYO_ROUTER_API_KEY"
```

											
										
										
											2026-01-14 09:28:02 +01:00
+								    query_key = request.query_params.get("api_key")
 								    if query_key:
 								        return query_key
 								    return None
 								def _strip_api_key_from_scope(request: Request) -> None:
 								    """
 								    Remove api_key from the ASGI scope query string to avoid leaking it in logs.
 								    """
 								    scope = request.scope
 								    raw_qs = scope.get("query_string", b"")
 								    if not raw_qs:
 								        return
 								    params = parse_qsl(raw_qs.decode("utf-8"), keep_blank_values=True)
 								    filtered = [(k, v) for (k, v) in params if k != "api_key"]
 								    scope["query_string"] = urlencode(filtered).encode("utf-8")
 								@app.middleware("http")
 								async def enforce_router_api_key(request: Request, call_next):
 								    """
 								    Enforce the optional NOMYO Router API key for all non-static requests.

 								    When `config.router_api_key` is set, clients must supply the key either in
 								    the Authorization header (`Bearer <key>`) or as `api_key` query parameter.

 								    Returns:
 								        The downstream response when no key is configured, the request is an
 								        OPTIONS request, the path is exempt, or the key matches; a 401 JSON
 								        response when no key was supplied; a 403 JSON response when the
 								        supplied key does not match.
 								    """
 								    expected_key = config.router_api_key
 								    if not expected_key or request.method == "OPTIONS":
 								        return await call_next(request)
 								    path = request.url.path
 								    # Allow static assets (CSS, JS, images, fonts) but NOT HTML pages,
 								    # which would bypass auth by accessing /static/index.html directly.
 								    _STATIC_ASSET_EXTS = {".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".svg", ".woff", ".woff2", ".ttf", ".map"}
 								    is_static_asset = path.startswith("/static") and Path(path).suffix.lower() in _STATIC_ASSET_EXTS
 								    if is_static_asset or path in {"/", "/favicon.ico"}:
 								        return await call_next(request)
 								    # CORS headers for API paths, computed once so the 401, 403 and success
 								    # responses all carry the same set.
 								    cors_headers = {}
 								    if "/api/" in path and path != "/api/usage-stream":
 								        cors_headers = {
 								            "Access-Control-Allow-Origin": "*",
 								            "Access-Control-Allow-Headers": "Authorization, Content-Type",
 								            "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
 								        }
 								    provided_key = _extract_router_api_key(request)
 								    # Strip the api_key query param from scope so access logs do not leak it
 								    _strip_api_key_from_scope(request)
 								    if provided_key is None:
 								        # No key provided but authentication is required - return 401
 								        return JSONResponse(
 								            content={"detail": "Missing NOMYO Router API key"},
 								            status_code=401,
 								            headers=cors_headers,
 								        )
 								    # compare_digest only accepts ASCII-only str; compare UTF-8 bytes so a
 								    # non-ASCII key yields a clean 403 instead of a TypeError (HTTP 500).
 								    key_matches = secrets.compare_digest(
 								        str(provided_key).encode("utf-8"),
 								        str(expected_key).encode("utf-8"),
 								    )
 								    if not key_matches:
 								        return JSONResponse(
 								            content={"detail": "Invalid NOMYO Router API key"},
 								            status_code=403,
 								            headers=cors_headers,
 								        )
 								    response = await call_next(request)
 								    # Add CORS headers for authenticated API requests
 								    for header_name, header_value in cors_headers.items():
 								        response.headers[header_name] = header_value
 								    return response
-												feat: correct pass through of openai.APIStatusErrors

											
										
										
											2026-05-08 12:19:03 +02:00
 								@app.exception_handler(openai.APIStatusError)
 								async def _openai_api_status_error_handler(request: Request, exc: openai.APIStatusError):
 								    """Relay an upstream OpenAI-SDK status error to the client.

 								    The response reuses the exception's status code. Its body is `exc.body`
 								    when that is set; otherwise a minimal error object is built from the
 								    exception message and status code. Without this handler the error would
 								    surface as a 500.
 								    """
 								    if exc.body is not None:
 								        payload = exc.body
 								    else:
 								        payload = {"error": {"message": str(exc), "code": exc.status_code}}
 								    return JSONResponse(status_code=exc.status_code, content=payload)
-												refac: modularize global states III

											
										
										
											2026-05-19 11:18:06 +02:00
+								from state import (
 								    usage_counts,
 								    token_usage_counts,
 								    usage_lock,
 								    token_usage_lock,
 								    _affinity_map,
 								    _affinity_lock,
 								    _AFFINITY_MAX_ENTRIES,
 								)
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
-												refac: split into modules I

											
										
										
											2026-05-19 10:05:27 +02:00
+								from fingerprint import _conversation_fingerprint
-												feat: add conversation-endpoint affinity to benefit from hot kv-caches if possible

											
										
										
											2026-05-12 18:33:47 +02:00
-												adding token timeseries counting in db for future data viz

											
										
										
											2025-11-18 11:16:21 +01:00
+								# Database instance
 								db: "TokenDatabase" = None
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								# 4. Helper functions
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from backends.normalize import (
 								    _normalize_llama_model_name,
 								    _extract_llama_quant,
 								    ep2base,
 								    dedupe_on_keys,
 								)
 								from backends.sessions import (
 								    _is_unix_socket_endpoint,
 								    _get_socket_path,
 								    get_session,
 								    _make_openai_client,
-												feat: cache backend clients per endpoint instead of building one (with a fresh SSL context) per request

											
										
										
											2026-06-07 09:55:54 +02:00
+								    get_ollama_client,
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								)
 								from backends.health import (
 								    _is_fresh,
 								    _ensure_success,
 								    _format_connection_issue,
 								    _is_backend_connection_error,
 								    _mark_backend_unhealthy,
 								    _is_llama_model_loaded,
 								    _is_llama_model_loaded_or_sleeping,
 								)
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from backends.normalize import (
 								    is_ext_openai_endpoint,
 								    is_openai_compatible,
-												feat: add llama-swap as a backend

											
										
										
											2026-06-14 16:34:31 +02:00
+								    llama_endpoints,
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								    get_tracking_model,
 								)
-												feat(router): normalize model names for usage tracking across endpoints (continued)

Introduce `get_tracking_model()` to standardize model names for consistent usage tracking in Prometheus metrics. This ensures llama-server models are stripped of HF prefixes and quantization suffixes, Ollama models append `:latest` when versionless, and external OpenAI models remain unchanged—aligning all tracking keys with the PS table.

											
										
										
											2026-02-18 11:45:37 +01:00
-												refac: modularize sse, routing, db and token handling V

											
										
										
											2026-05-19 12:48:55 +02:00
+								from tokens import token_worker, flush_buffer, flush_remaining_buffers
-												refactor: use a persistent WAL-enabled connection with async locks

- Introduce a lazily initialized, shared aiosqlite connection stored in self._db and two asyncio locks (_db_lock, _operation_lock) for safe concurrent access
- Ensure the database directory exists before connecting and enable WAL journaling and foreign keys on first connect
- Add close method to gracefully close the persistent connection
- Guard initialization and write operations with _operation_lock to ensure single-threaded schema setup
- Switch to ON CONFLICT UPSERT for token_counts updates and initialize token_time_series table
- Add typing for _db (Optional[aiosqlite.Connection]) and adjust imports accordingly

addition: Frontend button with total stats aggregation task and feedback span element to keep user informed and a small database footprint

											
										
										
											2025-12-02 12:18:23 +01:00
-												refac: modularize backend IV

											
										
										
											2026-05-19 12:05:51 +02:00
+								from backends.probe import fetch
-												feat: add timestamp index and improve cache concurrency

- Added index on token_time_series timestamp for faster queries
- Introduced cache locks to prevent race conditions

											
										
										
											2026-01-16 16:47:24 +01:00
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
-												refac: modularize sse, routing, db and token handling V

											
										
										
											2026-05-19 12:48:55 +02:00
+								from routing import increment_usage, decrement_usage
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
-												refac: request handling VI

											
										
										
											2026-05-19 14:09:52 +02:00
+								from requests.chat import _make_chat_request, _make_moe_requests
-												fix(enhance.py): correct typo in function name from 'moe_select_candiadate' to 'moe_select_candidate'

feat(router.py): add helper function _make_chat_request for handling enhancing chat requests to endpoints

											
										
										
											2025-12-15 10:35:56 +01:00
-												refac: split into modules I

											
										
										
											2026-05-19 10:05:27 +02:00
+								from images import iso8601_ns, is_base64, resize_image_if_needed
-												new requirement for image preprocessing to downsize and convert to png for faster and safer transaction

											
										
										
											2025-09-24 11:46:38 +02:00
-												refac: request handling VI

											
										
										
											2026-05-19 14:09:52 +02:00
+								from requests.messages import (
 								    _strip_assistant_prefill,
 								    transform_tool_calls_to_openai,
 								    transform_images_to_data_urls,
 								    _strip_images_from_messages,
 								    _accumulate_openai_tc_delta,
 								    _build_ollama_tool_calls,
 								    _convert_openai_logprobs,
 								    get_last_user_content,
 								)
 								from requests.rechunk import rechunk
-												feat: deduplicate background refresh tasks and extend cache TTL

Adds lock-protected dictionaries to track running background refresh tasks, preventing duplicate executions per endpoint. Increases cache freshness thresholds from 30s to 300s to reduce blocking behavior.

fix: /v1 endpoints use correct media_types and usage information with proper logging

											
										
										
											2026-02-14 14:51:44 +01:00
-												refac: modularize sse, routing, db and token handling V

											
										
										
											2026-05-19 12:48:55 +02:00
+								from sse import (
 								    _capture_snapshot,
 								    _distribute_snapshot,
 								    close_all_sse_queues,
 								    subscribe,
 								    unsubscribe,
 								    get_usage_counts,
 								)
-												enhance routing logic

add a pre-routing model check:
allows for different configs on the ollama backend servers
											
										
										
											2025-08-29 13:13:25 +02:00
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								# 5. Endpoint selection logic (respecting the configurable limit)
 								# -------------------------------------------------------------
-												refac: modularize sse, routing, db and token handling V

											
										
										
											2026-05-19 12:48:55 +02:00
+								from routing import get_max_connections, choose_endpoint
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
-												refac: modularize apis VII

											
										
										
											2026-05-19 14:57:39 +02:00
+								# (Ollama /api/* routes — moved to api/ollama.py)
-												feat: visualization of conversation affinity in dashboard

											
										
										
											2026-05-13 13:38:37 +02:00
+								# -------------------------------------------------------------
 								# 18b. Conversation-affinity stats – feeds the PS-table dot matrix
 								# -------------------------------------------------------------
-												refac: modularize apis VII

											
										
										
											2026-05-19 14:57:39 +02:00
+								# (affinity_stats, usage, config — moved to api/management.py)
 								# (v1/* routes — moved to api/openai.py)
 								# (cache routes — moved to api/management.py)
-												feat: adding /v1/rerank endpoint with cohere,jina,llama.cpp compatibility

											
										
										
											2026-02-28 09:31:25 +01:00
+								# -------------------------------------------------------------
 								# 26. Serve the static front‑end
-												Additions

- Frontend
- Internal Monitoring Endpoints
- External OpenAI compatible backends
											
										
										
											2025-08-30 00:12:56 +02:00
+								# -------------------------------------------------------------
 								app.mount("/static", StaticFiles(directory="static"), name="static")
-												refac: modularize apis VII

											
										
										
											2026-05-19 14:57:39 +02:00
+								from api.static import router as static_router
 								app.include_router(static_router)
 								from api.management import router as management_router
 								app.include_router(management_router)
 								from api.openai import router as openai_router
 								app.include_router(openai_router)
-												feat: transparent openai responses api integration

											
										
										
											2026-06-10 18:48:26 +02:00
+								from api.responses import router as responses_router
 								app.include_router(responses_router)
-												refac: modularize apis VII

											
										
										
											2026-05-19 14:57:39 +02:00
+								from api.ollama import router as ollama_router
 								app.include_router(ollama_router)
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
-												refac: modularize apis VII

											
										
										
											2026-05-19 14:57:39 +02:00
+								# (health, hostname, usage-stream — moved to api/management.py)
-												Add files via upload

final touches
											
										
										
											2025-09-05 12:11:31 +02:00
+								# -------------------------------------------------------------
-												using global aiohttp sessionpool for improved performance

											
										
										
											2025-09-10 10:21:49 +02:00
+								# 28. FastAPI startup/shutdown events
-												Initial commit
											
										
										
											2025-08-26 18:19:43 +02:00
+								# -------------------------------------------------------------
 								@app.on_event("startup")
 								async def startup_event() -> None:
 								    """Load configuration and bring up every long-lived resource.

 								    In order: load the YAML config, open the token database and mark orphaned
 								    background responses as failed, restore the persisted token totals into
 								    `token_usage_counts`, create the proxy and probe aiohttp sessions, create
 								    httpx / Unix-socket clients, pre-warm the backend clients, start the
 								    token worker and flush tasks, and initialise the LLM cache.
 								    """
 								    global config, db, token_worker_task, flush_task
 								    # Load YAML config (or use defaults if not present)
 								    config_path = _config_path_from_env()
 								    config = Config.from_yaml(config_path)
 								    if config_path.exists():
 								        print(
 								            f"Loaded configuration from {config_path}:\n"
 								            f" endpoints={config.endpoints},\n"
 								            f" llama_server_endpoints={config.llama_server_endpoints},\n"
 								            f" llama_swap_endpoints={config.llama_swap_endpoints},\n"
 								            f" max_concurrent_connections={config.max_concurrent_connections},\n"
 								            f" endpoint_config={config.endpoint_config},\n"
 								            f" priority_routing={config.priority_routing}"
 								        )
 								    else:
 								        print(
 								            f"No configuration file found at {config_path}. "
 								            "Falling back to default settings."
 								        )
 								    # Initialize database
 								    db = TokenDatabase(config.db_path)
 								    await db.init_db()
 								    # Reconcile Responses-API background tasks lost across a restart: their
 								    # in-memory asyncio task is gone but the DB row may still read queued /
 								    # in_progress, so mark those failed to give polling clients a terminal state.
 								    _orphaned = await db.fail_orphaned_responses()
 								    if _orphaned:
 								        print(f"[startup] Marked {_orphaned} orphaned background response(s) as failed.")
 								    # Load existing token counts from database (only the total is kept in memory)
 								    async for count_entry in db.load_token_counts():
 								        token_usage_counts[count_entry['endpoint']][count_entry['model']] = count_entry['total_tokens']
 								    ssl_context = ssl.create_default_context()
 								    timeout = aiohttp.ClientTimeout(total=60, connect=15, sock_read=120, sock_connect=15)
 								    # Referer value shared by the proxy and probe sessions.
 								    referer = default_headers.get("HTTP-Referer", "https://nomyo.ai")
 								    connector = aiohttp.TCPConnector(limit=0, limit_per_host=512, ssl=ssl_context)
 								    session = aiohttp.ClientSession(
 								        connector=connector,
 								        timeout=timeout,
 								        headers={"Referer": referer},
 								    )
 								    app_state["connector"] = connector
 								    app_state["session"] = session
 								    # Dedicated pool for health/introspection probes, isolated from the proxy
 								    # session above. Streaming completions can hold the proxy pool's per-host
 								    # slots open for a long time; without a separate pool the lightweight
 								    # probes queue behind them, hit their deadline, and mark healthy endpoints
 								    # as unavailable under load.
 								    probe_connector = aiohttp.TCPConnector(limit=0, limit_per_host=64, ssl=ssl_context)
 								    probe_session = aiohttp.ClientSession(
 								        connector=probe_connector,
 								        timeout=timeout,
 								        headers={"Referer": referer},
 								    )
 								    app_state["probe_connector"] = probe_connector
 								    app_state["probe_session"] = probe_session
 								    # Create httpx clients for external OpenAI endpoints (Google, etc.)
 								    # aiohttp strips Referer headers for cross-origin requests, so we use httpx
 								    for ep in config.endpoints:
 								        if is_ext_openai_endpoint(ep):
 								            app_state["httpx_clients"][ep] = httpx.AsyncClient(timeout=30.0)
 								    # Create per-endpoint Unix socket sessions for .sock endpoints
 								    for ep in llama_endpoints(config):
 								        if _is_unix_socket_endpoint(ep):
 								            sock_path = _get_socket_path(ep)
 								            sock_connector = aiohttp.UnixConnector(path=sock_path)
 								            sock_timeout = aiohttp.ClientTimeout(total=300, connect=5, sock_read=300)
 								            sock_session = aiohttp.ClientSession(connector=sock_connector, timeout=sock_timeout)
 								            app_state["socket_sessions"][ep] = sock_session
 								            transport = httpx.AsyncHTTPTransport(uds=sock_path)
 								            app_state["httpx_clients"][ep] = httpx.AsyncClient(transport=transport, timeout=300.0)
 								            print(f"[startup] Unix socket session: {ep} -> {sock_path}")
 								    # Pre-create long-lived backend clients so the expensive SSL-context /
 								    # trust-store construction (~40 ms each) happens once here instead of on the
 								    # request path. Ollama endpoints are reached via both the native ollama
 								    # client (/api/chat, /api/generate) and the OpenAI client (/v1/* routes),
 								    # so warm both; OpenAI-compatible endpoints only need the OpenAI client.
 								    _warm_endpoints = config.endpoints + [
 								        ep for ep in llama_endpoints(config) if ep not in config.endpoints
 								    ]
 								    for ep in _warm_endpoints:
 								        # Best effort: a failed pre-warm is reported and startup continues.
 								        try:
 								            if not is_openai_compatible(ep):
 								                get_ollama_client(ep)
 								            _make_openai_client(
 								                ep, default_headers=default_headers,
 								                api_key=config.api_keys.get(ep, "no-key"),
 								            )
 								        except Exception as e:
 								            print(f"[startup] Backend client pre-warm failed for {ep}: {e}")
 								    token_worker_task = asyncio.create_task(token_worker())
 								    flush_task = asyncio.create_task(flush_buffer())
 								    await init_llm_cache(config)
-												using global aiohttp sessionpool for improved performance

											
										
										
											2025-09-10 10:21:49 +02:00
 								@app.on_event("shutdown")
 								async def shutdown_event() -> None:
 								    """Tear down the router's long-lived resources.

 								    In order: close the SSE queues, cancel the background tasks, flush the
 								    remaining buffers, close the proxy and probe aiohttp sessions, the Unix
 								    socket sessions, the httpx clients and the cached backend clients, and
 								    finally close the token database. Close failures are printed and do not
 								    stop the remaining cleanup steps.
 								    """
 								    await close_all_sse_queues()
 								    # Stop background tasks first so they stop touching the DB before we close it.
 								    for t in (token_worker_task, flush_task):
 								        if t is not None:
 								            t.cancel()
 								            try:
 								                await t
 								            except (asyncio.CancelledError, Exception):
 								                # Cancellation is expected here; any other task error is
 								                # deliberately ignored so shutdown can proceed.
 								                pass
 								    await flush_remaining_buffers()
 								    # Close the proxy session; guarded so a missing or failing session does
 								    # not skip the cleanup below (in particular the DB close).
 								    proxy_session = app_state.get("session")
 								    if proxy_session is not None:
 								        try:
 								            await proxy_session.close()
 								        except Exception as e:
 								            print(f"[shutdown] Error closing proxy session: {e}")
 								    if app_state.get("probe_session") is not None:
 								        try:
 								            await app_state["probe_session"].close()
 								        except Exception as e:
 								            print(f"[shutdown] Error closing probe session: {e}")
 								        app_state["probe_session"] = None
 								        app_state["probe_connector"] = None
 								    # Close Unix socket sessions
 								    for ep, sess in list(app_state.get("socket_sessions", {}).items()):
 								        try:
 								            await sess.close()
 								            print(f"[shutdown] Closed Unix socket session: {ep}")
 								        except Exception as e:
 								            print(f"[shutdown] Error closing Unix socket session {ep}: {e}")
 								    # Close httpx clients (external OpenAI endpoints and Unix socket transports)
 								    for ep, client in list(app_state.get("httpx_clients", {}).items()):
 								        try:
 								            await client.aclose()
 								            print(f"[shutdown] Closed httpx client: {ep}")
 								        except Exception as e:
 								            print(f"[shutdown] Error closing httpx client {ep}: {e}")
 								    # Close cached backend clients (reused across requests; see startup pre-warm).
 								    for key, client in list(app_state.get("ollama_clients", {}).items()):
 								        try:
 								            await client._client.aclose()
 								        except Exception as e:
 								            print(f"[shutdown] Error closing ollama client {key}: {e}")
 								    app_state.get("ollama_clients", {}).clear()
 								    for key, client in list(app_state.get("openai_clients", {}).items()):
 								        try:
 								            await client.close()
 								        except Exception as e:
 								            print(f"[shutdown] Error closing openai client {key}: {e}")
 								    app_state.get("openai_clients", {}).clear()
 								    # Close the aiosqlite connection last — its worker thread is non-daemon
 								    # and would otherwise keep the interpreter alive after lifespan completes.
 								    if db is not None:
 								        try:
 								            await db.close()
 								            print("[shutdown] Closed token DB connection.")
 								        except Exception as e:
 								            print(f"[shutdown] Error closing DB: {e}")