Merge pull request 'dev-v0.8.x' (#58) from dev-v0.8.x into main
Some checks failed
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m41s
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 35s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 32s
Build and Publish Docker Image / merge (push) Has been skipped
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m40s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Failing after 1m20s

Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/58
This commit is contained in:
Alpha Nerd 2026-05-11 09:50:43 +02:00
commit b6923a2b71
2 changed files with 53 additions and 20 deletions

View file

@ -72,7 +72,7 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop
## Docker Deployment
### Pre-built image (GitHub Container Registry)
### Pre-built image (OCI Registry)
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release.

View file

@ -331,6 +331,7 @@ app.add_middleware(
)
# Default header values for the router. The "HTTP-Referer" entry is also read
# elsewhere in this file (with a "https://nomyo.ai" fallback) to populate the
# "Referer" header on outgoing aiohttp requests.
# NOTE(review): where this dict itself is passed as a whole is not visible
# here -- confirm against the client construction code.
default_headers={
"HTTP-Referer": "https://nomyo.ai",
"Referer": "https://nomyo.ai",
"X-Title": "NOMYO Router",
}
@ -417,7 +418,16 @@ async def enforce_router_api_key(request: Request, call_next):
response.headers["Access-Control-Allow-Headers"] = "Authorization, Content-Type"
response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
return response
@app.exception_handler(openai.APIStatusError)
async def _openai_api_status_error_handler(request: Request, exc: openai.APIStatusError):
    """Relay an upstream OpenAI-SDK status error to the client.

    The response reuses the upstream status code. When the exception carries
    a body it is forwarded as-is; otherwise a minimal error object is built
    from the exception message and status code. Without this handler the
    exception would bubble up and surface as a 500.
    """
    payload = exc.body
    if payload is None:
        # No upstream body available: synthesize an error envelope.
        payload = {"error": {"message": str(exc), "code": exc.status_code}}
    return JSONResponse(status_code=exc.status_code, content=payload)
# -------------------------------------------------------------
# 3. Global state: per-endpoint per-model active connection counters
# -------------------------------------------------------------
@ -572,6 +582,19 @@ def _is_llama_model_loaded(item: dict) -> bool:
return status == "loaded"
return False
def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
"""Return True if status is 'loaded' or 'sleeping'.
Newer llama-server versions report 'sleeping' in /v1/models when a model is idle;
ps_details needs to include these so _fetch_llama_props can detect and unload them."""
status = item.get("status")
if status is None:
return True
if isinstance(status, dict):
return status.get("value") in ("loaded", "sleeping")
if isinstance(status, str):
return status in ("loaded", "sleeping")
return False
def is_ext_openai_endpoint(endpoint: str) -> bool:
"""
Determine if an endpoint is an external OpenAI-compatible endpoint (not Ollama or llama-server).
@ -784,18 +807,19 @@ class fetch:
Internal function that performs the actual HTTP request to fetch available models.
This is called by available_models() after checking caches and in-flight requests.
"""
headers = None
headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
if api_key is not None:
headers = {"Authorization": "Bearer " + api_key}
headers["Authorization"] = "Bearer " + api_key
ep_base = endpoint.rstrip("/")
if endpoint in config.llama_server_endpoints and "/v1" not in endpoint:
endpoint_url = f"{endpoint}/v1/models"
endpoint_url = f"{ep_base}/v1/models"
key = "data"
elif "/v1" in endpoint or endpoint in config.llama_server_endpoints:
endpoint_url = f"{endpoint}/models"
endpoint_url = f"{ep_base}/models"
key = "data"
else:
endpoint_url = f"{endpoint}/api/tags"
endpoint_url = f"{ep_base}/api/tags"
key = "models"
client: aiohttp.ClientSession = get_session(endpoint)
@ -804,13 +828,12 @@ class fetch:
await _ensure_success(resp)
data = await resp.json()
items = data.get(key, [])
models = {item.get("id") or item.get("name") for item in items if item.get("id") or item.get("name")}
items = data.get(key, [])
models = {item.get("id") or item.get("name") for item in items if item.get("id") or item.get("name")}
# Update cache with lock protection
async with _models_cache_lock:
_models_cache[endpoint] = (models, time.time())
return models
async with _models_cache_lock:
_models_cache[endpoint] = (models, time.time())
return models
except Exception as e:
# Treat any error as if the endpoint offers no models
message = _format_connection_issue(endpoint_url, e)
@ -1064,12 +1087,12 @@ class fetch:
if _is_fresh(_available_error_cache[endpoint], 300):
return []
client: aiohttp.ClientSession = get_session(endpoint)
headers = None
headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
if api_key is not None:
headers = {"Authorization": "Bearer " + api_key}
headers["Authorization"] = "Bearer " + api_key
request_url = f"{endpoint}{route}"
request_url = f"{endpoint.rstrip('/')}/{route.lstrip('/')}"
client: aiohttp.ClientSession = get_session(endpoint)
req_kwargs = {}
if timeout is not None:
req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
@ -2908,8 +2931,8 @@ async def ps_details_proxy(request: Request):
llama_models_pending: list[dict] = []
for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
# Filter for loaded models only
loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]
# Include sleeping models too so _fetch_llama_props can unload them
loaded_models = [item for item in modellist if _is_llama_model_loaded_or_sleeping(item)]
for item in loaded_models:
if isinstance(item, dict) and item.get("id"):
raw_id = item["id"]
@ -3971,11 +3994,21 @@ async def startup_event() -> None:
ssl_context = ssl.create_default_context()
connector = aiohttp.TCPConnector(limit=0, limit_per_host=512, ssl=ssl_context)
timeout = aiohttp.ClientTimeout(total=60, connect=15, sock_read=120, sock_connect=15)
session = aiohttp.ClientSession(connector=connector, timeout=timeout)
session = aiohttp.ClientSession(
connector=connector,
timeout=timeout,
headers={"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")},
)
app_state["connector"] = connector
app_state["session"] = session
# Create httpx clients for external OpenAI endpoints (Google, etc.)
# aiohttp strips Referer headers for cross-origin requests, so we use httpx
for ep in config.endpoints:
if is_ext_openai_endpoint(ep):
app_state["httpx_clients"][ep] = httpx.AsyncClient(timeout=30.0)
# Create per-endpoint Unix socket sessions for .sock endpoints
for ep in config.llama_server_endpoints:
if _is_unix_socket_endpoint(ep):