Compare commits
21 commits
c4f7611817
...
3669365c3c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3669365c3c | ||
| a2f100fcd7 | |||
| 50f832668e | |||
| 84808c17bd | |||
| b6923a2b71 | |||
| 1cc4ce9ede | |||
|
|
2a9f42c0e0 | ||
| ab35d16e9a | |||
|
|
cf340d3575 | ||
| 8602f80d6b | |||
| 4c6156dc3f | |||
| 07b36ca480 | |||
|
|
fc3c2a161d | ||
|
|
aefeac1ff1 | ||
| fa7c2978fe | |||
|
|
a2dd6d10b3 | ||
|
|
abcb36bbd4 | ||
| 90a54abc9b | |||
| ecdd228a54 | |||
| e296ac19ba | |||
| 353fadac48 |
4 changed files with 59 additions and 28 deletions
|
|
@ -78,8 +78,6 @@ jobs:
|
|||
push: true
|
||||
provenance: false
|
||||
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }}
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=max
|
||||
|
||||
merge:
|
||||
runs-on: docker-amd64
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop
|
|||
|
||||
## Docker Deployment
|
||||
|
||||
### Pre-built image (GitHub Container Registry)
|
||||
### Pre-built image (OCI Registry)
|
||||
|
||||
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release.
|
||||
|
||||
|
|
|
|||
|
|
@ -4,18 +4,18 @@ aiosignal==1.4.0
|
|||
annotated-types==0.7.0
|
||||
anyio==4.13.0
|
||||
async-timeout==5.0.1
|
||||
attrs==25.4.0
|
||||
certifi==2025.11.12
|
||||
attrs==26.1.0
|
||||
certifi==2026.4.22
|
||||
click==8.3.3
|
||||
distro==1.9.0
|
||||
exceptiongroup==1.3.0
|
||||
exceptiongroup==1.3.1
|
||||
fastapi==0.136.1
|
||||
fastapi-sse==1.1.1
|
||||
frozenlist==1.8.0
|
||||
h11==0.16.0
|
||||
httpcore==1.0.9
|
||||
httpx==0.28.1
|
||||
idna==3.13
|
||||
idna==3.14
|
||||
jiter==0.14.0
|
||||
multidict==6.7.1
|
||||
ollama==0.6.2
|
||||
|
|
@ -23,9 +23,9 @@ openai==1.109.1
|
|||
orjson>=3.11.5
|
||||
numpy>=1.26
|
||||
pillow==12.2.0
|
||||
propcache==0.4.1
|
||||
propcache==0.5.2
|
||||
pydantic==2.13.4
|
||||
pydantic-settings==2.10.1
|
||||
pydantic-settings==2.14.1
|
||||
pydantic_core==2.46.4
|
||||
python-dotenv==1.2.2
|
||||
PyYAML==6.0.3
|
||||
|
|
|
|||
71
router.py
71
router.py
|
|
@ -331,6 +331,7 @@ app.add_middleware(
|
|||
)
|
||||
default_headers={
|
||||
"HTTP-Referer": "https://nomyo.ai",
|
||||
"Referer": "https://nomyo.ai",
|
||||
"X-Title": "NOMYO Router",
|
||||
}
|
||||
|
||||
|
|
@ -417,7 +418,16 @@ async def enforce_router_api_key(request: Request, call_next):
|
|||
response.headers["Access-Control-Allow-Headers"] = "Authorization, Content-Type"
|
||||
response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
|
||||
return response
|
||||
|
||||
|
||||
|
||||
@app.exception_handler(openai.APIStatusError)
|
||||
async def _openai_api_status_error_handler(request: Request, exc: openai.APIStatusError):
|
||||
"""Forward upstream OpenAI-SDK status errors with their original status code and body
|
||||
instead of letting them bubble up as 500s."""
|
||||
body = exc.body if exc.body is not None else {"error": {"message": str(exc), "code": exc.status_code}}
|
||||
return JSONResponse(status_code=exc.status_code, content=body)
|
||||
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# 3. Global state: per‑endpoint per‑model active connection counters
|
||||
# -------------------------------------------------------------
|
||||
|
|
@ -572,6 +582,19 @@ def _is_llama_model_loaded(item: dict) -> bool:
|
|||
return status == "loaded"
|
||||
return False
|
||||
|
||||
def _is_llama_model_loaded_or_sleeping(item: dict) -> bool:
|
||||
"""Return True if status is 'loaded' or 'sleeping'.
|
||||
Newer llama-server versions report 'sleeping' in /v1/models when a model is idle;
|
||||
ps_details needs to include these so _fetch_llama_props can detect and unload them."""
|
||||
status = item.get("status")
|
||||
if status is None:
|
||||
return True
|
||||
if isinstance(status, dict):
|
||||
return status.get("value") in ("loaded", "sleeping")
|
||||
if isinstance(status, str):
|
||||
return status in ("loaded", "sleeping")
|
||||
return False
|
||||
|
||||
def is_ext_openai_endpoint(endpoint: str) -> bool:
|
||||
"""
|
||||
Determine if an endpoint is an external OpenAI-compatible endpoint (not Ollama or llama-server).
|
||||
|
|
@ -784,18 +807,19 @@ class fetch:
|
|||
Internal function that performs the actual HTTP request to fetch available models.
|
||||
This is called by available_models() after checking caches and in-flight requests.
|
||||
"""
|
||||
headers = None
|
||||
headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
|
||||
if api_key is not None:
|
||||
headers = {"Authorization": "Bearer " + api_key}
|
||||
headers["Authorization"] = "Bearer " + api_key
|
||||
|
||||
ep_base = endpoint.rstrip("/")
|
||||
if endpoint in config.llama_server_endpoints and "/v1" not in endpoint:
|
||||
endpoint_url = f"{endpoint}/v1/models"
|
||||
endpoint_url = f"{ep_base}/v1/models"
|
||||
key = "data"
|
||||
elif "/v1" in endpoint or endpoint in config.llama_server_endpoints:
|
||||
endpoint_url = f"{endpoint}/models"
|
||||
endpoint_url = f"{ep_base}/models"
|
||||
key = "data"
|
||||
else:
|
||||
endpoint_url = f"{endpoint}/api/tags"
|
||||
endpoint_url = f"{ep_base}/api/tags"
|
||||
key = "models"
|
||||
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
|
|
@ -804,13 +828,12 @@ class fetch:
|
|||
await _ensure_success(resp)
|
||||
data = await resp.json()
|
||||
|
||||
items = data.get(key, [])
|
||||
models = {item.get("id") or item.get("name") for item in items if item.get("id") or item.get("name")}
|
||||
items = data.get(key, [])
|
||||
models = {item.get("id") or item.get("name") for item in items if item.get("id") or item.get("name")}
|
||||
|
||||
# Update cache with lock protection
|
||||
async with _models_cache_lock:
|
||||
_models_cache[endpoint] = (models, time.time())
|
||||
return models
|
||||
async with _models_cache_lock:
|
||||
_models_cache[endpoint] = (models, time.time())
|
||||
return models
|
||||
except Exception as e:
|
||||
# Treat any error as if the endpoint offers no models
|
||||
message = _format_connection_issue(endpoint_url, e)
|
||||
|
|
@ -1064,12 +1087,12 @@ class fetch:
|
|||
if _is_fresh(_available_error_cache[endpoint], 300):
|
||||
return []
|
||||
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
headers = None
|
||||
headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
|
||||
if api_key is not None:
|
||||
headers = {"Authorization": "Bearer " + api_key}
|
||||
headers["Authorization"] = "Bearer " + api_key
|
||||
|
||||
request_url = f"{endpoint}{route}"
|
||||
request_url = f"{endpoint.rstrip('/')}/{route.lstrip('/')}"
|
||||
client: aiohttp.ClientSession = get_session(endpoint)
|
||||
req_kwargs = {}
|
||||
if timeout is not None:
|
||||
req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
|
||||
|
|
@ -2908,8 +2931,8 @@ async def ps_details_proxy(request: Request):
|
|||
llama_models_pending: list[dict] = []
|
||||
|
||||
for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
|
||||
# Filter for loaded models only
|
||||
loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]
|
||||
# Include sleeping models too so _fetch_llama_props can unload them
|
||||
loaded_models = [item for item in modellist if _is_llama_model_loaded_or_sleeping(item)]
|
||||
for item in loaded_models:
|
||||
if isinstance(item, dict) and item.get("id"):
|
||||
raw_id = item["id"]
|
||||
|
|
@ -3971,11 +3994,21 @@ async def startup_event() -> None:
|
|||
ssl_context = ssl.create_default_context()
|
||||
connector = aiohttp.TCPConnector(limit=0, limit_per_host=512, ssl=ssl_context)
|
||||
timeout = aiohttp.ClientTimeout(total=60, connect=15, sock_read=120, sock_connect=15)
|
||||
session = aiohttp.ClientSession(connector=connector, timeout=timeout)
|
||||
session = aiohttp.ClientSession(
|
||||
connector=connector,
|
||||
timeout=timeout,
|
||||
headers={"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")},
|
||||
)
|
||||
|
||||
app_state["connector"] = connector
|
||||
app_state["session"] = session
|
||||
|
||||
# Create httpx clients for external OpenAI endpoints (Google, etc.)
|
||||
# aiohttp strips Referer headers for cross-origin requests, so we use httpx
|
||||
for ep in config.endpoints:
|
||||
if is_ext_openai_endpoint(ep):
|
||||
app_state["httpx_clients"][ep] = httpx.AsyncClient(timeout=30.0)
|
||||
|
||||
# Create per-endpoint Unix socket sessions for .sock endpoints
|
||||
for ep in config.llama_server_endpoints:
|
||||
if _is_unix_socket_endpoint(ep):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue