diff --git a/.forgejo/workflows/docker-publish.yml b/.forgejo/workflows/docker-publish.yml index 27cd879..3979f62 100644 --- a/.forgejo/workflows/docker-publish.yml +++ b/.forgejo/workflows/docker-publish.yml @@ -78,6 +78,8 @@ jobs: push: true provenance: false tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }} + cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }} + cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=max merge: runs-on: docker-amd64 diff --git a/README.md b/README.md index fb60988..1b952d9 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop ## Docker Deployment -### Pre-built image (OCI Registry) +### Pre-built image (GitHub Container Registry) Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release. diff --git a/requirements.txt b/requirements.txt index e71ef8a..110aa59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,18 +4,18 @@ aiosignal==1.4.0 annotated-types==0.7.0 anyio==4.13.0 async-timeout==5.0.1 -attrs==26.1.0 -certifi==2026.4.22 +attrs==25.4.0 +certifi==2025.11.12 click==8.3.3 distro==1.9.0 -exceptiongroup==1.3.1 +exceptiongroup==1.3.0 fastapi==0.136.1 fastapi-sse==1.1.1 frozenlist==1.8.0 h11==0.16.0 httpcore==1.0.9 httpx==0.28.1 -idna==3.14 +idna==3.13 jiter==0.14.0 multidict==6.7.1 ollama==0.6.2 @@ -23,9 +23,9 @@ openai==1.109.1 orjson>=3.11.5 numpy>=1.26 pillow==12.2.0 -propcache==0.5.2 +propcache==0.4.1 pydantic==2.13.4 -pydantic-settings==2.14.1 +pydantic-settings==2.10.1 pydantic_core==2.46.4 python-dotenv==1.2.2 PyYAML==6.0.3 diff --git a/router.py b/router.py index 603387e..326ec33 100644 --- a/router.py +++ b/router.py @@ -331,7 +331,6 @@ app.add_middleware( ) default_headers={ "HTTP-Referer": "https://nomyo.ai", - "Referer": "https://nomyo.ai", "X-Title": "NOMYO Router", } @@ -418,16 +417,7 @@ async def enforce_router_api_key(request: Request, call_next): response.headers["Access-Control-Allow-Headers"] = "Authorization, Content-Type" response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS" return response - - -@app.exception_handler(openai.APIStatusError) -async def _openai_api_status_error_handler(request: Request, exc: openai.APIStatusError): - """Forward upstream OpenAI-SDK status errors with their original status code and body - instead of letting them bubble up as 500s.""" - body = exc.body if exc.body is not None else {"error": {"message": str(exc), "code": exc.status_code}} - return JSONResponse(status_code=exc.status_code, content=body) - - + # ------------------------------------------------------------- # 3. Global state: per‑endpoint per‑model active connection counters # ------------------------------------------------------------- @@ -582,19 +572,6 @@ def _is_llama_model_loaded(item: dict) -> bool: return status == "loaded" return False -def _is_llama_model_loaded_or_sleeping(item: dict) -> bool: - """Return True if status is 'loaded' or 'sleeping'. - Newer llama-server versions report 'sleeping' in /v1/models when a model is idle; - ps_details needs to include these so _fetch_llama_props can detect and unload them.""" - status = item.get("status") - if status is None: - return True - if isinstance(status, dict): - return status.get("value") in ("loaded", "sleeping") - if isinstance(status, str): - return status in ("loaded", "sleeping") - return False - def is_ext_openai_endpoint(endpoint: str) -> bool: """ Determine if an endpoint is an external OpenAI-compatible endpoint (not Ollama or llama-server). @@ -807,19 +784,18 @@ class fetch: Internal function that performs the actual HTTP request to fetch available models. This is called by available_models() after checking caches and in-flight requests. """ - headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")} + headers = None if api_key is not None: - headers["Authorization"] = "Bearer " + api_key + headers = {"Authorization": "Bearer " + api_key} - ep_base = endpoint.rstrip("/") if endpoint in config.llama_server_endpoints and "/v1" not in endpoint: - endpoint_url = f"{ep_base}/v1/models" + endpoint_url = f"{endpoint}/v1/models" key = "data" elif "/v1" in endpoint or endpoint in config.llama_server_endpoints: - endpoint_url = f"{ep_base}/models" + endpoint_url = f"{endpoint}/models" key = "data" else: - endpoint_url = f"{ep_base}/api/tags" + endpoint_url = f"{endpoint}/api/tags" key = "models" client: aiohttp.ClientSession = get_session(endpoint) @@ -828,12 +804,13 @@ class fetch: await _ensure_success(resp) data = await resp.json() - items = data.get(key, []) - models = {item.get("id") or item.get("name") for item in items if item.get("id") or item.get("name")} + items = data.get(key, []) + models = {item.get("id") or item.get("name") for item in items if item.get("id") or item.get("name")} - async with _models_cache_lock: - _models_cache[endpoint] = (models, time.time()) - return models + # Update cache with lock protection + async with _models_cache_lock: + _models_cache[endpoint] = (models, time.time()) + return models except Exception as e: # Treat any error as if the endpoint offers no models message = _format_connection_issue(endpoint_url, e) @@ -1087,12 +1064,12 @@ class fetch: if _is_fresh(_available_error_cache[endpoint], 300): return [] - headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")} - if api_key is not None: - headers["Authorization"] = "Bearer " + api_key - - request_url = f"{endpoint.rstrip('/')}/{route.lstrip('/')}" client: aiohttp.ClientSession = get_session(endpoint) + headers = None + if api_key is not None: + headers = {"Authorization": "Bearer " + api_key} + + request_url = f"{endpoint}{route}" req_kwargs = {} if timeout is not None: req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout) @@ -2931,8 +2908,8 @@ async def ps_details_proxy(request: Request): llama_models_pending: list[dict] = [] for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded): - # Include sleeping models too so _fetch_llama_props can unload them - loaded_models = [item for item in modellist if _is_llama_model_loaded_or_sleeping(item)] + # Filter for loaded models only + loaded_models = [item for item in modellist if _is_llama_model_loaded(item)] for item in loaded_models: if isinstance(item, dict) and item.get("id"): raw_id = item["id"] @@ -3994,21 +3971,11 @@ async def startup_event() -> None: ssl_context = ssl.create_default_context() connector = aiohttp.TCPConnector(limit=0, limit_per_host=512, ssl=ssl_context) timeout = aiohttp.ClientTimeout(total=60, connect=15, sock_read=120, sock_connect=15) - session = aiohttp.ClientSession( - connector=connector, - timeout=timeout, - headers={"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}, - ) + session = aiohttp.ClientSession(connector=connector, timeout=timeout) app_state["connector"] = connector app_state["session"] = session - # Create httpx clients for external OpenAI endpoints (Google, etc.) - # aiohttp strips Referer headers for cross-origin requests, so we use httpx - for ep in config.endpoints: - if is_ext_openai_endpoint(ep): - app_state["httpx_clients"][ep] = httpx.AsyncClient(timeout=30.0) - # Create per-endpoint Unix socket sessions for .sock endpoints for ep in config.llama_server_endpoints: if _is_unix_socket_endpoint(ep):