diff --git a/.forgejo/workflows/docker-publish-semantic.yml b/.forgejo/workflows/docker-publish-semantic.yml index 2174f21..2fa59d5 100644 --- a/.forgejo/workflows/docker-publish-semantic.yml +++ b/.forgejo/workflows/docker-publish-semantic.yml @@ -18,7 +18,6 @@ on: env: REGISTRY: bitfreedom.net IMAGE_NAME: ${{ github.repository }} - CACHE_IMAGE: ${{ github.repository }}-buildcache-semantic DOCKER_BUILD_SUMMARY: "false" jobs: @@ -87,9 +86,9 @@ jobs: provenance: false build-args: | SEMANTIC_CACHE=true - tags: ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-${{ matrix.arch }} - cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }} - cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }},mode=min + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }} + cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }} + cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }},mode=max merge: runs-on: docker-amd64 @@ -133,6 +132,8 @@ jobs: uses: https://github.com/docker/metadata-action@v5 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + flavor: | + latest=false tags: | type=semver,pattern={{version}}-semantic type=semver,pattern={{major}}.{{minor}}-semantic @@ -143,15 +144,6 @@ jobs: run: | docker buildx imagetools create \ $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-amd64 \ - ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-arm64 + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64 \ + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64 - - name: Delete intermediate platform tags - run: | - CACHE_ENCODED=$(echo "${{ env.CACHE_IMAGE }}" | sed 's|/|%2F|g') - for tag in platform-semantic-amd64 platform-semantic-arm64; do - curl -s -X DELETE \ - -H "Authorization: token ${{ secrets.REGISTRY_TOKEN }}" \ - "https://${{ env.REGISTRY }}/api/v1/packages/${{ github.repository_owner }}/container/${CACHE_ENCODED}/${tag}" \ - && echo "Deleted ${tag}" || echo "Failed to delete ${tag} (ignored)" - done diff --git a/.forgejo/workflows/docker-publish.yml b/.forgejo/workflows/docker-publish.yml index c93ec2e..3979f62 100644 --- a/.forgejo/workflows/docker-publish.yml +++ b/.forgejo/workflows/docker-publish.yml @@ -79,7 +79,7 @@ jobs: provenance: false tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }} cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }} - cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=min + cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=max merge: runs-on: docker-amd64 @@ -94,7 +94,7 @@ jobs: - name: Install Docker run: | apt-get update -qq - apt-get install -y -qq docker.io jq curl + apt-get install -y -qq docker.io jq - name: Start Docker daemon run: | @@ -123,6 +123,8 @@ jobs: uses: https://github.com/docker/metadata-action@v5 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + flavor: | + latest=false tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} @@ -136,12 +138,3 @@ jobs: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64 \ ${{ 
env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64 - - name: Delete intermediate platform tags - run: | - IMAGE_ENCODED=$(echo "${{ env.IMAGE_NAME }}" | sed 's|/|%2F|g') - for tag in platform-amd64 platform-arm64; do - curl -s -X DELETE \ - -H "Authorization: token ${{ secrets.REGISTRY_TOKEN }}" \ - "https://${{ env.REGISTRY }}/api/v1/packages/${{ github.repository_owner }}/container/${IMAGE_ENCODED}/${tag}" \ - && echo "Deleted ${tag}" || echo "Failed to delete ${tag} (ignored)" - done diff --git a/Dockerfile b/Dockerfile index 0caf66c..c14a655 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,8 +26,8 @@ RUN pip install --root-user-action=ignore --no-cache-dir --upgrade pip \ # CPU-only torch must be installed before sentence-transformers to avoid # pulling the full CUDA-enabled build (~2.5 GB). RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \ - pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \ - pip install --no-cache-dir sentence-transformers && \ + pip install --root-user-action=ignore --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \ + pip install --root-user-action=ignore --no-cache-dir sentence-transformers && \ python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \ fi diff --git a/README.md b/README.md index 337b4c3..ef3e6f2 100644 --- a/README.md +++ b/README.md @@ -80,14 +80,14 @@ Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automat ```sh docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest -docker pull bitfreedom.net/nomyo-ai/nomyo-router:v0.7.0 +docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7 ``` **Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB): ```sh docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest-semantic -docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7.0-semantic +docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7-semantic ``` ### Build the container image 
locally diff --git a/doc/architecture.md b/doc/architecture.md index aa96855..f725573 100644 --- a/doc/architecture.md +++ b/doc/architecture.md @@ -127,6 +127,34 @@ The router can proxy requests to OpenAI-compatible endpoints alongside Ollama en - Handles authentication with API keys - Maintains consistent behavior across endpoint types +### Reactive Context-Shift + +When a backend returns an `exceed_context_size_error` (context window exceeded), the router automatically trims the conversation history and retries rather than surfacing the error to the client. + +**How it works:** + +1. The error body contains `n_ctx` (the model's context limit) and `n_prompt_tokens` (the actual token count as measured by the backend). +2. `_calibrated_trim_target()` computes a tiktoken-scale trim target using the *delta* between actual tokens and the context limit, correcting for the fact that tiktoken counts fewer tokens than the backend tokeniser does. +3. `_trim_messages_for_context()` implements a sliding-window drop: system messages are always preserved; the oldest non-system messages are evicted first (FIFO) until the estimated token count fits the target. The most recent message is never dropped. After trimming, leading assistant/tool messages are removed to satisfy chat-template requirements (first non-system message must be a user message). +4. Two retry attempts are made: + - **Retry 1** — trimmed messages, original tool definitions. + - **Retry 2** — trimmed messages with tool definitions also stripped (handles cases where tool schemas alone consume too many tokens). + +**Proactive pre-trimming:** + +Once a context overflow has been observed for an endpoint/model pair whose `n_ctx` ≤ 32 768, the router records that limit in `_endpoint_nctx`. Subsequent requests to the same pair are pre-trimmed before being sent, avoiding the round-trip to the backend entirely for small-context models. 
+ +### Reactive SSE Push + +The `/api/usage-stream` endpoint delivers real-time usage updates using a pub/sub push model rather than client polling. + +**Mechanism:** + +- `subscribe()` creates a bounded `asyncio.Queue` (capacity 10) and registers it in `_subscribers`. +- Whenever `usage_counts` or `token_usage_counts` change — on every `increment_usage`, `decrement_usage`, or token-worker flush — `_capture_snapshot()` serialises the current state to JSON while the caller still holds the relevant lock, then `_distribute_snapshot()` pushes the snapshot to every registered queue outside the lock. +- If a subscriber's queue is full (slow client), the oldest undelivered snapshot is evicted before the new one is enqueued, so fast producers never block on slow consumers. +- `unsubscribe()` removes the queue when the SSE connection closes; `close_all_sse_queues()` sends a `None` sentinel to all subscribers during router shutdown. + ## Performance Considerations ### Concurrency Model @@ -145,7 +173,7 @@ The router can proxy requests to OpenAI-compatible endpoints alongside Ollama en ### Memory Management - **Write-behind pattern**: Token counts buffered in memory, flushed periodically -- **Queue-based SSE**: Server-Sent Events use bounded queues to prevent memory bloat +- **Queue-based SSE**: Bounded per-subscriber queues (capacity 10) with oldest-eviction — see [Reactive SSE Push](#reactive-sse-push) - **Automatic cleanup**: Zero connection counts are removed from tracking ## Error Handling diff --git a/requirements.txt b/requirements.txt index 2db1ba4..8c1f93b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ ollama==0.6.1 openai==1.102.0 orjson>=3.11.5 numpy>=1.26 -pillow==12.1.1 +pillow==12.2.0 propcache==0.3.2 pydantic==2.11.7 pydantic-settings==2.10.1 diff --git a/router.py b/router.py index c87c5ca..9c02077 100644 --- a/router.py +++ b/router.py @@ -6,7 +6,7 @@ version: 0.7 license: AGPL """ # 
------------------------------------------------------------- -import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math +import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket try: import truststore; truststore.inject_into_ssl() except ImportError: @@ -373,7 +373,11 @@ async def enforce_router_api_key(request: Request, call_next): return await call_next(request) path = request.url.path - if path.startswith("/static") or path in {"/", "/favicon.ico"}: + # Allow static assets (CSS, JS, images, fonts) but NOT HTML pages, + # which would bypass auth by accessing /static/index.html directly. + _STATIC_ASSET_EXTS = {".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".svg", ".woff", ".woff2", ".ttf", ".map"} + is_static_asset = path.startswith("/static") and Path(path).suffix.lower() in _STATIC_ASSET_EXTS + if is_static_asset or path in {"/", "/favicon.ico"}: return await call_next(request) provided_key = _extract_router_api_key(request) @@ -2110,7 +2114,11 @@ async def chat_proxy(request: Request): # Only cache when no max_tokens limit was set — otherwise # finish_reason=length might just mean max_tokens was hit, # not that the context window was exhausted. - _req_max_tok = params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict") + _req_max_tok = ( + params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict") + if use_openai else + (options.get("num_predict") if options else None) + ) if _dr == "length" and not _req_max_tok: _pt = getattr(chunk, "prompt_eval_count", 0) or 0 _ct = getattr(chunk, "eval_count", 0) or 0 @@ -3746,22 +3754,38 @@ async def health_proxy(request: Request): - `endpoints`: a mapping of endpoint URL → `{status, version|detail}`. * The HTTP status code is 200 when everything is healthy, 503 otherwise. 
""" - # Run all health checks in parallel - tasks = [fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True) for ep in config.endpoints] # if not is_ext_openai_endpoint(ep)] + # Run all health checks in parallel. + # Ollama endpoints expose /api/version; OpenAI-compatible endpoints (vLLM, + # llama-server, external) expose /models. Using /api/version against an + # OpenAI-compatible endpoint yields a 404 and noisy log output. + all_endpoints = list(config.endpoints) + llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints] + all_endpoints += llama_eps_extra + + tasks = [] + for ep in all_endpoints: + if is_openai_compatible(ep): + tasks.append(fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True)) + else: + tasks.append(fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True)) results = await asyncio.gather(*tasks, return_exceptions=True) health_summary = {} overall_ok = True - for ep, result in zip(config.endpoints, results): + for ep, result in zip(all_endpoints, results): if isinstance(result, Exception): # Endpoint did not respond / returned an error health_summary[ep] = {"status": "error", "detail": str(result)} overall_ok = False else: - # Successful response – report the reported version - health_summary[ep] = {"status": "ok", "version": result} + # Successful response – report the reported version (Ollama) or + # indicate the endpoint is reachable (OpenAI-compatible). + if is_openai_compatible(ep): + health_summary[ep] = {"status": "ok"} + else: + health_summary[ep] = {"status": "ok", "version": result} response_payload = { "status": "ok" if overall_ok else "error", @@ -3772,7 +3796,15 @@ async def health_proxy(request: Request): return JSONResponse(content=response_payload, status_code=http_status) # ------------------------------------------------------------- -# 27. SSE route for usage broadcasts +# 27. 
Hostname endpoint +# ------------------------------------------------------------- +@app.get("/api/hostname") +async def get_hostname(): + """Return the hostname of the machine running the router.""" + return JSONResponse(content={"hostname": socket.gethostname()}) + +# ------------------------------------------------------------- +# 28. SSE route for usage broadcasts # ------------------------------------------------------------- @app.get("/api/usage-stream") async def usage_stream(request: Request): diff --git a/static/index.html b/static/index.html index e53e629..419d7bb 100644 --- a/static/index.html +++ b/static/index.html @@ -344,6 +344,7 @@