Merge pull request 'dev-v0.7.x -> main' (#25 ) from dev-v0.7.x into main

Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/25
fix: health check all endpoints with right per enpoint path
2026-04-16 12:27:34 +02:00 · 2026-04-16 12:18:38 +02:00 · 2026-04-14 09:31:05 +02:00 · 2026-04-14 09:17:33 +02:00 · 2026-04-13 14:13:35 +02:00 · 2026-04-13 14:00:20 +02:00
8 changed files with 92 additions and 41 deletions
--- a/.forgejo/workflows/docker-publish-semantic.yml
+++ b/.forgejo/workflows/docker-publish-semantic.yml
@ -18,7 +18,6 @@ on:
 env:
  REGISTRY: bitfreedom.net
  IMAGE_NAME: ${{ github.repository }}
-  CACHE_IMAGE: ${{ github.repository }}-buildcache-semantic
  DOCKER_BUILD_SUMMARY: "false"

 jobs:
@ -87,9 +86,9 @@ jobs:
          provenance: false
          build-args: |
            SEMANTIC_CACHE=true
-          tags: ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-${{ matrix.arch }}
-          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }}
-          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }},mode=min
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }}
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }}
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }},mode=max

  merge:
    runs-on: docker-amd64
@ -133,6 +132,8 @@ jobs:
        uses: https://github.com/docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          flavor: |
+            latest=false
          tags: |
            type=semver,pattern={{version}}-semantic
            type=semver,pattern={{major}}.{{minor}}-semantic
@ -143,15 +144,6 @@ jobs:
        run: |
          docker buildx imagetools create \
            $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
-            ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-amd64 \
-            ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-arm64
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64 \
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64

-      - name: Delete intermediate platform tags
-        run: |
-          CACHE_ENCODED=$(echo "${{ env.CACHE_IMAGE }}" | sed 's|/|%2F|g')
-          for tag in platform-semantic-amd64 platform-semantic-arm64; do
-            curl -s -X DELETE \
-              -H "Authorization: token ${{ secrets.REGISTRY_TOKEN }}" \
-              "https://${{ env.REGISTRY }}/api/v1/packages/${{ github.repository_owner }}/container/${CACHE_ENCODED}/${tag}" \
-              && echo "Deleted ${tag}" || echo "Failed to delete ${tag} (ignored)"
-          done
--- a/.forgejo/workflows/docker-publish.yml
+++ b/.forgejo/workflows/docker-publish.yml
@ -79,7 +79,7 @@ jobs:
          provenance: false
          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }}
          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }}
-          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=min
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=max

  merge:
    runs-on: docker-amd64
@ -94,7 +94,7 @@ jobs:
      - name: Install Docker
        run: |
          apt-get update -qq
-          apt-get install -y -qq docker.io jq curl
+          apt-get install -y -qq docker.io jq

      - name: Start Docker daemon
        run: |
@ -123,6 +123,8 @@ jobs:
        uses: https://github.com/docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          flavor: |
+            latest=false
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
@ -136,12 +138,3 @@ jobs:
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64 \
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64

-      - name: Delete intermediate platform tags
-        run: |
-          IMAGE_ENCODED=$(echo "${{ env.IMAGE_NAME }}" | sed 's|/|%2F|g')
-          for tag in platform-amd64 platform-arm64; do
-            curl -s -X DELETE \
-              -H "Authorization: token ${{ secrets.REGISTRY_TOKEN }}" \
-              "https://${{ env.REGISTRY }}/api/v1/packages/${{ github.repository_owner }}/container/${IMAGE_ENCODED}/${tag}" \
-              && echo "Deleted ${tag}" || echo "Failed to delete ${tag} (ignored)"
-          done
--- a/4
+++ b/4
@ -26,8 +26,8 @@ RUN pip install --root-user-action=ignore --no-cache-dir --upgrade pip \
 # CPU-only torch must be installed before sentence-transformers to avoid
 # pulling the full CUDA-enabled build (~2.5 GB).
 RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
-      pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
-      pip install --no-cache-dir sentence-transformers && \
+      pip install --root-user-action=ignore --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
+      pip install --root-user-action=ignore --no-cache-dir sentence-transformers && \
      python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
    fi

--- a/README.md
+++ b/README.md
@ -80,14 +80,14 @@ Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automat

 ```sh
 docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest
-docker pull bitfreedom.net/nomyo-ai/nomyo-router:v0.7.0
+docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7
 ```

 **Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):

 ```sh
 docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest-semantic
-docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7.0-semantic
+docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7-semantic
 ```

 ### Build the container image locally
--- a/doc/architecture.md
+++ b/doc/architecture.md
@ -127,6 +127,34 @@ The router can proxy requests to OpenAI-compatible endpoints alongside Ollama en
 - Handles authentication with API keys
 - Maintains consistent behavior across endpoint types

+### Reactive Context-Shift
+
+When a backend returns a `exceed_context_size_error` (context window exceeded), the router automatically trims the conversation history and retries rather than surfacing the error to the client.
+
+**How it works:**
+
+1. The error body contains `n_ctx` (the model's context limit) and `n_prompt_tokens` (the actual token count as measured by the backend).
+2. `_calibrated_trim_target()` computes a tiktoken-scale trim target using the *delta* between actual tokens and the context limit, correcting for the fact that tiktoken counts fewer tokens than the backend tokeniser does.
+3. `_trim_messages_for_context()` implements a sliding-window drop: system messages are always preserved; the oldest non-system messages are evicted first (FIFO) until the estimated token count fits the target. The most recent message is never dropped. After trimming, leading assistant/tool messages are removed to satisfy chat-template requirements (first non-system message must be a user message).
+4. Two retry attempts are made:
+   - **Retry 1** — trimmed messages, original tool definitions.
+   - **Retry 2** — trimmed messages with tool definitions also stripped (handles cases where tool schemas alone consume too many tokens).
+
+**Proactive pre-trimming:**
+
+Once a context overflow has been observed for an endpoint/model pair whose `n_ctx` ≤ 32 768, the router records that limit in `_endpoint_nctx`. Subsequent requests to the same pair are pre-trimmed before being sent, avoiding the round-trip to the backend entirely for small-context models.
+
+### Reactive SSE Push
+
+The `/api/usage-stream` endpoint delivers real-time usage updates using a pub/sub push model rather than client polling.
+
+**Mechanism:**
+
+- `subscribe()` creates a bounded `asyncio.Queue` (capacity 10) and registers it in `_subscribers`.
+- Whenever `usage_counts` or `token_usage_counts` change — on every `increment_usage`, `decrement_usage`, or token-worker flush — `_capture_snapshot()` serialises the current state to JSON while the caller still holds the relevant lock, then `_distribute_snapshot()` pushes the snapshot to every registered queue outside the lock.
+- If a subscriber's queue is full (slow client), the oldest undelivered snapshot is evicted before the new one is enqueued, so fast producers never block on slow consumers.
+- `unsubscribe()` removes the queue when the SSE connection closes; `close_all_sse_queues()` sends a `None` sentinel to all subscribers during router shutdown.
+
 ## Performance Considerations

 ### Concurrency Model
@ -145,7 +173,7 @@ The router can proxy requests to OpenAI-compatible endpoints alongside Ollama en
 ### Memory Management

 - **Write-behind pattern**: Token counts buffered in memory, flushed periodically
- **Queue-based SSE**: Server-Sent Events use bounded queues to prevent memory bloat
+- **Queue-based SSE**: Bounded per-subscriber queues (capacity 10) with oldest-eviction — see [Reactive SSE Push](#reactive-sse-push)
 - **Automatic cleanup**: Zero connection counts are removed from tracking

 ## Error Handling
--- a/requirements.txt
+++ b/requirements.txt
@ -22,7 +22,7 @@ ollama==0.6.1
 openai==1.102.0
 orjson>=3.11.5
 numpy>=1.26
-pillow==12.1.1
+pillow==12.2.0
 propcache==0.3.2
 pydantic==2.11.7
 pydantic-settings==2.10.1
--- a/router.py
+++ b/router.py
@ -6,7 +6,7 @@ version: 0.7
 license: AGPL
 """
 # -------------------------------------------------------------
-import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math
+import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket
 try:
    import truststore; truststore.inject_into_ssl()
 except ImportError:
@ -373,7 +373,11 @@ async def enforce_router_api_key(request: Request, call_next):
        return await call_next(request)

    path = request.url.path
-    if path.startswith("/static") or path in {"/", "/favicon.ico"}:
+    # Allow static assets (CSS, JS, images, fonts) but NOT HTML pages,
+    # which would bypass auth by accessing /static/index.html directly.
+    _STATIC_ASSET_EXTS = {".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".svg", ".woff", ".woff2", ".ttf", ".map"}
+    is_static_asset = path.startswith("/static") and Path(path).suffix.lower() in _STATIC_ASSET_EXTS
+    if is_static_asset or path in {"/", "/favicon.ico"}:
        return await call_next(request)

    provided_key = _extract_router_api_key(request)
@ -2110,7 +2114,11 @@ async def chat_proxy(request: Request):
                        # Only cache when no max_tokens limit was set — otherwise
                        # finish_reason=length might just mean max_tokens was hit,
                        # not that the context window was exhausted.
-                        _req_max_tok = params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
+                        _req_max_tok = (
+                            params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
+                            if use_openai else
+                            (options.get("num_predict") if options else None)
+                        )
                        if _dr == "length" and not _req_max_tok:
                            _pt = getattr(chunk, "prompt_eval_count", 0) or 0
                            _ct = getattr(chunk, "eval_count", 0) or 0
@ -3746,21 +3754,37 @@ async def health_proxy(request: Request):
        - `endpoints`: a mapping of endpoint URL → `{status, version|detail}`.
    * The HTTP status code is 200 when everything is healthy, 503 otherwise.
    """
-    # Run all health checks in parallel
-    tasks = [fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True) for ep in config.endpoints] # if not is_ext_openai_endpoint(ep)]
+    # Run all health checks in parallel.
+    # Ollama endpoints expose /api/version; OpenAI-compatible endpoints (vLLM,
+    # llama-server, external) expose /models.  Using /api/version against an
+    # OpenAI-compatible endpoint yields a 404 and noisy log output.
+    all_endpoints = list(config.endpoints)
+    llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
+    all_endpoints += llama_eps_extra
+
+    tasks = []
+    for ep in all_endpoints:
+        if is_openai_compatible(ep):
+            tasks.append(fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True))
+        else:
+            tasks.append(fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True))

    results = await asyncio.gather(*tasks, return_exceptions=True)

    health_summary = {}
    overall_ok = True

-    for ep, result in zip(config.endpoints, results):
+    for ep, result in zip(all_endpoints, results):
        if isinstance(result, Exception):
            # Endpoint did not respond / returned an error
            health_summary[ep] = {"status": "error", "detail": str(result)}
            overall_ok = False
        else:
-            # Successful response – report the reported version
+            # Successful response – report the reported version (Ollama) or
+            # indicate the endpoint is reachable (OpenAI-compatible).
+            if is_openai_compatible(ep):
+                health_summary[ep] = {"status": "ok"}
+            else:
                health_summary[ep] = {"status": "ok", "version": result}

    response_payload = {
@ -3772,7 +3796,15 @@ async def health_proxy(request: Request):
    return JSONResponse(content=response_payload, status_code=http_status)

 # -------------------------------------------------------------
-# 27. SSE route for usage broadcasts
+# 27. Hostname endpoint
+# -------------------------------------------------------------
+@app.get("/api/hostname")
+async def get_hostname():
+    """Return the hostname of the machine running the router."""
+    return JSONResponse(content={"hostname": socket.gethostname()})
+
+# -------------------------------------------------------------
+# 28. SSE route for usage broadcasts
 # -------------------------------------------------------------
@app.get("/api/usage-stream")
 async def usage_stream(request: Request):
--- a/static/index.html
+++ b/static/index.html
@ -344,6 +344,7 @@
        </div>
        <div class="header-row">
            <h1>Router Dashboard</h1>
+            <span id="hostname" style="color:#777; font-size:0.85em;"></span>
            <button id="total-tokens-btn">Stats Total</button>
            <span id="aggregation-status" class="loading" style="margin-left:8px;"></span>
        </div>
@ -1418,6 +1419,11 @@ function initStatsChart(timeSeriesData, endpointDistribution) {
        </script>
 <script>
 document.addEventListener('DOMContentLoaded', () => {
+    authedFetch('/api/hostname').then(r => r.json()).then(data => {
+        const el = document.getElementById('hostname');
+        if (el && data.hostname) el.textContent = data.hostname;
+    }).catch(() => {});
+
    const totalBtn = document.getElementById('total-tokens-btn');
    if (totalBtn) {
        totalBtn.addEventListener('click', async () => {
Author	SHA1	Message	Date
Alpha Nerd	a3928c9c33	Merge pull request 'dev-v0.7.x -> main' (#25 ) from dev-v0.7.x into main All checks were successful Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 34s Details Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 38s Details Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m5s Details Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 31s Details Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m11s Details Build and Publish Docker Image / merge (push) Successful in 31s Details Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/25	2026-04-16 12:27:34 +02:00
alpha nerd	1a2781ac23	fix: health check all endpoints with right per enpoint path issue: resolving #24	2026-04-16 12:18:38 +02:00
alpha nerd	a3e7e8a007	sec: bump pillow version to mitigate vuln	2026-04-14 09:31:05 +02:00
alpha nerd	5ac412eb5c	doc: feature updates	2026-04-14 09:17:33 +02:00
alpha nerd	537b757c4a	fix: align pip cmds	2026-04-13 14:13:35 +02:00
Alpha Nerd	f4b3a09151	Merge pull request 'dev-v0.7.x -> main' (#22 ) from dev-v0.7.x into main All checks were successful Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 37s Details Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 36s Details Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m10s Details Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 33s Details Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m2s Details Build and Publish Docker Image / merge (push) Successful in 33s Details Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/22	2026-04-13 14:00:20 +02:00
alpha nerd	1058f2418b	fix: security, exempt files to prevent path traversal	2026-04-10 17:40:44 +02:00
alpha nerd	263c66aedd	feat: add hostname to dashboard	2026-04-10 17:29:43 +02:00
Alpha Nerd	07b80e654f	Merge pull request 'dev-v0.7.x' (#21 ) from dev-v0.7.x into main All checks were successful Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 35s Details Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 36s Details Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m21s Details Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 32s Details Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m3s Details Build and Publish Docker Image / merge (push) Successful in 32s Details Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/21	2026-04-08 14:01:54 +02:00
alpha nerd	a432a65396	fix: params is never defined in ollama native backend	2026-04-08 13:01:56 +02:00
alpha nerd	f364a2d123	fix: trying to replicate the non-semantic workflow	2026-04-08 10:28:28 +02:00
alpha nerd	f92099de11	fix: don't know what to say	2026-04-08 10:24:18 +02:00
alpha nerd	56b214ef46	fix: avoid docker hub lookup	2026-04-08 10:21:22 +02:00
alpha nerd	88df31c390	fix: use internal cache tags	2026-04-08 10:18:59 +02:00
alpha nerd	f6a0f7266c	fix: cache tags in workflows	2026-04-08 10:12:19 +02:00
Alpha Nerd	0bf91a6dd0	Merge pull request 'dev-v0.7.x' (#20 ) from dev-v0.7.x into main All checks were successful Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 1m7s Details Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 11m34s Details Build and Publish Docker Image / merge (push) Successful in 36s Details Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 35s Details Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 14m38s Details Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 47s Details Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/20	2026-04-07 17:56:32 +02:00
alpha nerd	27d77c6e5d	fix: docker build	2026-04-07 17:50:42 +02:00
alpha nerd	9c4b506805	fix: workflow tagging for releases	2026-04-07 17:33:23 +02:00
alpha nerd	ddbffee4ae	fix: tidy up in workflow merge step	2026-04-07 16:53:07 +02:00