Compare commits

...

19 commits
v0.7.3 ... main

Author SHA1 Message Date
a3928c9c33 Merge pull request 'dev-v0.7.x -> main' (#25) from dev-v0.7.x into main
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 34s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 38s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m5s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 31s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m11s
Build and Publish Docker Image / merge (push) Successful in 31s
Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/25
2026-04-16 12:27:34 +02:00
1a2781ac23
fix: health check all endpoints with right per endpoint path
issue: resolving #24
2026-04-16 12:18:38 +02:00
a3e7e8a007
sec: bump pillow version to mitigate vuln 2026-04-14 09:31:05 +02:00
5ac412eb5c
doc: feature updates 2026-04-14 09:17:33 +02:00
537b757c4a
fix: align pip cmds 2026-04-13 14:13:35 +02:00
f4b3a09151 Merge pull request 'dev-v0.7.x -> main' (#22) from dev-v0.7.x into main
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 37s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 36s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m10s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 33s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m2s
Build and Publish Docker Image / merge (push) Successful in 33s
Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/22
2026-04-13 14:00:20 +02:00
1058f2418b
fix: security, exempt files to prevent path traversal 2026-04-10 17:40:44 +02:00
263c66aedd
feat: add hostname to dashboard 2026-04-10 17:29:43 +02:00
07b80e654f Merge pull request 'dev-v0.7.x' (#21) from dev-v0.7.x into main
All checks were successful
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 35s
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 36s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m21s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 32s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 10m3s
Build and Publish Docker Image / merge (push) Successful in 32s
Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/21
2026-04-08 14:01:54 +02:00
a432a65396
fix: params is never defined in ollama native backend 2026-04-08 13:01:56 +02:00
f364a2d123
fix: trying to replicate the non-semantic workflow 2026-04-08 10:28:28 +02:00
f92099de11
fix: don't know what to say 2026-04-08 10:24:18 +02:00
56b214ef46
fix: avoid docker hub lookup 2026-04-08 10:21:22 +02:00
88df31c390
fix: use internal cache tags 2026-04-08 10:18:59 +02:00
f6a0f7266c
fix: cache tags in workflows 2026-04-08 10:12:19 +02:00
0bf91a6dd0 Merge pull request 'dev-v0.7.x' (#20) from dev-v0.7.x into main
All checks were successful
Build and Publish Docker Image / build (amd64, linux/amd64, docker-amd64) (push) Successful in 1m7s
Build and Publish Docker Image / build (arm64, linux/arm64, docker-arm64) (push) Successful in 11m34s
Build and Publish Docker Image / merge (push) Successful in 36s
Build and Publish Docker Image (Semantic Cache) / build (amd64, linux/amd64, docker-amd64) (push) Successful in 35s
Build and Publish Docker Image (Semantic Cache) / build (arm64, linux/arm64, docker-arm64) (push) Successful in 14m38s
Build and Publish Docker Image (Semantic Cache) / merge (push) Successful in 47s
Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/20
2026-04-07 17:56:32 +02:00
27d77c6e5d
fix: docker build 2026-04-07 17:50:42 +02:00
9c4b506805
fix: workflow tagging for releases 2026-04-07 17:33:23 +02:00
ddbffee4ae
fix: tidy up in workflow merge step 2026-04-07 16:53:07 +02:00
8 changed files with 92 additions and 41 deletions

View file

@ -18,7 +18,6 @@ on:
env: env:
REGISTRY: bitfreedom.net REGISTRY: bitfreedom.net
IMAGE_NAME: ${{ github.repository }} IMAGE_NAME: ${{ github.repository }}
CACHE_IMAGE: ${{ github.repository }}-buildcache-semantic
DOCKER_BUILD_SUMMARY: "false" DOCKER_BUILD_SUMMARY: "false"
jobs: jobs:
@ -87,9 +86,9 @@ jobs:
provenance: false provenance: false
build-args: | build-args: |
SEMANTIC_CACHE=true SEMANTIC_CACHE=true
tags: ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-${{ matrix.arch }} tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }}
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }} cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }}
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }},mode=min cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }},mode=max
merge: merge:
runs-on: docker-amd64 runs-on: docker-amd64
@ -133,6 +132,8 @@ jobs:
uses: https://github.com/docker/metadata-action@v5 uses: https://github.com/docker/metadata-action@v5
with: with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
flavor: |
latest=false
tags: | tags: |
type=semver,pattern={{version}}-semantic type=semver,pattern={{version}}-semantic
type=semver,pattern={{major}}.{{minor}}-semantic type=semver,pattern={{major}}.{{minor}}-semantic
@ -143,15 +144,6 @@ jobs:
run: | run: |
docker buildx imagetools create \ docker buildx imagetools create \
$(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-amd64 \ ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64 \
${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-arm64 ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64
- name: Delete intermediate platform tags
run: |
CACHE_ENCODED=$(echo "${{ env.CACHE_IMAGE }}" | sed 's|/|%2F|g')
for tag in platform-semantic-amd64 platform-semantic-arm64; do
curl -s -X DELETE \
-H "Authorization: token ${{ secrets.REGISTRY_TOKEN }}" \
"https://${{ env.REGISTRY }}/api/v1/packages/${{ github.repository_owner }}/container/${CACHE_ENCODED}/${tag}" \
&& echo "Deleted ${tag}" || echo "Failed to delete ${tag} (ignored)"
done

View file

@ -79,7 +79,7 @@ jobs:
provenance: false provenance: false
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }} tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }}
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }} cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }}
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=min cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=max
merge: merge:
runs-on: docker-amd64 runs-on: docker-amd64
@ -94,7 +94,7 @@ jobs:
- name: Install Docker - name: Install Docker
run: | run: |
apt-get update -qq apt-get update -qq
apt-get install -y -qq docker.io jq curl apt-get install -y -qq docker.io jq
- name: Start Docker daemon - name: Start Docker daemon
run: | run: |
@ -123,6 +123,8 @@ jobs:
uses: https://github.com/docker/metadata-action@v5 uses: https://github.com/docker/metadata-action@v5
with: with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
flavor: |
latest=false
tags: | tags: |
type=semver,pattern={{version}} type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}} type=semver,pattern={{major}}.{{minor}}
@ -136,12 +138,3 @@ jobs:
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64 \ ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64 \
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64 ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64
- name: Delete intermediate platform tags
run: |
IMAGE_ENCODED=$(echo "${{ env.IMAGE_NAME }}" | sed 's|/|%2F|g')
for tag in platform-amd64 platform-arm64; do
curl -s -X DELETE \
-H "Authorization: token ${{ secrets.REGISTRY_TOKEN }}" \
"https://${{ env.REGISTRY }}/api/v1/packages/${{ github.repository_owner }}/container/${IMAGE_ENCODED}/${tag}" \
&& echo "Deleted ${tag}" || echo "Failed to delete ${tag} (ignored)"
done

View file

@ -26,8 +26,8 @@ RUN pip install --root-user-action=ignore --no-cache-dir --upgrade pip \
# CPU-only torch must be installed before sentence-transformers to avoid # CPU-only torch must be installed before sentence-transformers to avoid
# pulling the full CUDA-enabled build (~2.5 GB). # pulling the full CUDA-enabled build (~2.5 GB).
RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \ RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \ pip install --root-user-action=ignore --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
pip install --no-cache-dir sentence-transformers && \ pip install --root-user-action=ignore --no-cache-dir sentence-transformers && \
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \ python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
fi fi

View file

@ -80,14 +80,14 @@ Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automat
```sh ```sh
docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest
docker pull bitfreedom.net/nomyo-ai/nomyo-router:v0.7.0 docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7
``` ```
**Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB): **Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):
```sh ```sh
docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest-semantic docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest-semantic
docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7.0-semantic docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7-semantic
``` ```
### Build the container image locally ### Build the container image locally

View file

@ -127,6 +127,34 @@ The router can proxy requests to OpenAI-compatible endpoints alongside Ollama en
- Handles authentication with API keys - Handles authentication with API keys
- Maintains consistent behavior across endpoint types - Maintains consistent behavior across endpoint types
### Reactive Context-Shift
When a backend returns an `exceed_context_size_error` (context window exceeded), the router automatically trims the conversation history and retries rather than surfacing the error to the client.
**How it works:**
1. The error body contains `n_ctx` (the model's context limit) and `n_prompt_tokens` (the actual token count as measured by the backend).
2. `_calibrated_trim_target()` computes a tiktoken-scale trim target using the *delta* between actual tokens and the context limit, correcting for the fact that tiktoken counts fewer tokens than the backend tokeniser does.
3. `_trim_messages_for_context()` implements a sliding-window drop: system messages are always preserved; the oldest non-system messages are evicted first (FIFO) until the estimated token count fits the target. The most recent message is never dropped. After trimming, leading assistant/tool messages are removed to satisfy chat-template requirements (first non-system message must be a user message).
4. Two retry attempts are made:
- **Retry 1** — trimmed messages, original tool definitions.
- **Retry 2** — trimmed messages with tool definitions also stripped (handles cases where tool schemas alone consume too many tokens).
**Proactive pre-trimming:**
Once a context overflow has been observed for an endpoint/model pair whose `n_ctx` ≤ 32 768, the router records that limit in `_endpoint_nctx`. Subsequent requests to the same pair are pre-trimmed before being sent, avoiding the round-trip to the backend entirely for small-context models.
### Reactive SSE Push
The `/api/usage-stream` endpoint delivers real-time usage updates using a pub/sub push model rather than client polling.
**Mechanism:**
- `subscribe()` creates a bounded `asyncio.Queue` (capacity 10) and registers it in `_subscribers`.
- Whenever `usage_counts` or `token_usage_counts` change — on every `increment_usage`, `decrement_usage`, or token-worker flush — `_capture_snapshot()` serialises the current state to JSON while the caller still holds the relevant lock, then `_distribute_snapshot()` pushes the snapshot to every registered queue outside the lock.
- If a subscriber's queue is full (slow client), the oldest undelivered snapshot is evicted before the new one is enqueued, so fast producers never block on slow consumers.
- `unsubscribe()` removes the queue when the SSE connection closes; `close_all_sse_queues()` sends a `None` sentinel to all subscribers during router shutdown.
## Performance Considerations ## Performance Considerations
### Concurrency Model ### Concurrency Model
@ -145,7 +173,7 @@ The router can proxy requests to OpenAI-compatible endpoints alongside Ollama en
### Memory Management ### Memory Management
- **Write-behind pattern**: Token counts buffered in memory, flushed periodically - **Write-behind pattern**: Token counts buffered in memory, flushed periodically
- **Queue-based SSE**: Server-Sent Events use bounded queues to prevent memory bloat - **Queue-based SSE**: Bounded per-subscriber queues (capacity 10) with oldest-eviction — see [Reactive SSE Push](#reactive-sse-push)
- **Automatic cleanup**: Zero connection counts are removed from tracking - **Automatic cleanup**: Zero connection counts are removed from tracking
## Error Handling ## Error Handling

View file

@ -22,7 +22,7 @@ ollama==0.6.1
openai==1.102.0 openai==1.102.0
orjson>=3.11.5 orjson>=3.11.5
numpy>=1.26 numpy>=1.26
pillow==12.1.1 pillow==12.2.0
propcache==0.3.2 propcache==0.3.2
pydantic==2.11.7 pydantic==2.11.7
pydantic-settings==2.10.1 pydantic-settings==2.10.1

View file

@ -6,7 +6,7 @@ version: 0.7
license: AGPL license: AGPL
""" """
# ------------------------------------------------------------- # -------------------------------------------------------------
import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket
try: try:
import truststore; truststore.inject_into_ssl() import truststore; truststore.inject_into_ssl()
except ImportError: except ImportError:
@ -373,7 +373,11 @@ async def enforce_router_api_key(request: Request, call_next):
return await call_next(request) return await call_next(request)
path = request.url.path path = request.url.path
if path.startswith("/static") or path in {"/", "/favicon.ico"}: # Allow static assets (CSS, JS, images, fonts) but NOT HTML pages,
# which would bypass auth by accessing /static/index.html directly.
_STATIC_ASSET_EXTS = {".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".svg", ".woff", ".woff2", ".ttf", ".map"}
is_static_asset = path.startswith("/static") and Path(path).suffix.lower() in _STATIC_ASSET_EXTS
if is_static_asset or path in {"/", "/favicon.ico"}:
return await call_next(request) return await call_next(request)
provided_key = _extract_router_api_key(request) provided_key = _extract_router_api_key(request)
@ -2110,7 +2114,11 @@ async def chat_proxy(request: Request):
# Only cache when no max_tokens limit was set — otherwise # Only cache when no max_tokens limit was set — otherwise
# finish_reason=length might just mean max_tokens was hit, # finish_reason=length might just mean max_tokens was hit,
# not that the context window was exhausted. # not that the context window was exhausted.
_req_max_tok = params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict") _req_max_tok = (
params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
if use_openai else
(options.get("num_predict") if options else None)
)
if _dr == "length" and not _req_max_tok: if _dr == "length" and not _req_max_tok:
_pt = getattr(chunk, "prompt_eval_count", 0) or 0 _pt = getattr(chunk, "prompt_eval_count", 0) or 0
_ct = getattr(chunk, "eval_count", 0) or 0 _ct = getattr(chunk, "eval_count", 0) or 0
@ -3746,22 +3754,38 @@ async def health_proxy(request: Request):
- `endpoints`: a mapping of endpoint URL `{status, version|detail}`. - `endpoints`: a mapping of endpoint URL `{status, version|detail}`.
* The HTTP status code is 200 when everything is healthy, 503 otherwise. * The HTTP status code is 200 when everything is healthy, 503 otherwise.
""" """
# Run all health checks in parallel # Run all health checks in parallel.
tasks = [fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True) for ep in config.endpoints] # if not is_ext_openai_endpoint(ep)] # Ollama endpoints expose /api/version; OpenAI-compatible endpoints (vLLM,
# llama-server, external) expose /models. Using /api/version against an
# OpenAI-compatible endpoint yields a 404 and noisy log output.
all_endpoints = list(config.endpoints)
llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
all_endpoints += llama_eps_extra
tasks = []
for ep in all_endpoints:
if is_openai_compatible(ep):
tasks.append(fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True))
else:
tasks.append(fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True))
results = await asyncio.gather(*tasks, return_exceptions=True) results = await asyncio.gather(*tasks, return_exceptions=True)
health_summary = {} health_summary = {}
overall_ok = True overall_ok = True
for ep, result in zip(config.endpoints, results): for ep, result in zip(all_endpoints, results):
if isinstance(result, Exception): if isinstance(result, Exception):
# Endpoint did not respond / returned an error # Endpoint did not respond / returned an error
health_summary[ep] = {"status": "error", "detail": str(result)} health_summary[ep] = {"status": "error", "detail": str(result)}
overall_ok = False overall_ok = False
else: else:
# Successful response report the reported version # Successful response report the reported version (Ollama) or
health_summary[ep] = {"status": "ok", "version": result} # indicate the endpoint is reachable (OpenAI-compatible).
if is_openai_compatible(ep):
health_summary[ep] = {"status": "ok"}
else:
health_summary[ep] = {"status": "ok", "version": result}
response_payload = { response_payload = {
"status": "ok" if overall_ok else "error", "status": "ok" if overall_ok else "error",
@ -3772,7 +3796,15 @@ async def health_proxy(request: Request):
return JSONResponse(content=response_payload, status_code=http_status) return JSONResponse(content=response_payload, status_code=http_status)
# ------------------------------------------------------------- # -------------------------------------------------------------
# 27. SSE route for usage broadcasts # 27. Hostname endpoint
# -------------------------------------------------------------
@app.get("/api/hostname")
async def get_hostname():
"""Return the hostname of the machine running the router."""
return JSONResponse(content={"hostname": socket.gethostname()})
# -------------------------------------------------------------
# 28. SSE route for usage broadcasts
# ------------------------------------------------------------- # -------------------------------------------------------------
@app.get("/api/usage-stream") @app.get("/api/usage-stream")
async def usage_stream(request: Request): async def usage_stream(request: Request):

View file

@ -344,6 +344,7 @@
</div> </div>
<div class="header-row"> <div class="header-row">
<h1>Router Dashboard</h1> <h1>Router Dashboard</h1>
<span id="hostname" style="color:#777; font-size:0.85em;"></span>
<button id="total-tokens-btn">Stats Total</button> <button id="total-tokens-btn">Stats Total</button>
<span id="aggregation-status" class="loading" style="margin-left:8px;"></span> <span id="aggregation-status" class="loading" style="margin-left:8px;"></span>
</div> </div>
@ -1418,6 +1419,11 @@ function initStatsChart(timeSeriesData, endpointDistribution) {
</script> </script>
<script> <script>
document.addEventListener('DOMContentLoaded', () => { document.addEventListener('DOMContentLoaded', () => {
authedFetch('/api/hostname').then(r => r.json()).then(data => {
const el = document.getElementById('hostname');
if (el && data.hostname) el.textContent = data.hostname;
}).catch(() => {});
const totalBtn = document.getElementById('total-tokens-btn'); const totalBtn = document.getElementById('total-tokens-btn');
if (totalBtn) { if (totalBtn) {
totalBtn.addEventListener('click', async () => { totalBtn.addEventListener('click', async () => {