Compare commits
19 commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a3928c9c33 | |||
| 1a2781ac23 | |||
| a3e7e8a007 | |||
| 5ac412eb5c | |||
| 537b757c4a | |||
| f4b3a09151 | |||
| 1058f2418b | |||
| 263c66aedd | |||
| 07b80e654f | |||
| a432a65396 | |||
| f364a2d123 | |||
| f92099de11 | |||
| 56b214ef46 | |||
| 88df31c390 | |||
| f6a0f7266c | |||
| 0bf91a6dd0 | |||
| 27d77c6e5d | |||
| 9c4b506805 | |||
| ddbffee4ae |
8 changed files with 92 additions and 41 deletions
|
|
@ -18,7 +18,6 @@ on:
|
|||
env:
|
||||
REGISTRY: bitfreedom.net
|
||||
IMAGE_NAME: ${{ github.repository }}
|
||||
CACHE_IMAGE: ${{ github.repository }}-buildcache-semantic
|
||||
DOCKER_BUILD_SUMMARY: "false"
|
||||
|
||||
jobs:
|
||||
|
|
@ -87,9 +86,9 @@ jobs:
|
|||
provenance: false
|
||||
build-args: |
|
||||
SEMANTIC_CACHE=true
|
||||
tags: ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-${{ matrix.arch }}
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }},mode=min
|
||||
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }}
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }},mode=max
|
||||
|
||||
merge:
|
||||
runs-on: docker-amd64
|
||||
|
|
@ -133,6 +132,8 @@ jobs:
|
|||
uses: https://github.com/docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=semver,pattern={{version}}-semantic
|
||||
type=semver,pattern={{major}}.{{minor}}-semantic
|
||||
|
|
@ -143,15 +144,6 @@ jobs:
|
|||
run: |
|
||||
docker buildx imagetools create \
|
||||
$(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
|
||||
${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-amd64 \
|
||||
${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-arm64
|
||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64 \
|
||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64
|
||||
|
||||
- name: Delete intermediate platform tags
|
||||
run: |
|
||||
CACHE_ENCODED=$(echo "${{ env.CACHE_IMAGE }}" | sed 's|/|%2F|g')
|
||||
for tag in platform-semantic-amd64 platform-semantic-arm64; do
|
||||
curl -s -X DELETE \
|
||||
-H "Authorization: token ${{ secrets.REGISTRY_TOKEN }}" \
|
||||
"https://${{ env.REGISTRY }}/api/v1/packages/${{ github.repository_owner }}/container/${CACHE_ENCODED}/${tag}" \
|
||||
&& echo "Deleted ${tag}" || echo "Failed to delete ${tag} (ignored)"
|
||||
done
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ jobs:
|
|||
provenance: false
|
||||
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }}
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=min
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=max
|
||||
|
||||
merge:
|
||||
runs-on: docker-amd64
|
||||
|
|
@ -94,7 +94,7 @@ jobs:
|
|||
- name: Install Docker
|
||||
run: |
|
||||
apt-get update -qq
|
||||
apt-get install -y -qq docker.io jq curl
|
||||
apt-get install -y -qq docker.io jq
|
||||
|
||||
- name: Start Docker daemon
|
||||
run: |
|
||||
|
|
@ -123,6 +123,8 @@ jobs:
|
|||
uses: https://github.com/docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=semver,pattern={{version}}
|
||||
type=semver,pattern={{major}}.{{minor}}
|
||||
|
|
@ -136,12 +138,3 @@ jobs:
|
|||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64 \
|
||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64
|
||||
|
||||
- name: Delete intermediate platform tags
|
||||
run: |
|
||||
IMAGE_ENCODED=$(echo "${{ env.IMAGE_NAME }}" | sed 's|/|%2F|g')
|
||||
for tag in platform-amd64 platform-arm64; do
|
||||
curl -s -X DELETE \
|
||||
-H "Authorization: token ${{ secrets.REGISTRY_TOKEN }}" \
|
||||
"https://${{ env.REGISTRY }}/api/v1/packages/${{ github.repository_owner }}/container/${IMAGE_ENCODED}/${tag}" \
|
||||
&& echo "Deleted ${tag}" || echo "Failed to delete ${tag} (ignored)"
|
||||
done
|
||||
|
|
|
|||
|
|
@ -26,8 +26,8 @@ RUN pip install --root-user-action=ignore --no-cache-dir --upgrade pip \
|
|||
# CPU-only torch must be installed before sentence-transformers to avoid
|
||||
# pulling the full CUDA-enabled build (~2.5 GB).
|
||||
RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
|
||||
pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
|
||||
pip install --no-cache-dir sentence-transformers && \
|
||||
pip install --root-user-action=ignore --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
|
||||
pip install --root-user-action=ignore --no-cache-dir sentence-transformers && \
|
||||
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
|
||||
fi
|
||||
|
||||
|
|
|
|||
|
|
@ -80,14 +80,14 @@ Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automat
|
|||
|
||||
```sh
|
||||
docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest
|
||||
docker pull bitfreedom.net/nomyo-ai/nomyo-router:v0.7.0
|
||||
docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7
|
||||
```
|
||||
|
||||
**Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):
|
||||
|
||||
```sh
|
||||
docker pull bitfreedom.net/nomyo-ai/nomyo-router:latest-semantic
|
||||
docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7.0-semantic
|
||||
docker pull bitfreedom.net/nomyo-ai/nomyo-router:0.7-semantic
|
||||
```
|
||||
|
||||
### Build the container image locally
|
||||
|
|
|
|||
|
|
@ -127,6 +127,34 @@ The router can proxy requests to OpenAI-compatible endpoints alongside Ollama en
|
|||
- Handles authentication with API keys
|
||||
- Maintains consistent behavior across endpoint types
|
||||
|
||||
### Reactive Context-Shift
|
||||
|
||||
When a backend returns a `exceed_context_size_error` (context window exceeded), the router automatically trims the conversation history and retries rather than surfacing the error to the client.
|
||||
|
||||
**How it works:**
|
||||
|
||||
1. The error body contains `n_ctx` (the model's context limit) and `n_prompt_tokens` (the actual token count as measured by the backend).
|
||||
2. `_calibrated_trim_target()` computes a tiktoken-scale trim target using the *delta* between actual tokens and the context limit, correcting for the fact that tiktoken counts fewer tokens than the backend tokeniser does.
|
||||
3. `_trim_messages_for_context()` implements a sliding-window drop: system messages are always preserved; the oldest non-system messages are evicted first (FIFO) until the estimated token count fits the target. The most recent message is never dropped. After trimming, leading assistant/tool messages are removed to satisfy chat-template requirements (first non-system message must be a user message).
|
||||
4. Two retry attempts are made:
|
||||
- **Retry 1** — trimmed messages, original tool definitions.
|
||||
- **Retry 2** — trimmed messages with tool definitions also stripped (handles cases where tool schemas alone consume too many tokens).
|
||||
|
||||
**Proactive pre-trimming:**
|
||||
|
||||
Once a context overflow has been observed for an endpoint/model pair whose `n_ctx` ≤ 32 768, the router records that limit in `_endpoint_nctx`. Subsequent requests to the same pair are pre-trimmed before being sent, avoiding the round-trip to the backend entirely for small-context models.
|
||||
|
||||
### Reactive SSE Push
|
||||
|
||||
The `/api/usage-stream` endpoint delivers real-time usage updates using a pub/sub push model rather than client polling.
|
||||
|
||||
**Mechanism:**
|
||||
|
||||
- `subscribe()` creates a bounded `asyncio.Queue` (capacity 10) and registers it in `_subscribers`.
|
||||
- Whenever `usage_counts` or `token_usage_counts` change — on every `increment_usage`, `decrement_usage`, or token-worker flush — `_capture_snapshot()` serialises the current state to JSON while the caller still holds the relevant lock, then `_distribute_snapshot()` pushes the snapshot to every registered queue outside the lock.
|
||||
- If a subscriber's queue is full (slow client), the oldest undelivered snapshot is evicted before the new one is enqueued, so fast producers never block on slow consumers.
|
||||
- `unsubscribe()` removes the queue when the SSE connection closes; `close_all_sse_queues()` sends a `None` sentinel to all subscribers during router shutdown.
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Concurrency Model
|
||||
|
|
@ -145,7 +173,7 @@ The router can proxy requests to OpenAI-compatible endpoints alongside Ollama en
|
|||
### Memory Management
|
||||
|
||||
- **Write-behind pattern**: Token counts buffered in memory, flushed periodically
|
||||
- **Queue-based SSE**: Server-Sent Events use bounded queues to prevent memory bloat
|
||||
- **Queue-based SSE**: Bounded per-subscriber queues (capacity 10) with oldest-eviction — see [Reactive SSE Push](#reactive-sse-push)
|
||||
- **Automatic cleanup**: Zero connection counts are removed from tracking
|
||||
|
||||
## Error Handling
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ ollama==0.6.1
|
|||
openai==1.102.0
|
||||
orjson>=3.11.5
|
||||
numpy>=1.26
|
||||
pillow==12.1.1
|
||||
pillow==12.2.0
|
||||
propcache==0.3.2
|
||||
pydantic==2.11.7
|
||||
pydantic-settings==2.10.1
|
||||
|
|
|
|||
48
router.py
48
router.py
|
|
@ -6,7 +6,7 @@ version: 0.7
|
|||
license: AGPL
|
||||
"""
|
||||
# -------------------------------------------------------------
|
||||
import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math
|
||||
import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math, socket
|
||||
try:
|
||||
import truststore; truststore.inject_into_ssl()
|
||||
except ImportError:
|
||||
|
|
@ -373,7 +373,11 @@ async def enforce_router_api_key(request: Request, call_next):
|
|||
return await call_next(request)
|
||||
|
||||
path = request.url.path
|
||||
if path.startswith("/static") or path in {"/", "/favicon.ico"}:
|
||||
# Allow static assets (CSS, JS, images, fonts) but NOT HTML pages,
|
||||
# which would bypass auth by accessing /static/index.html directly.
|
||||
_STATIC_ASSET_EXTS = {".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".svg", ".woff", ".woff2", ".ttf", ".map"}
|
||||
is_static_asset = path.startswith("/static") and Path(path).suffix.lower() in _STATIC_ASSET_EXTS
|
||||
if is_static_asset or path in {"/", "/favicon.ico"}:
|
||||
return await call_next(request)
|
||||
|
||||
provided_key = _extract_router_api_key(request)
|
||||
|
|
@ -2110,7 +2114,11 @@ async def chat_proxy(request: Request):
|
|||
# Only cache when no max_tokens limit was set — otherwise
|
||||
# finish_reason=length might just mean max_tokens was hit,
|
||||
# not that the context window was exhausted.
|
||||
_req_max_tok = params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
|
||||
_req_max_tok = (
|
||||
params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
|
||||
if use_openai else
|
||||
(options.get("num_predict") if options else None)
|
||||
)
|
||||
if _dr == "length" and not _req_max_tok:
|
||||
_pt = getattr(chunk, "prompt_eval_count", 0) or 0
|
||||
_ct = getattr(chunk, "eval_count", 0) or 0
|
||||
|
|
@ -3746,21 +3754,37 @@ async def health_proxy(request: Request):
|
|||
- `endpoints`: a mapping of endpoint URL → `{status, version|detail}`.
|
||||
* The HTTP status code is 200 when everything is healthy, 503 otherwise.
|
||||
"""
|
||||
# Run all health checks in parallel
|
||||
tasks = [fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True) for ep in config.endpoints] # if not is_ext_openai_endpoint(ep)]
|
||||
# Run all health checks in parallel.
|
||||
# Ollama endpoints expose /api/version; OpenAI-compatible endpoints (vLLM,
|
||||
# llama-server, external) expose /models. Using /api/version against an
|
||||
# OpenAI-compatible endpoint yields a 404 and noisy log output.
|
||||
all_endpoints = list(config.endpoints)
|
||||
llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
|
||||
all_endpoints += llama_eps_extra
|
||||
|
||||
tasks = []
|
||||
for ep in all_endpoints:
|
||||
if is_openai_compatible(ep):
|
||||
tasks.append(fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True))
|
||||
else:
|
||||
tasks.append(fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True))
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
health_summary = {}
|
||||
overall_ok = True
|
||||
|
||||
for ep, result in zip(config.endpoints, results):
|
||||
for ep, result in zip(all_endpoints, results):
|
||||
if isinstance(result, Exception):
|
||||
# Endpoint did not respond / returned an error
|
||||
health_summary[ep] = {"status": "error", "detail": str(result)}
|
||||
overall_ok = False
|
||||
else:
|
||||
# Successful response – report the reported version
|
||||
# Successful response – report the reported version (Ollama) or
|
||||
# indicate the endpoint is reachable (OpenAI-compatible).
|
||||
if is_openai_compatible(ep):
|
||||
health_summary[ep] = {"status": "ok"}
|
||||
else:
|
||||
health_summary[ep] = {"status": "ok", "version": result}
|
||||
|
||||
response_payload = {
|
||||
|
|
@ -3772,7 +3796,15 @@ async def health_proxy(request: Request):
|
|||
return JSONResponse(content=response_payload, status_code=http_status)
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# 27. SSE route for usage broadcasts
|
||||
# 27. Hostname endpoint
|
||||
# -------------------------------------------------------------
|
||||
@app.get("/api/hostname")
|
||||
async def get_hostname():
|
||||
"""Return the hostname of the machine running the router."""
|
||||
return JSONResponse(content={"hostname": socket.gethostname()})
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# 28. SSE route for usage broadcasts
|
||||
# -------------------------------------------------------------
|
||||
@app.get("/api/usage-stream")
|
||||
async def usage_stream(request: Request):
|
||||
|
|
|
|||
|
|
@ -344,6 +344,7 @@
|
|||
</div>
|
||||
<div class="header-row">
|
||||
<h1>Router Dashboard</h1>
|
||||
<span id="hostname" style="color:#777; font-size:0.85em;"></span>
|
||||
<button id="total-tokens-btn">Stats Total</button>
|
||||
<span id="aggregation-status" class="loading" style="margin-left:8px;"></span>
|
||||
</div>
|
||||
|
|
@ -1418,6 +1419,11 @@ function initStatsChart(timeSeriesData, endpointDistribution) {
|
|||
</script>
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
authedFetch('/api/hostname').then(r => r.json()).then(data => {
|
||||
const el = document.getElementById('hostname');
|
||||
if (el && data.hostname) el.textContent = data.hostname;
|
||||
}).catch(() => {});
|
||||
|
||||
const totalBtn = document.getElementById('total-tokens-btn');
|
||||
if (totalBtn) {
|
||||
totalBtn.addEventListener('click', async () => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue