feat: adding a semantic cache layer

This commit is contained in:
Alpha Nerd 2026-03-08 09:12:09 +01:00
parent c3d47c7ffe
commit dd4b12da6a
13 changed files with 1138 additions and 22 deletions

44
.dockerignore Normal file
View file

@ -0,0 +1,44 @@
# Version control
.git
.gitignore
.github
# Environment & secrets
.env
.env.*
*.env
# Python artifacts
__pycache__
*.pyc
*.pyo
*.pyd
.Python
.venv
venv
*.egg-info
dist
build
# Local databases (don't bake data into image)
*.db
*.db-shm
*.db-wal
# IDE / editor
.vscode
.idea
*.swp
*.swo
# Documentation
doc/
docs/
*.md
# Tests
tests/
test_*
# Local config overrides
config.local.yaml

View file

@ -0,0 +1,71 @@
name: Build and Publish Docker Image (Semantic Cache)
# Builds the :semantic variant that includes sentence-transformers + CPU torch
# and the pre-baked all-MiniLM-L6-v2 embedding model (~500 MB larger than lean).
# Tags mirror the lean workflow but carry a -semantic suffix, e.g.:
# ghcr.io/nomyo-ai/nomyo-router:latest-semantic
# ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
# ghcr.io/nomyo-ai/nomyo-router:0.7-semantic
on:
push:
branches:
- main
tags:
- "v*.*.*"
workflow_dispatch:
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
jobs:
build-and-push-semantic:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up QEMU (for multi-arch builds)
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract Docker metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
# Versioned semantic tags on git tags (e.g. v0.7.0 → 0.7.0-semantic, 0.7-semantic)
type=semver,pattern={{version}}-semantic
type=semver,pattern={{major}}.{{minor}}-semantic
# latest-semantic only on main branch pushes
type=raw,value=latest-semantic,enable=${{ github.ref == 'refs/heads/main' }}
# SHA-tagged for traceability
type=sha,prefix=sha-,suffix=-semantic
- name: Build and push semantic Docker image
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64,linux/arm64
push: true
build-args: |
SEMANTIC_CACHE=true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

View file

@ -3,21 +3,43 @@ FROM python:3.13-slim
ENV PYTHONUNBUFFERED=1 \ ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 PYTHONDONTWRITEBYTECODE=1
# SEMANTIC_CACHE=true installs sentence-transformers + CPU-only torch and pre-bakes
# the all-MiniLM-L6-v2 embedding model (~500 MB extra). The resulting image is tagged
# :semantic. The default (lean) image supports exact-match caching only.
ARG SEMANTIC_CACHE=false
# Pin HuggingFace cache to a predictable path inside /app/data so it can be
# mounted as a volume and shared between builds.
ENV HF_HOME=/app/data/hf_cache
# Install SQLite # Install SQLite
RUN apt-get update && apt-get install -y sqlite3 RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \ RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -r requirements.txt
# Semantic cache deps — only installed when SEMANTIC_CACHE=true
# CPU-only torch must be installed before sentence-transformers to avoid
# pulling the full CUDA-enabled build (~2.5 GB).
RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
pip install --no-cache-dir sentence-transformers && \
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
fi
# Create database directory and set permissions # Create database directory and set permissions
RUN mkdir -p /app/data && chown -R www-data:www-data /app/data RUN mkdir -p /app/data && chown -R www-data:www-data /app/data
COPY . . COPY . .
RUN chmod +x /app/entrypoint.sh RUN chmod +x /app/entrypoint.sh && \
chown -R www-data:www-data /app
EXPOSE 12434 EXPOSE 12434
USER www-data
ENTRYPOINT ["/app/entrypoint.sh"] ENTRYPOINT ["/app/entrypoint.sh"]

View file

@ -74,22 +74,28 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop
### Pre-built image (GitHub Container Registry) ### Pre-built image (GitHub Container Registry)
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release: Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release.
**Lean image** (exact-match cache, ~300 MB):
```sh ```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:latest docker pull ghcr.io/nomyo-ai/nomyo-router:latest
```
Specific version:
```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0 docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0
``` ```
### Build the container image locally: **Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):
```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
```
### Build the container image locally
```sh ```sh
# Lean build (exact match cache, default)
docker build -t nomyo-router . docker build -t nomyo-router .
# Semantic build — sentence-transformers + model baked in
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
``` ```
Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container: Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container:
@ -124,6 +130,53 @@ This way the Ollama backend servers are utilized more efficiently than by simply u
NOMYO Router also supports OpenAI API compatible v1 backend servers. NOMYO Router also supports OpenAI API compatible v1 backend servers.
## Semantic LLM Cache
NOMYO Router includes an optional semantic cache that serves repeated or semantically similar LLM requests from cache — no endpoint round-trip, no token cost, response in <10 ms.
### Enable (exact match, any image)
```yaml
# config.yaml
cache_enabled: true
cache_backend: sqlite # persists across restarts
cache_similarity: 1.0 # exact match only
cache_ttl: 3600
```
### Enable (semantic matching, :semantic image)
```yaml
cache_enabled: true
cache_backend: sqlite
cache_similarity: 0.90 # "What is Python?" ≈ "What's Python?" → cache hit
cache_ttl: 3600
cache_history_weight: 0.3
```
Pull the semantic image:
```bash
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
```
### Cache key strategy
Each request is keyed on `model + system_prompt` (exact) combined with a weighted-mean embedding of BM25-weighted chat history (30%) and the last user message (70%). This means:
- Different system prompts → always separate cache namespaces (no cross-tenant leakage)
- Same question, different phrasing → cache hit (semantic mode)
- MOE requests (`moe-*`) → always bypass the cache
### Cached routes
`/api/chat` · `/api/generate` · `/v1/chat/completions` · `/v1/completions`
### Cache management
```bash
curl http://localhost:12434/api/cache/stats # hit rate, counters, config
curl -X POST http://localhost:12434/api/cache/invalidate # clear all entries
```
## Supplying the router API key ## Supplying the router API key
If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key: If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

407
cache.py Normal file
View file

@ -0,0 +1,407 @@
"""
LLM Semantic Cache for NOMYO Router.
Strategy:
- Namespace: sha256(route :: model :: system_prompt)[:16] → exact context isolation
- Cache key: hash(normalize(last_user_message), namespace) → exact lookup
- Embedding: weighted mean of
    α * embed(bm25_weighted(chat_history))  → conversation context
    1-α * embed(last_user_message)          → the actual question
  with α = cache_history_weight (default 0.3).
- Exact-match caching (similarity=1.0) uses DummyEmbeddingProvider → zero extra deps.
- Semantic caching (similarity<1.0) requires sentence-transformers. If missing the
library falls back to exact-match with a warning (lean Docker image behaviour).
- MOE models (moe-*) always bypass the cache.
- Token counts are never recorded for cache hits.
- Streaming cache hits are served as a single-chunk response.
"""
import hashlib
import math
import time
import warnings
from collections import Counter
from typing import Any, Optional
# Lazily resolved once at first embed() call
_semantic_available: Optional[bool] = None
def _check_sentence_transformers() -> bool:
global _semantic_available
if _semantic_available is None:
try:
import sentence_transformers # noqa: F401
_semantic_available = True
except ImportError:
_semantic_available = False
return _semantic_available # type: ignore[return-value]
# ---------------------------------------------------------------------------
# BM25-weighted text representation of chat history
# ---------------------------------------------------------------------------
def _bm25_weighted_text(history: list[dict]) -> str:
"""
Produce a BM25-importance-weighted text string from chat history turns.
High-IDF (rare, domain-specific) terms are repeated proportionally to
their BM25 score so the downstream sentence-transformer embedding
naturally upweights topical signal and downweights stop words.
"""
docs = [m.get("content", "") for m in history if m.get("content")]
if not docs:
return ""
def _tok(text: str) -> list[str]:
return [w.lower() for w in text.split() if len(w) > 2]
tokenized = [_tok(d) for d in docs]
N = len(tokenized)
df: Counter = Counter()
for tokens in tokenized:
for term in set(tokens):
df[term] += 1
k1, b = 1.5, 0.75
avg_dl = sum(len(t) for t in tokenized) / max(N, 1)
term_scores: Counter = Counter()
for tokens in tokenized:
tf_c = Counter(tokens)
dl = len(tokens)
for term, tf in tf_c.items():
idf = math.log((N + 1) / (df[term] + 1)) + 1.0
score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1)))
term_scores[term] += score
top = term_scores.most_common(50)
if not top:
return " ".join(docs)
max_s = top[0][1]
out: list[str] = []
for term, score in top:
out.extend([term] * max(1, round(3 * score / max_s)))
return " ".join(out)
# ---------------------------------------------------------------------------
# LLMCache
# ---------------------------------------------------------------------------
class LLMCache:
    """
    Thin async wrapper around async-semantic-llm-cache that adds:
    - Route-aware namespace isolation
    - Two-vector weighted-mean embedding (history context + question)
    - Per-instance hit/miss counters
    - Graceful fallback when sentence-transformers is absent
    """

    def __init__(self, cfg: Any) -> None:
        # cfg: router Config-like object; only the cache_* attributes are read.
        self._cfg = cfg
        # Storage backend (memory / sqlite / redis); resolved in init().
        self._backend: Any = None
        # EmbeddingCache instance; dummy provider in exact-match mode.
        self._emb_cache: Any = None
        # True only when similarity < 1.0 AND sentence-transformers is importable.
        self._semantic: bool = False
        self._hits: int = 0
        self._misses: int = 0

    async def init(self) -> None:
        """Resolve backend and embedding provider from config (async setup)."""
        from semantic_llm_cache.similarity import EmbeddingCache
        # --- Backend ---
        backend_type: str = self._cfg.cache_backend
        if backend_type == "sqlite":
            from semantic_llm_cache.backends.sqlite import SQLiteBackend
            self._backend = SQLiteBackend(db_path=self._cfg.cache_db_path)
        elif backend_type == "redis":
            from semantic_llm_cache.backends.redis import RedisBackend
            self._backend = RedisBackend(url=self._cfg.cache_redis_url)
            # Fail fast at startup if Redis is unreachable.
            await self._backend.ping()
        else:
            # Any unrecognised value falls back to the in-process memory backend.
            from semantic_llm_cache.backends.memory import MemoryBackend
            self._backend = MemoryBackend()
        # --- Embedding provider ---
        if self._cfg.cache_similarity < 1.0:
            if _check_sentence_transformers():
                from semantic_llm_cache.similarity import create_embedding_provider
                provider = create_embedding_provider("sentence-transformer")
                self._emb_cache = EmbeddingCache(provider=provider)
                self._semantic = True
                print(
                    f"[cache] Semantic cache ready "
                    f"(similarity≥{self._cfg.cache_similarity}, backend={backend_type})"
                )
            else:
                # Lean image: semantic similarity configured but the ML stack
                # is missing — degrade to exact match instead of failing startup.
                warnings.warn(
                    "[cache] sentence-transformers is not installed. "
                    "Falling back to exact-match caching (similarity=1.0). "
                    "Use the :semantic Docker image tag to enable semantic caching.",
                    RuntimeWarning,
                    stacklevel=2,
                )
                self._emb_cache = EmbeddingCache()  # DummyEmbeddingProvider
                print(f"[cache] Exact-match cache ready (backend={backend_type}) [semantic unavailable]")
        else:
            self._emb_cache = EmbeddingCache()  # DummyEmbeddingProvider
            print(f"[cache] Exact-match cache ready (backend={backend_type})")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _namespace(self, route: str, model: str, system: str) -> str:
        """16-hex-char namespace; exact isolation per route/model/system prompt."""
        raw = f"{route}::{model}::{system}"
        return hashlib.sha256(raw.encode()).hexdigest()[:16]

    def _cache_key(self, namespace: str, last_user: str) -> str:
        """Exact-lookup key from the normalised last user message + namespace."""
        from semantic_llm_cache.utils import hash_prompt, normalize_prompt
        return hash_prompt(normalize_prompt(last_user), namespace)

    def _parse_messages(
        self, messages: list[dict]
    ) -> tuple[str, list[dict], str]:
        """
        Returns (system_prompt, prior_history_turns, last_user_message).
        Multimodal content lists are reduced to their text parts.
        """
        system = ""
        turns: list[dict] = []
        for m in messages:
            role = m.get("role", "")
            content = m.get("content", "")
            if isinstance(content, list):
                # Multimodal message: keep only the text fragments.
                content = " ".join(
                    p.get("text", "")
                    for p in content
                    if isinstance(p, dict) and p.get("type") == "text"
                )
            if role == "system":
                # NOTE: if multiple system messages appear, the last one wins.
                system = content
            else:
                turns.append({"role": role, "content": content})
        last_user = ""
        for m in reversed(turns):
            if m["role"] == "user":
                last_user = m["content"]
                break
        # History = all turns before the final user message
        history = turns[:-1] if turns and turns[-1]["role"] == "user" else turns
        return system, history, last_user

    async def _build_embedding(
        self, history: list[dict], last_user: str
    ) -> list[float] | None:
        """
        Weighted mean of BM25-weighted history embedding and last-user embedding.
        Returns None when not in semantic mode.
        """
        if not self._semantic:
            return None
        import numpy as np
        alpha: float = self._cfg.cache_history_weight  # weight for history signal
        q_vec = np.array(await self._emb_cache.aencode(last_user), dtype=float)
        if not history:
            # No history → use question embedding alone (alpha has no effect)
            return q_vec.tolist()
        h_text = _bm25_weighted_text(history)
        h_vec = np.array(await self._emb_cache.aencode(h_text), dtype=float)
        combined = alpha * h_vec + (1.0 - alpha) * q_vec
        # Re-normalise so cosine-similarity comparisons stay well-defined.
        norm = float(np.linalg.norm(combined))
        if norm > 0.0:
            combined /= norm
        return combined.tolist()

    # ------------------------------------------------------------------
    # Public interface: chat (handles both Ollama and OpenAI message lists)
    # ------------------------------------------------------------------
    async def get_chat(
        self, route: str, model: str, messages: list[dict]
    ) -> bytes | None:
        """Return cached response bytes, or None on miss."""
        if not self._backend:
            # init() never ran (or failed) — treat as a permanent miss.
            return None
        system, history, last_user = self._parse_messages(messages)
        if not last_user:
            return None
        ns = self._namespace(route, model, system)
        key = self._cache_key(ns, last_user)
        # 1. Exact key match
        entry = await self._backend.get(key)
        if entry is not None:
            self._hits += 1
            return entry.response  # type: ignore[return-value]
        # 2. Semantic similarity match
        if self._semantic and self._cfg.cache_similarity < 1.0:
            emb = await self._build_embedding(history, last_user)
            result = await self._backend.find_similar(
                emb, threshold=self._cfg.cache_similarity, namespace=ns
            )
            if result is not None:
                # result shape assumed (key, entry, score) — from the backend API.
                _, matched, _ = result
                self._hits += 1
                return matched.response  # type: ignore[return-value]
        self._misses += 1
        return None

    async def set_chat(
        self, route: str, model: str, messages: list[dict], response_bytes: bytes
    ) -> None:
        """Store a response in the cache (fire-and-forget friendly)."""
        if not self._backend:
            return
        system, history, last_user = self._parse_messages(messages)
        if not last_user:
            # Nothing to key on — skip silently.
            return
        ns = self._namespace(route, model, system)
        key = self._cache_key(ns, last_user)
        emb = (
            await self._build_embedding(history, last_user)
            if self._semantic and self._cfg.cache_similarity < 1.0
            else None
        )
        from semantic_llm_cache.config import CacheEntry
        await self._backend.set(
            key,
            CacheEntry(
                prompt=last_user,
                response=response_bytes,
                embedding=emb,
                created_at=time.time(),
                ttl=self._cfg.cache_ttl,
                namespace=ns,
                hit_count=0,
            ),
        )

    # ------------------------------------------------------------------
    # Convenience wrappers for the generate route (prompt string, not messages)
    # ------------------------------------------------------------------
    async def get_generate(
        self, model: str, prompt: str, system: str = ""
    ) -> bytes | None:
        """Lookup for /api/generate: wraps prompt/system as a message list."""
        messages: list[dict] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        return await self.get_chat("generate", model, messages)

    async def set_generate(
        self, model: str, prompt: str, system: str, response_bytes: bytes
    ) -> None:
        """Store for /api/generate: wraps prompt/system as a message list."""
        messages: list[dict] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        await self.set_chat("generate", model, messages, response_bytes)

    # ------------------------------------------------------------------
    # Management
    # ------------------------------------------------------------------
    def stats(self) -> dict:
        """Hit/miss counters plus the effective cache configuration."""
        total = self._hits + self._misses
        return {
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": round(self._hits / total, 3) if total else 0.0,
            "semantic": self._semantic,
            "backend": self._cfg.cache_backend,
            "similarity_threshold": self._cfg.cache_similarity,
            "history_weight": self._cfg.cache_history_weight,
        }

    async def clear(self) -> None:
        """Drop all cached entries and reset the hit/miss counters."""
        if self._backend:
            await self._backend.clear()
        self._hits = 0
        self._misses = 0
# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
_cache: LLMCache | None = None


async def init_llm_cache(cfg: Any) -> LLMCache | None:
    """
    Initialise the module-level cache singleton. Returns None if disabled.

    The global is assigned only after init() completes successfully, so a
    failed backend initialisation (e.g. unreachable Redis raising in ping())
    no longer leaves get_llm_cache() returning a half-initialised instance.
    """
    global _cache
    if not cfg.cache_enabled:
        print("[cache] Cache disabled (cache_enabled=false).")
        return None
    cache = LLMCache(cfg)
    await cache.init()  # may raise — the global then stays None
    _cache = cache
    return cache


def get_llm_cache() -> LLMCache | None:
    """Return the initialised cache singleton, or None when caching is off."""
    return _cache
# ---------------------------------------------------------------------------
# Helper: convert a stored Ollama-format non-streaming response to an
# OpenAI SSE single-chunk stream (used when a streaming OpenAI request
# hits the cache whose entry was populated from a non-streaming response).
# ---------------------------------------------------------------------------
def openai_nonstream_to_sse(cached_bytes: bytes, model: str) -> bytes:
    """
    Adapt a stored OpenAI ChatCompletion JSON for streaming clients.

    Cache entries always hold the non-streaming ChatCompletion format (so
    non-streaming hits can be served verbatim); this wraps that payload as a
    minimal single-chunk SSE stream terminated by the [DONE] sentinel.
    """
    import orjson, time as _time
    try:
        payload = orjson.loads(cached_bytes)
        first_choice = (payload.get("choices") or [{}])[0]
        text = first_choice.get("message", {}).get("content", "")
        # One delta chunk carrying the entire cached completion.
        chunk = {
            "id": payload.get("id", "cache-hit"),
            "object": "chat.completion.chunk",
            "created": payload.get("created", int(_time.time())),
            "model": payload.get("model", model),
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": text},
                    "finish_reason": "stop",
                }
            ],
        }
        usage = payload.get("usage")
        if usage:
            chunk["usage"] = usage
        return f"data: {orjson.dumps(chunk).decode()}\n\ndata: [DONE]\n\n".encode()
    except Exception as exc:
        warnings.warn(
            f"[cache] openai_nonstream_to_sse: corrupt cache entry, returning empty stream: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return b"data: [DONE]\n\n"

View file

@ -6,7 +6,7 @@ endpoints:
- https://api.openai.com/v1 - https://api.openai.com/v1
llama_server_endpoints: llama_server_endpoints:
- http://192.168.0.33:8889/v1 - http://192.168.0.50:8889/v1
# Maximum concurrent connections *per endpoint↔model pair* (equals to OLLAMA_NUM_PARALLEL) # Maximum concurrent connections *per endpoint↔model pair* (equals to OLLAMA_NUM_PARALLEL)
max_concurrent_connections: 2 max_concurrent_connections: 2
@ -22,4 +22,38 @@ api_keys:
"http://192.168.0.51:11434": "ollama" "http://192.168.0.51:11434": "ollama"
"http://192.168.0.52:11434": "ollama" "http://192.168.0.52:11434": "ollama"
"https://api.openai.com/v1": "${OPENAI_KEY}" "https://api.openai.com/v1": "${OPENAI_KEY}"
"http://192.168.0.33:8889/v1": "llama" "http://192.168.0.50:8889/v1": "llama"
# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
cache_enabled: true
# Backend — where cached responses are stored:
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
# sqlite → persistent file-based (single instance, survives restart)
# redis → distributed (shared across replicas, requires Redis)
cache_backend: memory
# Cosine similarity threshold for a cache hit:
# 1.0 → exact match only (works on any image variant)
# <1.0 → semantic matching (requires the :semantic Docker image tag)
cache_similarity: 0.9
# Response TTL in seconds. Remove the key or set to null to cache forever.
cache_ttl: 3600
# SQLite backend: path to the cache database file
cache_db_path: llm_cache.db
# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
# 0.3 = 30% history context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3

View file

@ -204,6 +204,149 @@ max_concurrent_connections: 3
**Recommendation**: Use multiple endpoints for redundancy and load distribution. **Recommendation**: Use multiple endpoints for redundancy and load distribution.
## Semantic LLM Cache
NOMYO Router can cache LLM responses and serve them directly — skipping endpoint selection, model load, and token generation entirely.
### How it works
1. On every cacheable request (`/api/chat`, `/api/generate`, `/v1/chat/completions`, `/v1/completions`) the cache is checked **before** choosing an endpoint.
2. On a **cache hit** the stored response is returned immediately as a single chunk (streaming or non-streaming — both work).
3. On a **cache miss** the request is forwarded normally. The response is stored in the cache after it completes.
4. **MOE requests** (`moe-*` model prefix) always bypass the cache.
5. **Token counts** are never recorded for cache hits.
### Cache key strategy
| Signal | How matched |
|---|---|
| `model + system_prompt` | Exact — hard context isolation per deployment |
| BM25-weighted embedding of chat history | Semantic — conversation context signal |
| Embedding of last user message | Semantic — the actual question |
The two semantic vectors are combined as a weighted mean (tuned by `cache_history_weight`) before cosine similarity comparison, staying at a single 384-dimensional vector compatible with the library's storage format.
### Quick start — exact match (lean image)
```yaml
cache_enabled: true
cache_backend: sqlite # persists across restarts
cache_similarity: 1.0 # exact match only, no sentence-transformers needed
cache_ttl: 3600
```
### Quick start — semantic matching (:semantic image)
```yaml
cache_enabled: true
cache_backend: sqlite
cache_similarity: 0.90 # hit if ≥90% cosine similarity
cache_ttl: 3600
cache_history_weight: 0.3
```
Pull the semantic image:
```bash
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
```
### Cache configuration options
#### `cache_enabled`
**Type**: `bool` | **Default**: `false`
Enable or disable the cache. All other cache settings are ignored when `false`.
#### `cache_backend`
**Type**: `str` | **Default**: `"memory"`
| Value | Description | Persists | Multi-replica |
|---|---|---|---|
| `memory` | In-process LRU dict | ❌ | ❌ |
| `sqlite` | File-based via `aiosqlite` | ✅ | ❌ |
| `redis` | Redis via `redis.asyncio` | ✅ | ✅ |
Use `redis` when running multiple router replicas behind a load balancer — all replicas share one warm cache.
#### `cache_similarity`
**Type**: `float` | **Default**: `1.0`
Cosine similarity threshold. `1.0` means exact match only (no embedding model needed). Values below `1.0` enable semantic matching, which requires the `:semantic` Docker image tag.
Recommended starting value for semantic mode: `0.90`.
#### `cache_ttl`
**Type**: `int | null` | **Default**: `3600`
Time-to-live for cache entries in seconds. Remove the key or set to `null` to cache forever.
#### `cache_db_path`
**Type**: `str` | **Default**: `"llm_cache.db"`
Path to the SQLite cache database. Only used when `cache_backend: sqlite`.
#### `cache_redis_url`
**Type**: `str` | **Default**: `"redis://localhost:6379/0"`
Redis connection URL. Only used when `cache_backend: redis`.
#### `cache_history_weight`
**Type**: `float` | **Default**: `0.3`
Weight of the BM25-weighted chat-history embedding in the combined cache key vector. `0.3` means the history contributes 30% and the final user message contributes 70% of the similarity signal. Only used when `cache_similarity < 1.0`.
### Cache management endpoints
| Endpoint | Method | Description |
|---|---|---|
| `/api/cache/stats` | `GET` | Hit/miss counters, hit rate, current config |
| `/api/cache/invalidate` | `POST` | Clear all cache entries and reset counters |
```bash
# Check cache performance
curl http://localhost:12434/api/cache/stats
# Clear the cache
curl -X POST http://localhost:12434/api/cache/invalidate
```
Example stats response:
```json
{
"enabled": true,
"hits": 1547,
"misses": 892,
"hit_rate": 0.634,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.9,
"history_weight": 0.3
}
```
### Docker image variants
| Tag | Semantic cache | Image size |
|---|---|---|
| `latest` | ❌ exact match only | ~300 MB |
| `latest-semantic` | ✅ sentence-transformers + model pre-baked | ~800 MB |
Build locally:
```bash
# Lean (exact match)
docker build -t nomyo-router .
# Semantic (~500 MB larger, all-MiniLM-L6-v2 model baked in)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
```
## Configuration Validation ## Configuration Validation
The router validates the configuration at startup: The router validates the configuration at startup:

View file

@ -82,10 +82,23 @@ sudo systemctl status nomyo-router
## 2. Docker Deployment ## 2. Docker Deployment
### Image variants
| Tag | Semantic cache | Image size |
|---|---|---|
| `latest` | ❌ exact match only | ~300 MB |
| `latest-semantic` | ✅ sentence-transformers + `all-MiniLM-L6-v2` pre-baked | ~800 MB |
The `:semantic` variant enables `cache_similarity < 1.0` in `config.yaml`. The lean image falls back to exact-match caching with a warning if semantic mode is configured.
### Build the Image ### Build the Image
```bash ```bash
# Lean build (exact match cache, default)
docker build -t nomyo-router . docker build -t nomyo-router .
# Semantic build (~500 MB larger, all-MiniLM-L6-v2 model baked in at build time)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
``` ```
### Run the Container ### Run the Container

View file

@ -1,20 +1,30 @@
# Docker Compose example for NOMYO Router with multiple Ollama instances # Docker Compose example for NOMYO Router with multiple Ollama instances
#
# Two router profiles are provided:
# nomyo-router — lean image, exact-match cache only (~300 MB)
# nomyo-router-semantic — semantic image, sentence-transformers baked in (~800 MB)
#
# Uncomment the redis service and set cache_backend: redis in config.yaml
# to share the LLM response cache across multiple router replicas.
version: '3.8' version: '3.8'
services: services:
# NOMYO Router # NOMYO Router — lean image (exact-match cache, default)
nomyo-router: nomyo-router:
image: nomyo-router:latest image: nomyo-router:latest
build: . build:
context: .
args:
SEMANTIC_CACHE: "false"
ports: ports:
- "12434:12434" - "12434:12434"
environment: environment:
- CONFIG_PATH=/app/config/config.yaml - CONFIG_PATH=/app/config/config.yaml
- NOMYO_ROUTER_DB_PATH=/app/token_counts.db - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
volumes: volumes:
- ./config:/app/config - ./config:/app/config
- router-db:/app/token_counts.db - router-data:/app/data
depends_on: depends_on:
- ollama1 - ollama1
- ollama2 - ollama2
@ -23,6 +33,45 @@ services:
networks: networks:
- nomyo-net - nomyo-net
# NOMYO Router — semantic image (cache_similarity < 1.0 support, ~800 MB)
# Build: docker compose build nomyo-router-semantic
# Switch: comment out nomyo-router above, uncomment this block.
# nomyo-router-semantic:
# image: nomyo-router:semantic
# build:
# context: .
# args:
# SEMANTIC_CACHE: "true"
# ports:
# - "12434:12434"
# environment:
# - CONFIG_PATH=/app/config/config.yaml
# - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
# volumes:
# - ./config:/app/config
# - router-data:/app/data
# - hf-cache:/app/data/hf_cache # share HuggingFace model cache across builds
# depends_on:
# - ollama1
# - ollama2
# - ollama3
# restart: unless-stopped
# networks:
# - nomyo-net
# Optional: Redis for shared LLM response cache across multiple router replicas.
# Requires cache_backend: redis in config.yaml.
# redis:
# image: redis:7-alpine
# ports:
# - "6379:6379"
# volumes:
# - redis-data:/data
# command: redis-server --save 60 1 --loglevel warning
# restart: unless-stopped
# networks:
# - nomyo-net
# Ollama Instance 1 # Ollama Instance 1
ollama1: ollama1:
image: ollama/ollama:latest image: ollama/ollama:latest
@ -87,7 +136,9 @@ services:
- nomyo-net - nomyo-net
volumes: volumes:
router-db: router-data:
# hf-cache: # uncomment when using nomyo-router-semantic
# redis-data: # uncomment when using Redis cache backend
ollama1-data: ollama1-data:
ollama2-data: ollama2-data:
ollama3-data: ollama3-data:

View file

@ -30,3 +30,37 @@ api_keys:
"https://api.openai.com/v1": "${OPENAI_KEY}" "https://api.openai.com/v1": "${OPENAI_KEY}"
"http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config "http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config
"http://192.168.0.33:8081/v1": "llama-server" "http://192.168.0.33:8081/v1": "llama-server"
# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
# cache_enabled: false
# Backend — where cached responses are stored:
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
# sqlite → persistent file-based (single instance, survives restart)
# redis → distributed (shared across replicas, requires Redis)
# cache_backend: memory
# Cosine similarity threshold for a cache hit:
# 1.0 → exact match only (works on any image variant)
# <1.0 → semantic matching (requires the :semantic Docker image tag)
# cache_similarity: 1.0
# Response TTL in seconds. Remove the key or set to null to cache forever.
# cache_ttl: 3600
# SQLite backend: path to the cache database file
# cache_db_path: llm_cache.db
# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
# 0.3 = 30% history context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3

View file

@ -133,6 +133,39 @@ Response:
} }
``` ```
### Cache Statistics
```bash
curl http://localhost:12434/api/cache/stats
```
Response when cache is enabled:
```json
{
"enabled": true,
"hits": 1547,
"misses": 892,
"hit_rate": 0.634,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.9,
"history_weight": 0.3
}
```
Response when cache is disabled:
```json
{ "enabled": false }
```
### Cache Invalidation
```bash
curl -X POST http://localhost:12434/api/cache/invalidate
```
Clears all cached entries and resets hit/miss counters.
### Real-time Usage Stream ### Real-time Usage Stream
```bash ```bash

View file

@ -39,3 +39,8 @@ uvicorn==0.38.0
uvloop uvloop
yarl==1.20.1 yarl==1.20.1
aiosqlite aiosqlite
# Semantic LLM cache — base install (exact-match mode, no heavy ML deps)
# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.:
# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0
semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git

214
router.py
View file

@ -123,6 +123,22 @@ class Config(BaseSettings):
# Database configuration # Database configuration
db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db")) db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))
# Semantic LLM Cache configuration
cache_enabled: bool = Field(default=False)
# Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed)
cache_backend: str = Field(default="memory")
# Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image)
cache_similarity: float = Field(default=1.0)
# TTL in seconds; None = cache forever
cache_ttl: Optional[int] = Field(default=3600)
# SQLite backend: path to cache database file
cache_db_path: str = Field(default="llm_cache.db")
# Redis backend: connection URL
cache_redis_url: str = Field(default="redis://localhost:6379/0")
# Weight of BM25-weighted chat-history embedding vs last-user-message embedding
# 0.3 = 30% history context signal, 70% question signal
cache_history_weight: float = Field(default=0.3)
class Config: class Config:
# Load from `config.yaml` first, then from env variables # Load from `config.yaml` first, then from env variables
env_prefix = "NOMYO_ROUTER_" env_prefix = "NOMYO_ROUTER_"
@ -188,6 +204,7 @@ def _config_path_from_env() -> Path:
from ollama._types import TokenLogprob, Logprob from ollama._types import TokenLogprob, Logprob
from db import TokenDatabase from db import TokenDatabase
from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse
# Create the global config object it will be overwritten on startup # Create the global config object it will be overwritten on startup
@ -1596,6 +1613,14 @@ async def proxy(request: Request):
error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted." error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
raise HTTPException(status_code=400, detail=error_msg) from e raise HTTPException(status_code=400, detail=error_msg) from e
# Cache lookup — before endpoint selection so no slot is wasted on a hit
_cache = get_llm_cache()
if _cache is not None:
_cached = await _cache.get_generate(model, prompt, system or "")
if _cached is not None:
async def _serve_cached_generate():
yield _cached
return StreamingResponse(_serve_cached_generate(), media_type="application/json")
endpoint, tracking_model = await choose_endpoint(model) endpoint, tracking_model = await choose_endpoint(model)
use_openai = is_openai_compatible(endpoint) use_openai = is_openai_compatible(endpoint)
@ -1633,6 +1658,7 @@ async def proxy(request: Request):
else: else:
async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive) async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
if stream == True: if stream == True:
content_parts: list[str] = []
async for chunk in async_gen: async for chunk in async_gen:
if use_openai: if use_openai:
chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts) chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
@ -1644,6 +1670,27 @@ async def proxy(request: Request):
json_line = chunk.model_dump_json() json_line = chunk.model_dump_json()
else: else:
json_line = orjson.dumps(chunk) json_line = orjson.dumps(chunk)
# Accumulate and store cache on done chunk — before yield so it always runs
if _cache is not None:
if getattr(chunk, "response", None):
content_parts.append(chunk.response)
if getattr(chunk, "done", False):
assembled = orjson.dumps({
k: v for k, v in {
"model": getattr(chunk, "model", model),
"response": "".join(content_parts),
"done": True,
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
"eval_count": getattr(chunk, "eval_count", None),
"total_duration": getattr(chunk, "total_duration", None),
"eval_duration": getattr(chunk, "eval_duration", None),
}.items() if v is not None
}) + b"\n"
try:
await _cache.set_generate(model, prompt, system or "", assembled)
except Exception as _ce:
print(f"[cache] set_generate (streaming) failed: {_ce}")
yield json_line.encode("utf-8") + b"\n" yield json_line.encode("utf-8") + b"\n"
else: else:
if use_openai: if use_openai:
@ -1660,7 +1707,14 @@ async def proxy(request: Request):
if hasattr(async_gen, "model_dump_json") if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen) else orjson.dumps(async_gen)
) )
yield json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None:
try:
await _cache.set_generate(model, prompt, system or "", cache_bytes)
except Exception as _ce:
print(f"[cache] set_generate (non-streaming) failed: {_ce}")
finally: finally:
# Ensure counter is decremented even if an exception occurs # Ensure counter is decremented even if an exception occurs
@ -1711,6 +1765,26 @@ async def chat_proxy(request: Request):
except orjson.JSONDecodeError as e: except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Cache lookup — before endpoint selection, always bypassed for MOE
_is_moe = model.startswith("moe-")
_cache = get_llm_cache()
# Normalise model name for cache key: strip ":latest" suffix here so that
# get_chat and set_chat use the same model string regardless of when the
# strip happens further down (line ~1793 strips it for OpenAI endpoints).
_cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
# Snapshot original messages before any OpenAI-format transformation so that
# get_chat and set_chat always use the same key regardless of backend type.
_cache_messages = messages
if _cache is not None and not _is_moe:
_cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
if _cached is not None:
async def _serve_cached_chat():
yield _cached
return StreamingResponse(
_serve_cached_chat(),
media_type="application/x-ndjson" if stream else "application/json",
)
# 2. Endpoint logic # 2. Endpoint logic
if model.startswith("moe-"): if model.startswith("moe-"):
model = model.split("moe-")[1] model = model.split("moe-")[1]
@ -1764,6 +1838,7 @@ async def chat_proxy(request: Request):
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs) async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
if stream == True: if stream == True:
tc_acc = {} # accumulate OpenAI tool-call deltas across chunks tc_acc = {} # accumulate OpenAI tool-call deltas across chunks
content_parts: list[str] = []
async for chunk in async_gen: async for chunk in async_gen:
if use_openai: if use_openai:
_accumulate_openai_tc_delta(chunk, tc_acc) _accumulate_openai_tc_delta(chunk, tc_acc)
@ -1780,6 +1855,30 @@ async def chat_proxy(request: Request):
json_line = chunk.model_dump_json() json_line = chunk.model_dump_json()
else: else:
json_line = orjson.dumps(chunk) json_line = orjson.dumps(chunk)
# Accumulate and store cache on done chunk — before yield so it always runs
# Works for both Ollama-native and OpenAI-compatible backends; chunks are
# already converted to Ollama format by rechunk before this point.
if _cache is not None and not _is_moe:
if chunk.message and getattr(chunk.message, "content", None):
content_parts.append(chunk.message.content)
if getattr(chunk, "done", False):
assembled = orjson.dumps({
k: v for k, v in {
"model": getattr(chunk, "model", model),
"created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)),
"message": {"role": "assistant", "content": "".join(content_parts)},
"done": True,
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
"eval_count": getattr(chunk, "eval_count", None),
"total_duration": getattr(chunk, "total_duration", None),
"eval_duration": getattr(chunk, "eval_duration", None),
}.items() if v is not None
}) + b"\n"
try:
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}")
yield json_line.encode("utf-8") + b"\n" yield json_line.encode("utf-8") + b"\n"
else: else:
if use_openai: if use_openai:
@ -1796,7 +1895,14 @@ async def chat_proxy(request: Request):
if hasattr(async_gen, "model_dump_json") if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen) else orjson.dumps(async_gen)
) )
yield json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
if _cache is not None and not _is_moe:
try:
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}")
finally: finally:
# Ensure counter is decremented even if an exception occurs # Ensure counter is decremented even if an exception occurs
@ -2680,6 +2786,21 @@ async def openai_chat_completions_proxy(request: Request):
except orjson.JSONDecodeError as e: except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Cache lookup — before endpoint selection
_cache = get_llm_cache()
if _cache is not None:
_cached = await _cache.get_chat("openai_chat", model, messages)
if _cached is not None:
if stream:
_sse = openai_nonstream_to_sse(_cached, model)
async def _serve_cached_ochat_stream():
yield _sse
return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream")
else:
async def _serve_cached_ochat_json():
yield _cached
return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")
# 2. Endpoint logic # 2. Endpoint logic
endpoint, tracking_model = await choose_endpoint(model) endpoint, tracking_model = await choose_endpoint(model)
base_url = ep2base(endpoint) base_url = ep2base(endpoint)
@ -2699,6 +2820,8 @@ async def openai_chat_completions_proxy(request: Request):
else: else:
raise raise
if stream == True: if stream == True:
content_parts: list[str] = []
usage_snapshot: dict = {}
async for chunk in async_gen: async for chunk in async_gen:
data = ( data = (
chunk.model_dump_json() chunk.model_dump_json()
@ -2715,6 +2838,8 @@ async def openai_chat_completions_proxy(request: Request):
has_tool_calls = getattr(delta, "tool_calls", None) is not None has_tool_calls = getattr(delta, "tool_calls", None) is not None
if has_content or has_reasoning or has_tool_calls: if has_content or has_reasoning or has_tool_calls:
yield f"data: {data}\n\n".encode("utf-8") yield f"data: {data}\n\n".encode("utf-8")
if has_content and delta.content:
content_parts.append(delta.content)
elif chunk.usage is not None: elif chunk.usage is not None:
# Forward the usage-only final chunk (e.g. from llama-server) # Forward the usage-only final chunk (e.g. from llama-server)
yield f"data: {data}\n\n".encode("utf-8") yield f"data: {data}\n\n".encode("utf-8")
@ -2723,12 +2848,24 @@ async def openai_chat_completions_proxy(request: Request):
if chunk.usage is not None: if chunk.usage is not None:
prompt_tok = chunk.usage.prompt_tokens or 0 prompt_tok = chunk.usage.prompt_tokens or 0
comp_tok = chunk.usage.completion_tokens or 0 comp_tok = chunk.usage.completion_tokens or 0
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
else: else:
llama_usage = rechunk.extract_usage_from_llama_timings(chunk) llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
if llama_usage: if llama_usage:
prompt_tok, comp_tok = llama_usage prompt_tok, comp_tok = llama_usage
if prompt_tok != 0 or comp_tok != 0: if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and content_parts:
assembled = orjson.dumps({
"model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
**({"usage": usage_snapshot} if usage_snapshot else {}),
}) + b"\n"
try:
await _cache.set_chat("openai_chat", model, messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}")
yield b"data: [DONE]\n\n" yield b"data: [DONE]\n\n"
else: else:
prompt_tok = 0 prompt_tok = 0
@ -2747,7 +2884,14 @@ async def openai_chat_completions_proxy(request: Request):
if hasattr(async_gen, "model_dump_json") if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen) else orjson.dumps(async_gen)
) )
yield json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None:
try:
await _cache.set_chat("openai_chat", model, messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}")
finally: finally:
# Ensure counter is decremented even if an exception occurs # Ensure counter is decremented even if an exception occurs
@ -2823,6 +2967,22 @@ async def openai_completions_proxy(request: Request):
except orjson.JSONDecodeError as e: except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Cache lookup — completions prompt mapped to a single-turn messages list
_cache = get_llm_cache()
_compl_messages = [{"role": "user", "content": prompt}]
if _cache is not None:
_cached = await _cache.get_chat("openai_completions", model, _compl_messages)
if _cached is not None:
if stream:
_sse = openai_nonstream_to_sse(_cached, model)
async def _serve_cached_ocompl_stream():
yield _sse
return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream")
else:
async def _serve_cached_ocompl_json():
yield _cached
return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json")
# 2. Endpoint logic # 2. Endpoint logic
endpoint, tracking_model = await choose_endpoint(model) endpoint, tracking_model = await choose_endpoint(model)
base_url = ep2base(endpoint) base_url = ep2base(endpoint)
@ -2834,6 +2994,8 @@ async def openai_completions_proxy(request: Request):
# The chat method returns a generator of dicts (or GenerateResponse) # The chat method returns a generator of dicts (or GenerateResponse)
async_gen = await oclient.completions.create(**params) async_gen = await oclient.completions.create(**params)
if stream == True: if stream == True:
text_parts: list[str] = []
usage_snapshot: dict = {}
async for chunk in async_gen: async for chunk in async_gen:
data = ( data = (
chunk.model_dump_json() chunk.model_dump_json()
@ -2849,6 +3011,8 @@ async def openai_completions_proxy(request: Request):
) )
if has_text or has_reasoning or choice.finish_reason is not None: if has_text or has_reasoning or choice.finish_reason is not None:
yield f"data: {data}\n\n".encode("utf-8") yield f"data: {data}\n\n".encode("utf-8")
if has_text and choice.text:
text_parts.append(choice.text)
elif chunk.usage is not None: elif chunk.usage is not None:
# Forward the usage-only final chunk (e.g. from llama-server) # Forward the usage-only final chunk (e.g. from llama-server)
yield f"data: {data}\n\n".encode("utf-8") yield f"data: {data}\n\n".encode("utf-8")
@ -2857,12 +3021,24 @@ async def openai_completions_proxy(request: Request):
if chunk.usage is not None: if chunk.usage is not None:
prompt_tok = chunk.usage.prompt_tokens or 0 prompt_tok = chunk.usage.prompt_tokens or 0
comp_tok = chunk.usage.completion_tokens or 0 comp_tok = chunk.usage.completion_tokens or 0
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
else: else:
llama_usage = rechunk.extract_usage_from_llama_timings(chunk) llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
if llama_usage: if llama_usage:
prompt_tok, comp_tok = llama_usage prompt_tok, comp_tok = llama_usage
if prompt_tok != 0 or comp_tok != 0: if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and text_parts:
assembled = orjson.dumps({
"model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
**({"usage": usage_snapshot} if usage_snapshot else {}),
}) + b"\n"
try:
await _cache.set_chat("openai_completions", model, _compl_messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}")
# Final DONE event # Final DONE event
yield b"data: [DONE]\n\n" yield b"data: [DONE]\n\n"
else: else:
@ -2882,7 +3058,14 @@ async def openai_completions_proxy(request: Request):
if hasattr(async_gen, "model_dump_json") if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen) else orjson.dumps(async_gen)
) )
yield json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None:
try:
await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}")
finally: finally:
# Ensure counter is decremented even if an exception occurs # Ensure counter is decremented even if an exception occurs
@ -3076,6 +3259,28 @@ async def rerank_proxy(request: Request):
finally: finally:
await decrement_usage(endpoint, tracking_model) await decrement_usage(endpoint, tracking_model)
# -------------------------------------------------------------
# 25b. Cache management endpoints
# -------------------------------------------------------------
@app.get("/api/cache/stats")
async def cache_stats():
    """Report the LLM response cache's hit/miss counters and configuration.

    Returns ``{"enabled": False}`` when no cache has been initialised
    (``get_llm_cache()`` yields ``None``); otherwise merges
    ``enabled: True`` with the cache's own ``stats()`` dictionary
    (hits, misses, hit_rate, backend, thresholds, ...).
    """
    cache = get_llm_cache()
    if cache is not None:
        return {"enabled": True, **cache.stats()}
    return {"enabled": False}
@app.post("/api/cache/invalidate")
async def cache_invalidate():
    """Drop every entry from the LLM response cache and reset its counters.

    No-op reporting ``{"enabled": False, "cleared": False}`` when caching
    is disabled (``get_llm_cache()`` yields ``None``).
    """
    cache = get_llm_cache()
    enabled = cache is not None
    if enabled:
        # clear() wipes stored responses and resets the hit/miss stats
        await cache.clear()
    return {"enabled": enabled, "cleared": enabled}
# ------------------------------------------------------------- # -------------------------------------------------------------
# 26. Serve the static frontend # 26. Serve the static frontend
# ------------------------------------------------------------- # -------------------------------------------------------------
@ -3211,6 +3416,7 @@ async def startup_event() -> None:
app_state["session"] = session app_state["session"] = session
token_worker_task = asyncio.create_task(token_worker()) token_worker_task = asyncio.create_task(token_worker())
flush_task = asyncio.create_task(flush_buffer()) flush_task = asyncio.create_task(flush_buffer())
await init_llm_cache(config)
@app.on_event("shutdown") @app.on_event("shutdown")
async def shutdown_event() -> None: async def shutdown_event() -> None: