Merge pull request #35 from nomyo-ai/dev-v0.7.x-semcache

Dev v0.7.x semcache -> dev-v0.7.x
This commit is contained in:
Alpha Nerd 2026-03-11 09:40:55 +01:00 committed by GitHub
commit ca773d6ddb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 1773 additions and 25 deletions

44
.dockerignore Normal file
View file

@ -0,0 +1,44 @@
# Version control
.git
.gitignore
.github
# Environment & secrets
.env
.env.*
*.env
# Python artifacts
__pycache__
*.pyc
*.pyo
*.pyd
.Python
.venv
venv
*.egg-info
dist
build
# Local databases (don't bake data into image)
*.db
*.db-shm
*.db-wal
# IDE / editor
.vscode
.idea
*.swp
*.swo
# Documentation
doc/
docs/
*.md
# Tests
tests/
test_*
# Local config overrides
config.local.yaml

View file

@ -0,0 +1,71 @@
name: Build and Publish Docker Image (Semantic Cache)
# Builds the :semantic variant that includes sentence-transformers + CPU torch
# and the pre-baked all-MiniLM-L6-v2 embedding model (~500 MB larger than lean).
# Tags mirror the lean workflow but carry a -semantic suffix, e.g.:
# ghcr.io/nomyo-ai/nomyo-router:latest-semantic
# ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
# ghcr.io/nomyo-ai/nomyo-router:0.7-semantic
on:
push:
branches:
- main
tags:
- "v*.*.*"
workflow_dispatch:
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
jobs:
build-and-push-semantic:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up QEMU (for multi-arch builds)
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract Docker metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
# Versioned semantic tags on git tags (e.g. v0.7.0 → 0.7.0-semantic, 0.7-semantic)
type=semver,pattern={{version}}-semantic
type=semver,pattern={{major}}.{{minor}}-semantic
# latest-semantic only on main branch pushes
type=raw,value=latest-semantic,enable=${{ github.ref == 'refs/heads/main' }}
# SHA-tagged for traceability
type=sha,prefix=sha-,suffix=-semantic
- name: Build and push semantic Docker image
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64,linux/arm64
push: true
build-args: |
SEMANTIC_CACHE=true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

View file

@ -3,21 +3,43 @@ FROM python:3.13-slim
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1
# SEMANTIC_CACHE=true installs sentence-transformers + CPU-only torch and pre-bakes
# the all-MiniLM-L6-v2 embedding model (~500 MB extra). The resulting image is tagged
# :semantic. The default (lean) image supports exact-match caching only.
ARG SEMANTIC_CACHE=false
# Pin HuggingFace cache to a predictable path inside /app/data so it can be
# mounted as a volume and shared between builds.
ENV HF_HOME=/app/data/hf_cache
# Install SQLite
RUN apt-get update && apt-get install -y sqlite3
RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir -r requirements.txt
# Semantic cache deps — only installed when SEMANTIC_CACHE=true
# CPU-only torch must be installed before sentence-transformers to avoid
# pulling the full CUDA-enabled build (~2.5 GB).
RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
pip install --no-cache-dir sentence-transformers && \
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
fi
# Create database directory and set permissions
RUN mkdir -p /app/data && chown -R www-data:www-data /app/data
COPY . .
RUN chmod +x /app/entrypoint.sh
RUN chmod +x /app/entrypoint.sh && \
chown -R www-data:www-data /app
EXPOSE 12434
USER www-data
ENTRYPOINT ["/app/entrypoint.sh"]

View file

@ -74,22 +74,28 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop
### Pre-built image (GitHub Container Registry)
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release:
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release.
**Lean image** (exact-match cache, ~300 MB):
```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:latest
```
Specific version:
```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0
```
### Build the container image locally:
**Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):
```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
```
### Build the container image locally
```sh
# Lean build (exact match cache, default)
docker build -t nomyo-router .
# Semantic build — sentence-transformers + model baked in
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
```
Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container:
@ -124,6 +130,53 @@ This way the Ollama backend servers are utilized more efficient than by simply u
NOMYO Router also supports OpenAI API compatible v1 backend servers.
## Semantic LLM Cache
NOMYO Router includes an optional semantic cache that serves repeated or semantically similar LLM requests from cache — no endpoint round-trip, no token cost, response in <10 ms.
### Enable (exact match, any image)
```yaml
# config.yaml
cache_enabled: true
cache_backend: sqlite # persists across restarts
cache_similarity: 1.0 # exact match only
cache_ttl: 3600
```
### Enable (semantic matching, :semantic image)
```yaml
cache_enabled: true
cache_backend: sqlite
cache_similarity: 0.90 # "What is Python?" ≈ "What's Python?" → cache hit
cache_ttl: 3600
cache_history_weight: 0.3
```
Pull the semantic image:
```bash
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
```
### Cache key strategy
Each request is keyed on `model + system_prompt` (exact) combined with a weighted-mean embedding of BM25-weighted chat history (30%) and the last user message (70%). This means:
- Different system prompts → always separate cache namespaces (no cross-tenant leakage)
- Same question, different phrasing → cache hit (semantic mode)
- MOE requests (`moe-*`) → always bypass the cache
### Cached routes
`/api/chat` · `/api/generate` · `/v1/chat/completions` · `/v1/completions`
### Cache management
```bash
curl http://localhost:12434/api/cache/stats # hit rate, counters, config
curl -X POST http://localhost:12434/api/cache/invalidate # clear all entries
```
## Supplying the router API key
If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

557
cache.py Normal file
View file

@ -0,0 +1,557 @@
"""
LLM Semantic Cache for NOMYO Router.
Strategy:
- Namespace: sha256(route :: model :: system_prompt)[:16] exact context isolation
- Cache key: hash(normalize(last_user_message), namespace) exact lookup
- Embedding: weighted mean of
α * embed(bm25_weighted(chat_history)) conversation context
1-α * embed(last_user_message) the actual question
with α = cache_history_weight (default 0.3).
- Exact-match caching (similarity=1.0) uses DummyEmbeddingProvider zero extra deps.
- Semantic caching (similarity<1.0) requires sentence-transformers. If missing the
library falls back to exact-match with a warning (lean Docker image behaviour).
- MOE models (moe-*) always bypass the cache.
- Token counts are never recorded for cache hits.
- Streaming cache hits are served as a single-chunk response.
- Privacy protection: responses that echo back user-identifying tokens from the system
prompt (names, emails, IDs) are stored WITHOUT an embedding. They remain findable
by exact-match for the same user but are invisible to cross-user semantic search.
Generic responses (capital of France "Paris") keep their embeddings and can still
produce cross-user semantic hits as intended.
"""
import hashlib
import math
import re
import time
import warnings
from collections import Counter
from typing import Any, Optional
# Lazily resolved once at first embed() call
_semantic_available: Optional[bool] = None
def _check_sentence_transformers() -> bool:
global _semantic_available
if _semantic_available is None:
try:
import sentence_transformers # noqa: F401
_semantic_available = True
except ImportError:
_semantic_available = False
return _semantic_available # type: ignore[return-value]
# ---------------------------------------------------------------------------
# BM25-weighted text representation of chat history
# ---------------------------------------------------------------------------
def _bm25_weighted_text(history: list[dict]) -> str:
"""
Produce a BM25-importance-weighted text string from chat history turns.
High-IDF (rare, domain-specific) terms are repeated proportionally to
their BM25 score so the downstream sentence-transformer embedding
naturally upweights topical signal and downweights stop words.
"""
docs = [m.get("content", "") for m in history if m.get("content")]
if not docs:
return ""
def _tok(text: str) -> list[str]:
return [w.lower() for w in text.split() if len(w) > 2]
tokenized = [_tok(d) for d in docs]
N = len(tokenized)
df: Counter = Counter()
for tokens in tokenized:
for term in set(tokens):
df[term] += 1
k1, b = 1.5, 0.75
avg_dl = sum(len(t) for t in tokenized) / max(N, 1)
term_scores: Counter = Counter()
for tokens in tokenized:
tf_c = Counter(tokens)
dl = len(tokens)
for term, tf in tf_c.items():
idf = math.log((N + 1) / (df[term] + 1)) + 1.0
score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1)))
term_scores[term] += score
top = term_scores.most_common(50)
if not top:
return " ".join(docs)
max_s = top[0][1]
out: list[str] = []
for term, score in top:
out.extend([term] * max(1, round(3 * score / max_s)))
return " ".join(out)
# ---------------------------------------------------------------------------
# LLMCache
# ---------------------------------------------------------------------------
class LLMCache:
    """
    Thin async wrapper around async-semantic-llm-cache that adds:
    - Route-aware namespace isolation
    - Two-vector weighted-mean embedding (history context + question)
    - Per-instance hit/miss counters
    - Graceful fallback when sentence-transformers is absent
    """

    def __init__(self, cfg: Any) -> None:
        # cfg is the router configuration object; only its cache_* attributes
        # are read here. Backend/provider wiring is deferred to init().
        self._cfg = cfg
        self._backend: Any = None      # storage backend (memory / sqlite / redis)
        self._emb_cache: Any = None    # EmbeddingCache wrapping the provider
        self._semantic: bool = False   # True only when sentence-transformers is usable
        self._hits: int = 0
        self._misses: int = 0

    async def init(self) -> None:
        """
        Resolve the storage backend and embedding provider from config.

        Must be awaited once before the cache is used. Falls back to
        exact-match mode (with a RuntimeWarning) when semantic matching is
        configured but sentence-transformers is not installed.
        """
        from semantic_llm_cache.similarity import EmbeddingCache

        # --- Backend ---
        backend_type: str = self._cfg.cache_backend
        if backend_type == "sqlite":
            from semantic_llm_cache.backends.sqlite import SQLiteBackend

            self._backend = SQLiteBackend(db_path=self._cfg.cache_db_path)
        elif backend_type == "redis":
            from semantic_llm_cache.backends.redis import RedisBackend

            self._backend = RedisBackend(url=self._cfg.cache_redis_url)
            # Fail fast at startup if Redis is unreachable.
            await self._backend.ping()
        else:
            from semantic_llm_cache.backends.memory import MemoryBackend

            self._backend = MemoryBackend()
        # --- Embedding provider ---
        if self._cfg.cache_similarity < 1.0:
            if _check_sentence_transformers():
                from semantic_llm_cache.similarity import create_embedding_provider

                provider = create_embedding_provider("sentence-transformer")
                self._emb_cache = EmbeddingCache(provider=provider)
                self._semantic = True
                print(
                    f"[cache] Semantic cache ready "
                    f"(similarity≥{self._cfg.cache_similarity}, backend={backend_type})"
                )
            else:
                warnings.warn(
                    "[cache] sentence-transformers is not installed. "
                    "Falling back to exact-match caching (similarity=1.0). "
                    "Use the :semantic Docker image tag to enable semantic caching.",
                    RuntimeWarning,
                    stacklevel=2,
                )
                self._emb_cache = EmbeddingCache()  # DummyEmbeddingProvider
                print(f"[cache] Exact-match cache ready (backend={backend_type}) [semantic unavailable]")
        else:
            self._emb_cache = EmbeddingCache()  # DummyEmbeddingProvider
            print(f"[cache] Exact-match cache ready (backend={backend_type})")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _namespace(self, route: str, model: str, system: str) -> str:
        # Exact context isolation: any change in route, model or system prompt
        # yields a different namespace, so entries can never cross contexts.
        raw = f"{route}::{model}::{system}"
        return hashlib.sha256(raw.encode()).hexdigest()[:16]

    def _cache_key(self, namespace: str, last_user: str) -> str:
        # Exact-match lookup key: normalized last user message + namespace.
        from semantic_llm_cache.utils import hash_prompt, normalize_prompt

        return hash_prompt(normalize_prompt(last_user), namespace)

    # ------------------------------------------------------------------
    # Privacy helpers — prevent cross-user leakage of personal data
    # ------------------------------------------------------------------
    # Common words that appear in identity-tagged system-prompt lines but do
    # not themselves identify a user; never treated as personal tokens.
    _IDENTITY_STOPWORDS = frozenset({
        "user", "users", "name", "names", "email", "phone", "their", "they",
        "this", "that", "with", "from", "have", "been", "also", "more",
        "tags", "identity", "preference", "context",
    })
    # Patterns that unambiguously signal personal data in a response
    _EMAIL_RE = re.compile(r'\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b')
    _UUID_RE = re.compile(
        r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b',
        re.IGNORECASE,
    )
    # Standalone numeric run ≥ 8 digits (common user/account IDs)
    _NUMERIC_ID_RE = re.compile(r'\b\d{8,}\b')

    def _extract_response_content(self, response_bytes: bytes) -> str:
        """Parse response bytes (Ollama or OpenAI format) and return the text content."""
        try:
            import orjson

            data = orjson.loads(response_bytes)
            if "choices" in data:  # OpenAI ChatCompletion
                return (data["choices"][0].get("message") or {}).get("content", "")
            if "message" in data:  # Ollama chat
                return (data.get("message") or {}).get("content", "")
            if "response" in data:  # Ollama generate
                return data.get("response", "")
        except Exception:
            # Unparseable payload → treated as empty; caller decides what that means.
            pass
        return ""

    def _extract_personal_tokens(self, system: str) -> frozenset[str]:
        """
        Extract user-identifying tokens from the system prompt.
        Looks for:
        - Email addresses anywhere in the system prompt
        - Numeric IDs (≥ 4 digits) appearing after "id" keyword
        - Proper-noun-like words from [Tags: identity] lines
        """
        if not system:
            return frozenset()
        tokens: set[str] = set()
        # Email addresses
        for email in self._EMAIL_RE.findall(system):
            tokens.add(email.lower())
        # Numeric IDs: "id: 1234", "id=5678"
        for uid in re.findall(r'\bid\s*[=:]?\s*(\d{4,})\b', system, re.IGNORECASE):
            tokens.add(uid)
        # Values from [Tags: identity] lines (e.g. "User's name is Andreas")
        for line in re.findall(
            r'\[Tags:.*?identity.*?\]\s*(.+?)(?:\n|$)', system, re.IGNORECASE
        ):
            for word in re.findall(r'\b\w{4,}\b', line):
                w = word.lower()
                if w not in self._IDENTITY_STOPWORDS:
                    tokens.add(w)
        return frozenset(tokens)

    def _response_is_personalized(self, response_bytes: bytes, system: str) -> bool:
        """
        Return True if the response contains user-personal information.
        Two complementary checks:
        1. Direct PII detection in the response content — emails, UUIDs, long
           numeric IDs. Catches data retrieved at runtime via tool calls that
           never appears in the system prompt.
        2. System-prompt token overlap — words extracted from [Tags: identity]
           lines that reappear verbatim in the response (catches names, etc.).
        Such responses are stored WITHOUT a semantic embedding so they are
        invisible to cross-user semantic search while still being cacheable for
        the same user via exact-match.
        """
        content = self._extract_response_content(response_bytes)
        if not content:
            # Can't parse → err on the side of caution
            return bool(response_bytes)
        # 1. Direct PII patterns — independent of what's in the system prompt
        if (
            self._EMAIL_RE.search(content)
            or self._UUID_RE.search(content)
            or self._NUMERIC_ID_RE.search(content)
        ):
            return True
        # 2. System-prompt identity tokens echoed back in the response
        personal = self._extract_personal_tokens(system)
        if personal:
            content_lower = content.lower()
            if any(token in content_lower for token in personal):
                return True
        return False

    def _parse_messages(
        self, messages: list[dict]
    ) -> tuple[str, list[dict], str]:
        """
        Returns (system_prompt, prior_history_turns, last_user_message).
        Multimodal content lists are reduced to their text parts.
        """
        system = ""
        turns: list[dict] = []
        for m in messages:
            role = m.get("role", "")
            content = m.get("content", "")
            if isinstance(content, list):
                # Multimodal message: concatenate only the text parts.
                content = " ".join(
                    p.get("text", "")
                    for p in content
                    if isinstance(p, dict) and p.get("type") == "text"
                )
            if role == "system":
                # NOTE: if multiple system messages are present, the last one wins.
                system = content
            else:
                turns.append({"role": role, "content": content})
        last_user = ""
        for m in reversed(turns):
            if m["role"] == "user":
                last_user = m["content"]
                break
        # History = all turns before the final user message
        history = turns[:-1] if turns and turns[-1]["role"] == "user" else turns
        return system, history, last_user

    async def _build_embedding(
        self, history: list[dict], last_user: str
    ) -> list[float] | None:
        """
        Weighted mean of BM25-weighted history embedding and last-user embedding.
        Returns None when not in semantic mode.
        """
        if not self._semantic:
            return None
        import numpy as np

        alpha: float = self._cfg.cache_history_weight  # weight for history signal
        q_vec = np.array(await self._emb_cache.aencode(last_user), dtype=float)
        if not history:
            # No history → use question embedding alone (alpha has no effect)
            return q_vec.tolist()
        h_text = _bm25_weighted_text(history)
        h_vec = np.array(await self._emb_cache.aencode(h_text), dtype=float)
        combined = alpha * h_vec + (1.0 - alpha) * q_vec
        # L2-normalize so downstream cosine similarity behaves consistently.
        norm = float(np.linalg.norm(combined))
        if norm > 0.0:
            combined /= norm
        return combined.tolist()

    # ------------------------------------------------------------------
    # Public interface: chat (handles both Ollama and OpenAI message lists)
    # ------------------------------------------------------------------
    async def get_chat(
        self, route: str, model: str, messages: list[dict]
    ) -> bytes | None:
        """Return cached response bytes, or None on miss."""
        if not self._backend:
            # init() not run (or disabled) → always a miss, counters untouched.
            return None
        system, history, last_user = self._parse_messages(messages)
        if not last_user:
            return None
        ns = self._namespace(route, model, system)
        key = self._cache_key(ns, last_user)
        print(
            f"[cache] get_chat route={route} model={model} ns={ns} "
            f"prompt={last_user[:80]!r} "
            f"system_snippet={system[:120]!r}"
        )
        # 1. Exact key match
        entry = await self._backend.get(key)
        if entry is not None:
            self._hits += 1
            print(f"[cache] HIT (exact) ns={ns} prompt={last_user[:80]!r}")
            return entry.response  # type: ignore[return-value]
        # 2. Semantic similarity match
        if self._semantic and self._cfg.cache_similarity < 1.0:
            emb = await self._build_embedding(history, last_user)
            result = await self._backend.find_similar(
                emb, threshold=self._cfg.cache_similarity, namespace=ns
            )
            if result is not None:
                _, matched, sim = result
                self._hits += 1
                print(
                    f"[cache] HIT (semantic sim={sim:.3f}) ns={ns} "
                    f"prompt={last_user[:80]!r} matched={matched.prompt[:80]!r}"
                )
                return matched.response  # type: ignore[return-value]
        self._misses += 1
        print(f"[cache] MISS ns={ns} prompt={last_user[:80]!r}")
        return None

    async def set_chat(
        self, route: str, model: str, messages: list[dict], response_bytes: bytes
    ) -> None:
        """Store a response in the cache (fire-and-forget friendly)."""
        if not self._backend:
            return
        system, history, last_user = self._parse_messages(messages)
        if not last_user:
            return
        ns = self._namespace(route, model, system)
        key = self._cache_key(ns, last_user)
        # Privacy guard: check whether the response contains personal data.
        personalized = self._response_is_personalized(response_bytes, system)
        if personalized:
            # Exact-match is only safe when the system prompt is user-specific
            # (i.e. different per user → different namespace). When the system
            # prompt is generic and shared across all users, the namespace is the
            # same for everyone: storing even without an embedding would let any
            # user who asks the identical question get another user's personal data
            # via exact-match. In that case skip storage entirely.
            system_is_user_specific = bool(self._extract_personal_tokens(system))
            if not system_is_user_specific:
                print(
                    f"[cache] SKIP personalized response with generic system prompt "
                    f"route={route} model={model} ns={ns} prompt={last_user[:80]!r} "
                    f"system_snippet={system[:120]!r}"
                )
                return
        print(
            f"[cache] set_chat route={route} model={model} ns={ns} "
            f"personalized={personalized} "
            f"prompt={last_user[:80]!r} "
            f"system_snippet={system[:120]!r}"
        )
        # Store without embedding when personalized (invisible to semantic search
        # across users, but still reachable by exact-match within this namespace).
        emb = (
            await self._build_embedding(history, last_user)
            if self._semantic and self._cfg.cache_similarity < 1.0 and not personalized
            else None
        )
        from semantic_llm_cache.config import CacheEntry

        await self._backend.set(
            key,
            CacheEntry(
                prompt=last_user,
                response=response_bytes,
                embedding=emb,
                created_at=time.time(),
                ttl=self._cfg.cache_ttl,
                namespace=ns,
                hit_count=0,
            ),
        )

    # ------------------------------------------------------------------
    # Convenience wrappers for the generate route (prompt string, not messages)
    # ------------------------------------------------------------------
    async def get_generate(
        self, model: str, prompt: str, system: str = ""
    ) -> bytes | None:
        # Adapt (prompt, system) to a message list and reuse the chat path.
        messages: list[dict] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        return await self.get_chat("generate", model, messages)

    async def set_generate(
        self, model: str, prompt: str, system: str, response_bytes: bytes
    ) -> None:
        # Adapt (prompt, system) to a message list and reuse the chat path.
        messages: list[dict] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        await self.set_chat("generate", model, messages, response_bytes)

    # ------------------------------------------------------------------
    # Management
    # ------------------------------------------------------------------
    def stats(self) -> dict:
        """Return hit/miss counters, hit rate and the effective cache config."""
        total = self._hits + self._misses
        return {
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": round(self._hits / total, 3) if total else 0.0,
            "semantic": self._semantic,
            "backend": self._cfg.cache_backend,
            "similarity_threshold": self._cfg.cache_similarity,
            "history_weight": self._cfg.cache_history_weight,
        }

    async def clear(self) -> None:
        """Drop all cached entries and reset the hit/miss counters."""
        if self._backend:
            await self._backend.clear()
        self._hits = 0
        self._misses = 0
# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
_cache: LLMCache | None = None


async def init_llm_cache(cfg: Any) -> LLMCache | None:
    """Initialise the module-level cache singleton. Returns None if disabled."""
    global _cache
    if cfg.cache_enabled:
        instance = LLMCache(cfg)
        await instance.init()
        _cache = instance
        return _cache
    print("[cache] Cache disabled (cache_enabled=false).")
    return None


def get_llm_cache() -> LLMCache | None:
    """Return the initialised cache singleton, or None when disabled."""
    return _cache
# ---------------------------------------------------------------------------
# Helper: convert a stored Ollama-format non-streaming response to an
# OpenAI SSE single-chunk stream (used when a streaming OpenAI request
# hits the cache whose entry was populated from a non-streaming response).
# ---------------------------------------------------------------------------
def openai_nonstream_to_sse(cached_bytes: bytes, model: str) -> bytes:
    """
    Wrap a stored OpenAI ChatCompletion JSON as a minimal single-chunk SSE stream.
    The stored entry always uses the non-streaming ChatCompletion format so that
    non-streaming cache hits can be served directly; this function adapts it for
    streaming clients.
    """
    import time as _time

    import orjson

    try:
        payload = orjson.loads(cached_bytes)
        message = (payload.get("choices") or [{}])[0].get("message", {})
        chunk = {
            "id": payload.get("id", "cache-hit"),
            "object": "chat.completion.chunk",
            "created": payload.get("created", int(_time.time())),
            "model": payload.get("model", model),
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": message.get("content", "")},
                    "finish_reason": "stop",
                }
            ],
        }
        usage = payload.get("usage")
        if usage:
            chunk["usage"] = usage
        body = orjson.dumps(chunk).decode()
        return f"data: {body}\n\ndata: [DONE]\n\n".encode()
    except Exception as exc:
        # A corrupt entry must never break the client stream — degrade to an
        # empty (but well-formed) SSE termination and surface a warning.
        warnings.warn(
            f"[cache] openai_nonstream_to_sse: corrupt cache entry, returning empty stream: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return b"data: [DONE]\n\n"

View file

@ -6,7 +6,7 @@ endpoints:
- https://api.openai.com/v1
llama_server_endpoints:
- http://192.168.0.33:8889/v1
- http://192.168.0.50:8889/v1
# Maximum concurrent connections *per endpoint+model pair* (equal to OLLAMA_NUM_PARALLEL)
max_concurrent_connections: 2
@ -22,4 +22,38 @@ api_keys:
"http://192.168.0.51:11434": "ollama"
"http://192.168.0.52:11434": "ollama"
"https://api.openai.com/v1": "${OPENAI_KEY}"
"http://192.168.0.33:8889/v1": "llama"
"http://192.168.0.50:8889/v1": "llama"
# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
# cache_enabled: true
# Backend — where cached responses are stored:
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
# sqlite → persistent file-based (single instance, survives restart)
# redis → distributed (shared across replicas, requires Redis)
# cache_backend: memory
# Cosine similarity threshold for a cache hit:
# 1.0 → exact match only (works on any image variant)
# <1.0 → semantic matching (requires the :semantic Docker image tag)
# cache_similarity: 0.9
# Response TTL in seconds. Remove the key or set to null to cache forever.
# cache_ttl: 3600
# SQLite backend: path to the cache database file
# cache_db_path: llm_cache.db
# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
# 0.3 = 30% history context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3

View file

@ -204,6 +204,149 @@ max_concurrent_connections: 3
**Recommendation**: Use multiple endpoints for redundancy and load distribution.
## Semantic LLM Cache
NOMYO Router can cache LLM responses and serve them directly — skipping endpoint selection, model load, and token generation entirely.
### How it works
1. On every cacheable request (`/api/chat`, `/api/generate`, `/v1/chat/completions`, `/v1/completions`) the cache is checked **before** choosing an endpoint.
2. On a **cache hit** the stored response is returned immediately as a single chunk (streaming or non-streaming — both work).
3. On a **cache miss** the request is forwarded normally. The response is stored in the cache after it completes.
4. **MOE requests** (`moe-*` model prefix) always bypass the cache.
5. **Token counts** are never recorded for cache hits.
### Cache key strategy
| Signal | How matched |
|---|---|
| `model + system_prompt` | Exact — hard context isolation per deployment |
| BM25-weighted embedding of chat history | Semantic — conversation context signal |
| Embedding of last user message | Semantic — the actual question |
The two semantic vectors are combined as a weighted mean (tuned by `cache_history_weight`) before cosine similarity comparison, staying at a single 384-dimensional vector compatible with the library's storage format.
### Quick start — exact match (lean image)
```yaml
cache_enabled: true
cache_backend: sqlite # persists across restarts
cache_similarity: 1.0 # exact match only, no sentence-transformers needed
cache_ttl: 3600
```
### Quick start — semantic matching (:semantic image)
```yaml
cache_enabled: true
cache_backend: sqlite
cache_similarity: 0.90 # hit if ≥90% cosine similarity
cache_ttl: 3600
cache_history_weight: 0.3
```
Pull the semantic image:
```bash
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
```
### Cache configuration options
#### `cache_enabled`
**Type**: `bool` | **Default**: `false`
Enable or disable the cache. All other cache settings are ignored when `false`.
#### `cache_backend`
**Type**: `str` | **Default**: `"memory"`
| Value | Description | Persists | Multi-replica |
|---|---|---|---|
| `memory` | In-process LRU dict | ❌ | ❌ |
| `sqlite` | File-based via `aiosqlite` | ✅ | ❌ |
| `redis` | Redis via `redis.asyncio` | ✅ | ✅ |
Use `redis` when running multiple router replicas behind a load balancer — all replicas share one warm cache.
#### `cache_similarity`
**Type**: `float` | **Default**: `1.0`
Cosine similarity threshold. `1.0` means exact match only (no embedding model needed). Values below `1.0` enable semantic matching, which requires the `:semantic` Docker image tag.
Recommended starting value for semantic mode: `0.90`.
#### `cache_ttl`
**Type**: `int | null` | **Default**: `3600`
Time-to-live for cache entries in seconds. Remove the key or set to `null` to cache forever.
#### `cache_db_path`
**Type**: `str` | **Default**: `"llm_cache.db"`
Path to the SQLite cache database. Only used when `cache_backend: sqlite`.
#### `cache_redis_url`
**Type**: `str` | **Default**: `"redis://localhost:6379/0"`
Redis connection URL. Only used when `cache_backend: redis`.
#### `cache_history_weight`
**Type**: `float` | **Default**: `0.3`
Weight of the BM25-weighted chat-history embedding in the combined cache key vector. `0.3` means the history contributes 30% and the final user message contributes 70% of the similarity signal. Only used when `cache_similarity < 1.0`.
### Cache management endpoints
| Endpoint | Method | Description |
|---|---|---|
| `/api/cache/stats` | `GET` | Hit/miss counters, hit rate, current config |
| `/api/cache/invalidate` | `POST` | Clear all cache entries and reset counters |
```bash
# Check cache performance
curl http://localhost:12434/api/cache/stats
# Clear the cache
curl -X POST http://localhost:12434/api/cache/invalidate
```
Example stats response:
```json
{
"enabled": true,
"hits": 1547,
"misses": 892,
"hit_rate": 0.634,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.9,
"history_weight": 0.3
}
```
### Docker image variants
| Tag | Semantic cache | Image size |
|---|---|---|
| `latest` | ❌ exact match only | ~300 MB |
| `latest-semantic` | ✅ sentence-transformers + model pre-baked | ~800 MB |
Build locally:
```bash
# Lean (exact match)
docker build -t nomyo-router .
# Semantic (~500 MB larger, all-MiniLM-L6-v2 model baked in)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
```
## Configuration Validation
The router validates the configuration at startup:

View file

@ -82,10 +82,23 @@ sudo systemctl status nomyo-router
## 2. Docker Deployment
### Image variants
| Tag | Semantic cache | Image size |
|---|---|---|
| `latest` | ❌ exact match only | ~300 MB |
| `latest-semantic` | ✅ sentence-transformers + `all-MiniLM-L6-v2` pre-baked | ~800 MB |
The `:semantic` variant enables `cache_similarity < 1.0` in `config.yaml`. The lean image falls back to exact-match caching with a warning if semantic mode is configured.
### Build the Image
```bash
# Lean build (exact match cache, default)
docker build -t nomyo-router .
# Semantic build (~500 MB larger, all-MiniLM-L6-v2 model baked in at build time)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
```
### Run the Container

View file

@ -1,20 +1,30 @@
# Docker Compose example for NOMYO Router with multiple Ollama instances
#
# Two router profiles are provided:
# nomyo-router — lean image, exact-match cache only (~300 MB)
# nomyo-router-semantic — semantic image, sentence-transformers baked in (~800 MB)
#
# Uncomment the redis service and set cache_backend: redis in config.yaml
# to share the LLM response cache across multiple router replicas.
version: '3.8'
services:
# NOMYO Router
# NOMYO Router — lean image (exact-match cache, default)
nomyo-router:
image: nomyo-router:latest
build: .
build:
context: .
args:
SEMANTIC_CACHE: "false"
ports:
- "12434:12434"
environment:
- CONFIG_PATH=/app/config/config.yaml
- NOMYO_ROUTER_DB_PATH=/app/token_counts.db
- NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
volumes:
- ./config:/app/config
- router-db:/app/token_counts.db
- router-data:/app/data
depends_on:
- ollama1
- ollama2
@ -23,6 +33,45 @@ services:
networks:
- nomyo-net
# NOMYO Router — semantic image (cache_similarity < 1.0 support, ~800 MB)
# Build: docker compose build nomyo-router-semantic
# Switch: comment out nomyo-router above, uncomment this block.
# nomyo-router-semantic:
# image: nomyo-router:semantic
# build:
# context: .
# args:
# SEMANTIC_CACHE: "true"
# ports:
# - "12434:12434"
# environment:
# - CONFIG_PATH=/app/config/config.yaml
# - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
# volumes:
# - ./config:/app/config
# - router-data:/app/data
# - hf-cache:/app/data/hf_cache # share HuggingFace model cache across builds
# depends_on:
# - ollama1
# - ollama2
# - ollama3
# restart: unless-stopped
# networks:
# - nomyo-net
# Optional: Redis for shared LLM response cache across multiple router replicas.
# Requires cache_backend: redis in config.yaml.
# redis:
# image: redis:7-alpine
# ports:
# - "6379:6379"
# volumes:
# - redis-data:/data
# command: redis-server --save 60 1 --loglevel warning
# restart: unless-stopped
# networks:
# - nomyo-net
# Ollama Instance 1
ollama1:
image: ollama/ollama:latest
@ -87,7 +136,9 @@ services:
- nomyo-net
volumes:
router-db:
router-data:
# hf-cache: # uncomment when using nomyo-router-semantic
# redis-data: # uncomment when using Redis cache backend
ollama1-data:
ollama2-data:
ollama3-data:

View file

@ -29,4 +29,38 @@ api_keys:
"http://192.168.0.52:11434": "ollama"
"https://api.openai.com/v1": "${OPENAI_KEY}"
"http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config
"http://192.168.0.33:8081/v1": "llama-server"
"http://192.168.0.33:8081/v1": "llama-server"
# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
# cache_enabled: false
# Backend — where cached responses are stored:
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
# sqlite → persistent file-based (single instance, survives restart)
# redis → distributed (shared across replicas, requires Redis)
# cache_backend: memory
# Cosine similarity threshold for a cache hit:
# 1.0 → exact match only (works on any image variant)
# <1.0 → semantic matching (requires the :semantic Docker image tag)
# cache_similarity: 1.0
# Response TTL in seconds. Remove the key or set to null to cache forever.
# cache_ttl: 3600
# SQLite backend: path to the cache database file
# cache_db_path: llm_cache.db
# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
# 0.3 = 30% history context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3

View file

@ -133,6 +133,39 @@ Response:
}
```
### Cache Statistics
```bash
curl http://localhost:12434/api/cache/stats
```
Response when cache is enabled:
```json
{
"enabled": true,
"hits": 1547,
"misses": 892,
"hit_rate": 0.634,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.9,
"history_weight": 0.3
}
```
Response when cache is disabled:
```json
{ "enabled": false }
```
### Cache Invalidation
```bash
curl -X POST http://localhost:12434/api/cache/invalidate
```
Clears all cached entries and resets hit/miss counters.
### Real-time Usage Stream
```bash

375
doc/semantic-cache.md Normal file
View file

@ -0,0 +1,375 @@
# Semantic Cache
NOMYO Router includes a built-in LLM semantic cache that can dramatically reduce inference costs and latency by reusing previous responses — either via exact-match or by finding semantically equivalent questions.
## How It Works
Every incoming request is checked against the cache before being forwarded to an LLM endpoint. On a cache hit the stored response is returned immediately, bypassing inference entirely.
Cache isolation is strict: each combination of **route + model + system prompt** gets its own namespace (a 16-character SHA-256 prefix). A question asked under one system prompt can never leak into a different user's or route's namespace.
Two lookup strategies are available:
| Mode | How it matches | When to use |
|---|---|---|
| **Exact match** | Normalized text hash | When you need 100% identical answers, zero false positives |
| **Semantic match** | Cosine similarity of sentence embeddings | When questions are paraphrased or phrased differently |
In semantic mode the embedding is a weighted average of:
- The **last user message** (70% by default) — captures what is being asked
- A **BM25-weighted summary of the chat history** (30% by default) — provides conversation context
This means "What is the capital of France?" and "Can you tell me the capital city of France?" will hit the same cache entry, but a question in a different conversation context will not.
### MoE Model Bypass
Models with names starting with `moe-` always bypass the cache entirely. Mixture-of-Experts models produce non-deterministic outputs by design and are not suitable for response caching.
### Privacy Protection
Responses that contain personally identifiable information (emails, UUIDs, numeric IDs ≥ 8 digits, or tokens extracted from `[Tags: identity]` lines in the system prompt) are stored **without a semantic embedding**. They remain reachable via exact-match within the same user-specific namespace but are invisible to cross-user semantic search.
If a personalized response is generated with a generic (shared) system prompt, it is skipped entirely — never stored.
---
## Docker Image Variants
| Tag | Semantic cache | Approx. image size | Includes |
|---|---|---|---|
| `latest` | No (exact match only) | ~300 MB | Base router only |
| `latest-semantic` | Yes | ~800 MB | `sentence-transformers`, `torch`, `all-MiniLM-L6-v2` model baked in |
Use the **`latest`** image when you only need exact-match deduplication. It has no heavy ML dependencies.
Use **`latest-semantic`** when you want to catch paraphrased or semantically equivalent questions. The `all-MiniLM-L6-v2` model is baked into the image at build time so no internet access is needed at runtime.
> **Note:** If you set `cache_similarity < 1.0` but use the `latest` image, the router falls back to exact-match caching and logs a warning. It will not error.
Build locally:
```bash
# Lean image (exact match only)
docker build -t nomyo-router .
# Semantic image (~500 MB larger)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
```
---
## Dependencies
### Lean image / bare-metal (exact match)
No extra dependencies beyond the base `requirements.txt`. The `semantic-llm-cache` library provides exact-match storage with a no-op embedding provider.
### Semantic image / bare-metal (semantic match)
Additional heavy packages are required:
| Package | Purpose | Approx. size |
|---|---|---|
| `sentence-transformers` | Sentence embedding model wrapper | ~100 MB |
| `torch` (CPU) | PyTorch inference backend | ~700 MB |
| `numpy` | Vector math for weighted mean embeddings | ~20 MB |
Install for bare-metal semantic mode:
```bash
pip install sentence-transformers torch --index-url https://download.pytorch.org/whl/cpu
```
The `all-MiniLM-L6-v2` model (~90 MB) is downloaded on first use (or baked in when using the `:semantic` Docker image).
---
## Configuration
All cache settings live in `config.yaml` and can be overridden with environment variables prefixed `NOMYO_ROUTER_`.
### Minimal — enable exact-match with in-memory backend
```yaml
# config.yaml
cache_enabled: true
```
### Full reference
```yaml
# config.yaml
# Master switch — set to true to enable the cache
cache_enabled: false
# Storage backend: "memory" | "sqlite" | "redis"
# memory — fast, in-process, lost on restart
# sqlite — persistent single-file, good for single-instance deployments
# redis — persistent, shared across multiple router instances
cache_backend: memory
# Cosine similarity threshold for semantic matching
# 1.0 = exact match only (no sentence-transformers required)
# 0.95 = very close paraphrases only (recommended starting point)
# 0.85 = broader matching, higher false-positive risk
# Requires :semantic Docker image (or sentence-transformers installed)
cache_similarity: 1.0
# Time-to-live in seconds; null or omit to cache forever
cache_ttl: 3600
# SQLite backend: path to cache database file
cache_db_path: llm_cache.db
# Redis backend: connection URL
cache_redis_url: redis://localhost:6379/0
# Weight of chat-history embedding in the combined query vector (0.0–1.0)
# 0.3 = 30% history context, 70% last user message (default)
# Increase if your use case is highly conversation-dependent
cache_history_weight: 0.3
```
### Environment variable overrides
```bash
NOMYO_ROUTER_CACHE_ENABLED=true
NOMYO_ROUTER_CACHE_BACKEND=sqlite
NOMYO_ROUTER_CACHE_SIMILARITY=0.95
NOMYO_ROUTER_CACHE_TTL=7200
NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db
NOMYO_ROUTER_CACHE_REDIS_URL=redis://redis:6379/0
NOMYO_ROUTER_CACHE_HISTORY_WEIGHT=0.3
```
### Per-request opt-in
The cache is opted in per-request via the `nomyo.cache` field in the request body. The global `cache_enabled` setting must also be `true`.
```json
{
"model": "llama3.2",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"nomyo": {
"cache": true
}
}
```
---
## Backend Comparison
| | `memory` | `sqlite` | `redis` |
|---|---|---|---|
| Persistence | No (lost on restart) | Yes | Yes |
| Multi-instance | No | No | Yes |
| Setup required | None | None | Redis server |
| Recommended for | Development, testing | Single-instance production | Multi-instance / HA production |
### SQLite — persistent single-instance
```yaml
cache_enabled: true
cache_backend: sqlite
cache_db_path: /data/llm_cache.db
cache_ttl: 86400 # 24 hours
```
Mount the path in Docker:
```bash
docker run -d \
-v /path/to/data:/data \
-e NOMYO_ROUTER_CACHE_ENABLED=true \
-e NOMYO_ROUTER_CACHE_BACKEND=sqlite \
-e NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db \
ghcr.io/nomyo-ai/nomyo-router:latest
```
### Redis — distributed / multi-instance
```yaml
cache_enabled: true
cache_backend: redis
cache_redis_url: redis://redis:6379/0
cache_similarity: 0.95
cache_ttl: 3600
```
Docker Compose example:
```yaml
services:
nomyo-router:
image: ghcr.io/nomyo-ai/nomyo-router:latest-semantic
ports:
- "12434:12434"
environment:
NOMYO_ROUTER_CACHE_ENABLED: "true"
NOMYO_ROUTER_CACHE_BACKEND: redis
NOMYO_ROUTER_CACHE_REDIS_URL: redis://redis:6379/0
NOMYO_ROUTER_CACHE_SIMILARITY: "0.95"
depends_on:
- redis
redis:
image: redis:7-alpine
volumes:
- redis_data:/data
volumes:
redis_data:
```
---
## Code Examples
### Python (OpenAI SDK)
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:12434/v1",
api_key="unused",
)
# First call — cache miss, hits the LLM
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "What is the capital of France?"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
# Second call (exact same question) — cache hit, returned instantly
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "What is the capital of France?"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
# With semantic mode enabled, this also hits the cache:
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "Tell me the capital city of France"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
```
### Python (raw HTTP / httpx)
```python
import httpx
payload = {
"model": "llama3.2",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain quantum entanglement in one sentence."},
],
"nomyo": {"cache": True},
}
resp = httpx.post("http://localhost:12434/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```
### curl
```bash
curl -s http://localhost:12434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3.2",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"nomyo": {"cache": true}
}' | jq .choices[0].message.content
```
### Check cache statistics
```bash
curl -s http://localhost:12434/api/cache/stats | jq .
```
```json
{
"hits": 42,
"misses": 18,
"hit_rate": 0.7,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.95,
"history_weight": 0.3
}
```
### Clear the cache
```bash
curl -X POST http://localhost:12434/api/cache/invalidate
```
---
## Use Cases Where Semantic Caching Helps Most
### FAQ and support bots
Users ask the same questions in slightly different ways. "How do I reset my password?", "I forgot my password", "password reset instructions" — all semantically equivalent. Semantic caching collapses these into a single LLM call.
**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: sqlite` or `redis`, long TTL.
### Document Q&A / RAG pipelines
When the same document set is queried repeatedly by many users, common factual questions ("What is the contract start date?", "When does this contract begin?") hit the cache across users — as long as the system prompt is not user-personalized.
**Recommended settings:** `cache_similarity: 0.92`, `cache_backend: redis` for multi-user deployments.
### Code generation with repeated patterns
Development tools that generate boilerplate, explain errors, or convert code often receive nearly-identical prompts. Caching prevents re-running expensive large-model inference for the same error message phrased differently.
**Recommended settings:** `cache_similarity: 0.93`, `cache_backend: sqlite`.
### High-traffic chatbots with shared context
Public-facing bots where many users share the same system prompt (same product, same persona). Generic factual answers can be safely reused across users. The privacy guard ensures personalized responses are never shared.
**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: redis`.
### Batch processing / data pipelines
When running LLM inference over large datasets with many near-duplicate or repeated inputs, caching can reduce inference calls dramatically and make pipelines idempotent.
**Recommended settings:** `cache_similarity: 0.95`, `cache_backend: sqlite`, `cache_ttl: null` (cache forever for deterministic batch runs).
---
## Tuning the Similarity Threshold
| `cache_similarity` | Behaviour | Risk |
|---|---|---|
| `1.0` | Exact match only | No false positives |
| `0.97` | Catches minor typos and punctuation differences | Very low |
| `0.95` | Catches clear paraphrases (recommended default) | Low |
| `0.90` | Catches broader rewordings | Moderate — may match different intents |
| `< 0.85` | Very aggressive matching | High false-positive rate, not recommended |
Start at `0.95` and lower gradually while monitoring cache hit rate via `/api/cache/stats`.
---
## Limitations
- **Streaming responses** cached from a non-streaming request are served as a single chunk (not token-by-token). This is indistinguishable to most clients.
- **MoE models** (`moe-*` prefix) are never cached.
- **Token counts** are not recorded for cache hits (the LLM was not called).
- The `memory` backend does not survive a router restart.
- Semantic search requires the `:semantic` image (or local `sentence-transformers` install); without it, `cache_similarity < 1.0` silently falls back to exact match.

View file

@ -79,6 +79,8 @@ For OpenAI API compatibility:
| `/api/config` | GET | Endpoint configuration |
| `/api/usage-stream` | GET | Real-time usage updates (SSE) |
| `/health` | GET | Health check |
| `/api/cache/stats` | GET | Cache hit/miss counters and config |
| `/api/cache/invalidate` | POST | Clear all cache entries and counters |
## Making Requests
@ -147,6 +149,58 @@ The MOE system:
3. Selects the best response
4. Generates a final refined response
### Semantic LLM Cache
The router can cache LLM responses and serve them instantly — bypassing endpoint selection, model loading, and token generation entirely. Cached responses work for both streaming and non-streaming clients.
Enable it in `config.yaml`:
```yaml
cache_enabled: true
cache_backend: sqlite # persists across restarts
cache_similarity: 0.9 # semantic matching (requires :semantic image)
cache_ttl: 3600
```
For exact-match only (no extra dependencies):
```yaml
cache_enabled: true
cache_backend: sqlite
cache_similarity: 1.0
```
Check cache performance:
```bash
curl http://localhost:12434/api/cache/stats
```
```json
{
"enabled": true,
"hits": 1547,
"misses": 892,
"hit_rate": 0.634,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.9,
"history_weight": 0.3
}
```
Clear the cache:
```bash
curl -X POST http://localhost:12434/api/cache/invalidate
```
**Notes:**
- MOE requests (`moe-*` model prefix) always bypass the cache
- Cache is isolated per `model + system prompt` — different users with different system prompts cannot receive each other's cached responses
- Semantic matching requires the `:semantic` Docker image tag (`ghcr.io/nomyo-ai/nomyo-router:latest-semantic`)
- See [configuration.md](configuration.md#semantic-llm-cache) for all cache options
### Token Tracking
The router automatically tracks token usage:

View file

@ -39,3 +39,6 @@ uvicorn==0.38.0
uvloop
yarl==1.20.1
aiosqlite
# Semantic LLM cache — base install (exact-match mode, no heavy ML deps)
# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
semantic-llm-cache@git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v0.1

277
router.py
View file

@ -123,6 +123,22 @@ class Config(BaseSettings):
# Database configuration
db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))
# Semantic LLM Cache configuration
cache_enabled: bool = Field(default=False)
# Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed)
cache_backend: str = Field(default="memory")
# Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image)
cache_similarity: float = Field(default=1.0)
# TTL in seconds; None = cache forever
cache_ttl: Optional[int] = Field(default=3600)
# SQLite backend: path to cache database file
cache_db_path: str = Field(default="llm_cache.db")
# Redis backend: connection URL
cache_redis_url: str = Field(default="redis://localhost:6379/0")
# Weight of BM25-weighted chat-history embedding vs last-user-message embedding
# 0.3 = 30% history context signal, 70% question signal
cache_history_weight: float = Field(default=0.3)
class Config:
# Load from `config.yaml` first, then from env variables
env_prefix = "NOMYO_ROUTER_"
@ -188,6 +204,7 @@ def _config_path_from_env() -> Path:
from ollama._types import TokenLogprob, Logprob
from db import TokenDatabase
from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse
# Create the global config object — it will be overwritten on startup
@ -1583,7 +1600,8 @@ async def proxy(request: Request):
images = payload.get("images")
options = payload.get("options")
keep_alive = payload.get("keep_alive")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model:
raise HTTPException(
status_code=400, detail="Missing required field 'model'"
@ -1596,7 +1614,15 @@ async def proxy(request: Request):
error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
raise HTTPException(status_code=400, detail=error_msg) from e
# Cache lookup — before endpoint selection so no slot is wasted on a hit
_cache = get_llm_cache()
if _cache is not None and _cache_enabled:
_cached = await _cache.get_generate(model, prompt, system or "")
if _cached is not None:
async def _serve_cached_generate():
yield _cached
return StreamingResponse(_serve_cached_generate(), media_type="application/json")
endpoint, tracking_model = await choose_endpoint(model)
use_openai = is_openai_compatible(endpoint)
if use_openai:
@ -1633,6 +1659,7 @@ async def proxy(request: Request):
else:
async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
if stream == True:
content_parts: list[str] = []
async for chunk in async_gen:
if use_openai:
chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
@ -1644,6 +1671,27 @@ async def proxy(request: Request):
json_line = chunk.model_dump_json()
else:
json_line = orjson.dumps(chunk)
# Accumulate and store cache on done chunk — before yield so it always runs
if _cache is not None and _cache_enabled:
if getattr(chunk, "response", None):
content_parts.append(chunk.response)
if getattr(chunk, "done", False):
assembled = orjson.dumps({
k: v for k, v in {
"model": getattr(chunk, "model", model),
"response": "".join(content_parts),
"done": True,
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
"eval_count": getattr(chunk, "eval_count", None),
"total_duration": getattr(chunk, "total_duration", None),
"eval_duration": getattr(chunk, "eval_duration", None),
}.items() if v is not None
}) + b"\n"
try:
await _cache.set_generate(model, prompt, system or "", assembled)
except Exception as _ce:
print(f"[cache] set_generate (streaming) failed: {_ce}")
yield json_line.encode("utf-8") + b"\n"
else:
if use_openai:
@ -1660,7 +1708,14 @@ async def proxy(request: Request):
if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen)
)
yield json_line.encode("utf-8") + b"\n"
cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None and _cache_enabled:
try:
await _cache.set_generate(model, prompt, system or "", cache_bytes)
except Exception as _ce:
print(f"[cache] set_generate (non-streaming) failed: {_ce}")
finally:
# Ensure counter is decremented even if an exception occurs
@ -1695,6 +1750,7 @@ async def chat_proxy(request: Request):
options = payload.get("options")
logprobs = payload.get("logprobs")
top_logprobs = payload.get("top_logprobs")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model:
raise HTTPException(
@ -1711,6 +1767,26 @@ async def chat_proxy(request: Request):
except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Cache lookup — before endpoint selection, always bypassed for MOE
_is_moe = model.startswith("moe-")
_cache = get_llm_cache()
# Normalise model name for cache key: strip ":latest" suffix here so that
# get_chat and set_chat use the same model string regardless of when the
# strip happens further down (line ~1793 strips it for OpenAI endpoints).
_cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
# Snapshot original messages before any OpenAI-format transformation so that
# get_chat and set_chat always use the same key regardless of backend type.
_cache_messages = messages
if _cache is not None and not _is_moe and _cache_enabled:
_cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
if _cached is not None:
async def _serve_cached_chat():
yield _cached
return StreamingResponse(
_serve_cached_chat(),
media_type="application/x-ndjson" if stream else "application/json",
)
# 2. Endpoint logic
if model.startswith("moe-"):
model = model.split("moe-")[1]
@ -1764,6 +1840,7 @@ async def chat_proxy(request: Request):
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
if stream == True:
tc_acc = {} # accumulate OpenAI tool-call deltas across chunks
content_parts: list[str] = []
async for chunk in async_gen:
if use_openai:
_accumulate_openai_tc_delta(chunk, tc_acc)
@ -1780,6 +1857,30 @@ async def chat_proxy(request: Request):
json_line = chunk.model_dump_json()
else:
json_line = orjson.dumps(chunk)
# Accumulate and store cache on done chunk — before yield so it always runs
# Works for both Ollama-native and OpenAI-compatible backends; chunks are
# already converted to Ollama format by rechunk before this point.
if _cache is not None and not _is_moe and _cache_enabled:
if chunk.message and getattr(chunk.message, "content", None):
content_parts.append(chunk.message.content)
if getattr(chunk, "done", False):
assembled = orjson.dumps({
k: v for k, v in {
"model": getattr(chunk, "model", model),
"created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)),
"message": {"role": "assistant", "content": "".join(content_parts)},
"done": True,
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
"eval_count": getattr(chunk, "eval_count", None),
"total_duration": getattr(chunk, "total_duration", None),
"eval_duration": getattr(chunk, "eval_duration", None),
}.items() if v is not None
}) + b"\n"
try:
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}")
yield json_line.encode("utf-8") + b"\n"
else:
if use_openai:
@ -1796,7 +1897,14 @@ async def chat_proxy(request: Request):
if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen)
)
yield json_line.encode("utf-8") + b"\n"
cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
if _cache is not None and not _is_moe and _cache_enabled:
try:
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}")
finally:
# Ensure counter is decremented even if an exception occurs
@ -2640,6 +2748,7 @@ async def openai_chat_completions_proxy(request: Request):
tools = payload.get("tools")
logprobs = payload.get("logprobs")
top_logprobs = payload.get("top_logprobs")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model:
raise HTTPException(
@ -2680,25 +2789,93 @@ async def openai_chat_completions_proxy(request: Request):
except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Reject unsupported image formats (SVG) before doing any work
for _msg in messages:
for _item in (_msg.get("content") or []) if isinstance(_msg.get("content"), list) else []:
if _item.get("type") == "image_url":
_url = (_item.get("image_url") or {}).get("url", "")
if _url.startswith("data:image/svg") or _url.lower().endswith(".svg"):
raise HTTPException(
status_code=400,
detail="SVG images are not supported. Please convert the image to PNG or JPEG before sending.",
)
# Cache lookup — before endpoint selection
_cache = get_llm_cache()
if _cache is not None and _cache_enabled:
_cached = await _cache.get_chat("openai_chat", model, messages)
if _cached is not None:
if stream:
_sse = openai_nonstream_to_sse(_cached, model)
async def _serve_cached_ochat_stream():
yield _sse
return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream")
else:
async def _serve_cached_ochat_json():
yield _cached
return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")
# 2. Endpoint logic
endpoint, tracking_model = await choose_endpoint(model)
base_url = ep2base(endpoint)
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
# 3. Async generator that streams completions data and decrements the counter
async def _normalize_images_in_messages(msgs: list) -> list:
    """Fetch remote image URLs and convert them to base64 data URLs so
    Ollama/llama-server can handle them without making outbound HTTP requests.

    Each message whose content is a list of parts is scanned for
    ``image_url`` items; remote (non-``data:``) URLs are downloaded via the
    shared aiohttp session and rewritten inline. Messages are never mutated
    in place — a new message dict is appended for list-content messages.
    On any fetch error the original item is kept unchanged (best-effort).
    """
    resolved = []
    for msg in msgs:
        content = msg.get("content")
        # Plain-string (or missing) content has no image parts — pass through.
        if not isinstance(content, list):
            resolved.append(msg)
            continue
        new_content = []
        for item in content:
            if item.get("type") == "image_url":
                url = (item.get("image_url") or {}).get("url", "")
                # Only remote URLs need fetching; data: URLs are already inline.
                if url and not url.startswith("data:"):
                    try:
                        # Reuse the app-wide aiohttp session rather than opening
                        # a new connection per image.
                        http: aiohttp.ClientSession = app_state["session"]
                        async with http.get(url) as resp:
                            # Drop any "; charset=..." parameter; default to JPEG
                            # when the server sends no Content-Type.
                            ctype = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
                            img_bytes = await resp.read()
                            b64 = base64.b64encode(img_bytes).decode("utf-8")
                            new_content.append({
                                "type": "image_url",
                                "image_url": {"url": f"data:{ctype};base64,{b64}"}
                            })
                    except Exception as _ie:
                        # Best-effort: keep the original URL item and let the
                        # downstream server attempt the fetch itself.
                        print(f"[image] Failed to fetch image URL: {_ie}")
                        new_content.append(item)
                else:
                    new_content.append(item)
            else:
                new_content.append(item)
        resolved.append({**msg, "content": new_content})
    return resolved
async def stream_ochat_response():
try:
# The chat method returns a generator of dicts (or GenerateResponse)
try:
async_gen = await oclient.chat.completions.create(**params)
# For non-external endpoints (Ollama, llama-server), resolve remote
# image URLs to base64 data URLs so the server can handle them locally.
send_params = params
if not is_ext_openai_endpoint(endpoint):
resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
send_params = {**params, "messages": resolved_msgs}
async_gen = await oclient.chat.completions.create(**send_params)
except openai.BadRequestError as e:
# If tools are not supported by the model, retry without tools
if "does not support tools" in str(e):
print(f"[openai_chat_completions_proxy] Model {model} doesn't support tools, retrying without tools")
params_without_tools = {k: v for k, v in params.items() if k != "tools"}
params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
async_gen = await oclient.chat.completions.create(**params_without_tools)
else:
raise
if stream == True:
content_parts: list[str] = []
usage_snapshot: dict = {}
async for chunk in async_gen:
data = (
chunk.model_dump_json()
@ -2715,6 +2892,8 @@ async def openai_chat_completions_proxy(request: Request):
has_tool_calls = getattr(delta, "tool_calls", None) is not None
if has_content or has_reasoning or has_tool_calls:
yield f"data: {data}\n\n".encode("utf-8")
if has_content and delta.content:
content_parts.append(delta.content)
elif chunk.usage is not None:
# Forward the usage-only final chunk (e.g. from llama-server)
yield f"data: {data}\n\n".encode("utf-8")
@ -2723,12 +2902,24 @@ async def openai_chat_completions_proxy(request: Request):
if chunk.usage is not None:
prompt_tok = chunk.usage.prompt_tokens or 0
comp_tok = chunk.usage.completion_tokens or 0
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
else:
llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
if llama_usage:
prompt_tok, comp_tok = llama_usage
if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and _cache_enabled and content_parts:
assembled = orjson.dumps({
"model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
**({"usage": usage_snapshot} if usage_snapshot else {}),
}) + b"\n"
try:
await _cache.set_chat("openai_chat", model, messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}")
yield b"data: [DONE]\n\n"
else:
prompt_tok = 0
@ -2747,7 +2938,14 @@ async def openai_chat_completions_proxy(request: Request):
if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen)
)
yield json_line.encode("utf-8") + b"\n"
cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None and _cache_enabled:
try:
await _cache.set_chat("openai_chat", model, messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}")
finally:
# Ensure counter is decremented even if an exception occurs
@ -2786,6 +2984,7 @@ async def openai_completions_proxy(request: Request):
max_tokens = payload.get("max_tokens")
max_completion_tokens = payload.get("max_completion_tokens")
suffix = payload.get("suffix")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model:
raise HTTPException(
@ -2823,6 +3022,22 @@ async def openai_completions_proxy(request: Request):
except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Cache lookup — completions prompt mapped to a single-turn messages list
_cache = get_llm_cache()
_compl_messages = [{"role": "user", "content": prompt}]
if _cache is not None and _cache_enabled:
_cached = await _cache.get_chat("openai_completions", model, _compl_messages)
if _cached is not None:
if stream:
_sse = openai_nonstream_to_sse(_cached, model)
async def _serve_cached_ocompl_stream():
yield _sse
return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream")
else:
async def _serve_cached_ocompl_json():
yield _cached
return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json")
# 2. Endpoint logic
endpoint, tracking_model = await choose_endpoint(model)
base_url = ep2base(endpoint)
@ -2834,6 +3049,8 @@ async def openai_completions_proxy(request: Request):
# The chat method returns a generator of dicts (or GenerateResponse)
async_gen = await oclient.completions.create(**params)
if stream == True:
text_parts: list[str] = []
usage_snapshot: dict = {}
async for chunk in async_gen:
data = (
chunk.model_dump_json()
@ -2849,6 +3066,8 @@ async def openai_completions_proxy(request: Request):
)
if has_text or has_reasoning or choice.finish_reason is not None:
yield f"data: {data}\n\n".encode("utf-8")
if has_text and choice.text:
text_parts.append(choice.text)
elif chunk.usage is not None:
# Forward the usage-only final chunk (e.g. from llama-server)
yield f"data: {data}\n\n".encode("utf-8")
@ -2857,12 +3076,24 @@ async def openai_completions_proxy(request: Request):
if chunk.usage is not None:
prompt_tok = chunk.usage.prompt_tokens or 0
comp_tok = chunk.usage.completion_tokens or 0
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
else:
llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
if llama_usage:
prompt_tok, comp_tok = llama_usage
if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and _cache_enabled and text_parts:
assembled = orjson.dumps({
"model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
**({"usage": usage_snapshot} if usage_snapshot else {}),
}) + b"\n"
try:
await _cache.set_chat("openai_completions", model, _compl_messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}")
# Final DONE event
yield b"data: [DONE]\n\n"
else:
@ -2882,7 +3113,14 @@ async def openai_completions_proxy(request: Request):
if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen)
)
yield json_line.encode("utf-8") + b"\n"
cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None and _cache_enabled:
try:
await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}")
finally:
# Ensure counter is decremented even if an exception occurs
@ -3076,6 +3314,28 @@ async def rerank_proxy(request: Request):
finally:
await decrement_usage(endpoint, tracking_model)
# -------------------------------------------------------------
# 25b. Cache management endpoints
# -------------------------------------------------------------
@app.get("/api/cache/stats")
async def cache_stats():
    """Report configuration plus hit/miss counters for the LLM response cache.

    Returns ``{"enabled": False}`` when no cache has been initialised;
    otherwise ``{"enabled": True}`` merged with the cache's own ``stats()``
    payload (counters may shadow the ``enabled`` key, matching prior behavior).
    """
    cache = get_llm_cache()
    if cache is None:
        return {"enabled": False}
    payload = {"enabled": True}
    payload.update(cache.stats())
    return payload
@app.post("/api/cache/invalidate")
async def cache_invalidate():
    """Drop every entry from the LLM response cache and reset its counters.

    Returns a small status dict: ``cleared`` is ``False`` when no cache is
    configured, ``True`` after a successful ``clear()``.
    """
    cache = get_llm_cache()
    if cache is None:
        return {"enabled": False, "cleared": False}
    await cache.clear()
    return {"enabled": True, "cleared": True}
# -------------------------------------------------------------
# 26. Serve the static frontend
# -------------------------------------------------------------
@ -3211,6 +3471,7 @@ async def startup_event() -> None:
app_state["session"] = session
token_worker_task = asyncio.create_task(token_worker())
flush_task = asyncio.create_task(flush_buffer())
await init_llm_cache(config)
@app.on_event("shutdown")
async def shutdown_event() -> None: