From dd4b12da6a5611516b52155bf771d1ddd481235e Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Sun, 8 Mar 2026 09:12:09 +0100 Subject: [PATCH 1/5] feat: adding a semantic cache layer --- .dockerignore | 44 ++ .github/workflows/docker-publish-semantic.yml | 71 +++ Dockerfile | 26 +- README.md | 67 ++- cache.py | 407 ++++++++++++++++++ config.yaml | 38 +- doc/configuration.md | 143 ++++++ doc/deployment.md | 13 + doc/examples/docker-compose.yml | 61 ++- doc/examples/sample-config.yaml | 36 +- doc/monitoring.md | 33 ++ requirements.txt | 5 + router.py | 216 +++++++++- 13 files changed, 1138 insertions(+), 22 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/docker-publish-semantic.yml create mode 100644 cache.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..86dfdea --- /dev/null +++ b/.dockerignore @@ -0,0 +1,44 @@ +# Version control +.git +.gitignore +.github + +# Environment & secrets +.env +.env.* +*.env + +# Python artifacts +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +.venv +venv +*.egg-info +dist +build + +# Local databases (don't bake data into image) +*.db +*.db-shm +*.db-wal + +# IDE / editor +.vscode +.idea +*.swp +*.swo + +# Documentation +doc/ +docs/ +*.md + +# Tests +tests/ +test_* + +# Local config overrides +config.local.yaml diff --git a/.github/workflows/docker-publish-semantic.yml b/.github/workflows/docker-publish-semantic.yml new file mode 100644 index 0000000..00ad041 --- /dev/null +++ b/.github/workflows/docker-publish-semantic.yml @@ -0,0 +1,71 @@ +name: Build and Publish Docker Image (Semantic Cache) + +# Builds the :semantic variant that includes sentence-transformers + CPU torch +# and the pre-baked all-MiniLM-L6-v2 embedding model (~500 MB larger than lean). 
+# Tags mirror the lean workflow but carry a -semantic suffix, e.g.: +# ghcr.io/nomyo-ai/nomyo-router:latest-semantic +# ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic +# ghcr.io/nomyo-ai/nomyo-router:0.7-semantic + +on: + push: + branches: + - main + tags: + - "v*.*.*" + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + build-and-push-semantic: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up QEMU (for multi-arch builds) + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + # Versioned semantic tags on git tags (e.g. v0.7.0 → 0.7.0-semantic, 0.7-semantic) + type=semver,pattern={{version}}-semantic + type=semver,pattern={{major}}.{{minor}}-semantic + # latest-semantic only on main branch pushes + type=raw,value=latest-semantic,enable=${{ github.ref == 'refs/heads/main' }} + # SHA-tagged for traceability + type=sha,prefix=sha-,suffix=-semantic + + - name: Build and push semantic Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ platforms: linux/amd64,linux/arm64 + push: true + build-args: | + SEMANTIC_CACHE=true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/Dockerfile b/Dockerfile index 073496d..75ee26f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,21 +3,43 @@ FROM python:3.13-slim ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 +# SEMANTIC_CACHE=true installs sentence-transformers + CPU-only torch and pre-bakes +# the all-MiniLM-L6-v2 embedding model (~500 MB extra). The resulting image is tagged +# :semantic. The default (lean) image supports exact-match caching only. +ARG SEMANTIC_CACHE=false + +# Pin HuggingFace cache to a predictable path inside /app/data so it can be +# mounted as a volume and shared between builds. +ENV HF_HOME=/app/data/hf_cache + # Install SQLite -RUN apt-get update && apt-get install -y sqlite3 +RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 \ + && rm -rf /var/lib/apt/lists/* WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir -r requirements.txt +# Semantic cache deps — only installed when SEMANTIC_CACHE=true +# CPU-only torch must be installed before sentence-transformers to avoid +# pulling the full CUDA-enabled build (~2.5 GB). +RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \ + pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \ + pip install --no-cache-dir sentence-transformers && \ + python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \ + fi + # Create database directory and set permissions RUN mkdir -p /app/data && chown -R www-data:www-data /app/data COPY . . 
-RUN chmod +x /app/entrypoint.sh +RUN chmod +x /app/entrypoint.sh && \ + chown -R www-data:www-data /app EXPOSE 12434 +USER www-data + ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/README.md b/README.md index 744a86f..0a3b341 100644 --- a/README.md +++ b/README.md @@ -74,22 +74,28 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop ### Pre-built image (GitHub Container Registry) -Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release: +Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release. +**Lean image** (exact-match cache, ~300 MB): ```sh docker pull ghcr.io/nomyo-ai/nomyo-router:latest -``` - -Specific version: - -```sh docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0 ``` -### Build the container image locally: +**Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB): +```sh +docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic +docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic +``` + +### Build the container image locally ```sh +# Lean build (exact match cache, default) docker build -t nomyo-router . + +# Semantic build — sentence-transformers + model baked in +docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic . ``` Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container: @@ -124,6 +130,53 @@ This way the Ollama backend servers are utilized more efficient than by simply u NOMYO Router also supports OpenAI API compatible v1 backend servers. +## Semantic LLM Cache + +NOMYO Router includes an optional semantic cache that serves repeated or semantically similar LLM requests from cache — no endpoint round-trip, no token cost, response in <10 ms. 
+ +### Enable (exact match, any image) + +```yaml +# config.yaml +cache_enabled: true +cache_backend: sqlite # persists across restarts +cache_similarity: 1.0 # exact match only +cache_ttl: 3600 +``` + +### Enable (semantic matching, :semantic image) + +```yaml +cache_enabled: true +cache_backend: sqlite +cache_similarity: 0.90 # "What is Python?" ≈ "What's Python?" → cache hit +cache_ttl: 3600 +cache_history_weight: 0.3 +``` + +Pull the semantic image: +```bash +docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic +``` + +### Cache key strategy + +Each request is keyed on `model + system_prompt` (exact) combined with a weighted-mean embedding of BM25-weighted chat history (30%) and the last user message (70%). This means: +- Different system prompts → always separate cache namespaces (no cross-tenant leakage) +- Same question, different phrasing → cache hit (semantic mode) +- MOE requests (`moe-*`) → always bypass the cache + +### Cached routes + +`/api/chat` · `/api/generate` · `/v1/chat/completions` · `/v1/completions` + +### Cache management + +```bash +curl http://localhost:12434/api/cache/stats # hit rate, counters, config +curl -X POST http://localhost:12434/api/cache/invalidate # clear all entries +``` + ## Supplying the router API key If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key: diff --git a/cache.py b/cache.py new file mode 100644 index 0000000..219b1b9 --- /dev/null +++ b/cache.py @@ -0,0 +1,407 @@ +""" +LLM Semantic Cache for NOMYO Router. + +Strategy: +- Namespace: sha256(route :: model :: system_prompt)[:16] — exact context isolation +- Cache key: hash(normalize(last_user_message), namespace) — exact lookup +- Embedding: weighted mean of + α * embed(bm25_weighted(chat_history)) — conversation context + 1-α * embed(last_user_message) — the actual question + with α = cache_history_weight (default 0.3). 
+- Exact-match caching (similarity=1.0) uses DummyEmbeddingProvider — zero extra deps. +- Semantic caching (similarity<1.0) requires sentence-transformers. If missing the + library falls back to exact-match with a warning (lean Docker image behaviour). +- MOE models (moe-*) always bypass the cache. +- Token counts are never recorded for cache hits. +- Streaming cache hits are served as a single-chunk response. +""" + +import hashlib +import math +import time +import warnings +from collections import Counter +from typing import Any, Optional + +# Lazily resolved once at first embed() call +_semantic_available: Optional[bool] = None + + +def _check_sentence_transformers() -> bool: + global _semantic_available + if _semantic_available is None: + try: + import sentence_transformers # noqa: F401 + _semantic_available = True + except ImportError: + _semantic_available = False + return _semantic_available # type: ignore[return-value] + + +# --------------------------------------------------------------------------- +# BM25-weighted text representation of chat history +# --------------------------------------------------------------------------- + +def _bm25_weighted_text(history: list[dict]) -> str: + """ + Produce a BM25-importance-weighted text string from chat history turns. + + High-IDF (rare, domain-specific) terms are repeated proportionally to + their BM25 score so the downstream sentence-transformer embedding + naturally upweights topical signal and downweights stop words. 
+ """ + docs = [m.get("content", "") for m in history if m.get("content")] + if not docs: + return "" + + def _tok(text: str) -> list[str]: + return [w.lower() for w in text.split() if len(w) > 2] + + tokenized = [_tok(d) for d in docs] + N = len(tokenized) + + df: Counter = Counter() + for tokens in tokenized: + for term in set(tokens): + df[term] += 1 + + k1, b = 1.5, 0.75 + avg_dl = sum(len(t) for t in tokenized) / max(N, 1) + + term_scores: Counter = Counter() + for tokens in tokenized: + tf_c = Counter(tokens) + dl = len(tokens) + for term, tf in tf_c.items(): + idf = math.log((N + 1) / (df[term] + 1)) + 1.0 + score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1))) + term_scores[term] += score + + top = term_scores.most_common(50) + if not top: + return " ".join(docs) + + max_s = top[0][1] + out: list[str] = [] + for term, score in top: + out.extend([term] * max(1, round(3 * score / max_s))) + return " ".join(out) + + +# --------------------------------------------------------------------------- +# LLMCache +# --------------------------------------------------------------------------- + +class LLMCache: + """ + Thin async wrapper around async-semantic-llm-cache that adds: + - Route-aware namespace isolation + - Two-vector weighted-mean embedding (history context + question) + - Per-instance hit/miss counters + - Graceful fallback when sentence-transformers is absent + """ + + def __init__(self, cfg: Any) -> None: + self._cfg = cfg + self._backend: Any = None + self._emb_cache: Any = None + self._semantic: bool = False + self._hits: int = 0 + self._misses: int = 0 + + async def init(self) -> None: + from semantic_llm_cache.similarity import EmbeddingCache + + # --- Backend --- + backend_type: str = self._cfg.cache_backend + if backend_type == "sqlite": + from semantic_llm_cache.backends.sqlite import SQLiteBackend + self._backend = SQLiteBackend(db_path=self._cfg.cache_db_path) + elif backend_type == "redis": + from 
semantic_llm_cache.backends.redis import RedisBackend + self._backend = RedisBackend(url=self._cfg.cache_redis_url) + await self._backend.ping() + else: + from semantic_llm_cache.backends.memory import MemoryBackend + self._backend = MemoryBackend() + + # --- Embedding provider --- + if self._cfg.cache_similarity < 1.0: + if _check_sentence_transformers(): + from semantic_llm_cache.similarity import create_embedding_provider + provider = create_embedding_provider("sentence-transformer") + self._emb_cache = EmbeddingCache(provider=provider) + self._semantic = True + print( + f"[cache] Semantic cache ready " + f"(similarity≥{self._cfg.cache_similarity}, backend={backend_type})" + ) + else: + warnings.warn( + "[cache] sentence-transformers is not installed. " + "Falling back to exact-match caching (similarity=1.0). " + "Use the :semantic Docker image tag to enable semantic caching.", + RuntimeWarning, + stacklevel=2, + ) + self._emb_cache = EmbeddingCache() # DummyEmbeddingProvider + print(f"[cache] Exact-match cache ready (backend={backend_type}) [semantic unavailable]") + else: + self._emb_cache = EmbeddingCache() # DummyEmbeddingProvider + print(f"[cache] Exact-match cache ready (backend={backend_type})") + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _namespace(self, route: str, model: str, system: str) -> str: + raw = f"{route}::{model}::{system}" + return hashlib.sha256(raw.encode()).hexdigest()[:16] + + def _cache_key(self, namespace: str, last_user: str) -> str: + from semantic_llm_cache.utils import hash_prompt, normalize_prompt + return hash_prompt(normalize_prompt(last_user), namespace) + + def _parse_messages( + self, messages: list[dict] + ) -> tuple[str, list[dict], str]: + """ + Returns (system_prompt, prior_history_turns, last_user_message). + Multimodal content lists are reduced to their text parts. 
+ """ + system = "" + turns: list[dict] = [] + + for m in messages: + role = m.get("role", "") + content = m.get("content", "") + if isinstance(content, list): + content = " ".join( + p.get("text", "") + for p in content + if isinstance(p, dict) and p.get("type") == "text" + ) + if role == "system": + system = content + else: + turns.append({"role": role, "content": content}) + + last_user = "" + for m in reversed(turns): + if m["role"] == "user": + last_user = m["content"] + break + + # History = all turns before the final user message + history = turns[:-1] if turns and turns[-1]["role"] == "user" else turns + return system, history, last_user + + async def _build_embedding( + self, history: list[dict], last_user: str + ) -> list[float] | None: + """ + Weighted mean of BM25-weighted history embedding and last-user embedding. + Returns None when not in semantic mode. + """ + if not self._semantic: + return None + + import numpy as np + + alpha: float = self._cfg.cache_history_weight # weight for history signal + q_vec = np.array(await self._emb_cache.aencode(last_user), dtype=float) + + if not history: + # No history → use question embedding alone (alpha has no effect) + return q_vec.tolist() + + h_text = _bm25_weighted_text(history) + h_vec = np.array(await self._emb_cache.aencode(h_text), dtype=float) + + combined = alpha * h_vec + (1.0 - alpha) * q_vec + norm = float(np.linalg.norm(combined)) + if norm > 0.0: + combined /= norm + return combined.tolist() + + # ------------------------------------------------------------------ + # Public interface: chat (handles both Ollama and OpenAI message lists) + # ------------------------------------------------------------------ + + async def get_chat( + self, route: str, model: str, messages: list[dict] + ) -> bytes | None: + """Return cached response bytes, or None on miss.""" + if not self._backend: + return None + + system, history, last_user = self._parse_messages(messages) + if not last_user: + return None + + ns = 
self._namespace(route, model, system) + key = self._cache_key(ns, last_user) + + # 1. Exact key match + entry = await self._backend.get(key) + if entry is not None: + self._hits += 1 + return entry.response # type: ignore[return-value] + + # 2. Semantic similarity match + if self._semantic and self._cfg.cache_similarity < 1.0: + emb = await self._build_embedding(history, last_user) + result = await self._backend.find_similar( + emb, threshold=self._cfg.cache_similarity, namespace=ns + ) + if result is not None: + _, matched, _ = result + self._hits += 1 + return matched.response # type: ignore[return-value] + + self._misses += 1 + return None + + async def set_chat( + self, route: str, model: str, messages: list[dict], response_bytes: bytes + ) -> None: + """Store a response in the cache (fire-and-forget friendly).""" + if not self._backend: + return + + system, history, last_user = self._parse_messages(messages) + if not last_user: + return + + ns = self._namespace(route, model, system) + key = self._cache_key(ns, last_user) + + emb = ( + await self._build_embedding(history, last_user) + if self._semantic and self._cfg.cache_similarity < 1.0 + else None + ) + + from semantic_llm_cache.config import CacheEntry + + await self._backend.set( + key, + CacheEntry( + prompt=last_user, + response=response_bytes, + embedding=emb, + created_at=time.time(), + ttl=self._cfg.cache_ttl, + namespace=ns, + hit_count=0, + ), + ) + + # ------------------------------------------------------------------ + # Convenience wrappers for the generate route (prompt string, not messages) + # ------------------------------------------------------------------ + + async def get_generate( + self, model: str, prompt: str, system: str = "" + ) -> bytes | None: + messages: list[dict] = [] + if system: + messages.append({"role": "system", "content": system}) + messages.append({"role": "user", "content": prompt}) + return await self.get_chat("generate", model, messages) + + async def set_generate( + 
self, model: str, prompt: str, system: str, response_bytes: bytes + ) -> None: + messages: list[dict] = [] + if system: + messages.append({"role": "system", "content": system}) + messages.append({"role": "user", "content": prompt}) + await self.set_chat("generate", model, messages, response_bytes) + + # ------------------------------------------------------------------ + # Management + # ------------------------------------------------------------------ + + def stats(self) -> dict: + total = self._hits + self._misses + return { + "hits": self._hits, + "misses": self._misses, + "hit_rate": round(self._hits / total, 3) if total else 0.0, + "semantic": self._semantic, + "backend": self._cfg.cache_backend, + "similarity_threshold": self._cfg.cache_similarity, + "history_weight": self._cfg.cache_history_weight, + } + + async def clear(self) -> None: + if self._backend: + await self._backend.clear() + self._hits = 0 + self._misses = 0 + + +# --------------------------------------------------------------------------- +# Module-level singleton +# --------------------------------------------------------------------------- + +_cache: LLMCache | None = None + + +async def init_llm_cache(cfg: Any) -> LLMCache | None: + """Initialise the module-level cache singleton. Returns None if disabled.""" + global _cache + if not cfg.cache_enabled: + print("[cache] Cache disabled (cache_enabled=false).") + return None + _cache = LLMCache(cfg) + await _cache.init() + return _cache + + +def get_llm_cache() -> LLMCache | None: + return _cache + + +# --------------------------------------------------------------------------- +# Helper: convert a stored Ollama-format non-streaming response to an +# OpenAI SSE single-chunk stream (used when a streaming OpenAI request +# hits the cache whose entry was populated from a non-streaming response). 
+# --------------------------------------------------------------------------- + +def openai_nonstream_to_sse(cached_bytes: bytes, model: str) -> bytes: + """ + Wrap a stored OpenAI ChatCompletion JSON as a minimal single-chunk SSE stream. + The stored entry always uses the non-streaming ChatCompletion format so that + non-streaming cache hits can be served directly; this function adapts it for + streaming clients. + """ + import orjson, time as _time + + try: + d = orjson.loads(cached_bytes) + content = (d.get("choices") or [{}])[0].get("message", {}).get("content", "") + chunk = { + "id": d.get("id", "cache-hit"), + "object": "chat.completion.chunk", + "created": d.get("created", int(_time.time())), + "model": d.get("model", model), + "choices": [ + { + "index": 0, + "delta": {"role": "assistant", "content": content}, + "finish_reason": "stop", + } + ], + } + if d.get("usage"): + chunk["usage"] = d["usage"] + return f"data: {orjson.dumps(chunk).decode()}\n\ndata: [DONE]\n\n".encode() + except Exception as exc: + warnings.warn( + f"[cache] openai_nonstream_to_sse: corrupt cache entry, returning empty stream: {exc}", + RuntimeWarning, + stacklevel=2, + ) + return b"data: [DONE]\n\n" diff --git a/config.yaml b/config.yaml index 752ccec..757873e 100644 --- a/config.yaml +++ b/config.yaml @@ -6,7 +6,7 @@ endpoints: - https://api.openai.com/v1 llama_server_endpoints: - - http://192.168.0.33:8889/v1 + - http://192.168.0.50:8889/v1 # Maximum concurrent connections *per endpoint‑model pair* (equals to OLLAMA_NUM_PARALLEL) max_concurrent_connections: 2 @@ -22,4 +22,38 @@ api_keys: "http://192.168.0.51:11434": "ollama" "http://192.168.0.52:11434": "ollama" "https://api.openai.com/v1": "${OPENAI_KEY}" - "http://192.168.0.33:8889/v1": "llama" \ No newline at end of file + "http://192.168.0.50:8889/v1": "llama" + +# ------------------------------------------------------------- +# Semantic LLM Cache (optional — disabled by default) +# Caches LLM responses to cut costs and 
latency on repeated or +# semantically similar prompts. +# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions +# MOE requests (moe-* model prefix) always bypass the cache. +# ------------------------------------------------------------- +cache_enabled: true + +# Backend — where cached responses are stored: +# memory → in-process LRU (lost on restart, not shared across replicas) [default] +# sqlite → persistent file-based (single instance, survives restart) +# redis → distributed (shared across replicas, requires Redis) +cache_backend: memory + +# Cosine similarity threshold for a cache hit: +# 1.0 → exact match only (works on any image variant) +# <1.0 → semantic matching (requires the :semantic Docker image tag) +cache_similarity: 0.9 + +# Response TTL in seconds. Remove the key or set to null to cache forever. +cache_ttl: 3600 + +# SQLite backend: path to the cache database file +cache_db_path: llm_cache.db + +# Redis backend: connection URL +# cache_redis_url: redis://localhost:6379/0 + +# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding. +# 0.3 = 30% history context signal, 70% question signal. +# Only relevant when cache_similarity < 1.0. +# cache_history_weight: 0.3 \ No newline at end of file diff --git a/doc/configuration.md b/doc/configuration.md index afb7dda..af2dfae 100644 --- a/doc/configuration.md +++ b/doc/configuration.md @@ -204,6 +204,149 @@ max_concurrent_connections: 3 **Recommendation**: Use multiple endpoints for redundancy and load distribution. +## Semantic LLM Cache + +NOMYO Router can cache LLM responses and serve them directly — skipping endpoint selection, model load, and token generation entirely. + +### How it works + +1. On every cacheable request (`/api/chat`, `/api/generate`, `/v1/chat/completions`, `/v1/completions`) the cache is checked **before** choosing an endpoint. +2. 
On a **cache hit** the stored response is returned immediately as a single chunk (streaming or non-streaming — both work). +3. On a **cache miss** the request is forwarded normally. The response is stored in the cache after it completes. +4. **MOE requests** (`moe-*` model prefix) always bypass the cache. +5. **Token counts** are never recorded for cache hits. + +### Cache key strategy + +| Signal | How matched | +|---|---| +| `model + system_prompt` | Exact — hard context isolation per deployment | +| BM25-weighted embedding of chat history | Semantic — conversation context signal | +| Embedding of last user message | Semantic — the actual question | + +The two semantic vectors are combined as a weighted mean (tuned by `cache_history_weight`) before cosine similarity comparison, staying at a single 384-dimensional vector compatible with the library's storage format. + +### Quick start — exact match (lean image) + +```yaml +cache_enabled: true +cache_backend: sqlite # persists across restarts +cache_similarity: 1.0 # exact match only, no sentence-transformers needed +cache_ttl: 3600 +``` + +### Quick start — semantic matching (:semantic image) + +```yaml +cache_enabled: true +cache_backend: sqlite +cache_similarity: 0.90 # hit if ≥90% cosine similarity +cache_ttl: 3600 +cache_history_weight: 0.3 +``` + +Pull the semantic image: +```bash +docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic +``` + +### Cache configuration options + +#### `cache_enabled` + +**Type**: `bool` | **Default**: `false` + +Enable or disable the cache. All other cache settings are ignored when `false`. 
+ +#### `cache_backend` + +**Type**: `str` | **Default**: `"memory"` + +| Value | Description | Persists | Multi-replica | +|---|---|---|---| +| `memory` | In-process LRU dict | ❌ | ❌ | +| `sqlite` | File-based via `aiosqlite` | ✅ | ❌ | +| `redis` | Redis via `redis.asyncio` | ✅ | ✅ | + +Use `redis` when running multiple router replicas behind a load balancer — all replicas share one warm cache. + +#### `cache_similarity` + +**Type**: `float` | **Default**: `1.0` + +Cosine similarity threshold. `1.0` means exact match only (no embedding model needed). Values below `1.0` enable semantic matching, which requires the `:semantic` Docker image tag. + +Recommended starting value for semantic mode: `0.90`. + +#### `cache_ttl` + +**Type**: `int | null` | **Default**: `3600` + +Time-to-live for cache entries in seconds. Remove the key or set to `null` to cache forever. + +#### `cache_db_path` + +**Type**: `str` | **Default**: `"llm_cache.db"` + +Path to the SQLite cache database. Only used when `cache_backend: sqlite`. + +#### `cache_redis_url` + +**Type**: `str` | **Default**: `"redis://localhost:6379/0"` + +Redis connection URL. Only used when `cache_backend: redis`. + +#### `cache_history_weight` + +**Type**: `float` | **Default**: `0.3` + +Weight of the BM25-weighted chat-history embedding in the combined cache key vector. `0.3` means the history contributes 30% and the final user message contributes 70% of the similarity signal. Only used when `cache_similarity < 1.0`. 
+ +### Cache management endpoints + +| Endpoint | Method | Description | +|---|---|---| +| `/api/cache/stats` | `GET` | Hit/miss counters, hit rate, current config | +| `/api/cache/invalidate` | `POST` | Clear all cache entries and reset counters | + +```bash +# Check cache performance +curl http://localhost:12434/api/cache/stats + +# Clear the cache +curl -X POST http://localhost:12434/api/cache/invalidate +``` + +Example stats response: +```json +{ + "enabled": true, + "hits": 1547, + "misses": 892, + "hit_rate": 0.634, + "semantic": true, + "backend": "sqlite", + "similarity_threshold": 0.9, + "history_weight": 0.3 +} +``` + +### Docker image variants + +| Tag | Semantic cache | Image size | +|---|---|---| +| `latest` | ❌ exact match only | ~300 MB | +| `latest-semantic` | ✅ sentence-transformers + model pre-baked | ~800 MB | + +Build locally: +```bash +# Lean (exact match) +docker build -t nomyo-router . + +# Semantic (~500 MB larger, all-MiniLM-L6-v2 model baked in) +docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic . +``` + ## Configuration Validation The router validates the configuration at startup: diff --git a/doc/deployment.md b/doc/deployment.md index 0970a35..efff3ff 100644 --- a/doc/deployment.md +++ b/doc/deployment.md @@ -82,10 +82,23 @@ sudo systemctl status nomyo-router ## 2. Docker Deployment +### Image variants + +| Tag | Semantic cache | Image size | +|---|---|---| +| `latest` | ❌ exact match only | ~300 MB | +| `latest-semantic` | ✅ sentence-transformers + `all-MiniLM-L6-v2` pre-baked | ~800 MB | + +The `:semantic` variant enables `cache_similarity < 1.0` in `config.yaml`. The lean image falls back to exact-match caching with a warning if semantic mode is configured. + ### Build the Image ```bash +# Lean build (exact match cache, default) docker build -t nomyo-router . + +# Semantic build (~500 MB larger, all-MiniLM-L6-v2 model baked in at build time) +docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic . 
``` ### Run the Container diff --git a/doc/examples/docker-compose.yml b/doc/examples/docker-compose.yml index d940c73..ab34070 100644 --- a/doc/examples/docker-compose.yml +++ b/doc/examples/docker-compose.yml @@ -1,20 +1,30 @@ # Docker Compose example for NOMYO Router with multiple Ollama instances +# +# Two router profiles are provided: +# nomyo-router — lean image, exact-match cache only (~300 MB) +# nomyo-router-semantic — semantic image, sentence-transformers baked in (~800 MB) +# +# Uncomment the redis service and set cache_backend: redis in config.yaml +# to share the LLM response cache across multiple router replicas. version: '3.8' services: - # NOMYO Router + # NOMYO Router — lean image (exact-match cache, default) nomyo-router: image: nomyo-router:latest - build: . + build: + context: . + args: + SEMANTIC_CACHE: "false" ports: - "12434:12434" environment: - CONFIG_PATH=/app/config/config.yaml - - NOMYO_ROUTER_DB_PATH=/app/token_counts.db + - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db volumes: - ./config:/app/config - - router-db:/app/token_counts.db + - router-data:/app/data depends_on: - ollama1 - ollama2 @@ -23,6 +33,45 @@ services: networks: - nomyo-net + # NOMYO Router — semantic image (cache_similarity < 1.0 support, ~800 MB) + # Build: docker compose build nomyo-router-semantic + # Switch: comment out nomyo-router above, uncomment this block. + # nomyo-router-semantic: + # image: nomyo-router:semantic + # build: + # context: . 
+ # args: + # SEMANTIC_CACHE: "true" + # ports: + # - "12434:12434" + # environment: + # - CONFIG_PATH=/app/config/config.yaml + # - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db + # volumes: + # - ./config:/app/config + # - router-data:/app/data + # - hf-cache:/app/data/hf_cache # share HuggingFace model cache across builds + # depends_on: + # - ollama1 + # - ollama2 + # - ollama3 + # restart: unless-stopped + # networks: + # - nomyo-net + + # Optional: Redis for shared LLM response cache across multiple router replicas. + # Requires cache_backend: redis in config.yaml. + # redis: + # image: redis:7-alpine + # ports: + # - "6379:6379" + # volumes: + # - redis-data:/data + # command: redis-server --save 60 1 --loglevel warning + # restart: unless-stopped + # networks: + # - nomyo-net + # Ollama Instance 1 ollama1: image: ollama/ollama:latest @@ -87,7 +136,9 @@ services: - nomyo-net volumes: - router-db: + router-data: + # hf-cache: # uncomment when using nomyo-router-semantic + # redis-data: # uncomment when using Redis cache backend ollama1-data: ollama2-data: ollama3-data: diff --git a/doc/examples/sample-config.yaml b/doc/examples/sample-config.yaml index 72dd0ff..a1081af 100644 --- a/doc/examples/sample-config.yaml +++ b/doc/examples/sample-config.yaml @@ -29,4 +29,38 @@ api_keys: "http://192.168.0.52:11434": "ollama" "https://api.openai.com/v1": "${OPENAI_KEY}" "http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config - "http://192.168.0.33:8081/v1": "llama-server" \ No newline at end of file + "http://192.168.0.33:8081/v1": "llama-server" + +# ------------------------------------------------------------- +# Semantic LLM Cache (optional — disabled by default) +# Caches LLM responses to cut costs and latency on repeated or +# semantically similar prompts. +# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions +# MOE requests (moe-* model prefix) always bypass the cache. 
+# ------------------------------------------------------------- +# cache_enabled: false + +# Backend — where cached responses are stored: +# memory → in-process LRU (lost on restart, not shared across replicas) [default] +# sqlite → persistent file-based (single instance, survives restart) +# redis → distributed (shared across replicas, requires Redis) +# cache_backend: memory + +# Cosine similarity threshold for a cache hit: +# 1.0 → exact match only (works on any image variant) +# <1.0 → semantic matching (requires the :semantic Docker image tag) +# cache_similarity: 1.0 + +# Response TTL in seconds. Remove the key or set to null to cache forever. +# cache_ttl: 3600 + +# SQLite backend: path to the cache database file +# cache_db_path: llm_cache.db + +# Redis backend: connection URL +# cache_redis_url: redis://localhost:6379/0 + +# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding. +# 0.3 = 30% history context signal, 70% question signal. +# Only relevant when cache_similarity < 1.0. +# cache_history_weight: 0.3 \ No newline at end of file diff --git a/doc/monitoring.md b/doc/monitoring.md index 7f96def..b5bcbff 100644 --- a/doc/monitoring.md +++ b/doc/monitoring.md @@ -133,6 +133,39 @@ Response: } ``` +### Cache Statistics + +```bash +curl http://localhost:12434/api/cache/stats +``` + +Response when cache is enabled: +```json +{ + "enabled": true, + "hits": 1547, + "misses": 892, + "hit_rate": 0.634, + "semantic": true, + "backend": "sqlite", + "similarity_threshold": 0.9, + "history_weight": 0.3 +} +``` + +Response when cache is disabled: +```json +{ "enabled": false } +``` + +### Cache Invalidation + +```bash +curl -X POST http://localhost:12434/api/cache/invalidate +``` + +Clears all cached entries and resets hit/miss counters. 
+ ### Real-time Usage Stream ```bash diff --git a/requirements.txt b/requirements.txt index f1db419..a2739b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,3 +39,8 @@ uvicorn==0.38.0 uvloop yarl==1.20.1 aiosqlite +# Semantic LLM cache — base install (exact-match mode, no heavy ML deps) +# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch) +# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.: +# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0 +semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git diff --git a/router.py b/router.py index e6987e4..82bed12 100644 --- a/router.py +++ b/router.py @@ -123,6 +123,22 @@ class Config(BaseSettings): # Database configuration db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db")) + # Semantic LLM Cache configuration + cache_enabled: bool = Field(default=False) + # Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed) + cache_backend: str = Field(default="memory") + # Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image) + cache_similarity: float = Field(default=1.0) + # TTL in seconds; None = cache forever + cache_ttl: Optional[int] = Field(default=3600) + # SQLite backend: path to cache database file + cache_db_path: str = Field(default="llm_cache.db") + # Redis backend: connection URL + cache_redis_url: str = Field(default="redis://localhost:6379/0") + # Weight of BM25-weighted chat-history embedding vs last-user-message embedding + # 0.3 = 30% history context signal, 70% question signal + cache_history_weight: float = Field(default=0.3) + class Config: # Load from `config.yaml` first, then from env variables env_prefix = "NOMYO_ROUTER_" @@ -188,6 +204,7 @@ def _config_path_from_env() -> Path: from ollama._types import TokenLogprob, Logprob from db import TokenDatabase +from 
cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse # Create the global config object – it will be overwritten on startup @@ -1596,7 +1613,15 @@ async def proxy(request: Request): error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted." raise HTTPException(status_code=400, detail=error_msg) from e - + # Cache lookup — before endpoint selection so no slot is wasted on a hit + _cache = get_llm_cache() + if _cache is not None: + _cached = await _cache.get_generate(model, prompt, system or "") + if _cached is not None: + async def _serve_cached_generate(): + yield _cached + return StreamingResponse(_serve_cached_generate(), media_type="application/json") + endpoint, tracking_model = await choose_endpoint(model) use_openai = is_openai_compatible(endpoint) if use_openai: @@ -1633,6 +1658,7 @@ async def proxy(request: Request): else: async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive) if stream == True: + content_parts: list[str] = [] async for chunk in async_gen: if use_openai: chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts) @@ -1644,6 +1670,27 @@ async def proxy(request: Request): json_line = chunk.model_dump_json() else: json_line = orjson.dumps(chunk) + # Accumulate and store cache on done chunk — before yield so it always runs + if _cache is not None: + if getattr(chunk, "response", None): + content_parts.append(chunk.response) + if getattr(chunk, "done", False): + assembled = orjson.dumps({ + k: v for k, v in { + "model": getattr(chunk, "model", model), + "response": "".join(content_parts), + "done": True, + "done_reason": getattr(chunk, "done_reason", "stop") or "stop", + "prompt_eval_count": getattr(chunk, "prompt_eval_count", None), + "eval_count": getattr(chunk, "eval_count", None), + 
"total_duration": getattr(chunk, "total_duration", None), + "eval_duration": getattr(chunk, "eval_duration", None), + }.items() if v is not None + }) + b"\n" + try: + await _cache.set_generate(model, prompt, system or "", assembled) + except Exception as _ce: + print(f"[cache] set_generate (streaming) failed: {_ce}") yield json_line.encode("utf-8") + b"\n" else: if use_openai: @@ -1660,7 +1707,14 @@ async def proxy(request: Request): if hasattr(async_gen, "model_dump_json") else orjson.dumps(async_gen) ) - yield json_line.encode("utf-8") + b"\n" + cache_bytes = json_line.encode("utf-8") + b"\n" + yield cache_bytes + # Cache non-streaming response + if _cache is not None: + try: + await _cache.set_generate(model, prompt, system or "", cache_bytes) + except Exception as _ce: + print(f"[cache] set_generate (non-streaming) failed: {_ce}") finally: # Ensure counter is decremented even if an exception occurs @@ -1711,6 +1765,26 @@ async def chat_proxy(request: Request): except orjson.JSONDecodeError as e: raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e + # Cache lookup — before endpoint selection, always bypassed for MOE + _is_moe = model.startswith("moe-") + _cache = get_llm_cache() + # Normalise model name for cache key: strip ":latest" suffix here so that + # get_chat and set_chat use the same model string regardless of when the + # strip happens further down (line ~1793 strips it for OpenAI endpoints). + _cache_model = model[: -len(":latest")] if model.endswith(":latest") else model + # Snapshot original messages before any OpenAI-format transformation so that + # get_chat and set_chat always use the same key regardless of backend type. 
+ _cache_messages = messages + if _cache is not None and not _is_moe: + _cached = await _cache.get_chat("ollama_chat", _cache_model, messages) + if _cached is not None: + async def _serve_cached_chat(): + yield _cached + return StreamingResponse( + _serve_cached_chat(), + media_type="application/x-ndjson" if stream else "application/json", + ) + # 2. Endpoint logic if model.startswith("moe-"): model = model.split("moe-")[1] @@ -1764,6 +1838,7 @@ async def chat_proxy(request: Request): async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs) if stream == True: tc_acc = {} # accumulate OpenAI tool-call deltas across chunks + content_parts: list[str] = [] async for chunk in async_gen: if use_openai: _accumulate_openai_tc_delta(chunk, tc_acc) @@ -1780,6 +1855,30 @@ async def chat_proxy(request: Request): json_line = chunk.model_dump_json() else: json_line = orjson.dumps(chunk) + # Accumulate and store cache on done chunk — before yield so it always runs + # Works for both Ollama-native and OpenAI-compatible backends; chunks are + # already converted to Ollama format by rechunk before this point. 
+ if _cache is not None and not _is_moe: + if chunk.message and getattr(chunk.message, "content", None): + content_parts.append(chunk.message.content) + if getattr(chunk, "done", False): + assembled = orjson.dumps({ + k: v for k, v in { + "model": getattr(chunk, "model", model), + "created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)), + "message": {"role": "assistant", "content": "".join(content_parts)}, + "done": True, + "done_reason": getattr(chunk, "done_reason", "stop") or "stop", + "prompt_eval_count": getattr(chunk, "prompt_eval_count", None), + "eval_count": getattr(chunk, "eval_count", None), + "total_duration": getattr(chunk, "total_duration", None), + "eval_duration": getattr(chunk, "eval_duration", None), + }.items() if v is not None + }) + b"\n" + try: + await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled) + except Exception as _ce: + print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}") yield json_line.encode("utf-8") + b"\n" else: if use_openai: @@ -1796,7 +1895,14 @@ async def chat_proxy(request: Request): if hasattr(async_gen, "model_dump_json") else orjson.dumps(async_gen) ) - yield json_line.encode("utf-8") + b"\n" + cache_bytes = json_line.encode("utf-8") + b"\n" + yield cache_bytes + # Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends) + if _cache is not None and not _is_moe: + try: + await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes) + except Exception as _ce: + print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}") finally: # Ensure counter is decremented even if an exception occurs @@ -2680,6 +2786,21 @@ async def openai_chat_completions_proxy(request: Request): except orjson.JSONDecodeError as e: raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e + # Cache lookup — before endpoint selection + _cache = get_llm_cache() + if _cache is not None: + _cached 
= await _cache.get_chat("openai_chat", model, messages) + if _cached is not None: + if stream: + _sse = openai_nonstream_to_sse(_cached, model) + async def _serve_cached_ochat_stream(): + yield _sse + return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream") + else: + async def _serve_cached_ochat_json(): + yield _cached + return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json") + # 2. Endpoint logic endpoint, tracking_model = await choose_endpoint(model) base_url = ep2base(endpoint) @@ -2699,6 +2820,8 @@ async def openai_chat_completions_proxy(request: Request): else: raise if stream == True: + content_parts: list[str] = [] + usage_snapshot: dict = {} async for chunk in async_gen: data = ( chunk.model_dump_json() @@ -2715,6 +2838,8 @@ async def openai_chat_completions_proxy(request: Request): has_tool_calls = getattr(delta, "tool_calls", None) is not None if has_content or has_reasoning or has_tool_calls: yield f"data: {data}\n\n".encode("utf-8") + if has_content and delta.content: + content_parts.append(delta.content) elif chunk.usage is not None: # Forward the usage-only final chunk (e.g. 
from llama-server) yield f"data: {data}\n\n".encode("utf-8") @@ -2723,12 +2848,24 @@ async def openai_chat_completions_proxy(request: Request): if chunk.usage is not None: prompt_tok = chunk.usage.prompt_tokens or 0 comp_tok = chunk.usage.completion_tokens or 0 + usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok} else: llama_usage = rechunk.extract_usage_from_llama_timings(chunk) if llama_usage: prompt_tok, comp_tok = llama_usage if prompt_tok != 0 or comp_tok != 0: await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) + # Cache assembled streaming response — before [DONE] so it always runs + if _cache is not None and content_parts: + assembled = orjson.dumps({ + "model": model, + "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}], + **({"usage": usage_snapshot} if usage_snapshot else {}), + }) + b"\n" + try: + await _cache.set_chat("openai_chat", model, messages, assembled) + except Exception as _ce: + print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}") yield b"data: [DONE]\n\n" else: prompt_tok = 0 @@ -2747,7 +2884,14 @@ async def openai_chat_completions_proxy(request: Request): if hasattr(async_gen, "model_dump_json") else orjson.dumps(async_gen) ) - yield json_line.encode("utf-8") + b"\n" + cache_bytes = json_line.encode("utf-8") + b"\n" + yield cache_bytes + # Cache non-streaming response + if _cache is not None: + try: + await _cache.set_chat("openai_chat", model, messages, cache_bytes) + except Exception as _ce: + print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}") finally: # Ensure counter is decremented even if an exception occurs @@ -2823,6 +2967,22 @@ async def openai_completions_proxy(request: Request): except orjson.JSONDecodeError as e: raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e + # Cache lookup — completions prompt mapped to a single-turn 
messages list + _cache = get_llm_cache() + _compl_messages = [{"role": "user", "content": prompt}] + if _cache is not None: + _cached = await _cache.get_chat("openai_completions", model, _compl_messages) + if _cached is not None: + if stream: + _sse = openai_nonstream_to_sse(_cached, model) + async def _serve_cached_ocompl_stream(): + yield _sse + return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream") + else: + async def _serve_cached_ocompl_json(): + yield _cached + return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json") + # 2. Endpoint logic endpoint, tracking_model = await choose_endpoint(model) base_url = ep2base(endpoint) @@ -2834,6 +2994,8 @@ async def openai_completions_proxy(request: Request): # The chat method returns a generator of dicts (or GenerateResponse) async_gen = await oclient.completions.create(**params) if stream == True: + text_parts: list[str] = [] + usage_snapshot: dict = {} async for chunk in async_gen: data = ( chunk.model_dump_json() @@ -2849,6 +3011,8 @@ async def openai_completions_proxy(request: Request): ) if has_text or has_reasoning or choice.finish_reason is not None: yield f"data: {data}\n\n".encode("utf-8") + if has_text and choice.text: + text_parts.append(choice.text) elif chunk.usage is not None: # Forward the usage-only final chunk (e.g. 
from llama-server) yield f"data: {data}\n\n".encode("utf-8") @@ -2857,12 +3021,24 @@ async def openai_completions_proxy(request: Request): if chunk.usage is not None: prompt_tok = chunk.usage.prompt_tokens or 0 comp_tok = chunk.usage.completion_tokens or 0 + usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok} else: llama_usage = rechunk.extract_usage_from_llama_timings(chunk) if llama_usage: prompt_tok, comp_tok = llama_usage if prompt_tok != 0 or comp_tok != 0: await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) + # Cache assembled streaming response — before [DONE] so it always runs + if _cache is not None and text_parts: + assembled = orjson.dumps({ + "model": model, + "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}], + **({"usage": usage_snapshot} if usage_snapshot else {}), + }) + b"\n" + try: + await _cache.set_chat("openai_completions", model, _compl_messages, assembled) + except Exception as _ce: + print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}") # Final DONE event yield b"data: [DONE]\n\n" else: @@ -2882,7 +3058,14 @@ async def openai_completions_proxy(request: Request): if hasattr(async_gen, "model_dump_json") else orjson.dumps(async_gen) ) - yield json_line.encode("utf-8") + b"\n" + cache_bytes = json_line.encode("utf-8") + b"\n" + yield cache_bytes + # Cache non-streaming response + if _cache is not None: + try: + await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes) + except Exception as _ce: + print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}") finally: # Ensure counter is decremented even if an exception occurs @@ -3076,6 +3259,28 @@ async def rerank_proxy(request: Request): finally: await decrement_usage(endpoint, tracking_model) +# ------------------------------------------------------------- +# 25b. 
Cache management endpoints +# ------------------------------------------------------------- +@app.get("/api/cache/stats") +async def cache_stats(): + """Return hit/miss counters and configuration for the LLM response cache.""" + c = get_llm_cache() + if c is None: + return {"enabled": False} + return {"enabled": True, **c.stats()} + + +@app.post("/api/cache/invalidate") +async def cache_invalidate(): + """Clear all entries from the LLM response cache and reset counters.""" + c = get_llm_cache() + if c is None: + return {"enabled": False, "cleared": False} + await c.clear() + return {"enabled": True, "cleared": True} + + # ------------------------------------------------------------- # 26. Serve the static front‑end # ------------------------------------------------------------- @@ -3211,6 +3416,7 @@ async def startup_event() -> None: app_state["session"] = session token_worker_task = asyncio.create_task(token_worker()) flush_task = asyncio.create_task(flush_buffer()) + await init_llm_cache(config) @app.on_event("shutdown") async def shutdown_event() -> None: From e8b89814219409cb69fd40b9055732e4d1bd4efb Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Sun, 8 Mar 2026 09:26:53 +0100 Subject: [PATCH 2/5] doc: updated usage.md --- doc/usage.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/doc/usage.md b/doc/usage.md index 4912ce8..9e980af 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -79,6 +79,8 @@ For OpenAI API compatibility: | `/api/config` | GET | Endpoint configuration | | `/api/usage-stream` | GET | Real-time usage updates (SSE) | | `/health` | GET | Health check | +| `/api/cache/stats` | GET | Cache hit/miss counters and config | +| `/api/cache/invalidate` | POST | Clear all cache entries and counters | ## Making Requests @@ -147,6 +149,58 @@ The MOE system: 3. Selects the best response 4. 
Generates a final refined response +### Semantic LLM Cache + +The router can cache LLM responses and serve them instantly — bypassing endpoint selection, model loading, and token generation entirely. Cached responses work for both streaming and non-streaming clients. + +Enable it in `config.yaml`: + +```yaml +cache_enabled: true +cache_backend: sqlite # persists across restarts +cache_similarity: 0.9 # semantic matching (requires :semantic image) +cache_ttl: 3600 +``` + +For exact-match only (no extra dependencies): + +```yaml +cache_enabled: true +cache_backend: sqlite +cache_similarity: 1.0 +``` + +Check cache performance: + +```bash +curl http://localhost:12434/api/cache/stats +``` + +```json +{ + "enabled": true, + "hits": 1547, + "misses": 892, + "hit_rate": 0.634, + "semantic": true, + "backend": "sqlite", + "similarity_threshold": 0.9, + "history_weight": 0.3 +} +``` + +Clear the cache: + +```bash +curl -X POST http://localhost:12434/api/cache/invalidate +``` + +**Notes:** +- MOE requests (`moe-*` model prefix) always bypass the cache +- Cache is isolated per `model + system prompt` — different users with different system prompts cannot receive each other's cached responses +- Semantic matching requires the `:semantic` Docker image tag (`ghcr.io/nomyo-ai/nomyo-router:latest-semantic`) +- See [configuration.md](configuration.md#semantic-llm-cache) for all cache options + ### Token Tracking The router automatically tracks token usage: From a5108486e3d7e98d365d0fc2c775e47bc910d241 Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Sun, 8 Mar 2026 09:35:40 +0100 Subject: [PATCH 3/5] conf: clean default conf --- config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config.yaml b/config.yaml index 757873e..4d7a5e4 100644 --- a/config.yaml +++ b/config.yaml @@ -31,24 +31,24 @@ api_keys: # Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions # MOE requests (moe-* model prefix) always bypass the cache. 
# ------------------------------------------------------------- -cache_enabled: true +# cache_enabled: true # Backend — where cached responses are stored: # memory → in-process LRU (lost on restart, not shared across replicas) [default] # sqlite → persistent file-based (single instance, survives restart) # redis → distributed (shared across replicas, requires Redis) -cache_backend: memory +# cache_backend: memory # Cosine similarity threshold for a cache hit: # 1.0 → exact match only (works on any image variant) # <1.0 → semantic matching (requires the :semantic Docker image tag) -cache_similarity: 0.9 +# cache_similarity: 0.9 # Response TTL in seconds. Remove the key or set to null to cache forever. -cache_ttl: 3600 +# cache_ttl: 3600 # SQLite backend: path to the cache database file -cache_db_path: llm_cache.db +# cache_db_path: llm_cache.db # Redis backend: connection URL # cache_redis_url: redis://localhost:6379/0 From fbdc73eebb050383291c48ca711ec9182142ca7a Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Tue, 10 Mar 2026 15:19:37 +0100 Subject: [PATCH 4/5] fix: improvements, fixes and opt-in cache doc: semantic-cache.md added with detailed write-up --- cache.py | 154 ++++++++++++++++- doc/semantic-cache.md | 375 ++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 +- router.py | 85 ++++++++-- 4 files changed, 598 insertions(+), 18 deletions(-) create mode 100644 doc/semantic-cache.md diff --git a/cache.py b/cache.py index 219b1b9..b9db015 100644 --- a/cache.py +++ b/cache.py @@ -14,10 +14,16 @@ Strategy: - MOE models (moe-*) always bypass the cache. - Token counts are never recorded for cache hits. - Streaming cache hits are served as a single-chunk response. +- Privacy protection: responses that echo back user-identifying tokens from the system + prompt (names, emails, IDs) are stored WITHOUT an embedding. They remain findable + by exact-match for the same user but are invisible to cross-user semantic search. 
+ Generic responses (capital of France → "Paris") keep their embeddings and can still + produce cross-user semantic hits as intended. """ import hashlib import math +import re import time import warnings from collections import Counter @@ -162,6 +168,111 @@ class LLMCache: from semantic_llm_cache.utils import hash_prompt, normalize_prompt return hash_prompt(normalize_prompt(last_user), namespace) + # ------------------------------------------------------------------ + # Privacy helpers — prevent cross-user leakage of personal data + # ------------------------------------------------------------------ + + _IDENTITY_STOPWORDS = frozenset({ + "user", "users", "name", "names", "email", "phone", "their", "they", + "this", "that", "with", "from", "have", "been", "also", "more", + "tags", "identity", "preference", "context", + }) + # Patterns that unambiguously signal personal data in a response + _EMAIL_RE = re.compile(r'\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b') + _UUID_RE = re.compile( + r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b', + re.IGNORECASE, + ) + # Standalone numeric run ≥ 8 digits (common user/account IDs) + _NUMERIC_ID_RE = re.compile(r'\b\d{8,}\b') + + def _extract_response_content(self, response_bytes: bytes) -> str: + """Parse response bytes (Ollama or OpenAI format) and return the text content.""" + try: + import orjson + data = orjson.loads(response_bytes) + if "choices" in data: # OpenAI ChatCompletion + return (data["choices"][0].get("message") or {}).get("content", "") + if "message" in data: # Ollama chat + return (data.get("message") or {}).get("content", "") + if "response" in data: # Ollama generate + return data.get("response", "") + except Exception: + pass + return "" + + def _extract_personal_tokens(self, system: str) -> frozenset[str]: + """ + Extract user-identifying tokens from the system prompt. 
+ + Looks for: + - Email addresses anywhere in the system prompt + - Numeric IDs (≥ 4 digits) appearing after "id" keyword + - Proper-noun-like words from [Tags: identity] lines + """ + if not system: + return frozenset() + + tokens: set[str] = set() + + # Email addresses + for email in self._EMAIL_RE.findall(system): + tokens.add(email.lower()) + + # Numeric IDs: "id: 1234", "id=5678" + for uid in re.findall(r'\bid\s*[=:]?\s*(\d{4,})\b', system, re.IGNORECASE): + tokens.add(uid) + + # Values from [Tags: identity] lines (e.g. "User's name is Andreas") + for line in re.findall( + r'\[Tags:.*?identity.*?\]\s*(.+?)(?:\n|$)', system, re.IGNORECASE + ): + for word in re.findall(r'\b\w{4,}\b', line): + w = word.lower() + if w not in self._IDENTITY_STOPWORDS: + tokens.add(w) + + return frozenset(tokens) + + def _response_is_personalized(self, response_bytes: bytes, system: str) -> bool: + """ + Return True if the response contains user-personal information. + + Two complementary checks: + + 1. Direct PII detection in the response content — emails, UUIDs, long + numeric IDs. Catches data retrieved at runtime via tool calls that + never appears in the system prompt. + + 2. System-prompt token overlap — words extracted from [Tags: identity] + lines that reappear verbatim in the response (catches names, etc.). + + Such responses are stored WITHOUT a semantic embedding so they are + invisible to cross-user semantic search while still being cacheable for + the same user via exact-match. + """ + content = self._extract_response_content(response_bytes) + if not content: + # Can't parse → err on the side of caution + return bool(response_bytes) + + # 1. Direct PII patterns — independent of what's in the system prompt + if ( + self._EMAIL_RE.search(content) + or self._UUID_RE.search(content) + or self._NUMERIC_ID_RE.search(content) + ): + return True + + # 2. 
System-prompt identity tokens echoed back in the response + personal = self._extract_personal_tokens(system) + if personal: + content_lower = content.lower() + if any(token in content_lower for token in personal): + return True + + return False + def _parse_messages( self, messages: list[dict] ) -> tuple[str, list[dict], str]: @@ -242,10 +353,17 @@ class LLMCache: ns = self._namespace(route, model, system) key = self._cache_key(ns, last_user) + print( + f"[cache] get_chat route={route} model={model} ns={ns} " + f"prompt={last_user[:80]!r} " + f"system_snippet={system[:120]!r}" + ) + # 1. Exact key match entry = await self._backend.get(key) if entry is not None: self._hits += 1 + print(f"[cache] HIT (exact) ns={ns} prompt={last_user[:80]!r}") return entry.response # type: ignore[return-value] # 2. Semantic similarity match @@ -255,11 +373,16 @@ class LLMCache: emb, threshold=self._cfg.cache_similarity, namespace=ns ) if result is not None: - _, matched, _ = result + _, matched, sim = result self._hits += 1 + print( + f"[cache] HIT (semantic sim={sim:.3f}) ns={ns} " + f"prompt={last_user[:80]!r} matched={matched.prompt[:80]!r}" + ) return matched.response # type: ignore[return-value] self._misses += 1 + print(f"[cache] MISS ns={ns} prompt={last_user[:80]!r}") return None async def set_chat( @@ -276,9 +399,36 @@ class LLMCache: ns = self._namespace(route, model, system) key = self._cache_key(ns, last_user) + # Privacy guard: check whether the response contains personal data. + personalized = self._response_is_personalized(response_bytes, system) + + if personalized: + # Exact-match is only safe when the system prompt is user-specific + # (i.e. different per user → different namespace). When the system + # prompt is generic and shared across all users, the namespace is the + # same for everyone: storing even without an embedding would let any + # user who asks the identical question get another user's personal data + # via exact-match. 
In that case skip storage entirely. + system_is_user_specific = bool(self._extract_personal_tokens(system)) + if not system_is_user_specific: + print( + f"[cache] SKIP personalized response with generic system prompt " + f"route={route} model={model} ns={ns} prompt={last_user[:80]!r} " + f"system_snippet={system[:120]!r}" + ) + return + + print( + f"[cache] set_chat route={route} model={model} ns={ns} " + f"personalized={personalized} " + f"prompt={last_user[:80]!r} " + f"system_snippet={system[:120]!r}" + ) + # Store without embedding when personalized (invisible to semantic search + # across users, but still reachable by exact-match within this namespace). emb = ( await self._build_embedding(history, last_user) - if self._semantic and self._cfg.cache_similarity < 1.0 + if self._semantic and self._cfg.cache_similarity < 1.0 and not personalized else None ) diff --git a/doc/semantic-cache.md b/doc/semantic-cache.md new file mode 100644 index 0000000..3469922 --- /dev/null +++ b/doc/semantic-cache.md @@ -0,0 +1,375 @@ +# Semantic Cache + +NOMYO Router includes a built-in LLM semantic cache that can dramatically reduce inference costs and latency by reusing previous responses — either via exact-match or by finding semantically equivalent questions. + +## How It Works + +Every incoming request is checked against the cache before being forwarded to an LLM endpoint. On a cache hit the stored response is returned immediately, bypassing inference entirely. + +Cache isolation is strict: each combination of **route + model + system prompt** gets its own namespace (a 16-character SHA-256 prefix). A question asked under one system prompt can never leak into a different user's or route's namespace. 
+ +Two lookup strategies are available: + +| Mode | How it matches | When to use | +|---|---|---| +| **Exact match** | Normalized text hash | When you need 100% identical answers, zero false positives | +| **Semantic match** | Cosine similarity of sentence embeddings | When questions are paraphrased or phrased differently | + +In semantic mode the embedding is a weighted average of: +- The **last user message** (70% by default) — captures what is being asked +- A **BM25-weighted summary of the chat history** (30% by default) — provides conversation context + +This means "What is the capital of France?" and "Can you tell me the capital city of France?" will hit the same cache entry, but a question in a different conversation context will not. + +### MoE Model Bypass + +Models with names starting with `moe-` always bypass the cache entirely. Mixture-of-Experts models produce non-deterministic outputs by design and are not suitable for response caching. + +### Privacy Protection + +Responses that contain personally identifiable information (emails, UUIDs, numeric IDs ≥ 8 digits, or tokens extracted from `[Tags: identity]` lines in the system prompt) are stored **without a semantic embedding**. They remain reachable via exact-match within the same user-specific namespace but are invisible to cross-user semantic search. + +If a personalized response is generated with a generic (shared) system prompt, it is skipped entirely — never stored. + +--- + +## Docker Image Variants + +| Tag | Semantic cache | Approx. image size | Includes | +|---|---|---|---| +| `latest` | No (exact match only) | ~300 MB | Base router only | +| `latest-semantic` | Yes | ~800 MB | `sentence-transformers`, `torch`, `all-MiniLM-L6-v2` model baked in | + +Use the **`latest`** image when you only need exact-match deduplication. It has no heavy ML dependencies. + +Use **`latest-semantic`** when you want to catch paraphrased or semantically equivalent questions. 
The `all-MiniLM-L6-v2` model is baked into the image at build time so no internet access is needed at runtime. + +> **Note:** If you set `cache_similarity < 1.0` but use the `latest` image, the router falls back to exact-match caching and logs a warning. It will not error. + +Build locally: + +```bash +# Lean image (exact match only) +docker build -t nomyo-router . + +# Semantic image (~500 MB larger) +docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic . +``` + +--- + +## Dependencies + +### Lean image / bare-metal (exact match) + +No extra dependencies beyond the base `requirements.txt`. The `semantic-llm-cache` library provides exact-match storage with a no-op embedding provider. + +### Semantic image / bare-metal (semantic match) + +Additional heavy packages are required: + +| Package | Purpose | Approx. size | +|---|---|---| +| `sentence-transformers` | Sentence embedding model wrapper | ~100 MB | +| `torch` (CPU) | PyTorch inference backend | ~700 MB | +| `numpy` | Vector math for weighted mean embeddings | ~20 MB | + +Install for bare-metal semantic mode: + +```bash +pip install sentence-transformers torch --index-url https://download.pytorch.org/whl/cpu +``` + +The `all-MiniLM-L6-v2` model (~90 MB) is downloaded on first use (or baked in when using the `:semantic` Docker image). + +--- + +## Configuration + +All cache settings live in `config.yaml` and can be overridden with environment variables prefixed `NOMYO_ROUTER_`. 
+ +### Minimal — enable exact-match with in-memory backend + +```yaml +# config.yaml +cache_enabled: true +``` + +### Full reference + +```yaml +# config.yaml + +# Master switch — set to true to enable the cache +cache_enabled: false + +# Storage backend: "memory" | "sqlite" | "redis" +# memory — fast, in-process, lost on restart +# sqlite — persistent single-file, good for single-instance deployments +# redis — persistent, shared across multiple router instances +cache_backend: memory + +# Cosine similarity threshold for semantic matching +# 1.0 = exact match only (no sentence-transformers required) +# 0.95 = very close paraphrases only (recommended starting point) +# 0.85 = broader matching, higher false-positive risk +# Requires :semantic Docker image (or sentence-transformers installed) +cache_similarity: 1.0 + +# Time-to-live in seconds; null or omit to cache forever +cache_ttl: 3600 + +# SQLite backend: path to cache database file +cache_db_path: llm_cache.db + +# Redis backend: connection URL +cache_redis_url: redis://localhost:6379/0 + +# Weight of chat-history embedding in the combined query vector (0.0–1.0) +# 0.3 = 30% history context, 70% last user message (default) +# Increase if your use case is highly conversation-dependent +cache_history_weight: 0.3 +``` + +### Environment variable overrides + +```bash +NOMYO_ROUTER_CACHE_ENABLED=true +NOMYO_ROUTER_CACHE_BACKEND=sqlite +NOMYO_ROUTER_CACHE_SIMILARITY=0.95 +NOMYO_ROUTER_CACHE_TTL=7200 +NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db +NOMYO_ROUTER_CACHE_REDIS_URL=redis://redis:6379/0 +NOMYO_ROUTER_CACHE_HISTORY_WEIGHT=0.3 +``` + +### Per-request opt-in + +The cache is opted in per-request via the `nomyo.cache` field in the request body. The global `cache_enabled` setting must also be `true`. 
+ +```json +{ + "model": "llama3.2", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "nomyo": { + "cache": true + } +} +``` + +--- + +## Backend Comparison + +| | `memory` | `sqlite` | `redis` | +|---|---|---|---| +| Persistence | No (lost on restart) | Yes | Yes | +| Multi-instance | No | No | Yes | +| Setup required | None | None | Redis server | +| Recommended for | Development, testing | Single-instance production | Multi-instance / HA production | + +### SQLite — persistent single-instance + +```yaml +cache_enabled: true +cache_backend: sqlite +cache_db_path: /data/llm_cache.db +cache_ttl: 86400 # 24 hours +``` + +Mount the path in Docker: + +```bash +docker run -d \ + -v /path/to/data:/data \ + -e NOMYO_ROUTER_CACHE_ENABLED=true \ + -e NOMYO_ROUTER_CACHE_BACKEND=sqlite \ + -e NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db \ + ghcr.io/nomyo-ai/nomyo-router:latest +``` + +### Redis — distributed / multi-instance + +```yaml +cache_enabled: true +cache_backend: redis +cache_redis_url: redis://redis:6379/0 +cache_similarity: 0.95 +cache_ttl: 3600 +``` + +Docker Compose example: + +```yaml +services: + nomyo-router: + image: ghcr.io/nomyo-ai/nomyo-router:latest-semantic + ports: + - "12434:12434" + environment: + NOMYO_ROUTER_CACHE_ENABLED: "true" + NOMYO_ROUTER_CACHE_BACKEND: redis + NOMYO_ROUTER_CACHE_REDIS_URL: redis://redis:6379/0 + NOMYO_ROUTER_CACHE_SIMILARITY: "0.95" + depends_on: + - redis + + redis: + image: redis:7-alpine + volumes: + - redis_data:/data + +volumes: + redis_data: +``` + +--- + +## Code Examples + +### Python (OpenAI SDK) + +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:12434/v1", + api_key="unused", +) + +# First call — cache miss, hits the LLM +response = client.chat.completions.create( + model="llama3.2", + messages=[{"role": "user", "content": "What is the capital of France?"}], + extra_body={"nomyo": {"cache": True}}, +) +print(response.choices[0].message.content) 
+ +# Second call (exact same question) — cache hit, returned instantly +response = client.chat.completions.create( + model="llama3.2", + messages=[{"role": "user", "content": "What is the capital of France?"}], + extra_body={"nomyo": {"cache": True}}, +) +print(response.choices[0].message.content) + +# With semantic mode enabled, this also hits the cache: +response = client.chat.completions.create( + model="llama3.2", + messages=[{"role": "user", "content": "Tell me the capital city of France"}], + extra_body={"nomyo": {"cache": True}}, +) +print(response.choices[0].message.content) +``` + +### Python (raw HTTP / httpx) + +```python +import httpx + +payload = { + "model": "llama3.2", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Explain quantum entanglement in one sentence."}, + ], + "nomyo": {"cache": True}, +} + +resp = httpx.post("http://localhost:12434/v1/chat/completions", json=payload) +print(resp.json()["choices"][0]["message"]["content"]) +``` + +### curl + +```bash +curl -s http://localhost:12434/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama3.2", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "nomyo": {"cache": true} + }' | jq .choices[0].message.content +``` + +### Check cache statistics + +```bash +curl -s http://localhost:12434/api/cache/stats | jq . +``` + +```json +{ + "hits": 42, + "misses": 18, + "hit_rate": 0.7, + "semantic": true, + "backend": "sqlite", + "similarity_threshold": 0.95, + "history_weight": 0.3 +} +``` + +### Clear the cache + +```bash +curl -X POST http://localhost:12434/api/cache/clear +``` + +--- + +## Use Cases Where Semantic Caching Helps Most + +### FAQ and support bots + +Users ask the same questions in slightly different ways. "How do I reset my password?", "I forgot my password", "password reset instructions" — all semantically equivalent. 
Semantic caching collapses these into a single LLM call. + +**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: sqlite` or `redis`, long TTL. + +### Document Q&A / RAG pipelines + +When the same document set is queried repeatedly by many users, common factual questions ("What is the contract start date?", "When does this contract begin?") hit the cache across users — as long as the system prompt is not user-personalized. + +**Recommended settings:** `cache_similarity: 0.92`, `cache_backend: redis` for multi-user deployments. + +### Code generation with repeated patterns + +Development tools that generate boilerplate, explain errors, or convert code often receive nearly-identical prompts. Caching prevents re-running expensive large-model inference for the same error message phrased differently. + +**Recommended settings:** `cache_similarity: 0.93`, `cache_backend: sqlite`. + +### High-traffic chatbots with shared context + +Public-facing bots where many users share the same system prompt (same product, same persona). Generic factual answers can be safely reused across users. The privacy guard ensures personalized responses are never shared. + +**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: redis`. + +### Batch processing / data pipelines + +When running LLM inference over large datasets with many near-duplicate or repeated inputs, caching can reduce inference calls dramatically and make pipelines idempotent. + +**Recommended settings:** `cache_similarity: 0.95`, `cache_backend: sqlite`, `cache_ttl: null` (cache forever for deterministic batch runs). 
+
+---
+
+## Tuning the Similarity Threshold
+
+| `cache_similarity` | Behaviour | Risk |
+|---|---|---|
+| `1.0` | Exact match only | No false positives |
+| `0.97` | Catches minor typos and punctuation differences | Very low |
+| `0.95` | Catches clear paraphrases (recommended default) | Low |
+| `0.90` | Catches broader rewordings | Moderate — may match different intents |
+| `< 0.85` | Very aggressive matching | High false-positive rate, not recommended |
+
+Start at `0.95` and lower gradually while monitoring cache hit rate via `/api/cache/stats`.
+
+---
+
+## Limitations
+
+- **Cached responses** served to a streaming request arrive as a single chunk (not token-by-token). This is indistinguishable to most clients.
+- **MoE models** (`moe-*` prefix) are never cached.
+- **Token counts** are not recorded for cache hits (the LLM was not called).
+- The `memory` backend does not survive a router restart.
+- Semantic search requires the `:semantic` image (or local `sentence-transformers` install); without it, `cache_similarity < 1.0` falls back to exact match and logs a warning.
diff --git a/requirements.txt b/requirements.txt index a2739b0..873701c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,4 +43,4 @@ aiosqlite # For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch) # SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.: # semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0 -semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git +semantic-llm-cache@git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v0.1 diff --git a/router.py b/router.py index 82bed12..43ac488 100644 --- a/router.py +++ b/router.py @@ -1600,7 +1600,8 @@ async def proxy(request: Request): images = payload.get("images") options = payload.get("options") keep_alive = payload.get("keep_alive") - + _cache_enabled = payload.get("nomyo", {}).get("cache", False) + if not model: raise HTTPException( status_code=400, detail="Missing required field 'model'" @@ -1615,7 +1616,7 @@ async def proxy(request: Request): # Cache lookup — before endpoint selection so no slot is wasted on a hit _cache = get_llm_cache() - if _cache is not None: + if _cache is not None and _cache_enabled: _cached = await _cache.get_generate(model, prompt, system or "") if _cached is not None: async def _serve_cached_generate(): @@ -1671,7 +1672,7 @@ async def proxy(request: Request): else: json_line = orjson.dumps(chunk) # Accumulate and store cache on done chunk — before yield so it always runs - if _cache is not None: + if _cache is not None and _cache_enabled: if getattr(chunk, "response", None): content_parts.append(chunk.response) if getattr(chunk, "done", False): @@ -1710,7 +1711,7 @@ async def proxy(request: Request): cache_bytes = json_line.encode("utf-8") + b"\n" yield cache_bytes # Cache non-streaming response - if _cache is not None: + if _cache is not None and _cache_enabled: try: await _cache.set_generate(model, prompt, system or "", 
cache_bytes) except Exception as _ce: @@ -1749,6 +1750,7 @@ async def chat_proxy(request: Request): options = payload.get("options") logprobs = payload.get("logprobs") top_logprobs = payload.get("top_logprobs") + _cache_enabled = payload.get("nomyo", {}).get("cache", False) if not model: raise HTTPException( @@ -1775,7 +1777,7 @@ async def chat_proxy(request: Request): # Snapshot original messages before any OpenAI-format transformation so that # get_chat and set_chat always use the same key regardless of backend type. _cache_messages = messages - if _cache is not None and not _is_moe: + if _cache is not None and not _is_moe and _cache_enabled: _cached = await _cache.get_chat("ollama_chat", _cache_model, messages) if _cached is not None: async def _serve_cached_chat(): @@ -1858,7 +1860,7 @@ async def chat_proxy(request: Request): # Accumulate and store cache on done chunk — before yield so it always runs # Works for both Ollama-native and OpenAI-compatible backends; chunks are # already converted to Ollama format by rechunk before this point. 
- if _cache is not None and not _is_moe: + if _cache is not None and not _is_moe and _cache_enabled: if chunk.message and getattr(chunk.message, "content", None): content_parts.append(chunk.message.content) if getattr(chunk, "done", False): @@ -1898,7 +1900,7 @@ async def chat_proxy(request: Request): cache_bytes = json_line.encode("utf-8") + b"\n" yield cache_bytes # Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends) - if _cache is not None and not _is_moe: + if _cache is not None and not _is_moe and _cache_enabled: try: await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes) except Exception as _ce: @@ -2746,6 +2748,7 @@ async def openai_chat_completions_proxy(request: Request): tools = payload.get("tools") logprobs = payload.get("logprobs") top_logprobs = payload.get("top_logprobs") + _cache_enabled = payload.get("nomyo", {}).get("cache", False) if not model: raise HTTPException( @@ -2786,9 +2789,20 @@ async def openai_chat_completions_proxy(request: Request): except orjson.JSONDecodeError as e: raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e + # Reject unsupported image formats (SVG) before doing any work + for _msg in messages: + for _item in (_msg.get("content") or []) if isinstance(_msg.get("content"), list) else []: + if _item.get("type") == "image_url": + _url = (_item.get("image_url") or {}).get("url", "") + if _url.startswith("data:image/svg") or _url.lower().endswith(".svg"): + raise HTTPException( + status_code=400, + detail="SVG images are not supported. 
Please convert the image to PNG or JPEG before sending.", + ) + # Cache lookup — before endpoint selection _cache = get_llm_cache() - if _cache is not None: + if _cache is not None and _cache_enabled: _cached = await _cache.get_chat("openai_chat", model, messages) if _cached is not None: if stream: @@ -2806,16 +2820,56 @@ async def openai_chat_completions_proxy(request: Request): base_url = ep2base(endpoint) oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key")) # 3. Async generator that streams completions data and decrements the counter + async def _normalize_images_in_messages(msgs: list) -> list: + """Fetch remote image URLs and convert them to base64 data URLs so + Ollama/llama-server can handle them without making outbound HTTP requests.""" + resolved = [] + for msg in msgs: + content = msg.get("content") + if not isinstance(content, list): + resolved.append(msg) + continue + new_content = [] + for item in content: + if item.get("type") == "image_url": + url = (item.get("image_url") or {}).get("url", "") + if url and not url.startswith("data:"): + try: + http: aiohttp.ClientSession = app_state["session"] + async with http.get(url) as resp: + ctype = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip() + img_bytes = await resp.read() + b64 = base64.b64encode(img_bytes).decode("utf-8") + new_content.append({ + "type": "image_url", + "image_url": {"url": f"data:{ctype};base64,{b64}"} + }) + except Exception as _ie: + print(f"[image] Failed to fetch image URL: {_ie}") + new_content.append(item) + else: + new_content.append(item) + else: + new_content.append(item) + resolved.append({**msg, "content": new_content}) + return resolved + async def stream_ochat_response(): try: # The chat method returns a generator of dicts (or GenerateResponse) try: - async_gen = await oclient.chat.completions.create(**params) + # For non-external endpoints (Ollama, llama-server), resolve remote 
+ # image URLs to base64 data URLs so the server can handle them locally. + send_params = params + if not is_ext_openai_endpoint(endpoint): + resolved_msgs = await _normalize_images_in_messages(params.get("messages", [])) + send_params = {**params, "messages": resolved_msgs} + async_gen = await oclient.chat.completions.create(**send_params) except openai.BadRequestError as e: # If tools are not supported by the model, retry without tools if "does not support tools" in str(e): print(f"[openai_chat_completions_proxy] Model {model} doesn't support tools, retrying without tools") - params_without_tools = {k: v for k, v in params.items() if k != "tools"} + params_without_tools = {k: v for k, v in send_params.items() if k != "tools"} async_gen = await oclient.chat.completions.create(**params_without_tools) else: raise @@ -2856,7 +2910,7 @@ async def openai_chat_completions_proxy(request: Request): if prompt_tok != 0 or comp_tok != 0: await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) # Cache assembled streaming response — before [DONE] so it always runs - if _cache is not None and content_parts: + if _cache is not None and _cache_enabled and content_parts: assembled = orjson.dumps({ "model": model, "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}], @@ -2887,7 +2941,7 @@ async def openai_chat_completions_proxy(request: Request): cache_bytes = json_line.encode("utf-8") + b"\n" yield cache_bytes # Cache non-streaming response - if _cache is not None: + if _cache is not None and _cache_enabled: try: await _cache.set_chat("openai_chat", model, messages, cache_bytes) except Exception as _ce: @@ -2930,6 +2984,7 @@ async def openai_completions_proxy(request: Request): max_tokens = payload.get("max_tokens") max_completion_tokens = payload.get("max_completion_tokens") suffix = payload.get("suffix") + _cache_enabled = payload.get("nomyo", {}).get("cache", False) if not model: raise 
HTTPException( @@ -2970,7 +3025,7 @@ async def openai_completions_proxy(request: Request): # Cache lookup — completions prompt mapped to a single-turn messages list _cache = get_llm_cache() _compl_messages = [{"role": "user", "content": prompt}] - if _cache is not None: + if _cache is not None and _cache_enabled: _cached = await _cache.get_chat("openai_completions", model, _compl_messages) if _cached is not None: if stream: @@ -3029,7 +3084,7 @@ async def openai_completions_proxy(request: Request): if prompt_tok != 0 or comp_tok != 0: await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) # Cache assembled streaming response — before [DONE] so it always runs - if _cache is not None and text_parts: + if _cache is not None and _cache_enabled and text_parts: assembled = orjson.dumps({ "model": model, "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}], @@ -3061,7 +3116,7 @@ async def openai_completions_proxy(request: Request): cache_bytes = json_line.encode("utf-8") + b"\n" yield cache_bytes # Cache non-streaming response - if _cache is not None: + if _cache is not None and _cache_enabled: try: await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes) except Exception as _ce: From 46da392a5317d007e15b5811fb10189096be00ce Mon Sep 17 00:00:00 2001 From: alpha-nerd-nomyo Date: Wed, 11 Mar 2026 09:40:00 +0100 Subject: [PATCH 5/5] fix: semcache version pinned --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 873701c..9e89c90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -41,6 +41,4 @@ yarl==1.20.1 aiosqlite # Semantic LLM cache — base install (exact-match mode, no heavy ML deps) # For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch) -# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.: -# semantic-llm-cache @ 
git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0 semantic-llm-cache@git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v0.1