diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..86dfdea --- /dev/null +++ b/.dockerignore @@ -0,0 +1,44 @@ +# Version control +.git +.gitignore +.github + +# Environment & secrets +.env +.env.* +*.env + +# Python artifacts +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +.venv +venv +*.egg-info +dist +build + +# Local databases (don't bake data into image) +*.db +*.db-shm +*.db-wal + +# IDE / editor +.vscode +.idea +*.swp +*.swo + +# Documentation +doc/ +docs/ +*.md + +# Tests +tests/ +test_* + +# Local config overrides +config.local.yaml diff --git a/.github/workflows/docker-publish-semantic.yml b/.github/workflows/docker-publish-semantic.yml new file mode 100644 index 0000000..00ad041 --- /dev/null +++ b/.github/workflows/docker-publish-semantic.yml @@ -0,0 +1,71 @@ +name: Build and Publish Docker Image (Semantic Cache) + +# Builds the :semantic variant that includes sentence-transformers + CPU torch +# and the pre-baked all-MiniLM-L6-v2 embedding model (~500 MB larger than lean). 
+# Tags mirror the lean workflow but carry a -semantic suffix, e.g.: +# ghcr.io/nomyo-ai/nomyo-router:latest-semantic +# ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic +# ghcr.io/nomyo-ai/nomyo-router:0.7-semantic + +on: + push: + branches: + - main + tags: + - "v*.*.*" + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + build-and-push-semantic: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up QEMU (for multi-arch builds) + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + # Versioned semantic tags on git tags (e.g. v0.7.0 → 0.7.0-semantic, 0.7-semantic) + type=semver,pattern={{version}}-semantic + type=semver,pattern={{major}}.{{minor}}-semantic + # latest-semantic only on main branch pushes + type=raw,value=latest-semantic,enable=${{ github.ref == 'refs/heads/main' }} + # SHA-tagged for traceability + type=sha,prefix=sha-,suffix=-semantic + + - name: Build and push semantic Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ platforms: linux/amd64,linux/arm64 + push: true + build-args: | + SEMANTIC_CACHE=true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/Dockerfile b/Dockerfile index 073496d..75ee26f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,21 +3,43 @@ FROM python:3.13-slim ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 +# SEMANTIC_CACHE=true installs sentence-transformers + CPU-only torch and pre-bakes +# the all-MiniLM-L6-v2 embedding model (~500 MB extra). The resulting image is tagged +# :semantic. The default (lean) image supports exact-match caching only. +ARG SEMANTIC_CACHE=false + +# Pin HuggingFace cache to a predictable path inside /app/data so it can be +# mounted as a volume and shared between builds. +ENV HF_HOME=/app/data/hf_cache + # Install SQLite -RUN apt-get update && apt-get install -y sqlite3 +RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 \ + && rm -rf /var/lib/apt/lists/* WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir -r requirements.txt +# Semantic cache deps — only installed when SEMANTIC_CACHE=true +# CPU-only torch must be installed before sentence-transformers to avoid +# pulling the full CUDA-enabled build (~2.5 GB). +RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \ + pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \ + pip install --no-cache-dir sentence-transformers && \ + python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \ + fi + # Create database directory and set permissions RUN mkdir -p /app/data && chown -R www-data:www-data /app/data COPY . . 
-RUN chmod +x /app/entrypoint.sh +RUN chmod +x /app/entrypoint.sh && \ + chown -R www-data:www-data /app EXPOSE 12434 +USER www-data + ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/README.md b/README.md index 744a86f..0a3b341 100644 --- a/README.md +++ b/README.md @@ -74,22 +74,28 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop ### Pre-built image (GitHub Container Registry) -Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release: +Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release. +**Lean image** (exact-match cache, ~300 MB): ```sh docker pull ghcr.io/nomyo-ai/nomyo-router:latest -``` - -Specific version: - -```sh docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0 ``` -### Build the container image locally: +**Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB): +```sh +docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic +docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic +``` + +### Build the container image locally ```sh +# Lean build (exact match cache, default) docker build -t nomyo-router . + +# Semantic build — sentence-transformers + model baked in +docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic . ``` Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container: @@ -124,6 +130,53 @@ This way the Ollama backend servers are utilized more efficient than by simply u NOMYO Router also supports OpenAI API compatible v1 backend servers. +## Semantic LLM Cache + +NOMYO Router includes an optional semantic cache that serves repeated or semantically similar LLM requests from cache — no endpoint round-trip, no token cost, response in <10 ms. 
+ +### Enable (exact match, any image) + +```yaml +# config.yaml +cache_enabled: true +cache_backend: sqlite # persists across restarts +cache_similarity: 1.0 # exact match only +cache_ttl: 3600 +``` + +### Enable (semantic matching, :semantic image) + +```yaml +cache_enabled: true +cache_backend: sqlite +cache_similarity: 0.90 # "What is Python?" ≈ "What's Python?" → cache hit +cache_ttl: 3600 +cache_history_weight: 0.3 +``` + +Pull the semantic image: +```bash +docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic +``` + +### Cache key strategy + +Each request is keyed on `model + system_prompt` (exact) combined with a weighted-mean embedding of BM25-weighted chat history (30%) and the last user message (70%). This means: +- Different system prompts → always separate cache namespaces (no cross-tenant leakage) +- Same question, different phrasing → cache hit (semantic mode) +- MOE requests (`moe-*`) → always bypass the cache + +### Cached routes + +`/api/chat` · `/api/generate` · `/v1/chat/completions` · `/v1/completions` + +### Cache management + +```bash +curl http://localhost:12434/api/cache/stats # hit rate, counters, config +curl -X POST http://localhost:12434/api/cache/invalidate # clear all entries +``` + ## Supplying the router API key If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key: diff --git a/cache.py b/cache.py new file mode 100644 index 0000000..219b1b9 --- /dev/null +++ b/cache.py @@ -0,0 +1,407 @@ +""" +LLM Semantic Cache for NOMYO Router. + +Strategy: +- Namespace: sha256(route :: model :: system_prompt)[:16] — exact context isolation +- Cache key: hash(normalize(last_user_message), namespace) — exact lookup +- Embedding: weighted mean of + α * embed(bm25_weighted(chat_history)) — conversation context + 1-α * embed(last_user_message) — the actual question + with α = cache_history_weight (default 0.3). 
+- Exact-match caching (similarity=1.0) uses DummyEmbeddingProvider — zero extra deps. +- Semantic caching (similarity<1.0) requires sentence-transformers. If missing the + library falls back to exact-match with a warning (lean Docker image behaviour). +- MOE models (moe-*) always bypass the cache. +- Token counts are never recorded for cache hits. +- Streaming cache hits are served as a single-chunk response. +""" + +import hashlib +import math +import time +import warnings +from collections import Counter +from typing import Any, Optional + +# Lazily resolved once at first embed() call +_semantic_available: Optional[bool] = None + + +def _check_sentence_transformers() -> bool: + global _semantic_available + if _semantic_available is None: + try: + import sentence_transformers # noqa: F401 + _semantic_available = True + except ImportError: + _semantic_available = False + return _semantic_available # type: ignore[return-value] + + +# --------------------------------------------------------------------------- +# BM25-weighted text representation of chat history +# --------------------------------------------------------------------------- + +def _bm25_weighted_text(history: list[dict]) -> str: + """ + Produce a BM25-importance-weighted text string from chat history turns. + + High-IDF (rare, domain-specific) terms are repeated proportionally to + their BM25 score so the downstream sentence-transformer embedding + naturally upweights topical signal and downweights stop words. 
+ """ + docs = [m.get("content", "") for m in history if m.get("content")] + if not docs: + return "" + + def _tok(text: str) -> list[str]: + return [w.lower() for w in text.split() if len(w) > 2] + + tokenized = [_tok(d) for d in docs] + N = len(tokenized) + + df: Counter = Counter() + for tokens in tokenized: + for term in set(tokens): + df[term] += 1 + + k1, b = 1.5, 0.75 + avg_dl = sum(len(t) for t in tokenized) / max(N, 1) + + term_scores: Counter = Counter() + for tokens in tokenized: + tf_c = Counter(tokens) + dl = len(tokens) + for term, tf in tf_c.items(): + idf = math.log((N + 1) / (df[term] + 1)) + 1.0 + score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1))) + term_scores[term] += score + + top = term_scores.most_common(50) + if not top: + return " ".join(docs) + + max_s = top[0][1] + out: list[str] = [] + for term, score in top: + out.extend([term] * max(1, round(3 * score / max_s))) + return " ".join(out) + + +# --------------------------------------------------------------------------- +# LLMCache +# --------------------------------------------------------------------------- + +class LLMCache: + """ + Thin async wrapper around async-semantic-llm-cache that adds: + - Route-aware namespace isolation + - Two-vector weighted-mean embedding (history context + question) + - Per-instance hit/miss counters + - Graceful fallback when sentence-transformers is absent + """ + + def __init__(self, cfg: Any) -> None: + self._cfg = cfg + self._backend: Any = None + self._emb_cache: Any = None + self._semantic: bool = False + self._hits: int = 0 + self._misses: int = 0 + + async def init(self) -> None: + from semantic_llm_cache.similarity import EmbeddingCache + + # --- Backend --- + backend_type: str = self._cfg.cache_backend + if backend_type == "sqlite": + from semantic_llm_cache.backends.sqlite import SQLiteBackend + self._backend = SQLiteBackend(db_path=self._cfg.cache_db_path) + elif backend_type == "redis": + from 
semantic_llm_cache.backends.redis import RedisBackend + self._backend = RedisBackend(url=self._cfg.cache_redis_url) + await self._backend.ping() + else: + from semantic_llm_cache.backends.memory import MemoryBackend + self._backend = MemoryBackend() + + # --- Embedding provider --- + if self._cfg.cache_similarity < 1.0: + if _check_sentence_transformers(): + from semantic_llm_cache.similarity import create_embedding_provider + provider = create_embedding_provider("sentence-transformer") + self._emb_cache = EmbeddingCache(provider=provider) + self._semantic = True + print( + f"[cache] Semantic cache ready " + f"(similarity≥{self._cfg.cache_similarity}, backend={backend_type})" + ) + else: + warnings.warn( + "[cache] sentence-transformers is not installed. " + "Falling back to exact-match caching (similarity=1.0). " + "Use the :semantic Docker image tag to enable semantic caching.", + RuntimeWarning, + stacklevel=2, + ) + self._emb_cache = EmbeddingCache() # DummyEmbeddingProvider + print(f"[cache] Exact-match cache ready (backend={backend_type}) [semantic unavailable]") + else: + self._emb_cache = EmbeddingCache() # DummyEmbeddingProvider + print(f"[cache] Exact-match cache ready (backend={backend_type})") + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _namespace(self, route: str, model: str, system: str) -> str: + raw = f"{route}::{model}::{system}" + return hashlib.sha256(raw.encode()).hexdigest()[:16] + + def _cache_key(self, namespace: str, last_user: str) -> str: + from semantic_llm_cache.utils import hash_prompt, normalize_prompt + return hash_prompt(normalize_prompt(last_user), namespace) + + def _parse_messages( + self, messages: list[dict] + ) -> tuple[str, list[dict], str]: + """ + Returns (system_prompt, prior_history_turns, last_user_message). + Multimodal content lists are reduced to their text parts. 
+ """ + system = "" + turns: list[dict] = [] + + for m in messages: + role = m.get("role", "") + content = m.get("content", "") + if isinstance(content, list): + content = " ".join( + p.get("text", "") + for p in content + if isinstance(p, dict) and p.get("type") == "text" + ) + if role == "system": + system = content + else: + turns.append({"role": role, "content": content}) + + last_user = "" + for m in reversed(turns): + if m["role"] == "user": + last_user = m["content"] + break + + # History = all turns before the final user message + history = turns[:-1] if turns and turns[-1]["role"] == "user" else turns + return system, history, last_user + + async def _build_embedding( + self, history: list[dict], last_user: str + ) -> list[float] | None: + """ + Weighted mean of BM25-weighted history embedding and last-user embedding. + Returns None when not in semantic mode. + """ + if not self._semantic: + return None + + import numpy as np + + alpha: float = self._cfg.cache_history_weight # weight for history signal + q_vec = np.array(await self._emb_cache.aencode(last_user), dtype=float) + + if not history: + # No history → use question embedding alone (alpha has no effect) + return q_vec.tolist() + + h_text = _bm25_weighted_text(history) + h_vec = np.array(await self._emb_cache.aencode(h_text), dtype=float) + + combined = alpha * h_vec + (1.0 - alpha) * q_vec + norm = float(np.linalg.norm(combined)) + if norm > 0.0: + combined /= norm + return combined.tolist() + + # ------------------------------------------------------------------ + # Public interface: chat (handles both Ollama and OpenAI message lists) + # ------------------------------------------------------------------ + + async def get_chat( + self, route: str, model: str, messages: list[dict] + ) -> bytes | None: + """Return cached response bytes, or None on miss.""" + if not self._backend: + return None + + system, history, last_user = self._parse_messages(messages) + if not last_user: + return None + + ns = 
self._namespace(route, model, system) + key = self._cache_key(ns, last_user) + + # 1. Exact key match + entry = await self._backend.get(key) + if entry is not None: + self._hits += 1 + return entry.response # type: ignore[return-value] + + # 2. Semantic similarity match + if self._semantic and self._cfg.cache_similarity < 1.0: + emb = await self._build_embedding(history, last_user) + result = await self._backend.find_similar( + emb, threshold=self._cfg.cache_similarity, namespace=ns + ) + if result is not None: + _, matched, _ = result + self._hits += 1 + return matched.response # type: ignore[return-value] + + self._misses += 1 + return None + + async def set_chat( + self, route: str, model: str, messages: list[dict], response_bytes: bytes + ) -> None: + """Store a response in the cache (fire-and-forget friendly).""" + if not self._backend: + return + + system, history, last_user = self._parse_messages(messages) + if not last_user: + return + + ns = self._namespace(route, model, system) + key = self._cache_key(ns, last_user) + + emb = ( + await self._build_embedding(history, last_user) + if self._semantic and self._cfg.cache_similarity < 1.0 + else None + ) + + from semantic_llm_cache.config import CacheEntry + + await self._backend.set( + key, + CacheEntry( + prompt=last_user, + response=response_bytes, + embedding=emb, + created_at=time.time(), + ttl=self._cfg.cache_ttl, + namespace=ns, + hit_count=0, + ), + ) + + # ------------------------------------------------------------------ + # Convenience wrappers for the generate route (prompt string, not messages) + # ------------------------------------------------------------------ + + async def get_generate( + self, model: str, prompt: str, system: str = "" + ) -> bytes | None: + messages: list[dict] = [] + if system: + messages.append({"role": "system", "content": system}) + messages.append({"role": "user", "content": prompt}) + return await self.get_chat("generate", model, messages) + + async def set_generate( + 
self, model: str, prompt: str, system: str, response_bytes: bytes + ) -> None: + messages: list[dict] = [] + if system: + messages.append({"role": "system", "content": system}) + messages.append({"role": "user", "content": prompt}) + await self.set_chat("generate", model, messages, response_bytes) + + # ------------------------------------------------------------------ + # Management + # ------------------------------------------------------------------ + + def stats(self) -> dict: + total = self._hits + self._misses + return { + "hits": self._hits, + "misses": self._misses, + "hit_rate": round(self._hits / total, 3) if total else 0.0, + "semantic": self._semantic, + "backend": self._cfg.cache_backend, + "similarity_threshold": self._cfg.cache_similarity, + "history_weight": self._cfg.cache_history_weight, + } + + async def clear(self) -> None: + if self._backend: + await self._backend.clear() + self._hits = 0 + self._misses = 0 + + +# --------------------------------------------------------------------------- +# Module-level singleton +# --------------------------------------------------------------------------- + +_cache: LLMCache | None = None + + +async def init_llm_cache(cfg: Any) -> LLMCache | None: + """Initialise the module-level cache singleton. Returns None if disabled.""" + global _cache + if not cfg.cache_enabled: + print("[cache] Cache disabled (cache_enabled=false).") + return None + _cache = LLMCache(cfg) + await _cache.init() + return _cache + + +def get_llm_cache() -> LLMCache | None: + return _cache + + +# --------------------------------------------------------------------------- +# Helper: convert a stored Ollama-format non-streaming response to an +# OpenAI SSE single-chunk stream (used when a streaming OpenAI request +# hits the cache whose entry was populated from a non-streaming response). 
+# ---------------------------------------------------------------------------
+
+def openai_nonstream_to_sse(cached_bytes: bytes, model: str) -> bytes:
+    """
+    Wrap a stored OpenAI ChatCompletion JSON as a minimal single-chunk SSE stream.
+
+    The stored entry always uses the non-streaming ChatCompletion format so that
+    non-streaming cache hits can be served directly; this function adapts it for
+    streaming clients.
+
+    On a corrupt/undecodable entry a bare ``data: [DONE]`` stream is returned
+    (best-effort: a cache problem must never fail the client request).
+    """
+    # Local imports keep the heavy/optional deps out of module import time.
+    import orjson
+    import time as _time
+
+    try:
+        d = orjson.loads(cached_bytes)
+        # `content` may be stored as JSON null (e.g. tool-call responses);
+        # coerce to "" so the SSE delta never carries None.
+        content = (d.get("choices") or [{}])[0].get("message", {}).get("content") or ""
+        chunk = {
+            "id": d.get("id", "cache-hit"),
+            "object": "chat.completion.chunk",
+            "created": d.get("created", int(_time.time())),
+            "model": d.get("model", model),
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"role": "assistant", "content": content},
+                    "finish_reason": "stop",
+                }
+            ],
+        }
+        if d.get("usage"):
+            chunk["usage"] = d["usage"]
+        return f"data: {orjson.dumps(chunk).decode()}\n\ndata: [DONE]\n\n".encode()
+    except Exception as exc:
+        warnings.warn(
+            f"[cache] openai_nonstream_to_sse: corrupt cache entry, returning empty stream: {exc}",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        return b"data: [DONE]\n\n"
diff --git a/config.yaml b/config.yaml
index 752ccec..757873e 100644
--- a/config.yaml
+++ b/config.yaml
@@ -6,7 +6,7 @@ endpoints:
   - https://api.openai.com/v1
 
 llama_server_endpoints:
-  - http://192.168.0.33:8889/v1
+  - http://192.168.0.50:8889/v1
 
 # Maximum concurrent connections *per endpoint‑model pair* (equals to OLLAMA_NUM_PARALLEL)
 max_concurrent_connections: 2
@@ -22,4 +22,38 @@ api_keys:
   "http://192.168.0.51:11434": "ollama"
   "http://192.168.0.52:11434": "ollama"
   "https://api.openai.com/v1": "${OPENAI_KEY}"
-  "http://192.168.0.33:8889/v1": "llama"
\ No newline at end of file
+  "http://192.168.0.50:8889/v1": "llama"
+
+# -------------------------------------------------------------
+# Semantic LLM Cache (optional — disabled by default)
+# Caches LLM responses to cut costs and 
latency on repeated or +# semantically similar prompts. +# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions +# MOE requests (moe-* model prefix) always bypass the cache. +# ------------------------------------------------------------- +cache_enabled: true + +# Backend — where cached responses are stored: +# memory → in-process LRU (lost on restart, not shared across replicas) [default] +# sqlite → persistent file-based (single instance, survives restart) +# redis → distributed (shared across replicas, requires Redis) +cache_backend: memory + +# Cosine similarity threshold for a cache hit: +# 1.0 → exact match only (works on any image variant) +# <1.0 → semantic matching (requires the :semantic Docker image tag) +cache_similarity: 0.9 + +# Response TTL in seconds. Remove the key or set to null to cache forever. +cache_ttl: 3600 + +# SQLite backend: path to the cache database file +cache_db_path: llm_cache.db + +# Redis backend: connection URL +# cache_redis_url: redis://localhost:6379/0 + +# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding. +# 0.3 = 30% history context signal, 70% question signal. +# Only relevant when cache_similarity < 1.0. +# cache_history_weight: 0.3 \ No newline at end of file diff --git a/doc/configuration.md b/doc/configuration.md index afb7dda..af2dfae 100644 --- a/doc/configuration.md +++ b/doc/configuration.md @@ -204,6 +204,149 @@ max_concurrent_connections: 3 **Recommendation**: Use multiple endpoints for redundancy and load distribution. +## Semantic LLM Cache + +NOMYO Router can cache LLM responses and serve them directly — skipping endpoint selection, model load, and token generation entirely. + +### How it works + +1. On every cacheable request (`/api/chat`, `/api/generate`, `/v1/chat/completions`, `/v1/completions`) the cache is checked **before** choosing an endpoint. +2. 
On a **cache hit** the stored response is returned immediately as a single chunk (streaming or non-streaming — both work). +3. On a **cache miss** the request is forwarded normally. The response is stored in the cache after it completes. +4. **MOE requests** (`moe-*` model prefix) always bypass the cache. +5. **Token counts** are never recorded for cache hits. + +### Cache key strategy + +| Signal | How matched | +|---|---| +| `model + system_prompt` | Exact — hard context isolation per deployment | +| BM25-weighted embedding of chat history | Semantic — conversation context signal | +| Embedding of last user message | Semantic — the actual question | + +The two semantic vectors are combined as a weighted mean (tuned by `cache_history_weight`) before cosine similarity comparison, staying at a single 384-dimensional vector compatible with the library's storage format. + +### Quick start — exact match (lean image) + +```yaml +cache_enabled: true +cache_backend: sqlite # persists across restarts +cache_similarity: 1.0 # exact match only, no sentence-transformers needed +cache_ttl: 3600 +``` + +### Quick start — semantic matching (:semantic image) + +```yaml +cache_enabled: true +cache_backend: sqlite +cache_similarity: 0.90 # hit if ≥90% cosine similarity +cache_ttl: 3600 +cache_history_weight: 0.3 +``` + +Pull the semantic image: +```bash +docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic +``` + +### Cache configuration options + +#### `cache_enabled` + +**Type**: `bool` | **Default**: `false` + +Enable or disable the cache. All other cache settings are ignored when `false`. 
+ +#### `cache_backend` + +**Type**: `str` | **Default**: `"memory"` + +| Value | Description | Persists | Multi-replica | +|---|---|---|---| +| `memory` | In-process LRU dict | ❌ | ❌ | +| `sqlite` | File-based via `aiosqlite` | ✅ | ❌ | +| `redis` | Redis via `redis.asyncio` | ✅ | ✅ | + +Use `redis` when running multiple router replicas behind a load balancer — all replicas share one warm cache. + +#### `cache_similarity` + +**Type**: `float` | **Default**: `1.0` + +Cosine similarity threshold. `1.0` means exact match only (no embedding model needed). Values below `1.0` enable semantic matching, which requires the `:semantic` Docker image tag. + +Recommended starting value for semantic mode: `0.90`. + +#### `cache_ttl` + +**Type**: `int | null` | **Default**: `3600` + +Time-to-live for cache entries in seconds. Remove the key or set to `null` to cache forever. + +#### `cache_db_path` + +**Type**: `str` | **Default**: `"llm_cache.db"` + +Path to the SQLite cache database. Only used when `cache_backend: sqlite`. + +#### `cache_redis_url` + +**Type**: `str` | **Default**: `"redis://localhost:6379/0"` + +Redis connection URL. Only used when `cache_backend: redis`. + +#### `cache_history_weight` + +**Type**: `float` | **Default**: `0.3` + +Weight of the BM25-weighted chat-history embedding in the combined cache key vector. `0.3` means the history contributes 30% and the final user message contributes 70% of the similarity signal. Only used when `cache_similarity < 1.0`. 
+ +### Cache management endpoints + +| Endpoint | Method | Description | +|---|---|---| +| `/api/cache/stats` | `GET` | Hit/miss counters, hit rate, current config | +| `/api/cache/invalidate` | `POST` | Clear all cache entries and reset counters | + +```bash +# Check cache performance +curl http://localhost:12434/api/cache/stats + +# Clear the cache +curl -X POST http://localhost:12434/api/cache/invalidate +``` + +Example stats response: +```json +{ + "enabled": true, + "hits": 1547, + "misses": 892, + "hit_rate": 0.634, + "semantic": true, + "backend": "sqlite", + "similarity_threshold": 0.9, + "history_weight": 0.3 +} +``` + +### Docker image variants + +| Tag | Semantic cache | Image size | +|---|---|---| +| `latest` | ❌ exact match only | ~300 MB | +| `latest-semantic` | ✅ sentence-transformers + model pre-baked | ~800 MB | + +Build locally: +```bash +# Lean (exact match) +docker build -t nomyo-router . + +# Semantic (~500 MB larger, all-MiniLM-L6-v2 model baked in) +docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic . +``` + ## Configuration Validation The router validates the configuration at startup: diff --git a/doc/deployment.md b/doc/deployment.md index 0970a35..efff3ff 100644 --- a/doc/deployment.md +++ b/doc/deployment.md @@ -82,10 +82,23 @@ sudo systemctl status nomyo-router ## 2. Docker Deployment +### Image variants + +| Tag | Semantic cache | Image size | +|---|---|---| +| `latest` | ❌ exact match only | ~300 MB | +| `latest-semantic` | ✅ sentence-transformers + `all-MiniLM-L6-v2` pre-baked | ~800 MB | + +The `:semantic` variant enables `cache_similarity < 1.0` in `config.yaml`. The lean image falls back to exact-match caching with a warning if semantic mode is configured. + ### Build the Image ```bash +# Lean build (exact match cache, default) docker build -t nomyo-router . + +# Semantic build (~500 MB larger, all-MiniLM-L6-v2 model baked in at build time) +docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic . 
``` ### Run the Container diff --git a/doc/examples/docker-compose.yml b/doc/examples/docker-compose.yml index d940c73..ab34070 100644 --- a/doc/examples/docker-compose.yml +++ b/doc/examples/docker-compose.yml @@ -1,20 +1,30 @@ # Docker Compose example for NOMYO Router with multiple Ollama instances +# +# Two router profiles are provided: +# nomyo-router — lean image, exact-match cache only (~300 MB) +# nomyo-router-semantic — semantic image, sentence-transformers baked in (~800 MB) +# +# Uncomment the redis service and set cache_backend: redis in config.yaml +# to share the LLM response cache across multiple router replicas. version: '3.8' services: - # NOMYO Router + # NOMYO Router — lean image (exact-match cache, default) nomyo-router: image: nomyo-router:latest - build: . + build: + context: . + args: + SEMANTIC_CACHE: "false" ports: - "12434:12434" environment: - CONFIG_PATH=/app/config/config.yaml - - NOMYO_ROUTER_DB_PATH=/app/token_counts.db + - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db volumes: - ./config:/app/config - - router-db:/app/token_counts.db + - router-data:/app/data depends_on: - ollama1 - ollama2 @@ -23,6 +33,45 @@ services: networks: - nomyo-net + # NOMYO Router — semantic image (cache_similarity < 1.0 support, ~800 MB) + # Build: docker compose build nomyo-router-semantic + # Switch: comment out nomyo-router above, uncomment this block. + # nomyo-router-semantic: + # image: nomyo-router:semantic + # build: + # context: . 
+ # args: + # SEMANTIC_CACHE: "true" + # ports: + # - "12434:12434" + # environment: + # - CONFIG_PATH=/app/config/config.yaml + # - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db + # volumes: + # - ./config:/app/config + # - router-data:/app/data + # - hf-cache:/app/data/hf_cache # share HuggingFace model cache across builds + # depends_on: + # - ollama1 + # - ollama2 + # - ollama3 + # restart: unless-stopped + # networks: + # - nomyo-net + + # Optional: Redis for shared LLM response cache across multiple router replicas. + # Requires cache_backend: redis in config.yaml. + # redis: + # image: redis:7-alpine + # ports: + # - "6379:6379" + # volumes: + # - redis-data:/data + # command: redis-server --save 60 1 --loglevel warning + # restart: unless-stopped + # networks: + # - nomyo-net + # Ollama Instance 1 ollama1: image: ollama/ollama:latest @@ -87,7 +136,9 @@ services: - nomyo-net volumes: - router-db: + router-data: + # hf-cache: # uncomment when using nomyo-router-semantic + # redis-data: # uncomment when using Redis cache backend ollama1-data: ollama2-data: ollama3-data: diff --git a/doc/examples/sample-config.yaml b/doc/examples/sample-config.yaml index 72dd0ff..a1081af 100644 --- a/doc/examples/sample-config.yaml +++ b/doc/examples/sample-config.yaml @@ -29,4 +29,38 @@ api_keys: "http://192.168.0.52:11434": "ollama" "https://api.openai.com/v1": "${OPENAI_KEY}" "http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config - "http://192.168.0.33:8081/v1": "llama-server" \ No newline at end of file + "http://192.168.0.33:8081/v1": "llama-server" + +# ------------------------------------------------------------- +# Semantic LLM Cache (optional — disabled by default) +# Caches LLM responses to cut costs and latency on repeated or +# semantically similar prompts. +# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions +# MOE requests (moe-* model prefix) always bypass the cache. 
+# ------------------------------------------------------------- +# cache_enabled: false + +# Backend — where cached responses are stored: +# memory → in-process LRU (lost on restart, not shared across replicas) [default] +# sqlite → persistent file-based (single instance, survives restart) +# redis → distributed (shared across replicas, requires Redis) +# cache_backend: memory + +# Cosine similarity threshold for a cache hit: +# 1.0 → exact match only (works on any image variant) +# <1.0 → semantic matching (requires the :semantic Docker image tag) +# cache_similarity: 1.0 + +# Response TTL in seconds. Remove the key or set to null to cache forever. +# cache_ttl: 3600 + +# SQLite backend: path to the cache database file +# cache_db_path: llm_cache.db + +# Redis backend: connection URL +# cache_redis_url: redis://localhost:6379/0 + +# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding. +# 0.3 = 30% history context signal, 70% question signal. +# Only relevant when cache_similarity < 1.0. +# cache_history_weight: 0.3 \ No newline at end of file diff --git a/doc/monitoring.md b/doc/monitoring.md index 7f96def..b5bcbff 100644 --- a/doc/monitoring.md +++ b/doc/monitoring.md @@ -133,6 +133,39 @@ Response: } ``` +### Cache Statistics + +```bash +curl http://localhost:12434/api/cache/stats +``` + +Response when cache is enabled: +```json +{ + "enabled": true, + "hits": 1547, + "misses": 892, + "hit_rate": 0.634, + "semantic": true, + "backend": "sqlite", + "similarity_threshold": 0.9, + "history_weight": 0.3 +} +``` + +Response when cache is disabled: +```json +{ "enabled": false } +``` + +### Cache Invalidation + +```bash +curl -X POST http://localhost:12434/api/cache/invalidate +``` + +Clears all cached entries and resets hit/miss counters. 
+ ### Real-time Usage Stream ```bash diff --git a/requirements.txt b/requirements.txt index f1db419..a2739b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,3 +39,8 @@ uvicorn==0.38.0 uvloop yarl==1.20.1 aiosqlite +# Semantic LLM cache — base install (exact-match mode, no heavy ML deps) +# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch) +# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.: +# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0 +semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git diff --git a/router.py b/router.py index e6987e4..82bed12 100644 --- a/router.py +++ b/router.py @@ -123,6 +123,22 @@ class Config(BaseSettings): # Database configuration db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db")) + # Semantic LLM Cache configuration + cache_enabled: bool = Field(default=False) + # Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed) + cache_backend: str = Field(default="memory") + # Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image) + cache_similarity: float = Field(default=1.0) + # TTL in seconds; None = cache forever + cache_ttl: Optional[int] = Field(default=3600) + # SQLite backend: path to cache database file + cache_db_path: str = Field(default="llm_cache.db") + # Redis backend: connection URL + cache_redis_url: str = Field(default="redis://localhost:6379/0") + # Weight of BM25-weighted chat-history embedding vs last-user-message embedding + # 0.3 = 30% history context signal, 70% question signal + cache_history_weight: float = Field(default=0.3) + class Config: # Load from `config.yaml` first, then from env variables env_prefix = "NOMYO_ROUTER_" @@ -188,6 +204,7 @@ def _config_path_from_env() -> Path: from ollama._types import TokenLogprob, Logprob from db import TokenDatabase +from 
cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse # Create the global config object – it will be overwritten on startup @@ -1596,7 +1613,15 @@ async def proxy(request: Request): error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted." raise HTTPException(status_code=400, detail=error_msg) from e - + # Cache lookup — before endpoint selection so no slot is wasted on a hit + _cache = get_llm_cache() + if _cache is not None: + _cached = await _cache.get_generate(model, prompt, system or "") + if _cached is not None: + async def _serve_cached_generate(): + yield _cached + return StreamingResponse(_serve_cached_generate(), media_type="application/json") + endpoint, tracking_model = await choose_endpoint(model) use_openai = is_openai_compatible(endpoint) if use_openai: @@ -1633,6 +1658,7 @@ async def proxy(request: Request): else: async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive) if stream == True: + content_parts: list[str] = [] async for chunk in async_gen: if use_openai: chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts) @@ -1644,6 +1670,27 @@ async def proxy(request: Request): json_line = chunk.model_dump_json() else: json_line = orjson.dumps(chunk) + # Accumulate and store cache on done chunk — before yield so it always runs + if _cache is not None: + if getattr(chunk, "response", None): + content_parts.append(chunk.response) + if getattr(chunk, "done", False): + assembled = orjson.dumps({ + k: v for k, v in { + "model": getattr(chunk, "model", model), + "response": "".join(content_parts), + "done": True, + "done_reason": getattr(chunk, "done_reason", "stop") or "stop", + "prompt_eval_count": getattr(chunk, "prompt_eval_count", None), + "eval_count": getattr(chunk, "eval_count", None), + 
"total_duration": getattr(chunk, "total_duration", None), + "eval_duration": getattr(chunk, "eval_duration", None), + }.items() if v is not None + }) + b"\n" + try: + await _cache.set_generate(model, prompt, system or "", assembled) + except Exception as _ce: + print(f"[cache] set_generate (streaming) failed: {_ce}") yield json_line.encode("utf-8") + b"\n" else: if use_openai: @@ -1660,7 +1707,14 @@ async def proxy(request: Request): if hasattr(async_gen, "model_dump_json") else orjson.dumps(async_gen) ) - yield json_line.encode("utf-8") + b"\n" + cache_bytes = json_line.encode("utf-8") + b"\n" + yield cache_bytes + # Cache non-streaming response + if _cache is not None: + try: + await _cache.set_generate(model, prompt, system or "", cache_bytes) + except Exception as _ce: + print(f"[cache] set_generate (non-streaming) failed: {_ce}") finally: # Ensure counter is decremented even if an exception occurs @@ -1711,6 +1765,26 @@ async def chat_proxy(request: Request): except orjson.JSONDecodeError as e: raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e + # Cache lookup — before endpoint selection, always bypassed for MOE + _is_moe = model.startswith("moe-") + _cache = get_llm_cache() + # Normalise model name for cache key: strip ":latest" suffix here so that + # get_chat and set_chat use the same model string regardless of when the + # strip happens further down (line ~1793 strips it for OpenAI endpoints). + _cache_model = model[: -len(":latest")] if model.endswith(":latest") else model + # Snapshot original messages before any OpenAI-format transformation so that + # get_chat and set_chat always use the same key regardless of backend type. 
+ _cache_messages = messages + if _cache is not None and not _is_moe: + _cached = await _cache.get_chat("ollama_chat", _cache_model, messages) + if _cached is not None: + async def _serve_cached_chat(): + yield _cached + return StreamingResponse( + _serve_cached_chat(), + media_type="application/x-ndjson" if stream else "application/json", + ) + # 2. Endpoint logic if model.startswith("moe-"): model = model.split("moe-")[1] @@ -1764,6 +1838,7 @@ async def chat_proxy(request: Request): async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs) if stream == True: tc_acc = {} # accumulate OpenAI tool-call deltas across chunks + content_parts: list[str] = [] async for chunk in async_gen: if use_openai: _accumulate_openai_tc_delta(chunk, tc_acc) @@ -1780,6 +1855,30 @@ async def chat_proxy(request: Request): json_line = chunk.model_dump_json() else: json_line = orjson.dumps(chunk) + # Accumulate and store cache on done chunk — before yield so it always runs + # Works for both Ollama-native and OpenAI-compatible backends; chunks are + # already converted to Ollama format by rechunk before this point. 
+ if _cache is not None and not _is_moe: + if chunk.message and getattr(chunk.message, "content", None): + content_parts.append(chunk.message.content) + if getattr(chunk, "done", False): + assembled = orjson.dumps({ + k: v for k, v in { + "model": getattr(chunk, "model", model), + "created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)), + "message": {"role": "assistant", "content": "".join(content_parts)}, + "done": True, + "done_reason": getattr(chunk, "done_reason", "stop") or "stop", + "prompt_eval_count": getattr(chunk, "prompt_eval_count", None), + "eval_count": getattr(chunk, "eval_count", None), + "total_duration": getattr(chunk, "total_duration", None), + "eval_duration": getattr(chunk, "eval_duration", None), + }.items() if v is not None + }) + b"\n" + try: + await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled) + except Exception as _ce: + print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}") yield json_line.encode("utf-8") + b"\n" else: if use_openai: @@ -1796,7 +1895,14 @@ async def chat_proxy(request: Request): if hasattr(async_gen, "model_dump_json") else orjson.dumps(async_gen) ) - yield json_line.encode("utf-8") + b"\n" + cache_bytes = json_line.encode("utf-8") + b"\n" + yield cache_bytes + # Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends) + if _cache is not None and not _is_moe: + try: + await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes) + except Exception as _ce: + print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}") finally: # Ensure counter is decremented even if an exception occurs @@ -2680,6 +2786,21 @@ async def openai_chat_completions_proxy(request: Request): except orjson.JSONDecodeError as e: raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e + # Cache lookup — before endpoint selection + _cache = get_llm_cache() + if _cache is not None: + _cached 
= await _cache.get_chat("openai_chat", model, messages) + if _cached is not None: + if stream: + _sse = openai_nonstream_to_sse(_cached, model) + async def _serve_cached_ochat_stream(): + yield _sse + return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream") + else: + async def _serve_cached_ochat_json(): + yield _cached + return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json") + # 2. Endpoint logic endpoint, tracking_model = await choose_endpoint(model) base_url = ep2base(endpoint) @@ -2699,6 +2820,8 @@ async def openai_chat_completions_proxy(request: Request): else: raise if stream == True: + content_parts: list[str] = [] + usage_snapshot: dict = {} async for chunk in async_gen: data = ( chunk.model_dump_json() @@ -2715,6 +2838,8 @@ async def openai_chat_completions_proxy(request: Request): has_tool_calls = getattr(delta, "tool_calls", None) is not None if has_content or has_reasoning or has_tool_calls: yield f"data: {data}\n\n".encode("utf-8") + if has_content and delta.content: + content_parts.append(delta.content) elif chunk.usage is not None: # Forward the usage-only final chunk (e.g. 
from llama-server) yield f"data: {data}\n\n".encode("utf-8") @@ -2723,12 +2848,24 @@ async def openai_chat_completions_proxy(request: Request): if chunk.usage is not None: prompt_tok = chunk.usage.prompt_tokens or 0 comp_tok = chunk.usage.completion_tokens or 0 + usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok} else: llama_usage = rechunk.extract_usage_from_llama_timings(chunk) if llama_usage: prompt_tok, comp_tok = llama_usage if prompt_tok != 0 or comp_tok != 0: await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) + # Cache assembled streaming response — before [DONE] so it always runs + if _cache is not None and content_parts: + assembled = orjson.dumps({ + "model": model, + "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}], + **({"usage": usage_snapshot} if usage_snapshot else {}), + }) + b"\n" + try: + await _cache.set_chat("openai_chat", model, messages, assembled) + except Exception as _ce: + print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}") yield b"data: [DONE]\n\n" else: prompt_tok = 0 @@ -2747,7 +2884,14 @@ async def openai_chat_completions_proxy(request: Request): if hasattr(async_gen, "model_dump_json") else orjson.dumps(async_gen) ) - yield json_line.encode("utf-8") + b"\n" + cache_bytes = json_line.encode("utf-8") + b"\n" + yield cache_bytes + # Cache non-streaming response + if _cache is not None: + try: + await _cache.set_chat("openai_chat", model, messages, cache_bytes) + except Exception as _ce: + print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}") finally: # Ensure counter is decremented even if an exception occurs @@ -2823,6 +2967,22 @@ async def openai_completions_proxy(request: Request): except orjson.JSONDecodeError as e: raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e + # Cache lookup — completions prompt mapped to a single-turn 
messages list + _cache = get_llm_cache() + _compl_messages = [{"role": "user", "content": prompt}] + if _cache is not None: + _cached = await _cache.get_chat("openai_completions", model, _compl_messages) + if _cached is not None: + if stream: + _sse = openai_nonstream_to_sse(_cached, model) + async def _serve_cached_ocompl_stream(): + yield _sse + return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream") + else: + async def _serve_cached_ocompl_json(): + yield _cached + return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json") + # 2. Endpoint logic endpoint, tracking_model = await choose_endpoint(model) base_url = ep2base(endpoint) @@ -2834,6 +2994,8 @@ async def openai_completions_proxy(request: Request): # The chat method returns a generator of dicts (or GenerateResponse) async_gen = await oclient.completions.create(**params) if stream == True: + text_parts: list[str] = [] + usage_snapshot: dict = {} async for chunk in async_gen: data = ( chunk.model_dump_json() @@ -2849,6 +3011,8 @@ async def openai_completions_proxy(request: Request): ) if has_text or has_reasoning or choice.finish_reason is not None: yield f"data: {data}\n\n".encode("utf-8") + if has_text and choice.text: + text_parts.append(choice.text) elif chunk.usage is not None: # Forward the usage-only final chunk (e.g. 
from llama-server) yield f"data: {data}\n\n".encode("utf-8") @@ -2857,12 +3021,24 @@ async def openai_completions_proxy(request: Request): if chunk.usage is not None: prompt_tok = chunk.usage.prompt_tokens or 0 comp_tok = chunk.usage.completion_tokens or 0 + usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok} else: llama_usage = rechunk.extract_usage_from_llama_timings(chunk) if llama_usage: prompt_tok, comp_tok = llama_usage if prompt_tok != 0 or comp_tok != 0: await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) + # Cache assembled streaming response — before [DONE] so it always runs + if _cache is not None and text_parts: + assembled = orjson.dumps({ + "model": model, + "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}], + **({"usage": usage_snapshot} if usage_snapshot else {}), + }) + b"\n" + try: + await _cache.set_chat("openai_completions", model, _compl_messages, assembled) + except Exception as _ce: + print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}") # Final DONE event yield b"data: [DONE]\n\n" else: @@ -2882,7 +3058,14 @@ async def openai_completions_proxy(request: Request): if hasattr(async_gen, "model_dump_json") else orjson.dumps(async_gen) ) - yield json_line.encode("utf-8") + b"\n" + cache_bytes = json_line.encode("utf-8") + b"\n" + yield cache_bytes + # Cache non-streaming response + if _cache is not None: + try: + await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes) + except Exception as _ce: + print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}") finally: # Ensure counter is decremented even if an exception occurs @@ -3076,6 +3259,28 @@ async def rerank_proxy(request: Request): finally: await decrement_usage(endpoint, tracking_model) +# ------------------------------------------------------------- +# 25b. 
Cache management endpoints +# ------------------------------------------------------------- +@app.get("/api/cache/stats") +async def cache_stats(): + """Return hit/miss counters and configuration for the LLM response cache.""" + c = get_llm_cache() + if c is None: + return {"enabled": False} + return {"enabled": True, **c.stats()} + + +@app.post("/api/cache/invalidate") +async def cache_invalidate(): + """Clear all entries from the LLM response cache and reset counters.""" + c = get_llm_cache() + if c is None: + return {"enabled": False, "cleared": False} + await c.clear() + return {"enabled": True, "cleared": True} + + # ------------------------------------------------------------- # 26. Serve the static front‑end # ------------------------------------------------------------- @@ -3211,6 +3416,7 @@ async def startup_event() -> None: app_state["session"] = session token_worker_task = asyncio.create_task(token_worker()) flush_task = asyncio.create_task(flush_buffer()) + await init_llm_cache(config) @app.on_event("shutdown") async def shutdown_event() -> None: