feat: adding a semantic cache layer

2026-03-08 09:12:09 +01:00 · 2026-03-08 09:12:09 +01:00 · dd4b12da6a
commit dd4b12da6a
parent c3d47c7ffe
13 changed files with 1138 additions and 22 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,44 @@
+# Version control
+.git
+.gitignore
+.github
+
+# Environment & secrets
+.env
+.env.*
+*.env
+
+# Python artifacts
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+.venv
+venv
+*.egg-info
+dist
+build
+
+# Local databases (don't bake data into image)
+*.db
+*.db-shm
+*.db-wal
+
+# IDE / editor
+.vscode
+.idea
+*.swp
+*.swo
+
+# Documentation
+doc/
+docs/
+*.md
+
+# Tests
+tests/
+test_*
+
+# Local config overrides
+config.local.yaml
--- a/.github/workflows/docker-publish-semantic.yml
+++ b/.github/workflows/docker-publish-semantic.yml
@ -0,0 +1,71 @@
+name: Build and Publish Docker Image (Semantic Cache)
+
+# Builds the :semantic variant that includes sentence-transformers + CPU torch
+# and the pre-baked all-MiniLM-L6-v2 embedding model (~500 MB larger than lean).
+# Tags mirror the lean workflow but carry a -semantic suffix, e.g.:
+#   ghcr.io/nomyo-ai/nomyo-router:latest-semantic
+#   ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
+#   ghcr.io/nomyo-ai/nomyo-router:0.7-semantic
+
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - "v*.*.*"
+  workflow_dispatch:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build-and-push-semantic:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up QEMU (for multi-arch builds)
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract Docker metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            # Versioned semantic tags on git tags (e.g. v0.7.0 → 0.7.0-semantic, 0.7-semantic)
+            type=semver,pattern={{version}}-semantic
+            type=semver,pattern={{major}}.{{minor}}-semantic
+            # latest-semantic only on main branch pushes
+            type=raw,value=latest-semantic,enable=${{ github.ref == 'refs/heads/main' }}
+            # SHA-tagged for traceability
+            type=sha,prefix=sha-,suffix=-semantic
+
+      - name: Build and push semantic Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          platforms: linux/amd64,linux/arm64
+          push: true
+          build-args: |
+            SEMANTIC_CACHE=true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
--- a/26
+++ b/26
@ -3,21 +3,43 @@ FROM python:3.13-slim
 ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

+# SEMANTIC_CACHE=true installs sentence-transformers + CPU-only torch and pre-bakes
+# the all-MiniLM-L6-v2 embedding model (~500 MB extra).  The resulting image is tagged
+# :semantic.  The default (lean) image supports exact-match caching only.
+ARG SEMANTIC_CACHE=false
+
+# Pin HuggingFace cache to a predictable path inside /app/data so it can be
+# mounted as a volume and shared between builds.
+ENV HF_HOME=/app/data/hf_cache
+
 # Install SQLite
-RUN apt-get update && apt-get install -y sqlite3
+RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 \
+    && rm -rf /var/lib/apt/lists/*

 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

+# Semantic cache deps — only installed when SEMANTIC_CACHE=true
+# CPU-only torch must be installed before sentence-transformers to avoid
+# pulling the full CUDA-enabled build (~2.5 GB).
+RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
+      pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
+      pip install --no-cache-dir sentence-transformers && \
+      python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
+    fi
+
 # Create database directory and set permissions
 RUN mkdir -p /app/data && chown -R www-data:www-data /app/data

 COPY . .

-RUN chmod +x /app/entrypoint.sh
+RUN chmod +x /app/entrypoint.sh && \
+    chown -R www-data:www-data /app

 EXPOSE 12434

+USER www-data
+
 ENTRYPOINT ["/app/entrypoint.sh"]
--- a/README.md
+++ b/README.md
@ -74,22 +74,28 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop

 ### Pre-built image (GitHub Container Registry)

-Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release:
+Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release.

+**Lean image** (exact-match cache, ~300 MB):
 ```sh
 docker pull ghcr.io/nomyo-ai/nomyo-router:latest
-```
-
-Specific version:
-
-```sh
 docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0
 ```

-### Build the container image locally:
+**Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):
+```sh
+docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
+docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
+```
+
+### Build the container image locally

 ```sh
+# Lean build (exact match cache, default)
 docker build -t nomyo-router .
+
+# Semantic build — sentence-transformers + model baked in
+docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
 ```

 Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container:
@ -124,6 +130,53 @@ This way the Ollama backend servers are utilized more efficient than by simply u

 NOMYO Router also supports OpenAI API compatible v1 backend servers.

+## Semantic LLM Cache
+
+NOMYO Router includes an optional semantic cache that serves repeated or semantically similar LLM requests from cache — no endpoint round-trip, no token cost, response in <10 ms.
+
+### Enable (exact match, any image)
+
+```yaml
+# config.yaml
+cache_enabled: true
+cache_backend: sqlite    # persists across restarts
+cache_similarity: 1.0   # exact match only
+cache_ttl: 3600
+```
+
+### Enable (semantic matching, :semantic image)
+
+```yaml
+cache_enabled: true
+cache_backend: sqlite
+cache_similarity: 0.90   # "What is Python?" ≈ "What's Python?" → cache hit
+cache_ttl: 3600
+cache_history_weight: 0.3
+```
+
+Pull the semantic image:
+```bash
+docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
+```
+
+### Cache key strategy
+
+Each request is keyed on `model + system_prompt` (exact) combined with a weighted-mean embedding of BM25-weighted chat history (30%) and the last user message (70%). This means:
+- Different system prompts → always separate cache namespaces (no cross-tenant leakage)
+- Same question, different phrasing → cache hit (semantic mode)
+- MOE requests (`moe-*`) → always bypass the cache
+
+### Cached routes
+
+`/api/chat` · `/api/generate` · `/v1/chat/completions` · `/v1/completions`
+
+### Cache management
+
+```bash
+curl http://localhost:12434/api/cache/stats        # hit rate, counters, config
+curl -X POST http://localhost:12434/api/cache/invalidate  # clear all entries
+```
+
 ## Supplying the router API key

 If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:
--- a/cache.py
+++ b/cache.py
@ -0,0 +1,407 @@
+"""
+LLM Semantic Cache for NOMYO Router.
+
+Strategy:
+- Namespace: sha256(route :: model :: system_prompt)[:16]  — exact context isolation
+- Cache key:  hash(normalize(last_user_message), namespace) — exact lookup
+- Embedding:  weighted mean of
+                α  * embed(bm25_weighted(chat_history))   — conversation context
+                1-α * embed(last_user_message)             — the actual question
+  with α = cache_history_weight (default 0.3).
+- Exact-match caching (similarity=1.0) uses DummyEmbeddingProvider — zero extra deps.
+- Semantic caching (similarity<1.0) requires sentence-transformers.  If missing the
+  library falls back to exact-match with a warning (lean Docker image behaviour).
+- MOE models (moe-*) always bypass the cache.
+- Token counts are never recorded for cache hits.
+- Streaming cache hits are served as a single-chunk response.
+"""
+
+import hashlib
+import math
+import time
+import warnings
+from collections import Counter
+from typing import Any, Optional
+
+# Lazily resolved once at first embed() call
+_semantic_available: Optional[bool] = None
+
+
+def _check_sentence_transformers() -> bool:
+    global _semantic_available
+    if _semantic_available is None:
+        try:
+            import sentence_transformers  # noqa: F401
+            _semantic_available = True
+        except ImportError:
+            _semantic_available = False
+    return _semantic_available  # type: ignore[return-value]
+
+
+# ---------------------------------------------------------------------------
+# BM25-weighted text representation of chat history
+# ---------------------------------------------------------------------------
+
+def _bm25_weighted_text(history: list[dict]) -> str:
+    """
+    Produce a BM25-importance-weighted text string from chat history turns.
+
+    High-IDF (rare, domain-specific) terms are repeated proportionally to
+    their BM25 score so the downstream sentence-transformer embedding
+    naturally upweights topical signal and downweights stop words.
+    """
+    docs = [m.get("content", "") for m in history if m.get("content")]
+    if not docs:
+        return ""
+
+    def _tok(text: str) -> list[str]:
+        return [w.lower() for w in text.split() if len(w) > 2]
+
+    tokenized = [_tok(d) for d in docs]
+    N = len(tokenized)
+
+    df: Counter = Counter()
+    for tokens in tokenized:
+        for term in set(tokens):
+            df[term] += 1
+
+    k1, b = 1.5, 0.75
+    avg_dl = sum(len(t) for t in tokenized) / max(N, 1)
+
+    term_scores: Counter = Counter()
+    for tokens in tokenized:
+        tf_c = Counter(tokens)
+        dl = len(tokens)
+        for term, tf in tf_c.items():
+            idf = math.log((N + 1) / (df[term] + 1)) + 1.0
+            score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1)))
+            term_scores[term] += score
+
+    top = term_scores.most_common(50)
+    if not top:
+        return " ".join(docs)
+
+    max_s = top[0][1]
+    out: list[str] = []
+    for term, score in top:
+        out.extend([term] * max(1, round(3 * score / max_s)))
+    return " ".join(out)
+
+
+# ---------------------------------------------------------------------------
+# LLMCache
+# ---------------------------------------------------------------------------
+
+class LLMCache:
+    """
+    Thin async wrapper around async-semantic-llm-cache that adds:
+    - Route-aware namespace isolation
+    - Two-vector weighted-mean embedding (history context + question)
+    - Per-instance hit/miss counters
+    - Graceful fallback when sentence-transformers is absent
+    """
+
+    def __init__(self, cfg: Any) -> None:
+        self._cfg = cfg
+        self._backend: Any = None
+        self._emb_cache: Any = None
+        self._semantic: bool = False
+        self._hits: int = 0
+        self._misses: int = 0
+
+    async def init(self) -> None:
+        from semantic_llm_cache.similarity import EmbeddingCache
+
+        # --- Backend ---
+        backend_type: str = self._cfg.cache_backend
+        if backend_type == "sqlite":
+            from semantic_llm_cache.backends.sqlite import SQLiteBackend
+            self._backend = SQLiteBackend(db_path=self._cfg.cache_db_path)
+        elif backend_type == "redis":
+            from semantic_llm_cache.backends.redis import RedisBackend
+            self._backend = RedisBackend(url=self._cfg.cache_redis_url)
+            await self._backend.ping()
+        else:
+            from semantic_llm_cache.backends.memory import MemoryBackend
+            self._backend = MemoryBackend()
+
+        # --- Embedding provider ---
+        if self._cfg.cache_similarity < 1.0:
+            if _check_sentence_transformers():
+                from semantic_llm_cache.similarity import create_embedding_provider
+                provider = create_embedding_provider("sentence-transformer")
+                self._emb_cache = EmbeddingCache(provider=provider)
+                self._semantic = True
+                print(
+                    f"[cache] Semantic cache ready "
+                    f"(similarity≥{self._cfg.cache_similarity}, backend={backend_type})"
+                )
+            else:
+                warnings.warn(
+                    "[cache] sentence-transformers is not installed. "
+                    "Falling back to exact-match caching (similarity=1.0). "
+                    "Use the :semantic Docker image tag to enable semantic caching.",
+                    RuntimeWarning,
+                    stacklevel=2,
+                )
+                self._emb_cache = EmbeddingCache()   # DummyEmbeddingProvider
+                print(f"[cache] Exact-match cache ready (backend={backend_type}) [semantic unavailable]")
+        else:
+            self._emb_cache = EmbeddingCache()       # DummyEmbeddingProvider
+            print(f"[cache] Exact-match cache ready (backend={backend_type})")
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _namespace(self, route: str, model: str, system: str) -> str:
+        raw = f"{route}::{model}::{system}"
+        return hashlib.sha256(raw.encode()).hexdigest()[:16]
+
+    def _cache_key(self, namespace: str, last_user: str) -> str:
+        from semantic_llm_cache.utils import hash_prompt, normalize_prompt
+        return hash_prompt(normalize_prompt(last_user), namespace)
+
+    def _parse_messages(
+        self, messages: list[dict]
+    ) -> tuple[str, list[dict], str]:
+        """
+        Returns (system_prompt, prior_history_turns, last_user_message).
+        Multimodal content lists are reduced to their text parts.
+        """
+        system = ""
+        turns: list[dict] = []
+
+        for m in messages:
+            role = m.get("role", "")
+            content = m.get("content", "")
+            if isinstance(content, list):
+                content = " ".join(
+                    p.get("text", "")
+                    for p in content
+                    if isinstance(p, dict) and p.get("type") == "text"
+                )
+            if role == "system":
+                system = content
+            else:
+                turns.append({"role": role, "content": content})
+
+        last_user = ""
+        for m in reversed(turns):
+            if m["role"] == "user":
+                last_user = m["content"]
+                break
+
+        # History = all turns before the final user message
+        history = turns[:-1] if turns and turns[-1]["role"] == "user" else turns
+        return system, history, last_user
+
+    async def _build_embedding(
+        self, history: list[dict], last_user: str
+    ) -> list[float] | None:
+        """
+        Weighted mean of BM25-weighted history embedding and last-user embedding.
+        Returns None when not in semantic mode.
+        """
+        if not self._semantic:
+            return None
+
+        import numpy as np
+
+        alpha: float = self._cfg.cache_history_weight   # weight for history signal
+        q_vec = np.array(await self._emb_cache.aencode(last_user), dtype=float)
+
+        if not history:
+            # No history → use question embedding alone (alpha has no effect)
+            return q_vec.tolist()
+
+        h_text = _bm25_weighted_text(history)
+        h_vec = np.array(await self._emb_cache.aencode(h_text), dtype=float)
+
+        combined = alpha * h_vec + (1.0 - alpha) * q_vec
+        norm = float(np.linalg.norm(combined))
+        if norm > 0.0:
+            combined /= norm
+        return combined.tolist()
+
+    # ------------------------------------------------------------------
+    # Public interface: chat (handles both Ollama and OpenAI message lists)
+    # ------------------------------------------------------------------
+
+    async def get_chat(
+        self, route: str, model: str, messages: list[dict]
+    ) -> bytes | None:
+        """Return cached response bytes, or None on miss."""
+        if not self._backend:
+            return None
+
+        system, history, last_user = self._parse_messages(messages)
+        if not last_user:
+            return None
+
+        ns = self._namespace(route, model, system)
+        key = self._cache_key(ns, last_user)
+
+        # 1. Exact key match
+        entry = await self._backend.get(key)
+        if entry is not None:
+            self._hits += 1
+            return entry.response  # type: ignore[return-value]
+
+        # 2. Semantic similarity match
+        if self._semantic and self._cfg.cache_similarity < 1.0:
+            emb = await self._build_embedding(history, last_user)
+            result = await self._backend.find_similar(
+                emb, threshold=self._cfg.cache_similarity, namespace=ns
+            )
+            if result is not None:
+                _, matched, _ = result
+                self._hits += 1
+                return matched.response  # type: ignore[return-value]
+
+        self._misses += 1
+        return None
+
+    async def set_chat(
+        self, route: str, model: str, messages: list[dict], response_bytes: bytes
+    ) -> None:
+        """Store a response in the cache (fire-and-forget friendly)."""
+        if not self._backend:
+            return
+
+        system, history, last_user = self._parse_messages(messages)
+        if not last_user:
+            return
+
+        ns = self._namespace(route, model, system)
+        key = self._cache_key(ns, last_user)
+
+        emb = (
+            await self._build_embedding(history, last_user)
+            if self._semantic and self._cfg.cache_similarity < 1.0
+            else None
+        )
+
+        from semantic_llm_cache.config import CacheEntry
+
+        await self._backend.set(
+            key,
+            CacheEntry(
+                prompt=last_user,
+                response=response_bytes,
+                embedding=emb,
+                created_at=time.time(),
+                ttl=self._cfg.cache_ttl,
+                namespace=ns,
+                hit_count=0,
+            ),
+        )
+
+    # ------------------------------------------------------------------
+    # Convenience wrappers for the generate route (prompt string, not messages)
+    # ------------------------------------------------------------------
+
+    async def get_generate(
+        self, model: str, prompt: str, system: str = ""
+    ) -> bytes | None:
+        messages: list[dict] = []
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.append({"role": "user", "content": prompt})
+        return await self.get_chat("generate", model, messages)
+
+    async def set_generate(
+        self, model: str, prompt: str, system: str, response_bytes: bytes
+    ) -> None:
+        messages: list[dict] = []
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.append({"role": "user", "content": prompt})
+        await self.set_chat("generate", model, messages, response_bytes)
+
+    # ------------------------------------------------------------------
+    # Management
+    # ------------------------------------------------------------------
+
+    def stats(self) -> dict:
+        total = self._hits + self._misses
+        return {
+            "hits": self._hits,
+            "misses": self._misses,
+            "hit_rate": round(self._hits / total, 3) if total else 0.0,
+            "semantic": self._semantic,
+            "backend": self._cfg.cache_backend,
+            "similarity_threshold": self._cfg.cache_similarity,
+            "history_weight": self._cfg.cache_history_weight,
+        }
+
+    async def clear(self) -> None:
+        if self._backend:
+            await self._backend.clear()
+        self._hits = 0
+        self._misses = 0
+
+
+# ---------------------------------------------------------------------------
+# Module-level singleton
+# ---------------------------------------------------------------------------
+
+_cache: LLMCache | None = None
+
+
+async def init_llm_cache(cfg: Any) -> LLMCache | None:
+    """Initialise the module-level cache singleton. Returns None if disabled."""
+    global _cache
+    if not cfg.cache_enabled:
+        print("[cache] Cache disabled (cache_enabled=false).")
+        return None
+    _cache = LLMCache(cfg)
+    await _cache.init()
+    return _cache
+
+
+def get_llm_cache() -> LLMCache | None:
+    return _cache
+
+
+# ---------------------------------------------------------------------------
+# Helper: convert a stored Ollama-format non-streaming response to an
+# OpenAI SSE single-chunk stream (used when a streaming OpenAI request
+# hits the cache whose entry was populated from a non-streaming response).
+# ---------------------------------------------------------------------------
+
+def openai_nonstream_to_sse(cached_bytes: bytes, model: str) -> bytes:
+    """
+    Wrap a stored OpenAI ChatCompletion JSON as a minimal single-chunk SSE stream.
+    The stored entry always uses the non-streaming ChatCompletion format so that
+    non-streaming cache hits can be served directly; this function adapts it for
+    streaming clients.
+    """
+    import orjson, time as _time
+
+    try:
+        d = orjson.loads(cached_bytes)
+        content = (d.get("choices") or [{}])[0].get("message", {}).get("content", "")
+        chunk = {
+            "id": d.get("id", "cache-hit"),
+            "object": "chat.completion.chunk",
+            "created": d.get("created", int(_time.time())),
+            "model": d.get("model", model),
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"role": "assistant", "content": content},
+                    "finish_reason": "stop",
+                }
+            ],
+        }
+        if d.get("usage"):
+            chunk["usage"] = d["usage"]
+        return f"data: {orjson.dumps(chunk).decode()}\n\ndata: [DONE]\n\n".encode()
+    except Exception as exc:
+        warnings.warn(
+            f"[cache] openai_nonstream_to_sse: corrupt cache entry, returning empty stream: {exc}",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        return b"data: [DONE]\n\n"
--- a/config.yaml
+++ b/config.yaml
@ -6,7 +6,7 @@ endpoints:
  - https://api.openai.com/v1

 llama_server_endpoints:
-  - http://192.168.0.33:8889/v1
+  - http://192.168.0.50:8889/v1

 # Maximum concurrent connections *per endpoint‑model pair* (equals to OLLAMA_NUM_PARALLEL)
 max_concurrent_connections: 2
@ -22,4 +22,38 @@ api_keys:
  "http://192.168.0.51:11434": "ollama"
  "http://192.168.0.52:11434": "ollama"
  "https://api.openai.com/v1": "${OPENAI_KEY}"
-  "http://192.168.0.33:8889/v1": "llama"
+  "http://192.168.0.50:8889/v1": "llama"
+
+# -------------------------------------------------------------
+# Semantic LLM Cache (optional — disabled by default)
+# Caches LLM responses to cut costs and latency on repeated or
+# semantically similar prompts.
+# Cached routes: /api/chat  /api/generate  /v1/chat/completions  /v1/completions
+# MOE requests (moe-* model prefix) always bypass the cache.
+# -------------------------------------------------------------
+cache_enabled: true
+
+# Backend — where cached responses are stored:
+#   memory  → in-process LRU (lost on restart, not shared across replicas) [default]
+#   sqlite  → persistent file-based   (single instance, survives restart)
+#   redis   → distributed             (shared across replicas, requires Redis)
+cache_backend: memory
+
+# Cosine similarity threshold for a cache hit:
+#   1.0  → exact match only  (works on any image variant)
+#   <1.0 → semantic matching (requires the :semantic Docker image tag)
+cache_similarity: 0.9
+
+# Response TTL in seconds. Remove the key or set to null to cache forever.
+cache_ttl: 3600
+
+# SQLite backend: path to the cache database file
+cache_db_path: llm_cache.db
+
+# Redis backend: connection URL
+# cache_redis_url: redis://localhost:6379/0
+
+# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
+# 0.3 = 30% history context signal, 70% question signal.
+# Only relevant when cache_similarity < 1.0.
+# cache_history_weight: 0.3
--- a/doc/configuration.md
+++ b/doc/configuration.md
@ -204,6 +204,149 @@ max_concurrent_connections: 3

 **Recommendation**: Use multiple endpoints for redundancy and load distribution.

+## Semantic LLM Cache
+
+NOMYO Router can cache LLM responses and serve them directly — skipping endpoint selection, model load, and token generation entirely.
+
+### How it works
+
+1. On every cacheable request (`/api/chat`, `/api/generate`, `/v1/chat/completions`, `/v1/completions`) the cache is checked **before** choosing an endpoint.
+2. On a **cache hit** the stored response is returned immediately as a single chunk (streaming or non-streaming — both work).
+3. On a **cache miss** the request is forwarded normally. The response is stored in the cache after it completes.
+4. **MOE requests** (`moe-*` model prefix) always bypass the cache.
+5. **Token counts** are never recorded for cache hits.
+
+### Cache key strategy
+
+| Signal | How matched |
+|---|---|
+| `model + system_prompt` | Exact — hard context isolation per deployment |
+| BM25-weighted embedding of chat history | Semantic — conversation context signal |
+| Embedding of last user message | Semantic — the actual question |
+
+The two semantic vectors are combined as a weighted mean (tuned by `cache_history_weight`) before cosine similarity comparison, staying at a single 384-dimensional vector compatible with the library's storage format.
+
+### Quick start — exact match (lean image)
+
+```yaml
+cache_enabled: true
+cache_backend: sqlite    # persists across restarts
+cache_similarity: 1.0   # exact match only, no sentence-transformers needed
+cache_ttl: 3600
+```
+
+### Quick start — semantic matching (:semantic image)
+
+```yaml
+cache_enabled: true
+cache_backend: sqlite
+cache_similarity: 0.90   # hit if ≥90% cosine similarity
+cache_ttl: 3600
+cache_history_weight: 0.3
+```
+
+Pull the semantic image:
+```bash
+docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
+```
+
+### Cache configuration options
+
+#### `cache_enabled`
+
+**Type**: `bool` | **Default**: `false`
+
+Enable or disable the cache. All other cache settings are ignored when `false`.
+
+#### `cache_backend`
+
+**Type**: `str` | **Default**: `"memory"`
+
+| Value | Description | Persists | Multi-replica |
+|---|---|---|---|
+| `memory` | In-process LRU dict | ❌ | ❌ |
+| `sqlite` | File-based via `aiosqlite` | ✅ | ❌ |
+| `redis` | Redis via `redis.asyncio` | ✅ | ✅ |
+
+Use `redis` when running multiple router replicas behind a load balancer — all replicas share one warm cache.
+
+#### `cache_similarity`
+
+**Type**: `float` | **Default**: `1.0`
+
+Cosine similarity threshold. `1.0` means exact match only (no embedding model needed). Values below `1.0` enable semantic matching, which requires the `:semantic` Docker image tag.
+
+Recommended starting value for semantic mode: `0.90`.
+
+#### `cache_ttl`
+
+**Type**: `int | null` | **Default**: `3600`
+
+Time-to-live for cache entries in seconds. Remove the key or set to `null` to cache forever.
+
+#### `cache_db_path`
+
+**Type**: `str` | **Default**: `"llm_cache.db"`
+
+Path to the SQLite cache database. Only used when `cache_backend: sqlite`.
+
+#### `cache_redis_url`
+
+**Type**: `str` | **Default**: `"redis://localhost:6379/0"`
+
+Redis connection URL. Only used when `cache_backend: redis`.
+
+#### `cache_history_weight`
+
+**Type**: `float` | **Default**: `0.3`
+
+Weight of the BM25-weighted chat-history embedding in the combined cache key vector. `0.3` means the history contributes 30% and the final user message contributes 70% of the similarity signal. Only used when `cache_similarity < 1.0`.
+
+### Cache management endpoints
+
+| Endpoint | Method | Description |
+|---|---|---|
+| `/api/cache/stats` | `GET` | Hit/miss counters, hit rate, current config |
+| `/api/cache/invalidate` | `POST` | Clear all cache entries and reset counters |
+
+```bash
+# Check cache performance
+curl http://localhost:12434/api/cache/stats
+
+# Clear the cache
+curl -X POST http://localhost:12434/api/cache/invalidate
+```
+
+Example stats response:
+```json
+{
+  "enabled": true,
+  "hits": 1547,
+  "misses": 892,
+  "hit_rate": 0.634,
+  "semantic": true,
+  "backend": "sqlite",
+  "similarity_threshold": 0.9,
+  "history_weight": 0.3
+}
+```
+
+### Docker image variants
+
+| Tag | Semantic cache | Image size |
+|---|---|---|
+| `latest` | ❌ exact match only | ~300 MB |
+| `latest-semantic` | ✅ sentence-transformers + model pre-baked | ~800 MB |
+
+Build locally:
+```bash
+# Lean (exact match)
+docker build -t nomyo-router .
+
+# Semantic (~500 MB larger, all-MiniLM-L6-v2 model baked in)
+docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
+```
+
 ## Configuration Validation

 The router validates the configuration at startup:
--- a/doc/deployment.md
+++ b/doc/deployment.md
@ -82,10 +82,23 @@ sudo systemctl status nomyo-router

 ## 2. Docker Deployment

+### Image variants
+
+| Tag | Semantic cache | Image size |
+|---|---|---|
+| `latest` | ❌ exact match only | ~300 MB |
+| `latest-semantic` | ✅ sentence-transformers + `all-MiniLM-L6-v2` pre-baked | ~800 MB |
+
+The `:semantic` variant enables `cache_similarity < 1.0` in `config.yaml`. The lean image falls back to exact-match caching with a warning if semantic mode is configured.
+
 ### Build the Image

 ```bash
+# Lean build (exact match cache, default)
 docker build -t nomyo-router .
+
+# Semantic build (~500 MB larger, all-MiniLM-L6-v2 model baked in at build time)
+docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
 ```

 ### Run the Container
--- a/doc/examples/docker-compose.yml
+++ b/doc/examples/docker-compose.yml
@ -1,20 +1,30 @@
 # Docker Compose example for NOMYO Router with multiple Ollama instances
+#
+# Two router profiles are provided:
+#   nomyo-router          — lean image, exact-match cache only (~300 MB)
+#   nomyo-router-semantic — semantic image, sentence-transformers baked in (~800 MB)
+#
+# Uncomment the redis service and set cache_backend: redis in config.yaml
+# to share the LLM response cache across multiple router replicas.

 version: '3.8'

 services:
-  # NOMYO Router
+  # NOMYO Router — lean image (exact-match cache, default)
  nomyo-router:
    image: nomyo-router:latest
-    build: .
+    build:
+      context: .
+      args:
+        SEMANTIC_CACHE: "false"
    ports:
      - "12434:12434"
    environment:
      - CONFIG_PATH=/app/config/config.yaml
-      - NOMYO_ROUTER_DB_PATH=/app/token_counts.db
+      - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
    volumes:
      - ./config:/app/config
-      - router-db:/app/token_counts.db
+      - router-data:/app/data
    depends_on:
      - ollama1
      - ollama2
@ -23,6 +33,45 @@ services:
    networks:
      - nomyo-net

+  # NOMYO Router — semantic image (cache_similarity < 1.0 support, ~800 MB)
+  # Build:  docker compose build nomyo-router-semantic
+  # Switch: comment out nomyo-router above, uncomment this block.
+  # nomyo-router-semantic:
+  #   image: nomyo-router:semantic
+  #   build:
+  #     context: .
+  #     args:
+  #       SEMANTIC_CACHE: "true"
+  #   ports:
+  #     - "12434:12434"
+  #   environment:
+  #     - CONFIG_PATH=/app/config/config.yaml
+  #     - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
+  #   volumes:
+  #     - ./config:/app/config
+  #     - router-data:/app/data
+  #     - hf-cache:/app/data/hf_cache   # share HuggingFace model cache across builds
+  #   depends_on:
+  #     - ollama1
+  #     - ollama2
+  #     - ollama3
+  #   restart: unless-stopped
+  #   networks:
+  #     - nomyo-net
+
+  # Optional: Redis for shared LLM response cache across multiple router replicas.
+  # Requires cache_backend: redis in config.yaml.
+  # redis:
+  #   image: redis:7-alpine
+  #   ports:
+  #     - "6379:6379"
+  #   volumes:
+  #     - redis-data:/data
+  #   command: redis-server --save 60 1 --loglevel warning
+  #   restart: unless-stopped
+  #   networks:
+  #     - nomyo-net
+
  # Ollama Instance 1
  ollama1:
    image: ollama/ollama:latest
@ -87,7 +136,9 @@ services:
      - nomyo-net

 volumes:
-  router-db:
+  router-data:
+  # hf-cache:     # uncomment when using nomyo-router-semantic
+  # redis-data:   # uncomment when using Redis cache backend
  ollama1-data:
  ollama2-data:
  ollama3-data:
--- a/doc/examples/sample-config.yaml
+++ b/doc/examples/sample-config.yaml
@ -30,3 +30,37 @@ api_keys:
  "https://api.openai.com/v1": "${OPENAI_KEY}"
  "http://localhost:8080/v1": "llama-server"  # Optional API key for llama-server - depends on llama_server config
  "http://192.168.0.33:8081/v1": "llama-server"
+
+# -------------------------------------------------------------
+# Semantic LLM Cache (optional — disabled by default)
+# Caches LLM responses to cut costs and latency on repeated or
+# semantically similar prompts.
+# Cached routes: /api/chat  /api/generate  /v1/chat/completions  /v1/completions
+# MOE requests (moe-* model prefix) always bypass the cache.
+# -------------------------------------------------------------
+# cache_enabled: false
+
+# Backend — where cached responses are stored:
+#   memory  → in-process LRU (lost on restart, not shared across replicas) [default]
+#   sqlite  → persistent file-based   (single instance, survives restart)
+#   redis   → distributed             (shared across replicas, requires Redis)
+# cache_backend: memory
+
+# Cosine similarity threshold for a cache hit:
+#   1.0  → exact match only  (works on any image variant)
+#   <1.0 → semantic matching (requires the :semantic Docker image tag)
+# cache_similarity: 1.0
+
+# Response TTL in seconds. Remove the key or set to null to cache forever.
+# cache_ttl: 3600
+
+# SQLite backend: path to the cache database file
+# cache_db_path: llm_cache.db
+
+# Redis backend: connection URL
+# cache_redis_url: redis://localhost:6379/0
+
+# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
+# 0.3 = 30% history context signal, 70% question signal.
+# Only relevant when cache_similarity < 1.0.
+# cache_history_weight: 0.3
--- a/doc/monitoring.md
+++ b/doc/monitoring.md
@ -133,6 +133,39 @@ Response:
 }
 ```

+### Cache Statistics
+
+```bash
+curl http://localhost:12434/api/cache/stats
+```
+
+Response when cache is enabled:
+```json
+{
+  "enabled": true,
+  "hits": 1547,
+  "misses": 892,
+  "hit_rate": 0.634,
+  "semantic": true,
+  "backend": "sqlite",
+  "similarity_threshold": 0.9,
+  "history_weight": 0.3
+}
+```
+
+Response when cache is disabled:
+```json
+{ "enabled": false }
+```
+
+### Cache Invalidation
+
+```bash
+curl -X POST http://localhost:12434/api/cache/invalidate
+```
+
+Clears all cached entries and resets hit/miss counters.
+
 ### Real-time Usage Stream

 ```bash
--- a/requirements.txt
+++ b/requirements.txt
@ -39,3 +39,8 @@ uvicorn==0.38.0
 uvloop
 yarl==1.20.1
 aiosqlite
+# Semantic LLM cache — base install (exact-match mode, no heavy ML deps)
+# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
+# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.:
+#   semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0
+semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git
--- a/router.py
+++ b/router.py
@ -123,6 +123,22 @@ class Config(BaseSettings):
    # Database configuration
    db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))

+    # Semantic LLM Cache configuration
+    cache_enabled: bool = Field(default=False)
+    # Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed)
+    cache_backend: str = Field(default="memory")
+    # Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image)
+    cache_similarity: float = Field(default=1.0)
+    # TTL in seconds; None = cache forever
+    cache_ttl: Optional[int] = Field(default=3600)
+    # SQLite backend: path to cache database file
+    cache_db_path: str = Field(default="llm_cache.db")
+    # Redis backend: connection URL
+    cache_redis_url: str = Field(default="redis://localhost:6379/0")
+    # Weight of BM25-weighted chat-history embedding vs last-user-message embedding
+    # 0.3 = 30% history context signal, 70% question signal
+    cache_history_weight: float = Field(default=0.3)
+
    class Config:
        # Load from `config.yaml` first, then from env variables
        env_prefix = "NOMYO_ROUTER_"
@ -188,6 +204,7 @@ def _config_path_from_env() -> Path:

 from ollama._types import TokenLogprob, Logprob
 from db import TokenDatabase
+from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse


 # Create the global config object – it will be overwritten on startup
@ -1596,6 +1613,14 @@ async def proxy(request: Request):
        error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
        raise HTTPException(status_code=400, detail=error_msg) from e

+    # Cache lookup — before endpoint selection so no slot is wasted on a hit
+    _cache = get_llm_cache()
+    if _cache is not None:
+        _cached = await _cache.get_generate(model, prompt, system or "")
+        if _cached is not None:
+            async def _serve_cached_generate():
+                yield _cached
+            return StreamingResponse(_serve_cached_generate(), media_type="application/json")

    endpoint, tracking_model = await choose_endpoint(model)
    use_openai = is_openai_compatible(endpoint)
@ -1633,6 +1658,7 @@ async def proxy(request: Request):
            else:
                async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
            if stream == True:
+                content_parts: list[str] = []
                async for chunk in async_gen:
                    if use_openai:
                        chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
@ -1644,6 +1670,27 @@ async def proxy(request: Request):
                        json_line = chunk.model_dump_json()
                    else:
                        json_line = orjson.dumps(chunk)
+                    # Accumulate and store cache on done chunk — before yield so it always runs
+                    if _cache is not None:
+                        if getattr(chunk, "response", None):
+                            content_parts.append(chunk.response)
+                        if getattr(chunk, "done", False):
+                            assembled = orjson.dumps({
+                                k: v for k, v in {
+                                    "model": getattr(chunk, "model", model),
+                                    "response": "".join(content_parts),
+                                    "done": True,
+                                    "done_reason": getattr(chunk, "done_reason", "stop") or "stop",
+                                    "prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
+                                    "eval_count": getattr(chunk, "eval_count", None),
+                                    "total_duration": getattr(chunk, "total_duration", None),
+                                    "eval_duration": getattr(chunk, "eval_duration", None),
+                                }.items() if v is not None
+                            }) + b"\n"
+                            try:
+                                await _cache.set_generate(model, prompt, system or "", assembled)
+                            except Exception as _ce:
+                                print(f"[cache] set_generate (streaming) failed: {_ce}")
                    yield json_line.encode("utf-8") + b"\n"
            else:
                if use_openai:
@ -1660,7 +1707,14 @@ async def proxy(request: Request):
                    if hasattr(async_gen, "model_dump_json")
                    else orjson.dumps(async_gen)
                )
-                yield json_line.encode("utf-8") + b"\n"
+                cache_bytes = json_line.encode("utf-8") + b"\n"
+                yield cache_bytes
+                # Cache non-streaming response
+                if _cache is not None:
+                    try:
+                        await _cache.set_generate(model, prompt, system or "", cache_bytes)
+                    except Exception as _ce:
+                        print(f"[cache] set_generate (non-streaming) failed: {_ce}")

        finally:
            # Ensure counter is decremented even if an exception occurs
@ -1711,6 +1765,26 @@ async def chat_proxy(request: Request):
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

+    # Cache lookup — before endpoint selection, always bypassed for MOE
+    _is_moe = model.startswith("moe-")
+    _cache = get_llm_cache()
+    # Normalise model name for cache key: strip ":latest" suffix here so that
+    # get_chat and set_chat use the same model string regardless of when the
+    # strip happens further down (line ~1793 strips it for OpenAI endpoints).
+    _cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
+    # Snapshot original messages before any OpenAI-format transformation so that
+    # get_chat and set_chat always use the same key regardless of backend type.
+    _cache_messages = messages
+    if _cache is not None and not _is_moe:
+        _cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
+        if _cached is not None:
+            async def _serve_cached_chat():
+                yield _cached
+            return StreamingResponse(
+                _serve_cached_chat(),
+                media_type="application/x-ndjson" if stream else "application/json",
+            )
+
    # 2. Endpoint logic
    if model.startswith("moe-"):
        model = model.split("moe-")[1]
@ -1764,6 +1838,7 @@ async def chat_proxy(request: Request):
                    async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
            if stream == True:
                tc_acc = {}  # accumulate OpenAI tool-call deltas across chunks
+                content_parts: list[str] = []
                async for chunk in async_gen:
                    if use_openai:
                        _accumulate_openai_tc_delta(chunk, tc_acc)
@ -1780,6 +1855,30 @@ async def chat_proxy(request: Request):
                        json_line = chunk.model_dump_json()
                    else:
                        json_line = orjson.dumps(chunk)
+                    # Accumulate and store cache on done chunk — before yield so it always runs
+                    # Works for both Ollama-native and OpenAI-compatible backends; chunks are
+                    # already converted to Ollama format by rechunk before this point.
+                    if _cache is not None and not _is_moe:
+                        if chunk.message and getattr(chunk.message, "content", None):
+                            content_parts.append(chunk.message.content)
+                        if getattr(chunk, "done", False):
+                            assembled = orjson.dumps({
+                                k: v for k, v in {
+                                    "model": getattr(chunk, "model", model),
+                                    "created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)),
+                                    "message": {"role": "assistant", "content": "".join(content_parts)},
+                                    "done": True,
+                                    "done_reason": getattr(chunk, "done_reason", "stop") or "stop",
+                                    "prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
+                                    "eval_count": getattr(chunk, "eval_count", None),
+                                    "total_duration": getattr(chunk, "total_duration", None),
+                                    "eval_duration": getattr(chunk, "eval_duration", None),
+                                }.items() if v is not None
+                            }) + b"\n"
+                            try:
+                                await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled)
+                            except Exception as _ce:
+                                print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}")
                    yield json_line.encode("utf-8") + b"\n"
            else:
                if use_openai:
@ -1796,7 +1895,14 @@ async def chat_proxy(request: Request):
                    if hasattr(async_gen, "model_dump_json")
                    else orjson.dumps(async_gen)
                )
-                yield json_line.encode("utf-8") + b"\n"
+                cache_bytes = json_line.encode("utf-8") + b"\n"
+                yield cache_bytes
+                # Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
+                if _cache is not None and not _is_moe:
+                    try:
+                        await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
+                    except Exception as _ce:
+                        print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}")

        finally:
            # Ensure counter is decremented even if an exception occurs
@ -2680,6 +2786,21 @@ async def openai_chat_completions_proxy(request: Request):
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

+    # Cache lookup — before endpoint selection
+    _cache = get_llm_cache()
+    if _cache is not None:
+        _cached = await _cache.get_chat("openai_chat", model, messages)
+        if _cached is not None:
+            if stream:
+                _sse = openai_nonstream_to_sse(_cached, model)
+                async def _serve_cached_ochat_stream():
+                    yield _sse
+                return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream")
+            else:
+                async def _serve_cached_ochat_json():
+                    yield _cached
+                return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")
+
    # 2. Endpoint logic
    endpoint, tracking_model = await choose_endpoint(model)
    base_url = ep2base(endpoint)
@ -2699,6 +2820,8 @@ async def openai_chat_completions_proxy(request: Request):
                else:
                    raise
            if stream == True:
+                content_parts: list[str] = []
+                usage_snapshot: dict = {}
                async for chunk in async_gen:
                    data = (
                        chunk.model_dump_json()
@ -2715,6 +2838,8 @@ async def openai_chat_completions_proxy(request: Request):
                        has_tool_calls = getattr(delta, "tool_calls", None) is not None
                        if has_content or has_reasoning or has_tool_calls:
                            yield f"data: {data}\n\n".encode("utf-8")
+                        if has_content and delta.content:
+                            content_parts.append(delta.content)
                    elif chunk.usage is not None:
                        # Forward the usage-only final chunk (e.g. from llama-server)
                        yield f"data: {data}\n\n".encode("utf-8")
@ -2723,12 +2848,24 @@ async def openai_chat_completions_proxy(request: Request):
                    if chunk.usage is not None:
                        prompt_tok = chunk.usage.prompt_tokens or 0
                        comp_tok   = chunk.usage.completion_tokens or 0
+                        usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
                    else:
                        llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
                        if llama_usage:
                            prompt_tok, comp_tok = llama_usage
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
+                # Cache assembled streaming response — before [DONE] so it always runs
+                if _cache is not None and content_parts:
+                    assembled = orjson.dumps({
+                        "model": model,
+                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
+                        **({"usage": usage_snapshot} if usage_snapshot else {}),
+                    }) + b"\n"
+                    try:
+                        await _cache.set_chat("openai_chat", model, messages, assembled)
+                    except Exception as _ce:
+                        print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}")
                yield b"data: [DONE]\n\n"
            else:
                prompt_tok = 0
@ -2747,7 +2884,14 @@ async def openai_chat_completions_proxy(request: Request):
                    if hasattr(async_gen, "model_dump_json")
                    else orjson.dumps(async_gen)
                )
-                yield json_line.encode("utf-8") + b"\n"
+                cache_bytes = json_line.encode("utf-8") + b"\n"
+                yield cache_bytes
+                # Cache non-streaming response
+                if _cache is not None:
+                    try:
+                        await _cache.set_chat("openai_chat", model, messages, cache_bytes)
+                    except Exception as _ce:
+                        print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}")

        finally:
            # Ensure counter is decremented even if an exception occurs
@ -2823,6 +2967,22 @@ async def openai_completions_proxy(request: Request):
    except orjson.JSONDecodeError as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e

+    # Cache lookup — completions prompt mapped to a single-turn messages list
+    _cache = get_llm_cache()
+    _compl_messages = [{"role": "user", "content": prompt}]
+    if _cache is not None:
+        _cached = await _cache.get_chat("openai_completions", model, _compl_messages)
+        if _cached is not None:
+            if stream:
+                _sse = openai_nonstream_to_sse(_cached, model)
+                async def _serve_cached_ocompl_stream():
+                    yield _sse
+                return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream")
+            else:
+                async def _serve_cached_ocompl_json():
+                    yield _cached
+                return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json")
+
    # 2. Endpoint logic
    endpoint, tracking_model = await choose_endpoint(model)
    base_url = ep2base(endpoint)
@ -2834,6 +2994,8 @@ async def openai_completions_proxy(request: Request):
            # The chat method returns a generator of dicts (or GenerateResponse)
            async_gen = await oclient.completions.create(**params)
            if stream == True:
+                text_parts: list[str] = []
+                usage_snapshot: dict = {}
                async for chunk in async_gen:
                    data = (
                        chunk.model_dump_json()
@ -2849,6 +3011,8 @@ async def openai_completions_proxy(request: Request):
                        )
                        if has_text or has_reasoning or choice.finish_reason is not None:
                            yield f"data: {data}\n\n".encode("utf-8")
+                        if has_text and choice.text:
+                            text_parts.append(choice.text)
                    elif chunk.usage is not None:
                        # Forward the usage-only final chunk (e.g. from llama-server)
                        yield f"data: {data}\n\n".encode("utf-8")
@ -2857,12 +3021,24 @@ async def openai_completions_proxy(request: Request):
                    if chunk.usage is not None:
                        prompt_tok = chunk.usage.prompt_tokens or 0
                        comp_tok   = chunk.usage.completion_tokens or 0
+                        usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
                    else:
                        llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
                        if llama_usage:
                            prompt_tok, comp_tok = llama_usage
                    if prompt_tok != 0 or comp_tok != 0:
                        await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
+                # Cache assembled streaming response — before [DONE] so it always runs
+                if _cache is not None and text_parts:
+                    assembled = orjson.dumps({
+                        "model": model,
+                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
+                        **({"usage": usage_snapshot} if usage_snapshot else {}),
+                    }) + b"\n"
+                    try:
+                        await _cache.set_chat("openai_completions", model, _compl_messages, assembled)
+                    except Exception as _ce:
+                        print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}")
                # Final DONE event
                yield b"data: [DONE]\n\n"
            else:
@ -2882,7 +3058,14 @@ async def openai_completions_proxy(request: Request):
                    if hasattr(async_gen, "model_dump_json")
                    else orjson.dumps(async_gen)
                )
-                yield json_line.encode("utf-8") + b"\n"
+                cache_bytes = json_line.encode("utf-8") + b"\n"
+                yield cache_bytes
+                # Cache non-streaming response
+                if _cache is not None:
+                    try:
+                        await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
+                    except Exception as _ce:
+                        print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}")

        finally:
            # Ensure counter is decremented even if an exception occurs
@ -3076,6 +3259,28 @@ async def rerank_proxy(request: Request):
    finally:
        await decrement_usage(endpoint, tracking_model)

+# -------------------------------------------------------------
+# 25b. Cache management endpoints
+# -------------------------------------------------------------
+@app.get("/api/cache/stats")
+async def cache_stats():
+    """Return hit/miss counters and configuration for the LLM response cache."""
+    c = get_llm_cache()
+    if c is None:
+        return {"enabled": False}
+    return {"enabled": True, **c.stats()}
+
+
+@app.post("/api/cache/invalidate")
+async def cache_invalidate():
+    """Clear all entries from the LLM response cache and reset counters."""
+    c = get_llm_cache()
+    if c is None:
+        return {"enabled": False, "cleared": False}
+    await c.clear()
+    return {"enabled": True, "cleared": True}
+
+
 # -------------------------------------------------------------
 # 26. Serve the static front‑end
 # -------------------------------------------------------------
@ -3211,6 +3416,7 @@ async def startup_event() -> None:
    app_state["session"] = session
    token_worker_task = asyncio.create_task(token_worker())
    flush_task = asyncio.create_task(flush_buffer())
+    await init_llm_cache(config)

@app.on_event("shutdown")
 async def shutdown_event() -> None: