feat: adding a semantic cache layer

This commit is contained in:
Alpha Nerd 2026-03-08 09:12:09 +01:00
parent c3d47c7ffe
commit dd4b12da6a
13 changed files with 1138 additions and 22 deletions

44
.dockerignore Normal file
View file

@ -0,0 +1,44 @@
# Version control
.git
.gitignore
.github
# Environment & secrets
.env
.env.*
*.env
# Python artifacts
__pycache__
*.pyc
*.pyo
*.pyd
.Python
.venv
venv
*.egg-info
dist
build
# Local databases (don't bake data into image)
*.db
*.db-shm
*.db-wal
# IDE / editor
.vscode
.idea
*.swp
*.swo
# Documentation
doc/
docs/
*.md
# Tests
tests/
test_*
# Local config overrides
config.local.yaml

View file

@ -0,0 +1,71 @@
name: Build and Publish Docker Image (Semantic Cache)
# Builds the :semantic variant that includes sentence-transformers + CPU torch
# and the pre-baked all-MiniLM-L6-v2 embedding model (~500 MB larger than lean).
# Tags mirror the lean workflow but carry a -semantic suffix, e.g.:
# ghcr.io/nomyo-ai/nomyo-router:latest-semantic
# ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
# ghcr.io/nomyo-ai/nomyo-router:0.7-semantic
on:
push:
branches:
- main
tags:
- "v*.*.*"
workflow_dispatch:
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
jobs:
build-and-push-semantic:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up QEMU (for multi-arch builds)
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract Docker metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
# Versioned semantic tags on git tags (e.g. v0.7.0 → 0.7.0-semantic, 0.7-semantic)
type=semver,pattern={{version}}-semantic
type=semver,pattern={{major}}.{{minor}}-semantic
# latest-semantic only on main branch pushes
type=raw,value=latest-semantic,enable=${{ github.ref == 'refs/heads/main' }}
# SHA-tagged for traceability
type=sha,prefix=sha-,suffix=-semantic
- name: Build and push semantic Docker image
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64,linux/arm64
push: true
build-args: |
SEMANTIC_CACHE=true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

View file

@ -3,21 +3,43 @@ FROM python:3.13-slim
ENV PYTHONUNBUFFERED=1 \ ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 PYTHONDONTWRITEBYTECODE=1
# SEMANTIC_CACHE=true installs sentence-transformers + CPU-only torch and pre-bakes
# the all-MiniLM-L6-v2 embedding model (~500 MB extra). The resulting image is tagged
# :semantic. The default (lean) image supports exact-match caching only.
ARG SEMANTIC_CACHE=false
# Pin HuggingFace cache to a predictable path inside /app/data so it can be
# mounted as a volume and shared between builds.
ENV HF_HOME=/app/data/hf_cache
# Install SQLite # Install SQLite
RUN apt-get update && apt-get install -y sqlite3 RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \ RUN pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -r requirements.txt
# Semantic cache deps — only installed when SEMANTIC_CACHE=true
# CPU-only torch must be installed before sentence-transformers to avoid
# pulling the full CUDA-enabled build (~2.5 GB).
RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
pip install --no-cache-dir sentence-transformers && \
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
fi
# Create database directory and set permissions # Create database directory and set permissions
RUN mkdir -p /app/data && chown -R www-data:www-data /app/data RUN mkdir -p /app/data && chown -R www-data:www-data /app/data
COPY . . COPY . .
RUN chmod +x /app/entrypoint.sh RUN chmod +x /app/entrypoint.sh && \
chown -R www-data:www-data /app
EXPOSE 12434 EXPOSE 12434
USER www-data
ENTRYPOINT ["/app/entrypoint.sh"] ENTRYPOINT ["/app/entrypoint.sh"]

View file

@ -74,22 +74,28 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop
### Pre-built image (GitHub Container Registry) ### Pre-built image (GitHub Container Registry)
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release: Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release.
**Lean image** (exact-match cache, ~300 MB):
```sh ```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:latest docker pull ghcr.io/nomyo-ai/nomyo-router:latest
```
Specific version:
```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0 docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0
``` ```
### Build the container image locally: **Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):
```sh
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
```
### Build the container image locally
```sh ```sh
# Lean build (exact match cache, default)
docker build -t nomyo-router . docker build -t nomyo-router .
# Semantic build — sentence-transformers + model baked in
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
``` ```
Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container: Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container:
@ -124,6 +130,53 @@ This way the Ollama backend servers are utilized more efficiently than by simply u
NOMYO Router also supports OpenAI API compatible v1 backend servers. NOMYO Router also supports OpenAI API compatible v1 backend servers.
## Semantic LLM Cache
NOMYO Router includes an optional semantic cache that serves repeated or semantically similar LLM requests from cache — no endpoint round-trip, no token cost, response in <10 ms.
### Enable (exact match, any image)
```yaml
# config.yaml
cache_enabled: true
cache_backend: sqlite # persists across restarts
cache_similarity: 1.0 # exact match only
cache_ttl: 3600
```
### Enable (semantic matching, :semantic image)
```yaml
cache_enabled: true
cache_backend: sqlite
cache_similarity: 0.90 # "What is Python?" ≈ "What's Python?" → cache hit
cache_ttl: 3600
cache_history_weight: 0.3
```
Pull the semantic image:
```bash
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
```
### Cache key strategy
Each request is keyed on `model + system_prompt` (exact) combined with a weighted-mean embedding of BM25-weighted chat history (30%) and the last user message (70%). This means:
- Different system prompts → always separate cache namespaces (no cross-tenant leakage)
- Same question, different phrasing → cache hit (semantic mode)
- MOE requests (`moe-*`) → always bypass the cache
### Cached routes
`/api/chat` · `/api/generate` · `/v1/chat/completions` · `/v1/completions`
### Cache management
```bash
curl http://localhost:12434/api/cache/stats # hit rate, counters, config
curl -X POST http://localhost:12434/api/cache/invalidate # clear all entries
```
## Supplying the router API key ## Supplying the router API key
If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key: If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:

407
cache.py Normal file
View file

@ -0,0 +1,407 @@
"""
LLM Semantic Cache for NOMYO Router.
Strategy:
- Namespace: sha256(route :: model :: system_prompt)[:16] → exact context isolation
- Cache key: hash(normalize(last_user_message), namespace) → exact lookup
- Embedding: weighted mean of
    α * embed(bm25_weighted(chat_history))  → conversation context
    1-α * embed(last_user_message)          → the actual question
  with α = cache_history_weight (default 0.3).
- Exact-match caching (similarity=1.0) uses DummyEmbeddingProvider → zero extra deps.
- Semantic caching (similarity<1.0) requires sentence-transformers. If missing the
library falls back to exact-match with a warning (lean Docker image behaviour).
- MOE models (moe-*) always bypass the cache.
- Token counts are never recorded for cache hits.
- Streaming cache hits are served as a single-chunk response.
"""
import hashlib
import math
import time
import warnings
from collections import Counter
from typing import Any, Optional
# Lazily resolved once at first embed() call
_semantic_available: Optional[bool] = None
def _check_sentence_transformers() -> bool:
global _semantic_available
if _semantic_available is None:
try:
import sentence_transformers # noqa: F401
_semantic_available = True
except ImportError:
_semantic_available = False
return _semantic_available # type: ignore[return-value]
# ---------------------------------------------------------------------------
# BM25-weighted text representation of chat history
# ---------------------------------------------------------------------------
def _bm25_weighted_text(history: list[dict]) -> str:
"""
Produce a BM25-importance-weighted text string from chat history turns.
High-IDF (rare, domain-specific) terms are repeated proportionally to
their BM25 score so the downstream sentence-transformer embedding
naturally upweights topical signal and downweights stop words.
"""
docs = [m.get("content", "") for m in history if m.get("content")]
if not docs:
return ""
def _tok(text: str) -> list[str]:
return [w.lower() for w in text.split() if len(w) > 2]
tokenized = [_tok(d) for d in docs]
N = len(tokenized)
df: Counter = Counter()
for tokens in tokenized:
for term in set(tokens):
df[term] += 1
k1, b = 1.5, 0.75
avg_dl = sum(len(t) for t in tokenized) / max(N, 1)
term_scores: Counter = Counter()
for tokens in tokenized:
tf_c = Counter(tokens)
dl = len(tokens)
for term, tf in tf_c.items():
idf = math.log((N + 1) / (df[term] + 1)) + 1.0
score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1)))
term_scores[term] += score
top = term_scores.most_common(50)
if not top:
return " ".join(docs)
max_s = top[0][1]
out: list[str] = []
for term, score in top:
out.extend([term] * max(1, round(3 * score / max_s)))
return " ".join(out)
# ---------------------------------------------------------------------------
# LLMCache
# ---------------------------------------------------------------------------
class LLMCache:
    """
    Thin async wrapper around async-semantic-llm-cache that adds:
    - Route-aware namespace isolation
    - Two-vector weighted-mean embedding (history context + question)
    - Per-instance hit/miss counters
    - Graceful fallback when sentence-transformers is absent
    """

    def __init__(self, cfg: Any) -> None:
        # cfg: router Config-like object; only the cache_* attributes are read.
        self._cfg = cfg
        # Storage backend (memory / sqlite / redis); resolved in init().
        self._backend: Any = None
        # EmbeddingCache instance; dummy provider in exact-match mode.
        self._emb_cache: Any = None
        # True only when similarity < 1.0 AND sentence-transformers is importable.
        self._semantic: bool = False
        self._hits: int = 0
        self._misses: int = 0

    async def init(self) -> None:
        """Resolve backend and embedding provider from config (async setup)."""
        from semantic_llm_cache.similarity import EmbeddingCache
        # --- Backend ---
        backend_type: str = self._cfg.cache_backend
        if backend_type == "sqlite":
            from semantic_llm_cache.backends.sqlite import SQLiteBackend
            self._backend = SQLiteBackend(db_path=self._cfg.cache_db_path)
        elif backend_type == "redis":
            from semantic_llm_cache.backends.redis import RedisBackend
            self._backend = RedisBackend(url=self._cfg.cache_redis_url)
            # Fail fast at startup if Redis is unreachable.
            await self._backend.ping()
        else:
            # Any unrecognised value falls back to the in-process memory backend.
            from semantic_llm_cache.backends.memory import MemoryBackend
            self._backend = MemoryBackend()
        # --- Embedding provider ---
        if self._cfg.cache_similarity < 1.0:
            if _check_sentence_transformers():
                from semantic_llm_cache.similarity import create_embedding_provider
                provider = create_embedding_provider("sentence-transformer")
                self._emb_cache = EmbeddingCache(provider=provider)
                self._semantic = True
                print(
                    f"[cache] Semantic cache ready "
                    f"(similarity≥{self._cfg.cache_similarity}, backend={backend_type})"
                )
            else:
                # Lean image: semantic similarity configured but the ML stack
                # is missing — degrade to exact match instead of failing startup.
                warnings.warn(
                    "[cache] sentence-transformers is not installed. "
                    "Falling back to exact-match caching (similarity=1.0). "
                    "Use the :semantic Docker image tag to enable semantic caching.",
                    RuntimeWarning,
                    stacklevel=2,
                )
                self._emb_cache = EmbeddingCache()  # DummyEmbeddingProvider
                print(f"[cache] Exact-match cache ready (backend={backend_type}) [semantic unavailable]")
        else:
            self._emb_cache = EmbeddingCache()  # DummyEmbeddingProvider
            print(f"[cache] Exact-match cache ready (backend={backend_type})")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _namespace(self, route: str, model: str, system: str) -> str:
        """16-hex-char namespace; exact isolation per route/model/system prompt."""
        raw = f"{route}::{model}::{system}"
        return hashlib.sha256(raw.encode()).hexdigest()[:16]

    def _cache_key(self, namespace: str, last_user: str) -> str:
        """Exact-lookup key from the normalised last user message + namespace."""
        from semantic_llm_cache.utils import hash_prompt, normalize_prompt
        return hash_prompt(normalize_prompt(last_user), namespace)

    def _parse_messages(
        self, messages: list[dict]
    ) -> tuple[str, list[dict], str]:
        """
        Returns (system_prompt, prior_history_turns, last_user_message).
        Multimodal content lists are reduced to their text parts.
        """
        system = ""
        turns: list[dict] = []
        for m in messages:
            role = m.get("role", "")
            content = m.get("content", "")
            if isinstance(content, list):
                # Multimodal message: keep only the text fragments.
                content = " ".join(
                    p.get("text", "")
                    for p in content
                    if isinstance(p, dict) and p.get("type") == "text"
                )
            if role == "system":
                # NOTE: if multiple system messages appear, the last one wins.
                system = content
            else:
                turns.append({"role": role, "content": content})
        last_user = ""
        for m in reversed(turns):
            if m["role"] == "user":
                last_user = m["content"]
                break
        # History = all turns before the final user message
        history = turns[:-1] if turns and turns[-1]["role"] == "user" else turns
        return system, history, last_user

    async def _build_embedding(
        self, history: list[dict], last_user: str
    ) -> list[float] | None:
        """
        Weighted mean of BM25-weighted history embedding and last-user embedding.
        Returns None when not in semantic mode.
        """
        if not self._semantic:
            return None
        import numpy as np
        alpha: float = self._cfg.cache_history_weight  # weight for history signal
        q_vec = np.array(await self._emb_cache.aencode(last_user), dtype=float)
        if not history:
            # No history → use question embedding alone (alpha has no effect)
            return q_vec.tolist()
        h_text = _bm25_weighted_text(history)
        h_vec = np.array(await self._emb_cache.aencode(h_text), dtype=float)
        combined = alpha * h_vec + (1.0 - alpha) * q_vec
        # Re-normalise so cosine-similarity comparisons stay well-defined.
        norm = float(np.linalg.norm(combined))
        if norm > 0.0:
            combined /= norm
        return combined.tolist()

    # ------------------------------------------------------------------
    # Public interface: chat (handles both Ollama and OpenAI message lists)
    # ------------------------------------------------------------------
    async def get_chat(
        self, route: str, model: str, messages: list[dict]
    ) -> bytes | None:
        """Return cached response bytes, or None on miss."""
        if not self._backend:
            # init() never ran (or failed) — treat as a permanent miss.
            return None
        system, history, last_user = self._parse_messages(messages)
        if not last_user:
            return None
        ns = self._namespace(route, model, system)
        key = self._cache_key(ns, last_user)
        # 1. Exact key match
        entry = await self._backend.get(key)
        if entry is not None:
            self._hits += 1
            return entry.response  # type: ignore[return-value]
        # 2. Semantic similarity match
        if self._semantic and self._cfg.cache_similarity < 1.0:
            emb = await self._build_embedding(history, last_user)
            result = await self._backend.find_similar(
                emb, threshold=self._cfg.cache_similarity, namespace=ns
            )
            if result is not None:
                # result shape assumed (key, entry, score) — from the backend API.
                _, matched, _ = result
                self._hits += 1
                return matched.response  # type: ignore[return-value]
        self._misses += 1
        return None

    async def set_chat(
        self, route: str, model: str, messages: list[dict], response_bytes: bytes
    ) -> None:
        """Store a response in the cache (fire-and-forget friendly)."""
        if not self._backend:
            return
        system, history, last_user = self._parse_messages(messages)
        if not last_user:
            # Nothing to key on — skip silently.
            return
        ns = self._namespace(route, model, system)
        key = self._cache_key(ns, last_user)
        emb = (
            await self._build_embedding(history, last_user)
            if self._semantic and self._cfg.cache_similarity < 1.0
            else None
        )
        from semantic_llm_cache.config import CacheEntry
        await self._backend.set(
            key,
            CacheEntry(
                prompt=last_user,
                response=response_bytes,
                embedding=emb,
                created_at=time.time(),
                ttl=self._cfg.cache_ttl,
                namespace=ns,
                hit_count=0,
            ),
        )

    # ------------------------------------------------------------------
    # Convenience wrappers for the generate route (prompt string, not messages)
    # ------------------------------------------------------------------
    async def get_generate(
        self, model: str, prompt: str, system: str = ""
    ) -> bytes | None:
        """Lookup for /api/generate: wraps prompt/system as a message list."""
        messages: list[dict] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        return await self.get_chat("generate", model, messages)

    async def set_generate(
        self, model: str, prompt: str, system: str, response_bytes: bytes
    ) -> None:
        """Store for /api/generate: wraps prompt/system as a message list."""
        messages: list[dict] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        await self.set_chat("generate", model, messages, response_bytes)

    # ------------------------------------------------------------------
    # Management
    # ------------------------------------------------------------------
    def stats(self) -> dict:
        """Hit/miss counters plus the effective cache configuration."""
        total = self._hits + self._misses
        return {
            "hits": self._hits,
            "misses": self._misses,
            "hit_rate": round(self._hits / total, 3) if total else 0.0,
            "semantic": self._semantic,
            "backend": self._cfg.cache_backend,
            "similarity_threshold": self._cfg.cache_similarity,
            "history_weight": self._cfg.cache_history_weight,
        }

    async def clear(self) -> None:
        """Drop all cached entries and reset the hit/miss counters."""
        if self._backend:
            await self._backend.clear()
        self._hits = 0
        self._misses = 0
# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
_cache: LLMCache | None = None


async def init_llm_cache(cfg: Any) -> LLMCache | None:
    """
    Initialise the module-level cache singleton. Returns None if disabled.

    The global is assigned only after init() completes successfully, so a
    failed backend initialisation (e.g. unreachable Redis raising in ping())
    no longer leaves get_llm_cache() returning a half-initialised instance.
    """
    global _cache
    if not cfg.cache_enabled:
        print("[cache] Cache disabled (cache_enabled=false).")
        return None
    cache = LLMCache(cfg)
    await cache.init()  # may raise — the global then stays None
    _cache = cache
    return cache


def get_llm_cache() -> LLMCache | None:
    """Return the initialised cache singleton, or None when caching is off."""
    return _cache
# ---------------------------------------------------------------------------
# Helper: convert a stored Ollama-format non-streaming response to an
# OpenAI SSE single-chunk stream (used when a streaming OpenAI request
# hits the cache whose entry was populated from a non-streaming response).
# ---------------------------------------------------------------------------
def openai_nonstream_to_sse(cached_bytes: bytes, model: str) -> bytes:
    """
    Adapt a stored OpenAI ChatCompletion JSON for streaming clients.

    Cache entries always hold the non-streaming ChatCompletion format (so
    non-streaming hits can be served verbatim); this wraps that payload as a
    minimal single-chunk SSE stream terminated by the [DONE] sentinel.
    """
    import orjson, time as _time
    try:
        payload = orjson.loads(cached_bytes)
        first_choice = (payload.get("choices") or [{}])[0]
        text = first_choice.get("message", {}).get("content", "")
        # One delta chunk carrying the entire cached completion.
        chunk = {
            "id": payload.get("id", "cache-hit"),
            "object": "chat.completion.chunk",
            "created": payload.get("created", int(_time.time())),
            "model": payload.get("model", model),
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": text},
                    "finish_reason": "stop",
                }
            ],
        }
        usage = payload.get("usage")
        if usage:
            chunk["usage"] = usage
        return f"data: {orjson.dumps(chunk).decode()}\n\ndata: [DONE]\n\n".encode()
    except Exception as exc:
        warnings.warn(
            f"[cache] openai_nonstream_to_sse: corrupt cache entry, returning empty stream: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return b"data: [DONE]\n\n"

View file

@ -6,7 +6,7 @@ endpoints:
- https://api.openai.com/v1 - https://api.openai.com/v1
llama_server_endpoints: llama_server_endpoints:
- http://192.168.0.33:8889/v1 - http://192.168.0.50:8889/v1
# Maximum concurrent connections *per endpoint↔model pair* (equals to OLLAMA_NUM_PARALLEL) # Maximum concurrent connections *per endpoint↔model pair* (equals to OLLAMA_NUM_PARALLEL)
max_concurrent_connections: 2 max_concurrent_connections: 2
@ -22,4 +22,38 @@ api_keys:
"http://192.168.0.51:11434": "ollama" "http://192.168.0.51:11434": "ollama"
"http://192.168.0.52:11434": "ollama" "http://192.168.0.52:11434": "ollama"
"https://api.openai.com/v1": "${OPENAI_KEY}" "https://api.openai.com/v1": "${OPENAI_KEY}"
"http://192.168.0.33:8889/v1": "llama" "http://192.168.0.50:8889/v1": "llama"
# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
cache_enabled: true
# Backend — where cached responses are stored:
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
# sqlite → persistent file-based (single instance, survives restart)
# redis → distributed (shared across replicas, requires Redis)
cache_backend: memory
# Cosine similarity threshold for a cache hit:
# 1.0 → exact match only (works on any image variant)
# <1.0 → semantic matching (requires the :semantic Docker image tag)
cache_similarity: 0.9
# Response TTL in seconds. Remove the key or set to null to cache forever.
cache_ttl: 3600
# SQLite backend: path to the cache database file
cache_db_path: llm_cache.db
# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
# 0.3 = 30% history context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3

View file

@ -204,6 +204,149 @@ max_concurrent_connections: 3
**Recommendation**: Use multiple endpoints for redundancy and load distribution. **Recommendation**: Use multiple endpoints for redundancy and load distribution.
## Semantic LLM Cache
NOMYO Router can cache LLM responses and serve them directly — skipping endpoint selection, model load, and token generation entirely.
### How it works
1. On every cacheable request (`/api/chat`, `/api/generate`, `/v1/chat/completions`, `/v1/completions`) the cache is checked **before** choosing an endpoint.
2. On a **cache hit** the stored response is returned immediately as a single chunk (streaming or non-streaming — both work).
3. On a **cache miss** the request is forwarded normally. The response is stored in the cache after it completes.
4. **MOE requests** (`moe-*` model prefix) always bypass the cache.
5. **Token counts** are never recorded for cache hits.
### Cache key strategy
| Signal | How matched |
|---|---|
| `model + system_prompt` | Exact — hard context isolation per deployment |
| BM25-weighted embedding of chat history | Semantic — conversation context signal |
| Embedding of last user message | Semantic — the actual question |
The two semantic vectors are combined as a weighted mean (tuned by `cache_history_weight`) before cosine similarity comparison, staying at a single 384-dimensional vector compatible with the library's storage format.
### Quick start — exact match (lean image)
```yaml
cache_enabled: true
cache_backend: sqlite # persists across restarts
cache_similarity: 1.0 # exact match only, no sentence-transformers needed
cache_ttl: 3600
```
### Quick start — semantic matching (:semantic image)
```yaml
cache_enabled: true
cache_backend: sqlite
cache_similarity: 0.90 # hit if ≥90% cosine similarity
cache_ttl: 3600
cache_history_weight: 0.3
```
Pull the semantic image:
```bash
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
```
### Cache configuration options
#### `cache_enabled`
**Type**: `bool` | **Default**: `false`
Enable or disable the cache. All other cache settings are ignored when `false`.
#### `cache_backend`
**Type**: `str` | **Default**: `"memory"`
| Value | Description | Persists | Multi-replica |
|---|---|---|---|
| `memory` | In-process LRU dict | ❌ | ❌ |
| `sqlite` | File-based via `aiosqlite` | ✅ | ❌ |
| `redis` | Redis via `redis.asyncio` | ✅ | ✅ |
Use `redis` when running multiple router replicas behind a load balancer — all replicas share one warm cache.
#### `cache_similarity`
**Type**: `float` | **Default**: `1.0`
Cosine similarity threshold. `1.0` means exact match only (no embedding model needed). Values below `1.0` enable semantic matching, which requires the `:semantic` Docker image tag.
Recommended starting value for semantic mode: `0.90`.
#### `cache_ttl`
**Type**: `int | null` | **Default**: `3600`
Time-to-live for cache entries in seconds. Remove the key or set to `null` to cache forever.
#### `cache_db_path`
**Type**: `str` | **Default**: `"llm_cache.db"`
Path to the SQLite cache database. Only used when `cache_backend: sqlite`.
#### `cache_redis_url`
**Type**: `str` | **Default**: `"redis://localhost:6379/0"`
Redis connection URL. Only used when `cache_backend: redis`.
#### `cache_history_weight`
**Type**: `float` | **Default**: `0.3`
Weight of the BM25-weighted chat-history embedding in the combined cache key vector. `0.3` means the history contributes 30% and the final user message contributes 70% of the similarity signal. Only used when `cache_similarity < 1.0`.
### Cache management endpoints
| Endpoint | Method | Description |
|---|---|---|
| `/api/cache/stats` | `GET` | Hit/miss counters, hit rate, current config |
| `/api/cache/invalidate` | `POST` | Clear all cache entries and reset counters |
```bash
# Check cache performance
curl http://localhost:12434/api/cache/stats
# Clear the cache
curl -X POST http://localhost:12434/api/cache/invalidate
```
Example stats response:
```json
{
"enabled": true,
"hits": 1547,
"misses": 892,
"hit_rate": 0.634,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.9,
"history_weight": 0.3
}
```
### Docker image variants
| Tag | Semantic cache | Image size |
|---|---|---|
| `latest` | ❌ exact match only | ~300 MB |
| `latest-semantic` | ✅ sentence-transformers + model pre-baked | ~800 MB |
Build locally:
```bash
# Lean (exact match)
docker build -t nomyo-router .
# Semantic (~500 MB larger, all-MiniLM-L6-v2 model baked in)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
```
## Configuration Validation ## Configuration Validation
The router validates the configuration at startup: The router validates the configuration at startup:

View file

@ -82,10 +82,23 @@ sudo systemctl status nomyo-router
## 2. Docker Deployment ## 2. Docker Deployment
### Image variants
| Tag | Semantic cache | Image size |
|---|---|---|
| `latest` | ❌ exact match only | ~300 MB |
| `latest-semantic` | ✅ sentence-transformers + `all-MiniLM-L6-v2` pre-baked | ~800 MB |
The `:semantic` variant enables `cache_similarity < 1.0` in `config.yaml`. The lean image falls back to exact-match caching with a warning if semantic mode is configured.
### Build the Image ### Build the Image
```bash ```bash
# Lean build (exact match cache, default)
docker build -t nomyo-router . docker build -t nomyo-router .
# Semantic build (~500 MB larger, all-MiniLM-L6-v2 model baked in at build time)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
``` ```
### Run the Container ### Run the Container

View file

@ -1,20 +1,30 @@
# Docker Compose example for NOMYO Router with multiple Ollama instances # Docker Compose example for NOMYO Router with multiple Ollama instances
#
# Two router profiles are provided:
# nomyo-router — lean image, exact-match cache only (~300 MB)
# nomyo-router-semantic — semantic image, sentence-transformers baked in (~800 MB)
#
# Uncomment the redis service and set cache_backend: redis in config.yaml
# to share the LLM response cache across multiple router replicas.
version: '3.8' version: '3.8'
services: services:
# NOMYO Router # NOMYO Router — lean image (exact-match cache, default)
nomyo-router: nomyo-router:
image: nomyo-router:latest image: nomyo-router:latest
build: . build:
context: .
args:
SEMANTIC_CACHE: "false"
ports: ports:
- "12434:12434" - "12434:12434"
environment: environment:
- CONFIG_PATH=/app/config/config.yaml - CONFIG_PATH=/app/config/config.yaml
- NOMYO_ROUTER_DB_PATH=/app/token_counts.db - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
volumes: volumes:
- ./config:/app/config - ./config:/app/config
- router-db:/app/token_counts.db - router-data:/app/data
depends_on: depends_on:
- ollama1 - ollama1
- ollama2 - ollama2
@ -23,6 +33,45 @@ services:
networks: networks:
- nomyo-net - nomyo-net
# NOMYO Router — semantic image (cache_similarity < 1.0 support, ~800 MB)
# Build: docker compose build nomyo-router-semantic
# Switch: comment out nomyo-router above, uncomment this block.
# nomyo-router-semantic:
# image: nomyo-router:semantic
# build:
# context: .
# args:
# SEMANTIC_CACHE: "true"
# ports:
# - "12434:12434"
# environment:
# - CONFIG_PATH=/app/config/config.yaml
# - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
# volumes:
# - ./config:/app/config
# - router-data:/app/data
# - hf-cache:/app/data/hf_cache # share HuggingFace model cache across builds
# depends_on:
# - ollama1
# - ollama2
# - ollama3
# restart: unless-stopped
# networks:
# - nomyo-net
# Optional: Redis for shared LLM response cache across multiple router replicas.
# Requires cache_backend: redis in config.yaml.
# redis:
# image: redis:7-alpine
# ports:
# - "6379:6379"
# volumes:
# - redis-data:/data
# command: redis-server --save 60 1 --loglevel warning
# restart: unless-stopped
# networks:
# - nomyo-net
# Ollama Instance 1 # Ollama Instance 1
ollama1: ollama1:
image: ollama/ollama:latest image: ollama/ollama:latest
@ -87,7 +136,9 @@ services:
- nomyo-net - nomyo-net
volumes: volumes:
router-db: router-data:
# hf-cache: # uncomment when using nomyo-router-semantic
# redis-data: # uncomment when using Redis cache backend
ollama1-data: ollama1-data:
ollama2-data: ollama2-data:
ollama3-data: ollama3-data:

View file

@ -30,3 +30,37 @@ api_keys:
"https://api.openai.com/v1": "${OPENAI_KEY}" "https://api.openai.com/v1": "${OPENAI_KEY}"
"http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config "http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config
"http://192.168.0.33:8081/v1": "llama-server" "http://192.168.0.33:8081/v1": "llama-server"
# -------------------------------------------------------------
# Semantic LLM Cache (optional — disabled by default)
# Caches LLM responses to cut costs and latency on repeated or
# semantically similar prompts.
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
# MOE requests (moe-* model prefix) always bypass the cache.
# -------------------------------------------------------------
# cache_enabled: false
# Backend — where cached responses are stored:
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
# sqlite → persistent file-based (single instance, survives restart)
# redis → distributed (shared across replicas, requires Redis)
# cache_backend: memory
# Cosine similarity threshold for a cache hit:
# 1.0 → exact match only (works on any image variant)
# <1.0 → semantic matching (requires the :semantic Docker image tag)
# cache_similarity: 1.0
# Response TTL in seconds. Remove the key or set to null to cache forever.
# cache_ttl: 3600
# SQLite backend: path to the cache database file
# cache_db_path: llm_cache.db
# Redis backend: connection URL
# cache_redis_url: redis://localhost:6379/0
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
# 0.3 = 30% history context signal, 70% question signal.
# Only relevant when cache_similarity < 1.0.
# cache_history_weight: 0.3

View file

@ -133,6 +133,39 @@ Response:
} }
``` ```
### Cache Statistics
```bash
curl http://localhost:12434/api/cache/stats
```
Response when cache is enabled:
```json
{
"enabled": true,
"hits": 1547,
"misses": 892,
"hit_rate": 0.634,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.9,
"history_weight": 0.3
}
```
Response when cache is disabled:
```json
{ "enabled": false }
```
### Cache Invalidation
```bash
curl -X POST http://localhost:12434/api/cache/invalidate
```
Clears all cached entries and resets hit/miss counters.
### Real-time Usage Stream ### Real-time Usage Stream
```bash ```bash

View file

@ -39,3 +39,8 @@ uvicorn==0.38.0
uvloop uvloop
yarl==1.20.1 yarl==1.20.1
aiosqlite aiosqlite
# Semantic LLM cache — base install (exact-match mode, no heavy ML deps)
# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.:
# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0
semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git

214
router.py
View file

@ -123,6 +123,22 @@ class Config(BaseSettings):
# Database configuration # Database configuration
db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db")) db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))
# Semantic LLM Cache configuration
cache_enabled: bool = Field(default=False)
# Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed)
cache_backend: str = Field(default="memory")
# Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image)
cache_similarity: float = Field(default=1.0)
# TTL in seconds; None = cache forever
cache_ttl: Optional[int] = Field(default=3600)
# SQLite backend: path to cache database file
cache_db_path: str = Field(default="llm_cache.db")
# Redis backend: connection URL
cache_redis_url: str = Field(default="redis://localhost:6379/0")
# Weight of BM25-weighted chat-history embedding vs last-user-message embedding
# 0.3 = 30% history context signal, 70% question signal
cache_history_weight: float = Field(default=0.3)
class Config: class Config:
# Load from `config.yaml` first, then from env variables # Load from `config.yaml` first, then from env variables
env_prefix = "NOMYO_ROUTER_" env_prefix = "NOMYO_ROUTER_"
@ -188,6 +204,7 @@ def _config_path_from_env() -> Path:
from ollama._types import TokenLogprob, Logprob from ollama._types import TokenLogprob, Logprob
from db import TokenDatabase from db import TokenDatabase
from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse
# Create the global config object it will be overwritten on startup # Create the global config object it will be overwritten on startup
@ -1596,6 +1613,14 @@ async def proxy(request: Request):
error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted." error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
raise HTTPException(status_code=400, detail=error_msg) from e raise HTTPException(status_code=400, detail=error_msg) from e
# Cache lookup — before endpoint selection so no slot is wasted on a hit
_cache = get_llm_cache()
if _cache is not None:
_cached = await _cache.get_generate(model, prompt, system or "")
if _cached is not None:
async def _serve_cached_generate():
yield _cached
return StreamingResponse(_serve_cached_generate(), media_type="application/json")
endpoint, tracking_model = await choose_endpoint(model) endpoint, tracking_model = await choose_endpoint(model)
use_openai = is_openai_compatible(endpoint) use_openai = is_openai_compatible(endpoint)
@ -1633,6 +1658,7 @@ async def proxy(request: Request):
else: else:
async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive) async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
if stream == True: if stream == True:
content_parts: list[str] = []
async for chunk in async_gen: async for chunk in async_gen:
if use_openai: if use_openai:
chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts) chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
@ -1644,6 +1670,27 @@ async def proxy(request: Request):
json_line = chunk.model_dump_json() json_line = chunk.model_dump_json()
else: else:
json_line = orjson.dumps(chunk) json_line = orjson.dumps(chunk)
# Accumulate and store cache on done chunk — before yield so it always runs
if _cache is not None:
if getattr(chunk, "response", None):
content_parts.append(chunk.response)
if getattr(chunk, "done", False):
assembled = orjson.dumps({
k: v for k, v in {
"model": getattr(chunk, "model", model),
"response": "".join(content_parts),
"done": True,
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
"eval_count": getattr(chunk, "eval_count", None),
"total_duration": getattr(chunk, "total_duration", None),
"eval_duration": getattr(chunk, "eval_duration", None),
}.items() if v is not None
}) + b"\n"
try:
await _cache.set_generate(model, prompt, system or "", assembled)
except Exception as _ce:
print(f"[cache] set_generate (streaming) failed: {_ce}")
yield json_line.encode("utf-8") + b"\n" yield json_line.encode("utf-8") + b"\n"
else: else:
if use_openai: if use_openai:
@ -1660,7 +1707,14 @@ async def proxy(request: Request):
if hasattr(async_gen, "model_dump_json") if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen) else orjson.dumps(async_gen)
) )
yield json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None:
try:
await _cache.set_generate(model, prompt, system or "", cache_bytes)
except Exception as _ce:
print(f"[cache] set_generate (non-streaming) failed: {_ce}")
finally: finally:
# Ensure counter is decremented even if an exception occurs # Ensure counter is decremented even if an exception occurs
@ -1711,6 +1765,26 @@ async def chat_proxy(request: Request):
except orjson.JSONDecodeError as e: except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Cache lookup — before endpoint selection, always bypassed for MOE
_is_moe = model.startswith("moe-")
_cache = get_llm_cache()
# Normalise model name for cache key: strip ":latest" suffix here so that
# get_chat and set_chat use the same model string regardless of when the
# strip happens further down (line ~1793 strips it for OpenAI endpoints).
_cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
# Snapshot original messages before any OpenAI-format transformation so that
# get_chat and set_chat always use the same key regardless of backend type.
_cache_messages = messages
if _cache is not None and not _is_moe:
_cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
if _cached is not None:
async def _serve_cached_chat():
yield _cached
return StreamingResponse(
_serve_cached_chat(),
media_type="application/x-ndjson" if stream else "application/json",
)
# 2. Endpoint logic # 2. Endpoint logic
if model.startswith("moe-"): if model.startswith("moe-"):
model = model.split("moe-")[1] model = model.split("moe-")[1]
@ -1764,6 +1838,7 @@ async def chat_proxy(request: Request):
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs) async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
if stream == True: if stream == True:
tc_acc = {} # accumulate OpenAI tool-call deltas across chunks tc_acc = {} # accumulate OpenAI tool-call deltas across chunks
content_parts: list[str] = []
async for chunk in async_gen: async for chunk in async_gen:
if use_openai: if use_openai:
_accumulate_openai_tc_delta(chunk, tc_acc) _accumulate_openai_tc_delta(chunk, tc_acc)
@ -1780,6 +1855,30 @@ async def chat_proxy(request: Request):
json_line = chunk.model_dump_json() json_line = chunk.model_dump_json()
else: else:
json_line = orjson.dumps(chunk) json_line = orjson.dumps(chunk)
# Accumulate and store cache on done chunk — before yield so it always runs
# Works for both Ollama-native and OpenAI-compatible backends; chunks are
# already converted to Ollama format by rechunk before this point.
if _cache is not None and not _is_moe:
if chunk.message and getattr(chunk.message, "content", None):
content_parts.append(chunk.message.content)
if getattr(chunk, "done", False):
assembled = orjson.dumps({
k: v for k, v in {
"model": getattr(chunk, "model", model),
"created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)),
"message": {"role": "assistant", "content": "".join(content_parts)},
"done": True,
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
"eval_count": getattr(chunk, "eval_count", None),
"total_duration": getattr(chunk, "total_duration", None),
"eval_duration": getattr(chunk, "eval_duration", None),
}.items() if v is not None
}) + b"\n"
try:
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}")
yield json_line.encode("utf-8") + b"\n" yield json_line.encode("utf-8") + b"\n"
else: else:
if use_openai: if use_openai:
@ -1796,7 +1895,14 @@ async def chat_proxy(request: Request):
if hasattr(async_gen, "model_dump_json") if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen) else orjson.dumps(async_gen)
) )
yield json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
if _cache is not None and not _is_moe:
try:
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}")
finally: finally:
# Ensure counter is decremented even if an exception occurs # Ensure counter is decremented even if an exception occurs
@ -2680,6 +2786,21 @@ async def openai_chat_completions_proxy(request: Request):
except orjson.JSONDecodeError as e: except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Cache lookup — before endpoint selection
_cache = get_llm_cache()
if _cache is not None:
_cached = await _cache.get_chat("openai_chat", model, messages)
if _cached is not None:
if stream:
_sse = openai_nonstream_to_sse(_cached, model)
async def _serve_cached_ochat_stream():
yield _sse
return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream")
else:
async def _serve_cached_ochat_json():
yield _cached
return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")
# 2. Endpoint logic # 2. Endpoint logic
endpoint, tracking_model = await choose_endpoint(model) endpoint, tracking_model = await choose_endpoint(model)
base_url = ep2base(endpoint) base_url = ep2base(endpoint)
@ -2699,6 +2820,8 @@ async def openai_chat_completions_proxy(request: Request):
else: else:
raise raise
if stream == True: if stream == True:
content_parts: list[str] = []
usage_snapshot: dict = {}
async for chunk in async_gen: async for chunk in async_gen:
data = ( data = (
chunk.model_dump_json() chunk.model_dump_json()
@ -2715,6 +2838,8 @@ async def openai_chat_completions_proxy(request: Request):
has_tool_calls = getattr(delta, "tool_calls", None) is not None has_tool_calls = getattr(delta, "tool_calls", None) is not None
if has_content or has_reasoning or has_tool_calls: if has_content or has_reasoning or has_tool_calls:
yield f"data: {data}\n\n".encode("utf-8") yield f"data: {data}\n\n".encode("utf-8")
if has_content and delta.content:
content_parts.append(delta.content)
elif chunk.usage is not None: elif chunk.usage is not None:
# Forward the usage-only final chunk (e.g. from llama-server) # Forward the usage-only final chunk (e.g. from llama-server)
yield f"data: {data}\n\n".encode("utf-8") yield f"data: {data}\n\n".encode("utf-8")
@ -2723,12 +2848,24 @@ async def openai_chat_completions_proxy(request: Request):
if chunk.usage is not None: if chunk.usage is not None:
prompt_tok = chunk.usage.prompt_tokens or 0 prompt_tok = chunk.usage.prompt_tokens or 0
comp_tok = chunk.usage.completion_tokens or 0 comp_tok = chunk.usage.completion_tokens or 0
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
else: else:
llama_usage = rechunk.extract_usage_from_llama_timings(chunk) llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
if llama_usage: if llama_usage:
prompt_tok, comp_tok = llama_usage prompt_tok, comp_tok = llama_usage
if prompt_tok != 0 or comp_tok != 0: if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and content_parts:
assembled = orjson.dumps({
"model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
**({"usage": usage_snapshot} if usage_snapshot else {}),
}) + b"\n"
try:
await _cache.set_chat("openai_chat", model, messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}")
yield b"data: [DONE]\n\n" yield b"data: [DONE]\n\n"
else: else:
prompt_tok = 0 prompt_tok = 0
@ -2747,7 +2884,14 @@ async def openai_chat_completions_proxy(request: Request):
if hasattr(async_gen, "model_dump_json") if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen) else orjson.dumps(async_gen)
) )
yield json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None:
try:
await _cache.set_chat("openai_chat", model, messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}")
finally: finally:
# Ensure counter is decremented even if an exception occurs # Ensure counter is decremented even if an exception occurs
@ -2823,6 +2967,22 @@ async def openai_completions_proxy(request: Request):
except orjson.JSONDecodeError as e: except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Cache lookup — completions prompt mapped to a single-turn messages list
_cache = get_llm_cache()
_compl_messages = [{"role": "user", "content": prompt}]
if _cache is not None:
_cached = await _cache.get_chat("openai_completions", model, _compl_messages)
if _cached is not None:
if stream:
_sse = openai_nonstream_to_sse(_cached, model)
async def _serve_cached_ocompl_stream():
yield _sse
return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream")
else:
async def _serve_cached_ocompl_json():
yield _cached
return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json")
# 2. Endpoint logic # 2. Endpoint logic
endpoint, tracking_model = await choose_endpoint(model) endpoint, tracking_model = await choose_endpoint(model)
base_url = ep2base(endpoint) base_url = ep2base(endpoint)
@ -2834,6 +2994,8 @@ async def openai_completions_proxy(request: Request):
# The chat method returns a generator of dicts (or GenerateResponse) # The chat method returns a generator of dicts (or GenerateResponse)
async_gen = await oclient.completions.create(**params) async_gen = await oclient.completions.create(**params)
if stream == True: if stream == True:
text_parts: list[str] = []
usage_snapshot: dict = {}
async for chunk in async_gen: async for chunk in async_gen:
data = ( data = (
chunk.model_dump_json() chunk.model_dump_json()
@ -2849,6 +3011,8 @@ async def openai_completions_proxy(request: Request):
) )
if has_text or has_reasoning or choice.finish_reason is not None: if has_text or has_reasoning or choice.finish_reason is not None:
yield f"data: {data}\n\n".encode("utf-8") yield f"data: {data}\n\n".encode("utf-8")
if has_text and choice.text:
text_parts.append(choice.text)
elif chunk.usage is not None: elif chunk.usage is not None:
# Forward the usage-only final chunk (e.g. from llama-server) # Forward the usage-only final chunk (e.g. from llama-server)
yield f"data: {data}\n\n".encode("utf-8") yield f"data: {data}\n\n".encode("utf-8")
@ -2857,12 +3021,24 @@ async def openai_completions_proxy(request: Request):
if chunk.usage is not None: if chunk.usage is not None:
prompt_tok = chunk.usage.prompt_tokens or 0 prompt_tok = chunk.usage.prompt_tokens or 0
comp_tok = chunk.usage.completion_tokens or 0 comp_tok = chunk.usage.completion_tokens or 0
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
else: else:
llama_usage = rechunk.extract_usage_from_llama_timings(chunk) llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
if llama_usage: if llama_usage:
prompt_tok, comp_tok = llama_usage prompt_tok, comp_tok = llama_usage
if prompt_tok != 0 or comp_tok != 0: if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and text_parts:
assembled = orjson.dumps({
"model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
**({"usage": usage_snapshot} if usage_snapshot else {}),
}) + b"\n"
try:
await _cache.set_chat("openai_completions", model, _compl_messages, assembled)
except Exception as _ce:
print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}")
# Final DONE event # Final DONE event
yield b"data: [DONE]\n\n" yield b"data: [DONE]\n\n"
else: else:
@ -2882,7 +3058,14 @@ async def openai_completions_proxy(request: Request):
if hasattr(async_gen, "model_dump_json") if hasattr(async_gen, "model_dump_json")
else orjson.dumps(async_gen) else orjson.dumps(async_gen)
) )
yield json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes
# Cache non-streaming response
if _cache is not None:
try:
await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
except Exception as _ce:
print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}")
finally: finally:
# Ensure counter is decremented even if an exception occurs # Ensure counter is decremented even if an exception occurs
@ -3076,6 +3259,28 @@ async def rerank_proxy(request: Request):
finally: finally:
await decrement_usage(endpoint, tracking_model) await decrement_usage(endpoint, tracking_model)
# -------------------------------------------------------------
# 25b. Cache management endpoints
# -------------------------------------------------------------
@app.get("/api/cache/stats")
async def cache_stats():
    """Report the LLM response cache's hit/miss counters and configuration.

    Returns ``{"enabled": False}`` when no cache has been initialised
    (``get_llm_cache()`` yields ``None``); otherwise merges
    ``enabled: True`` with the cache's own ``stats()`` dictionary
    (hits, misses, hit_rate, backend, thresholds, ...).
    """
    cache = get_llm_cache()
    if cache is not None:
        return {"enabled": True, **cache.stats()}
    return {"enabled": False}
@app.post("/api/cache/invalidate")
async def cache_invalidate():
    """Drop every entry from the LLM response cache and reset its counters.

    No-op reporting ``{"enabled": False, "cleared": False}`` when caching
    is disabled (``get_llm_cache()`` yields ``None``).
    """
    cache = get_llm_cache()
    enabled = cache is not None
    if enabled:
        # clear() wipes stored responses and resets the hit/miss stats
        await cache.clear()
    return {"enabled": enabled, "cleared": enabled}
# ------------------------------------------------------------- # -------------------------------------------------------------
# 26. Serve the static frontend # 26. Serve the static frontend
# ------------------------------------------------------------- # -------------------------------------------------------------
@ -3211,6 +3416,7 @@ async def startup_event() -> None:
app_state["session"] = session app_state["session"] = session
token_worker_task = asyncio.create_task(token_worker()) token_worker_task = asyncio.create_task(token_worker())
flush_task = asyncio.create_task(flush_buffer()) flush_task = asyncio.create_task(flush_buffer())
await init_llm_cache(config)
@app.on_event("shutdown") @app.on_event("shutdown")
async def shutdown_event() -> None: async def shutdown_event() -> None: