feat: adding a semantic cache layer
This commit is contained in:
parent
c3d47c7ffe
commit
dd4b12da6a
13 changed files with 1138 additions and 22 deletions
44
.dockerignore
Normal file
44
.dockerignore
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
# Version control
|
||||
.git
|
||||
.gitignore
|
||||
.github
|
||||
|
||||
# Environment & secrets
|
||||
.env
|
||||
.env.*
|
||||
*.env
|
||||
|
||||
# Python artifacts
|
||||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
.Python
|
||||
.venv
|
||||
venv
|
||||
*.egg-info
|
||||
dist
|
||||
build
|
||||
|
||||
# Local databases (don't bake data into image)
|
||||
*.db
|
||||
*.db-shm
|
||||
*.db-wal
|
||||
|
||||
# IDE / editor
|
||||
.vscode
|
||||
.idea
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# Documentation
|
||||
doc/
|
||||
docs/
|
||||
*.md
|
||||
|
||||
# Tests
|
||||
tests/
|
||||
test_*
|
||||
|
||||
# Local config overrides
|
||||
config.local.yaml
|
||||
71
.github/workflows/docker-publish-semantic.yml
vendored
Normal file
71
.github/workflows/docker-publish-semantic.yml
vendored
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
name: Build and Publish Docker Image (Semantic Cache)
|
||||
|
||||
# Builds the :semantic variant that includes sentence-transformers + CPU torch
|
||||
# and the pre-baked all-MiniLM-L6-v2 embedding model (~500 MB larger than lean).
|
||||
# Tags mirror the lean workflow but carry a -semantic suffix, e.g.:
|
||||
# ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||
# ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
|
||||
# ghcr.io/nomyo-ai/nomyo-router:0.7-semantic
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- "v*.*.*"
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE_NAME: ${{ github.repository }}
|
||||
|
||||
jobs:
|
||||
build-and-push-semantic:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU (for multi-arch builds)
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to GitHub Container Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract Docker metadata
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||
tags: |
|
||||
# Versioned semantic tags on git tags (e.g. v0.7.0 → 0.7.0-semantic, 0.7-semantic)
|
||||
type=semver,pattern={{version}}-semantic
|
||||
type=semver,pattern={{major}}.{{minor}}-semantic
|
||||
# latest-semantic only on main branch pushes
|
||||
type=raw,value=latest-semantic,enable=${{ github.ref == 'refs/heads/main' }}
|
||||
# SHA-tagged for traceability
|
||||
type=sha,prefix=sha-,suffix=-semantic
|
||||
|
||||
- name: Build and push semantic Docker image
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: true
|
||||
build-args: |
|
||||
SEMANTIC_CACHE=true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
26
Dockerfile
26
Dockerfile
|
|
@ -3,21 +3,43 @@ FROM python:3.13-slim
|
|||
ENV PYTHONUNBUFFERED=1 \
|
||||
PYTHONDONTWRITEBYTECODE=1
|
||||
|
||||
# SEMANTIC_CACHE=true installs sentence-transformers + CPU-only torch and pre-bakes
|
||||
# the all-MiniLM-L6-v2 embedding model (~500 MB extra). The resulting image is tagged
|
||||
# :semantic. The default (lean) image supports exact-match caching only.
|
||||
ARG SEMANTIC_CACHE=false
|
||||
|
||||
# Pin HuggingFace cache to a predictable path inside /app/data so it can be
|
||||
# mounted as a volume and shared between builds.
|
||||
ENV HF_HOME=/app/data/hf_cache
|
||||
|
||||
# Install SQLite
|
||||
RUN apt-get update && apt-get install -y sqlite3
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir --upgrade pip \
|
||||
&& pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Semantic cache deps — only installed when SEMANTIC_CACHE=true
|
||||
# CPU-only torch must be installed before sentence-transformers to avoid
|
||||
# pulling the full CUDA-enabled build (~2.5 GB).
|
||||
RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
|
||||
pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
|
||||
pip install --no-cache-dir sentence-transformers && \
|
||||
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
|
||||
fi
|
||||
|
||||
# Create database directory and set permissions
|
||||
RUN mkdir -p /app/data && chown -R www-data:www-data /app/data
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN chmod +x /app/entrypoint.sh
|
||||
RUN chmod +x /app/entrypoint.sh && \
|
||||
chown -R www-data:www-data /app
|
||||
|
||||
EXPOSE 12434
|
||||
|
||||
USER www-data
|
||||
|
||||
ENTRYPOINT ["/app/entrypoint.sh"]
|
||||
|
|
|
|||
67
README.md
67
README.md
|
|
@ -74,22 +74,28 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop
|
|||
|
||||
### Pre-built image (GitHub Container Registry)
|
||||
|
||||
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release:
|
||||
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release.
|
||||
|
||||
**Lean image** (exact-match cache, ~300 MB):
|
||||
```sh
|
||||
docker pull ghcr.io/nomyo-ai/nomyo-router:latest
|
||||
```
|
||||
|
||||
Specific version:
|
||||
|
||||
```sh
|
||||
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0
|
||||
```
|
||||
|
||||
### Build the container image locally:
|
||||
**Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):
|
||||
```sh
|
||||
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
|
||||
```
|
||||
|
||||
### Build the container image locally
|
||||
|
||||
```sh
|
||||
# Lean build (exact match cache, default)
|
||||
docker build -t nomyo-router .
|
||||
|
||||
# Semantic build — sentence-transformers + model baked in
|
||||
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
|
||||
```
|
||||
|
||||
Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container:
|
||||
|
|
@ -124,6 +130,53 @@ This way the Ollama backend servers are utilized more efficient than by simply u
|
|||
|
||||
NOMYO Router also supports OpenAI API compatible v1 backend servers.
|
||||
|
||||
## Semantic LLM Cache
|
||||
|
||||
NOMYO Router includes an optional semantic cache that serves repeated or semantically similar LLM requests from cache — no endpoint round-trip, no token cost, response in <10 ms.
|
||||
|
||||
### Enable (exact match, any image)
|
||||
|
||||
```yaml
|
||||
# config.yaml
|
||||
cache_enabled: true
|
||||
cache_backend: sqlite # persists across restarts
|
||||
cache_similarity: 1.0 # exact match only
|
||||
cache_ttl: 3600
|
||||
```
|
||||
|
||||
### Enable (semantic matching, :semantic image)
|
||||
|
||||
```yaml
|
||||
cache_enabled: true
|
||||
cache_backend: sqlite
|
||||
cache_similarity: 0.90 # "What is Python?" ≈ "What's Python?" → cache hit
|
||||
cache_ttl: 3600
|
||||
cache_history_weight: 0.3
|
||||
```
|
||||
|
||||
Pull the semantic image:
|
||||
```bash
|
||||
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||
```
|
||||
|
||||
### Cache key strategy
|
||||
|
||||
Each request is keyed on `model + system_prompt` (exact) combined with a weighted-mean embedding of BM25-weighted chat history (30%) and the last user message (70%). This means:
|
||||
- Different system prompts → always separate cache namespaces (no cross-tenant leakage)
|
||||
- Same question, different phrasing → cache hit (semantic mode)
|
||||
- MOE requests (`moe-*`) → always bypass the cache
|
||||
|
||||
### Cached routes
|
||||
|
||||
`/api/chat` · `/api/generate` · `/v1/chat/completions` · `/v1/completions`
|
||||
|
||||
### Cache management
|
||||
|
||||
```bash
|
||||
curl http://localhost:12434/api/cache/stats # hit rate, counters, config
|
||||
curl -X POST http://localhost:12434/api/cache/invalidate # clear all entries
|
||||
```
|
||||
|
||||
## Supplying the router API key
|
||||
|
||||
If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:
|
||||
|
|
|
|||
407
cache.py
Normal file
407
cache.py
Normal file
|
|
@ -0,0 +1,407 @@
|
|||
"""
|
||||
LLM Semantic Cache for NOMYO Router.
|
||||
|
||||
Strategy:
|
||||
- Namespace: sha256(route :: model :: system_prompt)[:16] — exact context isolation
|
||||
- Cache key: hash(normalize(last_user_message), namespace) — exact lookup
|
||||
- Embedding: weighted mean of
|
||||
α * embed(bm25_weighted(chat_history)) — conversation context
|
||||
1-α * embed(last_user_message) — the actual question
|
||||
with α = cache_history_weight (default 0.3).
|
||||
- Exact-match caching (similarity=1.0) uses DummyEmbeddingProvider — zero extra deps.
|
||||
- Semantic caching (similarity<1.0) requires sentence-transformers. If missing the
|
||||
library falls back to exact-match with a warning (lean Docker image behaviour).
|
||||
- MOE models (moe-*) always bypass the cache.
|
||||
- Token counts are never recorded for cache hits.
|
||||
- Streaming cache hits are served as a single-chunk response.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import math
|
||||
import time
|
||||
import warnings
|
||||
from collections import Counter
|
||||
from typing import Any, Optional
|
||||
|
||||
# Lazily resolved once at first embed() call
|
||||
_semantic_available: Optional[bool] = None
|
||||
|
||||
|
||||
def _check_sentence_transformers() -> bool:
|
||||
global _semantic_available
|
||||
if _semantic_available is None:
|
||||
try:
|
||||
import sentence_transformers # noqa: F401
|
||||
_semantic_available = True
|
||||
except ImportError:
|
||||
_semantic_available = False
|
||||
return _semantic_available # type: ignore[return-value]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# BM25-weighted text representation of chat history
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _bm25_weighted_text(history: list[dict]) -> str:
|
||||
"""
|
||||
Produce a BM25-importance-weighted text string from chat history turns.
|
||||
|
||||
High-IDF (rare, domain-specific) terms are repeated proportionally to
|
||||
their BM25 score so the downstream sentence-transformer embedding
|
||||
naturally upweights topical signal and downweights stop words.
|
||||
"""
|
||||
docs = [m.get("content", "") for m in history if m.get("content")]
|
||||
if not docs:
|
||||
return ""
|
||||
|
||||
def _tok(text: str) -> list[str]:
|
||||
return [w.lower() for w in text.split() if len(w) > 2]
|
||||
|
||||
tokenized = [_tok(d) for d in docs]
|
||||
N = len(tokenized)
|
||||
|
||||
df: Counter = Counter()
|
||||
for tokens in tokenized:
|
||||
for term in set(tokens):
|
||||
df[term] += 1
|
||||
|
||||
k1, b = 1.5, 0.75
|
||||
avg_dl = sum(len(t) for t in tokenized) / max(N, 1)
|
||||
|
||||
term_scores: Counter = Counter()
|
||||
for tokens in tokenized:
|
||||
tf_c = Counter(tokens)
|
||||
dl = len(tokens)
|
||||
for term, tf in tf_c.items():
|
||||
idf = math.log((N + 1) / (df[term] + 1)) + 1.0
|
||||
score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1)))
|
||||
term_scores[term] += score
|
||||
|
||||
top = term_scores.most_common(50)
|
||||
if not top:
|
||||
return " ".join(docs)
|
||||
|
||||
max_s = top[0][1]
|
||||
out: list[str] = []
|
||||
for term, score in top:
|
||||
out.extend([term] * max(1, round(3 * score / max_s)))
|
||||
return " ".join(out)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LLMCache
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class LLMCache:
|
||||
"""
|
||||
Thin async wrapper around async-semantic-llm-cache that adds:
|
||||
- Route-aware namespace isolation
|
||||
- Two-vector weighted-mean embedding (history context + question)
|
||||
- Per-instance hit/miss counters
|
||||
- Graceful fallback when sentence-transformers is absent
|
||||
"""
|
||||
|
||||
def __init__(self, cfg: Any) -> None:
|
||||
self._cfg = cfg
|
||||
self._backend: Any = None
|
||||
self._emb_cache: Any = None
|
||||
self._semantic: bool = False
|
||||
self._hits: int = 0
|
||||
self._misses: int = 0
|
||||
|
||||
async def init(self) -> None:
|
||||
from semantic_llm_cache.similarity import EmbeddingCache
|
||||
|
||||
# --- Backend ---
|
||||
backend_type: str = self._cfg.cache_backend
|
||||
if backend_type == "sqlite":
|
||||
from semantic_llm_cache.backends.sqlite import SQLiteBackend
|
||||
self._backend = SQLiteBackend(db_path=self._cfg.cache_db_path)
|
||||
elif backend_type == "redis":
|
||||
from semantic_llm_cache.backends.redis import RedisBackend
|
||||
self._backend = RedisBackend(url=self._cfg.cache_redis_url)
|
||||
await self._backend.ping()
|
||||
else:
|
||||
from semantic_llm_cache.backends.memory import MemoryBackend
|
||||
self._backend = MemoryBackend()
|
||||
|
||||
# --- Embedding provider ---
|
||||
if self._cfg.cache_similarity < 1.0:
|
||||
if _check_sentence_transformers():
|
||||
from semantic_llm_cache.similarity import create_embedding_provider
|
||||
provider = create_embedding_provider("sentence-transformer")
|
||||
self._emb_cache = EmbeddingCache(provider=provider)
|
||||
self._semantic = True
|
||||
print(
|
||||
f"[cache] Semantic cache ready "
|
||||
f"(similarity≥{self._cfg.cache_similarity}, backend={backend_type})"
|
||||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
"[cache] sentence-transformers is not installed. "
|
||||
"Falling back to exact-match caching (similarity=1.0). "
|
||||
"Use the :semantic Docker image tag to enable semantic caching.",
|
||||
RuntimeWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
self._emb_cache = EmbeddingCache() # DummyEmbeddingProvider
|
||||
print(f"[cache] Exact-match cache ready (backend={backend_type}) [semantic unavailable]")
|
||||
else:
|
||||
self._emb_cache = EmbeddingCache() # DummyEmbeddingProvider
|
||||
print(f"[cache] Exact-match cache ready (backend={backend_type})")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _namespace(self, route: str, model: str, system: str) -> str:
|
||||
raw = f"{route}::{model}::{system}"
|
||||
return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
||||
|
||||
def _cache_key(self, namespace: str, last_user: str) -> str:
|
||||
from semantic_llm_cache.utils import hash_prompt, normalize_prompt
|
||||
return hash_prompt(normalize_prompt(last_user), namespace)
|
||||
|
||||
def _parse_messages(
|
||||
self, messages: list[dict]
|
||||
) -> tuple[str, list[dict], str]:
|
||||
"""
|
||||
Returns (system_prompt, prior_history_turns, last_user_message).
|
||||
Multimodal content lists are reduced to their text parts.
|
||||
"""
|
||||
system = ""
|
||||
turns: list[dict] = []
|
||||
|
||||
for m in messages:
|
||||
role = m.get("role", "")
|
||||
content = m.get("content", "")
|
||||
if isinstance(content, list):
|
||||
content = " ".join(
|
||||
p.get("text", "")
|
||||
for p in content
|
||||
if isinstance(p, dict) and p.get("type") == "text"
|
||||
)
|
||||
if role == "system":
|
||||
system = content
|
||||
else:
|
||||
turns.append({"role": role, "content": content})
|
||||
|
||||
last_user = ""
|
||||
for m in reversed(turns):
|
||||
if m["role"] == "user":
|
||||
last_user = m["content"]
|
||||
break
|
||||
|
||||
# History = all turns before the final user message
|
||||
history = turns[:-1] if turns and turns[-1]["role"] == "user" else turns
|
||||
return system, history, last_user
|
||||
|
||||
async def _build_embedding(
|
||||
self, history: list[dict], last_user: str
|
||||
) -> list[float] | None:
|
||||
"""
|
||||
Weighted mean of BM25-weighted history embedding and last-user embedding.
|
||||
Returns None when not in semantic mode.
|
||||
"""
|
||||
if not self._semantic:
|
||||
return None
|
||||
|
||||
import numpy as np
|
||||
|
||||
alpha: float = self._cfg.cache_history_weight # weight for history signal
|
||||
q_vec = np.array(await self._emb_cache.aencode(last_user), dtype=float)
|
||||
|
||||
if not history:
|
||||
# No history → use question embedding alone (alpha has no effect)
|
||||
return q_vec.tolist()
|
||||
|
||||
h_text = _bm25_weighted_text(history)
|
||||
h_vec = np.array(await self._emb_cache.aencode(h_text), dtype=float)
|
||||
|
||||
combined = alpha * h_vec + (1.0 - alpha) * q_vec
|
||||
norm = float(np.linalg.norm(combined))
|
||||
if norm > 0.0:
|
||||
combined /= norm
|
||||
return combined.tolist()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public interface: chat (handles both Ollama and OpenAI message lists)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_chat(
|
||||
self, route: str, model: str, messages: list[dict]
|
||||
) -> bytes | None:
|
||||
"""Return cached response bytes, or None on miss."""
|
||||
if not self._backend:
|
||||
return None
|
||||
|
||||
system, history, last_user = self._parse_messages(messages)
|
||||
if not last_user:
|
||||
return None
|
||||
|
||||
ns = self._namespace(route, model, system)
|
||||
key = self._cache_key(ns, last_user)
|
||||
|
||||
# 1. Exact key match
|
||||
entry = await self._backend.get(key)
|
||||
if entry is not None:
|
||||
self._hits += 1
|
||||
return entry.response # type: ignore[return-value]
|
||||
|
||||
# 2. Semantic similarity match
|
||||
if self._semantic and self._cfg.cache_similarity < 1.0:
|
||||
emb = await self._build_embedding(history, last_user)
|
||||
result = await self._backend.find_similar(
|
||||
emb, threshold=self._cfg.cache_similarity, namespace=ns
|
||||
)
|
||||
if result is not None:
|
||||
_, matched, _ = result
|
||||
self._hits += 1
|
||||
return matched.response # type: ignore[return-value]
|
||||
|
||||
self._misses += 1
|
||||
return None
|
||||
|
||||
async def set_chat(
|
||||
self, route: str, model: str, messages: list[dict], response_bytes: bytes
|
||||
) -> None:
|
||||
"""Store a response in the cache (fire-and-forget friendly)."""
|
||||
if not self._backend:
|
||||
return
|
||||
|
||||
system, history, last_user = self._parse_messages(messages)
|
||||
if not last_user:
|
||||
return
|
||||
|
||||
ns = self._namespace(route, model, system)
|
||||
key = self._cache_key(ns, last_user)
|
||||
|
||||
emb = (
|
||||
await self._build_embedding(history, last_user)
|
||||
if self._semantic and self._cfg.cache_similarity < 1.0
|
||||
else None
|
||||
)
|
||||
|
||||
from semantic_llm_cache.config import CacheEntry
|
||||
|
||||
await self._backend.set(
|
||||
key,
|
||||
CacheEntry(
|
||||
prompt=last_user,
|
||||
response=response_bytes,
|
||||
embedding=emb,
|
||||
created_at=time.time(),
|
||||
ttl=self._cfg.cache_ttl,
|
||||
namespace=ns,
|
||||
hit_count=0,
|
||||
),
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Convenience wrappers for the generate route (prompt string, not messages)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_generate(
|
||||
self, model: str, prompt: str, system: str = ""
|
||||
) -> bytes | None:
|
||||
messages: list[dict] = []
|
||||
if system:
|
||||
messages.append({"role": "system", "content": system})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
return await self.get_chat("generate", model, messages)
|
||||
|
||||
async def set_generate(
|
||||
self, model: str, prompt: str, system: str, response_bytes: bytes
|
||||
) -> None:
|
||||
messages: list[dict] = []
|
||||
if system:
|
||||
messages.append({"role": "system", "content": system})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
await self.set_chat("generate", model, messages, response_bytes)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Management
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def stats(self) -> dict:
|
||||
total = self._hits + self._misses
|
||||
return {
|
||||
"hits": self._hits,
|
||||
"misses": self._misses,
|
||||
"hit_rate": round(self._hits / total, 3) if total else 0.0,
|
||||
"semantic": self._semantic,
|
||||
"backend": self._cfg.cache_backend,
|
||||
"similarity_threshold": self._cfg.cache_similarity,
|
||||
"history_weight": self._cfg.cache_history_weight,
|
||||
}
|
||||
|
||||
async def clear(self) -> None:
|
||||
if self._backend:
|
||||
await self._backend.clear()
|
||||
self._hits = 0
|
||||
self._misses = 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level singleton
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_cache: LLMCache | None = None
|
||||
|
||||
|
||||
async def init_llm_cache(cfg: Any) -> LLMCache | None:
|
||||
"""Initialise the module-level cache singleton. Returns None if disabled."""
|
||||
global _cache
|
||||
if not cfg.cache_enabled:
|
||||
print("[cache] Cache disabled (cache_enabled=false).")
|
||||
return None
|
||||
_cache = LLMCache(cfg)
|
||||
await _cache.init()
|
||||
return _cache
|
||||
|
||||
|
||||
def get_llm_cache() -> LLMCache | None:
|
||||
return _cache
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper: convert a stored Ollama-format non-streaming response to an
|
||||
# OpenAI SSE single-chunk stream (used when a streaming OpenAI request
|
||||
# hits the cache whose entry was populated from a non-streaming response).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def openai_nonstream_to_sse(cached_bytes: bytes, model: str) -> bytes:
|
||||
"""
|
||||
Wrap a stored OpenAI ChatCompletion JSON as a minimal single-chunk SSE stream.
|
||||
The stored entry always uses the non-streaming ChatCompletion format so that
|
||||
non-streaming cache hits can be served directly; this function adapts it for
|
||||
streaming clients.
|
||||
"""
|
||||
import orjson, time as _time
|
||||
|
||||
try:
|
||||
d = orjson.loads(cached_bytes)
|
||||
content = (d.get("choices") or [{}])[0].get("message", {}).get("content", "")
|
||||
chunk = {
|
||||
"id": d.get("id", "cache-hit"),
|
||||
"object": "chat.completion.chunk",
|
||||
"created": d.get("created", int(_time.time())),
|
||||
"model": d.get("model", model),
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"delta": {"role": "assistant", "content": content},
|
||||
"finish_reason": "stop",
|
||||
}
|
||||
],
|
||||
}
|
||||
if d.get("usage"):
|
||||
chunk["usage"] = d["usage"]
|
||||
return f"data: {orjson.dumps(chunk).decode()}\n\ndata: [DONE]\n\n".encode()
|
||||
except Exception as exc:
|
||||
warnings.warn(
|
||||
f"[cache] openai_nonstream_to_sse: corrupt cache entry, returning empty stream: {exc}",
|
||||
RuntimeWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return b"data: [DONE]\n\n"
|
||||
38
config.yaml
38
config.yaml
|
|
@ -6,7 +6,7 @@ endpoints:
|
|||
- https://api.openai.com/v1
|
||||
|
||||
llama_server_endpoints:
|
||||
- http://192.168.0.33:8889/v1
|
||||
- http://192.168.0.50:8889/v1
|
||||
|
||||
# Maximum concurrent connections *per endpoint‑model pair* (equals to OLLAMA_NUM_PARALLEL)
|
||||
max_concurrent_connections: 2
|
||||
|
|
@ -22,4 +22,38 @@ api_keys:
|
|||
"http://192.168.0.51:11434": "ollama"
|
||||
"http://192.168.0.52:11434": "ollama"
|
||||
"https://api.openai.com/v1": "${OPENAI_KEY}"
|
||||
"http://192.168.0.33:8889/v1": "llama"
|
||||
"http://192.168.0.50:8889/v1": "llama"
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# Semantic LLM Cache (optional — disabled by default)
|
||||
# Caches LLM responses to cut costs and latency on repeated or
|
||||
# semantically similar prompts.
|
||||
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
|
||||
# MOE requests (moe-* model prefix) always bypass the cache.
|
||||
# -------------------------------------------------------------
|
||||
cache_enabled: true
|
||||
|
||||
# Backend — where cached responses are stored:
|
||||
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
|
||||
# sqlite → persistent file-based (single instance, survives restart)
|
||||
# redis → distributed (shared across replicas, requires Redis)
|
||||
cache_backend: memory
|
||||
|
||||
# Cosine similarity threshold for a cache hit:
|
||||
# 1.0 → exact match only (works on any image variant)
|
||||
# <1.0 → semantic matching (requires the :semantic Docker image tag)
|
||||
cache_similarity: 0.9
|
||||
|
||||
# Response TTL in seconds. Remove the key or set to null to cache forever.
|
||||
cache_ttl: 3600
|
||||
|
||||
# SQLite backend: path to the cache database file
|
||||
cache_db_path: llm_cache.db
|
||||
|
||||
# Redis backend: connection URL
|
||||
# cache_redis_url: redis://localhost:6379/0
|
||||
|
||||
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
|
||||
# 0.3 = 30% history context signal, 70% question signal.
|
||||
# Only relevant when cache_similarity < 1.0.
|
||||
# cache_history_weight: 0.3
|
||||
|
|
@ -204,6 +204,149 @@ max_concurrent_connections: 3
|
|||
|
||||
**Recommendation**: Use multiple endpoints for redundancy and load distribution.
|
||||
|
||||
## Semantic LLM Cache
|
||||
|
||||
NOMYO Router can cache LLM responses and serve them directly — skipping endpoint selection, model load, and token generation entirely.
|
||||
|
||||
### How it works
|
||||
|
||||
1. On every cacheable request (`/api/chat`, `/api/generate`, `/v1/chat/completions`, `/v1/completions`) the cache is checked **before** choosing an endpoint.
|
||||
2. On a **cache hit** the stored response is returned immediately as a single chunk (streaming or non-streaming — both work).
|
||||
3. On a **cache miss** the request is forwarded normally. The response is stored in the cache after it completes.
|
||||
4. **MOE requests** (`moe-*` model prefix) always bypass the cache.
|
||||
5. **Token counts** are never recorded for cache hits.
|
||||
|
||||
### Cache key strategy
|
||||
|
||||
| Signal | How matched |
|
||||
|---|---|
|
||||
| `model + system_prompt` | Exact — hard context isolation per deployment |
|
||||
| BM25-weighted embedding of chat history | Semantic — conversation context signal |
|
||||
| Embedding of last user message | Semantic — the actual question |
|
||||
|
||||
The two semantic vectors are combined as a weighted mean (tuned by `cache_history_weight`) before cosine similarity comparison, staying at a single 384-dimensional vector compatible with the library's storage format.
|
||||
|
||||
### Quick start — exact match (lean image)
|
||||
|
||||
```yaml
|
||||
cache_enabled: true
|
||||
cache_backend: sqlite # persists across restarts
|
||||
cache_similarity: 1.0 # exact match only, no sentence-transformers needed
|
||||
cache_ttl: 3600
|
||||
```
|
||||
|
||||
### Quick start — semantic matching (:semantic image)
|
||||
|
||||
```yaml
|
||||
cache_enabled: true
|
||||
cache_backend: sqlite
|
||||
cache_similarity: 0.90 # hit if ≥90% cosine similarity
|
||||
cache_ttl: 3600
|
||||
cache_history_weight: 0.3
|
||||
```
|
||||
|
||||
Pull the semantic image:
|
||||
```bash
|
||||
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||
```
|
||||
|
||||
### Cache configuration options
|
||||
|
||||
#### `cache_enabled`
|
||||
|
||||
**Type**: `bool` | **Default**: `false`
|
||||
|
||||
Enable or disable the cache. All other cache settings are ignored when `false`.
|
||||
|
||||
#### `cache_backend`
|
||||
|
||||
**Type**: `str` | **Default**: `"memory"`
|
||||
|
||||
| Value | Description | Persists | Multi-replica |
|
||||
|---|---|---|---|
|
||||
| `memory` | In-process LRU dict | ❌ | ❌ |
|
||||
| `sqlite` | File-based via `aiosqlite` | ✅ | ❌ |
|
||||
| `redis` | Redis via `redis.asyncio` | ✅ | ✅ |
|
||||
|
||||
Use `redis` when running multiple router replicas behind a load balancer — all replicas share one warm cache.
|
||||
|
||||
#### `cache_similarity`
|
||||
|
||||
**Type**: `float` | **Default**: `1.0`
|
||||
|
||||
Cosine similarity threshold. `1.0` means exact match only (no embedding model needed). Values below `1.0` enable semantic matching, which requires the `:semantic` Docker image tag.
|
||||
|
||||
Recommended starting value for semantic mode: `0.90`.
|
||||
|
||||
#### `cache_ttl`
|
||||
|
||||
**Type**: `int | null` | **Default**: `3600`
|
||||
|
||||
Time-to-live for cache entries in seconds. Remove the key or set to `null` to cache forever.
|
||||
|
||||
#### `cache_db_path`
|
||||
|
||||
**Type**: `str` | **Default**: `"llm_cache.db"`
|
||||
|
||||
Path to the SQLite cache database. Only used when `cache_backend: sqlite`.
|
||||
|
||||
#### `cache_redis_url`
|
||||
|
||||
**Type**: `str` | **Default**: `"redis://localhost:6379/0"`
|
||||
|
||||
Redis connection URL. Only used when `cache_backend: redis`.
|
||||
|
||||
#### `cache_history_weight`
|
||||
|
||||
**Type**: `float` | **Default**: `0.3`
|
||||
|
||||
Weight of the BM25-weighted chat-history embedding in the combined cache key vector. `0.3` means the history contributes 30% and the final user message contributes 70% of the similarity signal. Only used when `cache_similarity < 1.0`.
|
||||
|
||||
### Cache management endpoints
|
||||
|
||||
| Endpoint | Method | Description |
|
||||
|---|---|---|
|
||||
| `/api/cache/stats` | `GET` | Hit/miss counters, hit rate, current config |
|
||||
| `/api/cache/invalidate` | `POST` | Clear all cache entries and reset counters |
|
||||
|
||||
```bash
|
||||
# Check cache performance
|
||||
curl http://localhost:12434/api/cache/stats
|
||||
|
||||
# Clear the cache
|
||||
curl -X POST http://localhost:12434/api/cache/invalidate
|
||||
```
|
||||
|
||||
Example stats response:
|
||||
```json
|
||||
{
|
||||
"enabled": true,
|
||||
"hits": 1547,
|
||||
"misses": 892,
|
||||
"hit_rate": 0.634,
|
||||
"semantic": true,
|
||||
"backend": "sqlite",
|
||||
"similarity_threshold": 0.9,
|
||||
"history_weight": 0.3
|
||||
}
|
||||
```
|
||||
|
||||
### Docker image variants
|
||||
|
||||
| Tag | Semantic cache | Image size |
|
||||
|---|---|---|
|
||||
| `latest` | ❌ exact match only | ~300 MB |
|
||||
| `latest-semantic` | ✅ sentence-transformers + model pre-baked | ~800 MB |
|
||||
|
||||
Build locally:
|
||||
```bash
|
||||
# Lean (exact match)
|
||||
docker build -t nomyo-router .
|
||||
|
||||
# Semantic (~500 MB larger, all-MiniLM-L6-v2 model baked in)
|
||||
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
|
||||
```
|
||||
|
||||
## Configuration Validation
|
||||
|
||||
The router validates the configuration at startup:
|
||||
|
|
|
|||
|
|
@ -82,10 +82,23 @@ sudo systemctl status nomyo-router
|
|||
|
||||
## 2. Docker Deployment
|
||||
|
||||
### Image variants
|
||||
|
||||
| Tag | Semantic cache | Image size |
|
||||
|---|---|---|
|
||||
| `latest` | ❌ exact match only | ~300 MB |
|
||||
| `latest-semantic` | ✅ sentence-transformers + `all-MiniLM-L6-v2` pre-baked | ~800 MB |
|
||||
|
||||
The `:semantic` variant enables `cache_similarity < 1.0` in `config.yaml`. The lean image falls back to exact-match caching with a warning if semantic mode is configured.
|
||||
|
||||
### Build the Image
|
||||
|
||||
```bash
|
||||
# Lean build (exact match cache, default)
|
||||
docker build -t nomyo-router .
|
||||
|
||||
# Semantic build (~500 MB larger, all-MiniLM-L6-v2 model baked in at build time)
|
||||
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
|
||||
```
|
||||
|
||||
### Run the Container
|
||||
|
|
|
|||
|
|
@ -1,20 +1,30 @@
|
|||
# Docker Compose example for NOMYO Router with multiple Ollama instances
|
||||
#
|
||||
# Two router profiles are provided:
|
||||
# nomyo-router — lean image, exact-match cache only (~300 MB)
|
||||
# nomyo-router-semantic — semantic image, sentence-transformers baked in (~800 MB)
|
||||
#
|
||||
# Uncomment the redis service and set cache_backend: redis in config.yaml
|
||||
# to share the LLM response cache across multiple router replicas.
|
||||
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
# NOMYO Router
|
||||
# NOMYO Router — lean image (exact-match cache, default)
|
||||
nomyo-router:
|
||||
image: nomyo-router:latest
|
||||
build: .
|
||||
build:
|
||||
context: .
|
||||
args:
|
||||
SEMANTIC_CACHE: "false"
|
||||
ports:
|
||||
- "12434:12434"
|
||||
environment:
|
||||
- CONFIG_PATH=/app/config/config.yaml
|
||||
- NOMYO_ROUTER_DB_PATH=/app/token_counts.db
|
||||
- NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
|
||||
volumes:
|
||||
- ./config:/app/config
|
||||
- router-db:/app/token_counts.db
|
||||
- router-data:/app/data
|
||||
depends_on:
|
||||
- ollama1
|
||||
- ollama2
|
||||
|
|
@ -23,6 +33,45 @@ services:
|
|||
networks:
|
||||
- nomyo-net
|
||||
|
||||
# NOMYO Router — semantic image (cache_similarity < 1.0 support, ~800 MB)
|
||||
# Build: docker compose build nomyo-router-semantic
|
||||
# Switch: comment out nomyo-router above, uncomment this block.
|
||||
# nomyo-router-semantic:
|
||||
# image: nomyo-router:semantic
|
||||
# build:
|
||||
# context: .
|
||||
# args:
|
||||
# SEMANTIC_CACHE: "true"
|
||||
# ports:
|
||||
# - "12434:12434"
|
||||
# environment:
|
||||
# - CONFIG_PATH=/app/config/config.yaml
|
||||
# - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
|
||||
# volumes:
|
||||
# - ./config:/app/config
|
||||
# - router-data:/app/data
|
||||
# - hf-cache:/app/data/hf_cache # share HuggingFace model cache across builds
|
||||
# depends_on:
|
||||
# - ollama1
|
||||
# - ollama2
|
||||
# - ollama3
|
||||
# restart: unless-stopped
|
||||
# networks:
|
||||
# - nomyo-net
|
||||
|
||||
# Optional: Redis for shared LLM response cache across multiple router replicas.
|
||||
# Requires cache_backend: redis in config.yaml.
|
||||
# redis:
|
||||
# image: redis:7-alpine
|
||||
# ports:
|
||||
# - "6379:6379"
|
||||
# volumes:
|
||||
# - redis-data:/data
|
||||
# command: redis-server --save 60 1 --loglevel warning
|
||||
# restart: unless-stopped
|
||||
# networks:
|
||||
# - nomyo-net
|
||||
|
||||
# Ollama Instance 1
|
||||
ollama1:
|
||||
image: ollama/ollama:latest
|
||||
|
|
@ -87,7 +136,9 @@ services:
|
|||
- nomyo-net
|
||||
|
||||
volumes:
|
||||
router-db:
|
||||
router-data:
|
||||
# hf-cache: # uncomment when using nomyo-router-semantic
|
||||
# redis-data: # uncomment when using Redis cache backend
|
||||
ollama1-data:
|
||||
ollama2-data:
|
||||
ollama3-data:
|
||||
|
|
|
|||
|
|
@ -30,3 +30,37 @@ api_keys:
|
|||
"https://api.openai.com/v1": "${OPENAI_KEY}"
|
||||
"http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config
|
||||
"http://192.168.0.33:8081/v1": "llama-server"
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# Semantic LLM Cache (optional — disabled by default)
|
||||
# Caches LLM responses to cut costs and latency on repeated or
|
||||
# semantically similar prompts.
|
||||
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
|
||||
# MOE requests (moe-* model prefix) always bypass the cache.
|
||||
# -------------------------------------------------------------
|
||||
# cache_enabled: false
|
||||
|
||||
# Backend — where cached responses are stored:
|
||||
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
|
||||
# sqlite → persistent file-based (single instance, survives restart)
|
||||
# redis → distributed (shared across replicas, requires Redis)
|
||||
# cache_backend: memory
|
||||
|
||||
# Cosine similarity threshold for a cache hit:
|
||||
# 1.0 → exact match only (works on any image variant)
|
||||
# <1.0 → semantic matching (requires the :semantic Docker image tag)
|
||||
# cache_similarity: 1.0
|
||||
|
||||
# Response TTL in seconds. Remove the key or set to null to cache forever.
|
||||
# cache_ttl: 3600
|
||||
|
||||
# SQLite backend: path to the cache database file
|
||||
# cache_db_path: llm_cache.db
|
||||
|
||||
# Redis backend: connection URL
|
||||
# cache_redis_url: redis://localhost:6379/0
|
||||
|
||||
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
|
||||
# 0.3 = 30% history context signal, 70% question signal.
|
||||
# Only relevant when cache_similarity < 1.0.
|
||||
# cache_history_weight: 0.3
|
||||
|
|
@ -133,6 +133,39 @@ Response:
|
|||
}
|
||||
```
|
||||
|
||||
### Cache Statistics
|
||||
|
||||
```bash
|
||||
curl http://localhost:12434/api/cache/stats
|
||||
```
|
||||
|
||||
Response when cache is enabled:
|
||||
```json
|
||||
{
|
||||
"enabled": true,
|
||||
"hits": 1547,
|
||||
"misses": 892,
|
||||
"hit_rate": 0.634,
|
||||
"semantic": true,
|
||||
"backend": "sqlite",
|
||||
"similarity_threshold": 0.9,
|
||||
"history_weight": 0.3
|
||||
}
|
||||
```
|
||||
|
||||
Response when cache is disabled:
|
||||
```json
|
||||
{ "enabled": false }
|
||||
```
|
||||
|
||||
### Cache Invalidation
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:12434/api/cache/invalidate
|
||||
```
|
||||
|
||||
Clears all cached entries and resets hit/miss counters.
|
||||
|
||||
### Real-time Usage Stream
|
||||
|
||||
```bash
|
||||
|
|
|
|||
|
|
@ -39,3 +39,8 @@ uvicorn==0.38.0
|
|||
uvloop
|
||||
yarl==1.20.1
|
||||
aiosqlite
|
||||
# Semantic LLM cache — base install (exact-match mode, no heavy ML deps)
|
||||
# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
|
||||
# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.:
|
||||
# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0
|
||||
semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git
|
||||
|
|
|
|||
214
router.py
214
router.py
|
|
@ -123,6 +123,22 @@ class Config(BaseSettings):
|
|||
# Database configuration
|
||||
db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))
|
||||
|
||||
# Semantic LLM Cache configuration
|
||||
cache_enabled: bool = Field(default=False)
|
||||
# Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed)
|
||||
cache_backend: str = Field(default="memory")
|
||||
# Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image)
|
||||
cache_similarity: float = Field(default=1.0)
|
||||
# TTL in seconds; None = cache forever
|
||||
cache_ttl: Optional[int] = Field(default=3600)
|
||||
# SQLite backend: path to cache database file
|
||||
cache_db_path: str = Field(default="llm_cache.db")
|
||||
# Redis backend: connection URL
|
||||
cache_redis_url: str = Field(default="redis://localhost:6379/0")
|
||||
# Weight of BM25-weighted chat-history embedding vs last-user-message embedding
|
||||
# 0.3 = 30% history context signal, 70% question signal
|
||||
cache_history_weight: float = Field(default=0.3)
|
||||
|
||||
class Config:
|
||||
# Load from `config.yaml` first, then from env variables
|
||||
env_prefix = "NOMYO_ROUTER_"
|
||||
|
|
@ -188,6 +204,7 @@ def _config_path_from_env() -> Path:
|
|||
|
||||
from ollama._types import TokenLogprob, Logprob
|
||||
from db import TokenDatabase
|
||||
from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse
|
||||
|
||||
|
||||
# Create the global config object – it will be overwritten on startup
|
||||
|
|
@ -1596,6 +1613,14 @@ async def proxy(request: Request):
|
|||
error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
|
||||
raise HTTPException(status_code=400, detail=error_msg) from e
|
||||
|
||||
# Cache lookup — before endpoint selection so no slot is wasted on a hit
|
||||
_cache = get_llm_cache()
|
||||
if _cache is not None:
|
||||
_cached = await _cache.get_generate(model, prompt, system or "")
|
||||
if _cached is not None:
|
||||
async def _serve_cached_generate():
|
||||
yield _cached
|
||||
return StreamingResponse(_serve_cached_generate(), media_type="application/json")
|
||||
|
||||
endpoint, tracking_model = await choose_endpoint(model)
|
||||
use_openai = is_openai_compatible(endpoint)
|
||||
|
|
@ -1633,6 +1658,7 @@ async def proxy(request: Request):
|
|||
else:
|
||||
async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
|
||||
if stream == True:
|
||||
content_parts: list[str] = []
|
||||
async for chunk in async_gen:
|
||||
if use_openai:
|
||||
chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
|
||||
|
|
@ -1644,6 +1670,27 @@ async def proxy(request: Request):
|
|||
json_line = chunk.model_dump_json()
|
||||
else:
|
||||
json_line = orjson.dumps(chunk)
|
||||
# Accumulate and store cache on done chunk — before yield so it always runs
|
||||
if _cache is not None:
|
||||
if getattr(chunk, "response", None):
|
||||
content_parts.append(chunk.response)
|
||||
if getattr(chunk, "done", False):
|
||||
assembled = orjson.dumps({
|
||||
k: v for k, v in {
|
||||
"model": getattr(chunk, "model", model),
|
||||
"response": "".join(content_parts),
|
||||
"done": True,
|
||||
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
|
||||
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
|
||||
"eval_count": getattr(chunk, "eval_count", None),
|
||||
"total_duration": getattr(chunk, "total_duration", None),
|
||||
"eval_duration": getattr(chunk, "eval_duration", None),
|
||||
}.items() if v is not None
|
||||
}) + b"\n"
|
||||
try:
|
||||
await _cache.set_generate(model, prompt, system or "", assembled)
|
||||
except Exception as _ce:
|
||||
print(f"[cache] set_generate (streaming) failed: {_ce}")
|
||||
yield json_line.encode("utf-8") + b"\n"
|
||||
else:
|
||||
if use_openai:
|
||||
|
|
@ -1660,7 +1707,14 @@ async def proxy(request: Request):
|
|||
if hasattr(async_gen, "model_dump_json")
|
||||
else orjson.dumps(async_gen)
|
||||
)
|
||||
yield json_line.encode("utf-8") + b"\n"
|
||||
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||
yield cache_bytes
|
||||
# Cache non-streaming response
|
||||
if _cache is not None:
|
||||
try:
|
||||
await _cache.set_generate(model, prompt, system or "", cache_bytes)
|
||||
except Exception as _ce:
|
||||
print(f"[cache] set_generate (non-streaming) failed: {_ce}")
|
||||
|
||||
finally:
|
||||
# Ensure counter is decremented even if an exception occurs
|
||||
|
|
@ -1711,6 +1765,26 @@ async def chat_proxy(request: Request):
|
|||
except orjson.JSONDecodeError as e:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
||||
|
||||
# Cache lookup — before endpoint selection, always bypassed for MOE
|
||||
_is_moe = model.startswith("moe-")
|
||||
_cache = get_llm_cache()
|
||||
# Normalise model name for cache key: strip ":latest" suffix here so that
|
||||
# get_chat and set_chat use the same model string regardless of when the
|
||||
# strip happens further down (line ~1793 strips it for OpenAI endpoints).
|
||||
_cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
|
||||
# Snapshot original messages before any OpenAI-format transformation so that
|
||||
# get_chat and set_chat always use the same key regardless of backend type.
|
||||
_cache_messages = messages
|
||||
if _cache is not None and not _is_moe:
|
||||
_cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
|
||||
if _cached is not None:
|
||||
async def _serve_cached_chat():
|
||||
yield _cached
|
||||
return StreamingResponse(
|
||||
_serve_cached_chat(),
|
||||
media_type="application/x-ndjson" if stream else "application/json",
|
||||
)
|
||||
|
||||
# 2. Endpoint logic
|
||||
if model.startswith("moe-"):
|
||||
model = model.split("moe-")[1]
|
||||
|
|
@ -1764,6 +1838,7 @@ async def chat_proxy(request: Request):
|
|||
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
|
||||
if stream == True:
|
||||
tc_acc = {} # accumulate OpenAI tool-call deltas across chunks
|
||||
content_parts: list[str] = []
|
||||
async for chunk in async_gen:
|
||||
if use_openai:
|
||||
_accumulate_openai_tc_delta(chunk, tc_acc)
|
||||
|
|
@ -1780,6 +1855,30 @@ async def chat_proxy(request: Request):
|
|||
json_line = chunk.model_dump_json()
|
||||
else:
|
||||
json_line = orjson.dumps(chunk)
|
||||
# Accumulate and store cache on done chunk — before yield so it always runs
|
||||
# Works for both Ollama-native and OpenAI-compatible backends; chunks are
|
||||
# already converted to Ollama format by rechunk before this point.
|
||||
if _cache is not None and not _is_moe:
|
||||
if chunk.message and getattr(chunk.message, "content", None):
|
||||
content_parts.append(chunk.message.content)
|
||||
if getattr(chunk, "done", False):
|
||||
assembled = orjson.dumps({
|
||||
k: v for k, v in {
|
||||
"model": getattr(chunk, "model", model),
|
||||
"created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)),
|
||||
"message": {"role": "assistant", "content": "".join(content_parts)},
|
||||
"done": True,
|
||||
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
|
||||
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
|
||||
"eval_count": getattr(chunk, "eval_count", None),
|
||||
"total_duration": getattr(chunk, "total_duration", None),
|
||||
"eval_duration": getattr(chunk, "eval_duration", None),
|
||||
}.items() if v is not None
|
||||
}) + b"\n"
|
||||
try:
|
||||
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled)
|
||||
except Exception as _ce:
|
||||
print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}")
|
||||
yield json_line.encode("utf-8") + b"\n"
|
||||
else:
|
||||
if use_openai:
|
||||
|
|
@ -1796,7 +1895,14 @@ async def chat_proxy(request: Request):
|
|||
if hasattr(async_gen, "model_dump_json")
|
||||
else orjson.dumps(async_gen)
|
||||
)
|
||||
yield json_line.encode("utf-8") + b"\n"
|
||||
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||
yield cache_bytes
|
||||
# Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
|
||||
if _cache is not None and not _is_moe:
|
||||
try:
|
||||
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
|
||||
except Exception as _ce:
|
||||
print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}")
|
||||
|
||||
finally:
|
||||
# Ensure counter is decremented even if an exception occurs
|
||||
|
|
@ -2680,6 +2786,21 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
except orjson.JSONDecodeError as e:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
||||
|
||||
# Cache lookup — before endpoint selection
|
||||
_cache = get_llm_cache()
|
||||
if _cache is not None:
|
||||
_cached = await _cache.get_chat("openai_chat", model, messages)
|
||||
if _cached is not None:
|
||||
if stream:
|
||||
_sse = openai_nonstream_to_sse(_cached, model)
|
||||
async def _serve_cached_ochat_stream():
|
||||
yield _sse
|
||||
return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream")
|
||||
else:
|
||||
async def _serve_cached_ochat_json():
|
||||
yield _cached
|
||||
return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")
|
||||
|
||||
# 2. Endpoint logic
|
||||
endpoint, tracking_model = await choose_endpoint(model)
|
||||
base_url = ep2base(endpoint)
|
||||
|
|
@ -2699,6 +2820,8 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
else:
|
||||
raise
|
||||
if stream == True:
|
||||
content_parts: list[str] = []
|
||||
usage_snapshot: dict = {}
|
||||
async for chunk in async_gen:
|
||||
data = (
|
||||
chunk.model_dump_json()
|
||||
|
|
@ -2715,6 +2838,8 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
has_tool_calls = getattr(delta, "tool_calls", None) is not None
|
||||
if has_content or has_reasoning or has_tool_calls:
|
||||
yield f"data: {data}\n\n".encode("utf-8")
|
||||
if has_content and delta.content:
|
||||
content_parts.append(delta.content)
|
||||
elif chunk.usage is not None:
|
||||
# Forward the usage-only final chunk (e.g. from llama-server)
|
||||
yield f"data: {data}\n\n".encode("utf-8")
|
||||
|
|
@ -2723,12 +2848,24 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
if chunk.usage is not None:
|
||||
prompt_tok = chunk.usage.prompt_tokens or 0
|
||||
comp_tok = chunk.usage.completion_tokens or 0
|
||||
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
|
||||
else:
|
||||
llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
|
||||
if llama_usage:
|
||||
prompt_tok, comp_tok = llama_usage
|
||||
if prompt_tok != 0 or comp_tok != 0:
|
||||
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
|
||||
# Cache assembled streaming response — before [DONE] so it always runs
|
||||
if _cache is not None and content_parts:
|
||||
assembled = orjson.dumps({
|
||||
"model": model,
|
||||
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
|
||||
**({"usage": usage_snapshot} if usage_snapshot else {}),
|
||||
}) + b"\n"
|
||||
try:
|
||||
await _cache.set_chat("openai_chat", model, messages, assembled)
|
||||
except Exception as _ce:
|
||||
print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}")
|
||||
yield b"data: [DONE]\n\n"
|
||||
else:
|
||||
prompt_tok = 0
|
||||
|
|
@ -2747,7 +2884,14 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
if hasattr(async_gen, "model_dump_json")
|
||||
else orjson.dumps(async_gen)
|
||||
)
|
||||
yield json_line.encode("utf-8") + b"\n"
|
||||
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||
yield cache_bytes
|
||||
# Cache non-streaming response
|
||||
if _cache is not None:
|
||||
try:
|
||||
await _cache.set_chat("openai_chat", model, messages, cache_bytes)
|
||||
except Exception as _ce:
|
||||
print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}")
|
||||
|
||||
finally:
|
||||
# Ensure counter is decremented even if an exception occurs
|
||||
|
|
@ -2823,6 +2967,22 @@ async def openai_completions_proxy(request: Request):
|
|||
except orjson.JSONDecodeError as e:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
||||
|
||||
# Cache lookup — completions prompt mapped to a single-turn messages list
|
||||
_cache = get_llm_cache()
|
||||
_compl_messages = [{"role": "user", "content": prompt}]
|
||||
if _cache is not None:
|
||||
_cached = await _cache.get_chat("openai_completions", model, _compl_messages)
|
||||
if _cached is not None:
|
||||
if stream:
|
||||
_sse = openai_nonstream_to_sse(_cached, model)
|
||||
async def _serve_cached_ocompl_stream():
|
||||
yield _sse
|
||||
return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream")
|
||||
else:
|
||||
async def _serve_cached_ocompl_json():
|
||||
yield _cached
|
||||
return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json")
|
||||
|
||||
# 2. Endpoint logic
|
||||
endpoint, tracking_model = await choose_endpoint(model)
|
||||
base_url = ep2base(endpoint)
|
||||
|
|
@ -2834,6 +2994,8 @@ async def openai_completions_proxy(request: Request):
|
|||
# The chat method returns a generator of dicts (or GenerateResponse)
|
||||
async_gen = await oclient.completions.create(**params)
|
||||
if stream == True:
|
||||
text_parts: list[str] = []
|
||||
usage_snapshot: dict = {}
|
||||
async for chunk in async_gen:
|
||||
data = (
|
||||
chunk.model_dump_json()
|
||||
|
|
@ -2849,6 +3011,8 @@ async def openai_completions_proxy(request: Request):
|
|||
)
|
||||
if has_text or has_reasoning or choice.finish_reason is not None:
|
||||
yield f"data: {data}\n\n".encode("utf-8")
|
||||
if has_text and choice.text:
|
||||
text_parts.append(choice.text)
|
||||
elif chunk.usage is not None:
|
||||
# Forward the usage-only final chunk (e.g. from llama-server)
|
||||
yield f"data: {data}\n\n".encode("utf-8")
|
||||
|
|
@ -2857,12 +3021,24 @@ async def openai_completions_proxy(request: Request):
|
|||
if chunk.usage is not None:
|
||||
prompt_tok = chunk.usage.prompt_tokens or 0
|
||||
comp_tok = chunk.usage.completion_tokens or 0
|
||||
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
|
||||
else:
|
||||
llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
|
||||
if llama_usage:
|
||||
prompt_tok, comp_tok = llama_usage
|
||||
if prompt_tok != 0 or comp_tok != 0:
|
||||
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
|
||||
# Cache assembled streaming response — before [DONE] so it always runs
|
||||
if _cache is not None and text_parts:
|
||||
assembled = orjson.dumps({
|
||||
"model": model,
|
||||
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
|
||||
**({"usage": usage_snapshot} if usage_snapshot else {}),
|
||||
}) + b"\n"
|
||||
try:
|
||||
await _cache.set_chat("openai_completions", model, _compl_messages, assembled)
|
||||
except Exception as _ce:
|
||||
print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}")
|
||||
# Final DONE event
|
||||
yield b"data: [DONE]\n\n"
|
||||
else:
|
||||
|
|
@ -2882,7 +3058,14 @@ async def openai_completions_proxy(request: Request):
|
|||
if hasattr(async_gen, "model_dump_json")
|
||||
else orjson.dumps(async_gen)
|
||||
)
|
||||
yield json_line.encode("utf-8") + b"\n"
|
||||
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||
yield cache_bytes
|
||||
# Cache non-streaming response
|
||||
if _cache is not None:
|
||||
try:
|
||||
await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
|
||||
except Exception as _ce:
|
||||
print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}")
|
||||
|
||||
finally:
|
||||
# Ensure counter is decremented even if an exception occurs
|
||||
|
|
@ -3076,6 +3259,28 @@ async def rerank_proxy(request: Request):
|
|||
finally:
|
||||
await decrement_usage(endpoint, tracking_model)
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# 25b. Cache management endpoints
|
||||
# -------------------------------------------------------------
|
||||
@app.get("/api/cache/stats")
|
||||
async def cache_stats():
|
||||
"""Return hit/miss counters and configuration for the LLM response cache."""
|
||||
c = get_llm_cache()
|
||||
if c is None:
|
||||
return {"enabled": False}
|
||||
return {"enabled": True, **c.stats()}
|
||||
|
||||
|
||||
@app.post("/api/cache/invalidate")
|
||||
async def cache_invalidate():
|
||||
"""Clear all entries from the LLM response cache and reset counters."""
|
||||
c = get_llm_cache()
|
||||
if c is None:
|
||||
return {"enabled": False, "cleared": False}
|
||||
await c.clear()
|
||||
return {"enabled": True, "cleared": True}
|
||||
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# 26. Serve the static front‑end
|
||||
# -------------------------------------------------------------
|
||||
|
|
@ -3211,6 +3416,7 @@ async def startup_event() -> None:
|
|||
app_state["session"] = session
|
||||
token_worker_task = asyncio.create_task(token_worker())
|
||||
flush_task = asyncio.create_task(flush_buffer())
|
||||
await init_llm_cache(config)
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue