feat: adding a semantic cache layer
This commit is contained in:
parent
c3d47c7ffe
commit
dd4b12da6a
13 changed files with 1138 additions and 22 deletions
44
.dockerignore
Normal file
44
.dockerignore
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
# Version control
|
||||||
|
.git
|
||||||
|
.gitignore
|
||||||
|
.github
|
||||||
|
|
||||||
|
# Environment & secrets
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
*.env
|
||||||
|
|
||||||
|
# Python artifacts
|
||||||
|
__pycache__
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
|
.Python
|
||||||
|
.venv
|
||||||
|
venv
|
||||||
|
*.egg-info
|
||||||
|
dist
|
||||||
|
build
|
||||||
|
|
||||||
|
# Local databases (don't bake data into image)
|
||||||
|
*.db
|
||||||
|
*.db-shm
|
||||||
|
*.db-wal
|
||||||
|
|
||||||
|
# IDE / editor
|
||||||
|
.vscode
|
||||||
|
.idea
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Documentation
|
||||||
|
doc/
|
||||||
|
docs/
|
||||||
|
*.md
|
||||||
|
|
||||||
|
# Tests
|
||||||
|
tests/
|
||||||
|
test_*
|
||||||
|
|
||||||
|
# Local config overrides
|
||||||
|
config.local.yaml
|
||||||
71
.github/workflows/docker-publish-semantic.yml
vendored
Normal file
71
.github/workflows/docker-publish-semantic.yml
vendored
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
name: Build and Publish Docker Image (Semantic Cache)
|
||||||
|
|
||||||
|
# Builds the :semantic variant that includes sentence-transformers + CPU torch
|
||||||
|
# and the pre-baked all-MiniLM-L6-v2 embedding model (~500 MB larger than lean).
|
||||||
|
# Tags mirror the lean workflow but carry a -semantic suffix, e.g.:
|
||||||
|
# ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||||
|
# ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
|
||||||
|
# ghcr.io/nomyo-ai/nomyo-router:0.7-semantic
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
tags:
|
||||||
|
- "v*.*.*"
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
env:
|
||||||
|
REGISTRY: ghcr.io
|
||||||
|
IMAGE_NAME: ${{ github.repository }}
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-and-push-semantic:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up QEMU (for multi-arch builds)
|
||||||
|
uses: docker/setup-qemu-action@v3
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Log in to GitHub Container Registry
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
registry: ${{ env.REGISTRY }}
|
||||||
|
username: ${{ github.actor }}
|
||||||
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
|
- name: Extract Docker metadata
|
||||||
|
id: meta
|
||||||
|
uses: docker/metadata-action@v5
|
||||||
|
with:
|
||||||
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||||
|
tags: |
|
||||||
|
# Versioned semantic tags on git tags (e.g. v0.7.0 → 0.7.0-semantic, 0.7-semantic)
|
||||||
|
type=semver,pattern={{version}}-semantic
|
||||||
|
type=semver,pattern={{major}}.{{minor}}-semantic
|
||||||
|
# latest-semantic only on main branch pushes
|
||||||
|
type=raw,value=latest-semantic,enable=${{ github.ref == 'refs/heads/main' }}
|
||||||
|
# SHA-tagged for traceability
|
||||||
|
type=sha,prefix=sha-,suffix=-semantic
|
||||||
|
|
||||||
|
- name: Build and push semantic Docker image
|
||||||
|
uses: docker/build-push-action@v6
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
platforms: linux/amd64,linux/arm64
|
||||||
|
push: true
|
||||||
|
build-args: |
|
||||||
|
SEMANTIC_CACHE=true
|
||||||
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
26
Dockerfile
26
Dockerfile
|
|
@ -3,21 +3,43 @@ FROM python:3.13-slim
|
||||||
ENV PYTHONUNBUFFERED=1 \
|
ENV PYTHONUNBUFFERED=1 \
|
||||||
PYTHONDONTWRITEBYTECODE=1
|
PYTHONDONTWRITEBYTECODE=1
|
||||||
|
|
||||||
|
# SEMANTIC_CACHE=true installs sentence-transformers + CPU-only torch and pre-bakes
|
||||||
|
# the all-MiniLM-L6-v2 embedding model (~500 MB extra). The resulting image is tagged
|
||||||
|
# :semantic. The default (lean) image supports exact-match caching only.
|
||||||
|
ARG SEMANTIC_CACHE=false
|
||||||
|
|
||||||
|
# Pin HuggingFace cache to a predictable path inside /app/data so it can be
|
||||||
|
# mounted as a volume and shared between builds.
|
||||||
|
ENV HF_HOME=/app/data/hf_cache
|
||||||
|
|
||||||
# Install SQLite
|
# Install SQLite
|
||||||
RUN apt-get update && apt-get install -y sqlite3
|
RUN apt-get update && apt-get install -y --no-install-recommends sqlite3 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir --upgrade pip \
|
RUN pip install --no-cache-dir --upgrade pip \
|
||||||
&& pip install --no-cache-dir -r requirements.txt
|
&& pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Semantic cache deps — only installed when SEMANTIC_CACHE=true
|
||||||
|
# CPU-only torch must be installed before sentence-transformers to avoid
|
||||||
|
# pulling the full CUDA-enabled build (~2.5 GB).
|
||||||
|
RUN if [ "$SEMANTIC_CACHE" = "true" ]; then \
|
||||||
|
pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
|
||||||
|
pip install --no-cache-dir sentence-transformers && \
|
||||||
|
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"; \
|
||||||
|
fi
|
||||||
|
|
||||||
# Create database directory and set permissions
|
# Create database directory and set permissions
|
||||||
RUN mkdir -p /app/data && chown -R www-data:www-data /app/data
|
RUN mkdir -p /app/data && chown -R www-data:www-data /app/data
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN chmod +x /app/entrypoint.sh
|
RUN chmod +x /app/entrypoint.sh && \
|
||||||
|
chown -R www-data:www-data /app
|
||||||
|
|
||||||
EXPOSE 12434
|
EXPOSE 12434
|
||||||
|
|
||||||
|
USER www-data
|
||||||
|
|
||||||
ENTRYPOINT ["/app/entrypoint.sh"]
|
ENTRYPOINT ["/app/entrypoint.sh"]
|
||||||
|
|
|
||||||
67
README.md
67
README.md
|
|
@ -74,22 +74,28 @@ uvicorn router:app --host 127.0.0.1 --port 12434 --loop uvloop
|
||||||
|
|
||||||
### Pre-built image (GitHub Container Registry)
|
### Pre-built image (GitHub Container Registry)
|
||||||
|
|
||||||
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release:
|
Pre-built multi-arch images (`linux/amd64`, `linux/arm64`) are published automatically on every release.
|
||||||
|
|
||||||
|
**Lean image** (exact-match cache, ~300 MB):
|
||||||
```sh
|
```sh
|
||||||
docker pull ghcr.io/nomyo-ai/nomyo-router:latest
|
docker pull ghcr.io/nomyo-ai/nomyo-router:latest
|
||||||
```
|
|
||||||
|
|
||||||
Specific version:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0
|
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0
|
||||||
```
|
```
|
||||||
|
|
||||||
### Build the container image locally:
|
**Semantic image** (semantic cache with `all-MiniLM-L6-v2` pre-baked, ~800 MB):
|
||||||
|
```sh
|
||||||
|
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||||
|
docker pull ghcr.io/nomyo-ai/nomyo-router:0.7.0-semantic
|
||||||
|
```
|
||||||
|
|
||||||
|
### Build the container image locally
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
# Lean build (exact match cache, default)
|
||||||
docker build -t nomyo-router .
|
docker build -t nomyo-router .
|
||||||
|
|
||||||
|
# Semantic build — sentence-transformers + model baked in
|
||||||
|
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
|
||||||
```
|
```
|
||||||
|
|
||||||
Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container:
|
Run the router in Docker with your own configuration file mounted from the host. The entrypoint script accepts a `--config-path` argument so you can point to a file anywhere inside the container:
|
||||||
|
|
@ -124,6 +130,53 @@ This way the Ollama backend servers are utilized more efficient than by simply u
|
||||||
|
|
||||||
NOMYO Router also supports OpenAI API compatible v1 backend servers.
|
NOMYO Router also supports OpenAI API compatible v1 backend servers.
|
||||||
|
|
||||||
|
## Semantic LLM Cache
|
||||||
|
|
||||||
|
NOMYO Router includes an optional semantic cache that serves repeated or semantically similar LLM requests from cache — no endpoint round-trip, no token cost, response in <10 ms.
|
||||||
|
|
||||||
|
### Enable (exact match, any image)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# config.yaml
|
||||||
|
cache_enabled: true
|
||||||
|
cache_backend: sqlite # persists across restarts
|
||||||
|
cache_similarity: 1.0 # exact match only
|
||||||
|
cache_ttl: 3600
|
||||||
|
```
|
||||||
|
|
||||||
|
### Enable (semantic matching, :semantic image)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
cache_enabled: true
|
||||||
|
cache_backend: sqlite
|
||||||
|
cache_similarity: 0.90 # "What is Python?" ≈ "What's Python?" → cache hit
|
||||||
|
cache_ttl: 3600
|
||||||
|
cache_history_weight: 0.3
|
||||||
|
```
|
||||||
|
|
||||||
|
Pull the semantic image:
|
||||||
|
```bash
|
||||||
|
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cache key strategy
|
||||||
|
|
||||||
|
Each request is keyed on `route + model + system_prompt` (exact) combined with a weighted-mean embedding of the BM25-weighted chat history (30% by default, set via `cache_history_weight`) and the last user message (70% by default). This means:
|
||||||
|
- Different system prompts → always separate cache namespaces (no cross-tenant leakage)
|
||||||
|
- Same question, different phrasing → cache hit (semantic mode)
|
||||||
|
- MOE requests (`moe-*`) → always bypass the cache
|
||||||
|
|
||||||
|
### Cached routes
|
||||||
|
|
||||||
|
`/api/chat` · `/api/generate` · `/v1/chat/completions` · `/v1/completions`
|
||||||
|
|
||||||
|
### Cache management
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:12434/api/cache/stats # hit rate, counters, config
|
||||||
|
curl -X POST http://localhost:12434/api/cache/invalidate # clear all entries
|
||||||
|
```
|
||||||
|
|
||||||
## Supplying the router API key
|
## Supplying the router API key
|
||||||
|
|
||||||
If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:
|
If you set `nomyo-router-api-key` in `config.yaml` (or `NOMYO_ROUTER_API_KEY` env), every request to NOMYO Router must include the key:
|
||||||
|
|
|
||||||
407
cache.py
Normal file
407
cache.py
Normal file
|
|
@ -0,0 +1,407 @@
|
||||||
|
"""
|
||||||
|
LLM Semantic Cache for NOMYO Router.
|
||||||
|
|
||||||
|
Strategy:
|
||||||
|
- Namespace: sha256(route :: model :: system_prompt)[:16] — exact context isolation
|
||||||
|
- Cache key: hash(normalize(last_user_message), namespace) — exact lookup
|
||||||
|
- Embedding: weighted mean of
|
||||||
|
α * embed(bm25_weighted(chat_history)) — conversation context
|
||||||
|
1-α * embed(last_user_message) — the actual question
|
||||||
|
with α = cache_history_weight (default 0.3).
|
||||||
|
- Exact-match caching (similarity=1.0) uses DummyEmbeddingProvider — zero extra deps.
|
||||||
|
- Semantic caching (similarity<1.0) requires sentence-transformers. If missing the
|
||||||
|
library falls back to exact-match with a warning (lean Docker image behaviour).
|
||||||
|
- MOE models (moe-*) always bypass the cache.
|
||||||
|
- Token counts are never recorded for cache hits.
|
||||||
|
- Streaming cache hits are served as a single-chunk response.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
import warnings
|
||||||
|
from collections import Counter
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
# Lazily resolved once at first embed() call
|
||||||
|
_semantic_available: Optional[bool] = None
|
||||||
|
|
||||||
|
|
||||||
|
def _check_sentence_transformers() -> bool:
|
||||||
|
global _semantic_available
|
||||||
|
if _semantic_available is None:
|
||||||
|
try:
|
||||||
|
import sentence_transformers # noqa: F401
|
||||||
|
_semantic_available = True
|
||||||
|
except ImportError:
|
||||||
|
_semantic_available = False
|
||||||
|
return _semantic_available # type: ignore[return-value]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# BM25-weighted text representation of chat history
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _bm25_weighted_text(history: list[dict]) -> str:
|
||||||
|
"""
|
||||||
|
Produce a BM25-importance-weighted text string from chat history turns.
|
||||||
|
|
||||||
|
High-IDF (rare, domain-specific) terms are repeated proportionally to
|
||||||
|
their BM25 score so the downstream sentence-transformer embedding
|
||||||
|
naturally upweights topical signal and downweights stop words.
|
||||||
|
"""
|
||||||
|
docs = [m.get("content", "") for m in history if m.get("content")]
|
||||||
|
if not docs:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _tok(text: str) -> list[str]:
|
||||||
|
return [w.lower() for w in text.split() if len(w) > 2]
|
||||||
|
|
||||||
|
tokenized = [_tok(d) for d in docs]
|
||||||
|
N = len(tokenized)
|
||||||
|
|
||||||
|
df: Counter = Counter()
|
||||||
|
for tokens in tokenized:
|
||||||
|
for term in set(tokens):
|
||||||
|
df[term] += 1
|
||||||
|
|
||||||
|
k1, b = 1.5, 0.75
|
||||||
|
avg_dl = sum(len(t) for t in tokenized) / max(N, 1)
|
||||||
|
|
||||||
|
term_scores: Counter = Counter()
|
||||||
|
for tokens in tokenized:
|
||||||
|
tf_c = Counter(tokens)
|
||||||
|
dl = len(tokens)
|
||||||
|
for term, tf in tf_c.items():
|
||||||
|
idf = math.log((N + 1) / (df[term] + 1)) + 1.0
|
||||||
|
score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1)))
|
||||||
|
term_scores[term] += score
|
||||||
|
|
||||||
|
top = term_scores.most_common(50)
|
||||||
|
if not top:
|
||||||
|
return " ".join(docs)
|
||||||
|
|
||||||
|
max_s = top[0][1]
|
||||||
|
out: list[str] = []
|
||||||
|
for term, score in top:
|
||||||
|
out.extend([term] * max(1, round(3 * score / max_s)))
|
||||||
|
return " ".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# LLMCache
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class LLMCache:
|
||||||
|
"""
|
||||||
|
Thin async wrapper around async-semantic-llm-cache that adds:
|
||||||
|
- Route-aware namespace isolation
|
||||||
|
- Two-vector weighted-mean embedding (history context + question)
|
||||||
|
- Per-instance hit/miss counters
|
||||||
|
- Graceful fallback when sentence-transformers is absent
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, cfg: Any) -> None:
|
||||||
|
self._cfg = cfg
|
||||||
|
self._backend: Any = None
|
||||||
|
self._emb_cache: Any = None
|
||||||
|
self._semantic: bool = False
|
||||||
|
self._hits: int = 0
|
||||||
|
self._misses: int = 0
|
||||||
|
|
||||||
|
async def init(self) -> None:
|
||||||
|
from semantic_llm_cache.similarity import EmbeddingCache
|
||||||
|
|
||||||
|
# --- Backend ---
|
||||||
|
backend_type: str = self._cfg.cache_backend
|
||||||
|
if backend_type == "sqlite":
|
||||||
|
from semantic_llm_cache.backends.sqlite import SQLiteBackend
|
||||||
|
self._backend = SQLiteBackend(db_path=self._cfg.cache_db_path)
|
||||||
|
elif backend_type == "redis":
|
||||||
|
from semantic_llm_cache.backends.redis import RedisBackend
|
||||||
|
self._backend = RedisBackend(url=self._cfg.cache_redis_url)
|
||||||
|
await self._backend.ping()
|
||||||
|
else:
|
||||||
|
from semantic_llm_cache.backends.memory import MemoryBackend
|
||||||
|
self._backend = MemoryBackend()
|
||||||
|
|
||||||
|
# --- Embedding provider ---
|
||||||
|
if self._cfg.cache_similarity < 1.0:
|
||||||
|
if _check_sentence_transformers():
|
||||||
|
from semantic_llm_cache.similarity import create_embedding_provider
|
||||||
|
provider = create_embedding_provider("sentence-transformer")
|
||||||
|
self._emb_cache = EmbeddingCache(provider=provider)
|
||||||
|
self._semantic = True
|
||||||
|
print(
|
||||||
|
f"[cache] Semantic cache ready "
|
||||||
|
f"(similarity≥{self._cfg.cache_similarity}, backend={backend_type})"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
warnings.warn(
|
||||||
|
"[cache] sentence-transformers is not installed. "
|
||||||
|
"Falling back to exact-match caching (similarity=1.0). "
|
||||||
|
"Use the :semantic Docker image tag to enable semantic caching.",
|
||||||
|
RuntimeWarning,
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
|
self._emb_cache = EmbeddingCache() # DummyEmbeddingProvider
|
||||||
|
print(f"[cache] Exact-match cache ready (backend={backend_type}) [semantic unavailable]")
|
||||||
|
else:
|
||||||
|
self._emb_cache = EmbeddingCache() # DummyEmbeddingProvider
|
||||||
|
print(f"[cache] Exact-match cache ready (backend={backend_type})")
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Internal helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _namespace(self, route: str, model: str, system: str) -> str:
|
||||||
|
raw = f"{route}::{model}::{system}"
|
||||||
|
return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
||||||
|
|
||||||
|
def _cache_key(self, namespace: str, last_user: str) -> str:
|
||||||
|
from semantic_llm_cache.utils import hash_prompt, normalize_prompt
|
||||||
|
return hash_prompt(normalize_prompt(last_user), namespace)
|
||||||
|
|
||||||
|
def _parse_messages(
|
||||||
|
self, messages: list[dict]
|
||||||
|
) -> tuple[str, list[dict], str]:
|
||||||
|
"""
|
||||||
|
Returns (system_prompt, prior_history_turns, last_user_message).
|
||||||
|
Multimodal content lists are reduced to their text parts.
|
||||||
|
"""
|
||||||
|
system = ""
|
||||||
|
turns: list[dict] = []
|
||||||
|
|
||||||
|
for m in messages:
|
||||||
|
role = m.get("role", "")
|
||||||
|
content = m.get("content", "")
|
||||||
|
if isinstance(content, list):
|
||||||
|
content = " ".join(
|
||||||
|
p.get("text", "")
|
||||||
|
for p in content
|
||||||
|
if isinstance(p, dict) and p.get("type") == "text"
|
||||||
|
)
|
||||||
|
if role == "system":
|
||||||
|
system = content
|
||||||
|
else:
|
||||||
|
turns.append({"role": role, "content": content})
|
||||||
|
|
||||||
|
last_user = ""
|
||||||
|
for m in reversed(turns):
|
||||||
|
if m["role"] == "user":
|
||||||
|
last_user = m["content"]
|
||||||
|
break
|
||||||
|
|
||||||
|
# History = all turns before the final user message
|
||||||
|
history = turns[:-1] if turns and turns[-1]["role"] == "user" else turns
|
||||||
|
return system, history, last_user
|
||||||
|
|
||||||
|
async def _build_embedding(
|
||||||
|
self, history: list[dict], last_user: str
|
||||||
|
) -> list[float] | None:
|
||||||
|
"""
|
||||||
|
Weighted mean of BM25-weighted history embedding and last-user embedding.
|
||||||
|
Returns None when not in semantic mode.
|
||||||
|
"""
|
||||||
|
if not self._semantic:
|
||||||
|
return None
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
alpha: float = self._cfg.cache_history_weight # weight for history signal
|
||||||
|
q_vec = np.array(await self._emb_cache.aencode(last_user), dtype=float)
|
||||||
|
|
||||||
|
if not history:
|
||||||
|
# No history → use question embedding alone (alpha has no effect)
|
||||||
|
return q_vec.tolist()
|
||||||
|
|
||||||
|
h_text = _bm25_weighted_text(history)
|
||||||
|
h_vec = np.array(await self._emb_cache.aencode(h_text), dtype=float)
|
||||||
|
|
||||||
|
combined = alpha * h_vec + (1.0 - alpha) * q_vec
|
||||||
|
norm = float(np.linalg.norm(combined))
|
||||||
|
if norm > 0.0:
|
||||||
|
combined /= norm
|
||||||
|
return combined.tolist()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Public interface: chat (handles both Ollama and OpenAI message lists)
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def get_chat(
|
||||||
|
self, route: str, model: str, messages: list[dict]
|
||||||
|
) -> bytes | None:
|
||||||
|
"""Return cached response bytes, or None on miss."""
|
||||||
|
if not self._backend:
|
||||||
|
return None
|
||||||
|
|
||||||
|
system, history, last_user = self._parse_messages(messages)
|
||||||
|
if not last_user:
|
||||||
|
return None
|
||||||
|
|
||||||
|
ns = self._namespace(route, model, system)
|
||||||
|
key = self._cache_key(ns, last_user)
|
||||||
|
|
||||||
|
# 1. Exact key match
|
||||||
|
entry = await self._backend.get(key)
|
||||||
|
if entry is not None:
|
||||||
|
self._hits += 1
|
||||||
|
return entry.response # type: ignore[return-value]
|
||||||
|
|
||||||
|
# 2. Semantic similarity match
|
||||||
|
if self._semantic and self._cfg.cache_similarity < 1.0:
|
||||||
|
emb = await self._build_embedding(history, last_user)
|
||||||
|
result = await self._backend.find_similar(
|
||||||
|
emb, threshold=self._cfg.cache_similarity, namespace=ns
|
||||||
|
)
|
||||||
|
if result is not None:
|
||||||
|
_, matched, _ = result
|
||||||
|
self._hits += 1
|
||||||
|
return matched.response # type: ignore[return-value]
|
||||||
|
|
||||||
|
self._misses += 1
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def set_chat(
|
||||||
|
self, route: str, model: str, messages: list[dict], response_bytes: bytes
|
||||||
|
) -> None:
|
||||||
|
"""Store a response in the cache (fire-and-forget friendly)."""
|
||||||
|
if not self._backend:
|
||||||
|
return
|
||||||
|
|
||||||
|
system, history, last_user = self._parse_messages(messages)
|
||||||
|
if not last_user:
|
||||||
|
return
|
||||||
|
|
||||||
|
ns = self._namespace(route, model, system)
|
||||||
|
key = self._cache_key(ns, last_user)
|
||||||
|
|
||||||
|
emb = (
|
||||||
|
await self._build_embedding(history, last_user)
|
||||||
|
if self._semantic and self._cfg.cache_similarity < 1.0
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
from semantic_llm_cache.config import CacheEntry
|
||||||
|
|
||||||
|
await self._backend.set(
|
||||||
|
key,
|
||||||
|
CacheEntry(
|
||||||
|
prompt=last_user,
|
||||||
|
response=response_bytes,
|
||||||
|
embedding=emb,
|
||||||
|
created_at=time.time(),
|
||||||
|
ttl=self._cfg.cache_ttl,
|
||||||
|
namespace=ns,
|
||||||
|
hit_count=0,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Convenience wrappers for the generate route (prompt string, not messages)
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def get_generate(
|
||||||
|
self, model: str, prompt: str, system: str = ""
|
||||||
|
) -> bytes | None:
|
||||||
|
messages: list[dict] = []
|
||||||
|
if system:
|
||||||
|
messages.append({"role": "system", "content": system})
|
||||||
|
messages.append({"role": "user", "content": prompt})
|
||||||
|
return await self.get_chat("generate", model, messages)
|
||||||
|
|
||||||
|
async def set_generate(
|
||||||
|
self, model: str, prompt: str, system: str, response_bytes: bytes
|
||||||
|
) -> None:
|
||||||
|
messages: list[dict] = []
|
||||||
|
if system:
|
||||||
|
messages.append({"role": "system", "content": system})
|
||||||
|
messages.append({"role": "user", "content": prompt})
|
||||||
|
await self.set_chat("generate", model, messages, response_bytes)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Management
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def stats(self) -> dict:
|
||||||
|
total = self._hits + self._misses
|
||||||
|
return {
|
||||||
|
"hits": self._hits,
|
||||||
|
"misses": self._misses,
|
||||||
|
"hit_rate": round(self._hits / total, 3) if total else 0.0,
|
||||||
|
"semantic": self._semantic,
|
||||||
|
"backend": self._cfg.cache_backend,
|
||||||
|
"similarity_threshold": self._cfg.cache_similarity,
|
||||||
|
"history_weight": self._cfg.cache_history_weight,
|
||||||
|
}
|
||||||
|
|
||||||
|
async def clear(self) -> None:
|
||||||
|
if self._backend:
|
||||||
|
await self._backend.clear()
|
||||||
|
self._hits = 0
|
||||||
|
self._misses = 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module-level singleton
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_cache: LLMCache | None = None
|
||||||
|
|
||||||
|
|
||||||
|
async def init_llm_cache(cfg: Any) -> LLMCache | None:
    """
    Create and initialise the module-level cache singleton.

    When ``cfg.cache_enabled`` is falsy a notice is printed and None is
    returned without touching the existing singleton. Otherwise a new
    ``LLMCache`` is stored in the module-level ``_cache``, its ``init()``
    coroutine is awaited, and the singleton is returned.
    """
    global _cache
    if cfg.cache_enabled:
        _cache = LLMCache(cfg)
        await _cache.init()
        return _cache

    print("[cache] Cache disabled (cache_enabled=false).")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_llm_cache() -> LLMCache | None:
    """Return the module-level cache singleton, or None if none has been set."""
    current = _cache
    return current
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helper: convert a stored OpenAI-format non-streaming response to an
|
||||||
|
# OpenAI SSE single-chunk stream (used when a streaming OpenAI request
|
||||||
|
# hits the cache whose entry was populated from a non-streaming response).
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def openai_nonstream_to_sse(cached_bytes: bytes, model: str) -> bytes:
    """
    Re-package a cached OpenAI ChatCompletion JSON document as an SSE stream.

    The output is one ``chat.completion.chunk`` event carrying the first
    choice's message content, followed by the ``[DONE]`` sentinel. ``id``,
    ``created`` and ``model`` are copied from the cached document when present
    (falling back to ``"cache-hit"``, the current time and the ``model``
    argument), and ``usage`` is forwarded when it is non-empty.

    If the cached bytes cannot be parsed or do not have the expected shape, a
    ``RuntimeWarning`` is issued and a stream containing only ``[DONE]`` is
    returned.
    """
    import time as _time

    import orjson

    try:
        payload = orjson.loads(cached_bytes)

        choices = payload.get("choices") or [{}]
        message = choices[0].get("message", {})
        text = message.get("content", "")

        event = {
            "id": payload.get("id", "cache-hit"),
            "object": "chat.completion.chunk",
            "created": payload.get("created", int(_time.time())),
            "model": payload.get("model", model),
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": text},
                    "finish_reason": "stop",
                }
            ],
        }
        usage = payload.get("usage")
        if usage:
            event["usage"] = usage

        serialized = orjson.dumps(event).decode()
        return f"data: {serialized}\n\ndata: [DONE]\n\n".encode()
    except Exception as exc:
        # Best effort: a broken cache entry must not break the client stream.
        warnings.warn(
            f"[cache] openai_nonstream_to_sse: corrupt cache entry, returning empty stream: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return b"data: [DONE]\n\n"
|
||||||
38
config.yaml
38
config.yaml
|
|
@ -6,7 +6,7 @@ endpoints:
|
||||||
- https://api.openai.com/v1
|
- https://api.openai.com/v1
|
||||||
|
|
||||||
llama_server_endpoints:
|
llama_server_endpoints:
|
||||||
- http://192.168.0.33:8889/v1
|
- http://192.168.0.50:8889/v1
|
||||||
|
|
||||||
# Maximum concurrent connections *per endpoint‑model pair* (equals to OLLAMA_NUM_PARALLEL)
|
# Maximum concurrent connections *per endpoint‑model pair* (equals to OLLAMA_NUM_PARALLEL)
|
||||||
max_concurrent_connections: 2
|
max_concurrent_connections: 2
|
||||||
|
|
@ -22,4 +22,38 @@ api_keys:
|
||||||
"http://192.168.0.51:11434": "ollama"
|
"http://192.168.0.51:11434": "ollama"
|
||||||
"http://192.168.0.52:11434": "ollama"
|
"http://192.168.0.52:11434": "ollama"
|
||||||
"https://api.openai.com/v1": "${OPENAI_KEY}"
|
"https://api.openai.com/v1": "${OPENAI_KEY}"
|
||||||
"http://192.168.0.33:8889/v1": "llama"
|
"http://192.168.0.50:8889/v1": "llama"
|
||||||
|
|
||||||
|
# -------------------------------------------------------------
|
||||||
|
# Semantic LLM Cache (optional — disabled by default)
|
||||||
|
# Caches LLM responses to cut costs and latency on repeated or
|
||||||
|
# semantically similar prompts.
|
||||||
|
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
|
||||||
|
# MOE requests (moe-* model prefix) always bypass the cache.
|
||||||
|
# -------------------------------------------------------------
|
||||||
|
cache_enabled: true
|
||||||
|
|
||||||
|
# Backend — where cached responses are stored:
|
||||||
|
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
|
||||||
|
# sqlite → persistent file-based (single instance, survives restart)
|
||||||
|
# redis → distributed (shared across replicas, requires Redis)
|
||||||
|
cache_backend: memory
|
||||||
|
|
||||||
|
# Cosine similarity threshold for a cache hit:
|
||||||
|
# 1.0 → exact match only (works on any image variant)
|
||||||
|
# <1.0 → semantic matching (requires the :semantic Docker image tag)
|
||||||
|
cache_similarity: 0.9
|
||||||
|
|
||||||
|
# Response TTL in seconds. Remove the key or set to null to cache forever.
|
||||||
|
cache_ttl: 3600
|
||||||
|
|
||||||
|
# SQLite backend: path to the cache database file
|
||||||
|
cache_db_path: llm_cache.db
|
||||||
|
|
||||||
|
# Redis backend: connection URL
|
||||||
|
# cache_redis_url: redis://localhost:6379/0
|
||||||
|
|
||||||
|
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
|
||||||
|
# 0.3 = 30% history context signal, 70% question signal.
|
||||||
|
# Only relevant when cache_similarity < 1.0.
|
||||||
|
# cache_history_weight: 0.3
|
||||||
|
|
@ -204,6 +204,149 @@ max_concurrent_connections: 3
|
||||||
|
|
||||||
**Recommendation**: Use multiple endpoints for redundancy and load distribution.
|
**Recommendation**: Use multiple endpoints for redundancy and load distribution.
|
||||||
|
|
||||||
|
## Semantic LLM Cache
|
||||||
|
|
||||||
|
NOMYO Router can cache LLM responses and serve them directly — skipping endpoint selection, model load, and token generation entirely.
|
||||||
|
|
||||||
|
### How it works
|
||||||
|
|
||||||
|
1. On every cacheable request (`/api/chat`, `/api/generate`, `/v1/chat/completions`, `/v1/completions`) the cache is checked **before** choosing an endpoint.
|
||||||
|
2. On a **cache hit** the stored response is returned immediately as a single chunk (streaming or non-streaming — both work).
|
||||||
|
3. On a **cache miss** the request is forwarded normally. The response is stored in the cache after it completes.
|
||||||
|
4. **MOE requests** (`moe-*` model prefix) always bypass the cache.
|
||||||
|
5. **Token counts** are never recorded for cache hits.
|
||||||
|
|
||||||
|
### Cache key strategy
|
||||||
|
|
||||||
|
| Signal | How matched |
|
||||||
|
|---|---|
|
||||||
|
| `model + system_prompt` | Exact — hard context isolation per deployment |
|
||||||
|
| BM25-weighted embedding of chat history | Semantic — conversation context signal |
|
||||||
|
| Embedding of last user message | Semantic — the actual question |
|
||||||
|
|
||||||
|
The two semantic vectors are combined as a weighted mean (tuned by `cache_history_weight`) before cosine similarity comparison, staying at a single 384-dimensional vector compatible with the library's storage format.
|
||||||
|
|
||||||
|
### Quick start — exact match (lean image)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
cache_enabled: true
|
||||||
|
cache_backend: sqlite # persists across restarts
|
||||||
|
cache_similarity: 1.0 # exact match only, no sentence-transformers needed
|
||||||
|
cache_ttl: 3600
|
||||||
|
```
|
||||||
|
|
||||||
|
### Quick start — semantic matching (:semantic image)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
cache_enabled: true
|
||||||
|
cache_backend: sqlite
|
||||||
|
cache_similarity: 0.90 # hit if ≥90% cosine similarity
|
||||||
|
cache_ttl: 3600
|
||||||
|
cache_history_weight: 0.3
|
||||||
|
```
|
||||||
|
|
||||||
|
Pull the semantic image:
|
||||||
|
```bash
|
||||||
|
docker pull ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cache configuration options
|
||||||
|
|
||||||
|
#### `cache_enabled`
|
||||||
|
|
||||||
|
**Type**: `bool` | **Default**: `false`
|
||||||
|
|
||||||
|
Enable or disable the cache. All other cache settings are ignored when `false`.
|
||||||
|
|
||||||
|
#### `cache_backend`
|
||||||
|
|
||||||
|
**Type**: `str` | **Default**: `"memory"`
|
||||||
|
|
||||||
|
| Value | Description | Persists | Multi-replica |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `memory` | In-process LRU dict | ❌ | ❌ |
|
||||||
|
| `sqlite` | File-based via `aiosqlite` | ✅ | ❌ |
|
||||||
|
| `redis` | Redis via `redis.asyncio` | ✅ | ✅ |
|
||||||
|
|
||||||
|
Use `redis` when running multiple router replicas behind a load balancer — all replicas share one warm cache.
|
||||||
|
|
||||||
|
#### `cache_similarity`
|
||||||
|
|
||||||
|
**Type**: `float` | **Default**: `1.0`
|
||||||
|
|
||||||
|
Cosine similarity threshold. `1.0` means exact match only (no embedding model needed). Values below `1.0` enable semantic matching, which requires the `:semantic` Docker image tag.
|
||||||
|
|
||||||
|
Recommended starting value for semantic mode: `0.90`.
|
||||||
|
|
||||||
|
#### `cache_ttl`
|
||||||
|
|
||||||
|
**Type**: `int | null` | **Default**: `3600`
|
||||||
|
|
||||||
|
Time-to-live for cache entries in seconds. Set to `null` to cache forever. Omitting the key does **not** disable expiry — the default of `3600` seconds applies.
|
||||||
|
|
||||||
|
#### `cache_db_path`
|
||||||
|
|
||||||
|
**Type**: `str` | **Default**: `"llm_cache.db"`
|
||||||
|
|
||||||
|
Path to the SQLite cache database. Only used when `cache_backend: sqlite`.
|
||||||
|
|
||||||
|
#### `cache_redis_url`
|
||||||
|
|
||||||
|
**Type**: `str` | **Default**: `"redis://localhost:6379/0"`
|
||||||
|
|
||||||
|
Redis connection URL. Only used when `cache_backend: redis`.
|
||||||
|
|
||||||
|
#### `cache_history_weight`
|
||||||
|
|
||||||
|
**Type**: `float` | **Default**: `0.3`
|
||||||
|
|
||||||
|
Weight of the BM25-weighted chat-history embedding in the combined cache key vector. `0.3` means the history contributes 30% and the final user message contributes 70% of the similarity signal. Only used when `cache_similarity < 1.0`.
|
||||||
|
|
||||||
|
### Cache management endpoints
|
||||||
|
|
||||||
|
| Endpoint | Method | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `/api/cache/stats` | `GET` | Hit/miss counters, hit rate, current config |
|
||||||
|
| `/api/cache/invalidate` | `POST` | Clear all cache entries and reset counters |
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check cache performance
|
||||||
|
curl http://localhost:12434/api/cache/stats
|
||||||
|
|
||||||
|
# Clear the cache
|
||||||
|
curl -X POST http://localhost:12434/api/cache/invalidate
|
||||||
|
```
|
||||||
|
|
||||||
|
Example stats response:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"enabled": true,
|
||||||
|
"hits": 1547,
|
||||||
|
"misses": 892,
|
||||||
|
"hit_rate": 0.634,
|
||||||
|
"semantic": true,
|
||||||
|
"backend": "sqlite",
|
||||||
|
"similarity_threshold": 0.9,
|
||||||
|
"history_weight": 0.3
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Docker image variants
|
||||||
|
|
||||||
|
| Tag | Semantic cache | Image size |
|
||||||
|
|---|---|---|
|
||||||
|
| `latest` | ❌ exact match only | ~300 MB |
|
||||||
|
| `latest-semantic` | ✅ sentence-transformers + model pre-baked | ~800 MB |
|
||||||
|
|
||||||
|
Build locally:
|
||||||
|
```bash
|
||||||
|
# Lean (exact match)
|
||||||
|
docker build -t nomyo-router .
|
||||||
|
|
||||||
|
# Semantic (~500 MB larger, all-MiniLM-L6-v2 model baked in)
|
||||||
|
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
|
||||||
|
```
|
||||||
|
|
||||||
## Configuration Validation
|
## Configuration Validation
|
||||||
|
|
||||||
The router validates the configuration at startup:
|
The router validates the configuration at startup:
|
||||||
|
|
|
||||||
|
|
@ -82,10 +82,23 @@ sudo systemctl status nomyo-router
|
||||||
|
|
||||||
## 2. Docker Deployment
|
## 2. Docker Deployment
|
||||||
|
|
||||||
|
### Image variants
|
||||||
|
|
||||||
|
| Tag | Semantic cache | Image size |
|
||||||
|
|---|---|---|
|
||||||
|
| `latest` | ❌ exact match only | ~300 MB |
|
||||||
|
| `latest-semantic` | ✅ sentence-transformers + `all-MiniLM-L6-v2` pre-baked | ~800 MB |
|
||||||
|
|
||||||
|
The `:semantic` variant enables `cache_similarity < 1.0` in `config.yaml`. The lean image falls back to exact-match caching with a warning if semantic mode is configured.
|
||||||
|
|
||||||
### Build the Image
|
### Build the Image
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# Lean build (exact match cache, default)
|
||||||
docker build -t nomyo-router .
|
docker build -t nomyo-router .
|
||||||
|
|
||||||
|
# Semantic build (~500 MB larger, all-MiniLM-L6-v2 model baked in at build time)
|
||||||
|
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
|
||||||
```
|
```
|
||||||
|
|
||||||
### Run the Container
|
### Run the Container
|
||||||
|
|
|
||||||
|
|
@ -1,20 +1,30 @@
|
||||||
# Docker Compose example for NOMYO Router with multiple Ollama instances
|
# Docker Compose example for NOMYO Router with multiple Ollama instances
|
||||||
|
#
|
||||||
|
# Two router profiles are provided:
|
||||||
|
# nomyo-router — lean image, exact-match cache only (~300 MB)
|
||||||
|
# nomyo-router-semantic — semantic image, sentence-transformers baked in (~800 MB)
|
||||||
|
#
|
||||||
|
# Uncomment the redis service and set cache_backend: redis in config.yaml
|
||||||
|
# to share the LLM response cache across multiple router replicas.
|
||||||
|
|
||||||
version: '3.8'
|
version: '3.8'
|
||||||
|
|
||||||
services:
|
services:
|
||||||
# NOMYO Router
|
# NOMYO Router — lean image (exact-match cache, default)
|
||||||
nomyo-router:
|
nomyo-router:
|
||||||
image: nomyo-router:latest
|
image: nomyo-router:latest
|
||||||
build: .
|
build:
|
||||||
|
context: .
|
||||||
|
args:
|
||||||
|
SEMANTIC_CACHE: "false"
|
||||||
ports:
|
ports:
|
||||||
- "12434:12434"
|
- "12434:12434"
|
||||||
environment:
|
environment:
|
||||||
- CONFIG_PATH=/app/config/config.yaml
|
- CONFIG_PATH=/app/config/config.yaml
|
||||||
- NOMYO_ROUTER_DB_PATH=/app/token_counts.db
|
- NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
|
||||||
volumes:
|
volumes:
|
||||||
- ./config:/app/config
|
- ./config:/app/config
|
||||||
- router-db:/app/token_counts.db
|
- router-data:/app/data
|
||||||
depends_on:
|
depends_on:
|
||||||
- ollama1
|
- ollama1
|
||||||
- ollama2
|
- ollama2
|
||||||
|
|
@ -23,6 +33,45 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- nomyo-net
|
- nomyo-net
|
||||||
|
|
||||||
|
# NOMYO Router — semantic image (cache_similarity < 1.0 support, ~800 MB)
|
||||||
|
# Build: docker compose build nomyo-router-semantic
|
||||||
|
# Switch: comment out nomyo-router above, uncomment this block.
|
||||||
|
# nomyo-router-semantic:
|
||||||
|
# image: nomyo-router:semantic
|
||||||
|
# build:
|
||||||
|
# context: .
|
||||||
|
# args:
|
||||||
|
# SEMANTIC_CACHE: "true"
|
||||||
|
# ports:
|
||||||
|
# - "12434:12434"
|
||||||
|
# environment:
|
||||||
|
# - CONFIG_PATH=/app/config/config.yaml
|
||||||
|
# - NOMYO_ROUTER_DB_PATH=/app/data/token_counts.db
|
||||||
|
# volumes:
|
||||||
|
# - ./config:/app/config
|
||||||
|
# - router-data:/app/data
|
||||||
|
# - hf-cache:/app/data/hf_cache # share HuggingFace model cache across builds
|
||||||
|
# depends_on:
|
||||||
|
# - ollama1
|
||||||
|
# - ollama2
|
||||||
|
# - ollama3
|
||||||
|
# restart: unless-stopped
|
||||||
|
# networks:
|
||||||
|
# - nomyo-net
|
||||||
|
|
||||||
|
# Optional: Redis for shared LLM response cache across multiple router replicas.
|
||||||
|
# Requires cache_backend: redis in config.yaml.
|
||||||
|
# redis:
|
||||||
|
# image: redis:7-alpine
|
||||||
|
# ports:
|
||||||
|
# - "6379:6379"
|
||||||
|
# volumes:
|
||||||
|
# - redis-data:/data
|
||||||
|
# command: redis-server --save 60 1 --loglevel warning
|
||||||
|
# restart: unless-stopped
|
||||||
|
# networks:
|
||||||
|
# - nomyo-net
|
||||||
|
|
||||||
# Ollama Instance 1
|
# Ollama Instance 1
|
||||||
ollama1:
|
ollama1:
|
||||||
image: ollama/ollama:latest
|
image: ollama/ollama:latest
|
||||||
|
|
@ -87,7 +136,9 @@ services:
|
||||||
- nomyo-net
|
- nomyo-net
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
router-db:
|
router-data:
|
||||||
|
# hf-cache: # uncomment when using nomyo-router-semantic
|
||||||
|
# redis-data: # uncomment when using Redis cache backend
|
||||||
ollama1-data:
|
ollama1-data:
|
||||||
ollama2-data:
|
ollama2-data:
|
||||||
ollama3-data:
|
ollama3-data:
|
||||||
|
|
|
||||||
|
|
@ -30,3 +30,37 @@ api_keys:
|
||||||
"https://api.openai.com/v1": "${OPENAI_KEY}"
|
"https://api.openai.com/v1": "${OPENAI_KEY}"
|
||||||
"http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config
|
"http://localhost:8080/v1": "llama-server" # Optional API key for llama-server - depends on llama_server config
|
||||||
"http://192.168.0.33:8081/v1": "llama-server"
|
"http://192.168.0.33:8081/v1": "llama-server"
|
||||||
|
|
||||||
|
# -------------------------------------------------------------
|
||||||
|
# Semantic LLM Cache (optional — disabled by default)
|
||||||
|
# Caches LLM responses to cut costs and latency on repeated or
|
||||||
|
# semantically similar prompts.
|
||||||
|
# Cached routes: /api/chat /api/generate /v1/chat/completions /v1/completions
|
||||||
|
# MOE requests (moe-* model prefix) always bypass the cache.
|
||||||
|
# -------------------------------------------------------------
|
||||||
|
# cache_enabled: false
|
||||||
|
|
||||||
|
# Backend — where cached responses are stored:
|
||||||
|
# memory → in-process LRU (lost on restart, not shared across replicas) [default]
|
||||||
|
# sqlite → persistent file-based (single instance, survives restart)
|
||||||
|
# redis → distributed (shared across replicas, requires Redis)
|
||||||
|
# cache_backend: memory
|
||||||
|
|
||||||
|
# Cosine similarity threshold for a cache hit:
|
||||||
|
# 1.0 → exact match only (works on any image variant)
|
||||||
|
# <1.0 → semantic matching (requires the :semantic Docker image tag)
|
||||||
|
# cache_similarity: 1.0
|
||||||
|
|
||||||
|
# Response TTL in seconds. Set to null to cache forever; if the key is omitted the default of 3600 applies.
|
||||||
|
# cache_ttl: 3600
|
||||||
|
|
||||||
|
# SQLite backend: path to the cache database file
|
||||||
|
# cache_db_path: llm_cache.db
|
||||||
|
|
||||||
|
# Redis backend: connection URL
|
||||||
|
# cache_redis_url: redis://localhost:6379/0
|
||||||
|
|
||||||
|
# Weight of the BM25-weighted chat-history embedding vs last-user-message embedding.
|
||||||
|
# 0.3 = 30% history context signal, 70% question signal.
|
||||||
|
# Only relevant when cache_similarity < 1.0.
|
||||||
|
# cache_history_weight: 0.3
|
||||||
|
|
@ -133,6 +133,39 @@ Response:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Cache Statistics
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:12434/api/cache/stats
|
||||||
|
```
|
||||||
|
|
||||||
|
Response when cache is enabled:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"enabled": true,
|
||||||
|
"hits": 1547,
|
||||||
|
"misses": 892,
|
||||||
|
"hit_rate": 0.634,
|
||||||
|
"semantic": true,
|
||||||
|
"backend": "sqlite",
|
||||||
|
"similarity_threshold": 0.9,
|
||||||
|
"history_weight": 0.3
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Response when cache is disabled:
|
||||||
|
```json
|
||||||
|
{ "enabled": false }
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cache Invalidation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:12434/api/cache/invalidate
|
||||||
|
```
|
||||||
|
|
||||||
|
Clears all cached entries and resets hit/miss counters.
|
||||||
|
|
||||||
### Real-time Usage Stream
|
### Real-time Usage Stream
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
|
||||||
|
|
@ -39,3 +39,8 @@ uvicorn==0.38.0
|
||||||
uvloop
|
uvloop
|
||||||
yarl==1.20.1
|
yarl==1.20.1
|
||||||
aiosqlite
|
aiosqlite
|
||||||
|
# Semantic LLM cache — base install (exact-match mode, no heavy ML deps)
|
||||||
|
# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
|
||||||
|
# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.:
|
||||||
|
# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0
|
||||||
|
semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git
|
||||||
|
|
|
||||||
214
router.py
214
router.py
|
|
@ -123,6 +123,22 @@ class Config(BaseSettings):
|
||||||
# Database configuration
|
# Database configuration
|
||||||
db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))
|
db_path: str = Field(default=os.getenv("NOMYO_ROUTER_DB_PATH", "token_counts.db"))
|
||||||
|
|
||||||
|
# Semantic LLM Cache configuration
|
||||||
|
cache_enabled: bool = Field(default=False)
|
||||||
|
# Backend: "memory" (default, in-process), "sqlite" (persistent), "redis" (distributed)
|
||||||
|
cache_backend: str = Field(default="memory")
|
||||||
|
# Cosine similarity threshold: 1.0 = exact match only, <1.0 = semantic (requires :semantic image)
|
||||||
|
cache_similarity: float = Field(default=1.0)
|
||||||
|
# TTL in seconds; None = cache forever
|
||||||
|
cache_ttl: Optional[int] = Field(default=3600)
|
||||||
|
# SQLite backend: path to cache database file
|
||||||
|
cache_db_path: str = Field(default="llm_cache.db")
|
||||||
|
# Redis backend: connection URL
|
||||||
|
cache_redis_url: str = Field(default="redis://localhost:6379/0")
|
||||||
|
# Weight of BM25-weighted chat-history embedding vs last-user-message embedding
|
||||||
|
# 0.3 = 30% history context signal, 70% question signal
|
||||||
|
cache_history_weight: float = Field(default=0.3)
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
# Load from `config.yaml` first, then from env variables
|
# Load from `config.yaml` first, then from env variables
|
||||||
env_prefix = "NOMYO_ROUTER_"
|
env_prefix = "NOMYO_ROUTER_"
|
||||||
|
|
@ -188,6 +204,7 @@ def _config_path_from_env() -> Path:
|
||||||
|
|
||||||
from ollama._types import TokenLogprob, Logprob
|
from ollama._types import TokenLogprob, Logprob
|
||||||
from db import TokenDatabase
|
from db import TokenDatabase
|
||||||
|
from cache import init_llm_cache, get_llm_cache, openai_nonstream_to_sse
|
||||||
|
|
||||||
|
|
||||||
# Create the global config object – it will be overwritten on startup
|
# Create the global config object – it will be overwritten on startup
|
||||||
|
|
@ -1596,6 +1613,14 @@ async def proxy(request: Request):
|
||||||
error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
|
error_msg = f"Invalid JSON format in request body: {str(e)}. Please ensure the request is properly formatted."
|
||||||
raise HTTPException(status_code=400, detail=error_msg) from e
|
raise HTTPException(status_code=400, detail=error_msg) from e
|
||||||
|
|
||||||
|
# Cache lookup — before endpoint selection so no slot is wasted on a hit
|
||||||
|
_cache = get_llm_cache()
|
||||||
|
if _cache is not None:
|
||||||
|
_cached = await _cache.get_generate(model, prompt, system or "")
|
||||||
|
if _cached is not None:
|
||||||
|
async def _serve_cached_generate():
|
||||||
|
yield _cached
|
||||||
|
return StreamingResponse(_serve_cached_generate(), media_type="application/json")
|
||||||
|
|
||||||
endpoint, tracking_model = await choose_endpoint(model)
|
endpoint, tracking_model = await choose_endpoint(model)
|
||||||
use_openai = is_openai_compatible(endpoint)
|
use_openai = is_openai_compatible(endpoint)
|
||||||
|
|
@ -1633,6 +1658,7 @@ async def proxy(request: Request):
|
||||||
else:
|
else:
|
||||||
async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
|
async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
|
||||||
if stream == True:
|
if stream == True:
|
||||||
|
content_parts: list[str] = []
|
||||||
async for chunk in async_gen:
|
async for chunk in async_gen:
|
||||||
if use_openai:
|
if use_openai:
|
||||||
chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
|
chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
|
||||||
|
|
@ -1644,6 +1670,27 @@ async def proxy(request: Request):
|
||||||
json_line = chunk.model_dump_json()
|
json_line = chunk.model_dump_json()
|
||||||
else:
|
else:
|
||||||
json_line = orjson.dumps(chunk)
|
json_line = orjson.dumps(chunk)
|
||||||
|
# Accumulate and store cache on done chunk — before yield so it always runs
|
||||||
|
if _cache is not None:
|
||||||
|
if getattr(chunk, "response", None):
|
||||||
|
content_parts.append(chunk.response)
|
||||||
|
if getattr(chunk, "done", False):
|
||||||
|
assembled = orjson.dumps({
|
||||||
|
k: v for k, v in {
|
||||||
|
"model": getattr(chunk, "model", model),
|
||||||
|
"response": "".join(content_parts),
|
||||||
|
"done": True,
|
||||||
|
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
|
||||||
|
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
|
||||||
|
"eval_count": getattr(chunk, "eval_count", None),
|
||||||
|
"total_duration": getattr(chunk, "total_duration", None),
|
||||||
|
"eval_duration": getattr(chunk, "eval_duration", None),
|
||||||
|
}.items() if v is not None
|
||||||
|
}) + b"\n"
|
||||||
|
try:
|
||||||
|
await _cache.set_generate(model, prompt, system or "", assembled)
|
||||||
|
except Exception as _ce:
|
||||||
|
print(f"[cache] set_generate (streaming) failed: {_ce}")
|
||||||
yield json_line.encode("utf-8") + b"\n"
|
yield json_line.encode("utf-8") + b"\n"
|
||||||
else:
|
else:
|
||||||
if use_openai:
|
if use_openai:
|
||||||
|
|
@ -1660,7 +1707,14 @@ async def proxy(request: Request):
|
||||||
if hasattr(async_gen, "model_dump_json")
|
if hasattr(async_gen, "model_dump_json")
|
||||||
else orjson.dumps(async_gen)
|
else orjson.dumps(async_gen)
|
||||||
)
|
)
|
||||||
yield json_line.encode("utf-8") + b"\n"
|
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||||
|
yield cache_bytes
|
||||||
|
# Cache non-streaming response
|
||||||
|
if _cache is not None:
|
||||||
|
try:
|
||||||
|
await _cache.set_generate(model, prompt, system or "", cache_bytes)
|
||||||
|
except Exception as _ce:
|
||||||
|
print(f"[cache] set_generate (non-streaming) failed: {_ce}")
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
# Ensure counter is decremented even if an exception occurs
|
# Ensure counter is decremented even if an exception occurs
|
||||||
|
|
@ -1711,6 +1765,26 @@ async def chat_proxy(request: Request):
|
||||||
except orjson.JSONDecodeError as e:
|
except orjson.JSONDecodeError as e:
|
||||||
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
||||||
|
|
||||||
|
# Cache lookup — before endpoint selection, always bypassed for MOE
|
||||||
|
_is_moe = model.startswith("moe-")
|
||||||
|
_cache = get_llm_cache()
|
||||||
|
# Normalise model name for cache key: strip ":latest" suffix here so that
|
||||||
|
# get_chat and set_chat use the same model string regardless of when the
|
||||||
|
# strip happens further down (the OpenAI-compatible endpoint branch strips it later).
|
||||||
|
_cache_model = model[: -len(":latest")] if model.endswith(":latest") else model
|
||||||
|
# Snapshot original messages before any OpenAI-format transformation so that
|
||||||
|
# get_chat and set_chat always use the same key regardless of backend type.
|
||||||
|
_cache_messages = messages
|
||||||
|
if _cache is not None and not _is_moe:
|
||||||
|
_cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
|
||||||
|
if _cached is not None:
|
||||||
|
async def _serve_cached_chat():
|
||||||
|
yield _cached
|
||||||
|
return StreamingResponse(
|
||||||
|
_serve_cached_chat(),
|
||||||
|
media_type="application/x-ndjson" if stream else "application/json",
|
||||||
|
)
|
||||||
|
|
||||||
# 2. Endpoint logic
|
# 2. Endpoint logic
|
||||||
if model.startswith("moe-"):
|
if model.startswith("moe-"):
|
||||||
model = model.split("moe-")[1]
|
model = model.split("moe-")[1]
|
||||||
|
|
@ -1764,6 +1838,7 @@ async def chat_proxy(request: Request):
|
||||||
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
|
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive, logprobs=logprobs, top_logprobs=top_logprobs)
|
||||||
if stream == True:
|
if stream == True:
|
||||||
tc_acc = {} # accumulate OpenAI tool-call deltas across chunks
|
tc_acc = {} # accumulate OpenAI tool-call deltas across chunks
|
||||||
|
content_parts: list[str] = []
|
||||||
async for chunk in async_gen:
|
async for chunk in async_gen:
|
||||||
if use_openai:
|
if use_openai:
|
||||||
_accumulate_openai_tc_delta(chunk, tc_acc)
|
_accumulate_openai_tc_delta(chunk, tc_acc)
|
||||||
|
|
@ -1780,6 +1855,30 @@ async def chat_proxy(request: Request):
|
||||||
json_line = chunk.model_dump_json()
|
json_line = chunk.model_dump_json()
|
||||||
else:
|
else:
|
||||||
json_line = orjson.dumps(chunk)
|
json_line = orjson.dumps(chunk)
|
||||||
|
# Accumulate and store cache on done chunk — before yield so it always runs
|
||||||
|
# Works for both Ollama-native and OpenAI-compatible backends; chunks are
|
||||||
|
# already converted to Ollama format by rechunk before this point.
|
||||||
|
if _cache is not None and not _is_moe:
|
||||||
|
if chunk.message and getattr(chunk.message, "content", None):
|
||||||
|
content_parts.append(chunk.message.content)
|
||||||
|
if getattr(chunk, "done", False):
|
||||||
|
assembled = orjson.dumps({
|
||||||
|
k: v for k, v in {
|
||||||
|
"model": getattr(chunk, "model", model),
|
||||||
|
"created_at": (lambda ca: ca.isoformat() if hasattr(ca, "isoformat") else ca)(getattr(chunk, "created_at", None)),
|
||||||
|
"message": {"role": "assistant", "content": "".join(content_parts)},
|
||||||
|
"done": True,
|
||||||
|
"done_reason": getattr(chunk, "done_reason", "stop") or "stop",
|
||||||
|
"prompt_eval_count": getattr(chunk, "prompt_eval_count", None),
|
||||||
|
"eval_count": getattr(chunk, "eval_count", None),
|
||||||
|
"total_duration": getattr(chunk, "total_duration", None),
|
||||||
|
"eval_duration": getattr(chunk, "eval_duration", None),
|
||||||
|
}.items() if v is not None
|
||||||
|
}) + b"\n"
|
||||||
|
try:
|
||||||
|
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, assembled)
|
||||||
|
except Exception as _ce:
|
||||||
|
print(f"[cache] set_chat (ollama_chat streaming) failed: {_ce}")
|
||||||
yield json_line.encode("utf-8") + b"\n"
|
yield json_line.encode("utf-8") + b"\n"
|
||||||
else:
|
else:
|
||||||
if use_openai:
|
if use_openai:
|
||||||
|
|
@ -1796,7 +1895,14 @@ async def chat_proxy(request: Request):
|
||||||
if hasattr(async_gen, "model_dump_json")
|
if hasattr(async_gen, "model_dump_json")
|
||||||
else orjson.dumps(async_gen)
|
else orjson.dumps(async_gen)
|
||||||
)
|
)
|
||||||
yield json_line.encode("utf-8") + b"\n"
|
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||||
|
yield cache_bytes
|
||||||
|
# Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
|
||||||
|
if _cache is not None and not _is_moe:
|
||||||
|
try:
|
||||||
|
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
|
||||||
|
except Exception as _ce:
|
||||||
|
print(f"[cache] set_chat (ollama_chat non-streaming) failed: {_ce}")
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
# Ensure counter is decremented even if an exception occurs
|
# Ensure counter is decremented even if an exception occurs
|
||||||
|
|
@ -2680,6 +2786,21 @@ async def openai_chat_completions_proxy(request: Request):
|
||||||
except orjson.JSONDecodeError as e:
|
except orjson.JSONDecodeError as e:
|
||||||
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
||||||
|
|
||||||
|
# Cache lookup — before endpoint selection
|
||||||
|
_cache = get_llm_cache()
|
||||||
|
if _cache is not None:
|
||||||
|
_cached = await _cache.get_chat("openai_chat", model, messages)
|
||||||
|
if _cached is not None:
|
||||||
|
if stream:
|
||||||
|
_sse = openai_nonstream_to_sse(_cached, model)
|
||||||
|
async def _serve_cached_ochat_stream():
|
||||||
|
yield _sse
|
||||||
|
return StreamingResponse(_serve_cached_ochat_stream(), media_type="text/event-stream")
|
||||||
|
else:
|
||||||
|
async def _serve_cached_ochat_json():
|
||||||
|
yield _cached
|
||||||
|
return StreamingResponse(_serve_cached_ochat_json(), media_type="application/json")
|
||||||
|
|
||||||
# 2. Endpoint logic
|
# 2. Endpoint logic
|
||||||
endpoint, tracking_model = await choose_endpoint(model)
|
endpoint, tracking_model = await choose_endpoint(model)
|
||||||
base_url = ep2base(endpoint)
|
base_url = ep2base(endpoint)
|
||||||
|
|
@ -2699,6 +2820,8 @@ async def openai_chat_completions_proxy(request: Request):
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
if stream == True:
|
if stream == True:
|
||||||
|
content_parts: list[str] = []
|
||||||
|
usage_snapshot: dict = {}
|
||||||
async for chunk in async_gen:
|
async for chunk in async_gen:
|
||||||
data = (
|
data = (
|
||||||
chunk.model_dump_json()
|
chunk.model_dump_json()
|
||||||
|
|
@ -2715,6 +2838,8 @@ async def openai_chat_completions_proxy(request: Request):
|
||||||
has_tool_calls = getattr(delta, "tool_calls", None) is not None
|
has_tool_calls = getattr(delta, "tool_calls", None) is not None
|
||||||
if has_content or has_reasoning or has_tool_calls:
|
if has_content or has_reasoning or has_tool_calls:
|
||||||
yield f"data: {data}\n\n".encode("utf-8")
|
yield f"data: {data}\n\n".encode("utf-8")
|
||||||
|
if has_content and delta.content:
|
||||||
|
content_parts.append(delta.content)
|
||||||
elif chunk.usage is not None:
|
elif chunk.usage is not None:
|
||||||
# Forward the usage-only final chunk (e.g. from llama-server)
|
# Forward the usage-only final chunk (e.g. from llama-server)
|
||||||
yield f"data: {data}\n\n".encode("utf-8")
|
yield f"data: {data}\n\n".encode("utf-8")
|
||||||
|
|
@ -2723,12 +2848,24 @@ async def openai_chat_completions_proxy(request: Request):
|
||||||
if chunk.usage is not None:
|
if chunk.usage is not None:
|
||||||
prompt_tok = chunk.usage.prompt_tokens or 0
|
prompt_tok = chunk.usage.prompt_tokens or 0
|
||||||
comp_tok = chunk.usage.completion_tokens or 0
|
comp_tok = chunk.usage.completion_tokens or 0
|
||||||
|
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
|
||||||
else:
|
else:
|
||||||
llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
|
llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
|
||||||
if llama_usage:
|
if llama_usage:
|
||||||
prompt_tok, comp_tok = llama_usage
|
prompt_tok, comp_tok = llama_usage
|
||||||
if prompt_tok != 0 or comp_tok != 0:
|
if prompt_tok != 0 or comp_tok != 0:
|
||||||
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
|
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
|
||||||
|
# Cache assembled streaming response — before [DONE] so it always runs
|
||||||
|
if _cache is not None and content_parts:
|
||||||
|
assembled = orjson.dumps({
|
||||||
|
"model": model,
|
||||||
|
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
|
||||||
|
**({"usage": usage_snapshot} if usage_snapshot else {}),
|
||||||
|
}) + b"\n"
|
||||||
|
try:
|
||||||
|
await _cache.set_chat("openai_chat", model, messages, assembled)
|
||||||
|
except Exception as _ce:
|
||||||
|
print(f"[cache] set_chat (openai_chat streaming) failed: {_ce}")
|
||||||
yield b"data: [DONE]\n\n"
|
yield b"data: [DONE]\n\n"
|
||||||
else:
|
else:
|
||||||
prompt_tok = 0
|
prompt_tok = 0
|
||||||
|
|
@ -2747,7 +2884,14 @@ async def openai_chat_completions_proxy(request: Request):
|
||||||
if hasattr(async_gen, "model_dump_json")
|
if hasattr(async_gen, "model_dump_json")
|
||||||
else orjson.dumps(async_gen)
|
else orjson.dumps(async_gen)
|
||||||
)
|
)
|
||||||
yield json_line.encode("utf-8") + b"\n"
|
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||||
|
yield cache_bytes
|
||||||
|
# Cache non-streaming response
|
||||||
|
if _cache is not None:
|
||||||
|
try:
|
||||||
|
await _cache.set_chat("openai_chat", model, messages, cache_bytes)
|
||||||
|
except Exception as _ce:
|
||||||
|
print(f"[cache] set_chat (openai_chat non-streaming) failed: {_ce}")
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
# Ensure counter is decremented even if an exception occurs
|
# Ensure counter is decremented even if an exception occurs
|
||||||
|
|
@ -2823,6 +2967,22 @@ async def openai_completions_proxy(request: Request):
|
||||||
except orjson.JSONDecodeError as e:
|
except orjson.JSONDecodeError as e:
|
||||||
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
||||||
|
|
||||||
|
# Cache lookup — completions prompt mapped to a single-turn messages list
|
||||||
|
_cache = get_llm_cache()
|
||||||
|
_compl_messages = [{"role": "user", "content": prompt}]
|
||||||
|
if _cache is not None:
|
||||||
|
_cached = await _cache.get_chat("openai_completions", model, _compl_messages)
|
||||||
|
if _cached is not None:
|
||||||
|
if stream:
|
||||||
|
_sse = openai_nonstream_to_sse(_cached, model)
|
||||||
|
async def _serve_cached_ocompl_stream():
|
||||||
|
yield _sse
|
||||||
|
return StreamingResponse(_serve_cached_ocompl_stream(), media_type="text/event-stream")
|
||||||
|
else:
|
||||||
|
async def _serve_cached_ocompl_json():
|
||||||
|
yield _cached
|
||||||
|
return StreamingResponse(_serve_cached_ocompl_json(), media_type="application/json")
|
||||||
|
|
||||||
# 2. Endpoint logic
|
# 2. Endpoint logic
|
||||||
endpoint, tracking_model = await choose_endpoint(model)
|
endpoint, tracking_model = await choose_endpoint(model)
|
||||||
base_url = ep2base(endpoint)
|
base_url = ep2base(endpoint)
|
||||||
|
|
@ -2834,6 +2994,8 @@ async def openai_completions_proxy(request: Request):
|
||||||
# The chat method returns a generator of dicts (or GenerateResponse)
|
# The chat method returns a generator of dicts (or GenerateResponse)
|
||||||
async_gen = await oclient.completions.create(**params)
|
async_gen = await oclient.completions.create(**params)
|
||||||
if stream == True:
|
if stream == True:
|
||||||
|
text_parts: list[str] = []
|
||||||
|
usage_snapshot: dict = {}
|
||||||
async for chunk in async_gen:
|
async for chunk in async_gen:
|
||||||
data = (
|
data = (
|
||||||
chunk.model_dump_json()
|
chunk.model_dump_json()
|
||||||
|
|
@ -2849,6 +3011,8 @@ async def openai_completions_proxy(request: Request):
|
||||||
)
|
)
|
||||||
if has_text or has_reasoning or choice.finish_reason is not None:
|
if has_text or has_reasoning or choice.finish_reason is not None:
|
||||||
yield f"data: {data}\n\n".encode("utf-8")
|
yield f"data: {data}\n\n".encode("utf-8")
|
||||||
|
if has_text and choice.text:
|
||||||
|
text_parts.append(choice.text)
|
||||||
elif chunk.usage is not None:
|
elif chunk.usage is not None:
|
||||||
# Forward the usage-only final chunk (e.g. from llama-server)
|
# Forward the usage-only final chunk (e.g. from llama-server)
|
||||||
yield f"data: {data}\n\n".encode("utf-8")
|
yield f"data: {data}\n\n".encode("utf-8")
|
||||||
|
|
@ -2857,12 +3021,24 @@ async def openai_completions_proxy(request: Request):
|
||||||
if chunk.usage is not None:
|
if chunk.usage is not None:
|
||||||
prompt_tok = chunk.usage.prompt_tokens or 0
|
prompt_tok = chunk.usage.prompt_tokens or 0
|
||||||
comp_tok = chunk.usage.completion_tokens or 0
|
comp_tok = chunk.usage.completion_tokens or 0
|
||||||
|
usage_snapshot = {"prompt_tokens": prompt_tok, "completion_tokens": comp_tok, "total_tokens": prompt_tok + comp_tok}
|
||||||
else:
|
else:
|
||||||
llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
|
llama_usage = rechunk.extract_usage_from_llama_timings(chunk)
|
||||||
if llama_usage:
|
if llama_usage:
|
||||||
prompt_tok, comp_tok = llama_usage
|
prompt_tok, comp_tok = llama_usage
|
||||||
if prompt_tok != 0 or comp_tok != 0:
|
if prompt_tok != 0 or comp_tok != 0:
|
||||||
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
|
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
|
||||||
|
# Cache assembled streaming response — before [DONE] so it always runs
|
||||||
|
if _cache is not None and text_parts:
|
||||||
|
assembled = orjson.dumps({
|
||||||
|
"model": model,
|
||||||
|
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
|
||||||
|
**({"usage": usage_snapshot} if usage_snapshot else {}),
|
||||||
|
}) + b"\n"
|
||||||
|
try:
|
||||||
|
await _cache.set_chat("openai_completions", model, _compl_messages, assembled)
|
||||||
|
except Exception as _ce:
|
||||||
|
print(f"[cache] set_chat (openai_completions streaming) failed: {_ce}")
|
||||||
# Final DONE event
|
# Final DONE event
|
||||||
yield b"data: [DONE]\n\n"
|
yield b"data: [DONE]\n\n"
|
||||||
else:
|
else:
|
||||||
|
|
@ -2882,7 +3058,14 @@ async def openai_completions_proxy(request: Request):
|
||||||
if hasattr(async_gen, "model_dump_json")
|
if hasattr(async_gen, "model_dump_json")
|
||||||
else orjson.dumps(async_gen)
|
else orjson.dumps(async_gen)
|
||||||
)
|
)
|
||||||
yield json_line.encode("utf-8") + b"\n"
|
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||||
|
yield cache_bytes
|
||||||
|
# Cache non-streaming response
|
||||||
|
if _cache is not None:
|
||||||
|
try:
|
||||||
|
await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
|
||||||
|
except Exception as _ce:
|
||||||
|
print(f"[cache] set_chat (openai_completions non-streaming) failed: {_ce}")
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
# Ensure counter is decremented even if an exception occurs
|
# Ensure counter is decremented even if an exception occurs
|
||||||
|
|
@ -3076,6 +3259,28 @@ async def rerank_proxy(request: Request):
|
||||||
finally:
|
finally:
|
||||||
await decrement_usage(endpoint, tracking_model)
|
await decrement_usage(endpoint, tracking_model)
|
||||||
|
|
||||||
|
# -------------------------------------------------------------
|
||||||
|
# 25b. Cache management endpoints
|
||||||
|
# -------------------------------------------------------------
|
||||||
|
@app.get("/api/cache/stats")
async def cache_stats():
    """Report the state of the LLM response cache.

    Returns:
        ``{"enabled": False}`` when no cache instance is configured;
        otherwise ``{"enabled": True}`` merged with whatever mapping the
        cache's ``stats()`` method returns (keys from ``stats()`` win on
        collision, since they are unpacked last).
    """
    llm_cache = get_llm_cache()
    if llm_cache is not None:
        cache_metrics = llm_cache.stats()
        return {"enabled": True, **cache_metrics}
    # No cache configured: report the disabled state without any counters.
    return {"enabled": False}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/cache/invalidate")
async def cache_invalidate():
    """Empty the LLM response cache, if one is configured.

    Awaits the cache's ``clear()`` coroutine when a cache instance exists.
    NOTE(review): whether ``clear()`` also resets hit/miss counters depends
    on the cache implementation, which is not visible here.

    Returns:
        ``{"enabled": True, "cleared": True}`` after a successful clear, or
        ``{"enabled": False, "cleared": False}`` when caching is disabled.
    """
    llm_cache = get_llm_cache()
    cache_active = llm_cache is not None
    if cache_active:
        await llm_cache.clear()
    # Both flags track the same condition: a clear happens exactly when a
    # cache instance is available.
    return {"enabled": cache_active, "cleared": cache_active}
|
||||||
|
|
||||||
|
|
||||||
# -------------------------------------------------------------
|
# -------------------------------------------------------------
|
||||||
# 26. Serve the static front‑end
|
# 26. Serve the static front‑end
|
||||||
# -------------------------------------------------------------
|
# -------------------------------------------------------------
|
||||||
|
|
@ -3211,6 +3416,7 @@ async def startup_event() -> None:
|
||||||
app_state["session"] = session
|
app_state["session"] = session
|
||||||
token_worker_task = asyncio.create_task(token_worker())
|
token_worker_task = asyncio.create_task(token_worker())
|
||||||
flush_task = asyncio.create_task(flush_buffer())
|
flush_task = asyncio.create_task(flush_buffer())
|
||||||
|
await init_llm_cache(config)
|
||||||
|
|
||||||
@app.on_event("shutdown")
|
@app.on_event("shutdown")
|
||||||
async def shutdown_event() -> None:
|
async def shutdown_event() -> None:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue