fix: improvements, fixes and opt-in cache
doc: semantic-cache.md added with detailed write-up
This commit is contained in:
parent
a5108486e3
commit
fbdc73eebb
4 changed files with 598 additions and 18 deletions
154
cache.py
154
cache.py
|
|
@ -14,10 +14,16 @@ Strategy:
|
|||
- MOE models (moe-*) always bypass the cache.
|
||||
- Token counts are never recorded for cache hits.
|
||||
- Streaming cache hits are served as a single-chunk response.
|
||||
- Privacy protection: responses that echo back user-identifying tokens from the system
|
||||
prompt (names, emails, IDs) are stored WITHOUT an embedding. They remain findable
|
||||
by exact-match for the same user but are invisible to cross-user semantic search.
|
||||
Generic responses (capital of France → "Paris") keep their embeddings and can still
|
||||
produce cross-user semantic hits as intended.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import math
|
||||
import re
|
||||
import time
|
||||
import warnings
|
||||
from collections import Counter
|
||||
|
|
@ -162,6 +168,111 @@ class LLMCache:
|
|||
from semantic_llm_cache.utils import hash_prompt, normalize_prompt
|
||||
return hash_prompt(normalize_prompt(last_user), namespace)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Privacy helpers — prevent cross-user leakage of personal data
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
_IDENTITY_STOPWORDS = frozenset({
|
||||
"user", "users", "name", "names", "email", "phone", "their", "they",
|
||||
"this", "that", "with", "from", "have", "been", "also", "more",
|
||||
"tags", "identity", "preference", "context",
|
||||
})
|
||||
# Patterns that unambiguously signal personal data in a response
|
||||
_EMAIL_RE = re.compile(r'\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b')
|
||||
_UUID_RE = re.compile(
|
||||
r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# Standalone numeric run ≥ 8 digits (common user/account IDs)
|
||||
_NUMERIC_ID_RE = re.compile(r'\b\d{8,}\b')
|
||||
|
||||
def _extract_response_content(self, response_bytes: bytes) -> str:
|
||||
"""Parse response bytes (Ollama or OpenAI format) and return the text content."""
|
||||
try:
|
||||
import orjson
|
||||
data = orjson.loads(response_bytes)
|
||||
if "choices" in data: # OpenAI ChatCompletion
|
||||
return (data["choices"][0].get("message") or {}).get("content", "")
|
||||
if "message" in data: # Ollama chat
|
||||
return (data.get("message") or {}).get("content", "")
|
||||
if "response" in data: # Ollama generate
|
||||
return data.get("response", "")
|
||||
except Exception:
|
||||
pass
|
||||
return ""
|
||||
|
||||
def _extract_personal_tokens(self, system: str) -> frozenset[str]:
|
||||
"""
|
||||
Extract user-identifying tokens from the system prompt.
|
||||
|
||||
Looks for:
|
||||
- Email addresses anywhere in the system prompt
|
||||
- Numeric IDs (≥ 4 digits) appearing after "id" keyword
|
||||
- Proper-noun-like words from [Tags: identity] lines
|
||||
"""
|
||||
if not system:
|
||||
return frozenset()
|
||||
|
||||
tokens: set[str] = set()
|
||||
|
||||
# Email addresses
|
||||
for email in self._EMAIL_RE.findall(system):
|
||||
tokens.add(email.lower())
|
||||
|
||||
# Numeric IDs: "id: 1234", "id=5678"
|
||||
for uid in re.findall(r'\bid\s*[=:]?\s*(\d{4,})\b', system, re.IGNORECASE):
|
||||
tokens.add(uid)
|
||||
|
||||
# Values from [Tags: identity] lines (e.g. "User's name is Andreas")
|
||||
for line in re.findall(
|
||||
r'\[Tags:.*?identity.*?\]\s*(.+?)(?:\n|$)', system, re.IGNORECASE
|
||||
):
|
||||
for word in re.findall(r'\b\w{4,}\b', line):
|
||||
w = word.lower()
|
||||
if w not in self._IDENTITY_STOPWORDS:
|
||||
tokens.add(w)
|
||||
|
||||
return frozenset(tokens)
|
||||
|
||||
def _response_is_personalized(self, response_bytes: bytes, system: str) -> bool:
|
||||
"""
|
||||
Return True if the response contains user-personal information.
|
||||
|
||||
Two complementary checks:
|
||||
|
||||
1. Direct PII detection in the response content — emails, UUIDs, long
|
||||
numeric IDs. Catches data retrieved at runtime via tool calls that
|
||||
never appears in the system prompt.
|
||||
|
||||
2. System-prompt token overlap — words extracted from [Tags: identity]
|
||||
lines that reappear verbatim in the response (catches names, etc.).
|
||||
|
||||
Such responses are stored WITHOUT a semantic embedding so they are
|
||||
invisible to cross-user semantic search while still being cacheable for
|
||||
the same user via exact-match.
|
||||
"""
|
||||
content = self._extract_response_content(response_bytes)
|
||||
if not content:
|
||||
# Can't parse → err on the side of caution
|
||||
return bool(response_bytes)
|
||||
|
||||
# 1. Direct PII patterns — independent of what's in the system prompt
|
||||
if (
|
||||
self._EMAIL_RE.search(content)
|
||||
or self._UUID_RE.search(content)
|
||||
or self._NUMERIC_ID_RE.search(content)
|
||||
):
|
||||
return True
|
||||
|
||||
# 2. System-prompt identity tokens echoed back in the response
|
||||
personal = self._extract_personal_tokens(system)
|
||||
if personal:
|
||||
content_lower = content.lower()
|
||||
if any(token in content_lower for token in personal):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _parse_messages(
|
||||
self, messages: list[dict]
|
||||
) -> tuple[str, list[dict], str]:
|
||||
|
|
@ -242,10 +353,17 @@ class LLMCache:
|
|||
ns = self._namespace(route, model, system)
|
||||
key = self._cache_key(ns, last_user)
|
||||
|
||||
print(
|
||||
f"[cache] get_chat route={route} model={model} ns={ns} "
|
||||
f"prompt={last_user[:80]!r} "
|
||||
f"system_snippet={system[:120]!r}"
|
||||
)
|
||||
|
||||
# 1. Exact key match
|
||||
entry = await self._backend.get(key)
|
||||
if entry is not None:
|
||||
self._hits += 1
|
||||
print(f"[cache] HIT (exact) ns={ns} prompt={last_user[:80]!r}")
|
||||
return entry.response # type: ignore[return-value]
|
||||
|
||||
# 2. Semantic similarity match
|
||||
|
|
@ -255,11 +373,16 @@ class LLMCache:
|
|||
emb, threshold=self._cfg.cache_similarity, namespace=ns
|
||||
)
|
||||
if result is not None:
|
||||
_, matched, _ = result
|
||||
_, matched, sim = result
|
||||
self._hits += 1
|
||||
print(
|
||||
f"[cache] HIT (semantic sim={sim:.3f}) ns={ns} "
|
||||
f"prompt={last_user[:80]!r} matched={matched.prompt[:80]!r}"
|
||||
)
|
||||
return matched.response # type: ignore[return-value]
|
||||
|
||||
self._misses += 1
|
||||
print(f"[cache] MISS ns={ns} prompt={last_user[:80]!r}")
|
||||
return None
|
||||
|
||||
async def set_chat(
|
||||
|
|
@ -276,9 +399,36 @@ class LLMCache:
|
|||
ns = self._namespace(route, model, system)
|
||||
key = self._cache_key(ns, last_user)
|
||||
|
||||
# Privacy guard: check whether the response contains personal data.
|
||||
personalized = self._response_is_personalized(response_bytes, system)
|
||||
|
||||
if personalized:
|
||||
# Exact-match is only safe when the system prompt is user-specific
|
||||
# (i.e. different per user → different namespace). When the system
|
||||
# prompt is generic and shared across all users, the namespace is the
|
||||
# same for everyone: storing even without an embedding would let any
|
||||
# user who asks the identical question get another user's personal data
|
||||
# via exact-match. In that case skip storage entirely.
|
||||
system_is_user_specific = bool(self._extract_personal_tokens(system))
|
||||
if not system_is_user_specific:
|
||||
print(
|
||||
f"[cache] SKIP personalized response with generic system prompt "
|
||||
f"route={route} model={model} ns={ns} prompt={last_user[:80]!r} "
|
||||
f"system_snippet={system[:120]!r}"
|
||||
)
|
||||
return
|
||||
|
||||
print(
|
||||
f"[cache] set_chat route={route} model={model} ns={ns} "
|
||||
f"personalized={personalized} "
|
||||
f"prompt={last_user[:80]!r} "
|
||||
f"system_snippet={system[:120]!r}"
|
||||
)
|
||||
# Store without embedding when personalized (invisible to semantic search
|
||||
# across users, but still reachable by exact-match within this namespace).
|
||||
emb = (
|
||||
await self._build_embedding(history, last_user)
|
||||
if self._semantic and self._cfg.cache_similarity < 1.0
|
||||
if self._semantic and self._cfg.cache_similarity < 1.0 and not personalized
|
||||
else None
|
||||
)
|
||||
|
||||
|
|
|
|||
375
doc/semantic-cache.md
Normal file
375
doc/semantic-cache.md
Normal file
|
|
@ -0,0 +1,375 @@
|
|||
# Semantic Cache
|
||||
|
||||
NOMYO Router includes a built-in LLM semantic cache that can dramatically reduce inference costs and latency by reusing previous responses — either via exact-match or by finding semantically equivalent questions.
|
||||
|
||||
## How It Works
|
||||
|
||||
Every incoming request is checked against the cache before being forwarded to an LLM endpoint. On a cache hit the stored response is returned immediately, bypassing inference entirely.
|
||||
|
||||
Cache isolation is strict: each combination of **route + model + system prompt** gets its own namespace (a 16-character SHA-256 prefix). A question asked under one system prompt can never leak into a different user's or route's namespace.
|
||||
|
||||
Two lookup strategies are available:
|
||||
|
||||
| Mode | How it matches | When to use |
|
||||
|---|---|---|
|
||||
| **Exact match** | Normalized text hash | When you need 100% identical answers, zero false positives |
|
||||
| **Semantic match** | Cosine similarity of sentence embeddings | When questions are paraphrased or phrased differently |
|
||||
|
||||
In semantic mode the embedding is a weighted average of:
|
||||
- The **last user message** (70% by default) — captures what is being asked
|
||||
- A **BM25-weighted summary of the chat history** (30% by default) — provides conversation context
|
||||
|
||||
This means "What is the capital of France?" and "Can you tell me the capital city of France?" will hit the same cache entry, but a question in a different conversation context will not.
|
||||
|
||||
### MoE Model Bypass
|
||||
|
||||
Models with names starting with `moe-` always bypass the cache entirely. Mixture-of-Experts models produce non-deterministic outputs by design and are not suitable for response caching.
|
||||
|
||||
### Privacy Protection
|
||||
|
||||
Responses that contain personally identifiable information (emails, UUIDs, numeric IDs ≥ 8 digits, or tokens extracted from `[Tags: identity]` lines in the system prompt) are stored **without a semantic embedding**. They remain reachable via exact-match within the same user-specific namespace but are invisible to cross-user semantic search.
|
||||
|
||||
If a personalized response is generated with a generic (shared) system prompt, it is skipped entirely — never stored.
|
||||
|
||||
---
|
||||
|
||||
## Docker Image Variants
|
||||
|
||||
| Tag | Semantic cache | Approx. image size | Includes |
|
||||
|---|---|---|---|
|
||||
| `latest` | No (exact match only) | ~300 MB | Base router only |
|
||||
| `latest-semantic` | Yes | ~800 MB | `sentence-transformers`, `torch`, `all-MiniLM-L6-v2` model baked in |
|
||||
|
||||
Use the **`latest`** image when you only need exact-match deduplication. It has no heavy ML dependencies.
|
||||
|
||||
Use **`latest-semantic`** when you want to catch paraphrased or semantically equivalent questions. The `all-MiniLM-L6-v2` model is baked into the image at build time so no internet access is needed at runtime.
|
||||
|
||||
> **Note:** If you set `cache_similarity < 1.0` but use the `latest` image, the router falls back to exact-match caching and logs a warning. It will not error.
|
||||
|
||||
Build locally:
|
||||
|
||||
```bash
|
||||
# Lean image (exact match only)
|
||||
docker build -t nomyo-router .
|
||||
|
||||
# Semantic image (~500 MB larger)
|
||||
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dependencies
|
||||
|
||||
### Lean image / bare-metal (exact match)
|
||||
|
||||
No extra dependencies beyond the base `requirements.txt`. The `semantic-llm-cache` library provides exact-match storage with a no-op embedding provider.
|
||||
|
||||
### Semantic image / bare-metal (semantic match)
|
||||
|
||||
Additional heavy packages are required:
|
||||
|
||||
| Package | Purpose | Approx. size |
|
||||
|---|---|---|
|
||||
| `sentence-transformers` | Sentence embedding model wrapper | ~100 MB |
|
||||
| `torch` (CPU) | PyTorch inference backend | ~700 MB |
|
||||
| `numpy` | Vector math for weighted mean embeddings | ~20 MB |
|
||||
|
||||
Install for bare-metal semantic mode:
|
||||
|
||||
```bash
|
||||
pip install sentence-transformers torch --index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
The `all-MiniLM-L6-v2` model (~90 MB) is downloaded on first use (or baked in when using the `:semantic` Docker image).
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
All cache settings live in `config.yaml` and can be overridden with environment variables prefixed `NOMYO_ROUTER_`.
|
||||
|
||||
### Minimal — enable exact-match with in-memory backend
|
||||
|
||||
```yaml
|
||||
# config.yaml
|
||||
cache_enabled: true
|
||||
```
|
||||
|
||||
### Full reference
|
||||
|
||||
```yaml
|
||||
# config.yaml
|
||||
|
||||
# Master switch — set to true to enable the cache
|
||||
cache_enabled: false
|
||||
|
||||
# Storage backend: "memory" | "sqlite" | "redis"
|
||||
# memory — fast, in-process, lost on restart
|
||||
# sqlite — persistent single-file, good for single-instance deployments
|
||||
# redis — persistent, shared across multiple router instances
|
||||
cache_backend: memory
|
||||
|
||||
# Cosine similarity threshold for semantic matching
|
||||
# 1.0 = exact match only (no sentence-transformers required)
|
||||
# 0.95 = very close paraphrases only (recommended starting point)
|
||||
# 0.85 = broader matching, higher false-positive risk
|
||||
# Requires :semantic Docker image (or sentence-transformers installed)
|
||||
cache_similarity: 1.0
|
||||
|
||||
# Time-to-live in seconds; null or omit to cache forever
|
||||
cache_ttl: 3600
|
||||
|
||||
# SQLite backend: path to cache database file
|
||||
cache_db_path: llm_cache.db
|
||||
|
||||
# Redis backend: connection URL
|
||||
cache_redis_url: redis://localhost:6379/0
|
||||
|
||||
# Weight of chat-history embedding in the combined query vector (0.0–1.0)
|
||||
# 0.3 = 30% history context, 70% last user message (default)
|
||||
# Increase if your use case is highly conversation-dependent
|
||||
cache_history_weight: 0.3
|
||||
```
|
||||
|
||||
### Environment variable overrides
|
||||
|
||||
```bash
|
||||
NOMYO_ROUTER_CACHE_ENABLED=true
|
||||
NOMYO_ROUTER_CACHE_BACKEND=sqlite
|
||||
NOMYO_ROUTER_CACHE_SIMILARITY=0.95
|
||||
NOMYO_ROUTER_CACHE_TTL=7200
|
||||
NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db
|
||||
NOMYO_ROUTER_CACHE_REDIS_URL=redis://redis:6379/0
|
||||
NOMYO_ROUTER_CACHE_HISTORY_WEIGHT=0.3
|
||||
```
|
||||
|
||||
### Per-request opt-in
|
||||
|
||||
The cache is opted in per-request via the `nomyo.cache` field in the request body. The global `cache_enabled` setting must also be `true`.
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama3.2",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||
"nomyo": {
|
||||
"cache": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Backend Comparison
|
||||
|
||||
| | `memory` | `sqlite` | `redis` |
|
||||
|---|---|---|---|
|
||||
| Persistence | No (lost on restart) | Yes | Yes |
|
||||
| Multi-instance | No | No | Yes |
|
||||
| Setup required | None | None | Redis server |
|
||||
| Recommended for | Development, testing | Single-instance production | Multi-instance / HA production |
|
||||
|
||||
### SQLite — persistent single-instance
|
||||
|
||||
```yaml
|
||||
cache_enabled: true
|
||||
cache_backend: sqlite
|
||||
cache_db_path: /data/llm_cache.db
|
||||
cache_ttl: 86400 # 24 hours
|
||||
```
|
||||
|
||||
Mount the path in Docker:
|
||||
|
||||
```bash
|
||||
docker run -d \
|
||||
-v /path/to/data:/data \
|
||||
-e NOMYO_ROUTER_CACHE_ENABLED=true \
|
||||
-e NOMYO_ROUTER_CACHE_BACKEND=sqlite \
|
||||
-e NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db \
|
||||
ghcr.io/nomyo-ai/nomyo-router:latest
|
||||
```
|
||||
|
||||
### Redis — distributed / multi-instance
|
||||
|
||||
```yaml
|
||||
cache_enabled: true
|
||||
cache_backend: redis
|
||||
cache_redis_url: redis://redis:6379/0
|
||||
cache_similarity: 0.95
|
||||
cache_ttl: 3600
|
||||
```
|
||||
|
||||
Docker Compose example:
|
||||
|
||||
```yaml
|
||||
services:
|
||||
nomyo-router:
|
||||
image: ghcr.io/nomyo-ai/nomyo-router:latest-semantic
|
||||
ports:
|
||||
- "12434:12434"
|
||||
environment:
|
||||
NOMYO_ROUTER_CACHE_ENABLED: "true"
|
||||
NOMYO_ROUTER_CACHE_BACKEND: redis
|
||||
NOMYO_ROUTER_CACHE_REDIS_URL: redis://redis:6379/0
|
||||
NOMYO_ROUTER_CACHE_SIMILARITY: "0.95"
|
||||
depends_on:
|
||||
- redis
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
|
||||
volumes:
|
||||
redis_data:
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Code Examples
|
||||
|
||||
### Python (OpenAI SDK)
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:12434/v1",
|
||||
api_key="unused",
|
||||
)
|
||||
|
||||
# First call — cache miss, hits the LLM
|
||||
response = client.chat.completions.create(
|
||||
model="llama3.2",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
extra_body={"nomyo": {"cache": True}},
|
||||
)
|
||||
print(response.choices[0].message.content)
|
||||
|
||||
# Second call (exact same question) — cache hit, returned instantly
|
||||
response = client.chat.completions.create(
|
||||
model="llama3.2",
|
||||
messages=[{"role": "user", "content": "What is the capital of France?"}],
|
||||
extra_body={"nomyo": {"cache": True}},
|
||||
)
|
||||
print(response.choices[0].message.content)
|
||||
|
||||
# With semantic mode enabled, this also hits the cache:
|
||||
response = client.chat.completions.create(
|
||||
model="llama3.2",
|
||||
messages=[{"role": "user", "content": "Tell me the capital city of France"}],
|
||||
extra_body={"nomyo": {"cache": True}},
|
||||
)
|
||||
print(response.choices[0].message.content)
|
||||
```
|
||||
|
||||
### Python (raw HTTP / httpx)
|
||||
|
||||
```python
|
||||
import httpx
|
||||
|
||||
payload = {
|
||||
"model": "llama3.2",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Explain quantum entanglement in one sentence."},
|
||||
],
|
||||
"nomyo": {"cache": True},
|
||||
}
|
||||
|
||||
resp = httpx.post("http://localhost:12434/v1/chat/completions", json=payload)
|
||||
print(resp.json()["choices"][0]["message"]["content"])
|
||||
```
|
||||
|
||||
### curl
|
||||
|
||||
```bash
|
||||
curl -s http://localhost:12434/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "llama3.2",
|
||||
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
||||
"nomyo": {"cache": true}
|
||||
}' | jq .choices[0].message.content
|
||||
```
|
||||
|
||||
### Check cache statistics
|
||||
|
||||
```bash
|
||||
curl -s http://localhost:12434/api/cache/stats | jq .
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"hits": 42,
|
||||
"misses": 18,
|
||||
"hit_rate": 0.7,
|
||||
"semantic": true,
|
||||
"backend": "sqlite",
|
||||
"similarity_threshold": 0.95,
|
||||
"history_weight": 0.3
|
||||
}
|
||||
```
|
||||
|
||||
### Clear the cache
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:12434/api/cache/clear
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Use Cases Where Semantic Caching Helps Most
|
||||
|
||||
### FAQ and support bots
|
||||
|
||||
Users ask the same questions in slightly different ways. "How do I reset my password?", "I forgot my password", "password reset instructions" — all semantically equivalent. Semantic caching collapses these into a single LLM call.
|
||||
|
||||
**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: sqlite` or `redis`, long TTL.
|
||||
|
||||
### Document Q&A / RAG pipelines
|
||||
|
||||
When the same document set is queried repeatedly by many users, common factual questions ("What is the contract start date?", "When does this contract begin?") hit the cache across users — as long as the system prompt is not user-personalized.
|
||||
|
||||
**Recommended settings:** `cache_similarity: 0.92`, `cache_backend: redis` for multi-user deployments.
|
||||
|
||||
### Code generation with repeated patterns
|
||||
|
||||
Development tools that generate boilerplate, explain errors, or convert code often receive nearly-identical prompts. Caching prevents re-running expensive large-model inference for the same error message phrased differently.
|
||||
|
||||
**Recommended settings:** `cache_similarity: 0.93`, `cache_backend: sqlite`.
|
||||
|
||||
### High-traffic chatbots with shared context
|
||||
|
||||
Public-facing bots where many users share the same system prompt (same product, same persona). Generic factual answers can be safely reused across users. The privacy guard ensures personalized responses are never shared.
|
||||
|
||||
**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: redis`.
|
||||
|
||||
### Batch processing / data pipelines
|
||||
|
||||
When running LLM inference over large datasets with many near-duplicate or repeated inputs, caching can reduce inference calls dramatically and make pipelines idempotent.
|
||||
|
||||
**Recommended settings:** `cache_similarity: 0.95`, `cache_backend: sqlite`, `cache_ttl: null` (cache forever for deterministic batch runs).
|
||||
|
||||
---
|
||||
|
||||
## Tuning the Similarity Threshold
|
||||
|
||||
| `cache_similarity` | Behaviour | Risk |
|
||||
|---|---|---|
|
||||
| `1.0` | Exact match only | No false positives |
|
||||
| `0.97` | Catches minor typos and punctuation differences | Very low |
|
||||
| `0.95` | Catches clear paraphrases (recommended default) | Low |
|
||||
| `0.90` | Catches broader rewordings | Moderate — may match different intents |
|
||||
| `< 0.85` | Very aggressive matching | High false-positive rate, not recommended |
|
||||
|
||||
Start at `0.95` and lower gradually while monitoring cache hit rate via `/api/cache/stats`.
|
||||
|
||||
---
|
||||
|
||||
## Limitations
|
||||
|
||||
- **Streaming responses** cached from a non-streaming request are served as a single chunk (not token-by-token). This is indistinguishable to most clients.
|
||||
- **MoE models** (`moe-*` prefix) are never cached.
|
||||
- **Token counts** are not recorded for cache hits (the LLM was not called).
|
||||
- The `memory` backend does not survive a router restart.
|
||||
- Semantic search requires the `:semantic` image (or local `sentence-transformers` install); without it, `cache_similarity < 1.0` silently falls back to exact match.
|
||||
|
|
@ -43,4 +43,4 @@ aiosqlite
|
|||
# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
|
||||
# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.:
|
||||
# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0
|
||||
semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git
|
||||
semantic-llm-cache@git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v0.1
|
||||
|
|
|
|||
83
router.py
83
router.py
|
|
@ -1600,6 +1600,7 @@ async def proxy(request: Request):
|
|||
images = payload.get("images")
|
||||
options = payload.get("options")
|
||||
keep_alive = payload.get("keep_alive")
|
||||
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
|
||||
|
||||
if not model:
|
||||
raise HTTPException(
|
||||
|
|
@ -1615,7 +1616,7 @@ async def proxy(request: Request):
|
|||
|
||||
# Cache lookup — before endpoint selection so no slot is wasted on a hit
|
||||
_cache = get_llm_cache()
|
||||
if _cache is not None:
|
||||
if _cache is not None and _cache_enabled:
|
||||
_cached = await _cache.get_generate(model, prompt, system or "")
|
||||
if _cached is not None:
|
||||
async def _serve_cached_generate():
|
||||
|
|
@ -1671,7 +1672,7 @@ async def proxy(request: Request):
|
|||
else:
|
||||
json_line = orjson.dumps(chunk)
|
||||
# Accumulate and store cache on done chunk — before yield so it always runs
|
||||
if _cache is not None:
|
||||
if _cache is not None and _cache_enabled:
|
||||
if getattr(chunk, "response", None):
|
||||
content_parts.append(chunk.response)
|
||||
if getattr(chunk, "done", False):
|
||||
|
|
@ -1710,7 +1711,7 @@ async def proxy(request: Request):
|
|||
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||
yield cache_bytes
|
||||
# Cache non-streaming response
|
||||
if _cache is not None:
|
||||
if _cache is not None and _cache_enabled:
|
||||
try:
|
||||
await _cache.set_generate(model, prompt, system or "", cache_bytes)
|
||||
except Exception as _ce:
|
||||
|
|
@ -1749,6 +1750,7 @@ async def chat_proxy(request: Request):
|
|||
options = payload.get("options")
|
||||
logprobs = payload.get("logprobs")
|
||||
top_logprobs = payload.get("top_logprobs")
|
||||
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
|
||||
|
||||
if not model:
|
||||
raise HTTPException(
|
||||
|
|
@ -1775,7 +1777,7 @@ async def chat_proxy(request: Request):
|
|||
# Snapshot original messages before any OpenAI-format transformation so that
|
||||
# get_chat and set_chat always use the same key regardless of backend type.
|
||||
_cache_messages = messages
|
||||
if _cache is not None and not _is_moe:
|
||||
if _cache is not None and not _is_moe and _cache_enabled:
|
||||
_cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
|
||||
if _cached is not None:
|
||||
async def _serve_cached_chat():
|
||||
|
|
@ -1858,7 +1860,7 @@ async def chat_proxy(request: Request):
|
|||
# Accumulate and store cache on done chunk — before yield so it always runs
|
||||
# Works for both Ollama-native and OpenAI-compatible backends; chunks are
|
||||
# already converted to Ollama format by rechunk before this point.
|
||||
if _cache is not None and not _is_moe:
|
||||
if _cache is not None and not _is_moe and _cache_enabled:
|
||||
if chunk.message and getattr(chunk.message, "content", None):
|
||||
content_parts.append(chunk.message.content)
|
||||
if getattr(chunk, "done", False):
|
||||
|
|
@ -1898,7 +1900,7 @@ async def chat_proxy(request: Request):
|
|||
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||
yield cache_bytes
|
||||
# Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
|
||||
if _cache is not None and not _is_moe:
|
||||
if _cache is not None and not _is_moe and _cache_enabled:
|
||||
try:
|
||||
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
|
||||
except Exception as _ce:
|
||||
|
|
@ -2746,6 +2748,7 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
tools = payload.get("tools")
|
||||
logprobs = payload.get("logprobs")
|
||||
top_logprobs = payload.get("top_logprobs")
|
||||
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
|
||||
|
||||
if not model:
|
||||
raise HTTPException(
|
||||
|
|
@ -2786,9 +2789,20 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
except orjson.JSONDecodeError as e:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
||||
|
||||
# Reject unsupported image formats (SVG) before doing any work
|
||||
for _msg in messages:
|
||||
for _item in (_msg.get("content") or []) if isinstance(_msg.get("content"), list) else []:
|
||||
if _item.get("type") == "image_url":
|
||||
_url = (_item.get("image_url") or {}).get("url", "")
|
||||
if _url.startswith("data:image/svg") or _url.lower().endswith(".svg"):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="SVG images are not supported. Please convert the image to PNG or JPEG before sending.",
|
||||
)
|
||||
|
||||
# Cache lookup — before endpoint selection
|
||||
_cache = get_llm_cache()
|
||||
if _cache is not None:
|
||||
if _cache is not None and _cache_enabled:
|
||||
_cached = await _cache.get_chat("openai_chat", model, messages)
|
||||
if _cached is not None:
|
||||
if stream:
|
||||
|
|
@ -2806,16 +2820,56 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
base_url = ep2base(endpoint)
|
||||
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
|
||||
# 3. Async generator that streams completions data and decrements the counter
|
||||
async def _normalize_images_in_messages(msgs: list) -> list:
|
||||
"""Fetch remote image URLs and convert them to base64 data URLs so
|
||||
Ollama/llama-server can handle them without making outbound HTTP requests."""
|
||||
resolved = []
|
||||
for msg in msgs:
|
||||
content = msg.get("content")
|
||||
if not isinstance(content, list):
|
||||
resolved.append(msg)
|
||||
continue
|
||||
new_content = []
|
||||
for item in content:
|
||||
if item.get("type") == "image_url":
|
||||
url = (item.get("image_url") or {}).get("url", "")
|
||||
if url and not url.startswith("data:"):
|
||||
try:
|
||||
http: aiohttp.ClientSession = app_state["session"]
|
||||
async with http.get(url) as resp:
|
||||
ctype = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
|
||||
img_bytes = await resp.read()
|
||||
b64 = base64.b64encode(img_bytes).decode("utf-8")
|
||||
new_content.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:{ctype};base64,{b64}"}
|
||||
})
|
||||
except Exception as _ie:
|
||||
print(f"[image] Failed to fetch image URL: {_ie}")
|
||||
new_content.append(item)
|
||||
else:
|
||||
new_content.append(item)
|
||||
else:
|
||||
new_content.append(item)
|
||||
resolved.append({**msg, "content": new_content})
|
||||
return resolved
|
||||
|
||||
async def stream_ochat_response():
|
||||
try:
|
||||
# The chat method returns a generator of dicts (or GenerateResponse)
|
||||
try:
|
||||
async_gen = await oclient.chat.completions.create(**params)
|
||||
# For non-external endpoints (Ollama, llama-server), resolve remote
|
||||
# image URLs to base64 data URLs so the server can handle them locally.
|
||||
send_params = params
|
||||
if not is_ext_openai_endpoint(endpoint):
|
||||
resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
|
||||
send_params = {**params, "messages": resolved_msgs}
|
||||
async_gen = await oclient.chat.completions.create(**send_params)
|
||||
except openai.BadRequestError as e:
|
||||
# If tools are not supported by the model, retry without tools
|
||||
if "does not support tools" in str(e):
|
||||
print(f"[openai_chat_completions_proxy] Model {model} doesn't support tools, retrying without tools")
|
||||
params_without_tools = {k: v for k, v in params.items() if k != "tools"}
|
||||
params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
|
||||
async_gen = await oclient.chat.completions.create(**params_without_tools)
|
||||
else:
|
||||
raise
|
||||
|
|
@ -2856,7 +2910,7 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
if prompt_tok != 0 or comp_tok != 0:
|
||||
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
|
||||
# Cache assembled streaming response — before [DONE] so it always runs
|
||||
if _cache is not None and content_parts:
|
||||
if _cache is not None and _cache_enabled and content_parts:
|
||||
assembled = orjson.dumps({
|
||||
"model": model,
|
||||
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
|
||||
|
|
@ -2887,7 +2941,7 @@ async def openai_chat_completions_proxy(request: Request):
|
|||
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||
yield cache_bytes
|
||||
# Cache non-streaming response
|
||||
if _cache is not None:
|
||||
if _cache is not None and _cache_enabled:
|
||||
try:
|
||||
await _cache.set_chat("openai_chat", model, messages, cache_bytes)
|
||||
except Exception as _ce:
|
||||
|
|
@ -2930,6 +2984,7 @@ async def openai_completions_proxy(request: Request):
|
|||
max_tokens = payload.get("max_tokens")
|
||||
max_completion_tokens = payload.get("max_completion_tokens")
|
||||
suffix = payload.get("suffix")
|
||||
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
|
||||
|
||||
if not model:
|
||||
raise HTTPException(
|
||||
|
|
@ -2970,7 +3025,7 @@ async def openai_completions_proxy(request: Request):
|
|||
# Cache lookup — completions prompt mapped to a single-turn messages list
|
||||
_cache = get_llm_cache()
|
||||
_compl_messages = [{"role": "user", "content": prompt}]
|
||||
if _cache is not None:
|
||||
if _cache is not None and _cache_enabled:
|
||||
_cached = await _cache.get_chat("openai_completions", model, _compl_messages)
|
||||
if _cached is not None:
|
||||
if stream:
|
||||
|
|
@ -3029,7 +3084,7 @@ async def openai_completions_proxy(request: Request):
|
|||
if prompt_tok != 0 or comp_tok != 0:
|
||||
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
|
||||
# Cache assembled streaming response — before [DONE] so it always runs
|
||||
if _cache is not None and text_parts:
|
||||
if _cache is not None and _cache_enabled and text_parts:
|
||||
assembled = orjson.dumps({
|
||||
"model": model,
|
||||
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
|
||||
|
|
@ -3061,7 +3116,7 @@ async def openai_completions_proxy(request: Request):
|
|||
cache_bytes = json_line.encode("utf-8") + b"\n"
|
||||
yield cache_bytes
|
||||
# Cache non-streaming response
|
||||
if _cache is not None:
|
||||
if _cache is not None and _cache_enabled:
|
||||
try:
|
||||
await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
|
||||
except Exception as _ce:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue