fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up
This commit is contained in:
Alpha Nerd 2026-03-10 15:19:37 +01:00
parent a5108486e3
commit fbdc73eebb
4 changed files with 598 additions and 18 deletions

154
cache.py
View file

@ -14,10 +14,16 @@ Strategy:
- MOE models (moe-*) always bypass the cache. - MOE models (moe-*) always bypass the cache.
- Token counts are never recorded for cache hits. - Token counts are never recorded for cache hits.
- Streaming cache hits are served as a single-chunk response. - Streaming cache hits are served as a single-chunk response.
- Privacy protection: responses that echo back user-identifying tokens from the system
prompt (names, emails, IDs) are stored WITHOUT an embedding. They remain findable
by exact-match for the same user but are invisible to cross-user semantic search.
Generic responses (e.g. capital of France → "Paris") keep their embeddings and can still
produce cross-user semantic hits as intended.
""" """
import hashlib import hashlib
import math import math
import re
import time import time
import warnings import warnings
from collections import Counter from collections import Counter
@ -162,6 +168,111 @@ class LLMCache:
from semantic_llm_cache.utils import hash_prompt, normalize_prompt from semantic_llm_cache.utils import hash_prompt, normalize_prompt
return hash_prompt(normalize_prompt(last_user), namespace) return hash_prompt(normalize_prompt(last_user), namespace)
# ------------------------------------------------------------------
# Privacy helpers — prevent cross-user leakage of personal data
# ------------------------------------------------------------------
# Common English / schema words that appear on "[Tags: identity]" lines but
# do not identify a user; filtered out in _extract_personal_tokens so generic
# words never mark a response as personalized.
_IDENTITY_STOPWORDS = frozenset({
    "user", "users", "name", "names", "email", "phone", "their", "they",
    "this", "that", "with", "from", "have", "been", "also", "more",
    "tags", "identity", "preference", "context",
})
# Patterns that unambiguously signal personal data in a response
_EMAIL_RE = re.compile(r'\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b')
# RFC-4122-style UUID (8-4-4-4-12 hex groups), case-insensitive.
_UUID_RE = re.compile(
    r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b',
    re.IGNORECASE,
)
# Standalone numeric run ≥ 8 digits (common user/account IDs)
_NUMERIC_ID_RE = re.compile(r'\b\d{8,}\b')
def _extract_response_content(self, response_bytes: bytes) -> str:
"""Parse response bytes (Ollama or OpenAI format) and return the text content."""
try:
import orjson
data = orjson.loads(response_bytes)
if "choices" in data: # OpenAI ChatCompletion
return (data["choices"][0].get("message") or {}).get("content", "")
if "message" in data: # Ollama chat
return (data.get("message") or {}).get("content", "")
if "response" in data: # Ollama generate
return data.get("response", "")
except Exception:
pass
return ""
def _extract_personal_tokens(self, system: str) -> frozenset[str]:
"""
Extract user-identifying tokens from the system prompt.
Looks for:
- Email addresses anywhere in the system prompt
- Numeric IDs ( 4 digits) appearing after "id" keyword
- Proper-noun-like words from [Tags: identity] lines
"""
if not system:
return frozenset()
tokens: set[str] = set()
# Email addresses
for email in self._EMAIL_RE.findall(system):
tokens.add(email.lower())
# Numeric IDs: "id: 1234", "id=5678"
for uid in re.findall(r'\bid\s*[=:]?\s*(\d{4,})\b', system, re.IGNORECASE):
tokens.add(uid)
# Values from [Tags: identity] lines (e.g. "User's name is Andreas")
for line in re.findall(
r'\[Tags:.*?identity.*?\]\s*(.+?)(?:\n|$)', system, re.IGNORECASE
):
for word in re.findall(r'\b\w{4,}\b', line):
w = word.lower()
if w not in self._IDENTITY_STOPWORDS:
tokens.add(w)
return frozenset(tokens)
def _response_is_personalized(self, response_bytes: bytes, system: str) -> bool:
"""
Return True if the response contains user-personal information.
Two complementary checks:
1. Direct PII detection in the response content emails, UUIDs, long
numeric IDs. Catches data retrieved at runtime via tool calls that
never appears in the system prompt.
2. System-prompt token overlap words extracted from [Tags: identity]
lines that reappear verbatim in the response (catches names, etc.).
Such responses are stored WITHOUT a semantic embedding so they are
invisible to cross-user semantic search while still being cacheable for
the same user via exact-match.
"""
content = self._extract_response_content(response_bytes)
if not content:
# Can't parse → err on the side of caution
return bool(response_bytes)
# 1. Direct PII patterns — independent of what's in the system prompt
if (
self._EMAIL_RE.search(content)
or self._UUID_RE.search(content)
or self._NUMERIC_ID_RE.search(content)
):
return True
# 2. System-prompt identity tokens echoed back in the response
personal = self._extract_personal_tokens(system)
if personal:
content_lower = content.lower()
if any(token in content_lower for token in personal):
return True
return False
def _parse_messages( def _parse_messages(
self, messages: list[dict] self, messages: list[dict]
) -> tuple[str, list[dict], str]: ) -> tuple[str, list[dict], str]:
@ -242,10 +353,17 @@ class LLMCache:
ns = self._namespace(route, model, system) ns = self._namespace(route, model, system)
key = self._cache_key(ns, last_user) key = self._cache_key(ns, last_user)
print(
f"[cache] get_chat route={route} model={model} ns={ns} "
f"prompt={last_user[:80]!r} "
f"system_snippet={system[:120]!r}"
)
# 1. Exact key match # 1. Exact key match
entry = await self._backend.get(key) entry = await self._backend.get(key)
if entry is not None: if entry is not None:
self._hits += 1 self._hits += 1
print(f"[cache] HIT (exact) ns={ns} prompt={last_user[:80]!r}")
return entry.response # type: ignore[return-value] return entry.response # type: ignore[return-value]
# 2. Semantic similarity match # 2. Semantic similarity match
@ -255,11 +373,16 @@ class LLMCache:
emb, threshold=self._cfg.cache_similarity, namespace=ns emb, threshold=self._cfg.cache_similarity, namespace=ns
) )
if result is not None: if result is not None:
_, matched, _ = result _, matched, sim = result
self._hits += 1 self._hits += 1
print(
f"[cache] HIT (semantic sim={sim:.3f}) ns={ns} "
f"prompt={last_user[:80]!r} matched={matched.prompt[:80]!r}"
)
return matched.response # type: ignore[return-value] return matched.response # type: ignore[return-value]
self._misses += 1 self._misses += 1
print(f"[cache] MISS ns={ns} prompt={last_user[:80]!r}")
return None return None
async def set_chat( async def set_chat(
@ -276,9 +399,36 @@ class LLMCache:
ns = self._namespace(route, model, system) ns = self._namespace(route, model, system)
key = self._cache_key(ns, last_user) key = self._cache_key(ns, last_user)
# Privacy guard: check whether the response contains personal data.
personalized = self._response_is_personalized(response_bytes, system)
if personalized:
# Exact-match is only safe when the system prompt is user-specific
# (i.e. different per user → different namespace). When the system
# prompt is generic and shared across all users, the namespace is the
# same for everyone: storing even without an embedding would let any
# user who asks the identical question get another user's personal data
# via exact-match. In that case skip storage entirely.
system_is_user_specific = bool(self._extract_personal_tokens(system))
if not system_is_user_specific:
print(
f"[cache] SKIP personalized response with generic system prompt "
f"route={route} model={model} ns={ns} prompt={last_user[:80]!r} "
f"system_snippet={system[:120]!r}"
)
return
print(
f"[cache] set_chat route={route} model={model} ns={ns} "
f"personalized={personalized} "
f"prompt={last_user[:80]!r} "
f"system_snippet={system[:120]!r}"
)
# Store without embedding when personalized (invisible to semantic search
# across users, but still reachable by exact-match within this namespace).
emb = ( emb = (
await self._build_embedding(history, last_user) await self._build_embedding(history, last_user)
if self._semantic and self._cfg.cache_similarity < 1.0 if self._semantic and self._cfg.cache_similarity < 1.0 and not personalized
else None else None
) )

375
doc/semantic-cache.md Normal file
View file

@ -0,0 +1,375 @@
# Semantic Cache
NOMYO Router includes a built-in LLM semantic cache that can dramatically reduce inference costs and latency by reusing previous responses — either via exact-match or by finding semantically equivalent questions.
## How It Works
Every incoming request is checked against the cache before being forwarded to an LLM endpoint. On a cache hit the stored response is returned immediately, bypassing inference entirely.
Cache isolation is strict: each combination of **route + model + system prompt** gets its own namespace (a 16-character SHA-256 prefix). A question asked under one system prompt can never leak into a different user's or route's namespace.
Two lookup strategies are available:
| Mode | How it matches | When to use |
|---|---|---|
| **Exact match** | Normalized text hash | When you need 100% identical answers, zero false positives |
| **Semantic match** | Cosine similarity of sentence embeddings | When questions are paraphrased or phrased differently |
In semantic mode the embedding is a weighted average of:
- The **last user message** (70% by default) — captures what is being asked
- A **BM25-weighted summary of the chat history** (30% by default) — provides conversation context
This means "What is the capital of France?" and "Can you tell me the capital city of France?" will hit the same cache entry, but a question in a different conversation context will not.
### MoE Model Bypass
Models with names starting with `moe-` always bypass the cache entirely. Mixture-of-Experts models produce non-deterministic outputs by design and are not suitable for response caching.
### Privacy Protection
Responses that contain personally identifiable information (emails, UUIDs, numeric IDs ≥ 8 digits, or tokens extracted from `[Tags: identity]` lines in the system prompt) are stored **without a semantic embedding**. They remain reachable via exact-match within the same user-specific namespace but are invisible to cross-user semantic search.
If a personalized response is generated with a generic (shared) system prompt, it is skipped entirely — never stored.
---
## Docker Image Variants
| Tag | Semantic cache | Approx. image size | Includes |
|---|---|---|---|
| `latest` | No (exact match only) | ~300 MB | Base router only |
| `latest-semantic` | Yes | ~800 MB | `sentence-transformers`, `torch`, `all-MiniLM-L6-v2` model baked in |
Use the **`latest`** image when you only need exact-match deduplication. It has no heavy ML dependencies.
Use **`latest-semantic`** when you want to catch paraphrased or semantically equivalent questions. The `all-MiniLM-L6-v2` model is baked into the image at build time so no internet access is needed at runtime.
> **Note:** If you set `cache_similarity < 1.0` but use the `latest` image, the router falls back to exact-match caching and logs a warning. It will not error.
Build locally:
```bash
# Lean image (exact match only)
docker build -t nomyo-router .
# Semantic image (~500 MB larger)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
```
---
## Dependencies
### Lean image / bare-metal (exact match)
No extra dependencies beyond the base `requirements.txt`. The `semantic-llm-cache` library provides exact-match storage with a no-op embedding provider.
### Semantic image / bare-metal (semantic match)
Additional heavy packages are required:
| Package | Purpose | Approx. size |
|---|---|---|
| `sentence-transformers` | Sentence embedding model wrapper | ~100 MB |
| `torch` (CPU) | PyTorch inference backend | ~700 MB |
| `numpy` | Vector math for weighted mean embeddings | ~20 MB |
Install for bare-metal semantic mode:
```bash
pip install sentence-transformers torch --index-url https://download.pytorch.org/whl/cpu
```
The `all-MiniLM-L6-v2` model (~90 MB) is downloaded on first use (or baked in when using the `:semantic` Docker image).
---
## Configuration
All cache settings live in `config.yaml` and can be overridden with environment variables prefixed `NOMYO_ROUTER_`.
### Minimal — enable exact-match with in-memory backend
```yaml
# config.yaml
cache_enabled: true
```
### Full reference
```yaml
# config.yaml
# Master switch — set to true to enable the cache
cache_enabled: false
# Storage backend: "memory" | "sqlite" | "redis"
# memory — fast, in-process, lost on restart
# sqlite — persistent single-file, good for single-instance deployments
# redis — persistent, shared across multiple router instances
cache_backend: memory
# Cosine similarity threshold for semantic matching
# 1.0 = exact match only (no sentence-transformers required)
# 0.95 = very close paraphrases only (recommended starting point)
# 0.85 = broader matching, higher false-positive risk
# Requires :semantic Docker image (or sentence-transformers installed)
cache_similarity: 1.0
# Time-to-live in seconds; null or omit to cache forever
cache_ttl: 3600
# SQLite backend: path to cache database file
cache_db_path: llm_cache.db
# Redis backend: connection URL
cache_redis_url: redis://localhost:6379/0
# Weight of chat-history embedding in the combined query vector (0.0–1.0)
# 0.3 = 30% history context, 70% last user message (default)
# Increase if your use case is highly conversation-dependent
cache_history_weight: 0.3
```
### Environment variable overrides
```bash
NOMYO_ROUTER_CACHE_ENABLED=true
NOMYO_ROUTER_CACHE_BACKEND=sqlite
NOMYO_ROUTER_CACHE_SIMILARITY=0.95
NOMYO_ROUTER_CACHE_TTL=7200
NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db
NOMYO_ROUTER_CACHE_REDIS_URL=redis://redis:6379/0
NOMYO_ROUTER_CACHE_HISTORY_WEIGHT=0.3
```
### Per-request opt-in
The cache is opted in per-request via the `nomyo.cache` field in the request body. The global `cache_enabled` setting must also be `true`.
```json
{
"model": "llama3.2",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"nomyo": {
"cache": true
}
}
```
---
## Backend Comparison
| | `memory` | `sqlite` | `redis` |
|---|---|---|---|
| Persistence | No (lost on restart) | Yes | Yes |
| Multi-instance | No | No | Yes |
| Setup required | None | None | Redis server |
| Recommended for | Development, testing | Single-instance production | Multi-instance / HA production |
### SQLite — persistent single-instance
```yaml
cache_enabled: true
cache_backend: sqlite
cache_db_path: /data/llm_cache.db
cache_ttl: 86400 # 24 hours
```
Mount the path in Docker:
```bash
docker run -d \
-v /path/to/data:/data \
-e NOMYO_ROUTER_CACHE_ENABLED=true \
-e NOMYO_ROUTER_CACHE_BACKEND=sqlite \
-e NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db \
ghcr.io/nomyo-ai/nomyo-router:latest
```
### Redis — distributed / multi-instance
```yaml
cache_enabled: true
cache_backend: redis
cache_redis_url: redis://redis:6379/0
cache_similarity: 0.95
cache_ttl: 3600
```
Docker Compose example:
```yaml
services:
nomyo-router:
image: ghcr.io/nomyo-ai/nomyo-router:latest-semantic
ports:
- "12434:12434"
environment:
NOMYO_ROUTER_CACHE_ENABLED: "true"
NOMYO_ROUTER_CACHE_BACKEND: redis
NOMYO_ROUTER_CACHE_REDIS_URL: redis://redis:6379/0
NOMYO_ROUTER_CACHE_SIMILARITY: "0.95"
depends_on:
- redis
redis:
image: redis:7-alpine
volumes:
- redis_data:/data
volumes:
redis_data:
```
---
## Code Examples
### Python (OpenAI SDK)
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:12434/v1",
api_key="unused",
)
# First call — cache miss, hits the LLM
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "What is the capital of France?"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
# Second call (exact same question) — cache hit, returned instantly
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "What is the capital of France?"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
# With semantic mode enabled, this also hits the cache:
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "Tell me the capital city of France"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
```
### Python (raw HTTP / httpx)
```python
import httpx
payload = {
"model": "llama3.2",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain quantum entanglement in one sentence."},
],
"nomyo": {"cache": True},
}
resp = httpx.post("http://localhost:12434/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```
### curl
```bash
curl -s http://localhost:12434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3.2",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"nomyo": {"cache": true}
}' | jq .choices[0].message.content
```
### Check cache statistics
```bash
curl -s http://localhost:12434/api/cache/stats | jq .
```
```json
{
"hits": 42,
"misses": 18,
"hit_rate": 0.7,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.95,
"history_weight": 0.3
}
```
### Clear the cache
```bash
curl -X POST http://localhost:12434/api/cache/clear
```
---
## Use Cases Where Semantic Caching Helps Most
### FAQ and support bots
Users ask the same questions in slightly different ways. "How do I reset my password?", "I forgot my password", "password reset instructions" — all semantically equivalent. Semantic caching collapses these into a single LLM call.
**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: sqlite` or `redis`, long TTL.
### Document Q&A / RAG pipelines
When the same document set is queried repeatedly by many users, common factual questions ("What is the contract start date?", "When does this contract begin?") hit the cache across users — as long as the system prompt is not user-personalized.
**Recommended settings:** `cache_similarity: 0.92`, `cache_backend: redis` for multi-user deployments.
### Code generation with repeated patterns
Development tools that generate boilerplate, explain errors, or convert code often receive nearly-identical prompts. Caching prevents re-running expensive large-model inference for the same error message phrased differently.
**Recommended settings:** `cache_similarity: 0.93`, `cache_backend: sqlite`.
### High-traffic chatbots with shared context
Public-facing bots where many users share the same system prompt (same product, same persona). Generic factual answers can be safely reused across users. The privacy guard ensures personalized responses are never shared.
**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: redis`.
### Batch processing / data pipelines
When running LLM inference over large datasets with many near-duplicate or repeated inputs, caching can reduce inference calls dramatically and make pipelines idempotent.
**Recommended settings:** `cache_similarity: 0.95`, `cache_backend: sqlite`, `cache_ttl: null` (cache forever for deterministic batch runs).
---
## Tuning the Similarity Threshold
| `cache_similarity` | Behaviour | Risk |
|---|---|---|
| `1.0` | Exact match only | No false positives |
| `0.97` | Catches minor typos and punctuation differences | Very low |
| `0.95` | Catches clear paraphrases (recommended default) | Low |
| `0.90` | Catches broader rewordings | Moderate — may match different intents |
| `< 0.85` | Very aggressive matching | High false-positive rate, not recommended |
Start at `0.95` and lower gradually while monitoring cache hit rate via `/api/cache/stats`.
---
## Limitations
- **Streaming responses** cached from a non-streaming request are served as a single chunk (not token-by-token). This is indistinguishable to most clients.
- **MoE models** (`moe-*` prefix) are never cached.
- **Token counts** are not recorded for cache hits (the LLM was not called).
- The `memory` backend does not survive a router restart.
- Semantic search requires the `:semantic` image (or local `sentence-transformers` install); without it, `cache_similarity < 1.0` silently falls back to exact match.

View file

@ -43,4 +43,4 @@ aiosqlite
# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch) # For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.: # SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.:
# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0 # semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0
semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git semantic-llm-cache@git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v0.1

View file

@ -1600,7 +1600,8 @@ async def proxy(request: Request):
images = payload.get("images") images = payload.get("images")
options = payload.get("options") options = payload.get("options")
keep_alive = payload.get("keep_alive") keep_alive = payload.get("keep_alive")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model: if not model:
raise HTTPException( raise HTTPException(
status_code=400, detail="Missing required field 'model'" status_code=400, detail="Missing required field 'model'"
@ -1615,7 +1616,7 @@ async def proxy(request: Request):
# Cache lookup — before endpoint selection so no slot is wasted on a hit # Cache lookup — before endpoint selection so no slot is wasted on a hit
_cache = get_llm_cache() _cache = get_llm_cache()
if _cache is not None: if _cache is not None and _cache_enabled:
_cached = await _cache.get_generate(model, prompt, system or "") _cached = await _cache.get_generate(model, prompt, system or "")
if _cached is not None: if _cached is not None:
async def _serve_cached_generate(): async def _serve_cached_generate():
@ -1671,7 +1672,7 @@ async def proxy(request: Request):
else: else:
json_line = orjson.dumps(chunk) json_line = orjson.dumps(chunk)
# Accumulate and store cache on done chunk — before yield so it always runs # Accumulate and store cache on done chunk — before yield so it always runs
if _cache is not None: if _cache is not None and _cache_enabled:
if getattr(chunk, "response", None): if getattr(chunk, "response", None):
content_parts.append(chunk.response) content_parts.append(chunk.response)
if getattr(chunk, "done", False): if getattr(chunk, "done", False):
@ -1710,7 +1711,7 @@ async def proxy(request: Request):
cache_bytes = json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes yield cache_bytes
# Cache non-streaming response # Cache non-streaming response
if _cache is not None: if _cache is not None and _cache_enabled:
try: try:
await _cache.set_generate(model, prompt, system or "", cache_bytes) await _cache.set_generate(model, prompt, system or "", cache_bytes)
except Exception as _ce: except Exception as _ce:
@ -1749,6 +1750,7 @@ async def chat_proxy(request: Request):
options = payload.get("options") options = payload.get("options")
logprobs = payload.get("logprobs") logprobs = payload.get("logprobs")
top_logprobs = payload.get("top_logprobs") top_logprobs = payload.get("top_logprobs")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model: if not model:
raise HTTPException( raise HTTPException(
@ -1775,7 +1777,7 @@ async def chat_proxy(request: Request):
# Snapshot original messages before any OpenAI-format transformation so that # Snapshot original messages before any OpenAI-format transformation so that
# get_chat and set_chat always use the same key regardless of backend type. # get_chat and set_chat always use the same key regardless of backend type.
_cache_messages = messages _cache_messages = messages
if _cache is not None and not _is_moe: if _cache is not None and not _is_moe and _cache_enabled:
_cached = await _cache.get_chat("ollama_chat", _cache_model, messages) _cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
if _cached is not None: if _cached is not None:
async def _serve_cached_chat(): async def _serve_cached_chat():
@ -1858,7 +1860,7 @@ async def chat_proxy(request: Request):
# Accumulate and store cache on done chunk — before yield so it always runs # Accumulate and store cache on done chunk — before yield so it always runs
# Works for both Ollama-native and OpenAI-compatible backends; chunks are # Works for both Ollama-native and OpenAI-compatible backends; chunks are
# already converted to Ollama format by rechunk before this point. # already converted to Ollama format by rechunk before this point.
if _cache is not None and not _is_moe: if _cache is not None and not _is_moe and _cache_enabled:
if chunk.message and getattr(chunk.message, "content", None): if chunk.message and getattr(chunk.message, "content", None):
content_parts.append(chunk.message.content) content_parts.append(chunk.message.content)
if getattr(chunk, "done", False): if getattr(chunk, "done", False):
@ -1898,7 +1900,7 @@ async def chat_proxy(request: Request):
cache_bytes = json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes yield cache_bytes
# Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends) # Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
if _cache is not None and not _is_moe: if _cache is not None and not _is_moe and _cache_enabled:
try: try:
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes) await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
except Exception as _ce: except Exception as _ce:
@ -2746,6 +2748,7 @@ async def openai_chat_completions_proxy(request: Request):
tools = payload.get("tools") tools = payload.get("tools")
logprobs = payload.get("logprobs") logprobs = payload.get("logprobs")
top_logprobs = payload.get("top_logprobs") top_logprobs = payload.get("top_logprobs")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model: if not model:
raise HTTPException( raise HTTPException(
@ -2786,9 +2789,20 @@ async def openai_chat_completions_proxy(request: Request):
except orjson.JSONDecodeError as e: except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Reject unsupported image formats (SVG) before doing any work
# Scans OpenAI-style multimodal messages: content may be a plain string
# (skipped) or a list of typed items, where image items look like
# {"type": "image_url", "image_url": {"url": ...}}.
for _msg in messages:
    for _item in (_msg.get("content") or []) if isinstance(_msg.get("content"), list) else []:
        if _item.get("type") == "image_url":
            _url = (_item.get("image_url") or {}).get("url", "")
            # Catches both inline data URLs (data:image/svg+xml;...) and
            # plain links ending in .svg.
            # NOTE(review): an SVG URL with a query string or other suffix
            # (e.g. ".../img.svg?v=2") slips past this check — confirm
            # whether that matters for the deployment.
            if _url.startswith("data:image/svg") or _url.lower().endswith(".svg"):
                raise HTTPException(
                    status_code=400,
                    detail="SVG images are not supported. Please convert the image to PNG or JPEG before sending.",
                )
# Cache lookup — before endpoint selection # Cache lookup — before endpoint selection
_cache = get_llm_cache() _cache = get_llm_cache()
if _cache is not None: if _cache is not None and _cache_enabled:
_cached = await _cache.get_chat("openai_chat", model, messages) _cached = await _cache.get_chat("openai_chat", model, messages)
if _cached is not None: if _cached is not None:
if stream: if stream:
@ -2806,16 +2820,56 @@ async def openai_chat_completions_proxy(request: Request):
base_url = ep2base(endpoint) base_url = ep2base(endpoint)
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key")) oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
# 3. Async generator that streams completions data and decrements the counter # 3. Async generator that streams completions data and decrements the counter
async def _normalize_images_in_messages(msgs: list) -> list:
    """Fetch remote image URLs and convert them to base64 data URLs so
    Ollama/llama-server can handle them without making outbound HTTP requests.

    Messages whose content is not a list (plain-string content) are passed
    through unchanged. Within list content, only ``image_url`` items with a
    non-``data:`` URL are fetched; everything else is copied as-is. On any
    fetch failure the original item is kept (best-effort), so the request
    still goes through with the remote URL intact.
    """
    resolved = []
    for msg in msgs:
        content = msg.get("content")
        if not isinstance(content, list):
            # Plain-string (or absent) content — nothing to normalize.
            resolved.append(msg)
            continue
        new_content = []
        for item in content:
            if item.get("type") == "image_url":
                url = (item.get("image_url") or {}).get("url", "")
                if url and not url.startswith("data:"):
                    try:
                        # Reuse the app-wide aiohttp session rather than
                        # opening a new connection per image.
                        http: aiohttp.ClientSession = app_state["session"]
                        async with http.get(url) as resp:
                            # Strip any ";charset=..." parameter from the MIME type.
                            # NOTE(review): no resp.raise_for_status() — a non-200
                            # response body (e.g. an HTML error page) would be
                            # base64-encoded as if it were image data; consider
                            # checking the status before encoding.
                            ctype = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
                            img_bytes = await resp.read()
                            b64 = base64.b64encode(img_bytes).decode("utf-8")
                            new_content.append({
                                "type": "image_url",
                                "image_url": {"url": f"data:{ctype};base64,{b64}"}
                            })
                    except Exception as _ie:
                        # Best-effort: keep the original remote URL on failure.
                        print(f"[image] Failed to fetch image URL: {_ie}")
                        new_content.append(item)
                else:
                    # Already a data: URL (or empty) — pass through untouched.
                    new_content.append(item)
            else:
                new_content.append(item)
        # Shallow-copy the message with its normalized content list.
        resolved.append({**msg, "content": new_content})
    return resolved
async def stream_ochat_response(): async def stream_ochat_response():
try: try:
# The chat method returns a generator of dicts (or GenerateResponse) # The chat method returns a generator of dicts (or GenerateResponse)
try: try:
async_gen = await oclient.chat.completions.create(**params) # For non-external endpoints (Ollama, llama-server), resolve remote
# image URLs to base64 data URLs so the server can handle them locally.
send_params = params
if not is_ext_openai_endpoint(endpoint):
resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
send_params = {**params, "messages": resolved_msgs}
async_gen = await oclient.chat.completions.create(**send_params)
except openai.BadRequestError as e: except openai.BadRequestError as e:
# If tools are not supported by the model, retry without tools # If tools are not supported by the model, retry without tools
if "does not support tools" in str(e): if "does not support tools" in str(e):
print(f"[openai_chat_completions_proxy] Model {model} doesn't support tools, retrying without tools") print(f"[openai_chat_completions_proxy] Model {model} doesn't support tools, retrying without tools")
params_without_tools = {k: v for k, v in params.items() if k != "tools"} params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
async_gen = await oclient.chat.completions.create(**params_without_tools) async_gen = await oclient.chat.completions.create(**params_without_tools)
else: else:
raise raise
@ -2856,7 +2910,7 @@ async def openai_chat_completions_proxy(request: Request):
if prompt_tok != 0 or comp_tok != 0: if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs # Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and content_parts: if _cache is not None and _cache_enabled and content_parts:
assembled = orjson.dumps({ assembled = orjson.dumps({
"model": model, "model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}], "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
@ -2887,7 +2941,7 @@ async def openai_chat_completions_proxy(request: Request):
cache_bytes = json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes yield cache_bytes
# Cache non-streaming response # Cache non-streaming response
if _cache is not None: if _cache is not None and _cache_enabled:
try: try:
await _cache.set_chat("openai_chat", model, messages, cache_bytes) await _cache.set_chat("openai_chat", model, messages, cache_bytes)
except Exception as _ce: except Exception as _ce:
@ -2930,6 +2984,7 @@ async def openai_completions_proxy(request: Request):
max_tokens = payload.get("max_tokens") max_tokens = payload.get("max_tokens")
max_completion_tokens = payload.get("max_completion_tokens") max_completion_tokens = payload.get("max_completion_tokens")
suffix = payload.get("suffix") suffix = payload.get("suffix")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model: if not model:
raise HTTPException( raise HTTPException(
@ -2970,7 +3025,7 @@ async def openai_completions_proxy(request: Request):
# Cache lookup — completions prompt mapped to a single-turn messages list # Cache lookup — completions prompt mapped to a single-turn messages list
_cache = get_llm_cache() _cache = get_llm_cache()
_compl_messages = [{"role": "user", "content": prompt}] _compl_messages = [{"role": "user", "content": prompt}]
if _cache is not None: if _cache is not None and _cache_enabled:
_cached = await _cache.get_chat("openai_completions", model, _compl_messages) _cached = await _cache.get_chat("openai_completions", model, _compl_messages)
if _cached is not None: if _cached is not None:
if stream: if stream:
@ -3029,7 +3084,7 @@ async def openai_completions_proxy(request: Request):
if prompt_tok != 0 or comp_tok != 0: if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs # Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and text_parts: if _cache is not None and _cache_enabled and text_parts:
assembled = orjson.dumps({ assembled = orjson.dumps({
"model": model, "model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}], "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
@ -3061,7 +3116,7 @@ async def openai_completions_proxy(request: Request):
cache_bytes = json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes yield cache_bytes
# Cache non-streaming response # Cache non-streaming response
if _cache is not None: if _cache is not None and _cache_enabled:
try: try:
await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes) await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
except Exception as _ce: except Exception as _ce: