fix: improvements, fixes and opt-in cache

doc: semantic-cache.md added with detailed write-up
This commit is contained in:
Alpha Nerd 2026-03-10 15:19:37 +01:00
parent a5108486e3
commit fbdc73eebb
4 changed files with 598 additions and 18 deletions

154
cache.py
View file

@ -14,10 +14,16 @@ Strategy:
- MOE models (moe-*) always bypass the cache. - MOE models (moe-*) always bypass the cache.
- Token counts are never recorded for cache hits. - Token counts are never recorded for cache hits.
- Streaming cache hits are served as a single-chunk response. - Streaming cache hits are served as a single-chunk response.
- Privacy protection: responses that echo back user-identifying tokens from the system
prompt (names, emails, IDs) are stored WITHOUT an embedding. They remain findable
by exact-match for the same user but are invisible to cross-user semantic search.
Generic responses (e.g. capital of France → "Paris") keep their embeddings and can still
produce cross-user semantic hits as intended.
""" """
import hashlib import hashlib
import math import math
import re
import time import time
import warnings import warnings
from collections import Counter from collections import Counter
@ -162,6 +168,111 @@ class LLMCache:
from semantic_llm_cache.utils import hash_prompt, normalize_prompt from semantic_llm_cache.utils import hash_prompt, normalize_prompt
return hash_prompt(normalize_prompt(last_user), namespace) return hash_prompt(normalize_prompt(last_user), namespace)
# ------------------------------------------------------------------
# Privacy helpers — prevent cross-user leakage of personal data
# ------------------------------------------------------------------
# Common English / schema words that appear on "[Tags: identity]" lines but
# do not identify a user; filtered out in _extract_personal_tokens so generic
# words never mark a response as personalized.
_IDENTITY_STOPWORDS = frozenset({
    "user", "users", "name", "names", "email", "phone", "their", "they",
    "this", "that", "with", "from", "have", "been", "also", "more",
    "tags", "identity", "preference", "context",
})
# Patterns that unambiguously signal personal data in a response
_EMAIL_RE = re.compile(r'\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b')
# RFC-4122-style UUID (8-4-4-4-12 hex groups), case-insensitive.
_UUID_RE = re.compile(
    r'\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b',
    re.IGNORECASE,
)
# Standalone numeric run ≥ 8 digits (common user/account IDs)
_NUMERIC_ID_RE = re.compile(r'\b\d{8,}\b')
def _extract_response_content(self, response_bytes: bytes) -> str:
"""Parse response bytes (Ollama or OpenAI format) and return the text content."""
try:
import orjson
data = orjson.loads(response_bytes)
if "choices" in data: # OpenAI ChatCompletion
return (data["choices"][0].get("message") or {}).get("content", "")
if "message" in data: # Ollama chat
return (data.get("message") or {}).get("content", "")
if "response" in data: # Ollama generate
return data.get("response", "")
except Exception:
pass
return ""
def _extract_personal_tokens(self, system: str) -> frozenset[str]:
"""
Extract user-identifying tokens from the system prompt.
Looks for:
- Email addresses anywhere in the system prompt
- Numeric IDs ( 4 digits) appearing after "id" keyword
- Proper-noun-like words from [Tags: identity] lines
"""
if not system:
return frozenset()
tokens: set[str] = set()
# Email addresses
for email in self._EMAIL_RE.findall(system):
tokens.add(email.lower())
# Numeric IDs: "id: 1234", "id=5678"
for uid in re.findall(r'\bid\s*[=:]?\s*(\d{4,})\b', system, re.IGNORECASE):
tokens.add(uid)
# Values from [Tags: identity] lines (e.g. "User's name is Andreas")
for line in re.findall(
r'\[Tags:.*?identity.*?\]\s*(.+?)(?:\n|$)', system, re.IGNORECASE
):
for word in re.findall(r'\b\w{4,}\b', line):
w = word.lower()
if w not in self._IDENTITY_STOPWORDS:
tokens.add(w)
return frozenset(tokens)
def _response_is_personalized(self, response_bytes: bytes, system: str) -> bool:
"""
Return True if the response contains user-personal information.
Two complementary checks:
1. Direct PII detection in the response content emails, UUIDs, long
numeric IDs. Catches data retrieved at runtime via tool calls that
never appears in the system prompt.
2. System-prompt token overlap words extracted from [Tags: identity]
lines that reappear verbatim in the response (catches names, etc.).
Such responses are stored WITHOUT a semantic embedding so they are
invisible to cross-user semantic search while still being cacheable for
the same user via exact-match.
"""
content = self._extract_response_content(response_bytes)
if not content:
# Can't parse → err on the side of caution
return bool(response_bytes)
# 1. Direct PII patterns — independent of what's in the system prompt
if (
self._EMAIL_RE.search(content)
or self._UUID_RE.search(content)
or self._NUMERIC_ID_RE.search(content)
):
return True
# 2. System-prompt identity tokens echoed back in the response
personal = self._extract_personal_tokens(system)
if personal:
content_lower = content.lower()
if any(token in content_lower for token in personal):
return True
return False
def _parse_messages( def _parse_messages(
self, messages: list[dict] self, messages: list[dict]
) -> tuple[str, list[dict], str]: ) -> tuple[str, list[dict], str]:
@ -242,10 +353,17 @@ class LLMCache:
ns = self._namespace(route, model, system) ns = self._namespace(route, model, system)
key = self._cache_key(ns, last_user) key = self._cache_key(ns, last_user)
print(
f"[cache] get_chat route={route} model={model} ns={ns} "
f"prompt={last_user[:80]!r} "
f"system_snippet={system[:120]!r}"
)
# 1. Exact key match # 1. Exact key match
entry = await self._backend.get(key) entry = await self._backend.get(key)
if entry is not None: if entry is not None:
self._hits += 1 self._hits += 1
print(f"[cache] HIT (exact) ns={ns} prompt={last_user[:80]!r}")
return entry.response # type: ignore[return-value] return entry.response # type: ignore[return-value]
# 2. Semantic similarity match # 2. Semantic similarity match
@ -255,11 +373,16 @@ class LLMCache:
emb, threshold=self._cfg.cache_similarity, namespace=ns emb, threshold=self._cfg.cache_similarity, namespace=ns
) )
if result is not None: if result is not None:
_, matched, _ = result _, matched, sim = result
self._hits += 1 self._hits += 1
print(
f"[cache] HIT (semantic sim={sim:.3f}) ns={ns} "
f"prompt={last_user[:80]!r} matched={matched.prompt[:80]!r}"
)
return matched.response # type: ignore[return-value] return matched.response # type: ignore[return-value]
self._misses += 1 self._misses += 1
print(f"[cache] MISS ns={ns} prompt={last_user[:80]!r}")
return None return None
async def set_chat( async def set_chat(
@ -276,9 +399,36 @@ class LLMCache:
ns = self._namespace(route, model, system) ns = self._namespace(route, model, system)
key = self._cache_key(ns, last_user) key = self._cache_key(ns, last_user)
# Privacy guard: check whether the response contains personal data.
personalized = self._response_is_personalized(response_bytes, system)
if personalized:
# Exact-match is only safe when the system prompt is user-specific
# (i.e. different per user → different namespace). When the system
# prompt is generic and shared across all users, the namespace is the
# same for everyone: storing even without an embedding would let any
# user who asks the identical question get another user's personal data
# via exact-match. In that case skip storage entirely.
system_is_user_specific = bool(self._extract_personal_tokens(system))
if not system_is_user_specific:
print(
f"[cache] SKIP personalized response with generic system prompt "
f"route={route} model={model} ns={ns} prompt={last_user[:80]!r} "
f"system_snippet={system[:120]!r}"
)
return
print(
f"[cache] set_chat route={route} model={model} ns={ns} "
f"personalized={personalized} "
f"prompt={last_user[:80]!r} "
f"system_snippet={system[:120]!r}"
)
# Store without embedding when personalized (invisible to semantic search
# across users, but still reachable by exact-match within this namespace).
emb = ( emb = (
await self._build_embedding(history, last_user) await self._build_embedding(history, last_user)
if self._semantic and self._cfg.cache_similarity < 1.0 if self._semantic and self._cfg.cache_similarity < 1.0 and not personalized
else None else None
) )

375
doc/semantic-cache.md Normal file
View file

@ -0,0 +1,375 @@
# Semantic Cache
NOMYO Router includes a built-in LLM semantic cache that can dramatically reduce inference costs and latency by reusing previous responses — either via exact-match or by finding semantically equivalent questions.
## How It Works
Every incoming request is checked against the cache before being forwarded to an LLM endpoint. On a cache hit the stored response is returned immediately, bypassing inference entirely.
Cache isolation is strict: each combination of **route + model + system prompt** gets its own namespace (a 16-character SHA-256 prefix). A question asked under one system prompt can never leak into a different user's or route's namespace.
Two lookup strategies are available:
| Mode | How it matches | When to use |
|---|---|---|
| **Exact match** | Normalized text hash | When you need 100% identical answers, zero false positives |
| **Semantic match** | Cosine similarity of sentence embeddings | When questions are paraphrased or phrased differently |
In semantic mode the embedding is a weighted average of:
- The **last user message** (70% by default) — captures what is being asked
- A **BM25-weighted summary of the chat history** (30% by default) — provides conversation context
This means "What is the capital of France?" and "Can you tell me the capital city of France?" will hit the same cache entry, but a question in a different conversation context will not.
### MoE Model Bypass
Models with names starting with `moe-` always bypass the cache entirely. Mixture-of-Experts models produce non-deterministic outputs by design and are not suitable for response caching.
### Privacy Protection
Responses that contain personally identifiable information (emails, UUIDs, numeric IDs ≥ 8 digits, or tokens extracted from `[Tags: identity]` lines in the system prompt) are stored **without a semantic embedding**. They remain reachable via exact-match within the same user-specific namespace but are invisible to cross-user semantic search.
If a personalized response is generated with a generic (shared) system prompt, it is skipped entirely — never stored.
---
## Docker Image Variants
| Tag | Semantic cache | Approx. image size | Includes |
|---|---|---|---|
| `latest` | No (exact match only) | ~300 MB | Base router only |
| `latest-semantic` | Yes | ~800 MB | `sentence-transformers`, `torch`, `all-MiniLM-L6-v2` model baked in |
Use the **`latest`** image when you only need exact-match deduplication. It has no heavy ML dependencies.
Use **`latest-semantic`** when you want to catch paraphrased or semantically equivalent questions. The `all-MiniLM-L6-v2` model is baked into the image at build time so no internet access is needed at runtime.
> **Note:** If you set `cache_similarity < 1.0` but use the `latest` image, the router falls back to exact-match caching and logs a warning. It will not error.
Build locally:
```bash
# Lean image (exact match only)
docker build -t nomyo-router .
# Semantic image (~500 MB larger)
docker build --build-arg SEMANTIC_CACHE=true -t nomyo-router:semantic .
```
---
## Dependencies
### Lean image / bare-metal (exact match)
No extra dependencies beyond the base `requirements.txt`. The `semantic-llm-cache` library provides exact-match storage with a no-op embedding provider.
### Semantic image / bare-metal (semantic match)
Additional heavy packages are required:
| Package | Purpose | Approx. size |
|---|---|---|
| `sentence-transformers` | Sentence embedding model wrapper | ~100 MB |
| `torch` (CPU) | PyTorch inference backend | ~700 MB |
| `numpy` | Vector math for weighted mean embeddings | ~20 MB |
Install for bare-metal semantic mode:
```bash
pip install sentence-transformers torch --index-url https://download.pytorch.org/whl/cpu
```
The `all-MiniLM-L6-v2` model (~90 MB) is downloaded on first use (or baked in when using the `:semantic` Docker image).
---
## Configuration
All cache settings live in `config.yaml` and can be overridden with environment variables prefixed `NOMYO_ROUTER_`.
### Minimal — enable exact-match with in-memory backend
```yaml
# config.yaml
cache_enabled: true
```
### Full reference
```yaml
# config.yaml
# Master switch — set to true to enable the cache
cache_enabled: false
# Storage backend: "memory" | "sqlite" | "redis"
# memory — fast, in-process, lost on restart
# sqlite — persistent single-file, good for single-instance deployments
# redis — persistent, shared across multiple router instances
cache_backend: memory
# Cosine similarity threshold for semantic matching
# 1.0 = exact match only (no sentence-transformers required)
# 0.95 = very close paraphrases only (recommended starting point)
# 0.85 = broader matching, higher false-positive risk
# Requires :semantic Docker image (or sentence-transformers installed)
cache_similarity: 1.0
# Time-to-live in seconds; null or omit to cache forever
cache_ttl: 3600
# SQLite backend: path to cache database file
cache_db_path: llm_cache.db
# Redis backend: connection URL
cache_redis_url: redis://localhost:6379/0
# Weight of chat-history embedding in the combined query vector (0.0–1.0)
# 0.3 = 30% history context, 70% last user message (default)
# Increase if your use case is highly conversation-dependent
cache_history_weight: 0.3
```
### Environment variable overrides
```bash
NOMYO_ROUTER_CACHE_ENABLED=true
NOMYO_ROUTER_CACHE_BACKEND=sqlite
NOMYO_ROUTER_CACHE_SIMILARITY=0.95
NOMYO_ROUTER_CACHE_TTL=7200
NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db
NOMYO_ROUTER_CACHE_REDIS_URL=redis://redis:6379/0
NOMYO_ROUTER_CACHE_HISTORY_WEIGHT=0.3
```
### Per-request opt-in
The cache is opted in per-request via the `nomyo.cache` field in the request body. The global `cache_enabled` setting must also be `true`.
```json
{
"model": "llama3.2",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"nomyo": {
"cache": true
}
}
```
---
## Backend Comparison
| | `memory` | `sqlite` | `redis` |
|---|---|---|---|
| Persistence | No (lost on restart) | Yes | Yes |
| Multi-instance | No | No | Yes |
| Setup required | None | None | Redis server |
| Recommended for | Development, testing | Single-instance production | Multi-instance / HA production |
### SQLite — persistent single-instance
```yaml
cache_enabled: true
cache_backend: sqlite
cache_db_path: /data/llm_cache.db
cache_ttl: 86400 # 24 hours
```
Mount the path in Docker:
```bash
docker run -d \
-v /path/to/data:/data \
-e NOMYO_ROUTER_CACHE_ENABLED=true \
-e NOMYO_ROUTER_CACHE_BACKEND=sqlite \
-e NOMYO_ROUTER_CACHE_DB_PATH=/data/llm_cache.db \
ghcr.io/nomyo-ai/nomyo-router:latest
```
### Redis — distributed / multi-instance
```yaml
cache_enabled: true
cache_backend: redis
cache_redis_url: redis://redis:6379/0
cache_similarity: 0.95
cache_ttl: 3600
```
Docker Compose example:
```yaml
services:
nomyo-router:
image: ghcr.io/nomyo-ai/nomyo-router:latest-semantic
ports:
- "12434:12434"
environment:
NOMYO_ROUTER_CACHE_ENABLED: "true"
NOMYO_ROUTER_CACHE_BACKEND: redis
NOMYO_ROUTER_CACHE_REDIS_URL: redis://redis:6379/0
NOMYO_ROUTER_CACHE_SIMILARITY: "0.95"
depends_on:
- redis
redis:
image: redis:7-alpine
volumes:
- redis_data:/data
volumes:
redis_data:
```
---
## Code Examples
### Python (OpenAI SDK)
```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:12434/v1",
api_key="unused",
)
# First call — cache miss, hits the LLM
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "What is the capital of France?"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
# Second call (exact same question) — cache hit, returned instantly
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "What is the capital of France?"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
# With semantic mode enabled, this also hits the cache:
response = client.chat.completions.create(
model="llama3.2",
messages=[{"role": "user", "content": "Tell me the capital city of France"}],
extra_body={"nomyo": {"cache": True}},
)
print(response.choices[0].message.content)
```
### Python (raw HTTP / httpx)
```python
import httpx
payload = {
"model": "llama3.2",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain quantum entanglement in one sentence."},
],
"nomyo": {"cache": True},
}
resp = httpx.post("http://localhost:12434/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```
### curl
```bash
curl -s http://localhost:12434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3.2",
"messages": [{"role": "user", "content": "What is the capital of France?"}],
"nomyo": {"cache": true}
}' | jq .choices[0].message.content
```
### Check cache statistics
```bash
curl -s http://localhost:12434/api/cache/stats | jq .
```
```json
{
"hits": 42,
"misses": 18,
"hit_rate": 0.7,
"semantic": true,
"backend": "sqlite",
"similarity_threshold": 0.95,
"history_weight": 0.3
}
```
### Clear the cache
```bash
curl -X POST http://localhost:12434/api/cache/clear
```
---
## Use Cases Where Semantic Caching Helps Most
### FAQ and support bots
Users ask the same questions in slightly different ways. "How do I reset my password?", "I forgot my password", "password reset instructions" — all semantically equivalent. Semantic caching collapses these into a single LLM call.
**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: sqlite` or `redis`, long TTL.
### Document Q&A / RAG pipelines
When the same document set is queried repeatedly by many users, common factual questions ("What is the contract start date?", "When does this contract begin?") hit the cache across users — as long as the system prompt is not user-personalized.
**Recommended settings:** `cache_similarity: 0.92`, `cache_backend: redis` for multi-user deployments.
### Code generation with repeated patterns
Development tools that generate boilerplate, explain errors, or convert code often receive nearly-identical prompts. Caching prevents re-running expensive large-model inference for the same error message phrased differently.
**Recommended settings:** `cache_similarity: 0.93`, `cache_backend: sqlite`.
### High-traffic chatbots with shared context
Public-facing bots where many users share the same system prompt (same product, same persona). Generic factual answers can be safely reused across users. The privacy guard ensures personalized responses are never shared.
**Recommended settings:** `cache_similarity: 0.90–0.95`, `cache_backend: redis`.
### Batch processing / data pipelines
When running LLM inference over large datasets with many near-duplicate or repeated inputs, caching can reduce inference calls dramatically and make pipelines idempotent.
**Recommended settings:** `cache_similarity: 0.95`, `cache_backend: sqlite`, `cache_ttl: null` (cache forever for deterministic batch runs).
---
## Tuning the Similarity Threshold
| `cache_similarity` | Behaviour | Risk |
|---|---|---|
| `1.0` | Exact match only | No false positives |
| `0.97` | Catches minor typos and punctuation differences | Very low |
| `0.95` | Catches clear paraphrases (recommended default) | Low |
| `0.90` | Catches broader rewordings | Moderate — may match different intents |
| `< 0.85` | Very aggressive matching | High false-positive rate, not recommended |
Start at `0.95` and lower gradually while monitoring cache hit rate via `/api/cache/stats`.
---
## Limitations
- **Streaming responses** cached from a non-streaming request are served as a single chunk (not token-by-token). This is indistinguishable to most clients.
- **MoE models** (`moe-*` prefix) are never cached.
- **Token counts** are not recorded for cache hits (the LLM was not called).
- The `memory` backend does not survive a router restart.
- Semantic search requires the `:semantic` image (or local `sentence-transformers` install); without it, `cache_similarity < 1.0` silently falls back to exact match.

View file

@ -43,4 +43,4 @@ aiosqlite
# For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch) # For semantic mode use the :semantic Docker image tag (adds sentence-transformers + torch)
# SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.: # SECURITY: pin to a specific tag or commit hash to prevent supply-chain drift, e.g.:
# semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0 # semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v1.0.0
semantic-llm-cache @ git+https://github.com/nomyo-ai/async-semantic-llm-cache.git semantic-llm-cache@git+https://github.com/nomyo-ai/async-semantic-llm-cache.git@v0.1

View file

@ -1600,7 +1600,8 @@ async def proxy(request: Request):
images = payload.get("images") images = payload.get("images")
options = payload.get("options") options = payload.get("options")
keep_alive = payload.get("keep_alive") keep_alive = payload.get("keep_alive")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model: if not model:
raise HTTPException( raise HTTPException(
status_code=400, detail="Missing required field 'model'" status_code=400, detail="Missing required field 'model'"
@ -1615,7 +1616,7 @@ async def proxy(request: Request):
# Cache lookup — before endpoint selection so no slot is wasted on a hit # Cache lookup — before endpoint selection so no slot is wasted on a hit
_cache = get_llm_cache() _cache = get_llm_cache()
if _cache is not None: if _cache is not None and _cache_enabled:
_cached = await _cache.get_generate(model, prompt, system or "") _cached = await _cache.get_generate(model, prompt, system or "")
if _cached is not None: if _cached is not None:
async def _serve_cached_generate(): async def _serve_cached_generate():
@ -1671,7 +1672,7 @@ async def proxy(request: Request):
else: else:
json_line = orjson.dumps(chunk) json_line = orjson.dumps(chunk)
# Accumulate and store cache on done chunk — before yield so it always runs # Accumulate and store cache on done chunk — before yield so it always runs
if _cache is not None: if _cache is not None and _cache_enabled:
if getattr(chunk, "response", None): if getattr(chunk, "response", None):
content_parts.append(chunk.response) content_parts.append(chunk.response)
if getattr(chunk, "done", False): if getattr(chunk, "done", False):
@ -1710,7 +1711,7 @@ async def proxy(request: Request):
cache_bytes = json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes yield cache_bytes
# Cache non-streaming response # Cache non-streaming response
if _cache is not None: if _cache is not None and _cache_enabled:
try: try:
await _cache.set_generate(model, prompt, system or "", cache_bytes) await _cache.set_generate(model, prompt, system or "", cache_bytes)
except Exception as _ce: except Exception as _ce:
@ -1749,6 +1750,7 @@ async def chat_proxy(request: Request):
options = payload.get("options") options = payload.get("options")
logprobs = payload.get("logprobs") logprobs = payload.get("logprobs")
top_logprobs = payload.get("top_logprobs") top_logprobs = payload.get("top_logprobs")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model: if not model:
raise HTTPException( raise HTTPException(
@ -1775,7 +1777,7 @@ async def chat_proxy(request: Request):
# Snapshot original messages before any OpenAI-format transformation so that # Snapshot original messages before any OpenAI-format transformation so that
# get_chat and set_chat always use the same key regardless of backend type. # get_chat and set_chat always use the same key regardless of backend type.
_cache_messages = messages _cache_messages = messages
if _cache is not None and not _is_moe: if _cache is not None and not _is_moe and _cache_enabled:
_cached = await _cache.get_chat("ollama_chat", _cache_model, messages) _cached = await _cache.get_chat("ollama_chat", _cache_model, messages)
if _cached is not None: if _cached is not None:
async def _serve_cached_chat(): async def _serve_cached_chat():
@ -1858,7 +1860,7 @@ async def chat_proxy(request: Request):
# Accumulate and store cache on done chunk — before yield so it always runs # Accumulate and store cache on done chunk — before yield so it always runs
# Works for both Ollama-native and OpenAI-compatible backends; chunks are # Works for both Ollama-native and OpenAI-compatible backends; chunks are
# already converted to Ollama format by rechunk before this point. # already converted to Ollama format by rechunk before this point.
if _cache is not None and not _is_moe: if _cache is not None and not _is_moe and _cache_enabled:
if chunk.message and getattr(chunk.message, "content", None): if chunk.message and getattr(chunk.message, "content", None):
content_parts.append(chunk.message.content) content_parts.append(chunk.message.content)
if getattr(chunk, "done", False): if getattr(chunk, "done", False):
@ -1898,7 +1900,7 @@ async def chat_proxy(request: Request):
cache_bytes = json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes yield cache_bytes
# Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends) # Cache non-streaming response (non-MOE; works for both Ollama and OpenAI backends)
if _cache is not None and not _is_moe: if _cache is not None and not _is_moe and _cache_enabled:
try: try:
await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes) await _cache.set_chat("ollama_chat", _cache_model, _cache_messages, cache_bytes)
except Exception as _ce: except Exception as _ce:
@ -2746,6 +2748,7 @@ async def openai_chat_completions_proxy(request: Request):
tools = payload.get("tools") tools = payload.get("tools")
logprobs = payload.get("logprobs") logprobs = payload.get("logprobs")
top_logprobs = payload.get("top_logprobs") top_logprobs = payload.get("top_logprobs")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model: if not model:
raise HTTPException( raise HTTPException(
@ -2786,9 +2789,20 @@ async def openai_chat_completions_proxy(request: Request):
except orjson.JSONDecodeError as e: except orjson.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
# Reject unsupported image formats (SVG) before doing any work
# Scans OpenAI-style multimodal messages: content may be a plain string
# (skipped) or a list of typed items, where image items look like
# {"type": "image_url", "image_url": {"url": ...}}.
for _msg in messages:
    for _item in (_msg.get("content") or []) if isinstance(_msg.get("content"), list) else []:
        if _item.get("type") == "image_url":
            _url = (_item.get("image_url") or {}).get("url", "")
            # Catches both inline data URLs (data:image/svg+xml;...) and
            # plain links ending in .svg.
            # NOTE(review): an SVG URL with a query string or other suffix
            # (e.g. ".../img.svg?v=2") slips past this check — confirm
            # whether that matters for the deployment.
            if _url.startswith("data:image/svg") or _url.lower().endswith(".svg"):
                raise HTTPException(
                    status_code=400,
                    detail="SVG images are not supported. Please convert the image to PNG or JPEG before sending.",
                )
# Cache lookup — before endpoint selection # Cache lookup — before endpoint selection
_cache = get_llm_cache() _cache = get_llm_cache()
if _cache is not None: if _cache is not None and _cache_enabled:
_cached = await _cache.get_chat("openai_chat", model, messages) _cached = await _cache.get_chat("openai_chat", model, messages)
if _cached is not None: if _cached is not None:
if stream: if stream:
@ -2806,16 +2820,56 @@ async def openai_chat_completions_proxy(request: Request):
base_url = ep2base(endpoint) base_url = ep2base(endpoint)
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key")) oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
# 3. Async generator that streams completions data and decrements the counter # 3. Async generator that streams completions data and decrements the counter
async def _normalize_images_in_messages(msgs: list) -> list:
    """Fetch remote image URLs and convert them to base64 data URLs so
    Ollama/llama-server can handle them without making outbound HTTP requests.

    Messages whose content is not a list (plain-string content) are passed
    through unchanged. Within list content, only ``image_url`` items with a
    non-``data:`` URL are fetched; everything else is copied as-is. On any
    fetch failure the original item is kept (best-effort), so the request
    still goes through with the remote URL intact.
    """
    resolved = []
    for msg in msgs:
        content = msg.get("content")
        if not isinstance(content, list):
            # Plain-string (or absent) content — nothing to normalize.
            resolved.append(msg)
            continue
        new_content = []
        for item in content:
            if item.get("type") == "image_url":
                url = (item.get("image_url") or {}).get("url", "")
                if url and not url.startswith("data:"):
                    try:
                        # Reuse the app-wide aiohttp session rather than
                        # opening a new connection per image.
                        http: aiohttp.ClientSession = app_state["session"]
                        async with http.get(url) as resp:
                            # Strip any ";charset=..." parameter from the MIME type.
                            # NOTE(review): no resp.raise_for_status() — a non-200
                            # response body (e.g. an HTML error page) would be
                            # base64-encoded as if it were image data; consider
                            # checking the status before encoding.
                            ctype = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
                            img_bytes = await resp.read()
                            b64 = base64.b64encode(img_bytes).decode("utf-8")
                            new_content.append({
                                "type": "image_url",
                                "image_url": {"url": f"data:{ctype};base64,{b64}"}
                            })
                    except Exception as _ie:
                        # Best-effort: keep the original remote URL on failure.
                        print(f"[image] Failed to fetch image URL: {_ie}")
                        new_content.append(item)
                else:
                    # Already a data: URL (or empty) — pass through untouched.
                    new_content.append(item)
            else:
                new_content.append(item)
        # Shallow-copy the message with its normalized content list.
        resolved.append({**msg, "content": new_content})
    return resolved
async def stream_ochat_response(): async def stream_ochat_response():
try: try:
# The chat method returns a generator of dicts (or GenerateResponse) # The chat method returns a generator of dicts (or GenerateResponse)
try: try:
async_gen = await oclient.chat.completions.create(**params) # For non-external endpoints (Ollama, llama-server), resolve remote
# image URLs to base64 data URLs so the server can handle them locally.
send_params = params
if not is_ext_openai_endpoint(endpoint):
resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
send_params = {**params, "messages": resolved_msgs}
async_gen = await oclient.chat.completions.create(**send_params)
except openai.BadRequestError as e: except openai.BadRequestError as e:
# If tools are not supported by the model, retry without tools # If tools are not supported by the model, retry without tools
if "does not support tools" in str(e): if "does not support tools" in str(e):
print(f"[openai_chat_completions_proxy] Model {model} doesn't support tools, retrying without tools") print(f"[openai_chat_completions_proxy] Model {model} doesn't support tools, retrying without tools")
params_without_tools = {k: v for k, v in params.items() if k != "tools"} params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
async_gen = await oclient.chat.completions.create(**params_without_tools) async_gen = await oclient.chat.completions.create(**params_without_tools)
else: else:
raise raise
@ -2856,7 +2910,7 @@ async def openai_chat_completions_proxy(request: Request):
if prompt_tok != 0 or comp_tok != 0: if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs # Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and content_parts: if _cache is not None and _cache_enabled and content_parts:
assembled = orjson.dumps({ assembled = orjson.dumps({
"model": model, "model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}], "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(content_parts)}, "finish_reason": "stop"}],
@ -2887,7 +2941,7 @@ async def openai_chat_completions_proxy(request: Request):
cache_bytes = json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes yield cache_bytes
# Cache non-streaming response # Cache non-streaming response
if _cache is not None: if _cache is not None and _cache_enabled:
try: try:
await _cache.set_chat("openai_chat", model, messages, cache_bytes) await _cache.set_chat("openai_chat", model, messages, cache_bytes)
except Exception as _ce: except Exception as _ce:
@ -2930,6 +2984,7 @@ async def openai_completions_proxy(request: Request):
max_tokens = payload.get("max_tokens") max_tokens = payload.get("max_tokens")
max_completion_tokens = payload.get("max_completion_tokens") max_completion_tokens = payload.get("max_completion_tokens")
suffix = payload.get("suffix") suffix = payload.get("suffix")
_cache_enabled = payload.get("nomyo", {}).get("cache", False)
if not model: if not model:
raise HTTPException( raise HTTPException(
@ -2970,7 +3025,7 @@ async def openai_completions_proxy(request: Request):
# Cache lookup — completions prompt mapped to a single-turn messages list # Cache lookup — completions prompt mapped to a single-turn messages list
_cache = get_llm_cache() _cache = get_llm_cache()
_compl_messages = [{"role": "user", "content": prompt}] _compl_messages = [{"role": "user", "content": prompt}]
if _cache is not None: if _cache is not None and _cache_enabled:
_cached = await _cache.get_chat("openai_completions", model, _compl_messages) _cached = await _cache.get_chat("openai_completions", model, _compl_messages)
if _cached is not None: if _cached is not None:
if stream: if stream:
@ -3029,7 +3084,7 @@ async def openai_completions_proxy(request: Request):
if prompt_tok != 0 or comp_tok != 0: if prompt_tok != 0 or comp_tok != 0:
await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok)) await token_queue.put((endpoint, tracking_model, prompt_tok, comp_tok))
# Cache assembled streaming response — before [DONE] so it always runs # Cache assembled streaming response — before [DONE] so it always runs
if _cache is not None and text_parts: if _cache is not None and _cache_enabled and text_parts:
assembled = orjson.dumps({ assembled = orjson.dumps({
"model": model, "model": model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}], "choices": [{"index": 0, "message": {"role": "assistant", "content": "".join(text_parts)}, "finish_reason": "stop"}],
@ -3061,7 +3116,7 @@ async def openai_completions_proxy(request: Request):
cache_bytes = json_line.encode("utf-8") + b"\n" cache_bytes = json_line.encode("utf-8") + b"\n"
yield cache_bytes yield cache_bytes
# Cache non-streaming response # Cache non-streaming response
if _cache is not None: if _cache is not None and _cache_enabled:
try: try:
await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes) await _cache.set_chat("openai_completions", model, _compl_messages, cache_bytes)
except Exception as _ce: except Exception as _ce: