perf(prompt-cache): enable Azure prompt_cache_key routing hint

Splits the OpenAI-family gate into per-param predicates so AZURE and AZURE_OPENAI configs now receive prompt_cache_key for backend routing affinity (Microsoft auto-caches GPT-4o+ deployments at >=1024 tokens; the key clusters same-prefix requests on the same GPU pool and raises hit rate on turn 2+). prompt_cache_retention stays opted out for Azure because litellm 1.83.14's Azure transformer would drop it silently; revisit when Azure's supported params list is updated.
2026-07-12 22:42:13 +02:00 · 2026-05-20 11:58:15 +02:00 · 2026-05-20 11:58:15 +02:00 · db8bffab38
commit db8bffab38
parent 71dead0406
2 changed files with 129 additions and 35 deletions
--- a/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py
@ -12,13 +12,19 @@ prompt caching. It mutates ``llm.model_kwargs`` so the kwargs flow to
   the deepagent stack accumulates multiple ``SystemMessage``\ s in
   ``state["messages"]`` and ``role: system`` would tag every one of
   them, blowing past Anthropic's 4-block ``cache_control`` cap.
-2. Adds ``prompt_cache_key``/``prompt_cache_retention`` only for
-   single-model OPENAI/DEEPSEEK/XAI configs (where OpenAI's automatic
-   prompt-cache surface is available).
-3. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
-   OpenAI-only kwargs because the router fans out across providers.
-4. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
-5. Defensive: LLMs without a writable ``model_kwargs`` are silently
+2. Adds ``prompt_cache_key`` for OPENAI/DEEPSEEK/XAI/AZURE/AZURE_OPENAI
+   configs (Microsoft's Azure transformer was added to LiteLLM in
+   https://github.com/BerriAI/litellm/pull/20989, Feb 2026).
+3. Adds ``prompt_cache_retention="24h"`` ONLY for OPENAI/DEEPSEEK/XAI.
+   Azure's server-side support landed in Microsoft's docs on 2026-05-13
+   but LiteLLM 1.83.14 hasn't wired it through yet, so we let Azure use
+   its default in-memory retention rather than send a param that
+   ``litellm.drop_params`` would silently strip.
+4. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
+   destination-specific kwargs because the router fans out across
+   providers.
+5. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
+6. Defensive: LLMs without a writable ``model_kwargs`` are silently
   skipped rather than raising.
 """

@ -191,9 +197,9 @@ def test_does_not_overwrite_user_supplied_prompt_cache_key() -> None:

@pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"])
 def test_sets_openai_family_extras(provider: str) -> None:
-    """OpenAI-style providers gain ``prompt_cache_key`` (raises hit rate
-    via routing affinity) and ``prompt_cache_retention="24h"`` (extends
-    cache TTL beyond the default 5-10 min)."""
+    """Native OpenAI-style providers gain ``prompt_cache_key`` (raises
+    hit rate via routing affinity) and ``prompt_cache_retention="24h"``
+    (extends cache TTL beyond the default 5-10 min)."""
    cfg = _make_cfg(provider=provider)
    llm = _FakeLLM()

@ -203,6 +209,27 @@ def test_sets_openai_family_extras(provider: str) -> None:
    assert llm.model_kwargs["prompt_cache_retention"] == "24h"


+@pytest.mark.parametrize("provider", ["AZURE", "AZURE_OPENAI"])
+def test_azure_gets_prompt_cache_key_only(provider: str) -> None:
+    """Azure configs gain ``prompt_cache_key`` for routing affinity
+    (Microsoft auto-caches every GPT-4o+ deployment at ≥1024 tokens;
+    the key clusters same-prefix requests on the same backend GPU pool
+    so hit rate climbs). They DO NOT get ``prompt_cache_retention``
+    because LiteLLM 1.83.14's Azure transformer omits it from its
+    supported params list — ``drop_params`` would silently strip it.
+    Azure's default in-memory retention (5-10 min, max 1 h) is already
+    enough to cover intra-conversation turns; revisit when LiteLLM
+    bumps Azure to match its OpenAI surface."""
+    cfg = _make_cfg(provider=provider, model_name="gpt-5.4")
+    llm = _FakeLLM(model="azure/gpt-5.4")
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)
+
+    assert llm.model_kwargs["prompt_cache_key"] == "surfsense-thread-42"
+    assert "prompt_cache_retention" not in llm.model_kwargs
+    assert "cache_control_injection_points" in llm.model_kwargs
+
+
 def test_skips_prompt_cache_key_when_no_thread_id() -> None:
    """Without a thread id we can't construct a per-thread key. Retention
    is still useful so we set it (it's free)."""
@ -215,12 +242,26 @@ def test_skips_prompt_cache_key_when_no_thread_id() -> None:
    assert llm.model_kwargs["prompt_cache_retention"] == "24h"


+def test_azure_skips_prompt_cache_key_when_no_thread_id() -> None:
+    """Azure without a thread id ends up with no extras (retention is
+    Azure-skipped, key needs a thread id) — universal injection points
+    still land."""
+    cfg = _make_cfg(provider="AZURE", model_name="gpt-5.4")
+    llm = _FakeLLM(model="azure/gpt-5.4")
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=None)
+
+    assert "prompt_cache_key" not in llm.model_kwargs
+    assert "prompt_cache_retention" not in llm.model_kwargs
+    assert "cache_control_injection_points" in llm.model_kwargs
+
+
@pytest.mark.parametrize(
    "provider",
    ["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"],
 )
 def test_no_openai_extras_for_other_providers(provider: str) -> None:
-    """Non-OpenAI-family providers don't expose ``prompt_cache_key`` —
+    """Non-OpenAI-style providers don't expose ``prompt_cache_key`` —
    skip it. ``cache_control_injection_points`` is still set (universal)."""
    cfg = _make_cfg(provider=provider)
    llm = _FakeLLM()