perf(prompt-cache): enable Azure prompt_cache_key routing hint

Splits the OpenAI-family gate into per-param predicates so AZURE and
AZURE_OPENAI configs now receive prompt_cache_key for backend routing
affinity (Microsoft auto-caches GPT-4o+ deployments at >=1024 tokens;
the key clusters same-prefix requests on the same GPU pool and raises
hit rate on turn 2+). prompt_cache_retention stays opted out for Azure
because litellm 1.83.14's Azure transformer would drop it silently;
revisit when Azure's supported params list is updated.
This commit is contained in:
CREDO23 2026-05-20 11:58:15 +02:00
parent 71dead0406
commit db8bffab38
2 changed files with 129 additions and 35 deletions

View file

@ -12,13 +12,19 @@ prompt caching. It mutates ``llm.model_kwargs`` so the kwargs flow to
the deepagent stack accumulates multiple ``SystemMessage``\ s in
``state["messages"]`` and ``role: system`` would tag every one of
them, blowing past Anthropic's 4-block ``cache_control`` cap.
2. Adds ``prompt_cache_key``/``prompt_cache_retention`` only for
single-model OPENAI/DEEPSEEK/XAI configs (where OpenAI's automatic
prompt-cache surface is available).
3. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
OpenAI-only kwargs — because the router fans out across providers.
4. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
5. Defensive: LLMs without a writable ``model_kwargs`` are silently
2. Adds ``prompt_cache_key`` for OPENAI/DEEPSEEK/XAI/AZURE/AZURE_OPENAI
configs (Microsoft's Azure transformer was added to LiteLLM in
https://github.com/BerriAI/litellm/pull/20989, Feb 2026).
3. Adds ``prompt_cache_retention="24h"`` ONLY for OPENAI/DEEPSEEK/XAI.
Azure's server-side support landed in Microsoft's docs on 2026-05-13
but LiteLLM 1.83.14 hasn't wired it through yet, so we let Azure use
its default in-memory retention rather than send a param that
``litellm.drop_params`` would silently strip.
4. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
destination-specific kwargs — because the router fans out across
providers.
5. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
6. Defensive: LLMs without a writable ``model_kwargs`` are silently
skipped rather than raising.
"""
@ -191,9 +197,9 @@ def test_does_not_overwrite_user_supplied_prompt_cache_key() -> None:
@pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"])
def test_sets_openai_family_extras(provider: str) -> None:
"""OpenAI-style providers gain ``prompt_cache_key`` (raises hit rate
via routing affinity) and ``prompt_cache_retention="24h"`` (extends
cache TTL beyond the default 5-10 min)."""
"""Native OpenAI-style providers gain ``prompt_cache_key`` (raises
hit rate via routing affinity) and ``prompt_cache_retention="24h"``
(extends cache TTL beyond the default 5-10 min)."""
cfg = _make_cfg(provider=provider)
llm = _FakeLLM()
@ -203,6 +209,27 @@ def test_sets_openai_family_extras(provider: str) -> None:
assert llm.model_kwargs["prompt_cache_retention"] == "24h"
@pytest.mark.parametrize("provider", ["AZURE", "AZURE_OPENAI"])
def test_azure_gets_prompt_cache_key_only(provider: str) -> None:
    """Azure providers receive ``prompt_cache_key`` but never
    ``prompt_cache_retention``.

    The key is a routing-affinity hint: Microsoft auto-caches every
    GPT-4o+ deployment once a prompt reaches 1024 tokens, and the key
    clusters same-prefix requests on the same backend GPU pool so the
    hit rate climbs.

    Retention is deliberately left out because LiteLLM 1.83.14's Azure
    transformer omits it from its supported params list —
    ``drop_params`` would silently strip it. Azure's default in-memory
    retention (5-10 min, max 1 h) already covers intra-conversation
    turns; revisit when LiteLLM bumps Azure to match its OpenAI surface.
    """
    azure_cfg = _make_cfg(provider=provider, model_name="gpt-5.4")
    fake_llm = _FakeLLM(model="azure/gpt-5.4")

    apply_litellm_prompt_caching(fake_llm, agent_config=azure_cfg, thread_id=42)

    kwargs = fake_llm.model_kwargs
    # Per-thread routing key is present ...
    assert kwargs["prompt_cache_key"] == "surfsense-thread-42"
    # ... the retention hint is not ...
    assert "prompt_cache_retention" not in kwargs
    # ... and the provider-agnostic injection points still land.
    assert "cache_control_injection_points" in kwargs
def test_skips_prompt_cache_key_when_no_thread_id() -> None:
"""Without a thread id we can't construct a per-thread key. Retention
is still useful so we set it (it's free)."""
@ -215,12 +242,26 @@ def test_skips_prompt_cache_key_when_no_thread_id() -> None:
assert llm.model_kwargs["prompt_cache_retention"] == "24h"
def test_azure_skips_prompt_cache_key_when_no_thread_id() -> None:
    """An Azure config with no thread id gets neither OpenAI-style extra.

    Retention is skipped for Azure and the key cannot be built without
    a thread id — yet the universal injection points are still applied.
    """
    azure_cfg = _make_cfg(provider="AZURE", model_name="gpt-5.4")
    fake_llm = _FakeLLM(model="azure/gpt-5.4")

    apply_litellm_prompt_caching(fake_llm, agent_config=azure_cfg, thread_id=None)

    kwargs = fake_llm.model_kwargs
    # Checked in the same order as before: key first, then retention.
    for absent_param in ("prompt_cache_key", "prompt_cache_retention"):
        assert absent_param not in kwargs
    assert "cache_control_injection_points" in kwargs
@pytest.mark.parametrize(
"provider",
["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"],
)
def test_no_openai_extras_for_other_providers(provider: str) -> None:
"""Non-OpenAI-family providers don't expose ``prompt_cache_key`` —
"""Non-OpenAI-style providers don't expose ``prompt_cache_key`` —
skip it. ``cache_control_injection_points`` is still set (universal)."""
cfg = _make_cfg(provider=provider)
llm = _FakeLLM()