perf(prompt-cache): enable Azure prompt_cache_key routing hint

Splits the OpenAI-family gate into per-param predicates so AZURE and
AZURE_OPENAI configs now receive prompt_cache_key for backend routing
affinity (Microsoft auto-caches GPT-4o+ deployments at >=1024 tokens;
the key clusters same-prefix requests on the same GPU pool and raises
hit rate on turn 2+). prompt_cache_retention stays opted out for Azure
because litellm 1.83.14's Azure transformer would drop it silently;
revisit when Azure's supported params list is updated.
This commit is contained in:
CREDO23 2026-05-20 11:58:15 +02:00
parent 71dead0406
commit db8bffab38
2 changed files with 129 additions and 35 deletions

View file

@ -39,9 +39,19 @@ For OpenAI-family configs we additionally pass:
- ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` routing hint that - ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` routing hint that
raises hit rate by sending requests with a shared prefix to the same raises hit rate by sending requests with a shared prefix to the same
backend. backend. Supported by ``openai/``, ``deepseek/``, ``xai/``, and
``azure/`` (added to LiteLLM's Azure transformer in
https://github.com/BerriAI/litellm/pull/20989, Feb 2026; verified
against ``AzureOpenAIConfig.get_supported_openai_params`` in our
installed litellm 1.83.14 for ``azure/gpt-4o``, ``azure/gpt-4o-mini``,
``azure/gpt-5.4``, ``azure/gpt-5.4-mini``).
- ``prompt_cache_retention="24h"`` extends cache TTL beyond the default - ``prompt_cache_retention="24h"`` extends cache TTL beyond the default
5-10 min in-memory cache. 5-10 min in-memory cache. Set ONLY for OpenAI/DeepSeek/xAI: Azure's
server-side support landed in Microsoft's docs on 2026-05-13 but
LiteLLM 1.83.14's Azure transformer still omits it from its supported
params list, so it gets silently dropped by ``litellm.drop_params``.
Azure's default in-memory retention (5-10 min, max 1 h) already
bridges intra-conversation turns; revisit when LiteLLM bumps Azure.
Safety net: ``litellm.drop_params=True`` is set globally in Safety net: ``litellm.drop_params=True`` is set globally in
``app.services.llm_service`` at module-load time. Any kwarg the destination ``app.services.llm_service`` at module-load time. Any kwarg the destination
@ -81,13 +91,31 @@ _DEFAULT_INJECTION_POINTS: tuple[dict[str, Any], ...] = (
{"location": "message", "index": -1}, {"location": "message", "index": -1},
) )
# Providers (uppercase ``AgentConfig.provider`` values) that natively expose # Providers (uppercase ``AgentConfig.provider`` values) that accept the
# OpenAI-style automatic prompt caching with ``prompt_cache_key`` and # OpenAI ``prompt_cache_key`` routing hint. Microsoft's Azure OpenAI docs
# ``prompt_cache_retention`` kwargs. Strict whitelist — many other providers # (2026-05-13) confirm automatic prompt caching applies to every GPT-4o
# in ``PROVIDER_MAP`` route through litellm's ``openai`` prefix without # or newer Azure deployment at ≥1024 tokens with no configuration needed,
# implementing the OpenAI prompt-cache surface (e.g. MOONSHOT, ZHIPU, # and that ``prompt_cache_key`` is combined with the prefix hash to
# MINIMAX), so we can't infer family from the litellm prefix alone. # improve routing affinity and therefore cache hit rate. LiteLLM's Azure
_OPENAI_FAMILY_PROVIDERS: frozenset[str] = frozenset({"OPENAI", "DEEPSEEK", "XAI"}) # transformer ships ``prompt_cache_key`` in its supported params as of
# https://github.com/BerriAI/litellm/pull/20989.
#
# Strict whitelist — many other providers in ``PROVIDER_MAP`` route
# through litellm's ``openai`` prefix without implementing the OpenAI
# prompt-cache surface (e.g. MOONSHOT, ZHIPU, MINIMAX), so we can't infer
# family from the litellm prefix alone.
_PROMPT_CACHE_KEY_PROVIDERS: frozenset[str] = frozenset(
{"OPENAI", "DEEPSEEK", "XAI", "AZURE", "AZURE_OPENAI"}
)
# Subset of ``_PROMPT_CACHE_KEY_PROVIDERS`` that also accept
# ``prompt_cache_retention="24h"``. Azure is excluded: see module
# docstring — LiteLLM 1.83.14's Azure transformer omits the param so
# ``drop_params`` silently strips it. Re-add Azure once a future LiteLLM
# release wires it into ``AzureOpenAIConfig.get_supported_openai_params``.
_PROMPT_CACHE_RETENTION_PROVIDERS: frozenset[str] = frozenset(
{"OPENAI", "DEEPSEEK", "XAI"}
)
def _is_router_llm(llm: BaseChatModel) -> bool: def _is_router_llm(llm: BaseChatModel) -> bool:
@ -101,13 +129,13 @@ def _is_router_llm(llm: BaseChatModel) -> bool:
return type(llm).__name__ == "ChatLiteLLMRouter" return type(llm).__name__ == "ChatLiteLLMRouter"
def _is_openai_family_config(agent_config: AgentConfig | None) -> bool: def _provider_supports_prompt_cache_key(agent_config: AgentConfig | None) -> bool:
"""Whether the config targets an OpenAI-style prompt-cache surface. """Whether the config targets a provider that accepts ``prompt_cache_key``.
Strict — only returns True when the user explicitly chose OPENAI, Strict — only returns True for explicitly chosen OPENAI, DEEPSEEK,
DEEPSEEK, or XAI as the provider in their ``NewLLMConfig`` / XAI, AZURE, or AZURE_OPENAI providers. Auto-mode and custom
``YAMLConfig``. Auto-mode and custom providers return False because providers return False because we can't statically know the
we can't statically know the destination. destination and the router fans out across mixed providers.
""" """
if agent_config is None or not agent_config.provider: if agent_config is None or not agent_config.provider:
return False return False
@ -115,7 +143,25 @@ def _is_openai_family_config(agent_config: AgentConfig | None) -> bool:
return False return False
if agent_config.custom_provider: if agent_config.custom_provider:
return False return False
return agent_config.provider.upper() in _OPENAI_FAMILY_PROVIDERS return agent_config.provider.upper() in _PROMPT_CACHE_KEY_PROVIDERS
def _provider_supports_prompt_cache_retention(
    agent_config: AgentConfig | None,
) -> bool:
    """Report whether ``prompt_cache_retention`` may be sent for this config.

    Narrower than the ``prompt_cache_key`` check: the provider must be a
    member of ``_PROMPT_CACHE_RETENTION_PROVIDERS``, which leaves Azure
    deployments out until LiteLLM's Azure transformer supports the param.

    Returns ``False`` when no config or provider is given, when the config
    is in auto mode, or when a custom provider is set.
    """
    if agent_config is None:
        return False
    provider = agent_config.provider
    # Guards run in the same order as before: provider, auto mode, custom.
    if not provider or agent_config.is_auto_mode or agent_config.custom_provider:
        return False
    return provider.upper() in _PROMPT_CACHE_RETENTION_PROVIDERS
def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None: def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None:
@ -173,16 +219,23 @@ def apply_litellm_prompt_caching(
dict(point) for point in _DEFAULT_INJECTION_POINTS dict(point) for point in _DEFAULT_INJECTION_POINTS
] ]
# OpenAI-family extras only when we statically know the destination is # OpenAI-style extras only when we statically know the destination
# OpenAI / DeepSeek / xAI. Auto-mode router fans out across providers # accepts them. Auto-mode router fans out across mixed providers so
# so we can't safely set OpenAI-only kwargs there (drop_params would # we can't safely set destination-specific kwargs there (drop_params
# strip them but it's wasteful to set them in the first place). # would strip them but it's wasteful to set them in the first
# place).
if _is_router_llm(llm): if _is_router_llm(llm):
return return
if not _is_openai_family_config(agent_config):
return
if thread_id is not None and "prompt_cache_key" not in model_kwargs: if (
thread_id is not None
and "prompt_cache_key" not in model_kwargs
and _provider_supports_prompt_cache_key(agent_config)
):
model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}" model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}"
if "prompt_cache_retention" not in model_kwargs:
if (
"prompt_cache_retention" not in model_kwargs
and _provider_supports_prompt_cache_retention(agent_config)
):
model_kwargs["prompt_cache_retention"] = "24h" model_kwargs["prompt_cache_retention"] = "24h"

View file

@ -12,13 +12,19 @@ prompt caching. It mutates ``llm.model_kwargs`` so the kwargs flow to
the deepagent stack accumulates multiple ``SystemMessage``\ s in the deepagent stack accumulates multiple ``SystemMessage``\ s in
``state["messages"]`` and ``role: system`` would tag every one of ``state["messages"]`` and ``role: system`` would tag every one of
them, blowing past Anthropic's 4-block ``cache_control`` cap. them, blowing past Anthropic's 4-block ``cache_control`` cap.
2. Adds ``prompt_cache_key``/``prompt_cache_retention`` only for 2. Adds ``prompt_cache_key`` for OPENAI/DEEPSEEK/XAI/AZURE/AZURE_OPENAI
single-model OPENAI/DEEPSEEK/XAI configs (where OpenAI's automatic configs (the key was added to LiteLLM's Azure transformer in
prompt-cache surface is available). https://github.com/BerriAI/litellm/pull/20989, Feb 2026).
3. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only no 3. Adds ``prompt_cache_retention="24h"`` ONLY for OPENAI/DEEPSEEK/XAI.
OpenAI-only kwargs because the router fans out across providers. Azure's server-side support landed in Microsoft's docs on 2026-05-13
4. Idempotent: user-supplied values in ``model_kwargs`` are preserved. but LiteLLM 1.83.14 hasn't wired it through yet, so we let Azure use
5. Defensive: LLMs without a writable ``model_kwargs`` are silently its default in-memory retention rather than send a param that
``litellm.drop_params`` would silently strip.
4. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
destination-specific kwargs because the router fans out across
providers.
5. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
6. Defensive: LLMs without a writable ``model_kwargs`` are silently
skipped rather than raising. skipped rather than raising.
""" """
@ -191,9 +197,9 @@ def test_does_not_overwrite_user_supplied_prompt_cache_key() -> None:
@pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"]) @pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"])
def test_sets_openai_family_extras(provider: str) -> None: def test_sets_openai_family_extras(provider: str) -> None:
"""OpenAI-style providers gain ``prompt_cache_key`` (raises hit rate """Native OpenAI-style providers gain ``prompt_cache_key`` (raises
via routing affinity) and ``prompt_cache_retention="24h"`` (extends hit rate via routing affinity) and ``prompt_cache_retention="24h"``
cache TTL beyond the default 5-10 min).""" (extends cache TTL beyond the default 5-10 min)."""
cfg = _make_cfg(provider=provider) cfg = _make_cfg(provider=provider)
llm = _FakeLLM() llm = _FakeLLM()
@ -203,6 +209,27 @@ def test_sets_openai_family_extras(provider: str) -> None:
assert llm.model_kwargs["prompt_cache_retention"] == "24h" assert llm.model_kwargs["prompt_cache_retention"] == "24h"
@pytest.mark.parametrize("provider", ["AZURE", "AZURE_OPENAI"])
def test_azure_gets_prompt_cache_key_only(provider: str) -> None:
    """Azure providers receive ``prompt_cache_key`` but not retention.

    The key is a routing-affinity hint: Microsoft auto-caches every
    GPT-4o+ deployment at ≥1024 tokens, and the key clusters same-prefix
    requests on the same backend so the hit rate climbs.

    ``prompt_cache_retention`` must stay unset — LiteLLM 1.83.14's Azure
    transformer omits it from its supported params list, so
    ``drop_params`` would silently strip it. Azure's default in-memory
    retention (5-10 min, max 1 h) already covers intra-conversation
    turns; revisit when LiteLLM brings Azure in line with its OpenAI
    surface.
    """
    cfg = _make_cfg(provider=provider, model_name="gpt-5.4")
    llm = _FakeLLM(model="azure/gpt-5.4")

    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)

    kwargs = llm.model_kwargs
    assert kwargs["prompt_cache_key"] == "surfsense-thread-42"
    assert "prompt_cache_retention" not in kwargs
    assert "cache_control_injection_points" in kwargs
def test_skips_prompt_cache_key_when_no_thread_id() -> None: def test_skips_prompt_cache_key_when_no_thread_id() -> None:
"""Without a thread id we can't construct a per-thread key. Retention """Without a thread id we can't construct a per-thread key. Retention
is still useful so we set it (it's free).""" is still useful so we set it (it's free)."""
@ -215,12 +242,26 @@ def test_skips_prompt_cache_key_when_no_thread_id() -> None:
assert llm.model_kwargs["prompt_cache_retention"] == "24h" assert llm.model_kwargs["prompt_cache_retention"] == "24h"
def test_azure_skips_prompt_cache_key_when_no_thread_id() -> None:
    """Azure with no thread id gets neither OpenAI-style extra.

    The key needs a thread id and retention is skipped for Azure, so only
    the universal ``cache_control_injection_points`` entry should land.
    """
    cfg = _make_cfg(provider="AZURE", model_name="gpt-5.4")
    llm = _FakeLLM(model="azure/gpt-5.4")

    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=None)

    kwargs = llm.model_kwargs
    for absent in ("prompt_cache_key", "prompt_cache_retention"):
        assert absent not in kwargs
    assert "cache_control_injection_points" in kwargs
@pytest.mark.parametrize( @pytest.mark.parametrize(
"provider", "provider",
["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"], ["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"],
) )
def test_no_openai_extras_for_other_providers(provider: str) -> None: def test_no_openai_extras_for_other_providers(provider: str) -> None:
"""Non-OpenAI-family providers don't expose ``prompt_cache_key`` — """Non-OpenAI-style providers don't expose ``prompt_cache_key`` —
skip it. ``cache_control_injection_points`` is still set (universal).""" skip it. ``cache_control_injection_points`` is still set (universal)."""
cfg = _make_cfg(provider=provider) cfg = _make_cfg(provider=provider)
llm = _FakeLLM() llm = _FakeLLM()