From db8bffab38bc1eb3b39e0c805d29ea960a7c3e08 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Wed, 20 May 2026 11:58:15 +0200 Subject: [PATCH] perf(prompt-cache): enable Azure prompt_cache_key routing hint Splits the OpenAI-family gate into per-param predicates so AZURE and AZURE_OPENAI configs now receive prompt_cache_key for backend routing affinity (Microsoft auto-caches GPT-4o+ deployments at >=1024 tokens; the key clusters same-prefix requests on the same GPU pool and raises hit rate on turn 2+). prompt_cache_retention stays opted out for Azure because litellm 1.83.14's Azure transformer would drop it silently; revisit when Azure's supported params list is updated. --- .../app/agents/new_chat/prompt_caching.py | 101 +++++++++++++----- .../agents/new_chat/test_prompt_caching.py | 63 +++++++++-- 2 files changed, 129 insertions(+), 35 deletions(-) diff --git a/surfsense_backend/app/agents/new_chat/prompt_caching.py b/surfsense_backend/app/agents/new_chat/prompt_caching.py index 9fe47cdac..b58a48266 100644 --- a/surfsense_backend/app/agents/new_chat/prompt_caching.py +++ b/surfsense_backend/app/agents/new_chat/prompt_caching.py @@ -39,9 +39,19 @@ For OpenAI-family configs we additionally pass: - ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` — routing hint that raises hit rate by sending requests with a shared prefix to the same - backend. + backend. Supported by ``openai/``, ``deepseek/``, ``xai/``, and + ``azure/`` (added to LiteLLM's Azure transformer in + https://github.com/BerriAI/litellm/pull/20989, Feb 2026; verified + against ``AzureOpenAIConfig.get_supported_openai_params`` in our + installed litellm 1.83.14 for ``azure/gpt-4o``, ``azure/gpt-4o-mini``, + ``azure/gpt-5.4``, ``azure/gpt-5.4-mini``). - ``prompt_cache_retention="24h"`` — extends cache TTL beyond the default - 5-10 min in-memory cache. + 5-10 min in-memory cache. 
Set ONLY for OpenAI/DeepSeek/xAI: Azure's + server-side support landed in Microsoft's docs on 2026-05-13 but + LiteLLM 1.83.14's Azure transformer still omits it from its supported + params list, so it gets silently dropped by ``litellm.drop_params``. + Azure's default in-memory retention (5-10 min, max 1 h) already + bridges intra-conversation turns; revisit when LiteLLM bumps Azure. Safety net: ``litellm.drop_params=True`` is set globally in ``app.services.llm_service`` at module-load time. Any kwarg the destination @@ -81,13 +91,31 @@ _DEFAULT_INJECTION_POINTS: tuple[dict[str, Any], ...] = ( {"location": "message", "index": -1}, ) -# Providers (uppercase ``AgentConfig.provider`` values) that natively expose -# OpenAI-style automatic prompt caching with ``prompt_cache_key`` and -# ``prompt_cache_retention`` kwargs. Strict whitelist — many other providers -# in ``PROVIDER_MAP`` route through litellm's ``openai`` prefix without -# implementing the OpenAI prompt-cache surface (e.g. MOONSHOT, ZHIPU, -# MINIMAX), so we can't infer family from the litellm prefix alone. -_OPENAI_FAMILY_PROVIDERS: frozenset[str] = frozenset({"OPENAI", "DEEPSEEK", "XAI"}) +# Providers (uppercase ``AgentConfig.provider`` values) that accept the +# OpenAI ``prompt_cache_key`` routing hint. Microsoft's Azure OpenAI docs +# (2026-05-13) confirm automatic prompt caching applies to every GPT-4o +# or newer Azure deployment at ≥1024 tokens with no configuration needed, +# and that ``prompt_cache_key`` is combined with the prefix hash to +# improve routing affinity and therefore cache hit rate. LiteLLM's Azure +# transformer ships ``prompt_cache_key`` in its supported params as of +# https://github.com/BerriAI/litellm/pull/20989. +# +# Strict whitelist — many other providers in ``PROVIDER_MAP`` route +# through litellm's ``openai`` prefix without implementing the OpenAI +# prompt-cache surface (e.g. MOONSHOT, ZHIPU, MINIMAX), so we can't infer +# family from the litellm prefix alone. 
+_PROMPT_CACHE_KEY_PROVIDERS: frozenset[str] = frozenset( + {"OPENAI", "DEEPSEEK", "XAI", "AZURE", "AZURE_OPENAI"} +) + +# Subset of ``_PROMPT_CACHE_KEY_PROVIDERS`` that also accept +# ``prompt_cache_retention="24h"``. Azure is excluded: see module +# docstring — LiteLLM 1.83.14's Azure transformer omits the param so +# ``drop_params`` silently strips it. Re-add Azure once a future LiteLLM +# release wires it into ``AzureOpenAIConfig.get_supported_openai_params``. +_PROMPT_CACHE_RETENTION_PROVIDERS: frozenset[str] = frozenset( + {"OPENAI", "DEEPSEEK", "XAI"} +) def _is_router_llm(llm: BaseChatModel) -> bool: @@ -101,13 +129,13 @@ def _is_router_llm(llm: BaseChatModel) -> bool: return type(llm).__name__ == "ChatLiteLLMRouter" -def _is_openai_family_config(agent_config: AgentConfig | None) -> bool: - """Whether the config targets an OpenAI-style prompt-cache surface. +def _provider_supports_prompt_cache_key(agent_config: AgentConfig | None) -> bool: + """Whether the config targets a provider that accepts ``prompt_cache_key``. - Strict — only returns True when the user explicitly chose OPENAI, - DEEPSEEK, or XAI as the provider in their ``NewLLMConfig`` / - ``YAMLConfig``. Auto-mode and custom providers return False because - we can't statically know the destination. + Strict — only returns True for explicitly chosen OPENAI, DEEPSEEK, + XAI, AZURE, or AZURE_OPENAI providers. Auto-mode and custom + providers return False because we can't statically know the + destination and the router fans out across mixed providers. 
""" if agent_config is None or not agent_config.provider: return False @@ -115,7 +143,25 @@ def _is_openai_family_config(agent_config: AgentConfig | None) -> bool: return False if agent_config.custom_provider: return False - return agent_config.provider.upper() in _OPENAI_FAMILY_PROVIDERS + return agent_config.provider.upper() in _PROMPT_CACHE_KEY_PROVIDERS + + +def _provider_supports_prompt_cache_retention( + agent_config: AgentConfig | None, +) -> bool: + """Whether the config targets a provider that accepts ``prompt_cache_retention``. + + Tighter than :func:`_provider_supports_prompt_cache_key` — Azure + deployments are excluded until LiteLLM ships the param in its Azure + transformer (see module docstring). + """ + if agent_config is None or not agent_config.provider: + return False + if agent_config.is_auto_mode: + return False + if agent_config.custom_provider: + return False + return agent_config.provider.upper() in _PROMPT_CACHE_RETENTION_PROVIDERS def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None: @@ -173,16 +219,23 @@ def apply_litellm_prompt_caching( dict(point) for point in _DEFAULT_INJECTION_POINTS ] - # OpenAI-family extras only when we statically know the destination is - # OpenAI / DeepSeek / xAI. Auto-mode router fans out across providers - # so we can't safely set OpenAI-only kwargs there (drop_params would - # strip them but it's wasteful to set them in the first place). + # OpenAI-style extras only when we statically know the destination + # accepts them. Auto-mode router fans out across mixed providers so + # we can't safely set destination-specific kwargs there (drop_params + # would strip them but it's wasteful to set them in the first + # place). 
if _is_router_llm(llm): return - if not _is_openai_family_config(agent_config): - return - if thread_id is not None and "prompt_cache_key" not in model_kwargs: + if ( + thread_id is not None + and "prompt_cache_key" not in model_kwargs + and _provider_supports_prompt_cache_key(agent_config) + ): model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}" - if "prompt_cache_retention" not in model_kwargs: + + if ( + "prompt_cache_retention" not in model_kwargs + and _provider_supports_prompt_cache_retention(agent_config) + ): model_kwargs["prompt_cache_retention"] = "24h" diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py index 4cf53969d..c3de15c58 100644 --- a/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py +++ b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py @@ -12,13 +12,19 @@ prompt caching. It mutates ``llm.model_kwargs`` so the kwargs flow to the deepagent stack accumulates multiple ``SystemMessage``\ s in ``state["messages"]`` and ``role: system`` would tag every one of them, blowing past Anthropic's 4-block ``cache_control`` cap. -2. Adds ``prompt_cache_key``/``prompt_cache_retention`` only for - single-model OPENAI/DEEPSEEK/XAI configs (where OpenAI's automatic - prompt-cache surface is available). -3. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no - OpenAI-only kwargs because the router fans out across providers. -4. Idempotent: user-supplied values in ``model_kwargs`` are preserved. -5. Defensive: LLMs without a writable ``model_kwargs`` are silently +2. Adds ``prompt_cache_key`` for OPENAI/DEEPSEEK/XAI/AZURE/AZURE_OPENAI + configs (support was added to LiteLLM's Azure transformer in + https://github.com/BerriAI/litellm/pull/20989, Feb 2026). +3. Adds ``prompt_cache_retention="24h"`` ONLY for OPENAI/DEEPSEEK/XAI. 
+ Azure's server-side support landed in Microsoft's docs on 2026-05-13 + but LiteLLM 1.83.14 hasn't wired it through yet, so we let Azure use + its default in-memory retention rather than send a param that + ``litellm.drop_params`` would silently strip. +4. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no + destination-specific kwargs because the router fans out across + providers. +5. Idempotent: user-supplied values in ``model_kwargs`` are preserved. +6. Defensive: LLMs without a writable ``model_kwargs`` are silently skipped rather than raising. """ @@ -191,9 +197,9 @@ def test_does_not_overwrite_user_supplied_prompt_cache_key() -> None: @pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"]) def test_sets_openai_family_extras(provider: str) -> None: - """OpenAI-style providers gain ``prompt_cache_key`` (raises hit rate - via routing affinity) and ``prompt_cache_retention="24h"`` (extends - cache TTL beyond the default 5-10 min).""" + """Native OpenAI-style providers gain ``prompt_cache_key`` (raises + hit rate via routing affinity) and ``prompt_cache_retention="24h"`` + (extends cache TTL beyond the default 5-10 min).""" cfg = _make_cfg(provider=provider) llm = _FakeLLM() @@ -203,6 +209,27 @@ def test_sets_openai_family_extras(provider: str) -> None: assert llm.model_kwargs["prompt_cache_retention"] == "24h" +@pytest.mark.parametrize("provider", ["AZURE", "AZURE_OPENAI"]) +def test_azure_gets_prompt_cache_key_only(provider: str) -> None: + """Azure configs gain ``prompt_cache_key`` for routing affinity + (Microsoft auto-caches every GPT-4o+ deployment at ≥1024 tokens; + the key clusters same-prefix requests on the same backend GPU pool + so hit rate climbs). They DO NOT get ``prompt_cache_retention`` + because LiteLLM 1.83.14's Azure transformer omits it from its + supported params list — ``drop_params`` would silently strip it. 
+ Azure's default in-memory retention (5-10 min, max 1 h) is already + enough to cover intra-conversation turns; revisit when LiteLLM + bumps Azure to match its OpenAI surface.""" + cfg = _make_cfg(provider=provider, model_name="gpt-5.4") + llm = _FakeLLM(model="azure/gpt-5.4") + + apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42) + + assert llm.model_kwargs["prompt_cache_key"] == "surfsense-thread-42" + assert "prompt_cache_retention" not in llm.model_kwargs + assert "cache_control_injection_points" in llm.model_kwargs + + def test_skips_prompt_cache_key_when_no_thread_id() -> None: """Without a thread id we can't construct a per-thread key. Retention is still useful so we set it (it's free).""" @@ -215,12 +242,26 @@ def test_skips_prompt_cache_key_when_no_thread_id() -> None: assert llm.model_kwargs["prompt_cache_retention"] == "24h" +def test_azure_skips_prompt_cache_key_when_no_thread_id() -> None: + """Azure without a thread id ends up with no extras (retention is + Azure-skipped, key needs a thread id) — universal injection points + still land.""" + cfg = _make_cfg(provider="AZURE", model_name="gpt-5.4") + llm = _FakeLLM(model="azure/gpt-5.4") + + apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=None) + + assert "prompt_cache_key" not in llm.model_kwargs + assert "prompt_cache_retention" not in llm.model_kwargs + assert "cache_control_injection_points" in llm.model_kwargs + + @pytest.mark.parametrize( "provider", ["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"], ) def test_no_openai_extras_for_other_providers(provider: str) -> None: - """Non-OpenAI-family providers don't expose ``prompt_cache_key`` — + """Non-OpenAI-style providers don't expose ``prompt_cache_key`` — skip it. ``cache_control_injection_points`` is still set (universal).""" cfg = _make_cfg(provider=provider) llm = _FakeLLM()