mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
perf(prompt-cache): enable Azure prompt_cache_key routing hint
Splits the OpenAI-family gate into per-param predicates so AZURE and AZURE_OPENAI configs now receive prompt_cache_key for backend routing affinity (Microsoft auto-caches GPT-4o+ deployments at >=1024 tokens; the key clusters same-prefix requests on the same GPU pool and raises hit rate on turn 2+). prompt_cache_retention stays disabled for Azure because litellm 1.83.14's Azure transformer omits it from its supported params list, so litellm.drop_params would silently strip it; revisit when Azure's supported params list is updated.
This commit is contained in:
parent
71dead0406
commit
db8bffab38
2 changed files with 129 additions and 35 deletions
|
|
@ -39,9 +39,19 @@ For OpenAI-family configs we additionally pass:
|
||||||
|
|
||||||
- ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` — routing hint that
|
- ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` — routing hint that
|
||||||
raises hit rate by sending requests with a shared prefix to the same
|
raises hit rate by sending requests with a shared prefix to the same
|
||||||
backend.
|
backend. Supported by ``openai/``, ``deepseek/``, ``xai/``, and
|
||||||
|
``azure/`` (added to LiteLLM's Azure transformer in
|
||||||
|
https://github.com/BerriAI/litellm/pull/20989, Feb 2026; verified
|
||||||
|
against ``AzureOpenAIConfig.get_supported_openai_params`` in our
|
||||||
|
installed litellm 1.83.14 for ``azure/gpt-4o``, ``azure/gpt-4o-mini``,
|
||||||
|
``azure/gpt-5.4``, ``azure/gpt-5.4-mini``).
|
||||||
- ``prompt_cache_retention="24h"`` — extends cache TTL beyond the default
|
- ``prompt_cache_retention="24h"`` — extends cache TTL beyond the default
|
||||||
5-10 min in-memory cache.
|
5-10 min in-memory cache. Set ONLY for OpenAI/DeepSeek/xAI: Azure's
|
||||||
|
server-side support landed in Microsoft's docs on 2026-05-13 but
|
||||||
|
LiteLLM 1.83.14's Azure transformer still omits it from its supported
|
||||||
|
params list, so it gets silently dropped by ``litellm.drop_params``.
|
||||||
|
Azure's default in-memory retention (5-10 min, max 1 h) already
|
||||||
|
bridges intra-conversation turns; revisit when LiteLLM bumps Azure.
|
||||||
|
|
||||||
Safety net: ``litellm.drop_params=True`` is set globally in
|
Safety net: ``litellm.drop_params=True`` is set globally in
|
||||||
``app.services.llm_service`` at module-load time. Any kwarg the destination
|
``app.services.llm_service`` at module-load time. Any kwarg the destination
|
||||||
|
|
@ -81,13 +91,31 @@ _DEFAULT_INJECTION_POINTS: tuple[dict[str, Any], ...] = (
|
||||||
{"location": "message", "index": -1},
|
{"location": "message", "index": -1},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Providers (uppercase ``AgentConfig.provider`` values) that natively expose
|
# Providers (uppercase ``AgentConfig.provider`` values) that accept the
|
||||||
# OpenAI-style automatic prompt caching with ``prompt_cache_key`` and
|
# OpenAI ``prompt_cache_key`` routing hint. Microsoft's Azure OpenAI docs
|
||||||
# ``prompt_cache_retention`` kwargs. Strict whitelist — many other providers
|
# (2026-05-13) confirm automatic prompt caching applies to every GPT-4o
|
||||||
# in ``PROVIDER_MAP`` route through litellm's ``openai`` prefix without
|
# or newer Azure deployment at ≥1024 tokens with no configuration needed,
|
||||||
# implementing the OpenAI prompt-cache surface (e.g. MOONSHOT, ZHIPU,
|
# and that ``prompt_cache_key`` is combined with the prefix hash to
|
||||||
# MINIMAX), so we can't infer family from the litellm prefix alone.
|
# improve routing affinity and therefore cache hit rate. LiteLLM's Azure
|
||||||
_OPENAI_FAMILY_PROVIDERS: frozenset[str] = frozenset({"OPENAI", "DEEPSEEK", "XAI"})
|
# transformer ships ``prompt_cache_key`` in its supported params as of
|
||||||
|
# https://github.com/BerriAI/litellm/pull/20989.
|
||||||
|
#
|
||||||
|
# Strict whitelist — many other providers in ``PROVIDER_MAP`` route
|
||||||
|
# through litellm's ``openai`` prefix without implementing the OpenAI
|
||||||
|
# prompt-cache surface (e.g. MOONSHOT, ZHIPU, MINIMAX), so we can't infer
|
||||||
|
# family from the litellm prefix alone.
|
||||||
|
_PROMPT_CACHE_KEY_PROVIDERS: frozenset[str] = frozenset(
|
||||||
|
{"OPENAI", "DEEPSEEK", "XAI", "AZURE", "AZURE_OPENAI"}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Subset of ``_PROMPT_CACHE_KEY_PROVIDERS`` that also accept
|
||||||
|
# ``prompt_cache_retention="24h"``. Azure is excluded: see module
|
||||||
|
# docstring — LiteLLM 1.83.14's Azure transformer omits the param so
|
||||||
|
# ``drop_params`` silently strips it. Re-add Azure once a future LiteLLM
|
||||||
|
# release wires it into ``AzureOpenAIConfig.get_supported_openai_params``.
|
||||||
|
_PROMPT_CACHE_RETENTION_PROVIDERS: frozenset[str] = frozenset(
|
||||||
|
{"OPENAI", "DEEPSEEK", "XAI"}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _is_router_llm(llm: BaseChatModel) -> bool:
|
def _is_router_llm(llm: BaseChatModel) -> bool:
|
||||||
|
|
@ -101,13 +129,13 @@ def _is_router_llm(llm: BaseChatModel) -> bool:
|
||||||
return type(llm).__name__ == "ChatLiteLLMRouter"
|
return type(llm).__name__ == "ChatLiteLLMRouter"
|
||||||
|
|
||||||
|
|
||||||
def _is_openai_family_config(agent_config: AgentConfig | None) -> bool:
|
def _provider_supports_prompt_cache_key(agent_config: AgentConfig | None) -> bool:
|
||||||
"""Whether the config targets an OpenAI-style prompt-cache surface.
|
"""Whether the config targets a provider that accepts ``prompt_cache_key``.
|
||||||
|
|
||||||
Strict — only returns True when the user explicitly chose OPENAI,
|
Strict — only returns True for explicitly chosen OPENAI, DEEPSEEK,
|
||||||
DEEPSEEK, or XAI as the provider in their ``NewLLMConfig`` /
|
XAI, AZURE, or AZURE_OPENAI providers. Auto-mode and custom
|
||||||
``YAMLConfig``. Auto-mode and custom providers return False because
|
providers return False because we can't statically know the
|
||||||
we can't statically know the destination.
|
destination and the router fans out across mixed providers.
|
||||||
"""
|
"""
|
||||||
if agent_config is None or not agent_config.provider:
|
if agent_config is None or not agent_config.provider:
|
||||||
return False
|
return False
|
||||||
|
|
@ -115,7 +143,25 @@ def _is_openai_family_config(agent_config: AgentConfig | None) -> bool:
|
||||||
return False
|
return False
|
||||||
if agent_config.custom_provider:
|
if agent_config.custom_provider:
|
||||||
return False
|
return False
|
||||||
return agent_config.provider.upper() in _OPENAI_FAMILY_PROVIDERS
|
return agent_config.provider.upper() in _PROMPT_CACHE_KEY_PROVIDERS
|
||||||
|
|
||||||
|
|
||||||
|
def _provider_supports_prompt_cache_retention(
|
||||||
|
agent_config: AgentConfig | None,
|
||||||
|
) -> bool:
|
||||||
|
"""Whether the config targets a provider that accepts ``prompt_cache_retention``.
|
||||||
|
|
||||||
|
Tighter than :func:`_provider_supports_prompt_cache_key` — Azure
|
||||||
|
deployments are excluded until LiteLLM ships the param in its Azure
|
||||||
|
transformer (see module docstring).
|
||||||
|
"""
|
||||||
|
if agent_config is None or not agent_config.provider:
|
||||||
|
return False
|
||||||
|
if agent_config.is_auto_mode:
|
||||||
|
return False
|
||||||
|
if agent_config.custom_provider:
|
||||||
|
return False
|
||||||
|
return agent_config.provider.upper() in _PROMPT_CACHE_RETENTION_PROVIDERS
|
||||||
|
|
||||||
|
|
||||||
def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None:
|
def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None:
|
||||||
|
|
@ -173,16 +219,23 @@ def apply_litellm_prompt_caching(
|
||||||
dict(point) for point in _DEFAULT_INJECTION_POINTS
|
dict(point) for point in _DEFAULT_INJECTION_POINTS
|
||||||
]
|
]
|
||||||
|
|
||||||
# OpenAI-family extras only when we statically know the destination is
|
# OpenAI-style extras only when we statically know the destination
|
||||||
# OpenAI / DeepSeek / xAI. Auto-mode router fans out across providers
|
# accepts them. Auto-mode router fans out across mixed providers so
|
||||||
# so we can't safely set OpenAI-only kwargs there (drop_params would
|
# we can't safely set destination-specific kwargs there (drop_params
|
||||||
# strip them but it's wasteful to set them in the first place).
|
# would strip them but it's wasteful to set them in the first
|
||||||
|
# place).
|
||||||
if _is_router_llm(llm):
|
if _is_router_llm(llm):
|
||||||
return
|
return
|
||||||
if not _is_openai_family_config(agent_config):
|
|
||||||
return
|
|
||||||
|
|
||||||
if thread_id is not None and "prompt_cache_key" not in model_kwargs:
|
if (
|
||||||
|
thread_id is not None
|
||||||
|
and "prompt_cache_key" not in model_kwargs
|
||||||
|
and _provider_supports_prompt_cache_key(agent_config)
|
||||||
|
):
|
||||||
model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}"
|
model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}"
|
||||||
if "prompt_cache_retention" not in model_kwargs:
|
|
||||||
|
if (
|
||||||
|
"prompt_cache_retention" not in model_kwargs
|
||||||
|
and _provider_supports_prompt_cache_retention(agent_config)
|
||||||
|
):
|
||||||
model_kwargs["prompt_cache_retention"] = "24h"
|
model_kwargs["prompt_cache_retention"] = "24h"
|
||||||
|
|
|
||||||
|
|
@ -12,13 +12,19 @@ prompt caching. It mutates ``llm.model_kwargs`` so the kwargs flow to
|
||||||
the deepagent stack accumulates multiple ``SystemMessage``\ s in
|
the deepagent stack accumulates multiple ``SystemMessage``\ s in
|
||||||
``state["messages"]`` and ``role: system`` would tag every one of
|
``state["messages"]`` and ``role: system`` would tag every one of
|
||||||
them, blowing past Anthropic's 4-block ``cache_control`` cap.
|
them, blowing past Anthropic's 4-block ``cache_control`` cap.
|
||||||
2. Adds ``prompt_cache_key``/``prompt_cache_retention`` only for
|
2. Adds ``prompt_cache_key`` for OPENAI/DEEPSEEK/XAI/AZURE/AZURE_OPENAI
|
||||||
single-model OPENAI/DEEPSEEK/XAI configs (where OpenAI's automatic
|
configs (the param was added to LiteLLM's Azure transformer in
|
||||||
prompt-cache surface is available).
|
https://github.com/BerriAI/litellm/pull/20989, Feb 2026).
|
||||||
3. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
|
3. Adds ``prompt_cache_retention="24h"`` ONLY for OPENAI/DEEPSEEK/XAI.
|
||||||
OpenAI-only kwargs because the router fans out across providers.
|
Azure's server-side support landed in Microsoft's docs on 2026-05-13
|
||||||
4. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
|
but LiteLLM 1.83.14 hasn't wired it through yet, so we let Azure use
|
||||||
5. Defensive: LLMs without a writable ``model_kwargs`` are silently
|
its default in-memory retention rather than send a param that
|
||||||
|
``litellm.drop_params`` would silently strip.
|
||||||
|
4. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
|
||||||
|
destination-specific kwargs because the router fans out across
|
||||||
|
providers.
|
||||||
|
5. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
|
||||||
|
6. Defensive: LLMs without a writable ``model_kwargs`` are silently
|
||||||
skipped rather than raising.
|
skipped rather than raising.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -191,9 +197,9 @@ def test_does_not_overwrite_user_supplied_prompt_cache_key() -> None:
|
||||||
|
|
||||||
@pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"])
|
@pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"])
|
||||||
def test_sets_openai_family_extras(provider: str) -> None:
|
def test_sets_openai_family_extras(provider: str) -> None:
|
||||||
"""OpenAI-style providers gain ``prompt_cache_key`` (raises hit rate
|
"""Native OpenAI-style providers gain ``prompt_cache_key`` (raises
|
||||||
via routing affinity) and ``prompt_cache_retention="24h"`` (extends
|
hit rate via routing affinity) and ``prompt_cache_retention="24h"``
|
||||||
cache TTL beyond the default 5-10 min)."""
|
(extends cache TTL beyond the default 5-10 min)."""
|
||||||
cfg = _make_cfg(provider=provider)
|
cfg = _make_cfg(provider=provider)
|
||||||
llm = _FakeLLM()
|
llm = _FakeLLM()
|
||||||
|
|
||||||
|
|
@ -203,6 +209,27 @@ def test_sets_openai_family_extras(provider: str) -> None:
|
||||||
assert llm.model_kwargs["prompt_cache_retention"] == "24h"
|
assert llm.model_kwargs["prompt_cache_retention"] == "24h"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("provider", ["AZURE", "AZURE_OPENAI"])
|
||||||
|
def test_azure_gets_prompt_cache_key_only(provider: str) -> None:
|
||||||
|
"""Azure configs gain ``prompt_cache_key`` for routing affinity
|
||||||
|
(Microsoft auto-caches every GPT-4o+ deployment at ≥1024 tokens;
|
||||||
|
the key clusters same-prefix requests on the same backend GPU pool
|
||||||
|
so hit rate climbs). They DO NOT get ``prompt_cache_retention``
|
||||||
|
because LiteLLM 1.83.14's Azure transformer omits it from its
|
||||||
|
supported params list — ``drop_params`` would silently strip it.
|
||||||
|
Azure's default in-memory retention (5-10 min, max 1 h) is already
|
||||||
|
enough to cover intra-conversation turns; revisit when LiteLLM
|
||||||
|
bumps Azure to match its OpenAI surface."""
|
||||||
|
cfg = _make_cfg(provider=provider, model_name="gpt-5.4")
|
||||||
|
llm = _FakeLLM(model="azure/gpt-5.4")
|
||||||
|
|
||||||
|
apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)
|
||||||
|
|
||||||
|
assert llm.model_kwargs["prompt_cache_key"] == "surfsense-thread-42"
|
||||||
|
assert "prompt_cache_retention" not in llm.model_kwargs
|
||||||
|
assert "cache_control_injection_points" in llm.model_kwargs
|
||||||
|
|
||||||
|
|
||||||
def test_skips_prompt_cache_key_when_no_thread_id() -> None:
|
def test_skips_prompt_cache_key_when_no_thread_id() -> None:
|
||||||
"""Without a thread id we can't construct a per-thread key. Retention
|
"""Without a thread id we can't construct a per-thread key. Retention
|
||||||
is still useful so we set it (it's free)."""
|
is still useful so we set it (it's free)."""
|
||||||
|
|
@ -215,12 +242,26 @@ def test_skips_prompt_cache_key_when_no_thread_id() -> None:
|
||||||
assert llm.model_kwargs["prompt_cache_retention"] == "24h"
|
assert llm.model_kwargs["prompt_cache_retention"] == "24h"
|
||||||
|
|
||||||
|
|
||||||
|
def test_azure_skips_prompt_cache_key_when_no_thread_id() -> None:
|
||||||
|
"""Azure without a thread id ends up with no extras (retention is
|
||||||
|
skipped for Azure, the key needs a thread id) — universal injection points
|
||||||
|
still land."""
|
||||||
|
cfg = _make_cfg(provider="AZURE", model_name="gpt-5.4")
|
||||||
|
llm = _FakeLLM(model="azure/gpt-5.4")
|
||||||
|
|
||||||
|
apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=None)
|
||||||
|
|
||||||
|
assert "prompt_cache_key" not in llm.model_kwargs
|
||||||
|
assert "prompt_cache_retention" not in llm.model_kwargs
|
||||||
|
assert "cache_control_injection_points" in llm.model_kwargs
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"provider",
|
"provider",
|
||||||
["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"],
|
["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"],
|
||||||
)
|
)
|
||||||
def test_no_openai_extras_for_other_providers(provider: str) -> None:
|
def test_no_openai_extras_for_other_providers(provider: str) -> None:
|
||||||
"""Non-OpenAI-family providers don't expose ``prompt_cache_key`` —
|
"""Non-OpenAI-style providers don't expose ``prompt_cache_key`` —
|
||||||
skip it. ``cache_control_injection_points`` is still set (universal)."""
|
skip it. ``cache_control_injection_points`` is still set (universal)."""
|
||||||
cfg = _make_cfg(provider=provider)
|
cfg = _make_cfg(provider=provider)
|
||||||
llm = _FakeLLM()
|
llm = _FakeLLM()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue