feat: prompt caching

- Updated `litellm` dependency version from `1.83.4` to `1.83.7`.
- Adjusted `aiohttp` version from `3.13.5` to `3.13.4` in the lock file.
- Implemented `apply_litellm_prompt_caching` in `chat_deepagent.py` to enable provider-side prompt caching via litellm's `cache_control_injection_points`.
- Added model name resolution logic in `chat_deepagent.py` to ensure correct provider-variant dispatch.
- Enhanced `llm_config.py` to configure prompt caching for various LLM providers.
- Updated tests to verify correct model name forwarding and prompt caching behavior.
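
The hunks below only show the router-side plumbing. As a rough, hypothetical sketch of what `apply_litellm_prompt_caching` might do: only `cache_control_injection_points` and `model_kwargs` are confirmed by the diff; the helper's signature and the injection-point value are assumptions based on litellm's documented prompt-caching parameter.

```python
def apply_litellm_prompt_caching(llm: "ChatLiteLLMRouter") -> None:
    """Sketch: store static litellm kwargs so every completion requests caching.

    cache_control_injection_points tells litellm where to attach cache_control
    blocks (here: on the system message) so providers such as Anthropic can
    reuse the cached prompt prefix across calls. The exact injection points
    used by the real helper are an assumption.
    """
    llm.model_kwargs.setdefault(
        "cache_control_injection_points",
        [{"location": "message", "role": "system"}],
    )
```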
Author: DESKTOP-RTLN3BA\$punk
Date: 2026-05-01 05:10:53 -07:00
Parent: 360b5f8e3a
Commit: e57c3a7d0c
12 changed files with 877 additions and 156 deletions

@@ -28,6 +28,7 @@ from litellm.exceptions import (
     BadRequestError as LiteLLMBadRequestError,
     ContextWindowExceededError,
 )
+from pydantic import Field
 from app.utils.perf import get_perf_logger
@@ -573,6 +574,11 @@ class ChatLiteLLMRouter(BaseChatModel):
     # Public attributes that Pydantic will manage
     model: str = "auto"
     streaming: bool = True
+    # Static kwargs that flow through to ``litellm.completion(...)`` on every
+    # invocation (e.g. ``cache_control_injection_points`` set by
+    # ``apply_litellm_prompt_caching``). Per-call ``**kwargs`` from
+    # ``invoke()`` still take precedence — see ``_generate``/``_astream``.
+    model_kwargs: dict[str, Any] = Field(default_factory=dict)
     # Bound tools and tool choice for tool calling
     _bound_tools: list[dict] | None = None
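
To make the data flow concrete, a minimal usage sketch; the litellm `Router` setup is elided, and `SystemMessage`/`HumanMessage` are the standard langchain-core message classes:

```python
from langchain_core.messages import HumanMessage, SystemMessage

llm = ChatLiteLLMRouter(router=router, model="auto")  # router: a configured litellm Router
llm.model_kwargs["cache_control_injection_points"] = [
    {"location": "message", "role": "system"}
]
# Every invoke() now merges this static kwarg into the litellm.completion call.
llm.invoke([SystemMessage("large static system prompt"), HumanMessage("hi")])
```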
@@ -898,13 +904,16 @@ class ChatLiteLLMRouter(BaseChatModel):
                 logger.warning(f"Failed to convert tool {tool}: {e}")
                 continue
-        # Create a new instance with tools bound
+        # Create a new instance with tools bound. Carry through ``model_kwargs``
+        # so static settings (e.g. cache_control_injection_points) survive the
+        # bind_tools rebuild.
         return ChatLiteLLMRouter(
             router=self._router,
             bound_tools=formatted_tools if formatted_tools else None,
             tool_choice=tool_choice,
             model=self.model,
             streaming=self.streaming,
+            model_kwargs=dict(self.model_kwargs),
             **kwargs,
         )
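
A quick check of why the `model_kwargs=dict(self.model_kwargs)` pass-through matters: without it, the rebuilt instance would start with an empty `model_kwargs` and silently drop the caching configuration. In the sketch below, `search_tool` is a placeholder for any langchain tool:

```python
llm_with_tools = llm.bind_tools([search_tool])
# The static caching kwargs survive the bind_tools rebuild.
assert llm_with_tools.model_kwargs == llm.model_kwargs
```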
@@ -929,8 +938,10 @@ class ChatLiteLLMRouter(BaseChatModel):
         formatted_messages = self._convert_messages(messages)
         formatted_messages = self._trim_messages_to_fit_context(formatted_messages)
-        # Add tools if bound
-        call_kwargs = {**kwargs}
+        # Merge static model_kwargs (e.g. cache_control_injection_points) under
+        # per-call kwargs so callers can still override per invocation. Then add
+        # bound tools.
+        call_kwargs = {**self.model_kwargs, **kwargs}
         if self._bound_tools:
             call_kwargs["tools"] = self._bound_tools
         if self._tool_choice is not None:
@@ -997,8 +1008,10 @@ class ChatLiteLLMRouter(BaseChatModel):
         formatted_messages = self._convert_messages(messages)
         formatted_messages = self._trim_messages_to_fit_context(formatted_messages)
-        # Add tools if bound
-        call_kwargs = {**kwargs}
+        # Merge static model_kwargs (e.g. cache_control_injection_points) under
+        # per-call kwargs so callers can still override per invocation. Then add
+        # bound tools.
+        call_kwargs = {**self.model_kwargs, **kwargs}
         if self._bound_tools:
             call_kwargs["tools"] = self._bound_tools
         if self._tool_choice is not None:
@@ -1060,8 +1073,10 @@ class ChatLiteLLMRouter(BaseChatModel):
         formatted_messages = self._convert_messages(messages)
         formatted_messages = self._trim_messages_to_fit_context(formatted_messages)
-        # Add tools if bound
-        call_kwargs = {**kwargs}
+        # Merge static model_kwargs (e.g. cache_control_injection_points) under
+        # per-call kwargs so callers can still override per invocation. Then add
+        # bound tools.
+        call_kwargs = {**self.model_kwargs, **kwargs}
         if self._bound_tools:
             call_kwargs["tools"] = self._bound_tools
         if self._tool_choice is not None:
@@ -1110,8 +1125,10 @@ class ChatLiteLLMRouter(BaseChatModel):
         formatted_messages = self._convert_messages(messages)
         formatted_messages = self._trim_messages_to_fit_context(formatted_messages)
-        # Add tools if bound
-        call_kwargs = {**kwargs}
+        # Merge static model_kwargs (e.g. cache_control_injection_points) under
+        # per-call kwargs so callers can still override per invocation. Then add
+        # bound tools.
+        call_kwargs = {**self.model_kwargs, **kwargs}
         if self._bound_tools:
             call_kwargs["tools"] = self._bound_tools
         if self._tool_choice is not None:
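
All four hunks apply the same two-line change to what appear to be the sync and async generate/stream paths. The precedence rule they implement reduces to plain dict merging, where later keys win; a minimal illustration:

```python
static = {"cache_control_injection_points": [{"location": "message", "role": "system"}]}
per_call = {"cache_control_injection_points": []}  # a caller opting out for one call
merged = {**static, **per_call}
assert merged["cache_control_injection_points"] == []  # per-call kwargs win
```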