mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
docs(agents): tighten docstrings and comments across agent module
Recursive pass over the agents module to make docstrings and inline comments concise and intent-oriented: drop narration that just restates the code, condense verbose module/function docstrings, and keep only the non-obvious "why" notes. No functional code changed.
This commit is contained in:
parent
620c378254
commit
a3d05f6418
16 changed files with 319 additions and 1055 deletions
|
|
@ -92,15 +92,9 @@ class SanitizedChatLiteLLM(ChatLiteLLM):
|
|||
yield chunk
|
||||
|
||||
|
||||
# Provider mapping for LiteLLM model string construction.
|
||||
#
|
||||
# Single source of truth lives in
|
||||
# :mod:`app.services.provider_capabilities` so the YAML loader (which
|
||||
# runs during ``app.config`` class-body init) can resolve provider
|
||||
# prefixes without dragging the agent / tools tree into module load
|
||||
# order. Re-exported here under the historical ``PROVIDER_MAP`` name
|
||||
# so existing callers (``llm_router_service``, ``image_gen_router_service``,
|
||||
# tests) keep working unchanged.
|
||||
# Re-exported under the historical name ``PROVIDER_MAP``. Source of truth lives
|
||||
# in provider_capabilities so the YAML loader can resolve prefixes during
|
||||
# app.config init without importing the agent/tools tree.
|
||||
from app.services.provider_capabilities import ( # noqa: E402
|
||||
_PROVIDER_PREFIX_MAP as PROVIDER_MAP,
|
||||
)
|
||||
|
|
@ -157,25 +151,14 @@ class AgentConfig:
|
|||
anonymous_enabled: bool = False
|
||||
quota_reserve_tokens: int | None = None
|
||||
|
||||
# Capability flag: best-effort True for the chat selector / catalog.
|
||||
# Resolved via :func:`provider_capabilities.derive_supports_image_input`
|
||||
# which prefers OpenRouter's ``architecture.input_modalities`` and
|
||||
# otherwise consults LiteLLM's authoritative model map. Default True
|
||||
# is the conservative-allow stance — the streaming-task safety net
|
||||
# (``is_known_text_only_chat_model``) is the *only* place a False
|
||||
# actually blocks a request. Setting this to False here without an
|
||||
# authoritative source would silently hide vision-capable models
|
||||
# (the regression we're fixing).
|
||||
# Default-allow: only the streaming safety net (is_known_text_only_chat_model)
|
||||
# actually blocks on False, so defaulting False would silently hide
|
||||
# vision-capable models. Resolved via derive_supports_image_input.
|
||||
supports_image_input: bool = True
|
||||
|
||||
@classmethod
|
||||
def from_auto_mode(cls) -> "AgentConfig":
|
||||
"""
|
||||
Create an AgentConfig for Auto mode (LiteLLM Router load balancing).
|
||||
|
||||
Returns:
|
||||
AgentConfig instance configured for Auto mode
|
||||
"""
|
||||
"""Build an AgentConfig for Auto mode (LiteLLM Router load balancing)."""
|
||||
return cls(
|
||||
provider="AUTO",
|
||||
model_name="auto",
|
||||
|
|
@ -193,27 +176,15 @@ class AgentConfig:
|
|||
is_premium=False,
|
||||
anonymous_enabled=False,
|
||||
quota_reserve_tokens=None,
|
||||
# Auto routes across the configured pool, which usually
|
||||
# contains at least one vision-capable deployment; the router
|
||||
# will surface a 404 from a non-vision deployment as a normal
|
||||
# ``allowed_fails`` event and fail over rather than blocking
|
||||
# the request outright.
|
||||
# Auto fails over across the pool, so a non-vision deployment's 404
|
||||
# is just an allowed_fails event rather than a hard block.
|
||||
supports_image_input=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_new_llm_config(cls, config) -> "AgentConfig":
|
||||
"""
|
||||
Create an AgentConfig from a NewLLMConfig database model.
|
||||
|
||||
Args:
|
||||
config: NewLLMConfig database model instance
|
||||
|
||||
Returns:
|
||||
AgentConfig instance
|
||||
"""
|
||||
# Lazy import to avoid pulling provider_capabilities (and its
|
||||
# transitive litellm import) into module-init order.
|
||||
"""Build an AgentConfig from a NewLLMConfig database model."""
|
||||
# Lazy import: keeps provider_capabilities (and litellm) out of init order.
|
||||
from app.services.provider_capabilities import derive_supports_image_input
|
||||
|
||||
provider_value = (
|
||||
|
|
@ -245,10 +216,8 @@ class AgentConfig:
|
|||
is_premium=False,
|
||||
anonymous_enabled=False,
|
||||
quota_reserve_tokens=None,
|
||||
# BYOK rows have no operator-curated capability flag, so we
|
||||
# ask LiteLLM (default-allow on unknown). The streaming
|
||||
# safety net still blocks if the model is *explicitly*
|
||||
# marked text-only.
|
||||
# BYOK rows have no curated flag; ask LiteLLM (default-allow on
|
||||
# unknown). The streaming safety net still blocks explicit text-only.
|
||||
supports_image_input=derive_supports_image_input(
|
||||
provider=provider_value,
|
||||
model_name=config.model_name,
|
||||
|
|
@ -259,25 +228,14 @@ class AgentConfig:
|
|||
|
||||
@classmethod
|
||||
def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig":
|
||||
"""Build an AgentConfig from a YAML configuration dictionary.
|
||||
|
||||
Supports the same prompt fields as NewLLMConfig (system_instructions,
|
||||
use_default_system_instructions, citations_enabled).
|
||||
"""
|
||||
Create an AgentConfig from a YAML configuration dictionary.
|
||||
|
||||
YAML configs now support the same prompt configuration fields as NewLLMConfig:
|
||||
- system_instructions: Custom system instructions (empty string uses defaults)
|
||||
- use_default_system_instructions: Whether to use default instructions
|
||||
- citations_enabled: Whether citations are enabled
|
||||
|
||||
Args:
|
||||
yaml_config: Configuration dictionary from YAML file
|
||||
|
||||
Returns:
|
||||
AgentConfig instance
|
||||
"""
|
||||
# Lazy import to avoid pulling provider_capabilities (and its
|
||||
# transitive litellm import) into module-init order.
|
||||
# Lazy import: keeps provider_capabilities (and litellm) out of init order.
|
||||
from app.services.provider_capabilities import derive_supports_image_input
|
||||
|
||||
# Get system instructions from YAML, default to empty string
|
||||
system_instructions = yaml_config.get("system_instructions", "")
|
||||
|
||||
provider = yaml_config.get("provider", "").upper()
|
||||
|
|
@ -290,13 +248,8 @@ class AgentConfig:
|
|||
else None
|
||||
)
|
||||
|
||||
# Explicit YAML override wins; otherwise derive from LiteLLM /
|
||||
# OpenRouter modalities. The YAML loader already populates this
|
||||
# field, but this method is also called from
|
||||
# ``load_global_llm_config_by_id``'s file fallback (hot reload),
|
||||
# so we re-derive here for safety. The bool() coercion preserves
|
||||
# the loader's behaviour for explicit ``true`` / ``false``
|
||||
# strings that PyYAML may surface.
|
||||
# Explicit YAML override wins; otherwise re-derive (the hot-reload file
|
||||
# fallback reaches this method without the loader having populated it).
|
||||
if "supports_image_input" in yaml_config:
|
||||
supports_image_input = bool(yaml_config.get("supports_image_input"))
|
||||
else:
|
||||
|
|
@ -314,7 +267,6 @@ class AgentConfig:
|
|||
api_base=yaml_config.get("api_base"),
|
||||
custom_provider=custom_provider,
|
||||
litellm_params=yaml_config.get("litellm_params"),
|
||||
# Prompt configuration from YAML (with defaults for backwards compatibility)
|
||||
system_instructions=system_instructions if system_instructions else None,
|
||||
use_default_system_instructions=yaml_config.get(
|
||||
"use_default_system_instructions", True
|
||||
|
|
@ -332,20 +284,10 @@ class AgentConfig:
|
|||
|
||||
|
||||
def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None:
|
||||
"""
|
||||
Load a specific LLM config from global_llm_config.yaml.
|
||||
|
||||
Args:
|
||||
llm_config_id: The id of the config to load (default: -1)
|
||||
|
||||
Returns:
|
||||
LLM config dict or None if not found
|
||||
"""
|
||||
# Get the config file path
|
||||
"""Load a specific LLM config from global_llm_config.yaml."""
|
||||
base_dir = Path(__file__).resolve().parent.parent.parent.parent
|
||||
config_file = base_dir / "app" / "config" / "global_llm_config.yaml"
|
||||
|
||||
# Fallback to example file if main config doesn't exist
|
||||
if not config_file.exists():
|
||||
config_file = base_dir / "app" / "config" / "global_llm_config.example.yaml"
|
||||
if not config_file.exists():
|
||||
|
|
@ -368,24 +310,17 @@ def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None:
|
|||
|
||||
|
||||
def load_global_llm_config_by_id(llm_config_id: int) -> dict | None:
|
||||
"""
|
||||
Load a global LLM config by ID, checking in-memory configs first.
|
||||
"""Load a global LLM config by ID, checking in-memory configs first.
|
||||
|
||||
This handles both static YAML configs and dynamically injected configs
|
||||
(e.g. OpenRouter integration models that only exist in memory).
|
||||
|
||||
Args:
|
||||
llm_config_id: The negative ID of the global config to load
|
||||
|
||||
Returns:
|
||||
LLM config dict or None if not found
|
||||
In-memory covers both static YAML and dynamically injected configs (e.g.
|
||||
OpenRouter integration models that only exist in memory).
|
||||
"""
|
||||
from app.config import config as app_config
|
||||
|
||||
for cfg in app_config.GLOBAL_LLM_CONFIGS:
|
||||
if cfg.get("id") == llm_config_id:
|
||||
return cfg
|
||||
# Fallback to YAML file read (covers edge cases like hot-reload)
|
||||
# Fall back to a YAML file read (covers hot-reload edge cases).
|
||||
return load_llm_config_from_yaml(llm_config_id)
|
||||
|
||||
|
||||
|
|
@ -393,17 +328,7 @@ async def load_new_llm_config_from_db(
|
|||
session: AsyncSession,
|
||||
config_id: int,
|
||||
) -> "AgentConfig | None":
|
||||
"""
|
||||
Load a NewLLMConfig from the database by ID.
|
||||
|
||||
Args:
|
||||
session: AsyncSession for database access
|
||||
config_id: The ID of the NewLLMConfig to load
|
||||
|
||||
Returns:
|
||||
AgentConfig instance or None if not found
|
||||
"""
|
||||
# Import here to avoid circular imports
|
||||
"""Load a NewLLMConfig from the database by ID."""
|
||||
from app.db import NewLLMConfig
|
||||
|
||||
try:
|
||||
|
|
@ -426,26 +351,13 @@ async def load_agent_llm_config_for_search_space(
|
|||
session: AsyncSession,
|
||||
search_space_id: int,
|
||||
) -> "AgentConfig | None":
|
||||
"""Load the agent LLM config for a search space via its agent_llm_id.
|
||||
|
||||
Positive id -> DB; negative -> YAML; None -> first global config (-1).
|
||||
"""
|
||||
Load the agent LLM configuration for a search space.
|
||||
|
||||
This loads the LLM config based on the search space's agent_llm_id setting:
|
||||
- Positive ID: Load from NewLLMConfig database table
|
||||
- Negative ID: Load from YAML global configs
|
||||
- None: Falls back to first global config (id=-1)
|
||||
|
||||
Args:
|
||||
session: AsyncSession for database access
|
||||
search_space_id: The search space ID
|
||||
|
||||
Returns:
|
||||
AgentConfig instance or None if not found
|
||||
"""
|
||||
# Import here to avoid circular imports
|
||||
from app.db import SearchSpace
|
||||
|
||||
try:
|
||||
# Get the search space to check its agent_llm_id preference
|
||||
result = await session.execute(
|
||||
select(SearchSpace).filter(SearchSpace.id == search_space_id)
|
||||
)
|
||||
|
|
@ -455,12 +367,9 @@ async def load_agent_llm_config_for_search_space(
|
|||
print(f"Error: SearchSpace with id {search_space_id} not found")
|
||||
return None
|
||||
|
||||
# Use agent_llm_id from search space, fallback to -1 (first global config)
|
||||
config_id = (
|
||||
search_space.agent_llm_id if search_space.agent_llm_id is not None else -1
|
||||
)
|
||||
|
||||
# Load the config using the unified loader
|
||||
return await load_agent_config(session, config_id, search_space_id)
|
||||
except Exception as e:
|
||||
print(f"Error loading agent LLM config for search space {search_space_id}: {e}")
|
||||
|
|
@ -472,23 +381,7 @@ async def load_agent_config(
|
|||
config_id: int,
|
||||
search_space_id: int | None = None,
|
||||
) -> "AgentConfig | None":
|
||||
"""
|
||||
Load an agent configuration, supporting Auto mode, YAML, and database configs.
|
||||
|
||||
This is the main entry point for loading configurations:
|
||||
- ID 0: Auto mode (uses LiteLLM Router for load balancing)
|
||||
- Negative IDs: Load from YAML file (global configs)
|
||||
- Positive IDs: Load from NewLLMConfig database table
|
||||
|
||||
Args:
|
||||
session: AsyncSession for database access
|
||||
config_id: The config ID (0 for Auto, negative for YAML, positive for database)
|
||||
search_space_id: Optional search space ID for context
|
||||
|
||||
Returns:
|
||||
AgentConfig instance or None if not found
|
||||
"""
|
||||
# Auto mode (ID 0) - use LiteLLM Router
|
||||
"""Main config loader: id 0 -> Auto mode; negative -> YAML; positive -> DB."""
|
||||
if is_auto_mode(config_id):
|
||||
if not LLMRouterService.is_initialized():
|
||||
print("Error: Auto mode requested but LLM Router not initialized")
|
||||
|
|
@ -496,33 +389,22 @@ async def load_agent_config(
|
|||
return AgentConfig.from_auto_mode()
|
||||
|
||||
if config_id < 0:
|
||||
# Check in-memory configs first (includes static YAML + dynamic OpenRouter)
|
||||
# In-memory covers static YAML + dynamic OpenRouter configs.
|
||||
from app.config import config as app_config
|
||||
|
||||
for cfg in app_config.GLOBAL_LLM_CONFIGS:
|
||||
if cfg.get("id") == config_id:
|
||||
return AgentConfig.from_yaml_config(cfg)
|
||||
# Fallback to YAML file read for safety
|
||||
yaml_config = load_llm_config_from_yaml(config_id)
|
||||
if yaml_config:
|
||||
return AgentConfig.from_yaml_config(yaml_config)
|
||||
return None
|
||||
else:
|
||||
# Load from database (NewLLMConfig)
|
||||
return await load_new_llm_config_from_db(session, config_id)
|
||||
|
||||
|
||||
def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
|
||||
"""
|
||||
Create a ChatLiteLLM instance from a global LLM config dictionary.
|
||||
|
||||
Args:
|
||||
llm_config: LLM configuration dictionary from YAML
|
||||
|
||||
Returns:
|
||||
ChatLiteLLM instance or None on error
|
||||
"""
|
||||
# Build the model string
|
||||
"""Create a ChatLiteLLM instance from a global LLM config dictionary."""
|
||||
if llm_config.get("custom_provider"):
|
||||
model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}"
|
||||
else:
|
||||
|
|
@ -530,27 +412,20 @@ def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
|
|||
provider_prefix = PROVIDER_MAP.get(provider, provider.lower())
|
||||
model_string = f"{provider_prefix}/{llm_config['model_name']}"
|
||||
|
||||
# Create ChatLiteLLM instance with streaming enabled
|
||||
litellm_kwargs = {
|
||||
"model": model_string,
|
||||
"api_key": llm_config.get("api_key"),
|
||||
"streaming": True, # Enable streaming for real-time token streaming
|
||||
"streaming": True,
|
||||
}
|
||||
|
||||
# Add optional parameters
|
||||
if llm_config.get("api_base"):
|
||||
litellm_kwargs["api_base"] = llm_config["api_base"]
|
||||
|
||||
# Add any additional litellm parameters
|
||||
if llm_config.get("litellm_params"):
|
||||
litellm_kwargs.update(llm_config["litellm_params"])
|
||||
|
||||
llm = SanitizedChatLiteLLM(**litellm_kwargs)
|
||||
_attach_model_profile(llm, model_string)
|
||||
# Configure LiteLLM-native prompt caching (cache_control_injection_points
|
||||
# for Anthropic/Bedrock/Vertex/Gemini/Azure-AI/OpenRouter/Databricks/etc.).
|
||||
# ``agent_config=None`` here — the YAML path doesn't have provider intent
|
||||
# in a structured form, so we set only the universal injection points.
|
||||
# agent_config=None: the YAML path lacks structured provider intent, so set
|
||||
# only the universal cache_control_injection_points.
|
||||
apply_litellm_prompt_caching(llm)
|
||||
return llm
|
||||
|
||||
|
|
@ -558,19 +433,7 @@ def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
|
|||
def create_chat_litellm_from_agent_config(
|
||||
agent_config: AgentConfig,
|
||||
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
|
||||
"""
|
||||
Create a ChatLiteLLM or ChatLiteLLMRouter instance from an AgentConfig.
|
||||
|
||||
For Auto mode configs, returns a ChatLiteLLMRouter that uses LiteLLM Router
|
||||
for automatic load balancing across available providers.
|
||||
|
||||
Args:
|
||||
agent_config: AgentConfig instance
|
||||
|
||||
Returns:
|
||||
ChatLiteLLM or ChatLiteLLMRouter instance, or None on error
|
||||
"""
|
||||
# Handle Auto mode - return ChatLiteLLMRouter
|
||||
"""Create a ChatLiteLLM (or, for Auto mode, a load-balancing router) from config."""
|
||||
if agent_config.is_auto_mode:
|
||||
if not LLMRouterService.is_initialized():
|
||||
print("Error: Auto mode requested but LLM Router not initialized")
|
||||
|
|
@ -578,19 +441,14 @@ def create_chat_litellm_from_agent_config(
|
|||
try:
|
||||
router_llm = get_auto_mode_llm()
|
||||
if router_llm is not None:
|
||||
# Universal cache_control_injection_points only — auto-mode
|
||||
# fans out across providers, so OpenAI-only kwargs (e.g.
|
||||
# ``prompt_cache_key``) are left off here. ``drop_params``
|
||||
# would strip them at the provider boundary anyway, but
|
||||
# there's no point setting them when we don't know the
|
||||
# destination.
|
||||
# Universal injection points only: auto-mode fans out across
|
||||
# providers, so provider-specific kwargs have no known target.
|
||||
apply_litellm_prompt_caching(router_llm, agent_config=agent_config)
|
||||
return router_llm
|
||||
except Exception as e:
|
||||
print(f"Error creating ChatLiteLLMRouter: {e}")
|
||||
return None
|
||||
|
||||
# Build the model string
|
||||
if agent_config.custom_provider:
|
||||
model_string = f"{agent_config.custom_provider}/{agent_config.model_name}"
|
||||
else:
|
||||
|
|
@ -599,26 +457,19 @@ def create_chat_litellm_from_agent_config(
|
|||
)
|
||||
model_string = f"{provider_prefix}/{agent_config.model_name}"
|
||||
|
||||
# Create ChatLiteLLM instance with streaming enabled
|
||||
litellm_kwargs = {
|
||||
"model": model_string,
|
||||
"api_key": agent_config.api_key,
|
||||
"streaming": True, # Enable streaming for real-time token streaming
|
||||
"streaming": True,
|
||||
}
|
||||
|
||||
# Add optional parameters
|
||||
if agent_config.api_base:
|
||||
litellm_kwargs["api_base"] = agent_config.api_base
|
||||
|
||||
# Add any additional litellm parameters
|
||||
if agent_config.litellm_params:
|
||||
litellm_kwargs.update(agent_config.litellm_params)
|
||||
|
||||
llm = SanitizedChatLiteLLM(**litellm_kwargs)
|
||||
_attach_model_profile(llm, model_string)
|
||||
# Build-time prompt caching: sets ``cache_control_injection_points`` for
|
||||
# all providers and (for OpenAI/DeepSeek/xAI) ``prompt_cache_retention``.
|
||||
# Per-thread ``prompt_cache_key`` is layered on later in
|
||||
# ``create_surfsense_deep_agent`` once ``thread_id`` is known.
|
||||
# Build-time caching only; the per-thread prompt_cache_key is layered on
|
||||
# later in create_surfsense_deep_agent once thread_id is known.
|
||||
apply_litellm_prompt_caching(llm, agent_config=agent_config)
|
||||
return llm
|
||||
|
|
|
|||
|
|
@ -1,63 +1,28 @@
|
|||
r"""LiteLLM-native prompt caching configuration for SurfSense agents.
|
||||
r"""LiteLLM-native prompt caching for SurfSense agents.
|
||||
|
||||
Replaces the legacy ``AnthropicPromptCachingMiddleware`` (which never
|
||||
activated for our LiteLLM-based stack — its ``isinstance(model, ChatAnthropic)``
|
||||
gate always failed) with LiteLLM's universal caching mechanism.
|
||||
Replaces the legacy ``AnthropicPromptCachingMiddleware`` (its
|
||||
``isinstance(model, ChatAnthropic)`` gate never matched our LiteLLM stack)
|
||||
with LiteLLM's universal ``cache_control_injection_points`` mechanism, which
|
||||
covers the Anthropic/Bedrock/Vertex/Gemini/OpenRouter/etc. marker-based
|
||||
providers and the auto-caching OpenAI family.
|
||||
|
||||
Coverage:
|
||||
Two breakpoints per request:
|
||||
|
||||
- Marker-based providers (need ``cache_control`` injection, which LiteLLM
|
||||
performs automatically when ``cache_control_injection_points`` is set):
|
||||
``anthropic/``, ``bedrock/``, ``vertex_ai/``, ``gemini/``, ``azure_ai/``,
|
||||
``openrouter/`` (Claude/Gemini/MiniMax/GLM/z-ai routes), ``databricks/``
|
||||
(Claude), ``dashscope/`` (Qwen), ``minimax/``, ``zai/`` (GLM).
|
||||
- Auto-cached (LiteLLM strips the marker silently): ``openai/``,
|
||||
``deepseek/``, ``xai/`` — these cache automatically for prompts ≥1024
|
||||
tokens and surface ``prompt_cache_key`` / ``prompt_cache_retention``.
|
||||
- ``index: 0`` pins the head-of-request system prompt. We use ``index: 0``,
|
||||
NOT ``role: system``: ``before_agent`` injectors accumulate many
|
||||
SystemMessages, and tagging all of them overflows Anthropic's 4-block cap
|
||||
(upstream 400 via OpenRouter).
|
||||
- ``index: -1`` pins the latest message so longest-prefix lookup compounds
|
||||
multi-turn savings.
|
||||
|
||||
We inject **two** breakpoints per request:
|
||||
OpenAI-family configs also get ``prompt_cache_key`` (per-thread routing hint)
|
||||
and ``prompt_cache_retention="24h"``. Azure is excluded from the latter
|
||||
because LiteLLM's Azure transformer drops it (see
|
||||
``_PROMPT_CACHE_RETENTION_PROVIDERS``).
|
||||
|
||||
- ``index: 0`` — pins the SurfSense system prompt at the head of the
|
||||
request (provider variant, citation rules, tool catalog, KB tree,
|
||||
skills metadata). The langchain agent factory always prepends
|
||||
``request.system_message`` at index 0 (see ``factory.py``
|
||||
``_execute_model_async``), so this targets exactly the main system
|
||||
prompt regardless of how many other ``SystemMessage``\ s the
|
||||
``before_agent`` injectors (priority, tree, memory, file-intent,
|
||||
anonymous-doc) have inserted into ``state["messages"]``. Using
|
||||
``role: system`` here would apply ``cache_control`` to **every**
|
||||
system-role message and trip Anthropic's hard cap of 4 cache
|
||||
breakpoints per request once the conversation accumulates enough
|
||||
injected system messages — which surfaces as the upstream 400
|
||||
``A maximum of 4 blocks with cache_control may be provided. Found N``
|
||||
via OpenRouter→Anthropic.
|
||||
- ``index: -1`` — pins the latest message so multi-turn savings compound:
|
||||
Anthropic-family providers use longest-matching-prefix lookup, so turn
|
||||
N+1 still reads turn N's cache up to the shared prefix.
|
||||
|
||||
For OpenAI-family configs we additionally pass:
|
||||
|
||||
- ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` — routing hint that
|
||||
raises hit rate by sending requests with a shared prefix to the same
|
||||
backend. Supported by ``openai/``, ``deepseek/``, ``xai/``, and
|
||||
``azure/`` (added to LiteLLM's Azure transformer in
|
||||
https://github.com/BerriAI/litellm/pull/20989, Feb 2026; verified
|
||||
against ``AzureOpenAIConfig.get_supported_openai_params`` in our
|
||||
installed litellm 1.83.14 for ``azure/gpt-4o``, ``azure/gpt-4o-mini``,
|
||||
``azure/gpt-5.4``, ``azure/gpt-5.4-mini``).
|
||||
- ``prompt_cache_retention="24h"`` — extends cache TTL beyond the default
|
||||
5-10 min in-memory cache. Set ONLY for OpenAI/DeepSeek/xAI: Azure's
|
||||
server-side support landed in Microsoft's docs on 2026-05-13 but
|
||||
LiteLLM 1.83.14's Azure transformer still omits it from its supported
|
||||
params list, so it gets silently dropped by ``litellm.drop_params``.
|
||||
Azure's default in-memory retention (5-10 min, max 1 h) already
|
||||
bridges intra-conversation turns; revisit when LiteLLM bumps Azure.
|
||||
|
||||
Safety net: ``litellm.drop_params=True`` is set globally in
|
||||
``app.services.llm_service`` at module-load time. Any kwarg the destination
|
||||
provider doesn't recognise is auto-stripped at the provider transformer
|
||||
layer, so an OpenAI→Bedrock auto-mode fallback can't 400 on
|
||||
``prompt_cache_key`` etc.
|
||||
Safety net: ``litellm.drop_params=True`` (set in ``app.services.llm_service``)
|
||||
strips any kwarg the destination provider rejects, so an auto-mode fallback
|
||||
can't 400 on these extras.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -73,57 +38,29 @@ if TYPE_CHECKING:
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Two-breakpoint policy: head-of-request + latest message. See module
|
||||
# docstring for rationale. Anthropic caps requests at 4 ``cache_control``
|
||||
# blocks; we use 2 here, leaving headroom for Phase-2 tool caching.
|
||||
#
|
||||
# IMPORTANT: ``index: 0`` (not ``role: system``). The deepagent stack's
|
||||
# ``before_agent`` middlewares (priority, tree, memory, anonymous-doc)
|
||||
# insert ``SystemMessage`` instances into ``state["messages"]`` that
|
||||
# accumulate across turns. With ``role: system`` the LiteLLM hook would
|
||||
# tag *every* one of them with ``cache_control`` and overflow Anthropic's
|
||||
# 4-block limit. ``index: 0`` always targets the langchain-prepended
|
||||
# ``request.system_message``, giving us exactly one stable cache breakpoint.
|
||||
# Head-of-request + latest message (see module docstring for the index:0 vs
|
||||
# role:system rationale and Anthropic's 4-block cap).
|
||||
_DEFAULT_INJECTION_POINTS: tuple[dict[str, Any], ...] = (
|
||||
{"location": "message", "index": 0},
|
||||
{"location": "message", "index": -1},
|
||||
)
|
||||
|
||||
# Providers (uppercase ``AgentConfig.provider`` values) that accept the
|
||||
# OpenAI ``prompt_cache_key`` routing hint. Microsoft's Azure OpenAI docs
|
||||
# (2026-05-13) confirm automatic prompt caching applies to every GPT-4o
|
||||
# or newer Azure deployment at ≥1024 tokens with no configuration needed,
|
||||
# and that ``prompt_cache_key`` is combined with the prefix hash to
|
||||
# improve routing affinity and therefore cache hit rate. LiteLLM's Azure
|
||||
# transformer ships ``prompt_cache_key`` in its supported params as of
|
||||
# https://github.com/BerriAI/litellm/pull/20989.
|
||||
#
|
||||
# Strict whitelist — many other providers in ``PROVIDER_MAP`` route
|
||||
# through litellm's ``openai`` prefix without implementing the OpenAI
|
||||
# prompt-cache surface (e.g. MOONSHOT, ZHIPU, MINIMAX), so we can't infer
|
||||
# family from the litellm prefix alone.
|
||||
# Providers that accept the OpenAI ``prompt_cache_key`` routing hint. Strict
|
||||
# whitelist: many providers route through litellm's ``openai`` prefix without
|
||||
# the prompt-cache surface, so the prefix alone isn't enough to infer family.
|
||||
_PROMPT_CACHE_KEY_PROVIDERS: frozenset[str] = frozenset(
|
||||
{"OPENAI", "DEEPSEEK", "XAI", "AZURE", "AZURE_OPENAI"}
|
||||
)
|
||||
|
||||
# Subset of ``_PROMPT_CACHE_KEY_PROVIDERS`` that also accept
|
||||
# ``prompt_cache_retention="24h"``. Azure is excluded: see module
|
||||
# docstring — LiteLLM 1.83.14's Azure transformer omits the param so
|
||||
# ``drop_params`` silently strips it. Re-add Azure once a future LiteLLM
|
||||
# release wires it into ``AzureOpenAIConfig.get_supported_openai_params``.
|
||||
# Subset that also accepts ``prompt_cache_retention="24h"``. Azure is excluded
|
||||
# because LiteLLM's Azure transformer omits the param (drop_params strips it).
|
||||
_PROMPT_CACHE_RETENTION_PROVIDERS: frozenset[str] = frozenset(
|
||||
{"OPENAI", "DEEPSEEK", "XAI"}
|
||||
)
|
||||
|
||||
|
||||
def _is_router_llm(llm: BaseChatModel) -> bool:
|
||||
"""Detect ``ChatLiteLLMRouter`` (auto-mode) without an eager import.
|
||||
|
||||
Importing ``app.services.llm_router_service`` at module-load time would
|
||||
create a cycle via ``llm_config -> prompt_caching -> llm_router_service``.
|
||||
Class-name comparison is sufficient since the class is defined in a
|
||||
single place.
|
||||
"""
|
||||
"""Detect ``ChatLiteLLMRouter`` by class name to avoid an import cycle."""
|
||||
return type(llm).__name__ == "ChatLiteLLMRouter"
|
||||
|
||||
|
||||
|
|
@ -188,21 +125,10 @@ def apply_litellm_prompt_caching(
|
|||
) -> None:
|
||||
"""Configure LiteLLM prompt caching on a ChatLiteLLM/ChatLiteLLMRouter.
|
||||
|
||||
Idempotent — values already present in ``llm.model_kwargs`` (e.g. from
|
||||
``agent_config.litellm_params`` overrides) are preserved. Mutates
|
||||
``llm.model_kwargs`` in place; the kwargs flow to ``litellm.completion``
|
||||
via ``ChatLiteLLM._default_params`` and via ``self.model_kwargs`` merge
|
||||
in our custom ``ChatLiteLLMRouter``.
|
||||
|
||||
Args:
|
||||
llm: ChatLiteLLM, SanitizedChatLiteLLM, or ChatLiteLLMRouter instance.
|
||||
agent_config: Optional ``AgentConfig`` driving provider-specific
|
||||
behaviour. When omitted (or auto-mode), only the universal
|
||||
``cache_control_injection_points`` are set.
|
||||
thread_id: Optional thread id used to construct a per-thread
|
||||
``prompt_cache_key`` for OpenAI-family providers. Caching still
|
||||
works without it (server-side automatic), but the key improves
|
||||
backend routing affinity and therefore hit rate.
|
||||
Idempotent (existing ``model_kwargs`` values are preserved) and mutates
|
||||
``llm.model_kwargs`` in place. Without ``agent_config`` (or in auto-mode)
|
||||
only the universal injection points are set; ``thread_id`` adds a per-thread
|
||||
``prompt_cache_key`` for OpenAI-family providers to improve routing affinity.
|
||||
"""
|
||||
model_kwargs = _get_or_init_model_kwargs(llm)
|
||||
if model_kwargs is None:
|
||||
|
|
@ -217,11 +143,8 @@ def apply_litellm_prompt_caching(
|
|||
dict(point) for point in _DEFAULT_INJECTION_POINTS
|
||||
]
|
||||
|
||||
# OpenAI-style extras only when we statically know the destination
|
||||
# accepts them. Auto-mode router fans out across mixed providers so
|
||||
# we can't safely set destination-specific kwargs there (drop_params
|
||||
# would strip them but it's wasteful to set them in the first
|
||||
# place).
|
||||
# OpenAI-style extras only when the destination is statically known. The
|
||||
# auto-mode router fans out across mixed providers, so skip them there.
|
||||
if _is_router_llm(llm):
|
||||
return
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue