From 946f8a8c5dec1fd211cae2b114bf2849090510d7 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 4 Jun 2026 12:41:52 +0200 Subject: [PATCH] refactor(agents): move llm_config + prompt_caching to app/agents/shared (slice 4b) Relocate the mutually-dependent LLM config layer and the LiteLLM prompt-caching helper to the shared kernel as one unit, rewiring their internal cross-reference to the shared paths. Flip 21 non-frozen importers. Re-export shims remain at new_chat/{llm_config,prompt_caching}.py for the frozen single-agent stack (chat_deepagent); they will be removed when that stack is retired. --- .../main_agent/runtime/factory.py | 4 +- .../app/agents/new_chat/llm_config.py | 643 +----------------- .../new_chat/middleware/flatten_system.py | 2 +- .../app/agents/new_chat/prompt_caching.py | 242 +------ .../app/agents/shared/llm_config.py | 622 +++++++++++++++++ .../app/agents/shared/prompt_caching.py | 241 +++++++ .../app/automations/services/model_policy.py | 2 +- .../app/routes/anonymous_chat_routes.py | 2 +- surfsense_backend/app/services/llm_service.py | 12 +- .../app/services/provider_capabilities.py | 2 +- .../app/tasks/chat/stream_new_chat.py | 2 +- .../app/tasks/chat/streaming/agent/builder.py | 2 +- .../flows/new_chat/llm_capability.py | 2 +- .../streaming/flows/new_chat/title_gen.py | 2 +- .../chat/streaming/flows/shared/llm_bundle.py | 2 +- .../streaming/flows/shared/premium_quota.py | 2 +- surfsense_backend/tests/e2e/run_backend.py | 4 +- surfsense_backend/tests/e2e/run_celery.py | 4 +- .../agents/new_chat/test_prompt_caching.py | 6 +- .../test_resolve_prompt_model_name.py | 2 +- .../automations/services/test_model_policy.py | 2 +- .../services/test_supports_image_input.py | 6 +- .../test_vision_llm_api_base_defense.py | 2 +- 23 files changed, 928 insertions(+), 882 deletions(-) create mode 100644 surfsense_backend/app/agents/shared/llm_config.py create mode 100644 surfsense_backend/app/agents/shared/prompt_caching.py diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/factory.py b/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/factory.py index 63c2fc799..8ec1235b7 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/factory.py +++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/runtime/factory.py @@ -25,8 +25,8 @@ from app.agents.new_chat.connector_searchable_types import ( from app.agents.shared.feature_flags import AgentFeatureFlags, get_flags from app.agents.new_chat.filesystem_backends import build_backend_resolver from app.agents.shared.filesystem_selection import FilesystemMode, FilesystemSelection -from app.agents.new_chat.llm_config import AgentConfig -from app.agents.new_chat.prompt_caching import apply_litellm_prompt_caching +from app.agents.shared.llm_config import AgentConfig +from app.agents.shared.prompt_caching import apply_litellm_prompt_caching from app.agents.new_chat.tools.invalid_tool import INVALID_TOOL_NAME, invalid_tool from app.agents.new_chat.tools.registry import build_tools_async from app.db import ChatVisibility diff --git a/surfsense_backend/app/agents/new_chat/llm_config.py b/surfsense_backend/app/agents/new_chat/llm_config.py index bc37bf1c4..28bca9360 100644 --- a/surfsense_backend/app/agents/new_chat/llm_config.py +++ b/surfsense_backend/app/agents/new_chat/llm_config.py @@ -1,622 +1,33 @@ -""" -LLM configuration utilities for SurfSense agents. +"""Backward-compatible shim. -This module provides functions for loading LLM configurations from: -1. Auto mode (ID 0) - Uses LiteLLM Router for load balancing -2. YAML files (global configs with negative IDs) -3. Database NewLLMConfig table (user-created configs with positive IDs) - -It also provides utilities for creating ChatLiteLLM instances and -managing prompt configurations. +The LLM configuration layer now lives in the shared agent kernel at +``app.agents.shared.llm_config``. This module re-exports it so frozen +single-agent code (``chat_deepagent``) keeps working until that stack is +retired. """ -from collections.abc import AsyncIterator -from dataclasses import dataclass -from pathlib import Path -from typing import Any +from __future__ import annotations -import yaml -from langchain_core.callbacks import ( - AsyncCallbackManagerForLLMRun, - CallbackManagerForLLMRun, -) -from langchain_core.messages import AIMessage, BaseMessage -from langchain_core.outputs import ChatGenerationChunk, ChatResult -from langchain_litellm import ChatLiteLLM -from litellm import get_model_info -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from app.agents.new_chat.prompt_caching import apply_litellm_prompt_caching -from app.services.llm_router_service import ( - AUTO_MODE_ID, - ChatLiteLLMRouter, - LLMRouterService, - _sanitize_content, - get_auto_mode_llm, - is_auto_mode, +from app.agents.shared.llm_config import ( + AgentConfig, + SanitizedChatLiteLLM, + create_chat_litellm_from_agent_config, + create_chat_litellm_from_config, + load_agent_config, + load_agent_llm_config_for_search_space, + load_global_llm_config_by_id, + load_llm_config_from_yaml, + load_new_llm_config_from_db, ) - -def _sanitize_messages(messages: list[BaseMessage]) -> list[BaseMessage]: - """Sanitize content on every message so it is safe for any provider. - - Handles three cross-provider incompatibilities: - - List content with provider-specific blocks (e.g. ``thinking``) - - List content with bare strings or empty text blocks - - AI messages with empty content + tool calls: some providers (Bedrock) - convert ``""`` to ``[{"type":"text","text":""}]`` server-side then - reject the blank text. The OpenAI spec says ``content`` should be - ``null`` when an assistant message only carries tool calls. - """ - for msg in messages: - if isinstance(msg.content, list): - msg.content = _sanitize_content(msg.content) - if ( - isinstance(msg, AIMessage) - and (not msg.content or msg.content == "") - and getattr(msg, "tool_calls", None) - ): - msg.content = None # type: ignore[assignment] - return messages - - -class SanitizedChatLiteLLM(ChatLiteLLM): - """ChatLiteLLM subclass that strips provider-specific content blocks - (e.g. ``thinking`` from reasoning models) and normalises bare strings - in content arrays before forwarding to the underlying provider.""" - - def _generate( - self, - messages: list[BaseMessage], - stop: list[str] | None = None, - run_manager: CallbackManagerForLLMRun | None = None, - **kwargs: Any, - ) -> ChatResult: - return super()._generate( - _sanitize_messages(messages), stop, run_manager, **kwargs - ) - - async def _astream( - self, - messages: list[BaseMessage], - stop: list[str] | None = None, - run_manager: AsyncCallbackManagerForLLMRun | None = None, - **kwargs: Any, - ) -> AsyncIterator[ChatGenerationChunk]: - async for chunk in super()._astream( - _sanitize_messages(messages), stop, run_manager, **kwargs - ): - yield chunk - - -# Provider mapping for LiteLLM model string construction. -# -# Single source of truth lives in -# :mod:`app.services.provider_capabilities` so the YAML loader (which -# runs during ``app.config`` class-body init) can resolve provider -# prefixes without dragging the agent / tools tree into module load -# order. Re-exported here under the historical ``PROVIDER_MAP`` name -# so existing callers (``llm_router_service``, ``image_gen_router_service``, -# tests) keep working unchanged. -from app.services.provider_capabilities import ( # noqa: E402 - _PROVIDER_PREFIX_MAP as PROVIDER_MAP, -) - - -def _attach_model_profile(llm: ChatLiteLLM, model_string: str) -> None: - """Attach a ``profile`` dict to ChatLiteLLM with model context metadata.""" - try: - info = get_model_info(model_string) - max_input_tokens = info.get("max_input_tokens") - if isinstance(max_input_tokens, int) and max_input_tokens > 0: - llm.profile = { - "max_input_tokens": max_input_tokens, - "max_input_tokens_upper": max_input_tokens, - "token_count_model": model_string, - "token_count_models": [model_string], - } - except Exception: - return - - -@dataclass -class AgentConfig: - """ - Complete configuration for the SurfSense agent. - - This combines LLM settings with prompt configuration from NewLLMConfig. - Supports Auto mode (ID 0) which uses LiteLLM Router for load balancing. - """ - - # LLM Model Settings - provider: str - model_name: str - api_key: str - api_base: str | None = None - custom_provider: str | None = None - litellm_params: dict | None = None - - # Prompt Configuration - system_instructions: str | None = None - use_default_system_instructions: bool = True - citations_enabled: bool = True - - # Metadata - config_id: int | None = None - config_name: str | None = None - - # Auto mode flag - is_auto_mode: bool = False - - # Token quota and policy - billing_tier: str = "free" - is_premium: bool = False - anonymous_enabled: bool = False - quota_reserve_tokens: int | None = None - - # Capability flag: best-effort True for the chat selector / catalog. - # Resolved via :func:`provider_capabilities.derive_supports_image_input` - # which prefers OpenRouter's ``architecture.input_modalities`` and - # otherwise consults LiteLLM's authoritative model map. Default True - # is the conservative-allow stance — the streaming-task safety net - # (``is_known_text_only_chat_model``) is the *only* place a False - # actually blocks a request. Setting this to False here without an - # authoritative source would silently hide vision-capable models - # (the regression we're fixing). - supports_image_input: bool = True - - @classmethod - def from_auto_mode(cls) -> "AgentConfig": - """ - Create an AgentConfig for Auto mode (LiteLLM Router load balancing). - - Returns: - AgentConfig instance configured for Auto mode - """ - return cls( - provider="AUTO", - model_name="auto", - api_key="", # Not needed for router - api_base=None, - custom_provider=None, - litellm_params=None, - system_instructions=None, - use_default_system_instructions=True, - citations_enabled=True, - config_id=AUTO_MODE_ID, - config_name="Auto (Fastest)", - is_auto_mode=True, - billing_tier="free", - is_premium=False, - anonymous_enabled=False, - quota_reserve_tokens=None, - # Auto routes across the configured pool, which usually - # contains at least one vision-capable deployment; the router - # will surface a 404 from a non-vision deployment as a normal - # ``allowed_fails`` event and fail over rather than blocking - # the request outright. - supports_image_input=True, - ) - - @classmethod - def from_new_llm_config(cls, config) -> "AgentConfig": - """ - Create an AgentConfig from a NewLLMConfig database model. - - Args: - config: NewLLMConfig database model instance - - Returns: - AgentConfig instance - """ - # Lazy import to avoid pulling provider_capabilities (and its - # transitive litellm import) into module-init order. - from app.services.provider_capabilities import derive_supports_image_input - - provider_value = ( - config.provider.value - if hasattr(config.provider, "value") - else str(config.provider) - ) - litellm_params = config.litellm_params or {} - base_model = ( - litellm_params.get("base_model") - if isinstance(litellm_params, dict) - else None - ) - - return cls( - provider=provider_value, - model_name=config.model_name, - api_key=config.api_key, - api_base=config.api_base, - custom_provider=config.custom_provider, - litellm_params=config.litellm_params, - system_instructions=config.system_instructions, - use_default_system_instructions=config.use_default_system_instructions, - citations_enabled=config.citations_enabled, - config_id=config.id, - config_name=config.name, - is_auto_mode=False, - billing_tier="free", - is_premium=False, - anonymous_enabled=False, - quota_reserve_tokens=None, - # BYOK rows have no operator-curated capability flag, so we - # ask LiteLLM (default-allow on unknown). The streaming - # safety net still blocks if the model is *explicitly* - # marked text-only. - supports_image_input=derive_supports_image_input( - provider=provider_value, - model_name=config.model_name, - base_model=base_model, - custom_provider=config.custom_provider, - ), - ) - - @classmethod - def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig": - """ - Create an AgentConfig from a YAML configuration dictionary. - - YAML configs now support the same prompt configuration fields as NewLLMConfig: - - system_instructions: Custom system instructions (empty string uses defaults) - - use_default_system_instructions: Whether to use default instructions - - citations_enabled: Whether citations are enabled - - Args: - yaml_config: Configuration dictionary from YAML file - - Returns: - AgentConfig instance - """ - # Lazy import to avoid pulling provider_capabilities (and its - # transitive litellm import) into module-init order. - from app.services.provider_capabilities import derive_supports_image_input - - # Get system instructions from YAML, default to empty string - system_instructions = yaml_config.get("system_instructions", "") - - provider = yaml_config.get("provider", "").upper() - model_name = yaml_config.get("model_name", "") - custom_provider = yaml_config.get("custom_provider") - litellm_params = yaml_config.get("litellm_params") or {} - base_model = ( - litellm_params.get("base_model") - if isinstance(litellm_params, dict) - else None - ) - - # Explicit YAML override wins; otherwise derive from LiteLLM / - # OpenRouter modalities. The YAML loader already populates this - # field, but this method is also called from - # ``load_global_llm_config_by_id``'s file fallback (hot reload), - # so we re-derive here for safety. The bool() coercion preserves - # the loader's behaviour for explicit ``true`` / ``false`` - # strings that PyYAML may surface. - if "supports_image_input" in yaml_config: - supports_image_input = bool(yaml_config.get("supports_image_input")) - else: - supports_image_input = derive_supports_image_input( - provider=provider, - model_name=model_name, - base_model=base_model, - custom_provider=custom_provider, - ) - - return cls( - provider=provider, - model_name=model_name, - api_key=yaml_config.get("api_key", ""), - api_base=yaml_config.get("api_base"), - custom_provider=custom_provider, - litellm_params=yaml_config.get("litellm_params"), - # Prompt configuration from YAML (with defaults for backwards compatibility) - system_instructions=system_instructions if system_instructions else None, - use_default_system_instructions=yaml_config.get( - "use_default_system_instructions", True - ), - citations_enabled=yaml_config.get("citations_enabled", True), - config_id=yaml_config.get("id"), - config_name=yaml_config.get("name"), - is_auto_mode=False, - billing_tier=yaml_config.get("billing_tier", "free"), - is_premium=yaml_config.get("billing_tier", "free") == "premium", - anonymous_enabled=yaml_config.get("anonymous_enabled", False), - quota_reserve_tokens=yaml_config.get("quota_reserve_tokens"), - supports_image_input=supports_image_input, - ) - - -def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None: - """ - Load a specific LLM config from global_llm_config.yaml. - - Args: - llm_config_id: The id of the config to load (default: -1) - - Returns: - LLM config dict or None if not found - """ - # Get the config file path - base_dir = Path(__file__).resolve().parent.parent.parent.parent - config_file = base_dir / "app" / "config" / "global_llm_config.yaml" - - # Fallback to example file if main config doesn't exist - if not config_file.exists(): - config_file = base_dir / "app" / "config" / "global_llm_config.example.yaml" - if not config_file.exists(): - print("Error: No global_llm_config.yaml or example file found") - return None - - try: - with open(config_file, encoding="utf-8") as f: - data = yaml.safe_load(f) - configs = data.get("global_llm_configs", []) - for cfg in configs: - if isinstance(cfg, dict) and cfg.get("id") == llm_config_id: - return cfg - - print(f"Error: Global LLM config id {llm_config_id} not found") - return None - except Exception as e: - print(f"Error loading config: {e}") - return None - - -def load_global_llm_config_by_id(llm_config_id: int) -> dict | None: - """ - Load a global LLM config by ID, checking in-memory configs first. - - This handles both static YAML configs and dynamically injected configs - (e.g. OpenRouter integration models that only exist in memory). - - Args: - llm_config_id: The negative ID of the global config to load - - Returns: - LLM config dict or None if not found - """ - from app.config import config as app_config - - for cfg in app_config.GLOBAL_LLM_CONFIGS: - if cfg.get("id") == llm_config_id: - return cfg - # Fallback to YAML file read (covers edge cases like hot-reload) - return load_llm_config_from_yaml(llm_config_id) - - -async def load_new_llm_config_from_db( - session: AsyncSession, - config_id: int, -) -> "AgentConfig | None": - """ - Load a NewLLMConfig from the database by ID. - - Args: - session: AsyncSession for database access - config_id: The ID of the NewLLMConfig to load - - Returns: - AgentConfig instance or None if not found - """ - # Import here to avoid circular imports - from app.db import NewLLMConfig - - try: - result = await session.execute( - select(NewLLMConfig).filter(NewLLMConfig.id == config_id) - ) - config = result.scalars().first() - - if not config: - print(f"Error: NewLLMConfig with id {config_id} not found") - return None - - return AgentConfig.from_new_llm_config(config) - except Exception as e: - print(f"Error loading NewLLMConfig from database: {e}") - return None - - -async def load_agent_llm_config_for_search_space( - session: AsyncSession, - search_space_id: int, -) -> "AgentConfig | None": - """ - Load the agent LLM configuration for a search space. - - This loads the LLM config based on the search space's agent_llm_id setting: - - Positive ID: Load from NewLLMConfig database table - - Negative ID: Load from YAML global configs - - None: Falls back to first global config (id=-1) - - Args: - session: AsyncSession for database access - search_space_id: The search space ID - - Returns: - AgentConfig instance or None if not found - """ - # Import here to avoid circular imports - from app.db import SearchSpace - - try: - # Get the search space to check its agent_llm_id preference - result = await session.execute( - select(SearchSpace).filter(SearchSpace.id == search_space_id) - ) - search_space = result.scalars().first() - - if not search_space: - print(f"Error: SearchSpace with id {search_space_id} not found") - return None - - # Use agent_llm_id from search space, fallback to -1 (first global config) - config_id = ( - search_space.agent_llm_id if search_space.agent_llm_id is not None else -1 - ) - - # Load the config using the unified loader - return await load_agent_config(session, config_id, search_space_id) - except Exception as e: - print(f"Error loading agent LLM config for search space {search_space_id}: {e}") - return None - - -async def load_agent_config( - session: AsyncSession, - config_id: int, - search_space_id: int | None = None, -) -> "AgentConfig | None": - """ - Load an agent configuration, supporting Auto mode, YAML, and database configs. - - This is the main entry point for loading configurations: - - ID 0: Auto mode (uses LiteLLM Router for load balancing) - - Negative IDs: Load from YAML file (global configs) - - Positive IDs: Load from NewLLMConfig database table - - Args: - session: AsyncSession for database access - config_id: The config ID (0 for Auto, negative for YAML, positive for database) - search_space_id: Optional search space ID for context - - Returns: - AgentConfig instance or None if not found - """ - # Auto mode (ID 0) - use LiteLLM Router - if is_auto_mode(config_id): - if not LLMRouterService.is_initialized(): - print("Error: Auto mode requested but LLM Router not initialized") - return None - return AgentConfig.from_auto_mode() - - if config_id < 0: - # Check in-memory configs first (includes static YAML + dynamic OpenRouter) - from app.config import config as app_config - - for cfg in app_config.GLOBAL_LLM_CONFIGS: - if cfg.get("id") == config_id: - return AgentConfig.from_yaml_config(cfg) - # Fallback to YAML file read for safety - yaml_config = load_llm_config_from_yaml(config_id) - if yaml_config: - return AgentConfig.from_yaml_config(yaml_config) - return None - else: - # Load from database (NewLLMConfig) - return await load_new_llm_config_from_db(session, config_id) - - -def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None: - """ - Create a ChatLiteLLM instance from a global LLM config dictionary. - - Args: - llm_config: LLM configuration dictionary from YAML - - Returns: - ChatLiteLLM instance or None on error - """ - # Build the model string - if llm_config.get("custom_provider"): - model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}" - else: - provider = llm_config.get("provider", "").upper() - provider_prefix = PROVIDER_MAP.get(provider, provider.lower()) - model_string = f"{provider_prefix}/{llm_config['model_name']}" - - # Create ChatLiteLLM instance with streaming enabled - litellm_kwargs = { - "model": model_string, - "api_key": llm_config.get("api_key"), - "streaming": True, # Enable streaming for real-time token streaming - } - - # Add optional parameters - if llm_config.get("api_base"): - litellm_kwargs["api_base"] = llm_config["api_base"] - - # Add any additional litellm parameters - if llm_config.get("litellm_params"): - litellm_kwargs.update(llm_config["litellm_params"]) - - llm = SanitizedChatLiteLLM(**litellm_kwargs) - _attach_model_profile(llm, model_string) - # Configure LiteLLM-native prompt caching (cache_control_injection_points - # for Anthropic/Bedrock/Vertex/Gemini/Azure-AI/OpenRouter/Databricks/etc.). - # ``agent_config=None`` here — the YAML path doesn't have provider intent - # in a structured form, so we set only the universal injection points. - apply_litellm_prompt_caching(llm) - return llm - - -def create_chat_litellm_from_agent_config( - agent_config: AgentConfig, -) -> ChatLiteLLM | ChatLiteLLMRouter | None: - """ - Create a ChatLiteLLM or ChatLiteLLMRouter instance from an AgentConfig. - - For Auto mode configs, returns a ChatLiteLLMRouter that uses LiteLLM Router - for automatic load balancing across available providers. - - Args: - agent_config: AgentConfig instance - - Returns: - ChatLiteLLM or ChatLiteLLMRouter instance, or None on error - """ - # Handle Auto mode - return ChatLiteLLMRouter - if agent_config.is_auto_mode: - if not LLMRouterService.is_initialized(): - print("Error: Auto mode requested but LLM Router not initialized") - return None - try: - router_llm = get_auto_mode_llm() - if router_llm is not None: - # Universal cache_control_injection_points only — auto-mode - # fans out across providers, so OpenAI-only kwargs (e.g. - # ``prompt_cache_key``) are left off here. ``drop_params`` - # would strip them at the provider boundary anyway, but - # there's no point setting them when we don't know the - # destination. - apply_litellm_prompt_caching(router_llm, agent_config=agent_config) - return router_llm - except Exception as e: - print(f"Error creating ChatLiteLLMRouter: {e}") - return None - - # Build the model string - if agent_config.custom_provider: - model_string = f"{agent_config.custom_provider}/{agent_config.model_name}" - else: - provider_prefix = PROVIDER_MAP.get( - agent_config.provider, agent_config.provider.lower() - ) - model_string = f"{provider_prefix}/{agent_config.model_name}" - - # Create ChatLiteLLM instance with streaming enabled - litellm_kwargs = { - "model": model_string, - "api_key": agent_config.api_key, - "streaming": True, # Enable streaming for real-time token streaming - } - - # Add optional parameters - if agent_config.api_base: - litellm_kwargs["api_base"] = agent_config.api_base - - # Add any additional litellm parameters - if agent_config.litellm_params: - litellm_kwargs.update(agent_config.litellm_params) - - llm = SanitizedChatLiteLLM(**litellm_kwargs) - _attach_model_profile(llm, model_string) - # Build-time prompt caching: sets ``cache_control_injection_points`` for - # all providers and (for OpenAI/DeepSeek/xAI) ``prompt_cache_retention``. - # Per-thread ``prompt_cache_key`` is layered on later in - # ``create_surfsense_deep_agent`` once ``thread_id`` is known. - apply_litellm_prompt_caching(llm, agent_config=agent_config) - return llm +__all__ = [ + "AgentConfig", + "SanitizedChatLiteLLM", + "create_chat_litellm_from_agent_config", + "create_chat_litellm_from_config", + "load_agent_config", + "load_agent_llm_config_for_search_space", + "load_global_llm_config_by_id", + "load_llm_config_from_yaml", + "load_new_llm_config_from_db", +] diff --git a/surfsense_backend/app/agents/new_chat/middleware/flatten_system.py b/surfsense_backend/app/agents/new_chat/middleware/flatten_system.py index 29cd57aa0..49d51a043 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/flatten_system.py +++ b/surfsense_backend/app/agents/new_chat/middleware/flatten_system.py @@ -28,7 +28,7 @@ https://github.com/BerriAI/litellm/issues/20485 — the litellm-side fix in PR #15395 covers the litellm transformer but does not protect us when the OpenRouter SaaS itself does the redistribution.) -A separate fix in :mod:`app.agents.new_chat.prompt_caching` (switching +A separate fix in :mod:`app.agents.shared.prompt_caching` (switching the first injection point from ``role: system`` to ``index: 0``) neutralises the *primary* cause of the same 400 — multiple ``SystemMessage``\ s injected by ``before_agent`` middlewares diff --git a/surfsense_backend/app/agents/new_chat/prompt_caching.py b/surfsense_backend/app/agents/new_chat/prompt_caching.py index b58a48266..bfe421db0 100644 --- a/surfsense_backend/app/agents/new_chat/prompt_caching.py +++ b/surfsense_backend/app/agents/new_chat/prompt_caching.py @@ -1,241 +1,13 @@ -r"""LiteLLM-native prompt caching configuration for SurfSense agents. +"""Backward-compatible shim. -Replaces the legacy ``AnthropicPromptCachingMiddleware`` (which never -activated for our LiteLLM-based stack — its ``isinstance(model, ChatAnthropic)`` -gate always failed) with LiteLLM's universal caching mechanism. - -Coverage: - -- Marker-based providers (need ``cache_control`` injection, which LiteLLM - performs automatically when ``cache_control_injection_points`` is set): - ``anthropic/``, ``bedrock/``, ``vertex_ai/``, ``gemini/``, ``azure_ai/``, - ``openrouter/`` (Claude/Gemini/MiniMax/GLM/z-ai routes), ``databricks/`` - (Claude), ``dashscope/`` (Qwen), ``minimax/``, ``zai/`` (GLM). -- Auto-cached (LiteLLM strips the marker silently): ``openai/``, - ``deepseek/``, ``xai/`` — these caches automatically for prompts ≥1024 - tokens and surface ``prompt_cache_key`` / ``prompt_cache_retention``. - -We inject **two** breakpoints per request: - -- ``index: 0`` — pins the SurfSense system prompt at the head of the - request (provider variant, citation rules, tool catalog, KB tree, - skills metadata). The langchain agent factory always prepends - ``request.system_message`` at index 0 (see ``factory.py`` - ``_execute_model_async``), so this targets exactly the main system - prompt regardless of how many other ``SystemMessage``\ s the - ``before_agent`` injectors (priority, tree, memory, file-intent, - anonymous-doc) have inserted into ``state["messages"]``. Using - ``role: system`` here would apply ``cache_control`` to **every** - system-role message and trip Anthropic's hard cap of 4 cache - breakpoints per request once the conversation accumulates enough - injected system messages — which surfaces as the upstream 400 - ``A maximum of 4 blocks with cache_control may be provided. Found N`` - via OpenRouter→Anthropic. -- ``index: -1`` — pins the latest message so multi-turn savings compound: - Anthropic-family providers use longest-matching-prefix lookup, so turn - N+1 still reads turn N's cache up to the shared prefix. - -For OpenAI-family configs we additionally pass: - -- ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` — routing hint that - raises hit rate by sending requests with a shared prefix to the same - backend. Supported by ``openai/``, ``deepseek/``, ``xai/``, and - ``azure/`` (added to LiteLLM's Azure transformer in - https://github.com/BerriAI/litellm/pull/20989, Feb 2026; verified - against ``AzureOpenAIConfig.get_supported_openai_params`` in our - installed litellm 1.83.14 for ``azure/gpt-4o``, ``azure/gpt-4o-mini``, - ``azure/gpt-5.4``, ``azure/gpt-5.4-mini``). -- ``prompt_cache_retention="24h"`` — extends cache TTL beyond the default - 5-10 min in-memory cache. Set ONLY for OpenAI/DeepSeek/xAI: Azure's - server-side support landed in Microsoft's docs on 2026-05-13 but - LiteLLM 1.83.14's Azure transformer still omits it from its supported - params list, so it gets silently dropped by ``litellm.drop_params``. - Azure's default in-memory retention (5-10 min, max 1 h) already - bridges intra-conversation turns; revisit when LiteLLM bumps Azure. - -Safety net: ``litellm.drop_params=True`` is set globally in -``app.services.llm_service`` at module-load time. Any kwarg the destination -provider doesn't recognise is auto-stripped at the provider transformer -layer, so an OpenAI→Bedrock auto-mode fallback can't 400 on -``prompt_cache_key`` etc. +The LiteLLM prompt-caching helper now lives in the shared agent kernel at +``app.agents.shared.prompt_caching``. This module re-exports it so frozen +single-agent code (``chat_deepagent``) keeps working until that stack is +retired. """ from __future__ import annotations -import logging -from typing import TYPE_CHECKING, Any +from app.agents.shared.prompt_caching import apply_litellm_prompt_caching -from langchain_core.language_models import BaseChatModel - -if TYPE_CHECKING: - from app.agents.new_chat.llm_config import AgentConfig - -logger = logging.getLogger(__name__) - - -# Two-breakpoint policy: head-of-request + latest message. See module -# docstring for rationale. Anthropic caps requests at 4 ``cache_control`` -# blocks; we use 2 here, leaving headroom for Phase-2 tool caching. -# -# IMPORTANT: ``index: 0`` (not ``role: system``). The deepagent stack's -# ``before_agent`` middlewares (priority, tree, memory, file-intent, -# anonymous-doc) insert ``SystemMessage`` instances into -# ``state["messages"]`` that accumulate across turns. With -# ``role: system`` the LiteLLM hook would tag *every* one of them with -# ``cache_control`` and overflow Anthropic's 4-block limit. ``index: 0`` -# always targets the langchain-prepended ``request.system_message`` -# (which our ``FlattenSystemMessageMiddleware`` reduces to a single text -# block), giving us exactly one stable cache breakpoint. -_DEFAULT_INJECTION_POINTS: tuple[dict[str, Any], ...] = ( - {"location": "message", "index": 0}, - {"location": "message", "index": -1}, -) - -# Providers (uppercase ``AgentConfig.provider`` values) that accept the -# OpenAI ``prompt_cache_key`` routing hint. Microsoft's Azure OpenAI docs -# (2026-05-13) confirm automatic prompt caching applies to every GPT-4o -# or newer Azure deployment at ≥1024 tokens with no configuration needed, -# and that ``prompt_cache_key`` is combined with the prefix hash to -# improve routing affinity and therefore cache hit rate. LiteLLM's Azure -# transformer ships ``prompt_cache_key`` in its supported params as of -# https://github.com/BerriAI/litellm/pull/20989. -# -# Strict whitelist — many other providers in ``PROVIDER_MAP`` route -# through litellm's ``openai`` prefix without implementing the OpenAI -# prompt-cache surface (e.g. MOONSHOT, ZHIPU, MINIMAX), so we can't infer -# family from the litellm prefix alone. -_PROMPT_CACHE_KEY_PROVIDERS: frozenset[str] = frozenset( - {"OPENAI", "DEEPSEEK", "XAI", "AZURE", "AZURE_OPENAI"} -) - -# Subset of ``_PROMPT_CACHE_KEY_PROVIDERS`` that also accept -# ``prompt_cache_retention="24h"``. Azure is excluded: see module -# docstring — LiteLLM 1.83.14's Azure transformer omits the param so -# ``drop_params`` silently strips it. Re-add Azure once a future LiteLLM -# release wires it into ``AzureOpenAIConfig.get_supported_openai_params``. -_PROMPT_CACHE_RETENTION_PROVIDERS: frozenset[str] = frozenset( - {"OPENAI", "DEEPSEEK", "XAI"} -) - - -def _is_router_llm(llm: BaseChatModel) -> bool: - """Detect ``ChatLiteLLMRouter`` (auto-mode) without an eager import. - - Importing ``app.services.llm_router_service`` at module-load time would - create a cycle via ``llm_config -> prompt_caching -> llm_router_service``. - Class-name comparison is sufficient since the class is defined in a - single place. - """ - return type(llm).__name__ == "ChatLiteLLMRouter" - - -def _provider_supports_prompt_cache_key(agent_config: AgentConfig | None) -> bool: - """Whether the config targets a provider that accepts ``prompt_cache_key``. - - Strict — only returns True for explicitly chosen OPENAI, DEEPSEEK, - XAI, AZURE, or AZURE_OPENAI providers. Auto-mode and custom - providers return False because we can't statically know the - destination and the router fans out across mixed providers. - """ - if agent_config is None or not agent_config.provider: - return False - if agent_config.is_auto_mode: - return False - if agent_config.custom_provider: - return False - return agent_config.provider.upper() in _PROMPT_CACHE_KEY_PROVIDERS - - -def _provider_supports_prompt_cache_retention( - agent_config: AgentConfig | None, -) -> bool: - """Whether the config targets a provider that accepts ``prompt_cache_retention``. - - Tighter than :func:`_provider_supports_prompt_cache_key` — Azure - deployments are excluded until LiteLLM ships the param in its Azure - transformer (see module docstring). - """ - if agent_config is None or not agent_config.provider: - return False - if agent_config.is_auto_mode: - return False - if agent_config.custom_provider: - return False - return agent_config.provider.upper() in _PROMPT_CACHE_RETENTION_PROVIDERS - - -def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None: - """Return ``llm.model_kwargs`` as a writable dict, or ``None`` to bail. - - Initialises the field to ``{}`` when present-but-None on a Pydantic v2 - model. Returns ``None`` if the LLM type doesn't expose a writable - ``model_kwargs`` attribute (caller should treat as no-op). - """ - model_kwargs = getattr(llm, "model_kwargs", None) - if isinstance(model_kwargs, dict): - return model_kwargs - try: - llm.model_kwargs = {} # type: ignore[attr-defined] - except Exception: - return None - refreshed = getattr(llm, "model_kwargs", None) - return refreshed if isinstance(refreshed, dict) else None - - -def apply_litellm_prompt_caching( - llm: BaseChatModel, - *, - agent_config: AgentConfig | None = None, - thread_id: int | None = None, -) -> None: - """Configure LiteLLM prompt caching on a ChatLiteLLM/ChatLiteLLMRouter. - - Idempotent — values already present in ``llm.model_kwargs`` (e.g. from - ``agent_config.litellm_params`` overrides) are preserved. Mutates - ``llm.model_kwargs`` in place; the kwargs flow to ``litellm.completion`` - via ``ChatLiteLLM._default_params`` and via ``self.model_kwargs`` merge - in our custom ``ChatLiteLLMRouter``. - - Args: - llm: ChatLiteLLM, SanitizedChatLiteLLM, or ChatLiteLLMRouter instance. - agent_config: Optional ``AgentConfig`` driving provider-specific - behaviour. When omitted (or auto-mode), only the universal - ``cache_control_injection_points`` are set. - thread_id: Optional thread id used to construct a per-thread - ``prompt_cache_key`` for OpenAI-family providers. Caching still - works without it (server-side automatic), but the key improves - backend routing affinity and therefore hit rate. - """ - model_kwargs = _get_or_init_model_kwargs(llm) - if model_kwargs is None: - logger.debug( - "apply_litellm_prompt_caching: %s exposes no writable model_kwargs; skipping", - type(llm).__name__, - ) - return - - if "cache_control_injection_points" not in model_kwargs: - model_kwargs["cache_control_injection_points"] = [ - dict(point) for point in _DEFAULT_INJECTION_POINTS - ] - - # OpenAI-style extras only when we statically know the destination - # accepts them. Auto-mode router fans out across mixed providers so - # we can't safely set destination-specific kwargs there (drop_params - # would strip them but it's wasteful to set them in the first - # place). - if _is_router_llm(llm): - return - - if ( - thread_id is not None - and "prompt_cache_key" not in model_kwargs - and _provider_supports_prompt_cache_key(agent_config) - ): - model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}" - - if ( - "prompt_cache_retention" not in model_kwargs - and _provider_supports_prompt_cache_retention(agent_config) - ): - model_kwargs["prompt_cache_retention"] = "24h" +__all__ = ["apply_litellm_prompt_caching"] diff --git a/surfsense_backend/app/agents/shared/llm_config.py b/surfsense_backend/app/agents/shared/llm_config.py new file mode 100644 index 000000000..2e60129a9 --- /dev/null +++ b/surfsense_backend/app/agents/shared/llm_config.py @@ -0,0 +1,622 @@ +""" +LLM configuration utilities for SurfSense agents. + +This module provides functions for loading LLM configurations from: +1. Auto mode (ID 0) - Uses LiteLLM Router for load balancing +2. YAML files (global configs with negative IDs) +3. Database NewLLMConfig table (user-created configs with positive IDs) + +It also provides utilities for creating ChatLiteLLM instances and +managing prompt configurations. +""" + +from collections.abc import AsyncIterator +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import yaml +from langchain_core.callbacks import ( + AsyncCallbackManagerForLLMRun, + CallbackManagerForLLMRun, +) +from langchain_core.messages import AIMessage, BaseMessage +from langchain_core.outputs import ChatGenerationChunk, ChatResult +from langchain_litellm import ChatLiteLLM +from litellm import get_model_info +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.agents.shared.prompt_caching import apply_litellm_prompt_caching +from app.services.llm_router_service import ( + AUTO_MODE_ID, + ChatLiteLLMRouter, + LLMRouterService, + _sanitize_content, + get_auto_mode_llm, + is_auto_mode, +) + + +def _sanitize_messages(messages: list[BaseMessage]) -> list[BaseMessage]: + """Sanitize content on every message so it is safe for any provider. + + Handles three cross-provider incompatibilities: + - List content with provider-specific blocks (e.g. ``thinking``) + - List content with bare strings or empty text blocks + - AI messages with empty content + tool calls: some providers (Bedrock) + convert ``""`` to ``[{"type":"text","text":""}]`` server-side then + reject the blank text. The OpenAI spec says ``content`` should be + ``null`` when an assistant message only carries tool calls. + """ + for msg in messages: + if isinstance(msg.content, list): + msg.content = _sanitize_content(msg.content) + if ( + isinstance(msg, AIMessage) + and (not msg.content or msg.content == "") + and getattr(msg, "tool_calls", None) + ): + msg.content = None # type: ignore[assignment] + return messages + + +class SanitizedChatLiteLLM(ChatLiteLLM): + """ChatLiteLLM subclass that strips provider-specific content blocks + (e.g. ``thinking`` from reasoning models) and normalises bare strings + in content arrays before forwarding to the underlying provider.""" + + def _generate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: CallbackManagerForLLMRun | None = None, + **kwargs: Any, + ) -> ChatResult: + return super()._generate( + _sanitize_messages(messages), stop, run_manager, **kwargs + ) + + async def _astream( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: AsyncCallbackManagerForLLMRun | None = None, + **kwargs: Any, + ) -> AsyncIterator[ChatGenerationChunk]: + async for chunk in super()._astream( + _sanitize_messages(messages), stop, run_manager, **kwargs + ): + yield chunk + + +# Provider mapping for LiteLLM model string construction. +# +# Single source of truth lives in +# :mod:`app.services.provider_capabilities` so the YAML loader (which +# runs during ``app.config`` class-body init) can resolve provider +# prefixes without dragging the agent / tools tree into module load +# order. Re-exported here under the historical ``PROVIDER_MAP`` name +# so existing callers (``llm_router_service``, ``image_gen_router_service``, +# tests) keep working unchanged. +from app.services.provider_capabilities import ( # noqa: E402 + _PROVIDER_PREFIX_MAP as PROVIDER_MAP, +) + + +def _attach_model_profile(llm: ChatLiteLLM, model_string: str) -> None: + """Attach a ``profile`` dict to ChatLiteLLM with model context metadata.""" + try: + info = get_model_info(model_string) + max_input_tokens = info.get("max_input_tokens") + if isinstance(max_input_tokens, int) and max_input_tokens > 0: + llm.profile = { + "max_input_tokens": max_input_tokens, + "max_input_tokens_upper": max_input_tokens, + "token_count_model": model_string, + "token_count_models": [model_string], + } + except Exception: + return + + +@dataclass +class AgentConfig: + """ + Complete configuration for the SurfSense agent. + + This combines LLM settings with prompt configuration from NewLLMConfig. + Supports Auto mode (ID 0) which uses LiteLLM Router for load balancing. + """ + + # LLM Model Settings + provider: str + model_name: str + api_key: str + api_base: str | None = None + custom_provider: str | None = None + litellm_params: dict | None = None + + # Prompt Configuration + system_instructions: str | None = None + use_default_system_instructions: bool = True + citations_enabled: bool = True + + # Metadata + config_id: int | None = None + config_name: str | None = None + + # Auto mode flag + is_auto_mode: bool = False + + # Token quota and policy + billing_tier: str = "free" + is_premium: bool = False + anonymous_enabled: bool = False + quota_reserve_tokens: int | None = None + + # Capability flag: best-effort True for the chat selector / catalog. + # Resolved via :func:`provider_capabilities.derive_supports_image_input` + # which prefers OpenRouter's ``architecture.input_modalities`` and + # otherwise consults LiteLLM's authoritative model map. Default True + # is the conservative-allow stance — the streaming-task safety net + # (``is_known_text_only_chat_model``) is the *only* place a False + # actually blocks a request. Setting this to False here without an + # authoritative source would silently hide vision-capable models + # (the regression we're fixing). + supports_image_input: bool = True + + @classmethod + def from_auto_mode(cls) -> "AgentConfig": + """ + Create an AgentConfig for Auto mode (LiteLLM Router load balancing). + + Returns: + AgentConfig instance configured for Auto mode + """ + return cls( + provider="AUTO", + model_name="auto", + api_key="", # Not needed for router + api_base=None, + custom_provider=None, + litellm_params=None, + system_instructions=None, + use_default_system_instructions=True, + citations_enabled=True, + config_id=AUTO_MODE_ID, + config_name="Auto (Fastest)", + is_auto_mode=True, + billing_tier="free", + is_premium=False, + anonymous_enabled=False, + quota_reserve_tokens=None, + # Auto routes across the configured pool, which usually + # contains at least one vision-capable deployment; the router + # will surface a 404 from a non-vision deployment as a normal + # ``allowed_fails`` event and fail over rather than blocking + # the request outright. + supports_image_input=True, + ) + + @classmethod + def from_new_llm_config(cls, config) -> "AgentConfig": + """ + Create an AgentConfig from a NewLLMConfig database model. + + Args: + config: NewLLMConfig database model instance + + Returns: + AgentConfig instance + """ + # Lazy import to avoid pulling provider_capabilities (and its + # transitive litellm import) into module-init order. + from app.services.provider_capabilities import derive_supports_image_input + + provider_value = ( + config.provider.value + if hasattr(config.provider, "value") + else str(config.provider) + ) + litellm_params = config.litellm_params or {} + base_model = ( + litellm_params.get("base_model") + if isinstance(litellm_params, dict) + else None + ) + + return cls( + provider=provider_value, + model_name=config.model_name, + api_key=config.api_key, + api_base=config.api_base, + custom_provider=config.custom_provider, + litellm_params=config.litellm_params, + system_instructions=config.system_instructions, + use_default_system_instructions=config.use_default_system_instructions, + citations_enabled=config.citations_enabled, + config_id=config.id, + config_name=config.name, + is_auto_mode=False, + billing_tier="free", + is_premium=False, + anonymous_enabled=False, + quota_reserve_tokens=None, + # BYOK rows have no operator-curated capability flag, so we + # ask LiteLLM (default-allow on unknown). The streaming + # safety net still blocks if the model is *explicitly* + # marked text-only. + supports_image_input=derive_supports_image_input( + provider=provider_value, + model_name=config.model_name, + base_model=base_model, + custom_provider=config.custom_provider, + ), + ) + + @classmethod + def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig": + """ + Create an AgentConfig from a YAML configuration dictionary. + + YAML configs now support the same prompt configuration fields as NewLLMConfig: + - system_instructions: Custom system instructions (empty string uses defaults) + - use_default_system_instructions: Whether to use default instructions + - citations_enabled: Whether citations are enabled + + Args: + yaml_config: Configuration dictionary from YAML file + + Returns: + AgentConfig instance + """ + # Lazy import to avoid pulling provider_capabilities (and its + # transitive litellm import) into module-init order. + from app.services.provider_capabilities import derive_supports_image_input + + # Get system instructions from YAML, default to empty string + system_instructions = yaml_config.get("system_instructions", "") + + provider = yaml_config.get("provider", "").upper() + model_name = yaml_config.get("model_name", "") + custom_provider = yaml_config.get("custom_provider") + litellm_params = yaml_config.get("litellm_params") or {} + base_model = ( + litellm_params.get("base_model") + if isinstance(litellm_params, dict) + else None + ) + + # Explicit YAML override wins; otherwise derive from LiteLLM / + # OpenRouter modalities. The YAML loader already populates this + # field, but this method is also called from + # ``load_global_llm_config_by_id``'s file fallback (hot reload), + # so we re-derive here for safety. The bool() coercion preserves + # the loader's behaviour for explicit ``true`` / ``false`` + # strings that PyYAML may surface. + if "supports_image_input" in yaml_config: + supports_image_input = bool(yaml_config.get("supports_image_input")) + else: + supports_image_input = derive_supports_image_input( + provider=provider, + model_name=model_name, + base_model=base_model, + custom_provider=custom_provider, + ) + + return cls( + provider=provider, + model_name=model_name, + api_key=yaml_config.get("api_key", ""), + api_base=yaml_config.get("api_base"), + custom_provider=custom_provider, + litellm_params=yaml_config.get("litellm_params"), + # Prompt configuration from YAML (with defaults for backwards compatibility) + system_instructions=system_instructions if system_instructions else None, + use_default_system_instructions=yaml_config.get( + "use_default_system_instructions", True + ), + citations_enabled=yaml_config.get("citations_enabled", True), + config_id=yaml_config.get("id"), + config_name=yaml_config.get("name"), + is_auto_mode=False, + billing_tier=yaml_config.get("billing_tier", "free"), + is_premium=yaml_config.get("billing_tier", "free") == "premium", + anonymous_enabled=yaml_config.get("anonymous_enabled", False), + quota_reserve_tokens=yaml_config.get("quota_reserve_tokens"), + supports_image_input=supports_image_input, + ) + + +def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None: + """ + Load a specific LLM config from global_llm_config.yaml. + + Args: + llm_config_id: The id of the config to load (default: -1) + + Returns: + LLM config dict or None if not found + """ + # Get the config file path + base_dir = Path(__file__).resolve().parent.parent.parent.parent + config_file = base_dir / "app" / "config" / "global_llm_config.yaml" + + # Fallback to example file if main config doesn't exist + if not config_file.exists(): + config_file = base_dir / "app" / "config" / "global_llm_config.example.yaml" + if not config_file.exists(): + print("Error: No global_llm_config.yaml or example file found") + return None + + try: + with open(config_file, encoding="utf-8") as f: + data = yaml.safe_load(f) + configs = data.get("global_llm_configs", []) + for cfg in configs: + if isinstance(cfg, dict) and cfg.get("id") == llm_config_id: + return cfg + + print(f"Error: Global LLM config id {llm_config_id} not found") + return None + except Exception as e: + print(f"Error loading config: {e}") + return None + + +def load_global_llm_config_by_id(llm_config_id: int) -> dict | None: + """ + Load a global LLM config by ID, checking in-memory configs first. + + This handles both static YAML configs and dynamically injected configs + (e.g. OpenRouter integration models that only exist in memory). + + Args: + llm_config_id: The negative ID of the global config to load + + Returns: + LLM config dict or None if not found + """ + from app.config import config as app_config + + for cfg in app_config.GLOBAL_LLM_CONFIGS: + if cfg.get("id") == llm_config_id: + return cfg + # Fallback to YAML file read (covers edge cases like hot-reload) + return load_llm_config_from_yaml(llm_config_id) + + +async def load_new_llm_config_from_db( + session: AsyncSession, + config_id: int, +) -> "AgentConfig | None": + """ + Load a NewLLMConfig from the database by ID. + + Args: + session: AsyncSession for database access + config_id: The ID of the NewLLMConfig to load + + Returns: + AgentConfig instance or None if not found + """ + # Import here to avoid circular imports + from app.db import NewLLMConfig + + try: + result = await session.execute( + select(NewLLMConfig).filter(NewLLMConfig.id == config_id) + ) + config = result.scalars().first() + + if not config: + print(f"Error: NewLLMConfig with id {config_id} not found") + return None + + return AgentConfig.from_new_llm_config(config) + except Exception as e: + print(f"Error loading NewLLMConfig from database: {e}") + return None + + +async def load_agent_llm_config_for_search_space( + session: AsyncSession, + search_space_id: int, +) -> "AgentConfig | None": + """ + Load the agent LLM configuration for a search space. + + This loads the LLM config based on the search space's agent_llm_id setting: + - Positive ID: Load from NewLLMConfig database table + - Negative ID: Load from YAML global configs + - None: Falls back to first global config (id=-1) + + Args: + session: AsyncSession for database access + search_space_id: The search space ID + + Returns: + AgentConfig instance or None if not found + """ + # Import here to avoid circular imports + from app.db import SearchSpace + + try: + # Get the search space to check its agent_llm_id preference + result = await session.execute( + select(SearchSpace).filter(SearchSpace.id == search_space_id) + ) + search_space = result.scalars().first() + + if not search_space: + print(f"Error: SearchSpace with id {search_space_id} not found") + return None + + # Use agent_llm_id from search space, fallback to -1 (first global config) + config_id = ( + search_space.agent_llm_id if search_space.agent_llm_id is not None else -1 + ) + + # Load the config using the unified loader + return await load_agent_config(session, config_id, search_space_id) + except Exception as e: + print(f"Error loading agent LLM config for search space {search_space_id}: {e}") + return None + + +async def load_agent_config( + session: AsyncSession, + config_id: int, + search_space_id: int | None = None, +) -> "AgentConfig | None": + """ + Load an agent configuration, supporting Auto mode, YAML, and database configs. + + This is the main entry point for loading configurations: + - ID 0: Auto mode (uses LiteLLM Router for load balancing) + - Negative IDs: Load from YAML file (global configs) + - Positive IDs: Load from NewLLMConfig database table + + Args: + session: AsyncSession for database access + config_id: The config ID (0 for Auto, negative for YAML, positive for database) + search_space_id: Optional search space ID for context + + Returns: + AgentConfig instance or None if not found + """ + # Auto mode (ID 0) - use LiteLLM Router + if is_auto_mode(config_id): + if not LLMRouterService.is_initialized(): + print("Error: Auto mode requested but LLM Router not initialized") + return None + return AgentConfig.from_auto_mode() + + if config_id < 0: + # Check in-memory configs first (includes static YAML + dynamic OpenRouter) + from app.config import config as app_config + + for cfg in app_config.GLOBAL_LLM_CONFIGS: + if cfg.get("id") == config_id: + return AgentConfig.from_yaml_config(cfg) + # Fallback to YAML file read for safety + yaml_config = load_llm_config_from_yaml(config_id) + if yaml_config: + return AgentConfig.from_yaml_config(yaml_config) + return None + else: + # Load from database (NewLLMConfig) + return await load_new_llm_config_from_db(session, config_id) + + +def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None: + """ + Create a ChatLiteLLM instance from a global LLM config dictionary. + + Args: + llm_config: LLM configuration dictionary from YAML + + Returns: + ChatLiteLLM instance or None on error + """ + # Build the model string + if llm_config.get("custom_provider"): + model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}" + else: + provider = llm_config.get("provider", "").upper() + provider_prefix = PROVIDER_MAP.get(provider, provider.lower()) + model_string = f"{provider_prefix}/{llm_config['model_name']}" + + # Create ChatLiteLLM instance with streaming enabled + litellm_kwargs = { + "model": model_string, + "api_key": llm_config.get("api_key"), + "streaming": True, # Enable streaming for real-time token streaming + } + + # Add optional parameters + if llm_config.get("api_base"): + litellm_kwargs["api_base"] = llm_config["api_base"] + + # Add any additional litellm parameters + if llm_config.get("litellm_params"): + litellm_kwargs.update(llm_config["litellm_params"]) + + llm = SanitizedChatLiteLLM(**litellm_kwargs) + _attach_model_profile(llm, model_string) + # Configure LiteLLM-native prompt caching (cache_control_injection_points + # for Anthropic/Bedrock/Vertex/Gemini/Azure-AI/OpenRouter/Databricks/etc.). + # ``agent_config=None`` here — the YAML path doesn't have provider intent + # in a structured form, so we set only the universal injection points. + apply_litellm_prompt_caching(llm) + return llm + + +def create_chat_litellm_from_agent_config( + agent_config: AgentConfig, +) -> ChatLiteLLM | ChatLiteLLMRouter | None: + """ + Create a ChatLiteLLM or ChatLiteLLMRouter instance from an AgentConfig. + + For Auto mode configs, returns a ChatLiteLLMRouter that uses LiteLLM Router + for automatic load balancing across available providers. + + Args: + agent_config: AgentConfig instance + + Returns: + ChatLiteLLM or ChatLiteLLMRouter instance, or None on error + """ + # Handle Auto mode - return ChatLiteLLMRouter + if agent_config.is_auto_mode: + if not LLMRouterService.is_initialized(): + print("Error: Auto mode requested but LLM Router not initialized") + return None + try: + router_llm = get_auto_mode_llm() + if router_llm is not None: + # Universal cache_control_injection_points only — auto-mode + # fans out across providers, so OpenAI-only kwargs (e.g. + # ``prompt_cache_key``) are left off here. ``drop_params`` + # would strip them at the provider boundary anyway, but + # there's no point setting them when we don't know the + # destination. + apply_litellm_prompt_caching(router_llm, agent_config=agent_config) + return router_llm + except Exception as e: + print(f"Error creating ChatLiteLLMRouter: {e}") + return None + + # Build the model string + if agent_config.custom_provider: + model_string = f"{agent_config.custom_provider}/{agent_config.model_name}" + else: + provider_prefix = PROVIDER_MAP.get( + agent_config.provider, agent_config.provider.lower() + ) + model_string = f"{provider_prefix}/{agent_config.model_name}" + + # Create ChatLiteLLM instance with streaming enabled + litellm_kwargs = { + "model": model_string, + "api_key": agent_config.api_key, + "streaming": True, # Enable streaming for real-time token streaming + } + + # Add optional parameters + if agent_config.api_base: + litellm_kwargs["api_base"] = agent_config.api_base + + # Add any additional litellm parameters + if agent_config.litellm_params: + litellm_kwargs.update(agent_config.litellm_params) + + llm = SanitizedChatLiteLLM(**litellm_kwargs) + _attach_model_profile(llm, model_string) + # Build-time prompt caching: sets ``cache_control_injection_points`` for + # all providers and (for OpenAI/DeepSeek/xAI) ``prompt_cache_retention``. + # Per-thread ``prompt_cache_key`` is layered on later in + # ``create_surfsense_deep_agent`` once ``thread_id`` is known. + apply_litellm_prompt_caching(llm, agent_config=agent_config) + return llm diff --git a/surfsense_backend/app/agents/shared/prompt_caching.py b/surfsense_backend/app/agents/shared/prompt_caching.py new file mode 100644 index 000000000..f8aae45a8 --- /dev/null +++ b/surfsense_backend/app/agents/shared/prompt_caching.py @@ -0,0 +1,241 @@ +r"""LiteLLM-native prompt caching configuration for SurfSense agents. + +Replaces the legacy ``AnthropicPromptCachingMiddleware`` (which never +activated for our LiteLLM-based stack — its ``isinstance(model, ChatAnthropic)`` +gate always failed) with LiteLLM's universal caching mechanism. + +Coverage: + +- Marker-based providers (need ``cache_control`` injection, which LiteLLM + performs automatically when ``cache_control_injection_points`` is set): + ``anthropic/``, ``bedrock/``, ``vertex_ai/``, ``gemini/``, ``azure_ai/``, + ``openrouter/`` (Claude/Gemini/MiniMax/GLM/z-ai routes), ``databricks/`` + (Claude), ``dashscope/`` (Qwen), ``minimax/``, ``zai/`` (GLM). +- Auto-cached (LiteLLM strips the marker silently): ``openai/``, + ``deepseek/``, ``xai/`` — these caches automatically for prompts ≥1024 + tokens and surface ``prompt_cache_key`` / ``prompt_cache_retention``. + +We inject **two** breakpoints per request: + +- ``index: 0`` — pins the SurfSense system prompt at the head of the + request (provider variant, citation rules, tool catalog, KB tree, + skills metadata). The langchain agent factory always prepends + ``request.system_message`` at index 0 (see ``factory.py`` + ``_execute_model_async``), so this targets exactly the main system + prompt regardless of how many other ``SystemMessage``\ s the + ``before_agent`` injectors (priority, tree, memory, file-intent, + anonymous-doc) have inserted into ``state["messages"]``. Using + ``role: system`` here would apply ``cache_control`` to **every** + system-role message and trip Anthropic's hard cap of 4 cache + breakpoints per request once the conversation accumulates enough + injected system messages — which surfaces as the upstream 400 + ``A maximum of 4 blocks with cache_control may be provided. Found N`` + via OpenRouter→Anthropic. +- ``index: -1`` — pins the latest message so multi-turn savings compound: + Anthropic-family providers use longest-matching-prefix lookup, so turn + N+1 still reads turn N's cache up to the shared prefix. + +For OpenAI-family configs we additionally pass: + +- ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` — routing hint that + raises hit rate by sending requests with a shared prefix to the same + backend. Supported by ``openai/``, ``deepseek/``, ``xai/``, and + ``azure/`` (added to LiteLLM's Azure transformer in + https://github.com/BerriAI/litellm/pull/20989, Feb 2026; verified + against ``AzureOpenAIConfig.get_supported_openai_params`` in our + installed litellm 1.83.14 for ``azure/gpt-4o``, ``azure/gpt-4o-mini``, + ``azure/gpt-5.4``, ``azure/gpt-5.4-mini``). +- ``prompt_cache_retention="24h"`` — extends cache TTL beyond the default + 5-10 min in-memory cache. Set ONLY for OpenAI/DeepSeek/xAI: Azure's + server-side support landed in Microsoft's docs on 2026-05-13 but + LiteLLM 1.83.14's Azure transformer still omits it from its supported + params list, so it gets silently dropped by ``litellm.drop_params``. + Azure's default in-memory retention (5-10 min, max 1 h) already + bridges intra-conversation turns; revisit when LiteLLM bumps Azure. + +Safety net: ``litellm.drop_params=True`` is set globally in +``app.services.llm_service`` at module-load time. Any kwarg the destination +provider doesn't recognise is auto-stripped at the provider transformer +layer, so an OpenAI→Bedrock auto-mode fallback can't 400 on +``prompt_cache_key`` etc. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +from langchain_core.language_models import BaseChatModel + +if TYPE_CHECKING: + from app.agents.shared.llm_config import AgentConfig + +logger = logging.getLogger(__name__) + + +# Two-breakpoint policy: head-of-request + latest message. See module +# docstring for rationale. Anthropic caps requests at 4 ``cache_control`` +# blocks; we use 2 here, leaving headroom for Phase-2 tool caching. +# +# IMPORTANT: ``index: 0`` (not ``role: system``). The deepagent stack's +# ``before_agent`` middlewares (priority, tree, memory, file-intent, +# anonymous-doc) insert ``SystemMessage`` instances into +# ``state["messages"]`` that accumulate across turns. With +# ``role: system`` the LiteLLM hook would tag *every* one of them with +# ``cache_control`` and overflow Anthropic's 4-block limit. ``index: 0`` +# always targets the langchain-prepended ``request.system_message`` +# (which our ``FlattenSystemMessageMiddleware`` reduces to a single text +# block), giving us exactly one stable cache breakpoint. +_DEFAULT_INJECTION_POINTS: tuple[dict[str, Any], ...] = ( + {"location": "message", "index": 0}, + {"location": "message", "index": -1}, +) + +# Providers (uppercase ``AgentConfig.provider`` values) that accept the +# OpenAI ``prompt_cache_key`` routing hint. Microsoft's Azure OpenAI docs +# (2026-05-13) confirm automatic prompt caching applies to every GPT-4o +# or newer Azure deployment at ≥1024 tokens with no configuration needed, +# and that ``prompt_cache_key`` is combined with the prefix hash to +# improve routing affinity and therefore cache hit rate. LiteLLM's Azure +# transformer ships ``prompt_cache_key`` in its supported params as of +# https://github.com/BerriAI/litellm/pull/20989. +# +# Strict whitelist — many other providers in ``PROVIDER_MAP`` route +# through litellm's ``openai`` prefix without implementing the OpenAI +# prompt-cache surface (e.g. MOONSHOT, ZHIPU, MINIMAX), so we can't infer +# family from the litellm prefix alone. +_PROMPT_CACHE_KEY_PROVIDERS: frozenset[str] = frozenset( + {"OPENAI", "DEEPSEEK", "XAI", "AZURE", "AZURE_OPENAI"} +) + +# Subset of ``_PROMPT_CACHE_KEY_PROVIDERS`` that also accept +# ``prompt_cache_retention="24h"``. Azure is excluded: see module +# docstring — LiteLLM 1.83.14's Azure transformer omits the param so +# ``drop_params`` silently strips it. Re-add Azure once a future LiteLLM +# release wires it into ``AzureOpenAIConfig.get_supported_openai_params``. +_PROMPT_CACHE_RETENTION_PROVIDERS: frozenset[str] = frozenset( + {"OPENAI", "DEEPSEEK", "XAI"} +) + + +def _is_router_llm(llm: BaseChatModel) -> bool: + """Detect ``ChatLiteLLMRouter`` (auto-mode) without an eager import. + + Importing ``app.services.llm_router_service`` at module-load time would + create a cycle via ``llm_config -> prompt_caching -> llm_router_service``. + Class-name comparison is sufficient since the class is defined in a + single place. + """ + return type(llm).__name__ == "ChatLiteLLMRouter" + + +def _provider_supports_prompt_cache_key(agent_config: AgentConfig | None) -> bool: + """Whether the config targets a provider that accepts ``prompt_cache_key``. + + Strict — only returns True for explicitly chosen OPENAI, DEEPSEEK, + XAI, AZURE, or AZURE_OPENAI providers. Auto-mode and custom + providers return False because we can't statically know the + destination and the router fans out across mixed providers. + """ + if agent_config is None or not agent_config.provider: + return False + if agent_config.is_auto_mode: + return False + if agent_config.custom_provider: + return False + return agent_config.provider.upper() in _PROMPT_CACHE_KEY_PROVIDERS + + +def _provider_supports_prompt_cache_retention( + agent_config: AgentConfig | None, +) -> bool: + """Whether the config targets a provider that accepts ``prompt_cache_retention``. + + Tighter than :func:`_provider_supports_prompt_cache_key` — Azure + deployments are excluded until LiteLLM ships the param in its Azure + transformer (see module docstring). + """ + if agent_config is None or not agent_config.provider: + return False + if agent_config.is_auto_mode: + return False + if agent_config.custom_provider: + return False + return agent_config.provider.upper() in _PROMPT_CACHE_RETENTION_PROVIDERS + + +def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None: + """Return ``llm.model_kwargs`` as a writable dict, or ``None`` to bail. + + Initialises the field to ``{}`` when present-but-None on a Pydantic v2 + model. Returns ``None`` if the LLM type doesn't expose a writable + ``model_kwargs`` attribute (caller should treat as no-op). + """ + model_kwargs = getattr(llm, "model_kwargs", None) + if isinstance(model_kwargs, dict): + return model_kwargs + try: + llm.model_kwargs = {} # type: ignore[attr-defined] + except Exception: + return None + refreshed = getattr(llm, "model_kwargs", None) + return refreshed if isinstance(refreshed, dict) else None + + +def apply_litellm_prompt_caching( + llm: BaseChatModel, + *, + agent_config: AgentConfig | None = None, + thread_id: int | None = None, +) -> None: + """Configure LiteLLM prompt caching on a ChatLiteLLM/ChatLiteLLMRouter. + + Idempotent — values already present in ``llm.model_kwargs`` (e.g. from + ``agent_config.litellm_params`` overrides) are preserved. Mutates + ``llm.model_kwargs`` in place; the kwargs flow to ``litellm.completion`` + via ``ChatLiteLLM._default_params`` and via ``self.model_kwargs`` merge + in our custom ``ChatLiteLLMRouter``. + + Args: + llm: ChatLiteLLM, SanitizedChatLiteLLM, or ChatLiteLLMRouter instance. + agent_config: Optional ``AgentConfig`` driving provider-specific + behaviour. When omitted (or auto-mode), only the universal + ``cache_control_injection_points`` are set. + thread_id: Optional thread id used to construct a per-thread + ``prompt_cache_key`` for OpenAI-family providers. Caching still + works without it (server-side automatic), but the key improves + backend routing affinity and therefore hit rate. + """ + model_kwargs = _get_or_init_model_kwargs(llm) + if model_kwargs is None: + logger.debug( + "apply_litellm_prompt_caching: %s exposes no writable model_kwargs; skipping", + type(llm).__name__, + ) + return + + if "cache_control_injection_points" not in model_kwargs: + model_kwargs["cache_control_injection_points"] = [ + dict(point) for point in _DEFAULT_INJECTION_POINTS + ] + + # OpenAI-style extras only when we statically know the destination + # accepts them. Auto-mode router fans out across mixed providers so + # we can't safely set destination-specific kwargs there (drop_params + # would strip them but it's wasteful to set them in the first + # place). + if _is_router_llm(llm): + return + + if ( + thread_id is not None + and "prompt_cache_key" not in model_kwargs + and _provider_supports_prompt_cache_key(agent_config) + ): + model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}" + + if ( + "prompt_cache_retention" not in model_kwargs + and _provider_supports_prompt_cache_retention(agent_config) + ): + model_kwargs["prompt_cache_retention"] = "24h" diff --git a/surfsense_backend/app/automations/services/model_policy.py b/surfsense_backend/app/automations/services/model_policy.py index 88e9d5f28..e6ab8bde5 100644 --- a/surfsense_backend/app/automations/services/model_policy.py +++ b/surfsense_backend/app/automations/services/model_policy.py @@ -39,7 +39,7 @@ def _is_premium_global(kind: ModelKind, config_id: int) -> bool: cfg: dict | None = None if kind == "llm": - from app.agents.new_chat.llm_config import load_global_llm_config_by_id + from app.agents.shared.llm_config import load_global_llm_config_by_id cfg = load_global_llm_config_by_id(config_id) elif kind == "image": diff --git a/surfsense_backend/app/routes/anonymous_chat_routes.py b/surfsense_backend/app/routes/anonymous_chat_routes.py index eb952e684..8072664a0 100644 --- a/surfsense_backend/app/routes/anonymous_chat_routes.py +++ b/surfsense_backend/app/routes/anonymous_chat_routes.py @@ -236,7 +236,7 @@ async def stream_anonymous_chat( detail="No-login mode is not enabled.", ) - from app.agents.new_chat.llm_config import ( + from app.agents.shared.llm_config import ( AgentConfig, create_chat_litellm_from_agent_config, ) diff --git a/surfsense_backend/app/services/llm_service.py b/surfsense_backend/app/services/llm_service.py index aadb60cde..5ef5a530b 100644 --- a/surfsense_backend/app/services/llm_service.py +++ b/surfsense_backend/app/services/llm_service.py @@ -204,7 +204,7 @@ async def validate_llm_config( if litellm_params: litellm_kwargs.update(litellm_params) - from app.agents.new_chat.llm_config import SanitizedChatLiteLLM + from app.agents.shared.llm_config import SanitizedChatLiteLLM llm = SanitizedChatLiteLLM(**litellm_kwargs) @@ -379,7 +379,7 @@ async def get_search_space_llm_instance( if disable_streaming: litellm_kwargs["disable_streaming"] = True - from app.agents.new_chat.llm_config import SanitizedChatLiteLLM + from app.agents.shared.llm_config import SanitizedChatLiteLLM return SanitizedChatLiteLLM(**litellm_kwargs) @@ -458,7 +458,7 @@ async def get_search_space_llm_instance( if disable_streaming: litellm_kwargs["disable_streaming"] = True - from app.agents.new_chat.llm_config import SanitizedChatLiteLLM + from app.agents.shared.llm_config import SanitizedChatLiteLLM return SanitizedChatLiteLLM(**litellm_kwargs) @@ -580,7 +580,7 @@ async def get_vision_llm( if global_cfg.get("litellm_params"): litellm_kwargs.update(global_cfg["litellm_params"]) - from app.agents.new_chat.llm_config import SanitizedChatLiteLLM + from app.agents.shared.llm_config import SanitizedChatLiteLLM inner_llm = SanitizedChatLiteLLM(**litellm_kwargs) @@ -634,7 +634,7 @@ async def get_vision_llm( if vision_cfg.litellm_params: litellm_kwargs.update(vision_cfg.litellm_params) - from app.agents.new_chat.llm_config import SanitizedChatLiteLLM + from app.agents.shared.llm_config import SanitizedChatLiteLLM return SanitizedChatLiteLLM(**litellm_kwargs) @@ -679,7 +679,7 @@ def get_planner_llm() -> ChatLiteLLM | None: Callers MUST fall back to their chat LLM when this returns ``None`` so deployments without a planner config keep working unchanged. """ - from app.agents.new_chat.llm_config import create_chat_litellm_from_config + from app.agents.shared.llm_config import create_chat_litellm_from_config planner_cfg = next( (cfg for cfg in config.GLOBAL_LLM_CONFIGS if cfg.get("is_planner") is True), diff --git a/surfsense_backend/app/services/provider_capabilities.py b/surfsense_backend/app/services/provider_capabilities.py index e9a1c33e1..74fae0e19 100644 --- a/surfsense_backend/app/services/provider_capabilities.py +++ b/surfsense_backend/app/services/provider_capabilities.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # # Owned here because ``app.services.provider_capabilities`` is the # only edge that's safe to call from ``app.config``'s YAML loader at -# class-body init time. ``app.agents.new_chat.llm_config`` re-exports +# class-body init time. ``app.agents.shared.llm_config`` re-exports # this constant under the historical ``PROVIDER_MAP`` name; placing the # map there directly would re-introduce the # ``app.config -> ... -> app.agents.new_chat.tools.generate_image -> diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py index 2772fd588..22fdc1df9 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py @@ -32,7 +32,7 @@ from app.agents.new_chat.checkpointer import get_checkpointer from app.agents.shared.context import SurfSenseContextSchema from app.agents.shared.errors import BusyError from app.agents.shared.filesystem_selection import FilesystemMode, FilesystemSelection -from app.agents.new_chat.llm_config import ( +from app.agents.shared.llm_config import ( AgentConfig, create_chat_litellm_from_agent_config, create_chat_litellm_from_config, diff --git a/surfsense_backend/app/tasks/chat/streaming/agent/builder.py b/surfsense_backend/app/tasks/chat/streaming/agent/builder.py index 313e87058..b622ab7e5 100644 --- a/surfsense_backend/app/tasks/chat/streaming/agent/builder.py +++ b/surfsense_backend/app/tasks/chat/streaming/agent/builder.py @@ -10,7 +10,7 @@ from __future__ import annotations from typing import Any from app.agents.shared.filesystem_selection import FilesystemSelection -from app.agents.new_chat.llm_config import AgentConfig +from app.agents.shared.llm_config import AgentConfig from app.db import ChatVisibility from app.services.connector_service import ConnectorService diff --git a/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/llm_capability.py b/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/llm_capability.py index 9f4e5d2d8..9f921fa74 100644 --- a/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/llm_capability.py +++ b/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/llm_capability.py @@ -15,7 +15,7 @@ tells the user what to change. from __future__ import annotations -from app.agents.new_chat.llm_config import AgentConfig +from app.agents.shared.llm_config import AgentConfig from app.observability import otel as ot diff --git a/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/title_gen.py b/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/title_gen.py index 7db45941b..dfa82b5bf 100644 --- a/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/title_gen.py +++ b/surfsense_backend/app/tasks/chat/streaming/flows/new_chat/title_gen.py @@ -30,7 +30,7 @@ from app.prompts import TITLE_GENERATION_PROMPT from app.services.new_streaming_service import VercelStreamingService if TYPE_CHECKING: - from app.agents.new_chat.llm_config import AgentConfig + from app.agents.shared.llm_config import AgentConfig from app.services.token_tracking_service import TokenAccumulator diff --git a/surfsense_backend/app/tasks/chat/streaming/flows/shared/llm_bundle.py b/surfsense_backend/app/tasks/chat/streaming/flows/shared/llm_bundle.py index 2f334114c..b455e8fde 100644 --- a/surfsense_backend/app/tasks/chat/streaming/flows/shared/llm_bundle.py +++ b/surfsense_backend/app/tasks/chat/streaming/flows/shared/llm_bundle.py @@ -14,7 +14,7 @@ from typing import Any from sqlalchemy.ext.asyncio import AsyncSession -from app.agents.new_chat.llm_config import ( +from app.agents.shared.llm_config import ( AgentConfig, create_chat_litellm_from_agent_config, create_chat_litellm_from_config, diff --git a/surfsense_backend/app/tasks/chat/streaming/flows/shared/premium_quota.py b/surfsense_backend/app/tasks/chat/streaming/flows/shared/premium_quota.py index cbf44764c..1d364c84b 100644 --- a/surfsense_backend/app/tasks/chat/streaming/flows/shared/premium_quota.py +++ b/surfsense_backend/app/tasks/chat/streaming/flows/shared/premium_quota.py @@ -19,7 +19,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING from uuid import UUID -from app.agents.new_chat.llm_config import AgentConfig +from app.agents.shared.llm_config import AgentConfig from app.db import shielded_async_session if TYPE_CHECKING: diff --git a/surfsense_backend/tests/e2e/run_backend.py b/surfsense_backend/tests/e2e/run_backend.py index 5a787ac52..2567cc7a4 100644 --- a/surfsense_backend/tests/e2e/run_backend.py +++ b/surfsense_backend/tests/e2e/run_backend.py @@ -239,11 +239,11 @@ def _patch_llm_bindings() -> None: chat_targets = [ ( - "app.agents.new_chat.llm_config.create_chat_litellm_from_agent_config", + "app.agents.shared.llm_config.create_chat_litellm_from_agent_config", fake_create_chat_litellm_from_agent_config, ), ( - "app.agents.new_chat.llm_config.create_chat_litellm_from_config", + "app.agents.shared.llm_config.create_chat_litellm_from_config", fake_create_chat_litellm_from_config, ), ( diff --git a/surfsense_backend/tests/e2e/run_celery.py b/surfsense_backend/tests/e2e/run_celery.py index e4091d689..9e7576a51 100644 --- a/surfsense_backend/tests/e2e/run_celery.py +++ b/surfsense_backend/tests/e2e/run_celery.py @@ -212,11 +212,11 @@ def _patch_llm_bindings() -> None: chat_targets = [ ( - "app.agents.new_chat.llm_config.create_chat_litellm_from_agent_config", + "app.agents.shared.llm_config.create_chat_litellm_from_agent_config", fake_create_chat_litellm_from_agent_config, ), ( - "app.agents.new_chat.llm_config.create_chat_litellm_from_config", + "app.agents.shared.llm_config.create_chat_litellm_from_config", fake_create_chat_litellm_from_config, ), ( diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py index c3de15c58..f5452e62e 100644 --- a/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py +++ b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py @@ -1,5 +1,5 @@ r"""Tests for ``apply_litellm_prompt_caching`` in -:mod:`app.agents.new_chat.prompt_caching`. +:mod:`app.agents.shared.prompt_caching`. The helper replaces the legacy ``AnthropicPromptCachingMiddleware`` (which never activated for our LiteLLM stack) with LiteLLM-native multi-provider @@ -34,8 +34,8 @@ from typing import Any import pytest -from app.agents.new_chat.llm_config import AgentConfig -from app.agents.new_chat.prompt_caching import apply_litellm_prompt_caching +from app.agents.shared.llm_config import AgentConfig +from app.agents.shared.prompt_caching import apply_litellm_prompt_caching pytestmark = pytest.mark.unit diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_resolve_prompt_model_name.py b/surfsense_backend/tests/unit/agents/new_chat/test_resolve_prompt_model_name.py index ffe3dbaa4..a9041f5a7 100644 --- a/surfsense_backend/tests/unit/agents/new_chat/test_resolve_prompt_model_name.py +++ b/surfsense_backend/tests/unit/agents/new_chat/test_resolve_prompt_model_name.py @@ -17,7 +17,7 @@ from __future__ import annotations import pytest from app.agents.new_chat.chat_deepagent import _resolve_prompt_model_name -from app.agents.new_chat.llm_config import AgentConfig +from app.agents.shared.llm_config import AgentConfig pytestmark = pytest.mark.unit diff --git a/surfsense_backend/tests/unit/automations/services/test_model_policy.py b/surfsense_backend/tests/unit/automations/services/test_model_policy.py index 2a471b4e9..2c5b8895f 100644 --- a/surfsense_backend/tests/unit/automations/services/test_model_policy.py +++ b/surfsense_backend/tests/unit/automations/services/test_model_policy.py @@ -44,7 +44,7 @@ def patched_globals(monkeypatch: pytest.MonkeyPatch): -2: {"id": -2, "billing_tier": "free"}, } monkeypatch.setattr( - "app.agents.new_chat.llm_config.load_global_llm_config_by_id", + "app.agents.shared.llm_config.load_global_llm_config_by_id", lambda cid: llm_configs.get(cid), ) diff --git a/surfsense_backend/tests/unit/services/test_supports_image_input.py b/surfsense_backend/tests/unit/services/test_supports_image_input.py index 71fdee1c7..ffd0f4bf0 100644 --- a/surfsense_backend/tests/unit/services/test_supports_image_input.py +++ b/surfsense_backend/tests/unit/services/test_supports_image_input.py @@ -227,7 +227,7 @@ global_llm_configs: def test_agent_config_from_yaml_explicit_overrides_resolver(): - from app.agents.new_chat.llm_config import AgentConfig + from app.agents.shared.llm_config import AgentConfig cfg_text_only = AgentConfig.from_yaml_config( { @@ -256,7 +256,7 @@ def test_agent_config_from_yaml_explicit_overrides_resolver(): def test_agent_config_from_yaml_unannotated_uses_resolver(): """Without an explicit YAML key, AgentConfig defers to the catalog resolver — for ``gpt-4o`` LiteLLM's map says supports_vision=True.""" - from app.agents.new_chat.llm_config import AgentConfig + from app.agents.shared.llm_config import AgentConfig cfg = AgentConfig.from_yaml_config( { @@ -275,7 +275,7 @@ def test_agent_config_auto_mode_supports_image_input(): so users can keep their selection on Auto with a vision-capable deployment somewhere in the pool. The router's own `allowed_fails` handles non-vision deployments via fallback.""" - from app.agents.new_chat.llm_config import AgentConfig + from app.agents.shared.llm_config import AgentConfig auto = AgentConfig.from_auto_mode() assert auto.supports_image_input is True diff --git a/surfsense_backend/tests/unit/services/test_vision_llm_api_base_defense.py b/surfsense_backend/tests/unit/services/test_vision_llm_api_base_defense.py index b8ba9d80c..8703dcb66 100644 --- a/surfsense_backend/tests/unit/services/test_vision_llm_api_base_defense.py +++ b/surfsense_backend/tests/unit/services/test_vision_llm_api_base_defense.py @@ -61,7 +61,7 @@ async def test_get_vision_llm_global_openrouter_sets_api_base(): return_value=cfg, ), patch( - "app.agents.new_chat.llm_config.SanitizedChatLiteLLM", + "app.agents.shared.llm_config.SanitizedChatLiteLLM", new=FakeSanitized, ), ):