2026-06-04 12:41:52 +02:00
|
|
|
"""
|
|
|
|
|
LLM configuration utilities for SurfSense agents.
|
|
|
|
|
|
|
|
|
|
This module provides functions for loading LLM configurations from:
|
2026-06-11 18:22:23 +05:30
|
|
|
1. Auto mode (ID 0) - Resolved by callers to a concrete model-connection model
|
2026-06-04 12:41:52 +02:00
|
|
|
2. YAML files (global configs with negative IDs)
|
2026-06-11 18:22:23 +05:30
|
|
|
3. Database model-connections table (user-created configs with positive IDs)
|
2026-06-04 12:41:52 +02:00
|
|
|
|
|
|
|
|
It also provides utilities for creating ChatLiteLLM instances and
|
|
|
|
|
managing prompt configurations.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from collections.abc import AsyncIterator, Iterator
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
import yaml
|
|
|
|
|
from langchain_core.callbacks import (
|
|
|
|
|
AsyncCallbackManagerForLLMRun,
|
|
|
|
|
CallbackManagerForLLMRun,
|
|
|
|
|
)
|
|
|
|
|
from langchain_core.messages import AIMessage, BaseMessage
|
|
|
|
|
from langchain_core.outputs import ChatGenerationChunk, ChatResult
|
|
|
|
|
from langchain_litellm import ChatLiteLLM
|
|
|
|
|
from litellm import get_model_info
|
|
|
|
|
|
2026-06-05 13:19:24 +02:00
|
|
|
from app.agents.chat.runtime.prompt_caching import (
|
refactor(agents): move mac-only modules out of the cross-agent shared kernel
app/agents/shared/ is a sibling of anonymous_chat/podcaster/multi_agent_chat/
video_presentation, so it should only hold code shared across 2+ of those
agents. In practice podcaster and video_presentation import nothing from it,
and anonymous_chat needs only context + compaction + retry_after + web_search.
Everything else was multi_agent_chat-only (the boundary just passes through).
Move the multi_agent_chat-only cluster into multi_agent_chat/shared/ (files
moved verbatim via git rename; ~116 import sites rewritten):
errors, feature_flags, filesystem_selection, path_resolver, prompt_caching,
sandbox, llm_config, mention_resolver
middleware/busy_mutex, middleware/kb_persistence
busy_mutex/llm_config/mention_resolver are boundary-only but import the moved
modules, so they were folded in to avoid a backwards shared -> multi_agent_chat
dependency. main_agent builders now import the impls directly; the shared
middleware barrel keeps only the genuinely-shared compaction + retry_after.
Also delete the dead leftover shared/plugins and shared/skills dirs (live
copies already live under main_agent/).
Remaining in app/agents/shared/: context, system_prompt(+prompts), checkpointer,
middleware/{compaction,retry_after,dedup_tool_calls}, tools/. checkpointer and
system_prompt are boundary-only infra pending a dedicated home decision.
2026-06-05 12:30:15 +02:00
|
|
|
apply_litellm_prompt_caching,
|
|
|
|
|
)
|
2026-06-04 12:41:52 +02:00
|
|
|
from app.services.llm_router_service import (
|
|
|
|
|
AUTO_MODE_ID,
|
|
|
|
|
ChatLiteLLMRouter,
|
|
|
|
|
_sanitize_content,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _sanitize_messages(messages: list[BaseMessage]) -> list[BaseMessage]:
    """Return provider-safe deep copies of ``messages``.

    The caller's messages are never mutated: every message is deep-copied
    and the normalisation is applied to the copy.

    Three cross-provider incompatibilities are addressed:

    - List content carrying provider-specific blocks (e.g. ``thinking``)
    - List content carrying bare strings or empty text blocks
    - AI messages with empty content + tool calls: some providers (Bedrock)
      convert ``""`` to ``[{"type":"text","text":""}]`` server-side and then
      reject the blank text. The OpenAI spec says ``content`` should be
      ``null`` when an assistant message only carries tool calls.

    Args:
        messages: Conversation messages about to be sent to a provider.

    Returns:
        A new list of sanitized message copies, in the original order.
    """

    def _clean(original: BaseMessage) -> BaseMessage:
        clone = original.model_copy(deep=True)

        # The first two cases are delegated to the shared content sanitizer.
        if isinstance(clone.content, list):
            clone.content = _sanitize_content(clone.content)

        # Evaluated after list sanitizing, so content that sanitized down to
        # something empty is also covered by the tool-call-only rule.
        carries_only_tool_calls = (
            isinstance(clone, AIMessage)
            and not clone.content
            and getattr(clone, "tool_calls", None)
        )
        if carries_only_tool_calls:
            clone.content = None  # type: ignore[assignment]

        return clone

    return [_clean(message) for message in messages]
|
2026-06-04 12:41:52 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class SanitizedChatLiteLLM(ChatLiteLLM):
    """ChatLiteLLM subclass that sanitizes messages before every provider call.

    Each generation hook runs the incoming messages through
    ``_sanitize_messages`` (which strips provider-specific content blocks such
    as ``thinking`` from reasoning models and normalises bare strings in
    content arrays) and then defers to the ``ChatLiteLLM`` implementation.

    ``_stream`` is overridden alongside ``_generate``, ``_astream`` and
    ``_agenerate`` so that the synchronous streaming entry point receives the
    same treatment as the other three hooks.
    """

    def _generate(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Sanitize ``messages`` and delegate to the parent sync generation."""
        return super()._generate(
            _sanitize_messages(messages), stop, run_manager, **kwargs
        )

    def _stream(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        """Sanitize ``messages`` and delegate to the parent sync streaming."""
        yield from super()._stream(
            _sanitize_messages(messages), stop, run_manager, **kwargs
        )

    async def _astream(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: AsyncCallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> AsyncIterator[ChatGenerationChunk]:
        """Sanitize ``messages`` and delegate to the parent async streaming."""
        async for chunk in super()._astream(
            _sanitize_messages(messages), stop, run_manager, **kwargs
        ):
            yield chunk

    async def _agenerate(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: AsyncCallbackManagerForLLMRun | None = None,
        stream: bool | None = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Sanitize ``messages`` and delegate to the parent async generation."""
        return await super()._agenerate(
            _sanitize_messages(messages),
            stop=stop,
            run_manager=run_manager,
            stream=stream,
            **kwargs,
        )
|
|
|
|
|
|
2026-06-04 12:41:52 +02:00
|
|
|
|
|
|
|
|
def _attach_model_profile(llm: ChatLiteLLM, model_string: str) -> None:
    """Best-effort: set ``llm.profile`` with context-window metadata.

    Looks up ``model_string`` via ``get_model_info`` and, when a positive
    integer ``max_input_tokens`` is reported, stores a profile dict on
    ``llm``. Any failure (unknown model, lookup error, assignment error) is
    swallowed and leaves ``llm`` without a profile.

    Args:
        llm: Chat model instance to annotate in place.
        model_string: Model identifier used both for the lookup and as the
            token-counting model recorded in the profile.
    """
    try:
        token_limit = get_model_info(model_string).get("max_input_tokens")
        if not isinstance(token_limit, int) or token_limit <= 0:
            return

        profile = {
            "max_input_tokens": token_limit,
            "max_input_tokens_upper": token_limit,
            "token_count_model": model_string,
            "token_count_models": [model_string],
        }
        llm.profile = profile
    except Exception:
        # Profile metadata is optional; never let it break model creation.
        return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class AgentConfig:
|
|
|
|
|
"""
|
|
|
|
|
Complete configuration for the SurfSense agent.
|
|
|
|
|
|
2026-06-13 12:45:43 +05:30
|
|
|
This combines resolved model settings with prompt configuration.
|
2026-06-11 18:22:23 +05:30
|
|
|
Supports Auto mode metadata (ID 0). Runtime callers must resolve Auto to
|
|
|
|
|
a concrete global or BYOK model before constructing ChatLiteLLM.
|
2026-06-04 12:41:52 +02:00
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# LLM Model Settings
|
|
|
|
|
provider: str
|
|
|
|
|
model_name: str
|
|
|
|
|
api_key: str
|
|
|
|
|
api_base: str | None = None
|
|
|
|
|
custom_provider: str | None = None
|
|
|
|
|
litellm_params: dict | None = None
|
|
|
|
|
|
|
|
|
|
# Prompt Configuration
|
|
|
|
|
system_instructions: str | None = None
|
|
|
|
|
use_default_system_instructions: bool = True
|
|
|
|
|
citations_enabled: bool = True
|
|
|
|
|
|
|
|
|
|
# Metadata
|
|
|
|
|
config_id: int | None = None
|
|
|
|
|
config_name: str | None = None
|
|
|
|
|
|
|
|
|
|
# Auto mode flag
|
|
|
|
|
is_auto_mode: bool = False
|
|
|
|
|
|
|
|
|
|
# Token quota and policy
|
|
|
|
|
billing_tier: str = "free"
|
|
|
|
|
is_premium: bool = False
|
|
|
|
|
anonymous_enabled: bool = False
|
|
|
|
|
quota_reserve_tokens: int | None = None
|
|
|
|
|
|
2026-06-05 17:39:38 +02:00
|
|
|
# Default-allow: only the streaming safety net (is_known_text_only_chat_model)
|
|
|
|
|
# actually blocks on False, so defaulting False would silently hide
|
|
|
|
|
# vision-capable models. Resolved via derive_supports_image_input.
|
2026-06-04 12:41:52 +02:00
|
|
|
supports_image_input: bool = True
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
def from_auto_mode(cls) -> "AgentConfig":
|
2026-06-05 17:39:38 +02:00
|
|
|
"""Build an AgentConfig for Auto mode (LiteLLM Router load balancing)."""
|
2026-06-04 12:41:52 +02:00
|
|
|
return cls(
|
|
|
|
|
provider="AUTO",
|
|
|
|
|
model_name="auto",
|
|
|
|
|
api_key="", # Not needed for router
|
|
|
|
|
api_base=None,
|
|
|
|
|
custom_provider=None,
|
|
|
|
|
litellm_params=None,
|
|
|
|
|
system_instructions=None,
|
|
|
|
|
use_default_system_instructions=True,
|
|
|
|
|
citations_enabled=True,
|
|
|
|
|
config_id=AUTO_MODE_ID,
|
2026-06-13 12:45:43 +05:30
|
|
|
config_name="Auto",
|
2026-06-04 12:41:52 +02:00
|
|
|
is_auto_mode=True,
|
|
|
|
|
billing_tier="free",
|
|
|
|
|
is_premium=False,
|
|
|
|
|
anonymous_enabled=False,
|
|
|
|
|
quota_reserve_tokens=None,
|
2026-06-05 17:39:38 +02:00
|
|
|
# Auto fails over across the pool, so a non-vision deployment's 404
|
|
|
|
|
# is just an allowed_fails event rather than a hard block.
|
2026-06-04 12:41:52 +02:00
|
|
|
supports_image_input=True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
def from_yaml_config(cls, yaml_config: dict) -> "AgentConfig":
|
2026-06-05 17:39:38 +02:00
|
|
|
"""Build an AgentConfig from a YAML configuration dictionary.
|
2026-06-04 12:41:52 +02:00
|
|
|
|
2026-06-13 12:45:43 +05:30
|
|
|
Supports prompt fields such as system_instructions,
|
|
|
|
|
use_default_system_instructions, and citations_enabled.
|
2026-06-04 12:41:52 +02:00
|
|
|
"""
|
2026-06-05 17:39:38 +02:00
|
|
|
# Lazy import: keeps provider_capabilities (and litellm) out of init order.
|
2026-06-04 12:41:52 +02:00
|
|
|
from app.services.provider_capabilities import derive_supports_image_input
|
|
|
|
|
|
|
|
|
|
system_instructions = yaml_config.get("system_instructions", "")
|
|
|
|
|
|
2026-06-13 21:59:35 +05:30
|
|
|
provider = yaml_config.get("provider") or yaml_config.get(
|
|
|
|
|
"litellm_provider", ""
|
|
|
|
|
)
|
2026-06-04 12:41:52 +02:00
|
|
|
model_name = yaml_config.get("model_name", "")
|
|
|
|
|
custom_provider = yaml_config.get("custom_provider")
|
|
|
|
|
litellm_params = yaml_config.get("litellm_params") or {}
|
|
|
|
|
base_model = (
|
|
|
|
|
litellm_params.get("base_model")
|
|
|
|
|
if isinstance(litellm_params, dict)
|
|
|
|
|
else None
|
|
|
|
|
)
|
|
|
|
|
|
2026-06-05 17:39:38 +02:00
|
|
|
# Explicit YAML override wins; otherwise re-derive (the hot-reload file
|
|
|
|
|
# fallback reaches this method without the loader having populated it).
|
2026-06-04 12:41:52 +02:00
|
|
|
if "supports_image_input" in yaml_config:
|
|
|
|
|
supports_image_input = bool(yaml_config.get("supports_image_input"))
|
|
|
|
|
else:
|
|
|
|
|
supports_image_input = derive_supports_image_input(
|
2026-06-12 02:17:22 +05:30
|
|
|
provider=provider,
|
2026-06-04 12:41:52 +02:00
|
|
|
model_name=model_name,
|
|
|
|
|
base_model=base_model,
|
|
|
|
|
custom_provider=custom_provider,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return cls(
|
|
|
|
|
provider=provider,
|
|
|
|
|
model_name=model_name,
|
|
|
|
|
api_key=yaml_config.get("api_key", ""),
|
|
|
|
|
api_base=yaml_config.get("api_base"),
|
|
|
|
|
custom_provider=custom_provider,
|
|
|
|
|
litellm_params=yaml_config.get("litellm_params"),
|
|
|
|
|
system_instructions=system_instructions if system_instructions else None,
|
|
|
|
|
use_default_system_instructions=yaml_config.get(
|
|
|
|
|
"use_default_system_instructions", True
|
|
|
|
|
),
|
|
|
|
|
citations_enabled=yaml_config.get("citations_enabled", True),
|
|
|
|
|
config_id=yaml_config.get("id"),
|
|
|
|
|
config_name=yaml_config.get("name"),
|
|
|
|
|
is_auto_mode=False,
|
|
|
|
|
billing_tier=yaml_config.get("billing_tier", "free"),
|
|
|
|
|
is_premium=yaml_config.get("billing_tier", "free") == "premium",
|
|
|
|
|
anonymous_enabled=yaml_config.get("anonymous_enabled", False),
|
|
|
|
|
quota_reserve_tokens=yaml_config.get("quota_reserve_tokens"),
|
|
|
|
|
supports_image_input=supports_image_input,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_llm_config_from_yaml(llm_config_id: int = -1) -> dict | None:
|
2026-06-05 17:39:38 +02:00
|
|
|
"""Load a specific LLM config from global_llm_config.yaml."""
|
2026-06-04 12:41:52 +02:00
|
|
|
base_dir = Path(__file__).resolve().parent.parent.parent.parent
|
|
|
|
|
config_file = base_dir / "app" / "config" / "global_llm_config.yaml"
|
|
|
|
|
|
|
|
|
|
if not config_file.exists():
|
|
|
|
|
config_file = base_dir / "app" / "config" / "global_llm_config.example.yaml"
|
|
|
|
|
if not config_file.exists():
|
|
|
|
|
print("Error: No global_llm_config.yaml or example file found")
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
with open(config_file, encoding="utf-8") as f:
|
|
|
|
|
data = yaml.safe_load(f)
|
|
|
|
|
configs = data.get("global_llm_configs", [])
|
|
|
|
|
for cfg in configs:
|
|
|
|
|
if isinstance(cfg, dict) and cfg.get("id") == llm_config_id:
|
|
|
|
|
return cfg
|
|
|
|
|
|
|
|
|
|
print(f"Error: Global LLM config id {llm_config_id} not found")
|
|
|
|
|
return None
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f"Error loading config: {e}")
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_global_llm_config_by_id(llm_config_id: int) -> dict | None:
    """Look up a global LLM config by ID, preferring the in-memory list.

    ``app_config.GLOBAL_LLM_CONFIGS`` is searched first; it covers both static
    YAML and dynamically injected configs (e.g. OpenRouter integration models
    that only exist in memory). When nothing matches there, the YAML file is
    read directly.

    Args:
        llm_config_id: ``id`` of the wanted global config.

    Returns:
        The first matching config dict, or whatever
        ``load_llm_config_from_yaml`` returns when there is no in-memory match.
    """
    from app.config import config as app_config

    in_memory_match = next(
        (
            entry
            for entry in app_config.GLOBAL_LLM_CONFIGS
            if entry.get("id") == llm_config_id
        ),
        None,
    )
    if in_memory_match is not None:
        return in_memory_match

    # Fallback to YAML file read (covers hot-reload edge cases).
    return load_llm_config_from_yaml(llm_config_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_chat_litellm_from_config(llm_config: dict) -> ChatLiteLLM | None:
    """Create a ChatLiteLLM instance from a global LLM config dictionary.

    The LiteLLM model string is ``<prefix>/<model_name>``, where the prefix is
    ``custom_provider`` when set, otherwise ``provider``, otherwise
    ``litellm_provider``, otherwise ``"openai"``. Entries of
    ``litellm_params`` are applied last and therefore override the base
    keyword arguments built here.

    Args:
        llm_config: Global config entry; must contain ``model_name``.

    Returns:
        A ``SanitizedChatLiteLLM`` with a model profile attached (when
        available) and prompt caching applied.

    Raises:
        KeyError: If ``model_name`` is missing from ``llm_config``.
    """
    if llm_config.get("custom_provider"):
        model_string = f"{llm_config['custom_provider']}/{llm_config['model_name']}"
    else:
        # ``dict.get(key, default)`` only applies the default when the key is
        # absent, so chain with ``or`` to also cover a present-but-null/empty
        # ``litellm_provider`` (which would otherwise yield "None/<model>").
        provider = (
            llm_config.get("provider")
            or llm_config.get("litellm_provider")
            or "openai"
        )
        model_string = f"{provider}/{llm_config['model_name']}"

    litellm_kwargs = {
        "model": model_string,
        "api_key": llm_config.get("api_key"),
        "streaming": True,
    }
    if llm_config.get("api_base"):
        litellm_kwargs["api_base"] = llm_config["api_base"]
    if llm_config.get("litellm_params"):
        litellm_kwargs.update(llm_config["litellm_params"])

    llm = SanitizedChatLiteLLM(**litellm_kwargs)
    _attach_model_profile(llm, model_string)
    # agent_config=None: the YAML path lacks structured provider intent, so set
    # only the universal cache_control_injection_points.
    apply_litellm_prompt_caching(llm)
    return llm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_chat_litellm_from_agent_config(
    agent_config: AgentConfig,
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
    """Build a chat model from an already resolved, concrete AgentConfig.

    Auto-mode configs are rejected: an error is printed and ``None`` is
    returned, because Auto must be resolved to a concrete model first.

    Args:
        agent_config: Resolved configuration. ``custom_provider`` takes
            precedence over ``provider`` as the model-string prefix, and
            ``litellm_params`` entries override the base keyword arguments.

    Returns:
        A ``SanitizedChatLiteLLM`` with a model profile attached (when
        available) and build-time prompt caching applied, or ``None`` for an
        Auto-mode config.
    """
    if agent_config.is_auto_mode:
        print(
            "Error: Auto mode must be resolved to a concrete model before LLM creation"
        )
        return None

    prefix = agent_config.custom_provider or agent_config.provider
    model_string = f"{prefix}/{agent_config.model_name}"

    llm_options: dict[str, Any] = {
        "model": model_string,
        "api_key": agent_config.api_key,
        "streaming": True,
    }
    if agent_config.api_base:
        llm_options["api_base"] = agent_config.api_base
    # Applied last so explicit litellm_params win over the base options.
    llm_options.update(agent_config.litellm_params or {})

    llm = SanitizedChatLiteLLM(**llm_options)
    _attach_model_profile(llm, model_string)
    # Build-time caching only; the per-thread prompt_cache_key is layered on
    # later in create_surfsense_deep_agent once thread_id is known.
    apply_litellm_prompt_caching(llm, agent_config=agent_config)
    return llm
|