mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 01:36:30 +02:00
hotpatch(cloud): add llm load balancing
This commit is contained in:
parent
5d5f9d3bfb
commit
6fb656fd8f
21 changed files with 1324 additions and 103 deletions
|
|
@ -48,6 +48,63 @@ def load_global_llm_configs():
|
|||
return []
|
||||
|
||||
|
||||
def load_router_settings():
    """
    Load router settings for Auto mode from YAML file.

    Falls back to default settings if not found.

    Returns:
        dict: Router settings dictionary
    """
    # Default router settings
    default_settings = {
        "routing_strategy": "usage-based-routing",
        "num_retries": 3,
        "allowed_fails": 3,
        "cooldown_time": 60,
    }

    # Try main config file first
    global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml"

    if not global_config_file.exists():
        return default_settings

    try:
        with open(global_config_file, encoding="utf-8") as f:
            # safe_load returns None for an empty document; normalize to {}
            # so the lookups below don't raise and trip the except branch.
            data = yaml.safe_load(f) or {}
        # A bare "router_settings:" key parses as None; normalize to {}.
        settings = data.get("router_settings") or {}
        # Merge with defaults (explicit file settings take precedence)
        return {**default_settings, **settings}
    except Exception as e:
        print(f"Warning: Failed to load router settings: {e}")
        return default_settings
|
||||
|
||||
|
||||
def initialize_llm_router():
    """
    Initialize the LLM Router service for Auto mode.

    This should be called during application startup.
    """
    configs = load_global_llm_configs()
    settings = load_router_settings()

    # Without at least one global config there is nothing to route across.
    if not configs:
        print("Info: No global LLM configs found, Auto mode will not be available")
        return

    try:
        # NOTE(review): imported locally rather than at module top level —
        # presumably to avoid an import cycle at startup; confirm.
        from app.services.llm_router_service import LLMRouterService

        LLMRouterService.initialize(configs, settings)
        strategy = settings.get("routing_strategy", "usage-based-routing")
        print(
            f"Info: LLM Router initialized with {len(configs)} models "
            f"(strategy: {strategy})"
        )
    except Exception as e:
        print(f"Warning: Failed to initialize LLM Router: {e}")
|
||||
|
||||
|
||||
class Config:
|
||||
# Check if ffmpeg is installed
|
||||
if not is_ffmpeg_installed():
|
||||
|
|
@ -156,6 +213,9 @@ class Config:
|
|||
# These can be used as default options for users
|
||||
GLOBAL_LLM_CONFIGS = load_global_llm_configs()
|
||||
|
||||
# Router settings for Auto mode (LiteLLM Router load balancing)
|
||||
ROUTER_SETTINGS = load_router_settings()
|
||||
|
||||
# Chonkie Configuration | Edit this to your needs
|
||||
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
|
||||
# Azure OpenAI credentials from environment variables
|
||||
|
|
|
|||
|
|
@ -10,10 +10,39 @@
|
|||
# These configurations will be available to all users as a convenient option
|
||||
# Users can choose to use these global configs or add their own
|
||||
#
|
||||
# AUTO MODE (Recommended):
|
||||
# - Auto mode (ID: 0) uses LiteLLM Router to automatically load balance across all global configs
|
||||
# - This helps avoid rate limits by distributing requests across multiple providers
|
||||
# - New users are automatically assigned Auto mode by default
|
||||
# - Configure router_settings below to customize the load balancing behavior
|
||||
#
|
||||
# Structure matches NewLLMConfig:
|
||||
# - LLM model configuration (provider, model_name, api_key, etc.)
|
||||
# - Prompt configuration (system_instructions, citations_enabled)
|
||||
|
||||
# Router Settings for Auto Mode
|
||||
# These settings control how the LiteLLM Router distributes requests across models
|
||||
router_settings:
|
||||
# Routing strategy options:
|
||||
# - "usage-based-routing": Routes to deployment with lowest current usage (recommended for rate limits)
|
||||
# - "simple-shuffle": Random distribution with optional RPM/TPM weighting
|
||||
# - "least-busy": Routes to least busy deployment
|
||||
# - "latency-based-routing": Routes based on response latency
|
||||
routing_strategy: "usage-based-routing"
|
||||
|
||||
# Number of retries before failing
|
||||
num_retries: 3
|
||||
|
||||
# Number of failures allowed before cooling down a deployment
|
||||
allowed_fails: 3
|
||||
|
||||
# Cooldown time in seconds after allowed_fails is exceeded
|
||||
cooldown_time: 60
|
||||
|
||||
# Fallback models (optional) - when primary fails, try these
|
||||
# Format: [{"primary_model": ["fallback1", "fallback2"]}]
|
||||
# fallbacks: []
|
||||
|
||||
global_llm_configs:
|
||||
# Example: OpenAI GPT-4 Turbo with citations enabled
|
||||
- id: -1
|
||||
|
|
@ -23,6 +52,9 @@ global_llm_configs:
|
|||
model_name: "gpt-4-turbo-preview"
|
||||
api_key: "sk-your-openai-api-key-here"
|
||||
api_base: ""
|
||||
# Rate limits for load balancing (requests/tokens per minute)
|
||||
rpm: 500 # Requests per minute
|
||||
tpm: 100000 # Tokens per minute
|
||||
litellm_params:
|
||||
temperature: 0.7
|
||||
max_tokens: 4000
|
||||
|
|
@ -39,6 +71,8 @@ global_llm_configs:
|
|||
model_name: "claude-3-opus-20240229"
|
||||
api_key: "sk-ant-your-anthropic-api-key-here"
|
||||
api_base: ""
|
||||
rpm: 1000
|
||||
tpm: 100000
|
||||
litellm_params:
|
||||
temperature: 0.7
|
||||
max_tokens: 4000
|
||||
|
|
@ -54,6 +88,8 @@ global_llm_configs:
|
|||
model_name: "gpt-3.5-turbo"
|
||||
api_key: "sk-your-openai-api-key-here"
|
||||
api_base: ""
|
||||
rpm: 3500 # GPT-3.5 has higher rate limits
|
||||
tpm: 200000
|
||||
litellm_params:
|
||||
temperature: 0.5
|
||||
max_tokens: 2000
|
||||
|
|
@ -69,6 +105,8 @@ global_llm_configs:
|
|||
model_name: "deepseek-chat"
|
||||
api_key: "your-deepseek-api-key-here"
|
||||
api_base: "https://api.deepseek.com/v1"
|
||||
rpm: 60
|
||||
tpm: 100000
|
||||
litellm_params:
|
||||
temperature: 0.7
|
||||
max_tokens: 4000
|
||||
|
|
@ -92,6 +130,8 @@ global_llm_configs:
|
|||
model_name: "llama3-70b-8192"
|
||||
api_key: "your-groq-api-key-here"
|
||||
api_base: ""
|
||||
rpm: 30 # Groq has lower rate limits on free tier
|
||||
tpm: 14400
|
||||
litellm_params:
|
||||
temperature: 0.7
|
||||
max_tokens: 8000
|
||||
|
|
@ -100,6 +140,7 @@ global_llm_configs:
|
|||
citations_enabled: true
|
||||
|
||||
# Notes:
|
||||
# - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing
|
||||
# - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB)
|
||||
# - IDs should be unique and sequential (e.g., -1, -2, -3, etc.)
|
||||
# - The 'api_key' field will not be exposed to users via API
|
||||
|
|
@ -107,3 +148,5 @@ global_llm_configs:
|
|||
# - use_default_system_instructions: true = use SURFSENSE_SYSTEM_INSTRUCTIONS when system_instructions is empty
|
||||
# - citations_enabled: true = include citation instructions, false = include anti-citation instructions
|
||||
# - All standard LiteLLM providers are supported
|
||||
# - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
|
||||
# These help the router distribute load evenly and avoid rate limit errors
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue