hotpatch(cloud): add llm load balancing

2026-04-27 01:36:30 +02:00 · 2026-01-29 15:28:31 -08:00 · 2026-01-29 15:28:31 -08:00 · 6fb656fd8f
commit 6fb656fd8f
parent 5d5f9d3bfb
21 changed files with 1324 additions and 103 deletions
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@ -48,6 +48,63 @@ def load_global_llm_configs():
        return []


+def load_router_settings():
+    """
+    Load the Auto-mode router settings from the global LLM config YAML file.
+
+    Reads the ``router_settings`` mapping from
+    ``app/config/global_llm_config.yaml`` under ``BASE_DIR`` and overlays it
+    on the built-in defaults, so keys missing from the file keep their
+    default values and extra keys in the file are passed through.
+
+    The defaults are returned as-is when the file does not exist, is empty,
+    or has no (or an empty) ``router_settings`` section. If the file cannot
+    be read or parsed, or its content is not a mapping, a warning is printed
+    and the defaults are returned.
+
+    Returns:
+        dict: Router settings dictionary
+    """
+    # Built-in defaults; values from the YAML file override these per key
+    default_settings = {
+        "routing_strategy": "usage-based-routing",
+        "num_retries": 3,
+        "allowed_fails": 3,
+        "cooldown_time": 60,
+    }
+
+    global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml"
+
+    if not global_config_file.exists():
+        return default_settings
+
+    try:
+        with open(global_config_file, encoding="utf-8") as f:
+            # safe_load returns None for an empty file; treat that as "no settings"
+            data = yaml.safe_load(f) or {}
+        # A bare "router_settings:" key (all children commented out) parses as
+        # None; treat it as an empty section instead of a load failure
+        settings = data.get("router_settings") or {}
+        # Merge with defaults
+        return {**default_settings, **settings}
+    except Exception as e:
+        print(f"Warning: Failed to load router settings: {e}")
+        return default_settings
+
+
+def initialize_llm_router():
+    """
+    Set up the LLM Router service that backs Auto mode.
+
+    Intended to be called while the application is starting. When no global
+    LLM configs are loaded it only prints an info message and returns. Any
+    error raised during setup is reported as a warning and not re-raised.
+    """
+    llm_configs = load_global_llm_configs()
+    settings = load_router_settings()
+
+    # Without any global configs there is nothing for the router to balance
+    if not llm_configs:
+        print("Info: No global LLM configs found, Auto mode will not be available")
+        return
+
+    try:
+        # Imported inside the function rather than at module level
+        # (presumably to avoid a circular import - TODO confirm)
+        from app.services.llm_router_service import LLMRouterService
+
+        LLMRouterService.initialize(llm_configs, settings)
+
+        model_count = len(llm_configs)
+        strategy = settings.get("routing_strategy", "usage-based-routing")
+        print(
+            f"Info: LLM Router initialized with {model_count} models "
+            f"(strategy: {strategy})"
+        )
+    except Exception as err:
+        print(f"Warning: Failed to initialize LLM Router: {err}")
+
+
 class Config:
    # Check if ffmpeg is installed
    if not is_ffmpeg_installed():
@ -156,6 +213,9 @@ class Config:
    # These can be used as default options for users
    GLOBAL_LLM_CONFIGS = load_global_llm_configs()

+    # Router settings for Auto mode (LiteLLM Router load balancing)
+    ROUTER_SETTINGS = load_router_settings()
+
    # Chonkie Configuration | Edit this to your needs
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
    # Azure OpenAI credentials from environment variables
--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@ -10,10 +10,39 @@
 # These configurations will be available to all users as a convenient option
 # Users can choose to use these global configs or add their own
 #
+# AUTO MODE (Recommended):
+# - Auto mode (ID: 0) uses LiteLLM Router to automatically load balance across all global configs
+# - This helps avoid rate limits by distributing requests across multiple providers
+# - New users are automatically assigned Auto mode by default
+# - Configure router_settings below to customize the load balancing behavior
+#
 # Structure matches NewLLMConfig:
 # - LLM model configuration (provider, model_name, api_key, etc.)
 # - Prompt configuration (system_instructions, citations_enabled)

+# Router Settings for Auto Mode
+# These settings control how the LiteLLM Router distributes requests across models
+router_settings:
+  # Routing strategy options:
+  # - "usage-based-routing": Routes to the deployment with the lowest tokens-per-minute (TPM) usage in the current minute (recommended for rate limits)
+  # - "simple-shuffle": Random distribution with optional RPM/TPM weighting
+  # - "least-busy": Routes to least busy deployment
+  # - "latency-based-routing": Routes based on response latency
+  routing_strategy: "usage-based-routing"
+  
+  # Number of retries before failing
+  num_retries: 3
+  
+  # Number of failures allowed before cooling down a deployment
+  allowed_fails: 3
+  
+  # Cooldown time in seconds after allowed_fails is exceeded
+  cooldown_time: 60
+  
+  # Fallback models (optional) - when primary fails, try these
+  # Format: [{"primary_model": ["fallback1", "fallback2"]}]
+  # fallbacks: []
+
 global_llm_configs:
  # Example: OpenAI GPT-4 Turbo with citations enabled
  - id: -1
@ -23,6 +52,9 @@ global_llm_configs:
    model_name: "gpt-4-turbo-preview"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
+    # Rate limits for load balancing (requests/tokens per minute)
+    rpm: 500  # Requests per minute
+    tpm: 100000  # Tokens per minute
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
@ -39,6 +71,8 @@ global_llm_configs:
    model_name: "claude-3-opus-20240229"
    api_key: "sk-ant-your-anthropic-api-key-here"
    api_base: ""
+    rpm: 1000
+    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
@ -54,6 +88,8 @@ global_llm_configs:
    model_name: "gpt-3.5-turbo"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
+    rpm: 3500  # GPT-3.5 has higher rate limits
+    tpm: 200000
    litellm_params:
      temperature: 0.5
      max_tokens: 2000
@ -69,6 +105,8 @@ global_llm_configs:
    model_name: "deepseek-chat"
    api_key: "your-deepseek-api-key-here"
    api_base: "https://api.deepseek.com/v1"
+    rpm: 60
+    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
@ -92,6 +130,8 @@ global_llm_configs:
    model_name: "llama3-70b-8192"
    api_key: "your-groq-api-key-here"
    api_base: ""
+    rpm: 30  # Groq has lower rate limits on free tier
+    tpm: 14400
    litellm_params:
      temperature: 0.7
      max_tokens: 8000
@ -100,6 +140,7 @@ global_llm_configs:
    citations_enabled: true

 # Notes:
+# - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing
 # - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB)
 # - IDs should be unique and sequential (e.g., -1, -2, -3, etc.)
 # - The 'api_key' field will not be exposed to users via API
@ -107,3 +148,5 @@ global_llm_configs:
 # - use_default_system_instructions: true = use SURFSENSE_SYSTEM_INSTRUCTIONS when system_instructions is empty
 # - citations_enabled: true = include citation instructions, false = include anti-citation instructions
 # - All standard LiteLLM providers are supported
+# - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
+#   These help the router distribute load evenly and avoid rate limit errors