cloud: added openrouter integration with global configs

2026-04-29 10:56:24 +02:00 · 2026-04-15 23:46:29 -07:00 · 2026-04-15 23:46:29 -07:00 · 4a51ccdc2c
commit 4a51ccdc2c
parent ff4e0f9b62
26 changed files with 911 additions and 178 deletions
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@ -187,24 +187,82 @@ def load_image_gen_router_settings():
        return default_settings


+def load_openrouter_integration_settings() -> dict | None:
+    """
+    Load OpenRouter integration settings from the YAML config.
+
+    Reads the ``openrouter_integration`` section of
+    ``app/config/global_llm_config.yaml`` (resolved relative to BASE_DIR).
+
+    Returns:
+        dict with settings if present and enabled, None otherwise
+        (missing file, empty or non-mapping YAML document, section absent,
+        section disabled, or a read/parse failure).
+    """
+    global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml"
+
+    if not global_config_file.exists():
+        return None
+
+    # Keep the try body limited to the steps that can actually fail (I/O and
+    # YAML parsing) so shape problems below are not reported as load failures.
+    try:
+        with open(global_config_file, encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+    except Exception as e:
+        print(f"Warning: Failed to load OpenRouter integration settings: {e}")
+        return None
+
+    # yaml.safe_load returns None for an empty file and can return a list or
+    # scalar for a non-mapping document; neither can carry the section.
+    if not isinstance(data, dict):
+        return None
+
+    settings = data.get("openrouter_integration")
+    if not settings:
+        return None
+
+    if not isinstance(settings, dict):
+        print(
+            "Warning: Failed to load OpenRouter integration settings: "
+            "'openrouter_integration' must be a mapping"
+        )
+        return None
+
+    # Only an explicitly enabled integration is returned to callers.
+    if settings.get("enabled"):
+        return settings
+    return None
+
+
+def initialize_openrouter_integration():
+    """
+    Append dynamically fetched OpenRouter models to config.GLOBAL_LLM_CONFIGS.
+
+    Does nothing when load_openrouter_integration_settings() returns no
+    settings (integration absent or disabled). Otherwise the settings are
+    handed to OpenRouterIntegrationService and whatever configs it returns
+    are appended to the in-memory global config list.
+
+    Should be called BEFORE initialize_llm_router() so the router
+    correctly excludes premium models from Auto mode.
+
+    Any exception raised along the way is reported as a warning and
+    swallowed, so a failing integration does not propagate to the caller.
+    """
+    integration_settings = load_openrouter_integration_settings()
+    if not integration_settings:
+        return
+
+    try:
+        # Imported lazily, inside the try, so an import failure is reported
+        # as a warning like any other integration error.
+        from app.services.openrouter_integration_service import (
+            OpenRouterIntegrationService,
+        )
+
+        dynamic_configs = OpenRouterIntegrationService.get_instance().initialize(
+            integration_settings
+        )
+
+        if not dynamic_configs:
+            print("Info: OpenRouter integration enabled but no models fetched")
+            return
+
+        config.GLOBAL_LLM_CONFIGS.extend(dynamic_configs)
+        billing_tier = integration_settings.get('billing_tier', 'premium')
+        print(
+            f"Info: OpenRouter integration added {len(dynamic_configs)} models "
+            f"(billing_tier={billing_tier})"
+        )
+    except Exception as e:
+        print(f"Warning: Failed to initialize OpenRouter integration: {e}")
+
+
 def initialize_llm_router():
    """
    Initialize the LLM Router service for Auto mode.
-    This should be called during application startup.
+    This should be called during application startup, AFTER
+    initialize_openrouter_integration() so dynamic models are included.
+    Uses config.GLOBAL_LLM_CONFIGS (in-memory) which includes both
+    static YAML configs and dynamic OpenRouter models.
    """
-    global_configs = load_global_llm_configs()
+    all_configs = config.GLOBAL_LLM_CONFIGS
    router_settings = load_router_settings()

-    if not global_configs:
+    if not all_configs:
        print("Info: No global LLM configs found, Auto mode will not be available")
        return

    try:
        from app.services.llm_router_service import LLMRouterService

-        LLMRouterService.initialize(global_configs, router_settings)
+        LLMRouterService.initialize(all_configs, router_settings)
        print(
-            f"Info: LLM Router initialized with {len(global_configs)} models "
+            f"Info: LLM Router initialized with {len(all_configs)} models "
            f"(strategy: {router_settings.get('routing_strategy', 'usage-based-routing')})"
        )
    except Exception as e:
@ -326,7 +384,7 @@ class Config:
    )

    # Premium token quota settings
-    PREMIUM_TOKEN_LIMIT = int(os.getenv("PREMIUM_TOKEN_LIMIT", "5000000"))
+    PREMIUM_TOKEN_LIMIT = int(os.getenv("PREMIUM_TOKEN_LIMIT", "3000000"))
    STRIPE_PREMIUM_TOKEN_PRICE_ID = os.getenv("STRIPE_PREMIUM_TOKEN_PRICE_ID")
    STRIPE_TOKENS_PER_UNIT = int(os.getenv("STRIPE_TOKENS_PER_UNIT", "1000000"))
    STRIPE_TOKEN_BUYING_ENABLED = (
@ -335,9 +393,9 @@ class Config:

    # Anonymous / no-login mode settings
    NOLOGIN_MODE_ENABLED = os.getenv("NOLOGIN_MODE_ENABLED", "FALSE").upper() == "TRUE"
-    ANON_TOKEN_LIMIT = int(os.getenv("ANON_TOKEN_LIMIT", "1000000"))
+    ANON_TOKEN_LIMIT = int(os.getenv("ANON_TOKEN_LIMIT", "500000"))
    ANON_TOKEN_WARNING_THRESHOLD = int(
-        os.getenv("ANON_TOKEN_WARNING_THRESHOLD", "800000")
+        os.getenv("ANON_TOKEN_WARNING_THRESHOLD", "400000")
    )
    ANON_TOKEN_QUOTA_TTL_DAYS = int(os.getenv("ANON_TOKEN_QUOTA_TTL_DAYS", "30"))
    ANON_MAX_UPLOAD_SIZE_MB = int(os.getenv("ANON_MAX_UPLOAD_SIZE_MB", "5"))
@ -450,6 +508,9 @@ class Config:
    # Router settings for Vision LLM Auto mode
    VISION_LLM_ROUTER_SETTINGS = load_vision_llm_router_settings()

+    # OpenRouter Integration settings (optional)
+    OPENROUTER_INTEGRATION_SETTINGS = load_openrouter_integration_settings()
+
    # Chonkie Configuration | Edit this to your needs
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
    # Azure OpenAI credentials from environment variables
--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@ -1,5 +1,5 @@
 # Global LLM Configuration
-# 
+#
 # SETUP INSTRUCTIONS:
 # 1. For production: Copy this file to global_llm_config.yaml and add your real API keys
 # 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist
@ -29,16 +29,16 @@ router_settings:
  # - "least-busy": Routes to least busy deployment
  # - "latency-based-routing": Routes based on response latency
  routing_strategy: "usage-based-routing"
-  
+
  # Number of retries before failing
  num_retries: 3
-  
+
  # Number of failures allowed before cooling down a deployment
  allowed_fails: 3
-  
+
  # Cooldown time in seconds after allowed_fails is exceeded
  cooldown_time: 60
-  
+
  # Fallback models (optional) - when primary fails, try these
  # Format: [{"primary_model": ["fallback1", "fallback2"]}]
  # fallbacks: []
@ -58,13 +58,13 @@ global_llm_configs:
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    # Rate limits for load balancing (requests/tokens per minute)
-    rpm: 500  # Requests per minute
-    tpm: 100000  # Tokens per minute
+    rpm: 500 # Requests per minute
+    tpm: 100000 # Tokens per minute
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
    # Prompt Configuration
-    system_instructions: ""  # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
+    system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
    use_default_system_instructions: true
    citations_enabled: true

@ -103,14 +103,14 @@ global_llm_configs:
    model_name: "gpt-3.5-turbo"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
-    rpm: 3500  # GPT-3.5 has higher rate limits
+    rpm: 3500 # GPT-3.5 has higher rate limits
    tpm: 200000
    litellm_params:
      temperature: 0.5
      max_tokens: 2000
    system_instructions: ""
    use_default_system_instructions: true
-    citations_enabled: false  # Disabled for faster responses
+    citations_enabled: false # Disabled for faster responses

  # Example: Chinese LLM - DeepSeek with custom instructions
  - id: -4
@ -134,9 +134,9 @@ global_llm_configs:
    system_instructions: |
      <system_instruction>
      You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
-      
+
      Today's date (UTC): {resolved_today}
-      
+
      IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language.
      </system_instruction>
    use_default_system_instructions: false
@ -158,7 +158,7 @@ global_llm_configs:
    model_name: "azure/gpt-4o-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
-    api_version: "2024-02-15-preview"  # Azure API version
+    api_version: "2024-02-15-preview" # Azure API version
    rpm: 1000
    tpm: 150000
    litellm_params:
@ -191,7 +191,7 @@ global_llm_configs:
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
-      base_model: "gpt-4-turbo"  # Maps to gpt-4-turbo-preview
+      base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
@ -209,7 +209,7 @@ global_llm_configs:
    model_name: "llama3-70b-8192"
    api_key: "your-groq-api-key-here"
    api_base: ""
-    rpm: 30  # Groq has lower rate limits on free tier
+    rpm: 30 # Groq has lower rate limits on free tier
    tpm: 14400
    litellm_params:
      temperature: 0.7
@ -234,12 +234,48 @@ global_llm_configs:
    rpm: 60
    tpm: 100000
    litellm_params:
-      temperature: 1.0  # MiniMax requires temperature in (0.0, 1.0], cannot be 0
+      temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
      max_tokens: 4000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

+# =============================================================================
+# OpenRouter Integration
+# =============================================================================
+# When enabled, dynamically fetches ALL available models from the OpenRouter API
+# and injects them as global configs. This gives premium users access to any model
+# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota.
+# Models are fetched at startup and refreshed periodically in the background.
+# All calls go through LiteLLM with the openrouter/ prefix.
+openrouter_integration:
+  enabled: false
+  api_key: "sk-or-your-openrouter-api-key"
+  # billing_tier: "premium" or "free". Controls whether users need premium tokens.
+  billing_tier: "premium"
+  # anonymous_enabled: set true to also show OpenRouter models to no-login users
+  anonymous_enabled: false
+  seo_enabled: false
+  # quota_reserve_tokens: tokens reserved per call for quota enforcement
+  quota_reserve_tokens: 4000
+  # id_offset: starting negative ID for dynamically generated configs.
+  # Must not overlap with your static global_llm_configs IDs above.
+  id_offset: -10000
+  # refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only)
+  refresh_interval_hours: 24
+  # rpm/tpm: Applied uniformly to all OpenRouter models for LiteLLM Router load balancing.
+  # OpenRouter doesn't expose per-model rate limits via API; actual throttling is handled
+  # upstream by OpenRouter itself (your account limits are at https://openrouter.ai/settings/limits).
+  # These values only matter if you set billing_tier to "free", which adds the OpenRouter models to Auto mode.
+  # For premium-only models they are cosmetic. Set conservatively or match your account tier.
+  rpm: 200
+  tpm: 1000000
+  litellm_params:
+    max_tokens: 16384
+  system_instructions: ""
+  use_default_system_instructions: true
+  citations_enabled: true
+
 # =============================================================================
 # Image Generation Configuration
 # =============================================================================
@ -265,7 +301,7 @@ global_image_generation_configs:
    model_name: "dall-e-3"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
-    rpm: 50  # Requests per minute (image gen is rate-limited by RPM, not tokens)
+    rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens)
    litellm_params: {}

  # Example: OpenAI GPT Image 1
@ -394,7 +430,7 @@ global_vision_llm_configs:
 #
 # IMAGE GENERATION NOTES:
 # - Image generation configs use the same ID scheme as LLM configs (negative for global)
-# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure), 
+# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure),
 #   bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter)
 # - The router uses litellm.aimage_generation() for async image generation
 # - Only RPM (requests per minute) is relevant for image generation rate limiting.