cloud: added openrouter integration with global configs

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-15 23:46:29 -07:00
parent ff4e0f9b62
commit 4a51ccdc2c
26 changed files with 911 additions and 178 deletions

View file

@ -187,24 +187,82 @@ def load_image_gen_router_settings():
return default_settings
def load_openrouter_integration_settings() -> dict | None:
"""
Load OpenRouter integration settings from the YAML config.
Returns:
dict with settings if present and enabled, None otherwise
"""
global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml"
if not global_config_file.exists():
return None
try:
with open(global_config_file, encoding="utf-8") as f:
data = yaml.safe_load(f)
settings = data.get("openrouter_integration")
if settings and settings.get("enabled"):
return settings
return None
except Exception as e:
print(f"Warning: Failed to load OpenRouter integration settings: {e}")
return None
def initialize_openrouter_integration():
"""
If enabled, fetch all OpenRouter models and append them to
config.GLOBAL_LLM_CONFIGS as dynamic premium entries.
Should be called BEFORE initialize_llm_router() so the router
correctly excludes premium models from Auto mode.
"""
settings = load_openrouter_integration_settings()
if not settings:
return
try:
from app.services.openrouter_integration_service import (
OpenRouterIntegrationService,
)
service = OpenRouterIntegrationService.get_instance()
new_configs = service.initialize(settings)
if new_configs:
config.GLOBAL_LLM_CONFIGS.extend(new_configs)
print(
f"Info: OpenRouter integration added {len(new_configs)} models "
f"(billing_tier={settings.get('billing_tier', 'premium')})"
)
else:
print("Info: OpenRouter integration enabled but no models fetched")
except Exception as e:
print(f"Warning: Failed to initialize OpenRouter integration: {e}")
def initialize_llm_router():
"""
Initialize the LLM Router service for Auto mode.
This should be called during application startup.
This should be called during application startup, AFTER
initialize_openrouter_integration() so dynamic models are included.
Uses config.GLOBAL_LLM_CONFIGS (in-memory) which includes both
static YAML configs and dynamic OpenRouter models.
"""
global_configs = load_global_llm_configs()
all_configs = config.GLOBAL_LLM_CONFIGS
router_settings = load_router_settings()
if not global_configs:
if not all_configs:
print("Info: No global LLM configs found, Auto mode will not be available")
return
try:
from app.services.llm_router_service import LLMRouterService
LLMRouterService.initialize(global_configs, router_settings)
LLMRouterService.initialize(all_configs, router_settings)
print(
f"Info: LLM Router initialized with {len(global_configs)} models "
f"Info: LLM Router initialized with {len(all_configs)} models "
f"(strategy: {router_settings.get('routing_strategy', 'usage-based-routing')})"
)
except Exception as e:
@ -326,7 +384,7 @@ class Config:
)
# Premium token quota settings
PREMIUM_TOKEN_LIMIT = int(os.getenv("PREMIUM_TOKEN_LIMIT", "5000000"))
PREMIUM_TOKEN_LIMIT = int(os.getenv("PREMIUM_TOKEN_LIMIT", "3000000"))
STRIPE_PREMIUM_TOKEN_PRICE_ID = os.getenv("STRIPE_PREMIUM_TOKEN_PRICE_ID")
STRIPE_TOKENS_PER_UNIT = int(os.getenv("STRIPE_TOKENS_PER_UNIT", "1000000"))
STRIPE_TOKEN_BUYING_ENABLED = (
@ -335,9 +393,9 @@ class Config:
# Anonymous / no-login mode settings
NOLOGIN_MODE_ENABLED = os.getenv("NOLOGIN_MODE_ENABLED", "FALSE").upper() == "TRUE"
ANON_TOKEN_LIMIT = int(os.getenv("ANON_TOKEN_LIMIT", "1000000"))
ANON_TOKEN_LIMIT = int(os.getenv("ANON_TOKEN_LIMIT", "500000"))
ANON_TOKEN_WARNING_THRESHOLD = int(
os.getenv("ANON_TOKEN_WARNING_THRESHOLD", "800000")
os.getenv("ANON_TOKEN_WARNING_THRESHOLD", "400000")
)
ANON_TOKEN_QUOTA_TTL_DAYS = int(os.getenv("ANON_TOKEN_QUOTA_TTL_DAYS", "30"))
ANON_MAX_UPLOAD_SIZE_MB = int(os.getenv("ANON_MAX_UPLOAD_SIZE_MB", "5"))
@ -450,6 +508,9 @@ class Config:
# Router settings for Vision LLM Auto mode
VISION_LLM_ROUTER_SETTINGS = load_vision_llm_router_settings()
# OpenRouter Integration settings (optional)
OPENROUTER_INTEGRATION_SETTINGS = load_openrouter_integration_settings()
# Chonkie Configuration | Edit this to your needs
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
# Azure OpenAI credentials from environment variables

View file

@ -1,5 +1,5 @@
# Global LLM Configuration
#
#
# SETUP INSTRUCTIONS:
# 1. For production: Copy this file to global_llm_config.yaml and add your real API keys
# 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist
@ -29,16 +29,16 @@ router_settings:
# - "least-busy": Routes to least busy deployment
# - "latency-based-routing": Routes based on response latency
routing_strategy: "usage-based-routing"
# Number of retries before failing
num_retries: 3
# Number of failures allowed before cooling down a deployment
allowed_fails: 3
# Cooldown time in seconds after allowed_fails is exceeded
cooldown_time: 60
# Fallback models (optional) - when primary fails, try these
# Format: [{"primary_model": ["fallback1", "fallback2"]}]
# fallbacks: []
@ -58,13 +58,13 @@ global_llm_configs:
api_key: "sk-your-openai-api-key-here"
api_base: ""
# Rate limits for load balancing (requests/tokens per minute)
rpm: 500 # Requests per minute
tpm: 100000 # Tokens per minute
rpm: 500 # Requests per minute
tpm: 100000 # Tokens per minute
litellm_params:
temperature: 0.7
max_tokens: 4000
# Prompt Configuration
system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
use_default_system_instructions: true
citations_enabled: true
@ -103,14 +103,14 @@ global_llm_configs:
model_name: "gpt-3.5-turbo"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 3500 # GPT-3.5 has higher rate limits
rpm: 3500 # GPT-3.5 has higher rate limits
tpm: 200000
litellm_params:
temperature: 0.5
max_tokens: 2000
system_instructions: ""
use_default_system_instructions: true
citations_enabled: false # Disabled for faster responses
citations_enabled: false # Disabled for faster responses
# Example: Chinese LLM - DeepSeek with custom instructions
- id: -4
@ -134,9 +134,9 @@ global_llm_configs:
system_instructions: |
<system_instruction>
You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
Today's date (UTC): {resolved_today}
IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language.
</system_instruction>
use_default_system_instructions: false
@ -158,7 +158,7 @@ global_llm_configs:
model_name: "azure/gpt-4o-deployment"
api_key: "your-azure-api-key-here"
api_base: "https://your-resource.openai.azure.com"
api_version: "2024-02-15-preview" # Azure API version
api_version: "2024-02-15-preview" # Azure API version
rpm: 1000
tpm: 150000
litellm_params:
@ -191,7 +191,7 @@ global_llm_configs:
litellm_params:
temperature: 0.7
max_tokens: 4000
base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
@ -209,7 +209,7 @@ global_llm_configs:
model_name: "llama3-70b-8192"
api_key: "your-groq-api-key-here"
api_base: ""
rpm: 30 # Groq has lower rate limits on free tier
rpm: 30 # Groq has lower rate limits on free tier
tpm: 14400
litellm_params:
temperature: 0.7
@ -234,12 +234,48 @@ global_llm_configs:
rpm: 60
tpm: 100000
litellm_params:
temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
max_tokens: 4000
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
# =============================================================================
# OpenRouter Integration
# =============================================================================
# When enabled, dynamically fetches ALL available models from the OpenRouter API
# and injects them as global configs. This gives premium users access to any model
# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota.
# Models are fetched at startup and refreshed periodically in the background.
# All calls go through LiteLLM with the openrouter/ prefix.
openrouter_integration:
enabled: false
api_key: "sk-or-your-openrouter-api-key"
# billing_tier: "premium" or "free". Controls whether users need premium tokens.
billing_tier: "premium"
# anonymous_enabled: set true to also show OpenRouter models to no-login users
anonymous_enabled: false
seo_enabled: false
# quota_reserve_tokens: tokens reserved per call for quota enforcement
quota_reserve_tokens: 4000
# id_offset: starting negative ID for dynamically generated configs.
# Must not overlap with your static global_llm_configs IDs above.
id_offset: -10000
# refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only)
refresh_interval_hours: 24
# rpm/tpm: Applied uniformly to all OpenRouter models for LiteLLM Router load balancing.
# OpenRouter doesn't expose per-model rate limits via API; actual throttling is handled
# upstream by OpenRouter itself (your account limits are at https://openrouter.ai/settings/limits).
# These values only matter if you set billing_tier to "free" (adding them to Auto mode).
# For premium-only models they are cosmetic. Set conservatively or match your account tier.
rpm: 200
tpm: 1000000
litellm_params:
max_tokens: 16384
system_instructions: ""
use_default_system_instructions: true
citations_enabled: true
# =============================================================================
# Image Generation Configuration
# =============================================================================
@ -265,7 +301,7 @@ global_image_generation_configs:
model_name: "dall-e-3"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens)
rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens)
litellm_params: {}
# Example: OpenAI GPT Image 1
@ -394,7 +430,7 @@ global_vision_llm_configs:
#
# IMAGE GENERATION NOTES:
# - Image generation configs use the same ID scheme as LLM configs (negative for global)
# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure),
# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure),
# bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter)
# - The router uses litellm.aimage_generation() for async image generation
# - Only RPM (requests per minute) is relevant for image generation rate limiting.