# Global LLM Configuration
#
# SETUP INSTRUCTIONS:
# 1. For production: Copy this file to global_llm_config.yaml and add your real API keys
# 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist
#
# NOTE: The example API keys below are placeholders and won't work.
# Replace them with your actual API keys to enable global configurations.
#
# These configurations will be available to all users as a convenient option.
# Users can choose to use these global configs or add their own.
#
# AUTO MODE (Recommended):
# - Auto mode (ID: 0) uses LiteLLM Router to automatically load balance across all global configs
# - This helps avoid rate limits by distributing requests across multiple providers
# - New users are automatically assigned Auto mode by default
# - Configure router_settings below to customize the load balancing behavior
#
# Structure matches NewLLMConfig:
# - Model configuration (provider, model_name, api_key, etc.)
# - Prompt configuration (system_instructions, citations_enabled)
#
# COST-BASED PREMIUM CREDITS:
# Each premium config bills the user's USD-credit balance based on the
# actual provider cost reported by LiteLLM. For models LiteLLM already
# knows (most OpenAI/Anthropic/etc. names) you don't need to do anything.
# For custom Azure deployment names (e.g. an in-house "gpt-5.4" deployment)
# or any model LiteLLM doesn't have in its built-in pricing table, declare
# per-token costs inline so they bill correctly:
#
#   litellm_params:
#     base_model: "my-custom-azure-deploy"
#     # USD per token; e.g. 0.000003 == $3.00 per million input tokens
#     input_cost_per_token: 0.000003
#     output_cost_per_token: 0.000015
#
# A full commented-out premium config sketch appears at the end of
# global_llm_configs below.
#
# OpenRouter dynamic models pull pricing automatically from OpenRouter's
# API — no inline declaration needed. Models without resolvable pricing
# debit $0 from the user's balance and log a WARNING.
# Router Settings for Auto Mode
# These settings control how the LiteLLM Router distributes requests across models
router_settings:
  # Routing strategy options:
  # - "usage-based-routing": Routes to deployment with lowest current usage (recommended for rate limits)
  # - "simple-shuffle": Random distribution with optional RPM/TPM weighting
  # - "least-busy": Routes to least busy deployment
  # - "latency-based-routing": Routes based on response latency
  routing_strategy: "usage-based-routing"
  # Number of retries before failing
  num_retries: 3
  # Number of failures allowed before cooling down a deployment
  allowed_fails: 3
  # Cooldown time in seconds after allowed_fails is exceeded
  cooldown_time: 60
  # Fallback models (optional) - when primary fails, try these
  # Format: [{"primary_model": ["fallback1", "fallback2"]}]
  # fallbacks: []

global_llm_configs:
  # Example: OpenAI GPT-4 Turbo with citations enabled
  - id: -1
    name: "Global GPT-4 Turbo"
    description: "OpenAI's GPT-4 Turbo with default prompts and citations"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "gpt-4-turbo"
    quota_reserve_tokens: 4000
    provider: "OPENAI"
    model_name: "gpt-4-turbo-preview"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    # Rate limits for load balancing (requests/tokens per minute)
    rpm: 500     # Requests per minute
    tpm: 100000  # Tokens per minute
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
    # Prompt Configuration
    system_instructions: ""  # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
    use_default_system_instructions: true
    citations_enabled: true

  # Example: Anthropic Claude 3 Opus
  - id: -2
    name: "Global Claude 3 Opus"
    description: "Anthropic's most capable model with citations"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "claude-3-opus"
    quota_reserve_tokens: 4000
    provider: "ANTHROPIC"
    model_name: "claude-3-opus-20240229"
    api_key: "sk-ant-your-anthropic-api-key-here"
    api_base: ""
    rpm: 1000
    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Example: Fast model - GPT-3.5 Turbo (citations disabled for speed)
  - id: -3
    name: "Global GPT-3.5 Turbo (Fast)"
    description: "Fast responses without citations for quick queries"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "gpt-3.5-turbo-fast"
    quota_reserve_tokens: 2000
    provider: "OPENAI"
    model_name: "gpt-3.5-turbo"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 3500   # GPT-3.5 has higher rate limits
    tpm: 200000
    litellm_params:
      temperature: 0.5
      max_tokens: 2000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: false  # Disabled for faster responses

  # Example: Chinese LLM - DeepSeek with custom instructions
  - id: -4
    name: "Global DeepSeek Chat (Chinese)"
    description: "DeepSeek optimized for Chinese language responses"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "deepseek-chat-chinese"
    quota_reserve_tokens: 4000
    provider: "DEEPSEEK"
    model_name: "deepseek-chat"
    api_key: "your-deepseek-api-key-here"
    api_base: "https://api.deepseek.com/v1"
    rpm: 60
    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
    # Custom system instructions for Chinese responses
    system_instructions: |
      You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.

      Today's date (UTC): {resolved_today}

      IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language.
    use_default_system_instructions: false
    citations_enabled: true

  # Example: Azure OpenAI GPT-4o
  # IMPORTANT: For Azure deployments, always include 'base_model' in litellm_params
  # to enable accurate token counting, cost tracking, and max token limits
  - id: -5
    name: "Global Azure GPT-4o"
    description: "Azure OpenAI GPT-4o deployment"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "azure-gpt-4o"
    quota_reserve_tokens: 4000
    provider: "AZURE"
    # model_name format for Azure: azure/<your-deployment-name>
    model_name: "azure/gpt-4o-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    api_version: "2024-02-15-preview"  # Azure API version
    rpm: 1000
    tpm: 150000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
      # REQUIRED for Azure: Specify the underlying OpenAI model
      # This fixes "Could not identify azure model" warnings
      # Common base_model values: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo
      base_model: "gpt-4o"
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Example: Azure OpenAI GPT-4 Turbo
  - id: -6
    name: "Global Azure GPT-4 Turbo"
    description: "Azure OpenAI GPT-4 Turbo deployment"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "azure-gpt-4-turbo"
    quota_reserve_tokens: 4000
    provider: "AZURE"
    model_name: "azure/gpt-4-turbo-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    api_version: "2024-02-15-preview"
    rpm: 500
    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
      base_model: "gpt-4-turbo"  # Maps to gpt-4-turbo-preview
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Example: Groq - Fast inference
  - id: -7
    name: "Global Groq Llama 3"
    description: "Ultra-fast Llama 3 70B via Groq"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "groq-llama-3"
    quota_reserve_tokens: 8000
    provider: "GROQ"
    model_name: "llama3-70b-8192"
    api_key: "your-groq-api-key-here"
    api_base: ""
    rpm: 30     # Groq has lower rate limits on free tier
    tpm: 14400
    litellm_params:
      temperature: 0.7
      max_tokens: 8000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Example: MiniMax M2.5 - High-performance with 204K context window
  - id: -8
    name: "Global MiniMax M2.5"
    description: "MiniMax M2.5 with 204K context window and competitive pricing"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "minimax-m2.5"
    quota_reserve_tokens: 4000
    provider: "MINIMAX"
    model_name: "MiniMax-M2.5"
    api_key: "your-minimax-api-key-here"
    api_base: "https://api.minimax.io/v1"
    rpm: 60
    tpm: 100000
    litellm_params:
      temperature: 1.0  # MiniMax requires temperature in (0.0, 1.0], cannot be 0
      max_tokens: 4000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
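
  # Example (commented out): a PREMIUM-tier config with inline per-token pricing,
  # as described under COST-BASED PREMIUM CREDITS at the top of this file.
  # A minimal sketch: the deployment name and prices below are illustrative
  # placeholders, not a real model.
  # - id: -9
  #   name: "Global Custom Azure Premium"
  #   description: "In-house Azure deployment billed against premium credits"
  #   billing_tier: "premium"
  #   anonymous_enabled: false
  #   seo_enabled: false
  #   quota_reserve_tokens: 4000
  #   provider: "AZURE"
  #   model_name: "azure/my-custom-deploy"
  #   api_key: "your-azure-api-key-here"
  #   api_base: "https://your-resource.openai.azure.com"
  #   api_version: "2024-02-15-preview"
  #   rpm: 500
  #   tpm: 100000
  #   litellm_params:
  #     temperature: 0.7
  #     max_tokens: 4000
  #     base_model: "my-custom-azure-deploy"
  #     # USD per token; this deployment is not in LiteLLM's built-in pricing
  #     # table, so costs are declared inline for correct billing
  #     input_cost_per_token: 0.000003
  #     output_cost_per_token: 0.000015
  #   system_instructions: ""
  #   use_default_system_instructions: true
  #   citations_enabled: true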

# =============================================================================
# OpenRouter Integration
# =============================================================================
# When enabled, dynamically fetches ALL available models from the OpenRouter API
# and injects them as global configs. This gives premium users access to any model
# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota,
# while free-tier OpenRouter models show up with a green Free badge and do NOT
# consume premium quota.
# Models are fetched at startup and refreshed periodically in the background.
# All calls go through LiteLLM with the openrouter/ prefix.
openrouter_integration:
  enabled: false
  api_key: "sk-or-your-openrouter-api-key"
  # Tier is derived PER MODEL from OpenRouter's own API signals:
  #   - id ends with ":free"                         -> billing_tier=free
  #   - pricing.prompt AND pricing.completion == "0" -> billing_tier=free
  #   - otherwise                                    -> billing_tier=premium
  # (A worked example follows this openrouter_integration block.)
  # No global billing_tier knob is honored; any legacy value emits a startup warning.
  # Anonymous access is split by tier so operators can expose only free
  # models to no-login users without leaking paid inference.
  anonymous_enabled_paid: false
  anonymous_enabled_free: false
  seo_enabled: false
  # quota_reserve_tokens: tokens reserved per call for quota enforcement
  quota_reserve_tokens: 4000
  # id_offset: base negative ID for dynamically generated configs.
  # Model IDs are derived deterministically via BLAKE2b so they survive
  # catalogue churn. Must not overlap with your static global_llm_configs IDs.
  id_offset: -10000
  # refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only)
  refresh_interval_hours: 24
  # Rate limits for PAID OpenRouter models. These are used by LiteLLM Router
  # for per-deployment accounting when OR premium models participate in the
  # shared sub-agent "auto" pool. They do NOT cap OpenRouter itself — your
  # real account limits live at https://openrouter.ai/settings/limits.
  rpm: 200
  tpm: 1000000
  # Rate limits for FREE OpenRouter models. Informational only: free OR
  # models are intentionally kept OUT of the LiteLLM Router pool, because
  # OpenRouter enforces free-tier limits globally per account (~20 RPM +
  # 50-1000 daily requests across every ":free" model combined) —
  # per-deployment router accounting can't represent a shared bucket
  # correctly. Free OR models stay fully available in the model selector
  # and for user-facing Auto thread pinning.
  free_rpm: 20
  free_tpm: 100000
  # Image generation + vision LLM emission are OPT-IN. OpenRouter's catalogue
  # contains hundreds of image- and vision-capable models; turning these on
  # injects them into the global Image-Generation / Vision-LLM model
  # selectors alongside any static configs. Tier (free/premium) is derived
  # per model the same way it is for chat (`:free` suffix or zero pricing).
  # When a user picks a premium image/vision model the call debits the
  # shared $5 USD-cost-based premium credit pool — so leaving these off
  # avoids surprise quota burn on existing deployments. Default: false.
  image_generation_enabled: false
  vision_enabled: false
  litellm_params:
    max_tokens: 16384
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true
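
# Worked example of the per-model tier derivation above (the catalogue ids are
# illustrative, not a live fetch):
#   "meta-llama/llama-3.3-70b-instruct:free"  -> billing_tier=free     (":free" suffix)
#   a model whose pricing.prompt and pricing.completion are both "0"
#                                             -> billing_tier=free     (zero pricing)
#   "anthropic/claude-3-opus"                 -> billing_tier=premium  (everything else)
# Each derived config is invoked through LiteLLM as "openrouter/<model-id>".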

# =============================================================================
# Image Generation Configuration
# =============================================================================
# These configurations power the image generation feature using litellm.aimage_generation().
# Supported providers: OpenAI, Azure, Google AI Studio, Vertex AI, AWS Bedrock,
# Recraft, OpenRouter, Xinference, Nscale
#
# Auto mode (ID 0) uses LiteLLM Router for load balancing across all image gen configs.

# Router Settings for Image Generation Auto Mode
image_generation_router_settings:
  routing_strategy: "usage-based-routing"
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60

global_image_generation_configs:
  # Example: OpenAI DALL-E 3
  - id: -1
    name: "Global DALL-E 3"
    description: "OpenAI's DALL-E 3 for high-quality image generation"
    provider: "OPENAI"
    model_name: "dall-e-3"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 50  # Requests per minute (image gen is rate-limited by RPM, not tokens)
    litellm_params: {}

  # Example: OpenAI GPT Image 1
  - id: -2
    name: "Global GPT Image 1"
    description: "OpenAI's GPT Image 1 model"
    provider: "OPENAI"
    model_name: "gpt-image-1"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 50
    litellm_params: {}

  # Example: Azure OpenAI DALL-E 3
  - id: -3
    name: "Global Azure DALL-E 3"
    description: "Azure-hosted DALL-E 3 deployment"
    provider: "AZURE_OPENAI"
    model_name: "azure/dall-e-3-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    api_version: "2024-02-15-preview"
    rpm: 50
    litellm_params:
      base_model: "dall-e-3"

  # Example: OpenRouter Gemini Image Generation
  # - id: -4
  #   name: "Global Gemini Image Gen"
  #   description: "Google Gemini image generation via OpenRouter"
  #   provider: "OPENROUTER"
  #   model_name: "google/gemini-2.5-flash-image"
  #   api_key: "your-openrouter-api-key-here"
  #   api_base: ""
  #   rpm: 30
  #   litellm_params: {}

# =============================================================================
# Vision LLM Configuration
# =============================================================================
# These configurations power the vision autocomplete feature (screenshot analysis).
# Only vision-capable models should be used here (e.g. GPT-4o, Gemini Pro, Claude 3).
# Supported providers: OpenAI, Anthropic, Google, Azure OpenAI, Vertex AI, Bedrock,
# xAI, OpenRouter, Ollama, Groq, Together AI, Fireworks AI, DeepSeek, Mistral, Custom
#
# Auto mode (ID 0) uses LiteLLM Router for load balancing across all vision configs.
# Router Settings for Vision LLM Auto Mode
vision_llm_router_settings:
  routing_strategy: "usage-based-routing"
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60

global_vision_llm_configs:
  # Example: OpenAI GPT-4o (recommended for vision)
  - id: -1
    name: "Global GPT-4o Vision"
    description: "OpenAI's GPT-4o with strong vision capabilities"
    provider: "OPENAI"
    model_name: "gpt-4o"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 500
    tpm: 100000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000

  # Example: Google Gemini 2.0 Flash
  - id: -2
    name: "Global Gemini 2.0 Flash"
    description: "Google's fast vision model with large context"
    provider: "GOOGLE"
    model_name: "gemini-2.0-flash"
    api_key: "your-google-ai-api-key-here"
    api_base: ""
    rpm: 1000
    tpm: 200000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000

  # Example: Anthropic Claude 3.5 Sonnet
  - id: -3
    name: "Global Claude 3.5 Sonnet Vision"
    description: "Anthropic's Claude 3.5 Sonnet with vision support"
    provider: "ANTHROPIC"
    model_name: "claude-3-5-sonnet-20241022"
    api_key: "sk-ant-your-anthropic-api-key-here"
    api_base: ""
    rpm: 1000
    tpm: 100000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000

  # Example: Azure OpenAI GPT-4o
  # - id: -4
  #   name: "Global Azure GPT-4o Vision"
  #   description: "Azure-hosted GPT-4o for vision analysis"
  #   provider: "AZURE_OPENAI"
  #   model_name: "azure/gpt-4o-deployment"
  #   api_key: "your-azure-api-key-here"
  #   api_base: "https://your-resource.openai.azure.com"
  #   api_version: "2024-02-15-preview"
  #   rpm: 500
  #   tpm: 100000
  #   litellm_params:
  #     temperature: 0.3
  #     max_tokens: 1000
  #     base_model: "gpt-4o"

# Notes:
# - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing
# - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB)
# - IDs should be unique and sequential (e.g., -1, -2, -3, etc.)
# - The 'api_key' field will not be exposed to users via API
# - system_instructions: Custom prompt or empty string to use defaults
# - use_default_system_instructions: true = use SURFSENSE_SYSTEM_INSTRUCTIONS when system_instructions is empty
# - citations_enabled: true = include citation instructions, false = include anti-citation instructions
# - All standard LiteLLM providers are supported
# - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
#   These help the router distribute load evenly and avoid rate limit errors
#
# IMAGE GENERATION NOTES:
# - Image generation configs use the same ID scheme as LLM configs (negative for global)
# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure),
#   bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter)
# - The router uses litellm.aimage_generation() for async image generation
# - Only RPM (requests per minute) is relevant for image generation rate limiting.
#   TPM (tokens per minute) does not apply since image APIs are billed/rate-limited per request, not per token.
#
# VISION LLM NOTES:
# - Vision configs use the same ID scheme (negative for global, positive for user DB)
# - Only use vision-capable models (GPT-4o, Gemini, Claude 3, etc.)
# - Lower temperature (0.3) is recommended for accurate screenshot analysis
# - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
#
# TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
# - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
# - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
# - seo_enabled: true/false. Whether a /free/ landing page is generated.
# - seo_slug: Stable URL slug for SEO pages. Must be unique. Do NOT change once public.
# - seo_title: Optional HTML title tag override for the model's /free/ page.
# - seo_description: Optional meta description override for the model's /free/ page.
# - quota_reserve_tokens: Tokens reserved before each LLM call for quota enforcement.
#   Independent of litellm_params.max_tokens. Used by the token quota service.
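#
# For instance, an SEO-enabled static config could carry the optional overrides
# like this (a minimal commented-out sketch; the id, slug, and copy are
# illustrative placeholders):
#
#   - id: -10
#     name: "Global Example Model"
#     billing_tier: "free"
#     anonymous_enabled: true
#     seo_enabled: true
#     seo_slug: "example-model"
#     seo_title: "Example Model - Free Online AI Chat"
#     seo_description: "Chat with Example Model online for free, no login required."
#     quota_reserve_tokens: 4000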