# Source: SurfSense/surfsense_backend/app/config/global_llm_config.example.yaml
# (snapshot exported 2026-04-15 23:46:29 -07:00)
---
# Global LLM Configuration
#
# SETUP INSTRUCTIONS:
# 1. For production: Copy this file to global_llm_config.yaml and add your real API keys
# 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist
#
# NOTE: The example API keys below are placeholders and won't work.
# Replace them with your actual API keys to enable global configurations.
#
# These configurations will be available to all users as a convenient option
# Users can choose to use these global configs or add their own
#
# AUTO MODE (Recommended):
# - Auto mode (ID: 0) uses LiteLLM Router to automatically load balance across all global configs
# - This helps avoid rate limits by distributing requests across multiple providers
# - New users are automatically assigned Auto mode by default
# - Configure router_settings below to customize the load balancing behavior
#
# Structure matches NewLLMConfig:
# - Model configuration (provider, model_name, api_key, etc.)
# - Prompt configuration (system_instructions, citations_enabled)
# Router Settings for Auto Mode
# These settings control how the LiteLLM Router distributes requests across models
router_settings:
  # Routing strategy options:
  #   - "usage-based-routing": Routes to deployment with lowest current usage (recommended for rate limits)
  #   - "simple-shuffle": Random distribution with optional RPM/TPM weighting
  #   - "least-busy": Routes to least busy deployment
  #   - "latency-based-routing": Routes based on response latency
  routing_strategy: "usage-based-routing"
  # Number of retries before failing
  num_retries: 3
  # Number of failures allowed before cooling down a deployment
  allowed_fails: 3
  # Cooldown time in seconds after allowed_fails is exceeded
  cooldown_time: 60
  # Fallback models (optional) - when primary fails, try these
  # Format: [{"primary_model": ["fallback1", "fallback2"]}]
  # fallbacks: []
global_llm_configs:
  # Example: OpenAI GPT-4 Turbo with citations enabled
  - id: -1
    name: "Global GPT-4 Turbo"
    description: "OpenAI's GPT-4 Turbo with default prompts and citations"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "gpt-4-turbo"
    quota_reserve_tokens: 4000
    provider: "OPENAI"
    model_name: "gpt-4-turbo-preview"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    # Rate limits for load balancing (requests/tokens per minute)
    rpm: 500  # Requests per minute
    tpm: 100000  # Tokens per minute
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
    # Prompt Configuration
    system_instructions: ""  # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
    use_default_system_instructions: true
    citations_enabled: true

  # Example: Anthropic Claude 3 Opus
  - id: -2
    name: "Global Claude 3 Opus"
    description: "Anthropic's most capable model with citations"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "claude-3-opus"
    quota_reserve_tokens: 4000
    provider: "ANTHROPIC"
    model_name: "claude-3-opus-20240229"
    api_key: "sk-ant-your-anthropic-api-key-here"
    api_base: ""
    rpm: 1000
    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Example: Fast model - GPT-3.5 Turbo (citations disabled for speed)
  - id: -3
    name: "Global GPT-3.5 Turbo (Fast)"
    description: "Fast responses without citations for quick queries"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "gpt-3.5-turbo-fast"
    quota_reserve_tokens: 2000
    provider: "OPENAI"
    model_name: "gpt-3.5-turbo"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 3500  # GPT-3.5 has higher rate limits
    tpm: 200000
    litellm_params:
      temperature: 0.5
      max_tokens: 2000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: false  # Disabled for faster responses

  # Example: Chinese LLM - DeepSeek with custom instructions
  - id: -4
    name: "Global DeepSeek Chat (Chinese)"
    description: "DeepSeek optimized for Chinese language responses"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "deepseek-chat-chinese"
    quota_reserve_tokens: 4000
    provider: "DEEPSEEK"
    model_name: "deepseek-chat"
    api_key: "your-deepseek-api-key-here"
    api_base: "https://api.deepseek.com/v1"
    rpm: 60
    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
    # Custom system instructions for Chinese responses
    system_instructions: |
      <system_instruction>
      You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.
      Today's date (UTC): {resolved_today}
      IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language.
      </system_instruction>
    use_default_system_instructions: false
    citations_enabled: true

  # Example: Azure OpenAI GPT-4o
  # IMPORTANT: For Azure deployments, always include 'base_model' in litellm_params
  # to enable accurate token counting, cost tracking, and max token limits
  - id: -5
    name: "Global Azure GPT-4o"
    description: "Azure OpenAI GPT-4o deployment"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "azure-gpt-4o"
    quota_reserve_tokens: 4000
    provider: "AZURE"
    # model_name format for Azure: azure/<your-deployment-name>
    model_name: "azure/gpt-4o-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    api_version: "2024-02-15-preview"  # Azure API version
    rpm: 1000
    tpm: 150000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
      # REQUIRED for Azure: Specify the underlying OpenAI model
      # This fixes "Could not identify azure model" warnings
      # Common base_model values: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo
      base_model: "gpt-4o"
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Example: Azure OpenAI GPT-4 Turbo
  - id: -6
    name: "Global Azure GPT-4 Turbo"
    description: "Azure OpenAI GPT-4 Turbo deployment"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "azure-gpt-4-turbo"
    quota_reserve_tokens: 4000
    provider: "AZURE"
    model_name: "azure/gpt-4-turbo-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    api_version: "2024-02-15-preview"
    rpm: 500
    tpm: 100000
    litellm_params:
      temperature: 0.7
      max_tokens: 4000
      base_model: "gpt-4-turbo"  # Maps to gpt-4-turbo-preview
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Example: Groq - Fast inference
  - id: -7
    name: "Global Groq Llama 3"
    description: "Ultra-fast Llama 3 70B via Groq"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "groq-llama-3"
    quota_reserve_tokens: 8000
    provider: "GROQ"
    model_name: "llama3-70b-8192"
    api_key: "your-groq-api-key-here"
    api_base: ""
    rpm: 30  # Groq has lower rate limits on free tier
    tpm: 14400
    litellm_params:
      temperature: 0.7
      max_tokens: 8000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Example: MiniMax M2.5 - High-performance with 204K context window
  - id: -8
    name: "Global MiniMax M2.5"
    description: "MiniMax M2.5 with 204K context window and competitive pricing"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "minimax-m2.5"
    quota_reserve_tokens: 4000
    provider: "MINIMAX"
    model_name: "MiniMax-M2.5"
    api_key: "your-minimax-api-key-here"
    api_base: "https://api.minimax.io/v1"
    rpm: 60
    tpm: 100000
    litellm_params:
      temperature: 1.0  # MiniMax requires temperature in (0.0, 1.0], cannot be 0
      max_tokens: 4000
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true
# =============================================================================
# OpenRouter Integration
# =============================================================================
# When enabled, dynamically fetches ALL available models from the OpenRouter API
# and injects them as global configs. This gives premium users access to any model
# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota.
# Models are fetched at startup and refreshed periodically in the background.
# All calls go through LiteLLM with the openrouter/ prefix.
openrouter_integration:
  enabled: false
  api_key: "sk-or-your-openrouter-api-key"
  # billing_tier: "premium" or "free". Controls whether users need premium tokens.
  billing_tier: "premium"
  # anonymous_enabled: set true to also show OpenRouter models to no-login users
  anonymous_enabled: false
  seo_enabled: false
  # quota_reserve_tokens: tokens reserved per call for quota enforcement
  quota_reserve_tokens: 4000
  # id_offset: starting negative ID for dynamically generated configs.
  # Must not overlap with your static global_llm_configs IDs above.
  id_offset: -10000
  # refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only)
  refresh_interval_hours: 24
  # rpm/tpm: Applied uniformly to all OpenRouter models for LiteLLM Router load balancing.
  # OpenRouter doesn't expose per-model rate limits via API; actual throttling is handled
  # upstream by OpenRouter itself (your account limits are at https://openrouter.ai/settings/limits).
  # These values only matter if you set billing_tier to "free" (adding them to Auto mode).
  # For premium-only models they are cosmetic. Set conservatively or match your account tier.
  rpm: 200
  tpm: 1000000
  litellm_params:
    max_tokens: 16384
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true
# =============================================================================
# Image Generation Configuration
# =============================================================================
# These configurations power the image generation feature using litellm.aimage_generation().
# Supported providers: OpenAI, Azure, Google AI Studio, Vertex AI, AWS Bedrock,
# Recraft, OpenRouter, Xinference, Nscale
#
# Auto mode (ID 0) uses LiteLLM Router for load balancing across all image gen configs.
# Router Settings for Image Generation Auto Mode
image_generation_router_settings:
  routing_strategy: "usage-based-routing"
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60
global_image_generation_configs:
  # Example: OpenAI DALL-E 3
  - id: -1
    name: "Global DALL-E 3"
    description: "OpenAI's DALL-E 3 for high-quality image generation"
    provider: "OPENAI"
    model_name: "dall-e-3"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 50  # Requests per minute (image gen is rate-limited by RPM, not tokens)
    litellm_params: {}

  # Example: OpenAI GPT Image 1
  - id: -2
    name: "Global GPT Image 1"
    description: "OpenAI's GPT Image 1 model"
    provider: "OPENAI"
    model_name: "gpt-image-1"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 50
    litellm_params: {}

  # Example: Azure OpenAI DALL-E 3
  - id: -3
    name: "Global Azure DALL-E 3"
    description: "Azure-hosted DALL-E 3 deployment"
    provider: "AZURE_OPENAI"
    model_name: "azure/dall-e-3-deployment"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    api_version: "2024-02-15-preview"
    rpm: 50
    litellm_params:
      base_model: "dall-e-3"

  # Example: OpenRouter Gemini Image Generation
  # - id: -4
  #   name: "Global Gemini Image Gen"
  #   description: "Google Gemini image generation via OpenRouter"
  #   provider: "OPENROUTER"
  #   model_name: "google/gemini-2.5-flash-image"
  #   api_key: "your-openrouter-api-key-here"
  #   api_base: ""
  #   rpm: 30
  #   litellm_params: {}
# =============================================================================
# Vision LLM Configuration
# =============================================================================
# These configurations power the vision autocomplete feature (screenshot analysis).
# Only vision-capable models should be used here (e.g. GPT-4o, Gemini Pro, Claude 3).
# Supported providers: OpenAI, Anthropic, Google, Azure OpenAI, Vertex AI, Bedrock,
# xAI, OpenRouter, Ollama, Groq, Together AI, Fireworks AI, DeepSeek, Mistral, Custom
#
# Auto mode (ID 0) uses LiteLLM Router for load balancing across all vision configs.
# Router Settings for Vision LLM Auto Mode
vision_llm_router_settings:
  routing_strategy: "usage-based-routing"
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60
global_vision_llm_configs:
  # Example: OpenAI GPT-4o (recommended for vision)
  - id: -1
    name: "Global GPT-4o Vision"
    description: "OpenAI's GPT-4o with strong vision capabilities"
    provider: "OPENAI"
    model_name: "gpt-4o"
    api_key: "sk-your-openai-api-key-here"
    api_base: ""
    rpm: 500
    tpm: 100000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000

  # Example: Google Gemini 2.0 Flash
  - id: -2
    name: "Global Gemini 2.0 Flash"
    description: "Google's fast vision model with large context"
    provider: "GOOGLE"
    model_name: "gemini-2.0-flash"
    api_key: "your-google-ai-api-key-here"
    api_base: ""
    rpm: 1000
    tpm: 200000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000

  # Example: Anthropic Claude 3.5 Sonnet
  - id: -3
    name: "Global Claude 3.5 Sonnet Vision"
    description: "Anthropic's Claude 3.5 Sonnet with vision support"
    provider: "ANTHROPIC"
    model_name: "claude-3-5-sonnet-20241022"
    api_key: "sk-ant-your-anthropic-api-key-here"
    api_base: ""
    rpm: 1000
    tpm: 100000
    litellm_params:
      temperature: 0.3
      max_tokens: 1000

  # Example: Azure OpenAI GPT-4o
  # - id: -4
  #   name: "Global Azure GPT-4o Vision"
  #   description: "Azure-hosted GPT-4o for vision analysis"
  #   provider: "AZURE_OPENAI"
  #   model_name: "azure/gpt-4o-deployment"
  #   api_key: "your-azure-api-key-here"
  #   api_base: "https://your-resource.openai.azure.com"
  #   api_version: "2024-02-15-preview"
  #   rpm: 500
  #   tpm: 100000
  #   litellm_params:
  #     temperature: 0.3
  #     max_tokens: 1000
  #     base_model: "gpt-4o"
# Notes:
# - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing
# - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB)
# - IDs should be unique and sequential (e.g., -1, -2, -3, etc.)
# - The 'api_key' field will not be exposed to users via API
# - system_instructions: Custom prompt or empty string to use defaults
# - use_default_system_instructions: true = use SURFSENSE_SYSTEM_INSTRUCTIONS when system_instructions is empty
# - citations_enabled: true = include citation instructions, false = include anti-citation instructions
# - All standard LiteLLM providers are supported
# - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
# These help the router distribute load evenly and avoid rate limit errors
#
#
# IMAGE GENERATION NOTES:
# - Image generation configs use the same ID scheme as LLM configs (negative for global)
# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure),
# bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter)
# - The router uses litellm.aimage_generation() for async image generation
# - Only RPM (requests per minute) is relevant for image generation rate limiting.
# TPM (tokens per minute) does not apply since image APIs are billed/rate-limited per request, not per token.
#
# VISION LLM NOTES:
# - Vision configs use the same ID scheme (negative for global, positive for user DB)
# - Only use vision-capable models (GPT-4o, Gemini, Claude 3, etc.)
# - Lower temperature (0.3) is recommended for accurate screenshot analysis
# - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
#
# TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
# - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
# - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
# - seo_enabled: true/false. Whether a /free/<seo_slug> landing page is generated.
# - seo_slug: Stable URL slug for SEO pages. Must be unique. Do NOT change once public.
# - seo_title: Optional HTML title tag override for the model's /free/<slug> page.
# - seo_description: Optional meta description override for the model's /free/<slug> page.
# - quota_reserve_tokens: Tokens reserved before each LLM call for quota enforcement.
# Independent of litellm_params.max_tokens. Used by the token quota service.