mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 08:46:22 +02:00
Add full MiniMax provider support across the entire stack: Backend: - Add MINIMAX to LiteLLMProvider enum in db.py - Add MINIMAX mapping to all provider_map dicts in llm_service.py, llm_router_service.py, and llm_config.py - Add Alembic migration (rev 106) for PostgreSQL enum - Add MiniMax M2.5 example in global_llm_config.example.yaml Frontend: - Add MiniMax to LLM_PROVIDERS enum with apiBase - Add MiniMax-M2.5 and MiniMax-M2.5-highspeed to LLM_MODELS - Add MINIMAX to Zod validation schema - Add MiniMax SVG icon and wire up in provider-icons Docs: - Add MiniMax setup guide in chinese-llm-setup.md MiniMax uses an OpenAI-compatible API (https://api.minimax.io/v1) with models supporting up to 204K context window. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
285 lines
10 KiB
YAML
285 lines
10 KiB
YAML
# Global LLM Configuration
#
# SETUP INSTRUCTIONS:
# 1. For production: Copy this file to global_llm_config.yaml and add your real API keys
# 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist
#
# NOTE: The example API keys below are placeholders and won't work.
# Replace them with your actual API keys to enable global configurations.
#
# These configurations will be available to all users as a convenient option.
# Users can choose to use these global configs or add their own.
#
# AUTO MODE (Recommended):
# - Auto mode (ID: 0) uses LiteLLM Router to automatically load balance across all global configs
# - This helps avoid rate limits by distributing requests across multiple providers
# - New users are automatically assigned Auto mode by default
# - Configure router_settings below to customize the load balancing behavior
#
# Structure matches NewLLMConfig:
# - LLM model configuration (provider, model_name, api_key, etc.)
# - Prompt configuration (system_instructions, citations_enabled)

# Router Settings for Auto Mode
# These settings control how the LiteLLM Router distributes requests across models
router_settings:
  # Routing strategy options:
  # - "usage-based-routing": Routes to deployment with lowest current usage (recommended for rate limits)
  # - "simple-shuffle": Random distribution with optional RPM/TPM weighting
  # - "least-busy": Routes to least busy deployment
  # - "latency-based-routing": Routes based on response latency
  routing_strategy: "usage-based-routing"

  # Number of retries before failing
  num_retries: 3

  # Number of failures allowed before cooling down a deployment
  allowed_fails: 3

  # Cooldown time in seconds after allowed_fails is exceeded
  cooldown_time: 60

  # Fallback models (optional) - when primary fails, try these
  # Format: [{"primary_model": ["fallback1", "fallback2"]}]
  # fallbacks: []

# List of globally available LLM configurations; each item is one selectable config.
global_llm_configs:
# Example: OpenAI GPT-4 Turbo with citations enabled
- id: -1
  name: "Global GPT-4 Turbo"
  description: "OpenAI's GPT-4 Turbo with default prompts and citations"
  provider: "OPENAI"
  model_name: "gpt-4-turbo-preview"
  api_key: "sk-your-openai-api-key-here"  # Placeholder - replace with a real key
  api_base: ""
  # Rate limits for load balancing (requests/tokens per minute)
  rpm: 500  # Requests per minute
  tpm: 100000  # Tokens per minute
  litellm_params:
    temperature: 0.7
    max_tokens: 4000
  # Prompt Configuration
  system_instructions: ""  # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS
  use_default_system_instructions: true
  citations_enabled: true

# Example: Anthropic Claude 3 Opus
- id: -2
  name: "Global Claude 3 Opus"
  description: "Anthropic's most capable model with citations"
  provider: "ANTHROPIC"
  model_name: "claude-3-opus-20240229"
  api_key: "sk-ant-your-anthropic-api-key-here"  # Placeholder - replace with a real key
  api_base: ""
  # Rate limits for load balancing (requests/tokens per minute)
  rpm: 1000
  tpm: 100000
  litellm_params:
    temperature: 0.7
    max_tokens: 4000
  # Prompt configuration: empty instructions + default flag = built-in system prompt
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true

# Example: Fast model - GPT-3.5 Turbo (citations disabled for speed)
- id: -3
  name: "Global GPT-3.5 Turbo (Fast)"
  description: "Fast responses without citations for quick queries"
  provider: "OPENAI"
  model_name: "gpt-3.5-turbo"
  api_key: "sk-your-openai-api-key-here"  # Placeholder - replace with a real key
  api_base: ""
  rpm: 3500  # GPT-3.5 has higher rate limits
  tpm: 200000
  litellm_params:
    temperature: 0.5
    max_tokens: 2000
  # Prompt configuration
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: false  # Disabled for faster responses

# Example: Chinese LLM - DeepSeek with custom instructions
- id: -4
  name: "Global DeepSeek Chat (Chinese)"
  description: "DeepSeek optimized for Chinese language responses"
  provider: "DEEPSEEK"
  model_name: "deepseek-chat"
  api_key: "your-deepseek-api-key-here"  # Placeholder - replace with a real key
  api_base: "https://api.deepseek.com/v1"
  rpm: 60
  tpm: 100000
  litellm_params:
    temperature: 0.7
    max_tokens: 4000
  # Custom system instructions for Chinese responses.
  # Literal block scalar: the body must stay indented deeper than the key, and
  # {resolved_today} is left verbatim as a placeholder for the consumer to fill in.
  system_instructions: |
    <system_instruction>
    You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base.

    Today's date (UTC): {resolved_today}

    IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language.
    </system_instruction>
  use_default_system_instructions: false
  citations_enabled: true

# Example: Azure OpenAI GPT-4o
# IMPORTANT: For Azure deployments, always include 'base_model' in litellm_params
# to enable accurate token counting, cost tracking, and max token limits
- id: -5
  name: "Global Azure GPT-4o"
  description: "Azure OpenAI GPT-4o deployment"
  provider: "AZURE"
  # model_name format for Azure: azure/<your-deployment-name>
  model_name: "azure/gpt-4o-deployment"
  api_key: "your-azure-api-key-here"  # Placeholder - replace with a real key
  api_base: "https://your-resource.openai.azure.com"
  api_version: "2024-02-15-preview"  # Azure API version
  rpm: 1000
  tpm: 150000
  litellm_params:
    temperature: 0.7
    max_tokens: 4000
    # REQUIRED for Azure: Specify the underlying OpenAI model
    # This fixes "Could not identify azure model" warnings
    # Common base_model values: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo
    base_model: "gpt-4o"
  # Prompt configuration
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true

# Example: Azure OpenAI GPT-4 Turbo
- id: -6
  name: "Global Azure GPT-4 Turbo"
  description: "Azure OpenAI GPT-4 Turbo deployment"
  provider: "AZURE"
  # model_name format for Azure: azure/<your-deployment-name>
  model_name: "azure/gpt-4-turbo-deployment"
  api_key: "your-azure-api-key-here"  # Placeholder - replace with a real key
  api_base: "https://your-resource.openai.azure.com"
  api_version: "2024-02-15-preview"
  rpm: 500
  tpm: 100000
  litellm_params:
    temperature: 0.7
    max_tokens: 4000
    # Underlying OpenAI model for this Azure deployment
    base_model: "gpt-4-turbo"  # Maps to gpt-4-turbo-preview
  # Prompt configuration
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true

# Example: Groq - Fast inference
- id: -7
  name: "Global Groq Llama 3"
  description: "Ultra-fast Llama 3 70B via Groq"
  provider: "GROQ"
  model_name: "llama3-70b-8192"
  api_key: "your-groq-api-key-here"  # Placeholder - replace with a real key
  api_base: ""
  rpm: 30  # Groq has lower rate limits on free tier
  # NOTE(review): 14400 is Groq's free-tier requests-per-DAY figure, not tokens
  # per minute; 6000 is the free-tier TPM for this model - verify against your plan.
  tpm: 6000
  litellm_params:
    temperature: 0.7
    # NOTE(review): the model name indicates an 8,192-token window, presumably shared
    # by prompt and completion; 4000 (same as the other examples) leaves room for the prompt.
    max_tokens: 4000
  # Prompt configuration
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true

# Example: MiniMax M2.5 - High-performance with 204K context window
- id: -8
  name: "Global MiniMax M2.5"
  description: "MiniMax M2.5 with 204K context window and competitive pricing"
  provider: "MINIMAX"
  model_name: "MiniMax-M2.5"
  api_key: "your-minimax-api-key-here"  # Placeholder - replace with a real key
  api_base: "https://api.minimax.io/v1"
  rpm: 60
  tpm: 100000
  litellm_params:
    temperature: 1.0  # MiniMax requires temperature in (0.0, 1.0], cannot be 0
    max_tokens: 4000
  # Prompt configuration
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true

# =============================================================================
# Image Generation Configuration
# =============================================================================
# These configurations power the image generation feature using litellm.aimage_generation().
# Supported providers: OpenAI, Azure, Google AI Studio, Vertex AI, AWS Bedrock,
#   Recraft, OpenRouter, Xinference, Nscale
#
# Auto mode (ID 0) uses LiteLLM Router for load balancing across all image gen configs.

# Router Settings for Image Generation Auto Mode
# Same keys as router_settings, applied to the image generation configs.
image_generation_router_settings:
  routing_strategy: "usage-based-routing"
  num_retries: 3  # Number of retries before failing
  allowed_fails: 3  # Failures allowed before cooling down a deployment
  cooldown_time: 60  # Cooldown in seconds after allowed_fails is exceeded

# List of globally available image generation configurations.
global_image_generation_configs:
# Example: OpenAI DALL-E 3
- id: -1
  name: "Global DALL-E 3"
  description: "OpenAI's DALL-E 3 for high-quality image generation"
  provider: "OPENAI"
  model_name: "dall-e-3"
  api_key: "sk-your-openai-api-key-here"  # Placeholder - replace with a real key
  api_base: ""
  rpm: 50  # Requests per minute (image gen is rate-limited by RPM, not tokens)
  litellm_params: {}

# Example: OpenAI GPT Image 1
- id: -2
  name: "Global GPT Image 1"
  description: "OpenAI's GPT Image 1 model"
  provider: "OPENAI"
  model_name: "gpt-image-1"
  api_key: "sk-your-openai-api-key-here"  # Placeholder - replace with a real key
  api_base: ""
  rpm: 50  # Requests per minute
  litellm_params: {}

# Example: Azure OpenAI DALL-E 3
- id: -3
  name: "Global Azure DALL-E 3"
  description: "Azure-hosted DALL-E 3 deployment"
  # NOTE(review): the LLM examples in this file use provider "AZURE"; confirm the
  # image generation provider list really expects "AZURE_OPENAI" here.
  provider: "AZURE_OPENAI"
  # Same azure/<your-deployment-name> format as the Azure LLM examples
  model_name: "azure/dall-e-3-deployment"
  api_key: "your-azure-api-key-here"  # Placeholder - replace with a real key
  api_base: "https://your-resource.openai.azure.com"
  api_version: "2024-02-15-preview"
  rpm: 50  # Requests per minute
  litellm_params:
    # Underlying OpenAI model for this Azure deployment
    base_model: "dall-e-3"

# Example: OpenRouter Gemini Image Generation (uncomment to enable)
# - id: -4
#   name: "Global Gemini Image Gen"
#   description: "Google Gemini image generation via OpenRouter"
#   provider: "OPENROUTER"
#   model_name: "google/gemini-2.5-flash-image"
#   api_key: "your-openrouter-api-key-here"
#   api_base: ""
#   rpm: 30
#   litellm_params: {}

# Notes:
# - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing
# - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB)
# - IDs should be unique within each list and sequential (e.g., -1, -2, -3, etc.)
# - The 'api_key' field will not be exposed to users via API
# - system_instructions: Custom prompt or empty string to use defaults
# - use_default_system_instructions: true = use SURFSENSE_SYSTEM_INSTRUCTIONS when system_instructions is empty
# - citations_enabled: true = include citation instructions, false = include anti-citation instructions
# - All standard LiteLLM providers are supported
# - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute)
#   These help the router distribute load evenly and avoid rate limit errors
#
#
# IMAGE GENERATION NOTES:
# - Image generation configs use the same ID scheme as LLM configs (negative for global)
# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure),
#   bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter)
# - The router uses litellm.aimage_generation() for async image generation
# - Only RPM (requests per minute) is relevant for image generation rate limiting.
#   TPM (tokens per minute) does not apply since image APIs are billed/rate-limited per request, not per token.