# Source: SurfSense/surfsense_backend/app/config/global_llm_config.example.yaml

# Global LLM Configuration
#
# SETUP INSTRUCTIONS:
# 1. Copy this file to global_llm_config.yaml.
# 2. Replace placeholder credentials, endpoints, deployment names, and pricing
# with values from your own provider accounts.
#
# This file is intentionally safe to commit. Do not put real API keys in this
# example file.
#
# These YAML entries are materialized at startup as server-owned GLOBAL
# connections and models:
#
#   global_llm_configs              -> GLOBAL chat models
#   global_image_generation_configs -> GLOBAL image generation models
#
# Do not add global_connections or global_models sections here. They are
# runtime-derived metadata exposed through the model-connections APIs.
#
# Static config shape:
# - Connection fields: provider, api_key, api_base, api_version
# - Model fields: model_name, billing_tier, rpm/tpm, capabilities, litellm_params
# - Public no-login SEO metadata: seo_title, seo_description
# - Prompt defaults: system_instructions, use_default_system_instructions,
# citations_enabled
#
# Provider notes:
# - Use the canonical provider field.
# - For Azure, use the bare deployment name in model_name, for example
#   model_name: "gpt-5.1". The resolver builds the LiteLLM model string by
#   prefixing it with the provider, for example "azure/gpt-5.1".
#
# GLOBAL ID namespace:
# - ID 0 is reserved for Auto mode.
# - Negative IDs are server-owned GLOBAL models.
# - Positive IDs are user/BYOK database models.
# - Keep static IDs unique across chat and image generation.
# - Suggested static ranges: chat -1..-999, image -2001..-2999.
# - Vision is not a separate config/table. Chat models that accept images use
# supports_image_input: true.
#
# COST-BASED PREMIUM CREDITS:
# Each premium model bills the user's USD-credit balance based on provider cost
# reported by LiteLLM. For custom Azure deployments or any model LiteLLM does
# not know, declare per-token costs inline:
#
#   litellm_params:
#     base_model: "my-custom-deployment"
#     # USD per token; 0.00000125 == $1.25 per million input tokens.
#     input_cost_per_token: 0.00000125
#     output_cost_per_token: 0.00001
#
# OpenRouter dynamic chat models pull pricing automatically from OpenRouter's
# API. Models without resolvable pricing debit $0 and log a warning.
# =============================================================================
# Chat Auto Mode Router Settings
# =============================================================================
# These settings control how the LiteLLM Router distributes Auto-mode requests
# across curated router-eligible GLOBAL chat deployments.
router_settings:
  # Strategy used to pick a deployment for each Auto-mode request:
  #   - "usage-based-routing": Routes to deployment with lowest current usage.
  #   - "simple-shuffle": Random distribution with optional RPM/TPM weighting.
  #   - "least-busy": Routes to least busy deployment.
  #   - "latency-based-routing": Routes based on response latency.
  routing_strategy: "usage-based-routing"
  # Retry and cooldown knobs for the router. NOTE(review): cooldown_time is
  # presumably in seconds; confirm against the LiteLLM Router documentation.
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60
  # Optional fallback map (uncomment and adapt to your deployments):
  # fallbacks:
  #   - {"azure/gpt-5.1": ["azure/gpt-5.4-mini"]}
# =============================================================================
# Static GLOBAL Chat Models
# =============================================================================
global_llm_configs:
# Premium Azure chat model with image input support and explicit custom
# pricing. This is the current shape to use for hosted GPT 5.x deployments.
- id: -1
  name: "Azure GPT 5.1"
  billing_tier: "premium"
  anonymous_enabled: false
  seo_enabled: false
  seo_slug: "azure-gpt-5-1"
  quota_reserve_tokens: 4000
  provider: "azure"
  # Bare Azure deployment name; do not add a provider prefix here.
  model_name: "gpt-5.1"
  supports_image_input: true
  supports_tools: true
  max_input_tokens: 400000
  api_key: "your-azure-api-key-here"
  api_base: "https://your-resource.openai.azure.com"
  # api_version is optional. Include it if your Azure deployment requires a
  # specific API version.
  # api_version: "2025-04-01-preview"
  rpm: 47500
  tpm: 14750000
  litellm_params:
    max_tokens: 16384
    base_model: "gpt-5.1"
    # Inline pricing in USD per token:
    # $1.25 per million input tokens, $10 per million output tokens.
    input_cost_per_token: 0.00000125
    output_cost_per_token: 0.00001
  # Prompt defaults are model-level fields, not LiteLLM parameters.
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true
# Larger premium chat model. If your provider prices long-context traffic
# differently, choose a conservative flat price or document the limitation
# next to the inline pricing.
- id: -2
  name: "Azure GPT 5.4"
  billing_tier: "premium"
  anonymous_enabled: false
  seo_enabled: false
  seo_slug: "azure-gpt-5-4"
  quota_reserve_tokens: 4000
  provider: "azure"
  # Bare Azure deployment name; do not add a provider prefix here.
  model_name: "gpt-5.4"
  supports_image_input: true
  supports_tools: true
  max_input_tokens: 400000
  api_key: "your-azure-api-key-here"
  api_base: "https://your-resource.openai.azure.com"
  rpm: 150000
  tpm: 15000000
  litellm_params:
    max_tokens: 16384
    base_model: "gpt-5.4"
    # Inline flat pricing in USD per token:
    # $2.50 per million input tokens, $15 per million output tokens.
    input_cost_per_token: 0.0000025
    output_cost_per_token: 0.000015
  # Prompt defaults are model-level fields, not LiteLLM parameters.
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true
# Free, no-login hosted model. anonymous_enabled lists it in the public
# no-login catalog and seo_enabled generates its /free/<seo_slug> landing
# page. Free-tier models do not debit premium credits.
- id: -3
  name: "Azure GPT 5.4 Mini"
  billing_tier: "free"
  anonymous_enabled: true
  seo_enabled: true
  # Stable public URL slug; keep unique and do not change once public.
  seo_slug: "gpt-5-4-mini-no-login"
  seo_title: "Free GPT 5.4 Mini Chat"
  seo_description: "Chat with a hosted GPT 5.4 Mini model without signing in."
  quota_reserve_tokens: 4000
  provider: "azure"
  # Bare Azure deployment name; do not add a provider prefix here.
  model_name: "gpt-5.4-mini"
  supports_image_input: false
  supports_tools: true
  max_input_tokens: 128000
  api_key: "your-azure-api-key-here"
  api_base: "https://your-resource.openai.azure.com"
  rpm: 15000
  tpm: 15000000
  litellm_params:
    max_tokens: 16384
    base_model: "gpt-5.4-mini"
  # Prompt defaults are model-level fields, not LiteLLM parameters.
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true
# Planner LLM. This is operator-only and is not shown in the user-facing
# model selector. Only one global_llm_configs entry should set is_planner.
- id: -9
  name: "Azure GPT 5.x Nano Planner"
  is_planner: true
  billing_tier: "free"
  anonymous_enabled: false
  seo_enabled: false
  quota_reserve_tokens: 1000
  provider: "azure"
  # Bare Azure deployment name; do not add a provider prefix here.
  model_name: "gpt-5.4-nano"
  supports_image_input: false
  supports_tools: false
  # Keep the planner out of the shared Auto-mode router pools.
  router_pool_eligible: false
  api_key: "your-azure-api-key-here"
  api_base: "https://your-resource.openai.azure.com"
  rpm: 20000
  tpm: 4000000
  litellm_params:
    temperature: 0
    max_tokens: 1000
    base_model: "gpt-5.4-nano"
  # Prompt defaults are model-level fields, not LiteLLM parameters.
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: false
# =============================================================================
# OpenRouter Dynamic Model Integration
# =============================================================================
# When enabled, SurfSense fetches the OpenRouter catalog at startup and injects
# supported models as GLOBAL chat and optionally image-generation models.
# Tier is derived per model from OpenRouter data:
# - model id ends with ":free" -> billing_tier=free
# - prompt and completion pricing are zero -> billing_tier=free
# - otherwise -> billing_tier=premium
#
# Do not use deprecated openrouter_integration.billing_tier or
# openrouter_integration.anonymous_enabled. Use the tier-specific anonymous
# switches below.
openrouter_integration:
  enabled: false
  api_key: "sk-or-your-openrouter-api-key"
  # Tier-specific switches for exposing dynamic models in the no-login
  # catalog (paid-tier and free-tier models are controlled separately).
  anonymous_enabled_paid: false
  anonymous_enabled_free: false
  seo_enabled: false
  quota_reserve_tokens: 4000
  # Base negative ID namespace for dynamic chat models. IDs are derived
  # deterministically so they survive catalog churn. Do not overlap static IDs.
  id_offset: -10000
  # Separate base negative ID namespace for dynamic image-generation models.
  image_id_offset: -20000
  # How often to refresh the OpenRouter catalog. 0 means startup only.
  refresh_interval_hours: 24
  # Paid OpenRouter models may join curated router pools when eligible.
  rpm: 200
  tpm: 1000000
  # Free OpenRouter models are available for user-facing selection/pinning but
  # should be treated as a shared-account bucket, not normal router capacity.
  free_rpm: 20
  free_tpm: 100000
  # Image generation is opt-in to avoid injecting a large image catalog during
  # upgrades. Vision-capable chat models are represented with
  # supports_image_input: true.
  image_generation_enabled: false
  # NOTE(review): the comment above says vision is expressed through
  # supports_image_input on chat models; confirm this key is still read.
  vision_enabled: false
  litellm_params:
    max_tokens: 16384
  # Prompt defaults applied to the injected models; these sit beside
  # litellm_params, not inside it.
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true
# =============================================================================
# Image Generation Auto Mode Router Settings
# =============================================================================
# Router settings for image-generation Auto mode.
image_generation_router_settings:
  routing_strategy: "usage-based-routing"
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60
# =============================================================================
# Static GLOBAL Image Generation Models
# =============================================================================
global_image_generation_configs:
# Premium Azure image-generation model. Only rpm is set for rate limiting.
- id: -2001
  name: "Azure GPT Image 1.5"
  billing_tier: "premium"
  provider: "azure"
  # Bare Azure deployment name; do not add a provider prefix here.
  model_name: "gpt-image-1.5"
  api_key: "your-azure-api-key-here"
  api_base: "https://your-resource.openai.azure.com"
  # api_version: "2025-04-01-preview"
  rpm: 60
  litellm_params:
    base_model: "gpt-image-1.5"
# Free-tier Azure image-generation model. Only rpm is set for rate limiting.
- id: -2002
  name: "Azure GPT Image 1 Mini"
  billing_tier: "free"
  provider: "azure"
  # Bare Azure deployment name; do not add a provider prefix here.
  model_name: "gpt-image-1-mini"
  api_key: "your-azure-api-key-here"
  api_base: "https://your-resource.openai.azure.com"
  # api_version: "2025-04-01-preview"
  rpm: 120
  litellm_params:
    base_model: "gpt-image-1-mini"
# =============================================================================
# Field Notes
# =============================================================================
# Common chat/image fields:
# - provider: Canonical provider adapter name. Example: azure, openai,
# anthropic, openrouter, groq, bedrock.
# - model_name: Provider model or deployment id. For Azure, use the bare
# deployment name. The resolver prefixes LiteLLM model strings from provider.
# - api_base: Provider endpoint/root URL. For OpenAI-compatible providers, the
# resolver adds /v1 when needed.
# - api_version: Optional provider-specific API version, stored on the
# materialized connection extra metadata.
# - litellm_params: Passed to LiteLLM when invoking the model. Also used for
# base_model and inline pricing registration.
#
# Chat model fields:
# - supports_image_input: true when the chat model can consume image inputs.
# - supports_tools: true when the model can use tools/function calling.
# - max_input_tokens: Optional UI/catalog metadata for context size.
# - router_pool_eligible: false keeps a model out of shared router pools while
# still allowing direct selection/pinning.
# - is_planner: true marks the internal-only planner model. Only one config
# should set this flag.
#
# Catalog and access fields:
# - billing_tier: "free" or "premium".
# - anonymous_enabled: Whether the model appears in the public no-login catalog.
# - seo_enabled: Whether a /free/<seo_slug> landing page is generated.
# - seo_slug: Stable URL slug for SEO pages. Keep unique and do not change once
# public.
# - seo_title / seo_description: Optional SEO metadata overrides.
# - quota_reserve_tokens: Tokens reserved before each chat LLM call.
# - rpm / tpm: Optional rate limits for router accounting and load balancing.
#
# Image generation notes:
# - Image-generation configs use the same GLOBAL ID namespace as chat models.
# - Only RPM is relevant for most image-generation APIs.
# - The runtime uses litellm.aimage_generation().
# - Image billing currently uses billing_tier and model catalog metadata. Keep
# quota reserve tuning in code/catalog unless the materializer copies a YAML
# key for image quota reservation.