diff --git a/surfsense_backend/app/config/global_llm_config.example.yaml b/surfsense_backend/app/config/global_llm_config.example.yaml index c5b65fee0..8a1e603fe 100644 --- a/surfsense_backend/app/config/global_llm_config.example.yaml +++ b/surfsense_backend/app/config/global_llm_config.example.yaml @@ -1,366 +1,237 @@ # Global LLM Configuration # # SETUP INSTRUCTIONS: -# 1. For production: Copy this file to global_llm_config.yaml and add your real API keys -# 2. For testing: The system will use this example file automatically if global_llm_config.yaml doesn't exist +# 1. Copy this file to global_llm_config.yaml. +# 2. Replace placeholder credentials, endpoints, deployment names, and pricing +# with values from your own provider accounts. # -# NOTE: The example API keys below are placeholders and won't work. -# Replace them with your actual API keys to enable global configurations. +# This file is intentionally safe to commit. Do not put real API keys in this +# example file. # -# These configurations are materialized as server-owned GLOBAL connections/models -# and become available on the Models page. Users can choose hosted/global models -# or add their own BYOK/local connections. +# These YAML entries are materialized at startup as server-owned GLOBAL +# connections and models: # -# AUTO MODE (Recommended): -# - Auto mode (ID: 0) uses LiteLLM Router to automatically load balance across all global configs -# - This helps avoid rate limits by distributing requests across multiple providers -# - New users are automatically assigned Auto mode by default -# - Configure router_settings below to customize the load balancing behavior +# global_llm_configs -> GLOBAL chat models +# global_image_generation_configs -> GLOBAL image generation models +# +# Do not add global_connections or global_models sections here. They are +# runtime-derived metadata exposed through the model-connections APIs. 
# # Static config shape: -# - Connection fields: litellm_provider, api_key, api_base, api_version -# - Model fields: model_name, billing_tier, rpm/tpm, litellm_params -# - Prompt defaults: system_instructions, citations_enabled -# IDs share one GLOBAL model namespace across chat, vision, and image generation. -# Suggested ranges: chat -1..-999, vision -1001..-1999, image -2001..-2999. +# - Connection fields: provider, api_key, api_base, api_version +# - Model fields: model_name, billing_tier, rpm/tpm, capability flags (supports_*), litellm_params +# - Public no-login metadata: description, seo_title, seo_description +# - Prompt defaults: system_instructions, use_default_system_instructions, +# citations_enabled +# +# Provider notes: +# - Use the canonical provider key (formerly litellm_provider in this file). +# - For Azure, use the bare deployment name in model_name, for example +# model_name: "gpt-5.1". The resolver prefixes the LiteLLM model string from +# provider: "azure". +# +# GLOBAL ID namespace: +# - ID 0 is reserved for Auto mode. +# - Negative IDs are server-owned GLOBAL models. +# - Positive IDs are user/BYOK database models. +# - Keep static IDs unique across chat and image generation. +# - Suggested static ranges: chat -1..-999, image -2001..-2999. +# - Vision is not a separate config/table. Chat models that accept images use +# supports_image_input: true. # # COST-BASED PREMIUM CREDITS: -# Each premium config bills the user's USD-credit balance based on the -# actual provider cost reported by LiteLLM. For models LiteLLM already -# knows (most OpenAI/Anthropic/etc. names) you don't need to do anything. -# For custom Azure deployment names (e.g. an in-house "gpt-5.4" deployment) -# or any model LiteLLM doesn't have in its built-in pricing table, declare -# per-token costs inline so they bill correctly: +# Each premium model bills the user's USD-credit balance based on provider cost +# reported by LiteLLM. 
For custom Azure deployments or any model LiteLLM does +# not know, declare per-token costs inline: # # litellm_params: -# base_model: "my-custom-azure-deploy" -# # USD per token; e.g. 0.000003 == $3.00 per million input tokens -# input_cost_per_token: 0.000003 -# output_cost_per_token: 0.000015 +# base_model: "my-custom-deployment" +# # USD per token; 0.00000125 == $1.25 per million input tokens. +# input_cost_per_token: 0.00000125 +# output_cost_per_token: 0.00001 # -# OpenRouter dynamic models pull pricing automatically from OpenRouter's -# API — no inline declaration needed. Models without resolvable pricing -# debit $0 from the user's balance and log a WARNING. +# OpenRouter dynamic chat models pull pricing automatically from OpenRouter's +# API. Models without resolvable pricing debit $0 and log a warning. -# Router Settings for Auto Mode -# These settings control how the LiteLLM Router distributes requests across models +# ============================================================================= +# Chat Auto Mode Router Settings +# ============================================================================= +# These settings control how the LiteLLM Router distributes Auto-mode requests +# across curated router-eligible GLOBAL chat deployments. router_settings: # Routing strategy options: - # - "usage-based-routing": Routes to deployment with lowest current usage (recommended for rate limits) - # - "simple-shuffle": Random distribution with optional RPM/TPM weighting - # - "least-busy": Routes to least busy deployment - # - "latency-based-routing": Routes based on response latency + # - "usage-based-routing": Routes to deployment with lowest current usage. + # - "simple-shuffle": Random distribution with optional RPM/TPM weighting. + # - "least-busy": Routes to least busy deployment. + # - "latency-based-routing": Routes based on response latency. 
routing_strategy: "usage-based-routing" - - # Number of retries before failing num_retries: 3 - - # Number of failures allowed before cooling down a deployment allowed_fails: 3 - - # Cooldown time in seconds after allowed_fails is exceeded cooldown_time: 60 + # Optional fallback map: + # fallbacks: + # - {"azure/gpt-5.1": ["azure/gpt-5.4-mini"]} - # Fallback models (optional) - when primary fails, try these - # Format: [{"primary_model": ["fallback1", "fallback2"]}] - # fallbacks: [] - +# ============================================================================= +# Static GLOBAL Chat Models +# ============================================================================= global_llm_configs: - # Example: OpenAI GPT-4 Turbo with citations enabled + # Premium Azure chat model with image input support and explicit custom + # pricing. This is the current shape to use for hosted GPT 5.x deployments. - id: -1 - name: "Global GPT-4 Turbo" - description: "OpenAI's GPT-4 Turbo with default prompts and citations" - billing_tier: "free" - anonymous_enabled: true - seo_enabled: true - seo_slug: "gpt-4-turbo" + name: "Azure GPT 5.1" + billing_tier: "premium" + anonymous_enabled: false + seo_enabled: false + seo_slug: "azure-gpt-5-1" quota_reserve_tokens: 4000 - litellm_provider: "openai" - model_name: "gpt-4-turbo-preview" - api_key: "sk-your-openai-api-key-here" - api_base: "https://api.openai.com/v1" - # Rate limits for load balancing (requests/tokens per minute) - rpm: 500 # Requests per minute - tpm: 100000 # Tokens per minute + provider: "azure" + model_name: "gpt-5.1" + supports_image_input: true + supports_tools: true + max_input_tokens: 400000 + api_key: "your-azure-api-key-here" + api_base: "https://your-resource.openai.azure.com" + # api_version is optional. Include it if your Azure deployment requires a + # specific API version. 
+ # api_version: "2025-04-01-preview" + rpm: 47500 + tpm: 14750000 litellm_params: - temperature: 0.7 - max_tokens: 4000 - # Prompt Configuration - system_instructions: "" # Empty = use default SURFSENSE_SYSTEM_INSTRUCTIONS + max_tokens: 16384 + base_model: "gpt-5.1" + input_cost_per_token: 0.00000125 + output_cost_per_token: 0.00001 + system_instructions: "" use_default_system_instructions: true citations_enabled: true - # Example: Anthropic Claude 3 Opus + # Larger premium chat model. If your provider prices long-context traffic + # differently, choose a conservative flat price or document the limitation + # next to the inline pricing. - id: -2 - name: "Global Claude 3 Opus" - description: "Anthropic's most capable model with citations" - billing_tier: "free" - anonymous_enabled: true - seo_enabled: true - seo_slug: "claude-3-opus" + name: "Azure GPT 5.4" + billing_tier: "premium" + anonymous_enabled: false + seo_enabled: false + seo_slug: "azure-gpt-5-4" quota_reserve_tokens: 4000 - litellm_provider: "anthropic" - model_name: "claude-3-opus-20240229" - api_key: "sk-ant-your-anthropic-api-key-here" - api_base: "https://api.anthropic.com/v1" - rpm: 1000 - tpm: 100000 + provider: "azure" + model_name: "gpt-5.4" + supports_image_input: true + supports_tools: true + max_input_tokens: 400000 + api_key: "your-azure-api-key-here" + api_base: "https://your-resource.openai.azure.com" + rpm: 150000 + tpm: 15000000 litellm_params: - temperature: 0.7 - max_tokens: 4000 + max_tokens: 16384 + base_model: "gpt-5.4" + input_cost_per_token: 0.0000025 + output_cost_per_token: 0.000015 system_instructions: "" use_default_system_instructions: true citations_enabled: true - # Example: Fast model - GPT-3.5 Turbo (citations disabled for speed) + # Free/no-login hosted model. Free models never debit premium credits; + # anonymous_enabled lists it in the no-login catalog; seo_enabled adds a /free/ page. 
- id: -3 - name: "Global GPT-3.5 Turbo (Fast)" - description: "Fast responses without citations for quick queries" + name: "Azure GPT 5.4 Mini" + description: "Free hosted Azure GPT 5.4 Mini deployment" billing_tier: "free" anonymous_enabled: true seo_enabled: true - seo_slug: "gpt-3.5-turbo-fast" - quota_reserve_tokens: 2000 - litellm_provider: "openai" - model_name: "gpt-3.5-turbo" - api_key: "sk-your-openai-api-key-here" - api_base: "https://api.openai.com/v1" - rpm: 3500 # GPT-3.5 has higher rate limits - tpm: 200000 - litellm_params: - temperature: 0.5 - max_tokens: 2000 - system_instructions: "" - use_default_system_instructions: true - citations_enabled: false # Disabled for faster responses - - # Example: Chinese LLM - DeepSeek with custom instructions - - id: -4 - name: "Global DeepSeek Chat (Chinese)" - description: "DeepSeek optimized for Chinese language responses" - billing_tier: "free" - anonymous_enabled: true - seo_enabled: true - seo_slug: "deepseek-chat-chinese" + seo_slug: "gpt-5-4-mini-no-login" + seo_title: "Free GPT 5.4 Mini Chat" + seo_description: "Chat with a hosted GPT 5.4 Mini model without signing in." quota_reserve_tokens: 4000 - litellm_provider: "openai" - model_name: "deepseek-chat" - api_key: "your-deepseek-api-key-here" - api_base: "https://api.deepseek.com/v1" - rpm: 60 - tpm: 100000 - litellm_params: - temperature: 0.7 - max_tokens: 4000 - # Custom system instructions for Chinese responses - system_instructions: | - - You are SurfSense, a reasoning and acting AI agent designed to answer user questions using the user's personal knowledge base. - - Today's date (UTC): {resolved_today} - - IMPORTANT: Please respond in Chinese (简体中文) unless the user specifically requests another language. 
- - use_default_system_instructions: false - citations_enabled: true - - # Example: Azure OpenAI GPT-4o - # IMPORTANT: For Azure deployments, always include 'base_model' in litellm_params - # to enable accurate token counting, cost tracking, and max token limits - - id: -5 - name: "Global Azure GPT-4o" - description: "Azure OpenAI GPT-4o deployment" - billing_tier: "free" - anonymous_enabled: true - seo_enabled: true - seo_slug: "azure-gpt-4o" - quota_reserve_tokens: 4000 - litellm_provider: "azure" - # model_name format for Azure: azure/ - model_name: "azure/gpt-4o-deployment" + provider: "azure" + model_name: "gpt-5.4-mini" + supports_image_input: false + supports_tools: true + max_input_tokens: 128000 api_key: "your-azure-api-key-here" api_base: "https://your-resource.openai.azure.com" - api_version: "2024-02-15-preview" # Azure API version - rpm: 1000 - tpm: 150000 + rpm: 15000 + tpm: 15000000 litellm_params: - temperature: 0.7 - max_tokens: 4000 - # REQUIRED for Azure: Specify the underlying OpenAI model - # This fixes "Could not identify azure model" warnings - # Common base_model values: gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini, gpt-3.5-turbo - base_model: "gpt-4o" + max_tokens: 16384 + base_model: "gpt-5.4-mini" system_instructions: "" use_default_system_instructions: true citations_enabled: true - # Example: Azure OpenAI GPT-4 Turbo - - id: -6 - name: "Global Azure GPT-4 Turbo" - description: "Azure OpenAI GPT-4 Turbo deployment" - billing_tier: "free" - anonymous_enabled: true - seo_enabled: true - seo_slug: "azure-gpt-4-turbo" - quota_reserve_tokens: 4000 - litellm_provider: "azure" - model_name: "azure/gpt-4-turbo-deployment" - api_key: "your-azure-api-key-here" - api_base: "https://your-resource.openai.azure.com" - api_version: "2024-02-15-preview" - rpm: 500 - tpm: 100000 - litellm_params: - temperature: 0.7 - max_tokens: 4000 - base_model: "gpt-4-turbo" # Maps to gpt-4-turbo-preview - system_instructions: "" - use_default_system_instructions: true - 
citations_enabled: true - - # Example: Groq - Fast inference - - id: -7 - name: "Global Groq Llama 3" - description: "Ultra-fast Llama 3 70B via Groq" - billing_tier: "free" - anonymous_enabled: true - seo_enabled: true - seo_slug: "groq-llama-3" - quota_reserve_tokens: 8000 - litellm_provider: "groq" - model_name: "llama3-70b-8192" - api_key: "your-groq-api-key-here" - api_base: "https://api.groq.com/openai/v1" - rpm: 30 # Groq has lower rate limits on free tier - tpm: 14400 - litellm_params: - temperature: 0.7 - max_tokens: 8000 - system_instructions: "" - use_default_system_instructions: true - citations_enabled: true - - # Example: MiniMax M3 - High-performance with 512K context window - - id: -8 - name: "Global MiniMax M3" - description: "MiniMax M3 with 512K context window and competitive pricing" - billing_tier: "free" - anonymous_enabled: true - seo_enabled: true - seo_slug: "minimax-m3" - quota_reserve_tokens: 4000 - litellm_provider: "openai" - model_name: "MiniMax-M3" - api_key: "your-minimax-api-key-here" - api_base: "https://api.minimax.io/v1" - rpm: 60 - tpm: 100000 - litellm_params: - temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0 - max_tokens: 4000 - system_instructions: "" - use_default_system_instructions: true - citations_enabled: true - - # Example: Planner LLM - small, fast model used for internal utility tasks - # - # The PLANNER role handles short, structured internal calls (KB query - # rewriting, date extraction, recency classification, etc.) that don't - # need frontier-tier capability. Pointing the planner at a cheap+fast - # model (gpt-4o-mini, Claude Haiku, Azure gpt-5.x-nano, Groq Llama, ...) - # typically saves 500ms-1.5s per turn vs. routing those same internal - # calls through the user's chat model. - # - # Activation: - # - Mark EXACTLY ONE global config with ``is_planner: true``. - # - If multiple are marked, the first one wins and a WARNING is logged. 
- # - If none is marked, every internal call falls back to the user's - # chat LLM (same behavior as before this flag existed). - # - # This config is operator-only — it is NOT exposed in the user-facing - # model selector, never billed against premium quota, and the - # billing_tier / anonymous_enabled fields below are ignored. + # Planner LLM. This is operator-only and is not shown in the user-facing + # model selector. Only one global_llm_configs entry should set is_planner. - id: -9 - name: "Global Planner (GPT-4o mini)" - description: "Internal-only planner LLM for query rewriting and classification" + name: "Azure GPT 5.x Nano Planner" is_planner: true billing_tier: "free" anonymous_enabled: false seo_enabled: false quota_reserve_tokens: 1000 - litellm_provider: "openai" - model_name: "gpt-4o-mini" - api_key: "sk-your-openai-api-key-here" - api_base: "https://api.openai.com/v1" - rpm: 3500 - tpm: 200000 + provider: "azure" + model_name: "gpt-5.4-nano" + supports_image_input: false + supports_tools: false + router_pool_eligible: false + api_key: "your-azure-api-key-here" + api_base: "https://your-resource.openai.azure.com" + rpm: 20000 + tpm: 4000000 litellm_params: temperature: 0 max_tokens: 1000 + base_model: "gpt-5.4-nano" system_instructions: "" use_default_system_instructions: true citations_enabled: false # ============================================================================= -# OpenRouter Integration +# OpenRouter Dynamic Model Integration # ============================================================================= -# When enabled, dynamically fetches ALL available models from the OpenRouter API -# and injects them as global configs. This gives premium users access to any model -# on OpenRouter (Claude, Gemini, Llama, Mistral, etc.) via their premium token quota, -# while free-tier OpenRouter models show up with a green Free badge and do NOT -# consume premium quota. 
-# Models are fetched at startup and refreshed periodically in the background. -# All calls go through LiteLLM with the openrouter/ prefix. +# When enabled, SurfSense fetches the OpenRouter catalog at startup and injects +# supported models as GLOBAL chat and optionally image-generation models. +# Tier is derived per model from OpenRouter data: +# - model id ends with ":free" -> billing_tier=free +# - prompt and completion pricing are zero -> billing_tier=free +# - otherwise -> billing_tier=premium +# +# Do not use deprecated openrouter_integration.billing_tier or +# openrouter_integration.anonymous_enabled. Use the tier-specific anonymous +# switches below. openrouter_integration: enabled: false api_key: "sk-or-your-openrouter-api-key" - # Tier is derived PER MODEL from OpenRouter's own API signals: - # - id ends with ":free" -> billing_tier=free - # - pricing.prompt AND pricing.completion == "0" -> billing_tier=free - # - otherwise -> billing_tier=premium - # No global billing_tier knob is honored; any legacy value emits a startup warning. - - # Anonymous access is split by tier so operators can expose only free - # models to no-login users without leaking paid inference. anonymous_enabled_paid: false anonymous_enabled_free: false - seo_enabled: false - # quota_reserve_tokens: tokens reserved per call for quota enforcement quota_reserve_tokens: 4000 - # id_offset: base negative ID for dynamically generated configs. - # Model IDs are derived deterministically via BLAKE2b so they survive - # catalogue churn. Must not overlap with any static GLOBAL model IDs. + + # Base negative ID namespace for dynamic chat models. IDs are derived + # deterministically so they survive catalog churn. Do not overlap static IDs. id_offset: -10000 - # refresh_interval_hours: how often to re-fetch models from OpenRouter (0 = startup only) + + # Separate base negative ID namespace for dynamic image-generation models. 
+ image_id_offset: -20000 + + # How often to refresh the OpenRouter catalog. 0 means startup only. refresh_interval_hours: 24 - # Rate limits for PAID OpenRouter models. These are used by LiteLLM Router - # for per-deployment accounting when OR premium models participate in the - # shared sub-agent "auto" pool. They do NOT cap OpenRouter itself — your - # real account limits live at https://openrouter.ai/settings/limits. + # Paid OpenRouter models may join curated router pools when eligible. rpm: 200 tpm: 1000000 - # Rate limits for FREE OpenRouter models. Informational only: free OR - # models are intentionally kept OUT of the LiteLLM Router pool, because - # OpenRouter enforces free-tier limits globally per account (~20 RPM + - # 50-1000 daily requests across every ":free" model combined) — - # per-deployment router accounting can't represent a shared bucket - # correctly. Free OR models stay fully available in the model selector - # and for user-facing Auto thread pinning. + # Free OpenRouter models are available for user-facing selection/pinning but + # should be treated as a shared-account bucket, not normal router capacity. free_rpm: 20 free_tpm: 100000 - # Image generation + vision LLM emission are OPT-IN. OpenRouter's catalogue - # contains hundreds of image- and vision-capable models; turning these on - # injects them into the global image-generation / vision model lists - # alongside any static configs. Tier (free/premium) is derived - # per model the same way it is for chat (`:free` suffix or zero pricing). - # When a user picks a premium image/vision model the call debits the - # shared $5 USD-cost-based premium credit pool — so leaving these off - # avoids surprise quota burn on existing deployments. Default: false. + # Image generation is opt-in to avoid injecting a large image catalog during + # upgrades. Vision-capable chat models are represented with + # supports_image_input: true. 
image_generation_enabled: false vision_enabled: false @@ -371,116 +242,83 @@ openrouter_integration: citations_enabled: true # ============================================================================= -# Image Generation Configuration +# Image Generation Auto Mode Router Settings # ============================================================================= -# These configurations power the image generation feature using litellm.aimage_generation(). -# Supported providers: OpenAI, Azure, Google AI Studio, Vertex AI, AWS Bedrock, -# Recraft, OpenRouter, Xinference, Nscale -# -# Auto mode (ID 0) uses LiteLLM Router for load balancing across all image gen configs. - -# Router Settings for Image Generation Auto Mode image_generation_router_settings: routing_strategy: "usage-based-routing" num_retries: 3 allowed_fails: 3 cooldown_time: 60 +# ============================================================================= +# Static GLOBAL Image Generation Models +# ============================================================================= global_image_generation_configs: - # Example: OpenAI DALL-E 3 - id: -2001 - name: "Global DALL-E 3" - description: "OpenAI's DALL-E 3 for high-quality image generation" - litellm_provider: "openai" - model_name: "dall-e-3" - api_key: "sk-your-openai-api-key-here" - api_base: "https://api.openai.com/v1" - rpm: 50 # Requests per minute (image gen is rate-limited by RPM, not tokens) - litellm_params: {} - - # Example: OpenAI GPT Image 1 - - id: -2002 - name: "Global GPT Image 1" - description: "OpenAI's GPT Image 1 model" - litellm_provider: "openai" - model_name: "gpt-image-1" - api_key: "sk-your-openai-api-key-here" - api_base: "https://api.openai.com/v1" - rpm: 50 - litellm_params: {} - - # Example: Azure OpenAI DALL-E 3 - - id: -2003 - name: "Global Azure DALL-E 3" - description: "Azure-hosted DALL-E 3 deployment" - litellm_provider: "azure" - model_name: "azure/dall-e-3-deployment" + name: "Azure GPT Image 1.5" + billing_tier: 
"premium" + provider: "azure" + model_name: "gpt-image-1.5" api_key: "your-azure-api-key-here" api_base: "https://your-resource.openai.azure.com" - api_version: "2024-02-15-preview" - rpm: 50 + # api_version: "2025-04-01-preview" + rpm: 60 litellm_params: - base_model: "dall-e-3" + base_model: "gpt-image-1.5" - # Example: OpenRouter Gemini Image Generation - # - id: -2004 - # name: "Global Gemini Image Gen" - # description: "Google Gemini image generation via OpenRouter" - # litellm_provider: "openrouter" - # model_name: "google/gemini-2.5-flash-image" - # api_key: "your-openrouter-api-key-here" - # api_base: "https://openrouter.ai/api/v1" - # rpm: 30 - # litellm_params: {} + - id: -2002 + name: "Azure GPT Image 1 Mini" + billing_tier: "free" + provider: "azure" + model_name: "gpt-image-1-mini" + api_key: "your-azure-api-key-here" + api_base: "https://your-resource.openai.azure.com" + # api_version: "2025-04-01-preview" + rpm: 120 + litellm_params: + base_model: "gpt-image-1-mini" -# Notes: -# - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing -# - Use negative IDs to distinguish global models from BYOK/local DB models -# - IDs must be unique across chat and image generation configs -# - Suggested static ranges: chat -1..-999, image -2001..-2999 -# - The 'api_key' field will not be exposed to users via API -# - system_instructions: Custom prompt or empty string to use defaults -# - use_default_system_instructions: true = use SURFSENSE_SYSTEM_INSTRUCTIONS when system_instructions is empty -# - citations_enabled: true = include citation instructions, false = include anti-citation instructions -# - All standard LiteLLM provider adapter names are supported -# - rpm/tpm: Optional rate limits for load balancing (requests/tokens per minute) -# These help the router distribute load evenly and avoid rate limit errors +# ============================================================================= +# Field Notes +# 
============================================================================= +# Common chat/image fields: +# - provider: Canonical provider adapter name. Example: azure, openai, +# anthropic, openrouter, groq, bedrock. +# - model_name: Provider model or deployment id. For Azure, use the bare +# deployment name. The resolver prefixes LiteLLM model strings from provider. +# - api_base: Provider endpoint/root URL. For OpenAI-compatible providers, the +# resolver adds /v1 when needed. +# - api_version: Optional provider-specific API version, stored on the +# materialized connection extra metadata. +# - litellm_params: Passed to LiteLLM when invoking the model. Also used for +# base_model and inline pricing registration. # +# Chat model fields: +# - supports_image_input: true when the chat model can consume image inputs. +# - supports_tools: true when the model can use tools/function calling. +# - max_input_tokens: Optional UI/catalog metadata for context size. +# - router_pool_eligible: false keeps a model out of shared router pools while +# still allowing direct selection/pinning. +# - is_planner: true marks the internal-only planner model. Only one config +# should set this flag. # -# IMAGE GENERATION NOTES: -# - Image generation configs use the shared GLOBAL ID namespace -# - Supported models: dall-e-2, dall-e-3, gpt-image-1 (OpenAI), azure/* (Azure), -# bedrock/* (AWS), vertex_ai/* (Google), recraft/* (Recraft), openrouter/* (OpenRouter) -# - The router uses litellm.aimage_generation() for async image generation -# - Only RPM (requests per minute) is relevant for image generation rate limiting. -# TPM (tokens per minute) does not apply since image APIs are billed/rate-limited per request, not per token. +# Catalog and access fields: +# - billing_tier: "free" or "premium". +# - anonymous_enabled: Whether the model appears in the public no-login catalog. +# - description: Optional no-login UI copy for anonymous-enabled chat models. 
+# This is not materialized into GLOBAL model metadata and is ignored by image +# generation configs. +# - seo_enabled: Whether a /free/ landing page is generated. +# - seo_slug: Stable URL slug for SEO pages. Keep unique and do not change once +# public. +# - seo_title / seo_description: Optional SEO metadata overrides. +# - quota_reserve_tokens: Tokens reserved before each chat LLM call. +# - rpm / tpm: Optional rate limits for router accounting and load balancing. # -# VISION LLM NOTES: -# - Vision configs use the shared GLOBAL ID namespace -# - Only use vision-capable models (GPT-4o, Gemini, Claude 3, etc.) -# - Lower temperature (0.3) is recommended for accurate screenshot analysis -# - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions -# -# PLANNER LLM NOTES: -# - is_planner: true marks a config as the internal-only planner LLM (small, -# fast model used for KB query rewriting, date extraction, recency -# classification, etc.). Only one config may carry this flag — if -# multiple do, the first one wins and a startup WARNING is logged. -# - When no config is marked is_planner, every internal utility call falls -# back to the user's chat LLM (the historical behavior). -# - Planner configs are NOT shown in the user-facing model selector and -# are NOT billed against the user's premium quota. Their billing_tier, -# anonymous_enabled, seo_* fields are ignored. -# - Recommended models: gpt-4o-mini, claude-3-5-haiku, gemini-1.5-flash, -# azure gpt-5.x-nano, groq llama3-8b — anything <200ms p50 on a 1-2k -# prompt. Frontier models here defeat the purpose of the flag. -# -# TOKEN QUOTA & ANONYMOUS ACCESS NOTES: -# - billing_tier: "free" or "premium". Controls whether registered users need premium token quota. -# - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog. -# - seo_enabled: true/false. Whether a /free/ landing page is generated. -# - seo_slug: Stable URL slug for SEO pages. Must be unique. 
Do NOT change once public. -# - seo_title: Optional HTML title tag override for the model's /free/ page. -# - seo_description: Optional meta description override for the model's /free/ page. -# - quota_reserve_tokens: Tokens reserved before each LLM call for quota enforcement. -# Independent of litellm_params.max_tokens. Used by the token quota service. +# Image generation notes: +# - Image-generation configs use the same GLOBAL ID namespace as chat models. +# - Only RPM is relevant for most image-generation APIs. +# - The runtime uses litellm.aimage_generation(). +# - Image billing currently uses billing_tier and model catalog metadata. Keep +# quota reserve tuning in code/catalog unless the materializer copies a YAML +# key for image quota reservation.