# Global LLM Configuration
#
# SETUP INSTRUCTIONS:
# 1. Copy this file to global_llm_config.yaml.
# 2. Replace placeholder credentials, endpoints, deployment names, and pricing
#    with values from your own provider accounts.
#
# This file is intentionally safe to commit. Do not put real API keys in this
# example file.
#
# These YAML entries are materialized at startup as server-owned GLOBAL
# connections and models:
#
#   global_llm_configs              -> GLOBAL chat models
#   global_image_generation_configs -> GLOBAL image generation models
#
# Do not add global_connections or global_models sections here. They are
# runtime-derived metadata exposed through the model-connections APIs.
#
# Static config shape:
# - Connection fields: provider, api_key, api_base, api_version
# - Model fields: model_name, billing_tier, rpm/tpm, capabilities, litellm_params
# - Public no-login SEO metadata: seo_title, seo_description
# - Prompt defaults: system_instructions, use_default_system_instructions,
#   citations_enabled
#
# Provider notes:
# - Use the canonical provider field.
# - For Azure, use the bare deployment name in model_name, for example
#   model_name: "gpt-5.1". The resolver builds the LiteLLM model string (for
#   example "azure/gpt-5.1") by prefixing it with provider: "azure".
#
# GLOBAL ID namespace:
# - ID 0 is reserved for Auto mode.
# - Negative IDs are server-owned GLOBAL models.
# - Positive IDs are user/BYOK database models.
# - Keep static IDs unique across chat and image generation.
# - Suggested static ranges: chat -1..-999, image -2001..-2999.
# - Vision is not a separate config/table. Chat models that accept images use
#   supports_image_input: true.
#
# COST-BASED PREMIUM CREDITS:
# Each premium model bills the user's USD-credit balance based on provider cost
# reported by LiteLLM. For custom Azure deployments or any model LiteLLM does
# not know, declare per-token costs inline:
#
#   litellm_params:
#     base_model: "my-custom-deployment"
#     # USD per token; 0.00000125 == $1.25 per million input tokens.
#     input_cost_per_token: 0.00000125
#     output_cost_per_token: 0.00001
#
# OpenRouter dynamic chat models pull pricing automatically from OpenRouter's
# API. Models without resolvable pricing debit $0 and log a warning.

# =============================================================================
# Chat Auto Mode Router Settings
# =============================================================================
# Governs how the LiteLLM Router spreads Auto-mode traffic over the curated,
# router-eligible GLOBAL chat deployments.
router_settings:
  # Available routing strategies:
  # - "usage-based-routing": pick the deployment with the lowest current usage.
  # - "simple-shuffle": random distribution, optionally weighted by RPM/TPM.
  # - "least-busy": pick the least busy deployment.
  # - "latency-based-routing": pick based on response latency.
  routing_strategy: "usage-based-routing"
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60
  # Optional fallback map (uncomment to enable):
  # fallbacks:
  #   - {"azure/gpt-5.1": ["azure/gpt-5.4-mini"]}

# =============================================================================
# Static GLOBAL Chat Models
# =============================================================================
global_llm_configs:
  # Premium Azure chat model that accepts image input and declares its own
  # per-token pricing. Use this shape for hosted GPT 5.x deployments.
  - id: -1
    name: "Azure GPT 5.1"
    billing_tier: "premium"
    anonymous_enabled: false
    seo_enabled: false
    seo_slug: "azure-gpt-5-1"
    quota_reserve_tokens: 4000
    provider: "azure"
    model_name: "gpt-5.1"
    supports_image_input: true
    supports_tools: true
    max_input_tokens: 400000
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    # api_version is optional; set it only when your Azure deployment needs a
    # specific API version.
    # api_version: "2025-04-01-preview"
    rpm: 47500
    tpm: 14750000
    litellm_params:
      max_tokens: 16384
      base_model: "gpt-5.1"
      # USD per token (0.00000125 == $1.25 per million input tokens).
      input_cost_per_token: 0.00000125
      output_cost_per_token: 0.00001
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Larger premium chat model. When the provider charges long-context traffic
  # at a different rate, pick a conservative flat price or note the limitation
  # beside the inline pricing.
  - id: -2
    name: "Azure GPT 5.4"
    billing_tier: "premium"
    anonymous_enabled: false
    seo_enabled: false
    seo_slug: "azure-gpt-5-4"
    quota_reserve_tokens: 4000
    provider: "azure"
    model_name: "gpt-5.4"
    supports_image_input: true
    supports_tools: true
    max_input_tokens: 400000
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    rpm: 150000
    tpm: 15000000
    litellm_params:
      max_tokens: 16384
      base_model: "gpt-5.4"
      input_cost_per_token: 0.0000025
      output_cost_per_token: 0.000015
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Free, no-login hosted model. With anonymous_enabled/seo_enabled set to
  # true, free models are visible to users and never debit premium credits.
  - id: -3
    name: "Azure GPT 5.4 Mini"
    billing_tier: "free"
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "gpt-5-4-mini-no-login"
    seo_title: "Free GPT 5.4 Mini Chat"
    seo_description: "Chat with a hosted GPT 5.4 Mini model without signing in."
    quota_reserve_tokens: 4000
    provider: "azure"
    model_name: "gpt-5.4-mini"
    supports_image_input: false
    supports_tools: true
    max_input_tokens: 128000
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    rpm: 15000
    tpm: 15000000
    litellm_params:
      max_tokens: 16384
      base_model: "gpt-5.4-mini"
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: true

  # Planner LLM: operator-only, hidden from the user-facing model selector.
  # At most one global_llm_configs entry should set is_planner.
  - id: -9
    name: "Azure GPT 5.x Nano Planner"
    is_planner: true
    billing_tier: "free"
    anonymous_enabled: false
    seo_enabled: false
    quota_reserve_tokens: 1000
    provider: "azure"
    model_name: "gpt-5.4-nano"
    supports_image_input: false
    supports_tools: false
    router_pool_eligible: false
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    rpm: 20000
    tpm: 4000000
    litellm_params:
      temperature: 0
      max_tokens: 1000
      base_model: "gpt-5.4-nano"
    system_instructions: ""
    use_default_system_instructions: true
    citations_enabled: false

# =============================================================================
# OpenRouter Dynamic Model Integration
# =============================================================================
# When enabled, SurfSense fetches the OpenRouter catalog at startup and injects
# supported models as GLOBAL chat models and, optionally, image-generation
# models. The billing tier is derived per model from OpenRouter data:
# - model id ends with ":free"                -> billing_tier=free
# - prompt and completion pricing are zero    -> billing_tier=free
# - otherwise                                 -> billing_tier=premium
#
# Do not use the deprecated openrouter_integration.billing_tier or
# openrouter_integration.anonymous_enabled keys. Use the tier-specific
# anonymous switches below instead.
openrouter_integration:
  enabled: false
  api_key: "sk-or-your-openrouter-api-key"
  anonymous_enabled_paid: false
  anonymous_enabled_free: false
  seo_enabled: false
  quota_reserve_tokens: 4000
  # Base of the negative ID namespace for dynamic chat models. IDs are derived
  # deterministically so they survive catalog churn. Must not overlap the
  # static IDs.
  id_offset: -10000
  # Separate negative ID namespace base for dynamic image-generation models.
  image_id_offset: -20000
  # Catalog refresh cadence in hours. 0 means startup only.
  refresh_interval_hours: 24
  # Paid OpenRouter models may join curated router pools when eligible.
  rpm: 200
  tpm: 1000000
  # Free OpenRouter models can be selected/pinned by users, but should be
  # treated as a shared-account bucket rather than normal router capacity.
  free_rpm: 20
  free_tpm: 100000
  # Image generation is opt-in so upgrades do not inject a large image
  # catalog. Vision-capable chat models are represented with
  # supports_image_input: true.
  image_generation_enabled: false
  # NOTE(review): this file describes vision via supports_image_input only;
  # confirm the loader still reads vision_enabled before relying on it.
  vision_enabled: false
  litellm_params:
    max_tokens: 16384
  system_instructions: ""
  use_default_system_instructions: true
  citations_enabled: true

# =============================================================================
# Image Generation Auto Mode Router Settings
# =============================================================================
image_generation_router_settings:
  routing_strategy: "usage-based-routing"
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60

# =============================================================================
# Static GLOBAL Image Generation Models
# =============================================================================
global_image_generation_configs:
  - id: -2001
    name: "Azure GPT Image 1.5"
    billing_tier: "premium"
    provider: "azure"
    model_name: "gpt-image-1.5"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    # api_version: "2025-04-01-preview"
    rpm: 60
    litellm_params:
      base_model: "gpt-image-1.5"

  - id: -2002
    name: "Azure GPT Image 1 Mini"
    billing_tier: "free"
    provider: "azure"
    model_name: "gpt-image-1-mini"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    # api_version: "2025-04-01-preview"
    rpm: 120
    litellm_params:
      base_model: "gpt-image-1-mini"

# =============================================================================
# Field Notes
# =============================================================================
# Common chat/image fields:
# - provider: Canonical provider adapter name. Example: azure, openai,
#   anthropic, openrouter, groq, bedrock.
# - model_name: Provider model or deployment id. For Azure, use the bare
#   deployment name. The resolver builds the LiteLLM model string by prefixing
#   it with the provider.
# - api_base: Provider endpoint/root URL. For OpenAI-compatible providers, the
#   resolver adds /v1 when needed.
# - api_version: Optional provider-specific API version, stored on the
#   materialized connection extra metadata.
# - litellm_params: Passed to LiteLLM when invoking the model. Also used for
#   base_model and inline pricing registration.
#
# Chat model fields:
# - supports_image_input: true when the chat model can consume image inputs.
# - supports_tools: true when the model can use tools/function calling.
# - max_input_tokens: Optional UI/catalog metadata for context size.
# - router_pool_eligible: false keeps a model out of shared router pools while
#   still allowing direct selection/pinning.
# - is_planner: true marks the internal-only planner model. Only one config
#   should set this flag.
#
# Catalog and access fields:
# - billing_tier: "free" or "premium".
# - anonymous_enabled: Whether the model appears in the public no-login catalog.
# - seo_enabled: Whether a /free/ landing page is generated.
# - seo_slug: Stable URL slug for SEO pages. Keep unique and do not change once
#   public.
# - seo_title / seo_description: Optional SEO metadata overrides.
# - quota_reserve_tokens: Tokens reserved before each chat LLM call.
# - rpm / tpm: Optional rate limits for router accounting and load balancing.
#
# Image generation notes:
# - Image-generation configs use the same GLOBAL ID namespace as chat models.
# - Only RPM is relevant for most image-generation APIs.
# - The runtime uses litellm.aimage_generation().
# - Image billing currently uses billing_tier and model catalog metadata. Do
#   not add an image quota-reserve key here; keep that tuning in code/catalog
#   unless the materializer copies a YAML key for image quota reservation.