# SurfSense/surfsense_backend/app/config/global_llm_config.example.yaml

# Global LLM Configuration
#
# SETUP INSTRUCTIONS:
# 1. Copy this file to global_llm_config.yaml.
# 2. Replace placeholder credentials, endpoints, deployment names, and pricing
#    with values from your own provider accounts.
#
# This file is intentionally safe to commit. Do not put real API keys in this
# example file.
#
# These YAML entries are materialized at startup as server-owned GLOBAL
# connections and models:
#
#   global_llm_configs              -> GLOBAL chat models
#   global_image_generation_configs -> GLOBAL image generation models
#
# Do not add global_connections or global_models sections here. They are
# runtime-derived metadata exposed through the model-connections APIs.
#
# Static config shape:
# - Connection fields: provider, api_key, api_base, api_version
# - Model fields: model_name, billing_tier, rpm/tpm, capabilities, litellm_params
# - Public no-login SEO metadata: seo_title, seo_description
# - Prompt defaults: system_instructions, use_default_system_instructions,
#   citations_enabled
#
# Provider notes:
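# - Set provider to the canonical provider adapter name (for example azure,
#   openai, anthropic, openrouter, groq, bedrock).
# - For Azure, put the bare deployment name in model_name, for example
#   model_name: "gpt-5.1". Do not add a provider prefix yourself; the resolver
#   derives the prefixed LiteLLM model string from provider: "azure".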
#
# GLOBAL ID namespace:
# - ID 0 is reserved for Auto mode.
# - Negative IDs are server-owned GLOBAL models.
# - Positive IDs are user/BYOK database models.
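# - Keep static IDs unique across chat and image generation.
# - Suggested static ranges: chat -1..-999, image -2001..-2999. Keep them
#   clear of the OpenRouter dynamic ID bases (openrouter_integration.id_offset
#   and openrouter_integration.image_id_offset).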
# - Vision is not a separate config/table. Chat models that accept images use
#   supports_image_input: true.
#
# COST-BASED PREMIUM CREDITS:
# Each premium model bills the user's USD-credit balance based on provider cost
# reported by LiteLLM. For custom Azure deployments or any model LiteLLM does
# not know, declare per-token costs inline:
#
#   litellm_params:
#     base_model: "my-custom-deployment"
#     # USD per token: 0.00000125 == $1.25 per million input tokens,
#     # 0.00001 == $10 per million output tokens.
#     input_cost_per_token: 0.00000125
#     output_cost_per_token: 0.00001
#
# OpenRouter dynamic chat models pull pricing automatically from OpenRouter's
# API. Models without resolvable pricing debit $0 and log a warning.

# =============================================================================
# Chat Auto Mode Router Settings
# =============================================================================
# These settings control how the LiteLLM Router distributes Auto-mode requests
# across curated router-eligible GLOBAL chat deployments.
router_settings:
  # Retry and failure handling for Auto-mode requests. Exact semantics and
  # units are defined by the LiteLLM Router; confirm against its documentation
  # before tuning.
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60

  # Strategy used to pick a deployment. Supported values:
  #   "usage-based-routing"   - deployment with the lowest current usage
  #   "simple-shuffle"        - random distribution, optional RPM/TPM weighting
  #   "least-busy"            - least busy deployment
  #   "latency-based-routing" - based on response latency
  routing_strategy: "usage-based-routing"

  # Optional fallback map (uncomment to enable):
  # fallbacks:
  #   - {"azure/gpt-5.1": ["azure/gpt-5.4-mini"]}

# =============================================================================
# Static GLOBAL Chat Models
# =============================================================================
global_llm_configs:
  # Premium Azure chat model that accepts image input and declares its own
  # per-token pricing inline. Use this shape for hosted GPT 5.x deployments.
  - id: -1
    name: "Azure GPT 5.1"

    # Connection
    provider: "azure"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    # api_version is optional; set it only if your Azure deployment requires
    # a specific API version.
    # api_version: "2025-04-01-preview"

    # Model: identity, tier, rate limits, capabilities
    model_name: "gpt-5.1"
    billing_tier: "premium"
    rpm: 47500
    tpm: 14750000
    supports_image_input: true
    supports_tools: true
    max_input_tokens: 400000
    quota_reserve_tokens: 4000
    litellm_params:
      base_model: "gpt-5.1"
      max_tokens: 16384
      # Inline pricing in USD per token: $1.25 per million input tokens,
      # $10 per million output tokens.
      input_cost_per_token: 0.00000125
      output_cost_per_token: 0.00001

    # Catalog / SEO: not in the no-login catalog, no landing page
    anonymous_enabled: false
    seo_enabled: false
    seo_slug: "azure-gpt-5-1"

    # Prompt defaults
    use_default_system_instructions: true
    system_instructions: ""
    citations_enabled: true

  # Larger premium chat model. The inline pricing below is a single flat rate;
  # if your provider prices long-context traffic differently, pick a
  # conservative flat price or document the limitation next to the pricing.
  - id: -2
    name: "Azure GPT 5.4"

    # Connection
    provider: "azure"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"

    # Model: identity, tier, rate limits, capabilities
    model_name: "gpt-5.4"
    billing_tier: "premium"
    rpm: 150000
    tpm: 15000000
    supports_image_input: true
    supports_tools: true
    max_input_tokens: 400000
    quota_reserve_tokens: 4000
    litellm_params:
      base_model: "gpt-5.4"
      max_tokens: 16384
      # Inline pricing in USD per token: $2.50 per million input tokens,
      # $15 per million output tokens.
      input_cost_per_token: 0.0000025
      output_cost_per_token: 0.000015

    # Catalog / SEO: not in the no-login catalog, no landing page
    anonymous_enabled: false
    seo_enabled: false
    seo_slug: "azure-gpt-5-4"

    # Prompt defaults
    use_default_system_instructions: true
    system_instructions: ""
    citations_enabled: true

  # Free, no-login hosted model. Free-tier models do not debit premium
  # credits. anonymous_enabled: true lists the model in the public no-login
  # catalog, and seo_enabled: true generates a /free/<seo_slug> landing page.
  - id: -3
    name: "Azure GPT 5.4 Mini"

    # Connection
    provider: "azure"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"

    # Model: identity, tier, rate limits, capabilities
    model_name: "gpt-5.4-mini"
    billing_tier: "free"
    rpm: 15000
    tpm: 15000000
    supports_image_input: false
    supports_tools: true
    max_input_tokens: 128000
    quota_reserve_tokens: 4000
    litellm_params:
      base_model: "gpt-5.4-mini"
      max_tokens: 16384

    # Catalog / SEO: public no-login access with a landing page
    anonymous_enabled: true
    seo_enabled: true
    seo_slug: "gpt-5-4-mini-no-login"
    seo_title: "Free GPT 5.4 Mini Chat"
    seo_description: "Chat with a hosted GPT 5.4 Mini model without signing in."

    # Prompt defaults
    use_default_system_instructions: true
    system_instructions: ""
    citations_enabled: true

  # Planner LLM. Operator-only: it is not shown in the user-facing model
  # selector. At most one global_llm_configs entry should set is_planner.
  - id: -9
    name: "Azure GPT 5.x Nano Planner"
    is_planner: true

    # Connection
    provider: "azure"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"

    # Model: identity, tier, rate limits, capabilities
    model_name: "gpt-5.4-nano"
    billing_tier: "free"
    rpm: 20000
    tpm: 4000000
    supports_image_input: false
    supports_tools: false
    # Keeps the planner out of the shared router pools.
    router_pool_eligible: false
    quota_reserve_tokens: 1000
    litellm_params:
      base_model: "gpt-5.4-nano"
      temperature: 0
      max_tokens: 1000

    # Catalog / SEO: not in the no-login catalog, no landing page
    anonymous_enabled: false
    seo_enabled: false

    # Prompt defaults
    use_default_system_instructions: true
    system_instructions: ""
    citations_enabled: false

# =============================================================================
# OpenRouter Dynamic Model Integration
# =============================================================================
# When enabled, SurfSense fetches the OpenRouter catalog at startup and injects
# supported models as GLOBAL chat and optionally image-generation models.
# Tier is derived per model from OpenRouter data:
# - model id ends with ":free" -> billing_tier=free
# - prompt and completion pricing are zero -> billing_tier=free
# - otherwise -> billing_tier=premium
#
# Do not use deprecated openrouter_integration.billing_tier or
# openrouter_integration.anonymous_enabled. Use the tier-specific anonymous
# switches below.
openrouter_integration:
  # Master switch and credentials.
  enabled: false
  api_key: "sk-or-your-openrouter-api-key"

  # How often to refresh the OpenRouter catalog, in hours. 0 means the
  # catalog is fetched at startup only.
  refresh_interval_hours: 24

  # Base negative ID namespace for dynamic chat models. IDs are derived
  # deterministically so they survive catalog churn. Must not overlap the
  # static IDs.
  id_offset: -10000

  # Separate base negative ID namespace for dynamic image-generation models.
  image_id_offset: -20000

  # Catalog access. The anonymous switches are tier-specific: one for paid
  # models, one for free models.
  anonymous_enabled_paid: false
  anonymous_enabled_free: false
  seo_enabled: false
  quota_reserve_tokens: 4000

  # Rate limits for paid OpenRouter models, which may join curated router
  # pools when eligible.
  rpm: 200
  tpm: 1000000

  # Rate limits for free OpenRouter models. These remain available for
  # user-facing selection/pinning, but should be treated as a shared-account
  # bucket rather than normal router capacity.
  free_rpm: 20
  free_tpm: 100000

  # Image generation is opt-in so that upgrades do not inject a large image
  # catalog. Vision-capable chat models are represented with
  # supports_image_input: true.
  image_generation_enabled: false
  # NOTE(review): vision is described in this file as a chat capability
  # (supports_image_input), not a separate catalog. Confirm whether the loader
  # still reads vision_enabled before relying on it or removing it.
  vision_enabled: false

  # Defaults applied to injected models.
  litellm_params:
    max_tokens: 16384
  use_default_system_instructions: true
  system_instructions: ""
  citations_enabled: true

# =============================================================================
# Image Generation Auto Mode Router Settings
# =============================================================================
image_generation_router_settings:
  # Same keys as router_settings, applied to image generation Auto mode.
  num_retries: 3
  allowed_fails: 3
  cooldown_time: 60
  routing_strategy: "usage-based-routing"

# =============================================================================
# Static GLOBAL Image Generation Models
# =============================================================================
global_image_generation_configs:
  # Premium Azure image-generation model.
  - id: -2001
    name: "Azure GPT Image 1.5"

    # Connection
    provider: "azure"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    # api_version: "2025-04-01-preview"

    # Model: only an RPM limit is declared for this image model.
    model_name: "gpt-image-1.5"
    billing_tier: "premium"
    rpm: 60
    litellm_params:
      base_model: "gpt-image-1.5"

  # Free-tier Azure image-generation model.
  - id: -2002
    name: "Azure GPT Image 1 Mini"

    # Connection
    provider: "azure"
    api_key: "your-azure-api-key-here"
    api_base: "https://your-resource.openai.azure.com"
    # api_version: "2025-04-01-preview"

    # Model: only an RPM limit is declared for this image model.
    model_name: "gpt-image-1-mini"
    billing_tier: "free"
    rpm: 120
    litellm_params:
      base_model: "gpt-image-1-mini"

# =============================================================================
# Field Notes
# =============================================================================
# Common chat/image fields:
# - provider: Canonical provider adapter name. Example: azure, openai,
#   anthropic, openrouter, groq, bedrock.
# - model_name: Provider model or deployment id. For Azure, use the bare
#   deployment name. The resolver prefixes LiteLLM model strings from provider.
# - api_base: Provider endpoint/root URL. For OpenAI-compatible providers, the
#   resolver adds /v1 when needed.
# - api_version: Optional provider-specific API version, stored on the
#   materialized connection extra metadata.
# - litellm_params: Passed to LiteLLM when invoking the model. Also used for
#   base_model and inline pricing registration.
#
# Chat model fields:
# - supports_image_input: true when the chat model can consume image inputs.
# - supports_tools: true when the model can use tools/function calling.
# - max_input_tokens: Optional UI/catalog metadata for context size.
# - router_pool_eligible: false keeps a model out of shared router pools while
#   still allowing direct selection/pinning.
# - is_planner: true marks the internal-only planner model. Only one config
#   should set this flag.
#
# Catalog and access fields:
# - billing_tier: "free" or "premium".
# - anonymous_enabled: Whether the model appears in the public no-login catalog.
# - seo_enabled: Whether a /free/<seo_slug> landing page is generated.
# - seo_slug: Stable URL slug for SEO pages. Keep unique and do not change once
#   public.
# - seo_title / seo_description: Optional SEO metadata overrides.
# - quota_reserve_tokens: Tokens reserved before each chat LLM call.
# - rpm / tpm: Optional rate limits for router accounting and load balancing.
#
# Image generation notes:
# - Image-generation configs use the same GLOBAL ID namespace as chat models.
# - Only RPM is relevant for most image-generation APIs.
# - The runtime uses litellm.aimage_generation().
# - Image billing currently uses billing_tier and model catalog metadata.
#   Tune image quota reservation in code/catalog rather than in this file,
#   unless the materializer is known to copy a YAML key for it.