From 40c2ff11426c649629b692cd588c56081d9f420f Mon Sep 17 00:00:00 2001 From: Spherrrical Date: Wed, 3 Jun 2026 20:39:19 +0000 Subject: [PATCH] deploy: b5ebb1beea001c0432ac955b086bf04d51fe4d2a --- .../plano_config_full_reference.yaml | 8 + includes/llms.txt | 32 +- resources/configuration_reference.html | 403 ++++++++++-------- searchindex.js | 2 +- 4 files changed, 259 insertions(+), 186 deletions(-) diff --git a/_downloads/c86f9e8fb1f2994b1ba4a0b98481410e/plano_config_full_reference.yaml b/_downloads/c86f9e8fb1f2994b1ba4a0b98481410e/plano_config_full_reference.yaml index 99eb4510..2231a01f 100755 --- a/_downloads/c86f9e8fb1f2994b1ba4a0b98481410e/plano_config_full_reference.yaml +++ b/_downloads/c86f9e8fb1f2994b1ba4a0b98481410e/plano_config_full_reference.yaml @@ -47,6 +47,14 @@ model_providers: http_host: api.custom-provider.com access_key: $CUSTOM_API_KEY + # headers: optional map of extra HTTP headers sent on upstream requests (after auth). + # Use for provider-specific requirements such as User-Agent, org IDs, or account headers. + - model: moonshotai/kimi-for-coding + access_key: $MOONSHOTAI_API_KEY + base_url: https://api.kimi.com/coding/v1 + headers: + User-Agent: "KimiCLI/1.3" + # Model aliases - use friendly names instead of full provider model names model_aliases: fast-llm: diff --git a/includes/llms.txt b/includes/llms.txt index 7d3a883f..a4b71a52 100755 --- a/includes/llms.txt +++ b/includes/llms.txt @@ -1,6 +1,6 @@ Plano Docs v0.4.22 llms.txt (auto-generated) -Generated (UTC): 2026-06-03T17:10:24.816770+00:00 +Generated (UTC): 2026-06-03T20:39:15.011041+00:00 Table of contents - Agents (concepts/agents) @@ -6960,6 +6960,28 @@ The following is a complete reference of the plano_config.yml that controls the the Plano gateway. This where you enable capabilities like routing to upstream LLm providers, defining prompt_targets where prompts get routed to, apply guardrails, and enable critical agent observability features. 
+Model provider headers + +Each entry under model_providers (or the legacy llm_providers alias) may include a headers map of extra +HTTP headers that Plano adds to upstream LLM requests. Plano applies these headers after it sets authentication from +access_key or passthrough_auth, so you can supply provider-specific metadata without replacing the configured +credentials. + +Type: map of strings (header name → value) + +Optional: yes + +Common uses: required User-Agent values, organization or account identifiers, or other headers some APIs expect + +model_providers: + - model: moonshotai/kimi-for-coding + access_key: $MOONSHOTAI_API_KEY + base_url: https://api.kimi.com/coding/v1 + headers: + User-Agent: "KimiCLI/1.3" + +The example below includes this and other provider options in context. + Plano Configuration - Full Reference # Plano Gateway configuration version @@ -7011,6 +7033,14 @@ model_providers: http_host: api.custom-provider.com access_key: $CUSTOM_API_KEY + # headers: optional map of extra HTTP headers sent on upstream requests (after auth). + # Use for provider-specific requirements such as User-Agent, org IDs, or account headers. + - model: moonshotai/kimi-for-coding + access_key: $MOONSHOTAI_API_KEY + base_url: https://api.kimi.com/coding/v1 + headers: + User-Agent: "KimiCLI/1.3" + # Model aliases - use friendly names instead of full provider model names model_aliases: fast-llm: diff --git a/resources/configuration_reference.html b/resources/configuration_reference.html index cfca7f35..805b9a2b 100755 --- a/resources/configuration_reference.html +++ b/resources/configuration_reference.html @@ -165,6 +165,26 @@

The following is a complete reference of the plano_config.yml that controls the behavior of a single instance of the Plano gateway. This is where you enable capabilities like routing to upstream LLM providers, defining prompt_targets where prompts get routed to, applying guardrails, and enabling critical agent observability features.

+
+

Model provider headers

+

Each entry under model_providers (or the legacy llm_providers alias) may include a headers map of extra +HTTP headers that Plano adds to upstream LLM requests. Plano applies these headers after it sets authentication from +access_key or passthrough_auth, so you can supply provider-specific metadata without replacing the configured +credentials.

+ +
model_providers:
+  - model: moonshotai/kimi-for-coding
+    access_key: $MOONSHOTAI_API_KEY
+    base_url: https://api.kimi.com/coding/v1
+    headers:
+      User-Agent: "KimiCLI/1.3"
+
+
+

The example below includes this and other provider options in context.

  1# Plano Gateway configuration version
@@ -216,198 +236,207 @@ where prompts get routed to, apply guardrails, and enable critical agent observa
  47    http_host: api.custom-provider.com
  48    access_key: $CUSTOM_API_KEY
  49
- 50# Model aliases - use friendly names instead of full provider model names
- 51model_aliases:
- 52  fast-llm:
- 53    target: gpt-4o-mini
- 54
- 55  smart-llm:
- 56    target: gpt-4o
+ 50  # headers: optional map of extra HTTP headers sent on upstream requests (after auth).
+ 51  # Use for provider-specific requirements such as User-Agent, org IDs, or account headers.
+ 52  - model: moonshotai/kimi-for-coding
+ 53    access_key: $MOONSHOTAI_API_KEY
+ 54    base_url: https://api.kimi.com/coding/v1
+ 55    headers:
+ 56      User-Agent: "KimiCLI/1.3"
  57
- 58# routing_preferences: top-level list that tags named task categories with an
- 59# ordered pool of candidate models. Plano's LLM router matches incoming requests
- 60# against these descriptions and returns an ordered list of models; the client
- 61# uses models[0] as primary and retries with models[1], models[2]... on 429/5xx.
- 62# Requires overrides.llm_routing_model to point at Plano-Orchestrator (or equivalent).
- 63# Each model in `models` must be declared in model_providers above.
- 64# selection_policy is optional: {prefer: cheapest|fastest|none} lets the router
- 65# reorder candidates using live cost/latency data from model_metrics_sources.
- 66routing_preferences:
- 67  - name: code generation
- 68    description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
- 69    models:
- 70      - anthropic/claude-sonnet-4-0
- 71      - openai/gpt-4o
- 72      - groq/llama-3.3-70b-versatile
- 73  - name: code review
- 74    description: reviewing, analyzing, and suggesting improvements to existing code
- 75    models:
- 76      - anthropic/claude-sonnet-4-0
- 77      - groq/llama-3.3-70b-versatile
- 78    selection_policy:
- 79      prefer: cheapest
- 80
- 81# HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access
- 82listeners:
- 83  # Agent listener for routing requests to multiple agents
- 84  - type: agent
- 85    name: travel_booking_service
- 86    port: 8001
- 87    router: plano_orchestrator_v1
- 88    address: 0.0.0.0
- 89    agents:
- 90      - id: rag_agent
- 91        description: virtual assistant for retrieval augmented generation tasks
- 92        input_filters:
- 93          - input_guards
- 94
- 95  # Model listener for direct LLM access
- 96  - type: model
- 97    name: model_1
- 98    address: 0.0.0.0
- 99    port: 12000
-100    timeout: 30s          # Request timeout (e.g. "30s", "60s")
-101    max_retries: 3        # Number of retries on upstream failure
-102    input_filters:        # Filters applied before forwarding to LLM
-103      - input_guards
-104    output_filters:       # Filters applied to LLM responses before returning to client
-105      - input_guards
-106
-107  # Prompt listener for function calling (for prompt_targets)
-108  - type: prompt
-109    name: prompt_function_listener
-110    address: 0.0.0.0
-111    port: 10000
-112
-113# Reusable service endpoints
-114endpoints:
-115  app_server:
-116    endpoint: 127.0.0.1:80
-117    connect_timeout: 0.005s
-118    protocol: http        # http or https
-119
-120  mistral_local:
-121    endpoint: 127.0.0.1:8001
-122
-123  secure_service:
-124    endpoint: api.example.com:443
-125    protocol: https
-126    http_host: api.example.com  # Override the Host header sent upstream
+ 58# Model aliases - use friendly names instead of full provider model names
+ 59model_aliases:
+ 60  fast-llm:
+ 61    target: gpt-4o-mini
+ 62
+ 63  smart-llm:
+ 64    target: gpt-4o
+ 65
+ 66# routing_preferences: top-level list that tags named task categories with an
+ 67# ordered pool of candidate models. Plano's LLM router matches incoming requests
+ 68# against these descriptions and returns an ordered list of models; the client
+ 69# uses models[0] as primary and retries with models[1], models[2]... on 429/5xx.
+ 70# Requires overrides.llm_routing_model to point at Plano-Orchestrator (or equivalent).
+ 71# Each model in `models` must be declared in model_providers above.
+ 72# selection_policy is optional: {prefer: cheapest|fastest|none} lets the router
+ 73# reorder candidates using live cost/latency data from model_metrics_sources.
+ 74routing_preferences:
+ 75  - name: code generation
+ 76    description: generating new code snippets, functions, or boilerplate based on user prompts or requirements
+ 77    models:
+ 78      - anthropic/claude-sonnet-4-0
+ 79      - openai/gpt-4o
+ 80      - groq/llama-3.3-70b-versatile
+ 81  - name: code review
+ 82    description: reviewing, analyzing, and suggesting improvements to existing code
+ 83    models:
+ 84      - anthropic/claude-sonnet-4-0
+ 85      - groq/llama-3.3-70b-versatile
+ 86    selection_policy:
+ 87      prefer: cheapest
+ 88
+ 89# HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access
+ 90listeners:
+ 91  # Agent listener for routing requests to multiple agents
+ 92  - type: agent
+ 93    name: travel_booking_service
+ 94    port: 8001
+ 95    router: plano_orchestrator_v1
+ 96    address: 0.0.0.0
+ 97    agents:
+ 98      - id: rag_agent
+ 99        description: virtual assistant for retrieval augmented generation tasks
+100        input_filters:
+101          - input_guards
+102
+103  # Model listener for direct LLM access
+104  - type: model
+105    name: model_1
+106    address: 0.0.0.0
+107    port: 12000
+108    timeout: 30s          # Request timeout (e.g. "30s", "60s")
+109    max_retries: 3        # Number of retries on upstream failure
+110    input_filters:        # Filters applied before forwarding to LLM
+111      - input_guards
+112    output_filters:       # Filters applied to LLM responses before returning to client
+113      - input_guards
+114
+115  # Prompt listener for function calling (for prompt_targets)
+116  - type: prompt
+117    name: prompt_function_listener
+118    address: 0.0.0.0
+119    port: 10000
+120
+121# Reusable service endpoints
+122endpoints:
+123  app_server:
+124    endpoint: 127.0.0.1:80
+125    connect_timeout: 0.005s
+126    protocol: http        # http or https
 127
-128# Optional top-level system prompt applied to all prompt_targets
-129system_prompt: |
-130  You are a helpful assistant. Always respond concisely and accurately.
-131
-132# Prompt targets for function calling and API orchestration
-133prompt_targets:
-134  - name: get_current_weather
-135    description: Get current weather at a location.
-136    parameters:
-137      - name: location
-138        description: The location to get the weather for
-139        required: true
-140        type: string
-141        format: City, State
-142      - name: days
-143        description: the number of days for the request
-144        required: true
-145        type: int
-146    endpoint:
-147      name: app_server
-148      path: /weather
-149      http_method: POST
-150    # Per-target system prompt (overrides top-level system_prompt for this target)
-151    system_prompt: You are a weather expert. Provide accurate and concise weather information.
-152    # auto_llm_dispatch_on_response: when true, the LLM is called again with the
-153    # function response to produce a final natural-language answer for the user
-154    auto_llm_dispatch_on_response: true
-155
-156# Rate limits - control token usage per model and request selector
-157ratelimits:
-158  - model: openai/gpt-4o
-159    selector:
-160      key: x-user-id       # HTTP header key used to identify the rate-limit subject
-161      value: "*"           # Wildcard matches any value; use a specific string to target one
-162    limit:
-163      tokens: 100000       # Maximum tokens allowed in the given time unit
-164      unit: hour           # Time unit: "minute", "hour", or "day"
-165
-166  - model: openai/gpt-4o-mini
+128  mistral_local:
+129    endpoint: 127.0.0.1:8001
+130
+131  secure_service:
+132    endpoint: api.example.com:443
+133    protocol: https
+134    http_host: api.example.com  # Override the Host header sent upstream
+135
+136# Optional top-level system prompt applied to all prompt_targets
+137system_prompt: |
+138  You are a helpful assistant. Always respond concisely and accurately.
+139
+140# Prompt targets for function calling and API orchestration
+141prompt_targets:
+142  - name: get_current_weather
+143    description: Get current weather at a location.
+144    parameters:
+145      - name: location
+146        description: The location to get the weather for
+147        required: true
+148        type: string
+149        format: City, State
+150      - name: days
+151        description: the number of days for the request
+152        required: true
+153        type: int
+154    endpoint:
+155      name: app_server
+156      path: /weather
+157      http_method: POST
+158    # Per-target system prompt (overrides top-level system_prompt for this target)
+159    system_prompt: You are a weather expert. Provide accurate and concise weather information.
+160    # auto_llm_dispatch_on_response: when true, the LLM is called again with the
+161    # function response to produce a final natural-language answer for the user
+162    auto_llm_dispatch_on_response: true
+163
+164# Rate limits - control token usage per model and request selector
+165ratelimits:
+166  - model: openai/gpt-4o
 167    selector:
-168      key: x-org-id
-169      value: acme-corp
+168      key: x-user-id       # HTTP header key used to identify the rate-limit subject
+169      value: "*"           # Wildcard matches any value; use a specific string to target one
 170    limit:
-171      tokens: 500000
-172      unit: day
+171      tokens: 100000       # Maximum tokens allowed in the given time unit
+172      unit: hour           # Time unit: "minute", "hour", or "day"
 173
-174# Global behavior overrides
-175overrides:
-176  # Threshold for routing a request to a prompt_target (0.0–1.0). Lower = more permissive.
-177  prompt_target_intent_matching_threshold: 0.7
-178  # Trim conversation history to fit within the model's context window
-179  optimize_context_window: true
-180  # Use Plano's agent orchestrator for multi-agent request routing
-181  use_agent_orchestrator: false
-182  # Connect timeout for upstream provider clusters (e.g., "5s", "10s"). Default: "5s"
-183  upstream_connect_timeout: 10s
-184  # Path to the trusted CA bundle for upstream TLS verification
-185  upstream_tls_ca_path: /etc/ssl/certs/ca-certificates.crt
-186  # Model used for intent-based LLM routing (must be listed in model_providers)
-187  llm_routing_model: Plano-Orchestrator
-188  # Model used for agent orchestration (must be listed in model_providers)
-189  agent_orchestration_model: Plano-Orchestrator
-190  # Disable agentic signal analysis (frustration, repetition, escalation, etc.)
-191  # on LLM responses to save CPU. Default: false.
-192  disable_signals: false
-193
-194# Model affinity — pin routing decisions for agentic loops
-195routing:
-196  session_ttl_seconds: 600    # How long a pinned session lasts (default: 600s / 10 min)
-197  session_max_entries: 10000  # Max cached sessions before eviction (upper limit: 10000)
-198  # session_cache controls the backend used to store affinity state.
-199  # "memory" (default) is in-process and works for single-instance deployments.
-200  # "redis" shares state across replicas — required for multi-replica / Kubernetes setups.
-201  session_cache:
-202    type: memory              # "memory" (default) or "redis"
-203    # url is required when type is "redis". Supports redis:// and rediss:// (TLS).
-204    # url: redis://localhost:6379
-205    # tenant_header: x-org-id  # optional; when set, keys are scoped as plano:affinity:{tenant_id}:{session_id}
-206
-207# State storage for multi-turn conversation history
-208state_storage:
-209  type: memory            # "memory" (in-process) or "postgres" (persistent)
-210  # connection_string is required when type is postgres.
-211  # Supports environment variable substitution: $VAR or ${VAR}
-212  # connection_string: postgresql://user:$DB_PASS@localhost:5432/plano
-213
-214# Input guardrails applied globally to all incoming requests
-215prompt_guards:
-216  input_guards:
-217    jailbreak:
-218      on_exception:
-219        message: "I'm sorry, I can't help with that request."
-220
-221# OpenTelemetry tracing configuration
-222tracing:
-223  # Random sampling percentage (1-100)
-224  random_sampling: 100
-225  # Include internal Plano spans in traces
-226  trace_arch_internal: false
-227  # gRPC endpoint for OpenTelemetry collector (e.g., Jaeger, Tempo)
-228  opentracing_grpc_endpoint: http://localhost:4317
-229  span_attributes:
-230    # Propagate request headers whose names start with these prefixes as span attributes
-231    header_prefixes:
-232      - x-user-
-233      - x-org-
-234    # Static key/value pairs added to every span
-235    static:
-236      environment: production
-237      service.team: platform
+174  - model: openai/gpt-4o-mini
+175    selector:
+176      key: x-org-id
+177      value: acme-corp
+178    limit:
+179      tokens: 500000
+180      unit: day
+181
+182# Global behavior overrides
+183overrides:
+184  # Threshold for routing a request to a prompt_target (0.0–1.0). Lower = more permissive.
+185  prompt_target_intent_matching_threshold: 0.7
+186  # Trim conversation history to fit within the model's context window
+187  optimize_context_window: true
+188  # Use Plano's agent orchestrator for multi-agent request routing
+189  use_agent_orchestrator: false
+190  # Connect timeout for upstream provider clusters (e.g., "5s", "10s"). Default: "5s"
+191  upstream_connect_timeout: 10s
+192  # Path to the trusted CA bundle for upstream TLS verification
+193  upstream_tls_ca_path: /etc/ssl/certs/ca-certificates.crt
+194  # Model used for intent-based LLM routing (must be listed in model_providers)
+195  llm_routing_model: Plano-Orchestrator
+196  # Model used for agent orchestration (must be listed in model_providers)
+197  agent_orchestration_model: Plano-Orchestrator
+198  # Disable agentic signal analysis (frustration, repetition, escalation, etc.)
+199  # on LLM responses to save CPU. Default: false.
+200  disable_signals: false
+201
+202# Model affinity — pin routing decisions for agentic loops
+203routing:
+204  session_ttl_seconds: 600    # How long a pinned session lasts (default: 600s / 10 min)
+205  session_max_entries: 10000  # Max cached sessions before eviction (upper limit: 10000)
+206  # session_cache controls the backend used to store affinity state.
+207  # "memory" (default) is in-process and works for single-instance deployments.
+208  # "redis" shares state across replicas — required for multi-replica / Kubernetes setups.
+209  session_cache:
+210    type: memory              # "memory" (default) or "redis"
+211    # url is required when type is "redis". Supports redis:// and rediss:// (TLS).
+212    # url: redis://localhost:6379
+213    # tenant_header: x-org-id  # optional; when set, keys are scoped as plano:affinity:{tenant_id}:{session_id}
+214
+215# State storage for multi-turn conversation history
+216state_storage:
+217  type: memory            # "memory" (in-process) or "postgres" (persistent)
+218  # connection_string is required when type is postgres.
+219  # Supports environment variable substitution: $VAR or ${VAR}
+220  # connection_string: postgresql://user:$DB_PASS@localhost:5432/plano
+221
+222# Input guardrails applied globally to all incoming requests
+223prompt_guards:
+224  input_guards:
+225    jailbreak:
+226      on_exception:
+227        message: "I'm sorry, I can't help with that request."
+228
+229# OpenTelemetry tracing configuration
+230tracing:
+231  # Random sampling percentage (1-100)
+232  random_sampling: 100
+233  # Include internal Plano spans in traces
+234  trace_arch_internal: false
+235  # gRPC endpoint for OpenTelemetry collector (e.g., Jaeger, Tempo)
+236  opentracing_grpc_endpoint: http://localhost:4317
+237  span_attributes:
+238    # Propagate request headers whose names start with these prefixes as span attributes
+239    header_prefixes:
+240      - x-user-
+241      - x-org-
+242    # Static key/value pairs added to every span
+243    static:
+244      environment: production
+245      service.team: platform
 
+
-
+