diff --git a/_downloads/c86f9e8fb1f2994b1ba4a0b98481410e/plano_config_full_reference.yaml b/_downloads/c86f9e8fb1f2994b1ba4a0b98481410e/plano_config_full_reference.yaml index 452bc17a..787b09d3 100755 --- a/_downloads/c86f9e8fb1f2994b1ba4a0b98481410e/plano_config_full_reference.yaml +++ b/_downloads/c86f9e8fb1f2994b1ba4a0b98481410e/plano_config_full_reference.yaml @@ -174,6 +174,11 @@ overrides: # Model used for agent orchestration (must be listed in model_providers) agent_orchestration_model: Plano-Orchestrator +# Model affinity — pin routing decisions for agentic loops +routing: + session_ttl_seconds: 600 # How long a pinned session lasts (default: 600s / 10 min) + session_max_entries: 10000 # Max cached sessions before eviction (upper limit: 10000) + # State storage for multi-turn conversation history state_storage: type: memory # "memory" (in-process) or "postgres" (persistent) diff --git a/concepts/agents.html b/concepts/agents.html index 9fd6b9ec..4aa77809 100755 --- a/concepts/agents.html +++ b/concepts/agents.html @@ -267,7 +267,7 @@ diff --git a/concepts/filter_chain.html b/concepts/filter_chain.html index 394add25..6aec3664 100755 --- a/concepts/filter_chain.html +++ b/concepts/filter_chain.html @@ -333,7 +333,7 @@ powerful abstraction for evolving your agent workflows over time.
diff --git a/concepts/listeners.html b/concepts/listeners.html index de4dcd3c..d669ddae 100755 --- a/concepts/listeners.html +++ b/concepts/listeners.html @@ -270,7 +270,7 @@ application to LLMs (API-based or hosted) via prompt targets. diff --git a/concepts/llm_providers/client_libraries.html b/concepts/llm_providers/client_libraries.html index ba2a8ec0..ed9cbd67 100755 --- a/concepts/llm_providers/client_libraries.html +++ b/concepts/llm_providers/client_libraries.html @@ -660,7 +660,7 @@ Implement fallback logic for better reliability: diff --git a/concepts/llm_providers/llm_providers.html b/concepts/llm_providers/llm_providers.html index f5a44832..b86a590e 100755 --- a/concepts/llm_providers/llm_providers.html +++ b/concepts/llm_providers/llm_providers.html @@ -304,7 +304,7 @@ Use your preferred client library without changing existing code (see© 2026, Katanemo Labs, a DigitalOcean Company Last updated: Apr 04, 2026.
+© 2026, Katanemo Labs, a DigitalOcean Company Last updated: Apr 09, 2026.
Configure your LLM providers with specific provider/model names:
-listeners:
egress_traffic:
address: 0.0.0.0
@@ -231,8 +231,8 @@
Configuration
Configure semantic aliases that map to underlying models:
-
-
+
+
listeners:
egress_traffic:
address: 0.0.0.0
@@ -293,8 +293,8 @@
Configuration
To configure preference-aligned dynamic routing, define routing preferences that map domains and actions to specific models:
-
-
+
+
listeners:
egress_traffic:
address: 0.0.0.0
@@ -497,11 +497,39 @@ instead of a file.
demo README.
+
+Model Affinity
+In agentic loops — where a single user request triggers multiple LLM calls through tool use — Plano’s router classifies each turn independently. Because successive prompts differ in intent (tool selection looks like code generation, reasoning about results looks like analysis), the router may select different models mid-session. This causes behavioral inconsistency and invalidates provider-side KV caches, increasing both latency and cost.
+Model affinity pins the routing decision for the duration of a session. Send an X-Model-Affinity header with any string identifier (typically a UUID). The first request routes normally and caches the result. All subsequent requests with the same affinity ID skip routing and reuse the cached model.
+import uuid
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
+affinity_id = str(uuid.uuid4())
+
+# Every call in the loop uses the same header
+response = client.chat.completions.create(
+ model="gpt-4o-mini",
+ messages=messages,
+ tools=tools,
+ extra_headers={"X-Model-Affinity": affinity_id},
+)
+
+
+Without the header, routing runs fresh on every request — no behavior change for existing clients.
+Configuration:
+routing:
+ session_ttl_seconds: 600 # How long a pinned session lasts (default: 600s / 10 min)
+ session_max_entries: 10000 # Max cached sessions before eviction (upper limit: 10000)
+
+
+To force a fresh routing decision (e.g., when the agent’s task changes), generate a new affinity ID.
+
Combining Routing Methods
You can combine static model selection with dynamic routing preferences for maximum flexibility:
-
-
+
+
llm_providers:
- model: openai/gpt-5.2
access_key: $OPENAI_API_KEY
@@ -635,6 +663,7 @@ instead of a file.
Using vLLM on Kubernetes (GPU nodes)
+Model Affinity
Combining Routing Methods
Example Use Cases
Best practices
@@ -647,7 +676,7 @@ instead of a file.
diff --git a/guides/observability/access_logging.html b/guides/observability/access_logging.html
index 3141ff04..0d20a4ed 100755
--- a/guides/observability/access_logging.html
+++ b/guides/observability/access_logging.html
@@ -248,7 +248,7 @@ Access logs can be exported to centralized logging systems (e.g., ELK stack or F
diff --git a/guides/observability/monitoring.html b/guides/observability/monitoring.html
index de157744..10d994f0 100755
--- a/guides/observability/monitoring.html
+++ b/guides/observability/monitoring.html
@@ -260,7 +260,7 @@ are some sample configuration files for both, respectively.
diff --git a/guides/observability/observability.html b/guides/observability/observability.html
index e01c037a..55dce723 100755
--- a/guides/observability/observability.html
+++ b/guides/observability/observability.html
@@ -216,7 +216,7 @@
diff --git a/guides/observability/tracing.html b/guides/observability/tracing.html
index 9d246638..819f05ae 100755
--- a/guides/observability/tracing.html
+++ b/guides/observability/tracing.html
@@ -792,7 +792,7 @@ tools like AWS X-Ray and Datadog, enhancing observability and facilitating faste
diff --git a/guides/orchestration.html b/guides/orchestration.html
index ef554a6d..4b4235a3 100755
--- a/guides/orchestration.html
+++ b/guides/orchestration.html
@@ -1003,7 +1003,7 @@ Plano makes it easy to build and scale these systems by managing the orchestrati
diff --git a/guides/prompt_guard.html b/guides/prompt_guard.html
index feec51b6..e1000107 100755
--- a/guides/prompt_guard.html
+++ b/guides/prompt_guard.html
@@ -298,7 +298,7 @@ the agent. If validation fails (
diff --git a/guides/state.html b/guides/state.html
index b679231c..0fa0c9fd 100755
--- a/guides/state.html
+++ b/guides/state.html
@@ -453,7 +453,7 @@
diff --git a/includes/llms.txt b/includes/llms.txt
index 3fa92ef9..8d80b936 100755
--- a/includes/llms.txt
+++ b/includes/llms.txt
@@ -1,6 +1,6 @@
Plano Docs v0.4.17
llms.txt (auto-generated)
-Generated (UTC): 2026-04-04T16:59:07.910060+00:00
+Generated (UTC): 2026-04-09T00:32:32.796454+00:00
Table of contents
- Agents (concepts/agents)
@@ -3979,6 +3979,38 @@ For the canonical Plano Kubernetes deployment (ConfigMap, Secrets, Deployment YA
deployment. For full step-by-step commands specific to this demo, see the
demo README.
+
+
+Model Affinity
+
+In agentic loops — where a single user request triggers multiple LLM calls through tool use — Plano’s router classifies each turn independently. Because successive prompts differ in intent (tool selection looks like code generation, reasoning about results looks like analysis), the router may select different models mid-session. This causes behavioral inconsistency and invalidates provider-side KV caches, increasing both latency and cost.
+
+Model affinity pins the routing decision for the duration of a session. Send an X-Model-Affinity header with any string identifier (typically a UUID). The first request routes normally and caches the result. All subsequent requests with the same affinity ID skip routing and reuse the cached model.
+
+import uuid
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
+affinity_id = str(uuid.uuid4())
+
+# Every call in the loop uses the same header
+response = client.chat.completions.create(
+ model="gpt-4o-mini",
+ messages=messages,
+ tools=tools,
+ extra_headers={"X-Model-Affinity": affinity_id},
+)
+
+Without the header, routing runs fresh on every request — no behavior change for existing clients.
+
+Configuration:
+
+routing:
+ session_ttl_seconds: 600 # How long a pinned session lasts (default: 600s / 10 min)
+ session_max_entries: 10000 # Max cached sessions before eviction (upper limit: 10000)
+
+To force a fresh routing decision (e.g., when the agent’s task changes), generate a new affinity ID.
+
Combining Routing Methods
You can combine static model selection with dynamic routing preferences for maximum flexibility:
@@ -6525,6 +6557,11 @@ overrides:
# Model used for agent orchestration (must be listed in model_providers)
agent_orchestration_model: Plano-Orchestrator
+# Model affinity — pin routing decisions for agentic loops
+routing:
+ session_ttl_seconds: 600 # How long a pinned session lasts (default: 600s / 10 min)
+ session_max_entries: 10000 # Max cached sessions before eviction (upper limit: 10000)
+
# State storage for multi-turn conversation history
state_storage:
type: memory # "memory" (in-process) or "postgres" (persistent)
diff --git a/index.html b/index.html
index f01ebb03..82ee90d5 100755
--- a/index.html
+++ b/index.html
@@ -247,7 +247,7 @@ Resources
diff --git a/objects.inv b/objects.inv
index 43d6aa98..895f2944 100755
Binary files a/objects.inv and b/objects.inv differ
diff --git a/resources/cli_reference.html b/resources/cli_reference.html
index 2e34a95a..5d32513c 100755
--- a/resources/cli_reference.html
+++ b/resources/cli_reference.html
@@ -437,7 +437,7 @@ Use this page as the canonical source for command syntax, options, and recommend
diff --git a/resources/configuration_reference.html b/resources/configuration_reference.html
index caf6878c..5da39212 100755
--- a/resources/configuration_reference.html
+++ b/resources/configuration_reference.html
@@ -343,37 +343,42 @@ where prompts get routed to, apply guardrails, and enable critical agent observa
174 # Model used for agent orchestration (must be listed in model_providers)
175 agent_orchestration_model: Plano-Orchestrator
176
-177# State storage for multi-turn conversation history
-178state_storage:
-179 type: memory # "memory" (in-process) or "postgres" (persistent)
-180 # connection_string is required when type is postgres.
-181 # Supports environment variable substitution: $VAR or ${VAR}
-182 # connection_string: postgresql://user:$DB_PASS@localhost:5432/plano
-183
-184# Input guardrails applied globally to all incoming requests
-185prompt_guards:
-186 input_guards:
-187 jailbreak:
-188 on_exception:
-189 message: "I'm sorry, I can't help with that request."
-190
-191# OpenTelemetry tracing configuration
-192tracing:
-193 # Random sampling percentage (1-100)
-194 random_sampling: 100
-195 # Include internal Plano spans in traces
-196 trace_arch_internal: false
-197 # gRPC endpoint for OpenTelemetry collector (e.g., Jaeger, Tempo)
-198 opentracing_grpc_endpoint: http://localhost:4317
-199 span_attributes:
-200 # Propagate request headers whose names start with these prefixes as span attributes
-201 header_prefixes:
-202 - x-user-
-203 - x-org-
-204 # Static key/value pairs added to every span
-205 static:
-206 environment: production
-207 service.team: platform
+177# Model affinity — pin routing decisions for agentic loops
+178routing:
+179 session_ttl_seconds: 600 # How long a pinned session lasts (default: 600s / 10 min)
+180 session_max_entries: 10000 # Max cached sessions before eviction (upper limit: 10000)
+181
+182# State storage for multi-turn conversation history
+183state_storage:
+184 type: memory # "memory" (in-process) or "postgres" (persistent)
+185 # connection_string is required when type is postgres.
+186 # Supports environment variable substitution: $VAR or ${VAR}
+187 # connection_string: postgresql://user:$DB_PASS@localhost:5432/plano
+188
+189# Input guardrails applied globally to all incoming requests
+190prompt_guards:
+191 input_guards:
+192 jailbreak:
+193 on_exception:
+194 message: "I'm sorry, I can't help with that request."
+195
+196# OpenTelemetry tracing configuration
+197tracing:
+198 # Random sampling percentage (1-100)
+199 random_sampling: 100
+200 # Include internal Plano spans in traces
+201 trace_arch_internal: false
+202 # gRPC endpoint for OpenTelemetry collector (e.g., Jaeger, Tempo)
+203 opentracing_grpc_endpoint: http://localhost:4317
+204 span_attributes:
+205 # Propagate request headers whose names start with these prefixes as span attributes
+206 header_prefixes:
+207 - x-user-
+208 - x-org-
+209 # Static key/value pairs added to every span
+210 static:
+211 environment: production
+212 service.team: platform
@@ -401,7 +406,7 @@ where prompts get routed to, apply guardrails, and enable critical agent observa
diff --git a/resources/deployment.html b/resources/deployment.html
index 73f326de..5a135b89 100755
--- a/resources/deployment.html
+++ b/resources/deployment.html
@@ -542,7 +542,7 @@