merge origin/main into musa/custom-trace-attributes

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-17 15:25:17 +02:00 · 2026-02-23 13:43:57 -08:00 · 2026-02-23 13:43:57 -08:00 · e30f93b1cd
commit e30f93b1cd
parent 32c4713aa7 69d650a4e5
24 changed files with 268 additions and 45 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -79,13 +79,13 @@ jobs:
          load: true
          tags: |
            ${{ env.PLANO_DOCKER_IMAGE }}
-            ${{ env.DOCKER_IMAGE }}:0.4.7
+            ${{ env.DOCKER_IMAGE }}:0.4.8
            ${{ env.DOCKER_IMAGE }}:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Save image as artifact
-        run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.7 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar
+        run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.8 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar

      - name: Upload image artifact
        uses: actions/upload-artifact@v4
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -137,6 +137,12 @@ To prepare a release (e.g., bumping from `0.4.6` to `0.4.7`), update the version

 Commit message format: `release X.Y.Z`

+## Workflow Preferences
+
+- **Git commits:** Do NOT add `Co-Authored-By` lines. Keep commit messages short and concise (one line, no verbose descriptions). NEVER commit and push directly to `main`—always use a feature branch and PR.
+- **Git branches:** Use the format `<github_username>/<feature_name>` when creating branches for PRs. Determine the username from `gh api user --jq .login`.
+- **GitHub issues:** When a GitHub issue URL is pasted, fetch all requirements and context from the issue first. The end goal is always a PR with all tests passing.
+
 ## Key Conventions

 - Rust edition 2021, formatted with `cargo fmt`, linted with `cargo clippy -D warnings`
--- a/apps/www/src/components/Hero.tsx
+++ b/apps/www/src/components/Hero.tsx
@ -24,7 +24,7 @@ export function Hero() {
            >
              <div className="inline-flex flex-wrap items-center gap-1.5 sm:gap-2 px-3 sm:px-4 py-1 rounded-full bg-[rgba(185,191,255,0.4)] border border-[var(--secondary)] shadow backdrop-blur hover:bg-[rgba(185,191,255,0.6)] transition-colors cursor-pointer">
                <span className="text-xs sm:text-sm font-medium text-black/65">
-                  v0.4.7
+                  v0.4.8
                </span>
                <span className="text-xs sm:text-sm font-medium text-black ">
                  —
--- a/build_filter_image.sh
+++ b/build_filter_image.sh
@ -1 +1 @@
-docker build  -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.7
+docker build  -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.8
--- a/cli/planoai/__init__.py
+++ b/cli/planoai/__init__.py
@ -1,3 +1,3 @@
 """Plano CLI - Intelligent Prompt Gateway."""

-__version__ = "0.4.7"
+__version__ = "0.4.8"
--- a/cli/planoai/config_generator.py
+++ b/cli/planoai/config_generator.py
@ -460,6 +460,12 @@ def validate_and_render_schema():

    print("agent_orchestrator: ", agent_orchestrator)

+    overrides = config_yaml.get("overrides", {})
+    upstream_connect_timeout = overrides.get("upstream_connect_timeout", "5s")
+    upstream_tls_ca_path = overrides.get(
+        "upstream_tls_ca_path", "/etc/ssl/certs/ca-certificates.crt"
+    )
+
    data = {
        "prompt_gateway_listener": prompt_gateway,
        "llm_gateway_listener": llm_gateway,
@ -471,6 +477,8 @@ def validate_and_render_schema():
        "local_llms": llms_with_endpoint,
        "agent_orchestrator": agent_orchestrator,
        "listeners": listeners,
+        "upstream_connect_timeout": upstream_connect_timeout,
+        "upstream_tls_ca_path": upstream_tls_ca_path,
    }

    rendered = template.render(data)
--- a/cli/planoai/consts.py
+++ b/cli/planoai/consts.py
@ -5,5 +5,5 @@ PLANO_COLOR = "#969FF4"

 SERVICE_NAME_ARCHGW = "plano"
 PLANO_DOCKER_NAME = "plano"
-PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.7")
+PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.8")
 DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://host.docker.internal:4317"
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "planoai"
-version = "0.4.7"
+version = "0.4.8"
 description = "Python-based CLI tool to manage Plano."
 authors = [{name = "Katanemo Labs, Inc."}]
 readme = "README.md"
--- a/cli/uv.lock
+++ b/cli/uv.lock
@ -337,7 +337,7 @@ wheels = [

 [[package]]
 name = "planoai"
-version = "0.4.6"
+version = "0.4.7"
 source = { editable = "." }
 dependencies = [
    { name = "click" },
--- a/config/envoy.template.yaml
+++ b/config/envoy.template.yaml
@ -595,7 +595,7 @@ static_resources:
  clusters:

    - name: arch
-      connect_timeout: 5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -618,9 +618,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: anthropic
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -643,9 +646,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: deepseek
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -668,9 +674,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: xai
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -693,9 +702,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: moonshotai
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -718,9 +730,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: zhipu
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -743,9 +758,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: together_ai
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -768,9 +786,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: gemini
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -793,9 +814,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: groq
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -818,9 +842,12 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: mistral
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -839,9 +866,16 @@ static_resources:
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
          sni: api.mistral.ai
+          common_tls_context:
+            tls_params:
+              tls_minimum_protocol_version: TLSv1_2
+              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}

    - name: openai
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -864,6 +898,9 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}
    - name: mistral_7b_instruct
      connect_timeout: 0.5s
      type: STRICT_DNS
@ -884,7 +921,7 @@ static_resources:
      {% if cluster.connect_timeout -%}
      connect_timeout: {{ cluster.connect_timeout }}
      {% else -%}
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      {% endif -%}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
@ -913,12 +950,15 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}
      {% endif %}
 {% endfor %}

 {% for local_llm_provider in local_llms %}
    - name: {{ local_llm_provider.cluster_name }}
-      connect_timeout: 0.5s
+      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -946,6 +986,9 @@ static_resources:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
+            validation_context:
+              trusted_ca:
+                filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}
      {% endif %}

 {% endfor %}
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@ -265,6 +265,12 @@ properties:
        type: boolean
      use_agent_orchestrator:
        type: boolean
+      upstream_connect_timeout:
+        type: string
+        description: "Connect timeout for upstream provider clusters (e.g., '5s', '10s'). Default is '5s'."
+      upstream_tls_ca_path:
+        type: string
+        description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'."
  system_prompt:
    type: string
  prompt_targets:
--- a/config/validate_plano_config.sh
+++ b/config/validate_plano_config.sh
@ -5,7 +5,7 @@ failed_files=()
 for file in $(find . -name config.yaml -o -name plano_config_full_reference.yaml); do
  echo "Validating ${file}..."
  touch $(pwd)/${file}_rendered
-  if ! docker run --rm -v "$(pwd)/${file}:/app/plano_config.yaml:ro" -v "$(pwd)/${file}_rendered:/app/plano_config_rendered.yaml:rw" --entrypoint /bin/sh ${PLANO_DOCKER_IMAGE:-katanemo/plano:0.4.7} -c "python -m planoai.config_generator" 2>&1 > /dev/null ; then
+  if ! docker run --rm -v "$(pwd)/${file}:/app/plano_config.yaml:ro" -v "$(pwd)/${file}_rendered:/app/plano_config_rendered.yaml:rw" --entrypoint /bin/sh ${PLANO_DOCKER_IMAGE:-katanemo/plano:0.4.8} -c "python -m planoai.config_generator" 2>&1 > /dev/null ; then
    echo "Validation failed for $file"
    failed_files+=("$file")
  fi
--- a/crates/brightstaff/src/handlers/agent_chat_completions.rs
+++ b/crates/brightstaff/src/handlers/agent_chat_completions.rs
@ -3,15 +3,17 @@ use std::time::Instant;

 use bytes::Bytes;
 use common::configuration::SpanAttributes;
+use common::llm_providers::LlmProviders;
 use hermesllm::apis::OpenAIMessage;
 use hermesllm::clients::SupportedAPIsFromClient;
 use hermesllm::providers::request::ProviderRequest;
 use hermesllm::ProviderRequestType;
 use http_body_util::combinators::BoxBody;
 use http_body_util::BodyExt;
-use hyper::{Request, Response};
+use hyper::{Request, Response, StatusCode};
 use opentelemetry::trace::get_active_span;
 use serde::ser::Error as SerError;
+use tokio::sync::RwLock;
 use tracing::{debug, info, info_span, warn, Instrument};

 use super::agent_selector::{AgentSelectionError, AgentSelector};
@ -42,6 +44,7 @@ pub async fn agent_chat(
    agents_list: Arc<tokio::sync::RwLock<Option<Vec<common::configuration::Agent>>>>,
    listeners: Arc<tokio::sync::RwLock<Vec<common::configuration::Listener>>>,
    span_attributes: Arc<Option<SpanAttributes>>,
+    llm_providers: Arc<RwLock<LlmProviders>>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
    let custom_attrs =
        collect_custom_trace_attributes(request.headers(), span_attributes.as_ref().as_ref());
@ -75,6 +78,7 @@ pub async fn agent_chat(
            orchestrator_service,
            agents_list,
            listeners,
+            llm_providers,
            request_id,
            custom_attrs,
        )
@ -160,6 +164,7 @@ async fn handle_agent_chat_inner(
    orchestrator_service: Arc<OrchestratorService>,
    agents_list: Arc<tokio::sync::RwLock<Option<Vec<common::configuration::Agent>>>>,
    listeners: Arc<tokio::sync::RwLock<Vec<common::configuration::Listener>>>,
+    llm_providers: Arc<RwLock<LlmProviders>>,
    request_id: String,
    custom_attrs: std::collections::HashMap<String, String>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, AgentFilterChainError> {
@ -230,16 +235,36 @@ async fn handle_agent_chat_inner(
            AgentFilterChainError::RequestParsing(serde_json::Error::custom(err_msg))
        })?;

-    let client_request = match ProviderRequestType::try_from((&chat_request_bytes[..], &api_type)) {
-        Ok(request) => request,
-        Err(err) => {
-            warn!("failed to parse request as ProviderRequestType: {}", err);
-            let err_msg = format!("Failed to parse request: {}", err);
-            return Err(AgentFilterChainError::RequestParsing(
-                serde_json::Error::custom(err_msg),
-            ));
+    let mut client_request =
+        match ProviderRequestType::try_from((&chat_request_bytes[..], &api_type)) {
+            Ok(request) => request,
+            Err(err) => {
+                warn!("failed to parse request as ProviderRequestType: {}", err);
+                let err_msg = format!("Failed to parse request: {}", err);
+                return Err(AgentFilterChainError::RequestParsing(
+                    serde_json::Error::custom(err_msg),
+                ));
+            }
+        };
+
+    // If model is not specified in the request, resolve from default provider
+    if client_request.model().is_empty() {
+        match llm_providers.read().await.default() {
+            Some(default_provider) => {
+                let default_model = default_provider.name.clone();
+                info!(default_model = %default_model, "no model specified in request, using default provider");
+                client_request.set_model(default_model);
+            }
+            None => {
+                let err_msg = "No model specified in request and no default provider configured";
+                warn!("{}", err_msg);
+                let mut bad_request =
+                    Response::new(ResponseHandler::create_full_body(err_msg.to_string()));
+                *bad_request.status_mut() = StatusCode::BAD_REQUEST;
+                return Ok(bad_request);
+            }
        }
-    };
+    }

    let message: Vec<OpenAIMessage> = client_request.get_messages();

--- a/crates/brightstaff/src/handlers/llm.rs
+++ b/crates/brightstaff/src/handlers/llm.rs
@ -162,9 +162,30 @@ async fn llm_chat_inner(
        Some(SupportedAPIsFromClient::OpenAIResponsesAPI(_))
    );

+    // If model is not specified in the request, resolve from default provider
+    let model_from_request = client_request.model().to_string();
+    let model_from_request = if model_from_request.is_empty() {
+        match llm_providers.read().await.default() {
+            Some(default_provider) => {
+                let default_model = default_provider.name.clone();
+                info!(default_model = %default_model, "no model specified in request, using default provider");
+                client_request.set_model(default_model.clone());
+                default_model
+            }
+            None => {
+                let err_msg = "No model specified in request and no default provider configured";
+                warn!("{}", err_msg);
+                let mut bad_request = Response::new(full(err_msg.to_string()));
+                *bad_request.status_mut() = StatusCode::BAD_REQUEST;
+                return Ok(bad_request);
+            }
+        }
+    } else {
+        model_from_request
+    };
+
    // Model alias resolution: update model field in client_request immediately
    // This ensures all downstream objects use the resolved model
-    let model_from_request = client_request.model().to_string();
    let temperature = client_request.get_temperature();
    let is_streaming_request = client_request.is_streaming();
    let alias_resolved_model = resolve_model_alias(&model_from_request, &model_aliases);
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -211,6 +211,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
                            agents_list,
                            listeners,
                            span_attributes,
+                            llm_providers,
                        )
                        .with_context(parent_cx)
                        .await;
--- a/crates/hermesllm/src/apis/anthropic.rs
+++ b/crates/hermesllm/src/apis/anthropic.rs
@ -102,6 +102,7 @@ pub struct McpServer {
 #[skip_serializing_none]
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct MessagesRequest {
+    #[serde(default)]
    pub model: String,
    pub messages: Vec<MessagesMessage>,
    pub max_tokens: u32,
--- a/crates/hermesllm/src/apis/openai.rs
+++ b/crates/hermesllm/src/apis/openai.rs
@ -74,6 +74,7 @@ impl ApiDefinition for OpenAIApi {
 #[derive(Serialize, Deserialize, Debug, Clone, Default)]
 pub struct ChatCompletionsRequest {
    pub messages: Vec<Message>,
+    #[serde(default)]
    pub model: String,
    // pub audio: Option<Audio> // GOOD FIRST ISSUE: future support for audio input
    pub frequency_penalty: Option<f32>,
--- a/crates/hermesllm/src/apis/openai_responses.rs
+++ b/crates/hermesllm/src/apis/openai_responses.rs
@ -29,6 +29,7 @@ impl TryFrom<&[u8]> for ResponsesAPIResponse {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ResponsesAPIRequest {
    /// The model to use for generating the response
+    #[serde(default)]
    pub model: String,

    /// Text, image, or file inputs to the model
--- a/demos/llm_routing/openclaw_routing/config.yaml
+++ b/demos/llm_routing/openclaw_routing/config.yaml
@ -12,7 +12,6 @@ listeners:
    timeout: 30s

 llm_providers:
-
  # Kimi K2.5 — Moonshot AI's open model (1T MoE, 32B active params)
  # Great for general conversation, agentic tasks, and multimodal work
  # OpenAI-compatible API at $0.60/M input, $2.50/M output tokens
@ -21,13 +20,13 @@ llm_providers:
    base_url: https://api.moonshot.ai/v1
    default: true
    routing_preferences:
-      - name: code generation
-        description: generating code, writing scripts, implementing functions, and building tool integrations
+      - name: general conversation
+        description: general chat, greetings, casual conversation, Q&A, and everyday questions

  # Claude — Anthropic's most capable model
  # Best for complex reasoning, code, tool use, and evaluation
  - model: anthropic/claude-sonnet-4-5
    access_key: $ANTHROPIC_API_KEY
    routing_preferences:
-      - name: general conversation
-        description: general chat, greetings, casual conversation, Q&A, and everyday questions
+      - name: code generation
+        description: generating code, writing scripts, implementing functions, and building tool integrations
--- a/demos/llm_routing/preference_based_routing/README.md
+++ b/demos/llm_routing/preference_based_routing/README.md
@ -15,9 +15,9 @@ Make sure your machine is up to date with [latest version of plano]([url](https:
 ```bash
 (venv) $ planoai up --service plano --foreground
 # Or if installed with uv: uvx planoai up --service plano --foreground
-2025-05-30 18:00:09,953 - planoai.main - INFO - Starting plano cli version: 0.4.7
+2025-05-30 18:00:09,953 - planoai.main - INFO - Starting plano cli version: 0.4.8
 2025-05-30 18:00:09,953 - planoai.main - INFO - Validating /Users/adilhafeez/src/intelligent-prompt-gateway/demos/llm_routing/preference_based_routing/config.yaml
-2025-05-30 18:00:10,422 - cli.core - INFO - Starting plano gateway, image name: plano, tag: katanemo/plano:0.4.7
+2025-05-30 18:00:10,422 - cli.core - INFO - Starting plano gateway, image name: plano, tag: katanemo/plano:0.4.8
 2025-05-30 18:00:10,662 - cli.core - INFO - plano status: running, health status: starting
 2025-05-30 18:00:11,712 - cli.core - INFO - plano status: running, health status: starting
 2025-05-30 18:00:12,761 - cli.core - INFO - plano is running and is healthy!
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons
 project = "Plano Docs"
 copyright = "2025, Katanemo Labs, Inc"
 author = "Katanemo Labs, Inc"
-release = " v0.4.7"
+release = " v0.4.8"

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
--- a/docs/source/get_started/quickstart.rst
+++ b/docs/source/get_started/quickstart.rst
@ -37,7 +37,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins

 .. code-block:: console

-   $ uv tool install planoai==0.4.7
+   $ uv tool install planoai==0.4.8

 **Option 2: Install with pip (Traditional)**

@ -45,7 +45,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins

   $ python -m venv venv
   $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-   $ pip install planoai==0.4.7
+   $ pip install planoai==0.4.8


 .. _llm_routing_quickstart:
@ -90,7 +90,7 @@ Start Plano:

   $ planoai up plano_config.yaml
   # Or if installed with uv tool: uvx planoai up plano_config.yaml
-   2024-12-05 11:24:51,288 - planoai.main - INFO - Starting plano cli version: 0.4.7
+   2024-12-05 11:24:51,288 - planoai.main - INFO - Starting plano cli version: 0.4.8
   2024-12-05 11:24:51,825 - planoai.utils - INFO - Schema validation successful!
   2024-12-05 11:24:51,825 - planoai.main - INFO - Starting plano
   ...
--- a/docs/source/resources/deployment.rst
+++ b/docs/source/resources/deployment.rst
@ -25,7 +25,7 @@ Create a ``docker-compose.yml`` file with the following configuration:
   # docker-compose.yml
   services:
     plano:
-       image: katanemo/plano:0.4.7
+       image: katanemo/plano:0.4.8
       container_name: plano
       ports:
         - "10000:10000" # ingress (client -> plano)
--- a/docs/source/resources/tech_overview/request_lifecycle.rst
+++ b/docs/source/resources/tech_overview/request_lifecycle.rst
@ -46,6 +46,117 @@ Also, Plano utilizes `Envoy event-based thread model <https://blog.envoyproxy.io
 Worker threads rarely share state and operate in a trivially parallel fashion. This threading model
 enables scaling to very high core count CPUs.

+.. code-block:: text
+
+   ┌─────────────────────────────────────────────────────────────────────────────────────┐
+   │                                    P L A N O                                        │
+   │                  AI-native proxy and data plane for agentic applications            │
+   │                                                                                     │
+   │                              ┌─────────────────────┐                                │
+   │                              │    YOUR CLIENTS     │                                │
+   │                              │ (apps· agents · UI) │                                │
+   │                              └──────────┬──────────┘                                │
+   │                                         │                                           │
+   │          ┌──────────────────────────────┼──────────────────────────┐                │
+   │          │                              │                          │                │
+   │   ┌──────▼──────────┐         ┌─────────▼────────┐       ┌────────▼─────────┐       │
+   │   │  Agent Port(s)  │         │   Model Port     │       │  Function-Call   │       │
+   │   │  :8001+         │         │   :12000         │       │  Port  :10000    │       │
+   │   │                 │         │                  │       │                  │       │
+   │   │  route your     │         │  direct LLM      │       │  prompt-target / │       │
+   │   │  prompts to     │         │  calls with      │       │  tool dispatch   │       │
+   │   │  the right      │         │  model-alias     │       │  with parameter  │       │
+   │   │  agent          │         │  translation     │       │  extraction      │       │
+   │   └──────┬──────────┘         └─────────┬────────┘       └────────┬─────────┘       │
+   │          └──────────────────────────────┼─────────────────────────┘                 │
+   │                                         │                                           │
+   │  ╔══════════════════════════════════════▼══════════════════════════════════════╗    │
+   │  ║            BRIGHTSTAFF (SUBSYSTEM) —  Agentic Control Plane                 ║    │
+   │  ║            Async · non-blocking · parallel per-request Tokio tasks          ║    │
+   │  ║                                                                             ║    │
+   │  ║   ┌─────────────────────────────────────────────────────────────────────┐   ║    │
+   │  ║   │  Agentic ROUTER                                                     │   ║    │
+   │  ║   │  Reads listener config · maps incoming request to execution path    │   ║    │
+   │  ║   │                                                                     │   ║    │
+   │  ║   │   /agents/*  ──────────────────────►  AGENT PATH                    │   ║    │
+   │  ║   │   /v1/chat|messages|responses ──────►  LLM PATH                     │   ║    │
+   │  ║   └─────────────────────────────────────────────────────────────────────┘   ║    │
+   │  ║                                                                             ║    │
+   │  ║   ─────────────────────── AGENT PATH ────────────────────────────────────   ║    │
+   │  ║                                                                             ║    │
+   │  ║   ┌──────────────────────────────────────────────────────────────────────┐  ║    │
+   │  ║   │  FILTER CHAIN                        (pipeline_processor.rs)         │  ║    │
+   │  ║   │                                                                      │  ║    │
+   │  ║   │  prompt ──► [input_guards] ──► [query_rewrite] ──► [context_builder] │  ║    │
+   │  ║   │             guardrails       prompt mutation      RAG / enrichment   │  ║    │
+   │  ║   │                                                                      │  ║    │
+   │  ║   │  Each filter: HTTP or MCP · can mutate, enrich, or short-circuit     │  ║    │
+   │  ║   └──────────────────────────────────┬───────────────────────────────────┘  ║    │
+   │  ║                                      │                                      ║    │
+   │  ║   ┌──────────────────────────────────▼───────────────────────────────────┐  ║    │
+   │  ║   │  AGENT ORCHESTRATOR               (agent_chat_completions.rs)        │  ║    │
+   │  ║   │  Select agent · forward enriched request · manage conversation state │  ║    │
+   │  ║   │  Stream response back · multi-turn aware                             │  ║    │
+   │  ║   └──────────────────────────────────────────────────────────────────────┘  ║    │
+   │  ║                                                                             ║    │
+   │  ║   ─────────────────────── LLM PATH ──────────────────────────────────────   ║    │
+   │  ║                                                                             ║    │
+   │  ║   ┌──────────────────────────────────────────────────────────────────────┐  ║    │
+   │  ║   │  MODEL ROUTER                       (llm_router.rs + router_chat.rs) │  ║    │
+   │  ║   │  Model alias resolution · preference-based provider selection        │  ║    │
+   │  ║   │  "fast-llm" → gpt-4o-mini  ·  "smart-llm" → gpt-4o                   │  ║    │
+   │  ║   └──────────────────────────────────────────────────────────────────────┘  ║    │
+   │  ║                                                                             ║    │
+   │  ║   ─────────────────── ALWAYS ON (every request) ─────────────────────────   ║    │
+   │  ║                                                                             ║    │
+   │  ║   ┌────────────────────┐  ┌─────────────────────┐  ┌──────────────────┐     ║    │
+   │  ║   │  SIGNALS ANALYZER  │  │   STATE STORAGE     │  │  OTEL TRACING    │     ║    │
+   │  ║   │  loop detection    │  │   memory / postgres │  │  traceparent     │     ║    │
+   │  ║   │  repetition score  │  │   /v1/responses     │  │  span injection  │     ║    │
+   │  ║   │  quality indicators│  │   stateful API      │  │  trace export    │     ║    │
+   │  ║   └────────────────────┘  └─────────────────────┘  └──────────────────┘     ║    │
+   │  ╚═════════════════════════════════════╤═══════════════════════════════════════╝    │
+   │                                        │                                            │
+   │  ┌─────────────────────────────────────▼──────────────────────────────────────┐     │
+   │  │  LLM GATEWAY   (llm_gateway.wasm — embedded in Envoy egress filter chain)  │     │
+   │  │                                                                            │     │
+   │  │  Rate limiting  ·  Provider format translation  ·  TTFT metrics            │     │
+   │  │  OpenAI → Anthropic · Gemini · Mistral · Groq · DeepSeek · xAI · Bedrock   │     │
+   │  │                                                                            │     │
+   │  │  Envoy handles beneath this: TLS origination · SNI · retry + backoff       │     │
+   │  │  connection pooling · LOGICAL_DNS · structured access logs                 │     │
+   │  └─────────────────────────────────────┬──────────────────────────────────────┘     │
+   │                                         │                                           │
+   └─────────────────────────────────────────┼───────────────────────────────────────────┘
+                                             │
+                 ┌───────────────────────────┼────────────────────────────┐
+                 │                           │                             │
+       ┌─────────▼──────────┐   ┌────────────▼──────────┐   ┌────────────▼──────────┐
+       │  LLM PROVIDERS     │   │  EXTERNAL AGENTS      │   │  TOOL / API BACKENDS  │
+       │  OpenAI · Anthropic│   │  (filter chain svc)   │   │  (endpoint clusters)  │
+       │  Gemini · Mistral  │   │  HTTP / MCP  :10500+  │   │  user-defined hosts   │
+       │  Groq · DeepSeek   │   │  input_guards         │   │                       │
+       │  xAI · Together.ai │   │  query_rewriter       │   │                       │
+       └────────────────────┘   │  context_builder      │   └───────────────────────┘
+                                └───────────────────────┘
+
+
+     HOW PLANO IS DIFFERENT
+     ─────────────────────────────────────────────────────────────────────────────────
+     Brightstaff is the entire agentic brain — one async Rust binary that handles
+     agent selection, filter chain orchestration, model routing, state, and signals
+     without blocking a thread per request.
+
+     Filter chains are programmable dataplane steps — reusable HTTP/MCP services
+     you wire into any agent, executing in-path before the agent ever sees the prompt.
+
+     The LLM gateway is a zero-overhead WASM plugin inside Envoy — format translation
+     and rate limiting happen in-process with the proxy, not as a separate service hop.
+
+     Envoy provides the transport substrate (TLS, HTTP codecs, retries, connection
+     pools, access logs) so Plano never reimplements solved infrastructure problems.
+
+
 Request Flow (Ingress)
 ----------------------