move custom tracer to llm filter (#267)

This commit is contained in:
Adil Hafeez 2024-11-15 10:44:01 -08:00 committed by GitHub
parent 1d229cba8f
commit d3c17c7abd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 335 additions and 133 deletions

View file

@ -13,16 +13,21 @@ FROM envoyproxy/envoy:v1.32-latest as envoy
#Build config generator, so that we have a single build image for both Rust and Python #Build config generator, so that we have a single build image for both Rust and Python
FROM python:3.12-slim as arch FROM python:3.12-slim as arch
RUN apt-get update && apt-get install -y gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y gettext-base curl supervisor && apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=builder /arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm COPY --from=builder /arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
COPY --from=builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm COPY --from=builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
WORKDIR /config WORKDIR /app
COPY arch/requirements.txt . COPY arch/requirements.txt .
RUN pip install -r requirements.txt RUN pip install -r requirements.txt
COPY arch/tools/cli/config_generator.py . COPY arch/tools/cli/config_generator.py .
COPY arch/envoy.template.yaml . COPY arch/envoy.template.yaml .
COPY arch/arch_config_schema.yaml . COPY arch/arch_config_schema.yaml .
COPY arch/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY arch/stream_traces.py .
ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"] RUN pip install requests
RUN touch /var/log/envoy.log
ENTRYPOINT ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View file

@ -8,11 +8,11 @@ services:
- "12000:12000" - "12000:12000"
- "19901:9901" - "19901:9901"
volumes: volumes:
- ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/config/arch_config.yaml - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml
- /etc/ssl/cert.pem:/etc/ssl/cert.pem - /etc/ssl/cert.pem:/etc/ssl/cert.pem
- ./envoy.template.yaml:/config/envoy.template.yaml - ./envoy.template.yaml:/app/envoy.template.yaml
- ./arch_config_schema.yaml:/config/arch_config_schema.yaml - ./arch_config_schema.yaml:/app/arch_config_schema.yaml
- ./tools/cli/config_generator.py:/config/config_generator.py - ./tools/cli/config_generator.py:/app/config_generator.py
- ../crates/target/wasm32-wasip1/release/llm_gateway.wasm:/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm - ../crates/target/wasm32-wasip1/release/llm_gateway.wasm:/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
- ../crates/target/wasm32-wasip1/release/prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm - ../crates/target/wasm32-wasip1/release/prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
- ~/archgw_logs:/var/log/ - ~/archgw_logs:/var/log/
@ -21,3 +21,4 @@ services:
environment: environment:
- OPENAI_API_KEY=${OPENAI_API_KEY:?error} - OPENAI_API_KEY=${OPENAI_API_KEY:?error}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error}
- OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces

View file

@ -16,3 +16,4 @@ services:
environment: environment:
- OPENAI_API_KEY=${OPENAI_API_KEY:?error} - OPENAI_API_KEY=${OPENAI_API_KEY:?error}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error}
- OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces

View file

@ -8,11 +8,13 @@ services:
- "12000:12000" - "12000:12000"
- "19901:9901" - "19901:9901"
volumes: volumes:
- ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/config/arch_config.yaml - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml
- /etc/ssl/cert.pem:/etc/ssl/cert.pem - /etc/ssl/cert.pem:/etc/ssl/cert.pem
- ~/archgw_logs:/var/log/ - ~/archgw_logs:/var/log/
env_file: env_file:
- env.list - env.list
environment:
- OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
healthcheck: healthcheck:

View file

@ -242,11 +242,66 @@ static_resources:
typed_config: typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_listener_llm
- name: arch_listener_http_llm
address: address:
socket_address: socket_address:
address: 0.0.0.0 address: 0.0.0.0
port_value: 12000 port_value: 12000
traffic_direction: INBOUND
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
generate_request_id: true
tracing:
provider:
name: envoy.tracers.opentelemetry
typed_config:
"@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
grpc_service:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: arch_gateway
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_listener_http
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
access_log:
- name: envoy.access_loggers.file
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
path: "/var/log/access_llm.log"
route_config:
name: local_routes
virtual_hosts:
- name: local_service
domains:
- "*"
routes:
- match:
prefix: "/"
route:
auto_host_rewrite: true
cluster: arch_listener_llm
timeout: 60s
http_filters:
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_listener_llm
address:
socket_address:
address: 0.0.0.0
port_value: 12001
filter_chains: filter_chains:
- filters: - filters:
- name: envoy.filters.network.http_connection_manager - name: envoy.filters.network.http_connection_manager
@ -479,6 +534,23 @@ static_resources:
port_value: 10001 port_value: 10001
hostname: arch_prompt_gateway_listener hostname: arch_prompt_gateway_listener
- name: arch_listener_llm
connect_timeout: 5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: arch_listener_llm
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 0.0.0.0
port_value: 12001
hostname: arch_listener_llm
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
- name: opentelemetry_collector - name: opentelemetry_collector
type: STRICT_DNS type: STRICT_DNS

View file

@ -1,4 +1,5 @@
import os import os
import sys
import time import time
import requests import requests
import logging import logging
@ -29,14 +30,8 @@ def process_log_line(line):
logging.error(f"Failed to send trace to otel-tracing: {e}") logging.error(f"Failed to send trace to otel-tracing: {e}")
with open(envoy_log_path, "r") as f: for line in sys.stdin:
# Seek to the end of the file so we only read new lines if line:
f.seek(0, os.SEEK_END) tokens = line.split("gateway: upstream_llm trace details: ")
while True:
line = f.readline()
if not line:
time.sleep(1)
continue
tokens = line.split("prompt_gateway: upstream_llm trace details: ")
if len(tokens) > 1: if len(tokens) > 1:
process_log_line(tokens[1]) process_log_line(tokens[1])

25
arch/supervisord.conf Normal file
View file

@ -0,0 +1,25 @@
[supervisord]
nodaemon=true
[program:trace_streamer]
command=sh -c "tail -F /var/log/envoy.log | python stream_traces.py"
autostart=true
autorestart=false
startretries=3
priority=1
stdout_logfile=/dev/stdout
stderr_logfile=/dev/stderr
stdout_logfile_maxbytes = 0
stderr_logfile_maxbytes = 0
[program:envoy]
command=sh -c "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"
autostart=true
autorestart=true
startretries=3
priority=2
stdout_logfile=/dev/stdout
stderr_logfile=/dev/stderr
stdout_logfile_maxbytes = 0
stderr_logfile_maxbytes = 0

View file

@ -6,7 +6,7 @@ from jsonschema import validate
ENVOY_CONFIG_TEMPLATE_FILE = os.getenv( ENVOY_CONFIG_TEMPLATE_FILE = os.getenv(
"ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml" "ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml"
) )
ARCH_CONFIG_FILE = os.getenv("ARCH_CONFIG_FILE", "/config/arch_config.yaml") ARCH_CONFIG_FILE = os.getenv("ARCH_CONFIG_FILE", "/app/arch_config.yaml")
ENVOY_CONFIG_FILE_RENDERED = os.getenv( ENVOY_CONFIG_FILE_RENDERED = os.getenv(
"ENVOY_CONFIG_FILE_RENDERED", "/etc/envoy/envoy.yaml" "ENVOY_CONFIG_FILE_RENDERED", "/etc/envoy/envoy.yaml"
) )

View file

@ -21,22 +21,25 @@
"path": "e2e_tests" "path": "e2e_tests"
}, },
{ {
"name": "demos/weather_forecast", "name": "chatbot_ui",
"path": "./demos/weather_forecast", "path": "demos/shared/chatbot_ui"
}, }
{
"name": "demos/insurance_agent",
"path": "./demos/insurance_agent",
},
], ],
"settings": { "settings": {
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
}, },
"extensions": { "extensions": {
"recommendations": [ "recommendations": [
"ms-python.python", "ms-python.python",
"ms-python.debugpy", "ms-python.debugpy",
"rust-lang.rust-analyzer", "rust-lang.rust-analyzer",
"humao.rest-client" "humao.rest-client",
"github.copilot",
"eamodio.gitlens",
"ms-python.black-formatter",
] ]
} }
} }

View file

@ -1,17 +1,18 @@
use crate::filter_context::WasmMetrics; use crate::filter_context::WasmMetrics;
use common::common_types::open_ai::{ use common::common_types::open_ai::{
ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse, ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
StreamOptions, Message, StreamOptions,
}; };
use common::configuration::LlmProvider; use common::configuration::LlmProvider;
use common::consts::{ use common::consts::{
ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH,
RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER,
}; };
use common::errors::ServerError; use common::errors::ServerError;
use common::llm_providers::LlmProviders; use common::llm_providers::LlmProviders;
use common::pii::obfuscate_auth_header; use common::pii::obfuscate_auth_header;
use common::ratelimit::Header; use common::ratelimit::Header;
use common::tracing::{Event, Span};
use common::{ratelimit, routing, tokenizer}; use common::{ratelimit, routing, tokenizer};
use http::StatusCode; use http::StatusCode;
use log::{debug, trace, warn}; use log::{debug, trace, warn};
@ -23,7 +24,7 @@ use std::rc::Rc;
use common::stats::{IncrementingMetric, RecordingMetric}; use common::stats::{IncrementingMetric, RecordingMetric};
use proxy_wasm::hostcalls::get_current_time; use proxy_wasm::hostcalls::get_current_time;
use std::time::{Duration, SystemTime}; use std::time::{Duration, SystemTime, UNIX_EPOCH};
pub struct StreamContext { pub struct StreamContext {
context_id: u32, context_id: u32,
@ -36,7 +37,10 @@ pub struct StreamContext {
llm_provider: Option<Rc<LlmProvider>>, llm_provider: Option<Rc<LlmProvider>>,
request_id: Option<String>, request_id: Option<String>,
start_time: Option<SystemTime>, start_time: Option<SystemTime>,
ttft_duration: Option<Duration>, // Store the duration directly ttft_duration: Option<Duration>,
ttft_time: Option<SystemTime>,
pub traceparent: Option<String>,
user_message: Option<Message>,
} }
impl StreamContext { impl StreamContext {
@ -53,6 +57,9 @@ impl StreamContext {
request_id: None, request_id: None,
start_time: None, start_time: None,
ttft_duration: None, ttft_duration: None,
traceparent: None,
ttft_time: None,
user_message: None,
} }
} }
fn llm_provider(&self) -> &LlmProvider { fn llm_provider(&self) -> &LlmProvider {
@ -176,9 +183,10 @@ impl HttpContext for StreamContext {
); );
self.request_id = self.get_http_request_header(REQUEST_ID_HEADER); self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);
self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER);
//start the timing for the request using get_current_time() //start the timing for the request using get_current_time()
let current_time = get_current_time().unwrap(); let current_time: SystemTime = get_current_time().unwrap();
self.start_time = Some(current_time); self.start_time = Some(current_time);
self.ttft_duration = None; self.ttft_duration = None;
@ -229,6 +237,13 @@ impl HttpContext for StreamContext {
message.model = None; message.model = None;
} }
self.user_message = deserialized_body
.messages
.iter()
.filter(|m| m.role == "user")
.last()
.cloned();
// override model name from the llm provider // override model name from the llm provider
deserialized_body deserialized_body
.model .model
@ -318,6 +333,52 @@ impl HttpContext for StreamContext {
.output_sequence_length .output_sequence_length
.record(self.response_tokens as u64); .record(self.response_tokens as u64);
if let Some(traceparent) = self.traceparent.as_ref() {
let since_the_epoch_ns = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos();
let traceparent_tokens = traceparent.split("-").collect::<Vec<&str>>();
if traceparent_tokens.len() != 4 {
warn!("traceparent header is invalid: {}", traceparent);
return Action::Continue;
}
let parent_trace_id = traceparent_tokens[1];
let parent_span_id = traceparent_tokens[2];
let mut trace_data = common::tracing::TraceData::new();
let mut llm_span = Span::new(
"upstream_llm_time".to_string(),
parent_trace_id.to_string(),
Some(parent_span_id.to_string()),
self.start_time
.unwrap()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos(),
since_the_epoch_ns,
);
if let Some(user_message) = self.user_message.as_ref() {
if let Some(prompt) = user_message.content.as_ref() {
llm_span.add_attribute("user_prompt".to_string(), prompt.to_string());
}
}
llm_span.add_attribute("model".to_string(), self.llm_provider().name.to_string());
llm_span.add_event(Event::new(
"time_to_first_token".to_string(),
self.ttft_time
.unwrap()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos(),
));
trace_data.add_span(llm_span);
let trace_data_str = serde_json::to_string(&trace_data).unwrap();
debug!("upstream_llm trace details: {}", trace_data_str);
// send trace_data to http tracing endpoint
}
return Action::Continue; return Action::Continue;
} }
@ -413,6 +474,7 @@ impl HttpContext for StreamContext {
if self.ttft_duration.is_none() { if self.ttft_duration.is_none() {
if let Some(start_time) = self.start_time { if let Some(start_time) = self.start_time {
let current_time = get_current_time().unwrap(); let current_time = get_current_time().unwrap();
self.ttft_time = Some(current_time);
match current_time.duration_since(start_time) { match current_time.duration_since(start_time) {
Ok(duration) => { Ok(duration) => {
let duration_ms = duration.as_millis(); let duration_ms = duration.as_millis();

View file

@ -51,6 +51,8 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
.expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id")) .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
.returning(None) .returning(None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent"))
.returning(None)
.expect_get_current_time_nanos() .expect_get_current_time_nanos()
.returning(Some(0)) .returning(Some(0))
.execute_and_expect(ReturnType::Action(Action::Continue)) .execute_and_expect(ReturnType::Action(Action::Continue))

View file

@ -18,7 +18,6 @@ use common::{
errors::ServerError, errors::ServerError,
http::{CallArgs, Client}, http::{CallArgs, Client},
pii::obfuscate_auth_header, pii::obfuscate_auth_header,
tracing::{Event, Span},
}; };
use http::StatusCode; use http::StatusCode;
use log::{debug, trace, warn}; use log::{debug, trace, warn};
@ -265,42 +264,6 @@ impl HttpContext for StreamContext {
} }
if end_of_stream && body_size == 0 { if end_of_stream && body_size == 0 {
if let Some(traceparent) = self.traceparent.as_ref() {
let since_the_epoch_ns = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos();
let traceparent_tokens = traceparent.split("-").collect::<Vec<&str>>();
if traceparent_tokens.len() != 4 {
warn!("traceparent header is invalid: {}", traceparent);
return Action::Continue;
}
let parent_trace_id = traceparent_tokens[1];
let parent_span_id = traceparent_tokens[2];
let mut trace_data = common::tracing::TraceData::new();
let mut llm_span = Span::new(
"upstream_llm_time".to_string(),
parent_trace_id.to_string(),
Some(parent_span_id.to_string()),
self.start_upstream_llm_request_time,
since_the_epoch_ns,
);
if let Some(prompt) = self.user_prompt.as_ref() {
if let Some(content) = prompt.content.as_ref() {
llm_span.add_attribute("user_prompt".to_string(), content.to_string());
}
}
llm_span.add_event(Event::new(
"time_to_first_token".to_string(),
self.time_to_first_token.unwrap(),
));
trace_data.add_span(llm_span);
let trace_data_str = serde_json::to_string(&trace_data).unwrap();
debug!("upstream_llm trace details: {}", trace_data_str);
// send trace_data to http tracing endpoint
}
return Action::Continue; return Action::Continue;
} }

View file

@ -0,0 +1,19 @@
# LLM Routing
This demo shows how you can use arch gateway to manage keys and route to the appropriate LLM.
# Starting the demo
1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly
1. Start Arch
```sh
sh run_demo.sh
```
1. Navigate to http://localhost:18080/
# Observability
Arch gateway publishes a stats endpoint at http://localhost:19901/stats. In this demo we use prometheus to pull stats from arch, and grafana to visualize the stats in a dashboard. To see the grafana dashboard, follow the instructions below:
1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials)
1. From grafana left nav click on dashboards and select "Intelligent Gateway Overview" to view arch gateway stats
# Selecting different LLM
You can pick a different LLM by setting the header `x-arch-llm-provider-hint` to override the default LLM.

View file

@ -2,7 +2,7 @@ services:
chatbot_ui: chatbot_ui:
build: build:
context: ../../chatbot_ui context: ../shared/chatbot_ui
dockerfile: Dockerfile dockerfile: Dockerfile
ports: ports:
- "18080:8080" - "18080:8080"
@ -12,3 +12,21 @@ services:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
volumes: volumes:
- ./arch_config.yaml:/app/arch_config.yaml - ./arch_config.yaml:/app/arch_config.yaml
jaeger:
build:
context: ../shared/jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
prometheus:
build:
context: ../shared/prometheus
grafana:
build:
context: ../shared/grafana
ports:
- "3000:3000"

View file

@ -15,19 +15,21 @@
"LLM": "1", "LLM": "1",
"CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1", "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
"STREAMING": "True", "STREAMING": "True",
"ARCH_CONFIG": "../demos/weather_forecast/arch_config.yaml" "ARCH_CONFIG": "../../weather_forecast/arch_config.yaml"
} }
}, },
{ {
"python": "${workspaceFolder}/venv/bin/python",
"name": "chatbot-ui llm", "name": "chatbot-ui llm",
"cwd": "${workspaceFolder}/app",
"type": "debugpy", "type": "debugpy",
"request": "launch", "request": "launch",
"program": "run.py", "program": "run_stream.py",
"console": "integratedTerminal", "console": "integratedTerminal",
"env": { "env": {
"LLM": "1", "LLM": "1",
"CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1" "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1",
"STREAMING": "True",
"ARCH_CONFIG": "../../llm_routing/arch_config.yaml"
} }
}, },
] ]

View file

@ -1,3 +1,4 @@
from datetime import datetime
import json import json
import logging import logging
import os import os
@ -159,13 +160,44 @@ def get_prompt_targets():
config = yaml.safe_load(file) config = yaml.safe_load(file)
available_tools = [] available_tools = []
for target in config["prompt_targets"]: if "prompt_targets" in config:
if not target.get("default", False): for target in config["prompt_targets"]:
available_tools.append( if not target.get("default", False):
convert_prompt_target_to_openai_format(target) available_tools.append(
) convert_prompt_target_to_openai_format(target)
)
return {tool["name"]: tool["info"] for tool in available_tools}
elif "llm_providers" in config:
return config["llm_providers"]
return {tool["name"]: tool["info"] for tool in available_tools}
except Exception as e: except Exception as e:
log.info(e) log.info(e)
return None return None
def get_llm_models():
try:
with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
config = yaml.safe_load(file)
available_models = [""]
default_llm = None
for llm_providers in config["llm_providers"]:
if llm_providers.get("default", False):
default_llm = llm_providers["name"]
else:
available_models.append(llm_providers["name"])
# place default model at the beginning of the list
if default_llm:
available_models.insert(0, default_llm)
return available_models
except Exception as e:
log.info(e)
return []
def format_log(message):
time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
return f"{time_now} - {message}"

View file

@ -8,7 +8,7 @@ from typing import List, Optional, Tuple
from openai import OpenAI from openai import OpenAI
from dotenv import load_dotenv from dotenv import load_dotenv
from common import get_prompt_targets, process_stream_chunk from common import format_log, get_llm_models, get_prompt_targets, process_stream_chunk
load_dotenv() load_dotenv()
@ -36,20 +36,28 @@ CSS_STYLE = """
footer {visibility: hidden} footer {visibility: hidden}
""" """
client = OpenAI(
api_key="--",
base_url=CHAT_COMPLETION_ENDPOINT,
)
def chat( def chat(
query: Optional[str], query: Optional[str],
conversation: Optional[List[Tuple[str, str]]], conversation: Optional[List[Tuple[str, str]]],
history: List[dict], history: List[dict],
debug_output: str,
model_selector: str,
): ):
history.append({"role": "user", "content": query}) history.append({"role": "user", "content": query})
if debug_output is None:
debug_output = ""
try: try:
headers = {}
if model_selector and model_selector != "":
headers["x-arch-llm-provider-hint"] = model_selector
client = OpenAI(
api_key="--",
base_url=CHAT_COMPLETION_ENDPOINT,
default_headers=headers,
)
response = client.chat.completions.create( response = client.chat.completions.create(
# we select model from arch_config file # we select model from arch_config file
model="--", model="--",
@ -65,15 +73,20 @@ def chat(
conversation.append((query, "")) conversation.append((query, ""))
model_is_set = False
for chunk in response: for chunk in response:
tokens = process_stream_chunk(chunk, history) tokens = process_stream_chunk(chunk, history)
if tokens and not model_is_set:
model_is_set = True
model = history[-1]["model"]
debug_output = debug_output + "\n" + format_log(f"model: {model}")
if tokens: if tokens:
conversation[-1] = ( conversation[-1] = (
conversation[-1][0], conversation[-1][0],
conversation[-1][1] + tokens, conversation[-1][1] + tokens,
) )
yield "", conversation, history yield "", conversation, history, debug_output, model_selector
def main(): def main():
@ -94,8 +107,17 @@ def main():
value=get_prompt_targets(), value=get_prompt_targets(),
show_indices=False, show_indices=False,
elem_classes="json-container", elem_classes="json-container",
min_height="95vh", min_height="50vh",
) )
model_selector_textbox = gr.Dropdown(
get_llm_models(),
label="override model",
elem_classes="dropdown",
)
debug_output = gr.TextArea(
label="debug output",
elem_classes="debug_output",
)
with gr.Column(scale=2): with gr.Column(scale=2):
chatbot = gr.Chatbot( chatbot = gr.Chatbot(
@ -110,7 +132,9 @@ def main():
) )
textbox.submit( textbox.submit(
chat, [textbox, chatbot, history], [textbox, chatbot, history] chat,
[textbox, chatbot, history, debug_output, model_selector_textbox],
[textbox, chatbot, history, debug_output, model_selector_textbox],
) )
demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True) demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)

View file

@ -190,8 +190,8 @@
"targets": [ "targets": [
{ {
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "code",
"expr": "histogram_quantile(0.5, sum by(le) (rate(input_sequence_length_bucket[1h])))", "expr": "histogram_quantile(0.9, sum by(le) (rate(input_sequence_length_bucket[5m])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"legendFormat": "__auto", "legendFormat": "__auto",
@ -200,7 +200,7 @@
"useBackend": false "useBackend": false
} }
], ],
"title": "input sequence length (p50)", "title": "input sequence length (p90)",
"type": "timeseries" "type": "timeseries"
}, },
{ {
@ -305,7 +305,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.5, sum(rate(output_sequence_length_bucket[1h])) by(le))", "expr": "histogram_quantile(0.9, sum(rate(output_sequence_length_bucket[5m])) by(le))",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"instant": false, "instant": false,
@ -315,7 +315,7 @@
"useBackend": false "useBackend": false
} }
], ],
"title": "output sequence length (p50)", "title": "output sequence length (p90)",
"type": "timeseries" "type": "timeseries"
}, },
{ {
@ -415,7 +415,11 @@
{ {
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
<<<<<<< HEAD
"expr": "histogram_quantile(0.9, sum by(le) (rate(time_to_first_token_bucket[5m])))",
=======
"expr": "histogram_quantile(0.5, sum by(le) (rate(time_to_first_token_bucket[1h])))", "expr": "histogram_quantile(0.5, sum by(le) (rate(time_to_first_token_bucket[1h])))",
>>>>>>> main
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"legendFormat": "__auto", "legendFormat": "__auto",
@ -424,7 +428,7 @@
"useBackend": false "useBackend": false
} }
], ],
"title": "time to first token (p50)", "title": "time to first token (p90)",
"type": "timeseries" "type": "timeseries"
}, },
{ {
@ -539,20 +543,29 @@
"type": "prometheus", "type": "prometheus",
"uid": "PBFA97CFB590B2093" "uid": "PBFA97CFB590B2093"
}, },
<<<<<<< HEAD
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum(rate(request_latency_bucket[60m])) by (le))",
=======
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))", "expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))",
"fullMetaSearch": false, "fullMetaSearch": false,
>>>>>>> main
"hide": false, "hide": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"instant": false, "instant": false,
"legendFormat": "__auto", "legendFormat": "__auto",
"range": true, "range": true,
<<<<<<< HEAD
"refId": "B"
=======
"refId": "A", "refId": "A",
"useBackend": false "useBackend": false
>>>>>>> main
} }
], ],
"title": "request latency (p50)", "title": "request latency (p90)",
"type": "timeseries" "type": "timeseries"
}, },
{ {

View file

@ -1,11 +0,0 @@
FROM python:3.12-slim as arch
WORKDIR /app
RUN pip install requests
COPY stream_traces.py .
RUN mkdir -p /var/log
RUN touch /var/log/envoy.log
CMD ["python", "stream_traces.py"]

View file

@ -30,14 +30,6 @@ services:
- "4317:4317" - "4317:4317"
- "4318:4318" - "4318:4318"
trace_streamer:
build:
context: ../shared/trace_streamer
environment:
- OTEL_TRACING_HTTP_ENDPOINT=http://jaeger:4318/v1/traces
volumes:
- ~/archgw_logs:/var/log/
prometheus: prometheus:
build: build:
context: ../shared/prometheus context: ../shared/prometheus

View file

@ -25,14 +25,6 @@ services:
volumes: volumes:
- ./arch_config.yaml:/app/arch_config.yaml - ./arch_config.yaml:/app/arch_config.yaml
trace_streamer:
build:
context: ../shared/trace_streamer
environment:
- OTEL_TRACING_HTTP_ENDPOINT=http://otel-collector:4318/v1/traces
volumes:
- ~/archgw_logs:/var/log/
prometheus: prometheus:
build: build:
context: ../shared/prometheus context: ../shared/prometheus

View file

@ -29,32 +29,24 @@ cd ../demos/weather_forecast
docker compose up weather_forecast_service --build -d docker compose up weather_forecast_service --build -d
cd - cd -
print_disk_usage
log building and install model server log building and install model server
log ================================= log =================================
cd ../model_server cd ../model_server
poetry install poetry install
cd - cd -
print_disk_usage
log building and installing archgw cli log building and installing archgw cli
log ================================== log ==================================
cd ../arch/tools cd ../arch/tools
sh build_cli.sh sh build_cli.sh
cd - cd -
print_disk_usage
log building docker image for arch gateway log building docker image for arch gateway
log ====================================== log ======================================
cd ../ cd ../
archgw build archgw build
cd - cd -
print_disk_usage
log startup arch gateway with function calling demo log startup arch gateway with function calling demo
cd .. cd ..
tail -F ~/archgw_logs/modelserver.log & tail -F ~/archgw_logs/modelserver.log &
@ -64,8 +56,6 @@ archgw up demos/weather_forecast/arch_config.yaml
kill $model_server_tail_pid kill $model_server_tail_pid
cd - cd -
print_disk_usage
log running e2e tests log running e2e tests
log ================= log =================
poetry install poetry install