From d3c17c7abd7034aa7d9dbba8ea83c9bfdf7a30a0 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 15 Nov 2024 10:44:01 -0800 Subject: [PATCH] move custom tracer to llm filter (#267) --- arch/Dockerfile | 11 ++- arch/docker-compose.dev.yaml | 9 ++- arch/docker-compose.e2e.yaml | 1 + arch/docker-compose.yaml | 4 +- arch/envoy.template.yaml | 74 ++++++++++++++++++- .../trace_streamer => arch}/stream_traces.py | 13 +--- arch/supervisord.conf | 25 +++++++ arch/tools/cli/config_generator.py | 2 +- archgw.code-workspace | 19 +++-- crates/llm_gateway/src/stream_context.rs | 72 ++++++++++++++++-- crates/llm_gateway/tests/integration.rs | 2 + crates/prompt_gateway/src/http_context.rs | 37 ---------- demos/llm_routing/README.md | 19 +++++ demos/llm_routing/docker-compose.yaml | 20 ++++- demos/shared/chatbot_ui/.vscode/launch.json | 10 ++- demos/shared/chatbot_ui/common.py | 44 +++++++++-- demos/shared/chatbot_ui/run_stream.py | 42 ++++++++--- .../grafana/dashboards/envoy_overview.json | 27 +++++-- demos/shared/trace_streamer/Dockerfile | 11 --- demos/weather_forecast/docker-compose.yaml | 8 -- .../docker-compose.yaml | 8 -- e2e_tests/run_e2e_tests.sh | 10 --- 22 files changed, 335 insertions(+), 133 deletions(-) rename {demos/shared/trace_streamer => arch}/stream_traces.py (74%) create mode 100644 arch/supervisord.conf create mode 100644 demos/llm_routing/README.md delete mode 100644 demos/shared/trace_streamer/Dockerfile diff --git a/arch/Dockerfile b/arch/Dockerfile index 74cfd40a..0d96713c 100644 --- a/arch/Dockerfile +++ b/arch/Dockerfile @@ -13,16 +13,21 @@ FROM envoyproxy/envoy:v1.32-latest as envoy #Build config generator, so that we have a single build image for both Rust and Python FROM python:3.12-slim as arch -RUN apt-get update && apt-get install -y gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y gettext-base curl supervisor && apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=builder 
/arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm COPY --from=builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy -WORKDIR /config +WORKDIR /app COPY arch/requirements.txt . RUN pip install -r requirements.txt COPY arch/tools/cli/config_generator.py . COPY arch/envoy.template.yaml . COPY arch/arch_config_schema.yaml . +COPY arch/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +COPY arch/stream_traces.py . -ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"] +RUN pip install requests +RUN touch /var/log/envoy.log + +ENTRYPOINT ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/arch/docker-compose.dev.yaml b/arch/docker-compose.dev.yaml index 134f3853..378e0eca 100644 --- a/arch/docker-compose.dev.yaml +++ b/arch/docker-compose.dev.yaml @@ -8,11 +8,11 @@ services: - "12000:12000" - "19901:9901" volumes: - - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/config/arch_config.yaml + - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml - /etc/ssl/cert.pem:/etc/ssl/cert.pem - - ./envoy.template.yaml:/config/envoy.template.yaml - - ./arch_config_schema.yaml:/config/arch_config_schema.yaml - - ./tools/cli/config_generator.py:/config/config_generator.py + - ./envoy.template.yaml:/app/envoy.template.yaml + - ./arch_config_schema.yaml:/app/arch_config_schema.yaml + - ./tools/cli/config_generator.py:/app/config_generator.py - ../crates/target/wasm32-wasip1/release/llm_gateway.wasm:/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm - ../crates/target/wasm32-wasip1/release/prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm - ~/archgw_logs:/var/log/ @@ 
-21,3 +21,4 @@ services: environment: - OPENAI_API_KEY=${OPENAI_API_KEY:?error} - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} + - OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces diff --git a/arch/docker-compose.e2e.yaml b/arch/docker-compose.e2e.yaml index 42195962..208dc1e7 100644 --- a/arch/docker-compose.e2e.yaml +++ b/arch/docker-compose.e2e.yaml @@ -16,3 +16,4 @@ services: environment: - OPENAI_API_KEY=${OPENAI_API_KEY:?error} - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} + - OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces diff --git a/arch/docker-compose.yaml b/arch/docker-compose.yaml index 05d2c05f..51874ead 100644 --- a/arch/docker-compose.yaml +++ b/arch/docker-compose.yaml @@ -8,11 +8,13 @@ services: - "12000:12000" - "19901:9901" volumes: - - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/config/arch_config.yaml + - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml - /etc/ssl/cert.pem:/etc/ssl/cert.pem - ~/archgw_logs:/var/log/ env_file: - env.list + environment: + - OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces extra_hosts: - "host.docker.internal:host-gateway" healthcheck: diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml index f08a2b2f..52671f99 100644 --- a/arch/envoy.template.yaml +++ b/arch/envoy.template.yaml @@ -242,11 +242,66 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - name: arch_listener_llm + + - name: arch_listener_http_llm address: socket_address: address: 0.0.0.0 port_value: 12000 + traffic_direction: INBOUND + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} + generate_request_id: true + 
tracing: + provider: + name: envoy.tracers.opentelemetry + typed_config: + "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig + grpc_service: + envoy_grpc: + cluster_name: opentelemetry_collector + timeout: 0.250s + service_name: arch_gateway + random_sampling: + value: {{ arch_tracing.random_sampling }} + {% endif %} + stat_prefix: arch_listener_http + codec_type: AUTO + scheme_header_transformation: + scheme_to_overwrite: https + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: "/var/log/access_llm.log" + route_config: + name: local_routes + virtual_hosts: + - name: local_service + domains: + - "*" + routes: + - match: + prefix: "/" + route: + auto_host_rewrite: true + cluster: arch_listener_llm + timeout: 60s + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + + + - name: arch_listener_llm + address: + socket_address: + address: 0.0.0.0 + port_value: 12001 filter_chains: - filters: - name: envoy.filters.network.http_connection_manager @@ -479,6 +534,23 @@ static_resources: port_value: 10001 hostname: arch_prompt_gateway_listener + - name: arch_listener_llm + connect_timeout: 5s + type: LOGICAL_DNS + dns_lookup_family: V4_ONLY + lb_policy: ROUND_ROBIN + load_assignment: + cluster_name: arch_listener_llm + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 0.0.0.0 + port_value: 12001 + hostname: arch_listener_llm + + {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} - name: opentelemetry_collector type: STRICT_DNS diff --git a/demos/shared/trace_streamer/stream_traces.py b/arch/stream_traces.py similarity index 74% rename from demos/shared/trace_streamer/stream_traces.py rename to arch/stream_traces.py index 4f1bf20c..1a165a8a 100644 --- 
a/demos/shared/trace_streamer/stream_traces.py +++ b/arch/stream_traces.py @@ -1,4 +1,5 @@ import os +import sys import time import requests import logging @@ -29,14 +30,8 @@ def process_log_line(line): logging.error(f"Failed to send trace to otel-tracing: {e}") -with open(envoy_log_path, "r") as f: - # Seek to the end of the file so we only read new lines - f.seek(0, os.SEEK_END) - while True: - line = f.readline() - if not line: - time.sleep(1) - continue - tokens = line.split("prompt_gateway: upstream_llm trace details: ") +for line in sys.stdin: + if line: + tokens = line.split("gateway: upstream_llm trace details: ") if len(tokens) > 1: process_log_line(tokens[1]) diff --git a/arch/supervisord.conf b/arch/supervisord.conf new file mode 100644 index 00000000..da659e65 --- /dev/null +++ b/arch/supervisord.conf @@ -0,0 +1,25 @@ +[supervisord] +nodaemon=true + +[program:trace_streamer] +command=sh -c "tail -F /var/log/envoy.log | python stream_traces.py" +autostart=true +autorestart=false +startretries=3 +priority=1 +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes = 0 +stderr_logfile_maxbytes = 0 + + +[program:envoy] +command=sh -c "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log" +autostart=true +autorestart=true +startretries=3 +priority=2 +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes = 0 +stderr_logfile_maxbytes = 0 diff --git a/arch/tools/cli/config_generator.py b/arch/tools/cli/config_generator.py index 90c1406f..b8c35562 100644 --- a/arch/tools/cli/config_generator.py +++ b/arch/tools/cli/config_generator.py @@ -6,7 +6,7 @@ from jsonschema import validate ENVOY_CONFIG_TEMPLATE_FILE = os.getenv( "ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml" ) -ARCH_CONFIG_FILE = os.getenv("ARCH_CONFIG_FILE", "/config/arch_config.yaml") +ARCH_CONFIG_FILE = 
os.getenv("ARCH_CONFIG_FILE", "/app/arch_config.yaml") ENVOY_CONFIG_FILE_RENDERED = os.getenv( "ENVOY_CONFIG_FILE_RENDERED", "/etc/envoy/envoy.yaml" ) diff --git a/archgw.code-workspace b/archgw.code-workspace index 3e36fff2..07b23996 100644 --- a/archgw.code-workspace +++ b/archgw.code-workspace @@ -21,22 +21,25 @@ "path": "e2e_tests" }, { - "name": "demos/weather_forecast", - "path": "./demos/weather_forecast", - }, - { - "name": "demos/insurance_agent", - "path": "./demos/insurance_agent", - }, + "name": "chatbot_ui", + "path": "demos/shared/chatbot_ui" + } ], "settings": { + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true + }, }, "extensions": { "recommendations": [ "ms-python.python", "ms-python.debugpy", "rust-lang.rust-analyzer", - "humao.rest-client" + "humao.rest-client", + "github.copilot", + "eamodio.gitlens", + "ms-python.black-formatter", ] } } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 38266f72..7e35e7f2 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -1,17 +1,18 @@ use crate::filter_context::WasmMetrics; use common::common_types::open_ai::{ ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse, - StreamOptions, + Message, StreamOptions, }; use common::configuration::LlmProvider; use common::consts::{ ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, - RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, + RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER, }; use common::errors::ServerError; use common::llm_providers::LlmProviders; use common::pii::obfuscate_auth_header; use common::ratelimit::Header; +use common::tracing::{Event, Span}; use common::{ratelimit, routing, tokenizer}; use http::StatusCode; use log::{debug, trace, warn}; @@ -23,7 +24,7 @@ use std::rc::Rc; use common::stats::{IncrementingMetric, 
RecordingMetric}; use proxy_wasm::hostcalls::get_current_time; -use std::time::{Duration, SystemTime}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; pub struct StreamContext { context_id: u32, @@ -36,7 +37,10 @@ pub struct StreamContext { llm_provider: Option>, request_id: Option, start_time: Option, - ttft_duration: Option, // Store the duration directly + ttft_duration: Option, + ttft_time: Option, + pub traceparent: Option, + user_message: Option, } impl StreamContext { @@ -53,6 +57,9 @@ impl StreamContext { request_id: None, start_time: None, ttft_duration: None, + traceparent: None, + ttft_time: None, + user_message: None, } } fn llm_provider(&self) -> &LlmProvider { @@ -176,9 +183,10 @@ impl HttpContext for StreamContext { ); self.request_id = self.get_http_request_header(REQUEST_ID_HEADER); + self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER); //start the timing for the request using get_current_time() - let current_time = get_current_time().unwrap(); + let current_time: SystemTime = get_current_time().unwrap(); self.start_time = Some(current_time); self.ttft_duration = None; @@ -229,6 +237,13 @@ impl HttpContext for StreamContext { message.model = None; } + self.user_message = deserialized_body + .messages + .iter() + .filter(|m| m.role == "user") + .last() + .cloned(); + // override model name from the llm provider deserialized_body .model @@ -318,6 +333,52 @@ impl HttpContext for StreamContext { .output_sequence_length .record(self.response_tokens as u64); + if let Some(traceparent) = self.traceparent.as_ref() { + let since_the_epoch_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + + let traceparent_tokens = traceparent.split("-").collect::>(); + if traceparent_tokens.len() != 4 { + warn!("traceparent header is invalid: {}", traceparent); + return Action::Continue; + } + let parent_trace_id = traceparent_tokens[1]; + let parent_span_id = traceparent_tokens[2]; + let mut trace_data = 
common::tracing::TraceData::new(); + let mut llm_span = Span::new( + "upstream_llm_time".to_string(), + parent_trace_id.to_string(), + Some(parent_span_id.to_string()), + self.start_time + .unwrap() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(), + since_the_epoch_ns, + ); + if let Some(user_message) = self.user_message.as_ref() { + if let Some(prompt) = user_message.content.as_ref() { + llm_span.add_attribute("user_prompt".to_string(), prompt.to_string()); + } + } + llm_span.add_attribute("model".to_string(), self.llm_provider().name.to_string()); + llm_span.add_event(Event::new( + "time_to_first_token".to_string(), + self.ttft_time + .unwrap() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(), + )); + trace_data.add_span(llm_span); + + let trace_data_str = serde_json::to_string(&trace_data).unwrap(); + debug!("upstream_llm trace details: {}", trace_data_str); + // send trace_data to http tracing endpoint + } + return Action::Continue; } @@ -413,6 +474,7 @@ impl HttpContext for StreamContext { if self.ttft_duration.is_none() { if let Some(start_time) = self.start_time { let current_time = get_current_time().unwrap(); + self.ttft_time = Some(current_time); match current_time.duration_since(start_time) { Ok(duration) => { let duration_ms = duration.as_millis(); diff --git a/crates/llm_gateway/tests/integration.rs b/crates/llm_gateway/tests/integration.rs index 7107b4d2..a40389aa 100644 --- a/crates/llm_gateway/tests/integration.rs +++ b/crates/llm_gateway/tests/integration.rs @@ -51,6 +51,8 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) { .expect_log(Some(LogLevel::Debug), None) .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id")) .returning(None) + .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent")) + .returning(None) .expect_get_current_time_nanos() .returning(Some(0)) .execute_and_expect(ReturnType::Action(Action::Continue)) diff --git 
a/crates/prompt_gateway/src/http_context.rs b/crates/prompt_gateway/src/http_context.rs index 3174a597..aef5b491 100644 --- a/crates/prompt_gateway/src/http_context.rs +++ b/crates/prompt_gateway/src/http_context.rs @@ -18,7 +18,6 @@ use common::{ errors::ServerError, http::{CallArgs, Client}, pii::obfuscate_auth_header, - tracing::{Event, Span}, }; use http::StatusCode; use log::{debug, trace, warn}; @@ -265,42 +264,6 @@ impl HttpContext for StreamContext { } if end_of_stream && body_size == 0 { - if let Some(traceparent) = self.traceparent.as_ref() { - let since_the_epoch_ns = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - - let traceparent_tokens = traceparent.split("-").collect::>(); - if traceparent_tokens.len() != 4 { - warn!("traceparent header is invalid: {}", traceparent); - return Action::Continue; - } - let parent_trace_id = traceparent_tokens[1]; - let parent_span_id = traceparent_tokens[2]; - let mut trace_data = common::tracing::TraceData::new(); - let mut llm_span = Span::new( - "upstream_llm_time".to_string(), - parent_trace_id.to_string(), - Some(parent_span_id.to_string()), - self.start_upstream_llm_request_time, - since_the_epoch_ns, - ); - if let Some(prompt) = self.user_prompt.as_ref() { - if let Some(content) = prompt.content.as_ref() { - llm_span.add_attribute("user_prompt".to_string(), content.to_string()); - } - } - llm_span.add_event(Event::new( - "time_to_first_token".to_string(), - self.time_to_first_token.unwrap(), - )); - trace_data.add_span(llm_span); - - let trace_data_str = serde_json::to_string(&trace_data).unwrap(); - debug!("upstream_llm trace details: {}", trace_data_str); - // send trace_data to http tracing endpoint - } return Action::Continue; } diff --git a/demos/llm_routing/README.md b/demos/llm_routing/README.md new file mode 100644 index 00000000..f5a49971 --- /dev/null +++ b/demos/llm_routing/README.md @@ -0,0 +1,19 @@ +# LLM Routing +This demo shows how you can use arch gateway to manage keys 
and route to appropriate LLM. + +# Starting the demo +1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly +1. Start Arch + ```sh + sh run_demo.sh + ``` +1. Navigate to http://localhost:18080/ + +# Observability +Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visualize the stats in dashboard. To see grafana dashboard follow the instructions below: + +1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials) +1. From grafana left nav click on dashboards and select "Intelligent Gateway Overview" to view arch gateway stats + +# Selecting different LLM +You can pick different LLM based on header `x-arch-llm-provider-hint` to override default LLM. diff --git a/demos/llm_routing/docker-compose.yaml b/demos/llm_routing/docker-compose.yaml index 1ce6963b..ac59499c 100644 --- a/demos/llm_routing/docker-compose.yaml +++ b/demos/llm_routing/docker-compose.yaml @@ -2,7 +2,7 @@ services: chatbot_ui: build: - context: ../../chatbot_ui + context: ../shared/chatbot_ui dockerfile: Dockerfile ports: - "18080:8080" @@ -12,3 +12,21 @@ services: - "host.docker.internal:host-gateway" volumes: - ./arch_config.yaml:/app/arch_config.yaml + + jaeger: + build: + context: ../shared/jaeger + ports: + - "16686:16686" + - "4317:4317" + - "4318:4318" + + prometheus: + build: + context: ../shared/prometheus + + grafana: + build: + context: ../shared/grafana + ports: + - "3000:3000" diff --git a/demos/shared/chatbot_ui/.vscode/launch.json b/demos/shared/chatbot_ui/.vscode/launch.json index cc443eee..e7f91d36 100644 --- a/demos/shared/chatbot_ui/.vscode/launch.json +++ b/demos/shared/chatbot_ui/.vscode/launch.json @@ -15,19 +15,21 @@ "LLM": "1", "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1", "STREAMING": "True", - "ARCH_CONFIG": 
"../demos/weather_forecast/arch_config.yaml" + "ARCH_CONFIG": "../../weather_forecast/arch_config.yaml" } }, { + "python": "${workspaceFolder}/venv/bin/python", "name": "chatbot-ui llm", - "cwd": "${workspaceFolder}/app", "type": "debugpy", "request": "launch", - "program": "run.py", + "program": "run_stream.py", "console": "integratedTerminal", "env": { "LLM": "1", - "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1" + "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1", + "STREAMING": "True", + "ARCH_CONFIG": "../../llm_routing/arch_config.yaml" } }, ] diff --git a/demos/shared/chatbot_ui/common.py b/demos/shared/chatbot_ui/common.py index 27838397..cfcc6556 100644 --- a/demos/shared/chatbot_ui/common.py +++ b/demos/shared/chatbot_ui/common.py @@ -1,3 +1,4 @@ +from datetime import datetime import json import logging import os @@ -159,13 +160,44 @@ def get_prompt_targets(): config = yaml.safe_load(file) available_tools = [] - for target in config["prompt_targets"]: - if not target.get("default", False): - available_tools.append( - convert_prompt_target_to_openai_format(target) - ) + if "prompt_targets" in config: + for target in config["prompt_targets"]: + if not target.get("default", False): + available_tools.append( + convert_prompt_target_to_openai_format(target) + ) + + return {tool["name"]: tool["info"] for tool in available_tools} + elif "llm_providers" in config: + return config["llm_providers"] - return {tool["name"]: tool["info"] for tool in available_tools} except Exception as e: log.info(e) return None + + +def get_llm_models(): + try: + with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file: + config = yaml.safe_load(file) + + available_models = [""] + default_llm = None + for llm_providers in config["llm_providers"]: + if llm_providers.get("default", False): + default_llm = llm_providers["name"] + else: + available_models.append(llm_providers["name"]) + + # place default model at the beginning of the list + if default_llm: + 
available_models.insert(0, default_llm) + return available_models + except Exception as e: + log.info(e) + return [] + + +def format_log(message): + time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3] + return f"{time_now} - {message}" diff --git a/demos/shared/chatbot_ui/run_stream.py b/demos/shared/chatbot_ui/run_stream.py index bd4eab56..407d4c05 100644 --- a/demos/shared/chatbot_ui/run_stream.py +++ b/demos/shared/chatbot_ui/run_stream.py @@ -8,7 +8,7 @@ from typing import List, Optional, Tuple from openai import OpenAI from dotenv import load_dotenv -from common import get_prompt_targets, process_stream_chunk +from common import format_log, get_llm_models, get_prompt_targets, process_stream_chunk load_dotenv() @@ -36,20 +36,28 @@ CSS_STYLE = """ footer {visibility: hidden} """ -client = OpenAI( - api_key="--", - base_url=CHAT_COMPLETION_ENDPOINT, -) - def chat( query: Optional[str], conversation: Optional[List[Tuple[str, str]]], history: List[dict], + debug_output: str, + model_selector: str, ): history.append({"role": "user", "content": query}) + if debug_output is None: + debug_output = "" + try: + headers = {} + if model_selector and model_selector != "": + headers["x-arch-llm-provider-hint"] = model_selector + client = OpenAI( + api_key="--", + base_url=CHAT_COMPLETION_ENDPOINT, + default_headers=headers, + ) response = client.chat.completions.create( # we select model from arch_config file model="--", @@ -65,15 +73,20 @@ def chat( conversation.append((query, "")) + model_is_set = False for chunk in response: tokens = process_stream_chunk(chunk, history) + if tokens and not model_is_set: + model_is_set = True + model = history[-1]["model"] + debug_output = debug_output + "\n" + format_log(f"model: {model}") if tokens: conversation[-1] = ( conversation[-1][0], conversation[-1][1] + tokens, ) - yield "", conversation, history + yield "", conversation, history, debug_output, model_selector def main(): @@ -94,8 +107,17 @@ def main(): 
value=get_prompt_targets(), show_indices=False, elem_classes="json-container", - min_height="95vh", + min_height="50vh", ) + model_selector_textbox = gr.Dropdown( + get_llm_models(), + label="override model", + elem_classes="dropdown", + ) + debug_output = gr.TextArea( + label="debug output", + elem_classes="debug_output", + ) with gr.Column(scale=2): chatbot = gr.Chatbot( @@ -110,7 +132,9 @@ def main(): ) textbox.submit( - chat, [textbox, chatbot, history], [textbox, chatbot, history] + chat, + [textbox, chatbot, history, debug_output, model_selector_textbox], + [textbox, chatbot, history, debug_output, model_selector_textbox], ) demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True) diff --git a/demos/shared/grafana/dashboards/envoy_overview.json b/demos/shared/grafana/dashboards/envoy_overview.json index 4089dade..e710e748 100644 --- a/demos/shared/grafana/dashboards/envoy_overview.json +++ b/demos/shared/grafana/dashboards/envoy_overview.json @@ -190,8 +190,8 @@ "targets": [ { "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(input_sequence_length_bucket[1h])))", + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by(le) (rate(input_sequence_length_bucket[5m])))", "fullMetaSearch": false, "includeNullMetadata": false, "legendFormat": "__auto", @@ -200,7 +200,7 @@ "useBackend": false } ], - "title": "input sequence length (p50)", + "title": "input sequence length (p90)", "type": "timeseries" }, { @@ -305,7 +305,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(output_sequence_length_bucket[1h])) by(le))", + "expr": "histogram_quantile(0.9, sum(rate(output_sequence_length_bucket[5m])) by(le))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -315,7 +315,7 @@ "useBackend": false } ], - "title": "output sequence length (p50)", + "title": "output sequence length (p90)", "type": "timeseries" }, { @@ 
-415,7 +415,7 @@ { "disableTextWrap": false, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum by(le) (rate(time_to_first_token_bucket[1h])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(time_to_first_token_bucket[5m])))", "fullMetaSearch": false, "includeNullMetadata": false, "legendFormat": "__auto", @@ -424,7 +424,7 @@ "useBackend": false } ], - "title": "time to first token (p50)", + "title": "time to first token (p90)", "type": "timeseries" }, { @@ -539,20 +539,17 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))", - "fullMetaSearch": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum(rate(request_latency_bucket[60m])) by (le))", "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "__auto", "range": true, - "refId": "A", - "useBackend": false + "refId": "B" } ], - "title": "request latency (p50)", + "title": "request latency (p90)", "type": "timeseries" }, { diff --git a/demos/shared/trace_streamer/Dockerfile b/demos/shared/trace_streamer/Dockerfile deleted file mode 100644 index 189c650a..00000000 --- a/demos/shared/trace_streamer/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM python:3.12-slim as arch - -WORKDIR /app - -RUN pip install requests -COPY stream_traces.py . 
- -RUN mkdir -p /var/log -RUN touch /var/log/envoy.log - -CMD ["python", "stream_traces.py"] diff --git a/demos/weather_forecast/docker-compose.yaml b/demos/weather_forecast/docker-compose.yaml index 7d074ae4..fdaa7fcd 100644 --- a/demos/weather_forecast/docker-compose.yaml +++ b/demos/weather_forecast/docker-compose.yaml @@ -30,14 +30,6 @@ services: - "4317:4317" - "4318:4318" - trace_streamer: - build: - context: ../shared/trace_streamer - environment: - - OTEL_TRACING_HTTP_ENDPOINT=http://jaeger:4318/v1/traces - volumes: - - ~/archgw_logs:/var/log/ - prometheus: build: context: ../shared/prometheus diff --git a/demos/weather_forecast_signoz/docker-compose.yaml b/demos/weather_forecast_signoz/docker-compose.yaml index 56d7c9da..1c23f464 100644 --- a/demos/weather_forecast_signoz/docker-compose.yaml +++ b/demos/weather_forecast_signoz/docker-compose.yaml @@ -25,14 +25,6 @@ services: volumes: - ./arch_config.yaml:/app/arch_config.yaml - trace_streamer: - build: - context: ../shared/trace_streamer - environment: - - OTEL_TRACING_HTTP_ENDPOINT=http://otel-collector:4318/v1/traces - volumes: - - ~/archgw_logs:/var/log/ - prometheus: build: context: ../shared/prometheus diff --git a/e2e_tests/run_e2e_tests.sh b/e2e_tests/run_e2e_tests.sh index 16ef22db..d69d4af5 100644 --- a/e2e_tests/run_e2e_tests.sh +++ b/e2e_tests/run_e2e_tests.sh @@ -29,32 +29,24 @@ cd ../demos/weather_forecast docker compose up weather_forecast_service --build -d cd - -print_disk_usage - log building and install model server log ================================= cd ../model_server poetry install cd - -print_disk_usage - log building and installing archgw cli log ================================== cd ../arch/tools sh build_cli.sh cd - -print_disk_usage - log building docker image for arch gateway log ====================================== cd ../ archgw build cd - -print_disk_usage - log startup arch gateway with function calling demo cd .. 
tail -F ~/archgw_logs/modelserver.log & @@ -64,8 +56,6 @@ archgw up demos/weather_forecast/arch_config.yaml kill $model_server_tail_pid cd - -print_disk_usage - log running e2e tests log ================= poetry install