Introduce brightstaff, a new terminal service for LLM routing (#477)

2026-07-23 16:51:04 +02:00 · 2025-05-19 09:59:22 -07:00 · 2025-05-19 09:59:22 -07:00 · 27c0f2fdce
commit 27c0f2fdce
parent 1f95fac4af
36 changed files with 2817 additions and 150 deletions
--- a/.github/workflows/e2e_archgw.yml
+++ b/.github/workflows/e2e_archgw.yml
@ -24,7 +24,7 @@ jobs:

      - name: build arch docker image
        run: |
-          cd ../../ && docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.2.8
+          cd ../../ && docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.2.8 -t katanemo/archgw:latest

      - name: start archgw
        env:
--- a/.github/workflows/rust_tests.yml
+++ b/.github/workflows/rust_tests.yml
@ -24,7 +24,8 @@ jobs:
        run: rustup target add wasm32-wasip1

      - name: Build wasm module
-        run: cargo build --release --target=wasm32-wasip1
+        run: |
+         cargo build --release --target=wasm32-wasip1 -p llm_gateway -p prompt_gateway

      - name: Run unit tests
        run: cargo test --lib
--- a/arch/Dockerfile
+++ b/arch/Dockerfile
@ -1,11 +1,11 @@
-# build filter using rust toolchain
+# build docker image for arch gateway
 FROM rust:1.82.0 as builder
 RUN rustup -v target add wasm32-wasip1
 WORKDIR /arch
 COPY crates .

-RUN cd prompt_gateway && cargo build --release --target wasm32-wasip1
-RUN cd llm_gateway && cargo build --release --target wasm32-wasip1
+RUN cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway
+RUN cargo build --release -p brightstaff

 # copy built filter into envoy image
 FROM docker.io/envoyproxy/envoy:v1.32-latest as envoy
@ -13,20 +13,27 @@ FROM docker.io/envoyproxy/envoy:v1.32-latest as envoy
 #Build config generator, so that we have a single build image for both Rust and Python
 FROM python:3.12-slim as arch

-RUN apt-get update && apt-get install -y gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y supervisor gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/*

 COPY --from=builder /arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
 COPY --from=builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
+COPY --from=builder /arch/target/release/brightstaff /app/brightstaff
 COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
+
 WORKDIR /app
 COPY arch/requirements.txt .
 RUN pip install -r requirements.txt
 COPY arch/tools/cli/config_generator.py .
 COPY arch/envoy.template.yaml .
 COPY arch/arch_config_schema.yaml .
+COPY arch/supervisord.conf /etc/supervisor/conf.d/supervisord.conf

 RUN pip install requests
 RUN touch /var/log/envoy.log
+RUN mkdir -p /var/log/supervisor/
+RUN touch /var/log/supervisor/supervisord.log
+
+# Exec form without a shell wrapper: supervisord itself runs as PID 1 and
+# receives the stop signal from `docker stop` directly.
+ENTRYPOINT ["/usr/bin/supervisord"]

 # ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --log-level trace 2>&1 | tee /var/log/envoy.log"]
-ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log/envoy.log"]
+# ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log/envoy.log"]
--- a/arch/arch_config_schema.yaml
+++ b/arch/arch_config_schema.yaml
@ -90,6 +90,8 @@ properties:
            - https
        http_host:
          type: string
+        usage:
+          type: string
      additionalProperties: false
      required:
        - name
@ -225,6 +227,12 @@ properties:
    enum:
      - llm
      - prompt
+  routing:
+    type: object
+    properties:
+      model:
+        type: string
+    # Sibling of `properties` (not nested inside it) so that unknown keys
+    # under `routing` are rejected instead of declaring a property that is
+    # literally named "additionalProperties".
+    additionalProperties: false
  prompt_guards:
    type: object
    properties:
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -328,11 +328,15 @@ static_resources:
                      domains:
                        - "*"
                      routes:
+                        - match:
+                            prefix: "/healthz"
+                          direct_response:
+                            status: 200
                        - match:
                            prefix: "/"
                          route:
                            auto_host_rewrite: true
-                            cluster: arch_listener_llm
+                            cluster: bright_staff
                            timeout: {{ llm_gateway_listener.timeout }}
                http_filters:
                  - name: envoy.filters.http.router
@ -380,12 +384,6 @@ static_resources:
                      domains:
                        - "*"
                      routes:
-                        - match:
-                            prefix: "/healthz"
-                          route:
-                            auto_host_rewrite: true
-                            cluster: openai
-                            timeout: 60s
                      {% for provider in arch_llm_providers %}
                        # if endpoint is set then use custom cluster for upstream llm
                        {% if provider.endpoint %}
@ -615,6 +613,38 @@ static_resources:
                      port_value: 11000
                  hostname: arch_internal

+    # Upstream cluster for the brightstaff routing service; the port matches
+    # brightstaff's default bind address (0.0.0.0:9091).
+    - name: bright_staff
+      connect_timeout: 0.5s
+      type: LOGICAL_DNS
+      dns_lookup_family: V4_ONLY
+      lb_policy: ROUND_ROBIN
+      load_assignment:
+        cluster_name: bright_staff
+        endpoints:
+          - lb_endpoints:
+              - endpoint:
+                  address:
+                    socket_address:
+                      address: 0.0.0.0
+                      port_value: 9091
+                  hostname: localhost
+
+    # Upstream cluster named after the router model host (presumably the
+    # server hosting the routing model - confirm).
+    # NOTE(review): the address below is a hard-coded IP; confirm whether it
+    # should be supplied through configuration instead.
+    - name: router_model_host
+      connect_timeout: 0.5s
+      type: LOGICAL_DNS
+      dns_lookup_family: V4_ONLY
+      lb_policy: ROUND_ROBIN
+      load_assignment:
+        cluster_name: router_model_host
+        endpoints:
+          - lb_endpoints:
+              - endpoint:
+                  address:
+                    socket_address:
+                      address: 34.30.16.38
+                      port_value: 8000
+                  hostname: router_model_host
+
    - name: arch_prompt_gateway_listener
      connect_timeout: 0.5s
      type: LOGICAL_DNS
--- a/arch/supervisord.conf
+++ b/arch/supervisord.conf
@ -0,0 +1,16 @@
+[supervisord]
+; Stay in the foreground instead of daemonizing.
+nodaemon=true
+
+[program:brightstaff]
+; Output is mirrored to /var/log/brightstaff.log via tee and forwarded to
+; supervisord's stdout.
+command=sh -c "/app/brightstaff 2>&1 | tee /var/log/brightstaff.log"
+stdout_logfile=/dev/stdout
+redirect_stderr=true
+stdout_logfile_maxbytes=0
+stderr_logfile_maxbytes=0
+; The command is a shell pipeline, so deliver stop/kill signals to the whole
+; process group rather than only to the wrapping shell.
+stopasgroup=true
+killasgroup=true
+
+[program:envoy]
+; Generate the envoy config, substitute environment variables into it, then
+; run envoy with its output mirrored to /var/log/envoy.log.
+command=/bin/sh -c "python /app/config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml 2>&1 | tee /var/log/envoy.log"
+stdout_logfile=/dev/stdout
+redirect_stderr=true
+stdout_logfile_maxbytes=0
+stderr_logfile_maxbytes=0
+; Same reasoning as above: signal the whole pipeline, not just the shell.
+stopasgroup=true
+killasgroup=true
--- a/arch/tools/cli/core.py
+++ b/arch/tools/cli/core.py
@ -6,6 +6,7 @@ import sys
 import yaml
 from cli.utils import getLogger
 from cli.consts import (
+    ARCHGW_DOCKER_IMAGE,
    ARCHGW_DOCKER_NAME,
    KATANEMO_LOCAL_MODEL_LIST,
 )
@ -55,7 +56,9 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
        path (str): The path where the prompt_config.yml file is located.
        log_timeout (int): Time in seconds to show logs before checking for healthy state.
    """
-    log.info("Starting arch gateway")
+    log.info(
+        f"Starting arch gateway, image name: {ARCHGW_DOCKER_NAME}, tag: {ARCHGW_DOCKER_IMAGE}"
+    )

    try:
        archgw_container_status = docker_container_status(ARCHGW_DOCKER_NAME)
@ -92,10 +95,15 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
            current_time = time.time()
            elapsed_time = current_time - start_time

+            if archgw_status == "exited":
+                log.info("archgw container exited unexpectedly.")
+                stream_gateway_logs(follow=False)
+                sys.exit(1)
+
            # Check if timeout is reached
            if elapsed_time > log_timeout:
                log.info(f"stopping log monitoring after {log_timeout} seconds.")
-                break
+                sys.exit(1)

            if prompt_gateway_health_check_status or llm_gateway_health_check_status:
                log.info("archgw is running and is healthy!")
@ -109,27 +117,27 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):

    except KeyboardInterrupt:
        log.info("Keyboard interrupt received, stopping arch gateway service.")
-        stop_arch()
+        stop_docker_container()


-def stop_arch():
+def stop_docker_container(service=ARCHGW_DOCKER_NAME):
    """
    Shutdown all Docker Compose services by running `docker-compose down`.

    Args:
        path (str): The path where the docker-compose.yml file is located.
    """
-    log.info("Shutting down arch gateway service.")
+    log.info(f"Shutting down {service} service.")

    try:
        subprocess.run(
-            ["docker", "stop", ARCHGW_DOCKER_NAME],
+            ["docker", "stop", service],
        )
        subprocess.run(
-            ["docker", "rm", ARCHGW_DOCKER_NAME],
+            ["docker", "rm", service],
        )

-        log.info("Successfully shut down arch gateway service.")
+        log.info(f"Successfully shut down {service} service.")

    except subprocess.CalledProcessError as e:
        log.info(f"Failed to shut down services: {str(e)}")
--- a/arch/tools/cli/docker_cli.py
+++ b/arch/tools/cli/docker_cli.py
@ -3,7 +3,10 @@ import json
 import sys
 import requests

-from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
+from cli.consts import (
+    ARCHGW_DOCKER_IMAGE,
+    ARCHGW_DOCKER_NAME,
+)
 from cli.utils import getLogger

 log = getLogger(__name__)
@ -54,7 +57,6 @@ def docker_start_archgw_detached(
    port_mappings_args = [item for port in port_mappings for item in ("-p", port)]

    volume_mappings = [
-        f"{logs_path_abs}:/var/log:rw",
        f"{arch_config_file}:/app/arch_config.yaml:ro",
        # "/Users/adilhafeez/src/intelligent-prompt-gateway/crates/target/wasm32-wasip1/release:/etc/envoy/proxy-wasm-plugins:ro",
    ]
@ -90,7 +92,7 @@ def health_check_endpoint(endpoint: str) -> bool:
    return False


-def stream_gateway_logs(follow):
+def stream_gateway_logs(follow, service="archgw"):
    """
    Stream logs from the arch gateway service.
    """
@ -99,7 +101,7 @@ def stream_gateway_logs(follow):
    options = ["docker", "logs"]
    if follow:
        options.append("-f")
-    options.append(ARCHGW_DOCKER_NAME)
+    options.append(service)
    try:
        # Run `docker-compose logs` to stream logs from the gateway service
        subprocess.run(
--- a/arch/tools/cli/main.py
+++ b/arch/tools/cli/main.py
@ -16,7 +16,7 @@ from cli.core import (
    start_arch_modelserver,
    stop_arch_modelserver,
    start_arch,
-    stop_arch,
+    stop_docker_container,
    download_models_from_hf,
 )
 from cli.consts import (
@ -51,6 +51,18 @@ def get_version():
        return "version not found"


+def verify_service_name(service):
+    """Verify if the service name is valid."""
+    if service not in [
+        SERVICE_NAME_ARCHGW,
+        SERVICE_NAME_MODEL_SERVER,
+        SERVICE_ALL,
+    ]:
+        print(f"Error: Invalid service {service}. Exiting")
+        sys.exit(1)
+    return True
+
+
@click.group(invoke_without_command=True)
@click.option("--version", is_flag=True, help="Show the archgw cli version and exit.")
@click.pass_context
@ -75,9 +87,8 @@ def main(ctx, version):
 )
 def build(service):
    """Build Arch from source. Must be in root of cloned repo."""
-    if service not in [SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, SERVICE_ALL]:
-        print(f"Error: Invalid service {service}. Exiting")
-        sys.exit(1)
+    verify_service_name(service)
+
    # Check if /arch/Dockerfile exists
    if service == SERVICE_NAME_ARCHGW or service == SERVICE_ALL:
        if os.path.exists(ARCHGW_DOCKERFILE):
@ -146,9 +157,7 @@ def build(service):
 )
 def up(file, path, service, foreground):
    """Starts Arch."""
-    if service not in [SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, SERVICE_ALL]:
-        log.info(f"Error: Invalid service {service}. Exiting")
-        sys.exit(1)
+    verify_service_name(service)

    if service == SERVICE_ALL and foreground:
        # foreground can only be specified when starting individual services
@ -156,7 +165,7 @@ def up(file, path, service, foreground):
        sys.exit(1)

    if service == SERVICE_NAME_MODEL_SERVER:
-        log.info("Download archgw models from HuggingFace...")
+        log.info("Download models from HuggingFace...")
        download_models_from_hf()
        start_arch_modelserver(foreground)
        return
@ -186,8 +195,6 @@ def up(file, path, service, foreground):
        log.info(f"Validation stderr: {validation_stderr}")
        sys.exit(1)

-    log.info("Starting arch model server and arch gateway")
-
    # Set the ARCH_CONFIG_FILE environment variable
    env_stage = {
        "OTEL_TRACING_HTTP_ENDPOINT": "http://host.docker.internal:4318/v1/traces",
@ -210,7 +217,6 @@ def up(file, path, service, foreground):
        else:
            app_env_file = os.path.abspath(os.path.join(path, ".env"))

-        print(f"app_env_file: {app_env_file}")
        if not os.path.exists(
            app_env_file
        ):  # check to see if the environment variables in the current environment or not
@ -248,17 +254,15 @@ def up(file, path, service, foreground):
 def down(service):
    """Stops Arch."""

-    if service not in [SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, SERVICE_ALL]:
-        log.info(f"Error: Invalid service {service}. Exiting")
-        sys.exit(1)
+    verify_service_name(service)

    if service == SERVICE_NAME_MODEL_SERVER:
        stop_arch_modelserver()
    elif service == SERVICE_NAME_ARCHGW:
-        stop_arch()
+        stop_docker_container()
    else:
        stop_arch_modelserver()
-        stop_arch()
+        stop_docker_container(SERVICE_NAME_ARCHGW)


@click.command()
--- a/crates/Cargo.lock
+++ b/crates/Cargo.lock
--- a/crates/Cargo.toml
+++ b/crates/Cargo.toml
@ -1,3 +1,3 @@
 [workspace]
 resolver = "2"
-members = ["llm_gateway", "prompt_gateway", "common"]
+members = ["llm_gateway", "prompt_gateway", "common", "brightstaff"]
--- a/crates/brightstaff/Cargo.toml
+++ b/crates/brightstaff/Cargo.toml
@ -0,0 +1,32 @@
+[package]
+name = "brightstaff"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+bytes = "1.10.1"
+common = { version = "0.1.0", path = "../common" }
+eventsource-client = "0.15.0"
+eventsource-stream = "0.2.3"
+futures = "0.3.31"
+futures-util = "0.3.31"
+http-body = "1.0.1"
+http-body-util = "0.1.3"
+hyper = { version = "1.6.0", features = ["full"] }
+hyper-util = "0.1.11"
+opentelemetry = "0.29.1"
+opentelemetry-http = "0.29.0"
+opentelemetry-otlp = {version="0.29.0", features=["trace", "tonic", "grpc-tonic"]}
+opentelemetry-stdout = "0.29.0"
+opentelemetry_sdk = "0.29.0"
+pretty_assertions = "1.4.1"
+reqwest = { version = "0.12.15", features = ["stream"] }
+serde = { version = "1.0.219", features = ["derive"] }
+serde_json = "1.0.140"
+serde_yaml = "0.9.34"
+thiserror = "2.0.12"
+tokio = { version = "1.44.2", features = ["full"] }
+tokio-stream = "0.1.17"
+tracing = "0.1.41"
+tracing-opentelemetry = "0.30.0"
+tracing-subscriber = { version = "0.3.19", features = ["env-filter", "fmt"] }
--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@ -0,0 +1,168 @@
+use std::sync::Arc;
+
+use bytes::Bytes;
+use common::api::open_ai::ChatCompletionsRequest;
+use common::consts::ARCH_PROVIDER_HINT_HEADER;
+use common::utils::shorten_string;
+use http_body_util::combinators::BoxBody;
+use http_body_util::{BodyExt, Full, StreamBody};
+use hyper::body::Frame;
+use hyper::header::{self};
+use hyper::{Request, Response, StatusCode};
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+use tokio_stream::StreamExt;
+use tracing::{info, warn};
+
+use crate::router::llm_router::RouterService;
+
+/// Wraps a complete, in-memory payload in the boxed body type returned by the
+/// handler in this module.
+fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
+    let payload: Bytes = chunk.into();
+    // The empty match shows the error type has no values, so this mapping
+    // only adapts the error type and can never actually run.
+    let body = Full::new(payload).map_err(|impossible| match impossible {});
+    body.boxed()
+}
+
+/// Builds a plain-text response carrying `message` with the given `status`.
+fn error_response(
+    status: StatusCode,
+    message: String,
+) -> Response<BoxBody<Bytes, hyper::Error>> {
+    let mut response = Response::new(full(message));
+    *response.status_mut() = status;
+    response
+}
+
+/// Handles a chat-completions request: asks the router service which LLM
+/// should serve it, forwards the original request body to
+/// `llm_provider_endpoint` (adding the provider-hint header when a route was
+/// selected) and relays the upstream response back to the caller.
+///
+/// The upstream status code and headers are mirrored onto the returned
+/// response. When the request has `stream` set, the upstream body is relayed
+/// chunk by chunk; otherwise it is read fully before responding.
+///
+/// Failures are reported to the caller as HTTP responses rather than as `Err`:
+/// * `400 Bad Request` when the request body is not a valid chat-completions
+///   request;
+/// * `500 Internal Server Error` when routing fails, the upstream request
+///   fails, the upstream body cannot be read, or the response cannot be built.
+///
+/// `Err` is returned only when reading the incoming request body fails.
+pub async fn chat_completions(
+    request: Request<hyper::body::Incoming>,
+    router_service: Arc<RouterService>,
+    llm_provider_endpoint: String,
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
+    let mut request_headers = request.headers().clone();
+
+    let chat_request_bytes = request.collect().await?.to_bytes();
+    let chat_completion_request: ChatCompletionsRequest =
+        match serde_json::from_slice(&chat_request_bytes) {
+            Ok(request) => request,
+            Err(err) => {
+                return Ok(error_response(
+                    StatusCode::BAD_REQUEST,
+                    format!("Failed to parse request body: {}", err),
+                ));
+            }
+        };
+
+    info!(
+        "request body received: {}",
+        shorten_string(&serde_json::to_string(&chat_completion_request).unwrap())
+    );
+
+    let trace_parent = request_headers
+        .iter()
+        .find(|(ty, _)| ty.as_str() == "traceparent")
+        .map(|(_, value)| value.to_str().unwrap_or_default().to_string());
+
+    let selected_llm = match router_service
+        .determine_route(&chat_completion_request.messages, trace_parent.clone())
+        .await
+    {
+        Ok(route) => route,
+        Err(err) => {
+            return Ok(error_response(
+                StatusCode::INTERNAL_SERVER_ERROR,
+                format!("Failed to determine route: {}", err),
+            ));
+        }
+    };
+
+    info!(
+        "sending request to llm provider: {} with llm model: {:?}",
+        llm_provider_endpoint, selected_llm
+    );
+
+    if let Some(trace_parent) = trace_parent {
+        // The value was obtained through `HeaderValue::to_str`, so converting
+        // it back into a header value cannot fail.
+        request_headers.insert(
+            header::HeaderName::from_static("traceparent"),
+            header::HeaderValue::from_str(&trace_parent).unwrap(),
+        );
+    }
+
+    if let Some(selected_llm) = selected_llm {
+        // The route name comes from the router's reply; never panic on it.
+        // An unusable name is treated like "no route selected".
+        match header::HeaderValue::from_str(&selected_llm) {
+            Ok(provider_hint) => {
+                request_headers.insert(ARCH_PROVIDER_HINT_HEADER, provider_hint);
+            }
+            Err(err) => {
+                warn!(
+                    "ignoring selected route {:?}: not a valid header value: {}",
+                    selected_llm, err
+                );
+            }
+        }
+    }
+
+    let llm_response = match reqwest::Client::new()
+        .post(llm_provider_endpoint)
+        .headers(request_headers)
+        .body(chat_request_bytes)
+        .send()
+        .await
+    {
+        Ok(res) => res,
+        Err(err) => {
+            return Ok(error_response(
+                StatusCode::INTERNAL_SERVER_ERROR,
+                format!("Failed to send request: {}", err),
+            ));
+        }
+    };
+
+    // Mirror the upstream status and headers. `append` (rather than `insert`)
+    // keeps every value of headers that occur more than once.
+    let mut response = Response::builder().status(llm_response.status());
+    if let Some(headers) = response.headers_mut() {
+        for (header_name, header_value) in llm_response.headers() {
+            headers.append(header_name, header_value.clone());
+        }
+    }
+
+    let body = if chat_completion_request.stream {
+        // channel to create async stream
+        let (tx, rx) = mpsc::channel::<Bytes>(16);
+
+        // Spawn a task to send data as it becomes available
+        tokio::spawn(async move {
+            let mut byte_stream = llm_response.bytes_stream();
+
+            while let Some(item) = byte_stream.next().await {
+                let item = match item {
+                    Ok(item) => item,
+                    Err(err) => {
+                        warn!("Error receiving chunk: {:?}", err);
+                        break;
+                    }
+                };
+
+                if tx.send(item).await.is_err() {
+                    warn!("Receiver dropped");
+                    break;
+                }
+            }
+        });
+
+        let stream = ReceiverStream::new(rx).map(|chunk| Ok::<_, hyper::Error>(Frame::data(chunk)));
+
+        BoxBody::new(StreamBody::new(stream))
+    } else {
+        match llm_response.text().await {
+            Ok(text) => full(text),
+            Err(err) => {
+                return Ok(error_response(
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    format!("Failed to read response: {}", err),
+                ));
+            }
+        }
+    };
+
+    match response.body(body) {
+        Ok(response) => Ok(response),
+        Err(err) => Ok(error_response(
+            StatusCode::INTERNAL_SERVER_ERROR,
+            format!("Failed to create response: {}", err),
+        )),
+    }
+}
--- a/crates/brightstaff/src/handlers/mod.rs
+++ b/crates/brightstaff/src/handlers/mod.rs
@ -0,0 +1 @@
+pub mod chat_completions;
--- a/crates/brightstaff/src/lib.rs
+++ b/crates/brightstaff/src/lib.rs
@ -0,0 +1,2 @@
+pub mod handlers;
+pub mod router;
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -0,0 +1,157 @@
+use brightstaff::handlers::chat_completions::chat_completions;
+use brightstaff::router::llm_router::RouterService;
+use bytes::Bytes;
+use common::configuration::Configuration;
+use common::utils::shorten_string;
+use http_body_util::{combinators::BoxBody, BodyExt, Empty};
+use hyper::body::Incoming;
+use hyper::server::conn::http1;
+use hyper::service::service_fn;
+use hyper::{Method, Request, Response, StatusCode};
+use hyper_util::rt::TokioIo;
+use opentelemetry::global::BoxedTracer;
+use opentelemetry::trace::FutureExt;
+use opentelemetry::{
+    global,
+    trace::{SpanKind, Tracer},
+    Context,
+};
+use opentelemetry_http::HeaderExtractor;
+use opentelemetry_sdk::{propagation::TraceContextPropagator, trace::SdkTracerProvider};
+use opentelemetry_stdout::SpanExporter;
+use std::sync::{Arc, OnceLock};
+use std::{env, fs};
+use tokio::net::TcpListener;
+use tracing::info;
+use tracing_subscriber::EnvFilter;
+
+pub mod router;
+
+const BIND_ADDRESS: &str = "0.0.0.0:9091";
+
+/// Returns the process-wide tracer named "archgw/router", creating it on
+/// first use and handing out the same instance afterwards.
+fn get_tracer() -> &'static BoxedTracer {
+    static SHARED_TRACER: OnceLock<BoxedTracer> = OnceLock::new();
+    let create_tracer = || global::tracer("archgw/router");
+    SHARED_TRACER.get_or_init(create_tracer)
+}
+
+/// Builds a tracing context from the headers of the incoming request, using
+/// the globally registered text-map propagator.
+fn extract_context_from_request(req: &Request<Incoming>) -> Context {
+    let carrier = HeaderExtractor(req.headers());
+    global::get_text_map_propagator(|propagator| propagator.extract(&carrier))
+}
+
+/// Registers the trace-context propagator and a tracer provider that exports
+/// spans through the stdout exporter, installs the provider globally and
+/// returns it to the caller.
+fn init_tracer() -> SdkTracerProvider {
+    global::set_text_map_propagator(TraceContextPropagator::new());
+    // Install stdout exporter pipeline to be able to retrieve the collected spans.
+    // No sampler is configured here, so the SDK's default sampler applies.
+    let provider = SdkTracerProvider::builder()
+        .with_simple_exporter(SpanExporter::default())
+        .build();
+
+    global::set_tracer_provider(provider.clone());
+    provider
+}
+
+/// Produces a boxed body with no content, used for responses that only carry
+/// a status code.
+fn empty() -> BoxBody<Bytes, hyper::Error> {
+    // The empty match shows the error type has no values, so this mapping
+    // only adapts the error type and can never actually run.
+    let no_content = Empty::<Bytes>::new().map_err(|impossible| match impossible {});
+    no_content.boxed()
+}
+
+/// Entry point of the brightstaff service.
+///
+/// Loads the arch configuration, builds the shared `RouterService` and then
+/// serves HTTP/1 connections, answering `POST /v1/chat/completions` through
+/// the `chat_completions` handler and every other request with 404.
+///
+/// Environment variables (with their defaults):
+/// * `BIND_ADDRESS` - listen address (`0.0.0.0:9091`);
+/// * `ARCH_CONFIG_PATH` - configuration file (`./arch_config.yaml`);
+/// * `LLM_PROVIDER_ENDPOINT` - upstream chat-completions URL
+///   (`http://localhost:12001/v1/chat/completions`).
+///
+/// Panics when the configuration file cannot be read or parsed. Returns an
+/// error when binding the listener or accepting a connection fails.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let _tracer_provider = init_tracer();
+    tracing_subscriber::fmt()
+        .with_env_filter(
+            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
+        )
+        .init();
+
+    let bind_address = env::var("BIND_ADDRESS").unwrap_or_else(|_| BIND_ADDRESS.to_string());
+
+    //loading arch_config.yaml file
+    let arch_config_path =
+        env::var("ARCH_CONFIG_PATH").unwrap_or_else(|_| "./arch_config.yaml".to_string());
+    info!("Loading arch_config.yaml from {}", arch_config_path);
+
+    let config_contents =
+        fs::read_to_string(&arch_config_path).expect("Failed to read arch_config.yaml");
+
+    let config: Configuration =
+        serde_yaml::from_str(&config_contents).expect("Failed to parse arch_config.yaml");
+
+    let arch_config = Arc::new(config);
+
+    info!(
+        "arch_config: {:?}",
+        shorten_string(&serde_json::to_string(arch_config.as_ref()).unwrap())
+    );
+
+    let llm_provider_endpoint = env::var("LLM_PROVIDER_ENDPOINT")
+        .unwrap_or_else(|_| "http://localhost:12001/v1/chat/completions".to_string());
+
+    info!("llm provider endpoint: {}", llm_provider_endpoint);
+    info!("Listening on http://{}", bind_address);
+    let listener = TcpListener::bind(bind_address).await?;
+
+    // if routing is null then return gpt-4o as model name
+    let model = arch_config.routing.as_ref().map_or_else(
+        || "gpt-4o".to_string(),
+        |routing| routing.model.clone(),
+    );
+
+    let router_service: Arc<RouterService> = Arc::new(RouterService::new(
+        arch_config.llm_providers.clone(),
+        llm_provider_endpoint.clone(),
+        model,
+    ));
+
+    loop {
+        // `accept` already yields the peer address; using it avoids a second,
+        // fallible `peer_addr()` lookup whose error would otherwise end the
+        // accept loop and shut the whole server down.
+        let (stream, peer_addr) = listener.accept().await?;
+        let io = TokioIo::new(stream);
+
+        let router_service = Arc::clone(&router_service);
+        let llm_provider_endpoint = llm_provider_endpoint.clone();
+
+        let service = service_fn(move |req| {
+            let router_service = Arc::clone(&router_service);
+            let parent_cx = extract_context_from_request(&req);
+            info!("parent_cx: {:?}", parent_cx);
+            let tracer = get_tracer();
+            let _span = tracer
+                .span_builder("request")
+                .with_kind(SpanKind::Server)
+                .start_with_context(tracer, &parent_cx);
+            let llm_provider_endpoint = llm_provider_endpoint.clone();
+
+            async move {
+                match (req.method(), req.uri().path()) {
+                    (&Method::POST, "/v1/chat/completions") => {
+                        chat_completions(req, router_service, llm_provider_endpoint)
+                            .with_context(parent_cx)
+                            .await
+                    }
+                    _ => {
+                        let mut not_found = Response::new(empty());
+                        *not_found.status_mut() = StatusCode::NOT_FOUND;
+                        Ok(not_found)
+                    }
+                }
+            }
+        });
+
+        // Serve each connection on its own task so that a slow client does
+        // not block the accept loop.
+        tokio::task::spawn(async move {
+            info!("Accepted connection from {:?}", peer_addr);
+            if let Err(err) = http1::Builder::new()
+                .serve_connection(io, service)
+                .await
+            {
+                info!("Error serving connection: {:?}", err);
+            }
+        });
+    }
+}
--- a/crates/brightstaff/src/router/llm_router.rs
+++ b/crates/brightstaff/src/router/llm_router.rs
@ -0,0 +1,151 @@
+use std::sync::Arc;
+
+use common::{
+    api::open_ai::{ChatCompletionsResponse, Message},
+    configuration::LlmProvider,
+    consts::ARCH_PROVIDER_HINT_HEADER,
+    utils::shorten_string,
+};
+use hyper::header;
+use thiserror::Error;
+use tracing::{info, warn};
+
+use super::router_model::RouterModel;
+
+/// Service that asks a routing model which LLM provider should handle a
+/// conversation.
+pub struct RouterService {
+    // URL that routing requests are POSTed to.
+    router_url: String,
+    // HTTP client used to send the routing requests.
+    client: reqwest::Client,
+    // Builds the routing request and parses the routing model's reply.
+    router_model: Arc<dyn RouterModel>,
+    // Sent as the provider-hint header value on routing requests.
+    routing_model_name: String,
+    // True when at least one configured provider declares a `usage`; when
+    // false, `determine_route` returns no route without calling the router.
+    llm_usage_defined: bool,
+}
+
+/// Errors that can occur while determining a route.
+#[derive(Debug, Error)]
+pub enum RoutingError {
+    /// Sending the routing request or reading its response body failed.
+    #[error("Failed to send request: {0}")]
+    RequestError(#[from] reqwest::Error),
+
+    /// The routing response body could not be parsed; carries the parse error
+    /// and a string describing the offending body.
+    #[error("Failed to parse JSON: {0}, JSON: {1}")]
+    JsonError(serde_json::Error, String),
+
+    /// The router model failed to interpret the routing response content.
+    #[error("Router model error: {0}")]
+    RouterModelError(#[from] super::router_model::RoutingModelError),
+}
+
+pub type Result<T> = std::result::Result<T, RoutingError>;
+
+impl RouterService {
+    /// Creates a router service.
+    ///
+    /// * `providers` - configured LLM providers; only those that declare a
+    ///   `usage` are described to the routing model.
+    /// * `router_url` - URL that routing requests are POSTed to.
+    /// * `routing_model_name` - name of the routing model, passed to the
+    ///   router model and sent as the provider-hint header on routing requests.
+    pub fn new(
+        providers: Vec<LlmProvider>,
+        router_url: String,
+        routing_model_name: String,
+    ) -> Self {
+        // One YAML list item per provider that declares a usage; only the
+        // name and the usage (as `description`) are included.
+        let usage_entries = providers
+            .iter()
+            .filter_map(|provider| {
+                provider.usage.as_ref().map(|usage| {
+                    format!("- name: {}\n  description: {}", provider.name, usage)
+                })
+            })
+            .collect::<Vec<String>>();
+
+        let llm_usage_defined = !usage_entries.is_empty();
+        let llm_providers_with_usage_yaml = usage_entries.join("\n");
+
+        info!(
+            "llm_providers from config with usage: {}...",
+            shorten_string(&llm_providers_with_usage_yaml.replace("\n", "\\n"))
+        );
+
+        let router_model = Arc::new(super::router_model_v1::RouterModelV1::new(
+            llm_providers_with_usage_yaml,
+            routing_model_name.clone(),
+        ));
+
+        RouterService {
+            router_url,
+            client: reqwest::Client::new(),
+            router_model,
+            routing_model_name,
+            llm_usage_defined,
+        }
+    }
+
+    /// Asks the routing model which provider should handle `messages`.
+    ///
+    /// Returns `Ok(None)` without contacting the router when no provider
+    /// declares a usage, and also when the router's reply carries no message
+    /// content. Otherwise returns whatever the router model parses out of the
+    /// reply.
+    ///
+    /// `trace_parent`, when present, is forwarded as the `traceparent` header.
+    ///
+    /// # Errors
+    /// * `RoutingError::RequestError` - the request could not be sent or its
+    ///   body could not be read.
+    /// * `RoutingError::JsonError` - the reply is not a valid chat-completions
+    ///   response.
+    /// * `RoutingError::RouterModelError` - the router model could not parse
+    ///   the reply content.
+    pub async fn determine_route(
+        &self,
+        messages: &[Message],
+        trace_parent: Option<String>,
+    ) -> Result<Option<String>> {
+        if !self.llm_usage_defined {
+            return Ok(None);
+        }
+
+        let router_request = self.router_model.generate_request(messages);
+        // Serialize once; the same text is logged and sent as the body.
+        let router_request_json = serde_json::to_string(&router_request).unwrap();
+
+        info!("router_request: {}", shorten_string(&router_request_json));
+
+        let mut llm_route_request_headers = header::HeaderMap::new();
+        llm_route_request_headers.insert(
+            header::CONTENT_TYPE,
+            header::HeaderValue::from_static("application/json"),
+        );
+
+        llm_route_request_headers.insert(
+            header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER),
+            header::HeaderValue::from_str(&self.routing_model_name).unwrap(),
+        );
+
+        if let Some(trace_parent) = trace_parent {
+            llm_route_request_headers.insert(
+                header::HeaderName::from_static("traceparent"),
+                header::HeaderValue::from_str(&trace_parent).unwrap(),
+            );
+        }
+
+        let res = self
+            .client
+            .post(&self.router_url)
+            .headers(llm_route_request_headers)
+            .body(router_request_json)
+            .send()
+            .await?;
+
+        let body = res.text().await?;
+
+        let chat_completion_response: ChatCompletionsResponse = match serde_json::from_str(&body) {
+            Ok(response) => response,
+            Err(err) => {
+                warn!(
+                    "Failed to parse JSON: {}. Body: {}",
+                    err,
+                    &serde_json::to_string(&body).unwrap()
+                );
+                // The error's own message already says "Failed to parse JSON",
+                // so pass the raw body instead of repeating that prefix.
+                return Err(RoutingError::JsonError(err, body));
+            }
+        };
+
+        // A reply without choices or without message content cannot name a
+        // route; report "no route" instead of panicking.
+        let Some(content) = chat_completion_response
+            .choices
+            .first()
+            .and_then(|choice| choice.message.content.as_ref())
+        else {
+            warn!("router response contained no message content; no route selected");
+            return Ok(None);
+        };
+
+        let selected_llm = self.router_model.parse_response(content)?;
+
+        Ok(selected_llm)
+    }
+}
--- a/crates/brightstaff/src/router/mod.rs
+++ b/crates/brightstaff/src/router/mod.rs
@ -0,0 +1,3 @@
+pub mod llm_router;
+pub mod router_model;
+pub mod router_model_v1;
--- a/crates/brightstaff/src/router/router_model.rs
+++ b/crates/brightstaff/src/router/router_model.rs
@ -0,0 +1,15 @@
+use common::api::open_ai::{ChatCompletionsRequest, Message};
+use thiserror::Error;
+
+/// Errors that a [`RouterModel`] implementation can return.
+#[derive(Debug, Error)]
+pub enum RoutingModelError {
+    /// A `serde_json` failure; `#[from]` lets `?` convert `serde_json::Error` into this variant.
+    #[error("Failed to parse JSON: {0}")]
+    JsonError(#[from] serde_json::Error),
+}
+
+pub type Result<T> = std::result::Result<T, RoutingModelError>;
+
+/// Abstraction over a routing model: builds the request that is sent to the
+/// model and interprets the text the model returns.
+///
+/// Implementations must be `Send + Sync`.
+pub trait RouterModel: Send + Sync {
+    /// Builds the chat-completions request for the given conversation `messages`.
+    fn generate_request(&self, messages: &[Message]) -> ChatCompletionsRequest;
+    /// Parses the routing model's reply `content` into an optional route name.
+    fn parse_response(&self, content: &str) -> Result<Option<String>>;
+}
--- a/crates/brightstaff/src/router/router_model_v1.rs
+++ b/crates/brightstaff/src/router/router_model_v1.rs
@ -0,0 +1,251 @@
+use common::{
+    api::open_ai::{ChatCompletionsRequest, Message},
+    consts::{SYSTEM_ROLE, USER_ROLE},
+};
+use serde::{Deserialize, Serialize};
+use tracing::info;
+
+use super::router_model::{RouterModel, RoutingModelError};
+
+/// Prompt template sent to the routing model.
+///
+/// The `{routes}` and `{conversation}` placeholders are filled in with plain
+/// string replacement by `RouterModelV1::generate_request`. The text is compared
+/// byte-for-byte by `test_system_prompt_format`, so any wording change must be
+/// mirrored in that test.
+pub const ARCH_ROUTER_V1_SYSTEM_PROMPT: &str = r#"
+You are a helpful assistant designed to find the best suited route.
+You are provided with route description within <routes></routes> XML tags:
+<routes>
+{routes}
+</routes>
+
+Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
+1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
+2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
+3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
+
+Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
+{"route": "route_name"}
+
+
+<conversation>
+{conversation}
+</conversation>
+"#;
+
+pub type Result<T> = std::result::Result<T, RoutingModelError>;
+
+/// Router model (v1): holds the route descriptions and the name of the model
+/// that is asked to pick a route.
+pub struct RouterModelV1 {
+    /// Text substituted for the `{routes}` placeholder of the routing prompt.
+    llm_providers_with_usage_yaml: String,
+    /// Value placed in the `model` field of every generated request.
+    routing_model: String,
+}
+
+impl RouterModelV1 {
+    /// Creates a router model from the route descriptions and the routing model name.
+    pub fn new(llm_providers_with_usage_yaml: String, routing_model: String) -> Self {
+        Self {
+            routing_model,
+            llm_providers_with_usage_yaml,
+        }
+    }
+}
+
+/// JSON object expected back from the routing model, e.g. `{"route": "route_name"}`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct LlmRouterResponse {
+    /// Selected route name; `None` when the key is missing or `null`.
+    pub route: Option<String>,
+}
+
+impl RouterModel for RouterModelV1 {
+    /// Renders the non-system messages into the routing prompt and wraps the
+    /// prompt in a single-message, non-streaming chat-completions request.
+    fn generate_request(&self, messages: &[Message]) -> ChatCompletionsRequest {
+        // One "<role>: <JSON-encoded content>" line per message; system messages are left out.
+        let mut conversation_lines: Vec<String> = Vec::new();
+        for msg in messages {
+            if msg.role == SYSTEM_ROLE {
+                continue;
+            }
+            let encoded_content = serde_json::to_string(&msg.content).unwrap_or_default();
+            conversation_lines.push(format!("{}: {}", msg.role, encoded_content));
+        }
+        let conversation = conversation_lines.join("\n");
+
+        let prompt = ARCH_ROUTER_V1_SYSTEM_PROMPT
+            .replace("{routes}", &self.llm_providers_with_usage_yaml)
+            .replace("{conversation}", &conversation);
+
+        // The whole prompt travels as a single user message.
+        let prompt_message = Message {
+            role: USER_ROLE.to_string(),
+            content: Some(prompt),
+            model: None,
+            tool_calls: None,
+            tool_call_id: None,
+        };
+
+        ChatCompletionsRequest {
+            model: self.routing_model.clone(),
+            messages: vec![prompt_message],
+            tools: None,
+            stream: false,
+            stream_options: None,
+            metadata: None,
+        }
+    }
+
+    /// Extracts the route name from the routing model's reply.
+    ///
+    /// Returns `Ok(None)` for an empty reply or a missing, `null` or empty
+    /// `route`, and an error when the cleaned-up reply is not valid JSON.
+    fn parse_response(&self, content: &str) -> Result<Option<String>> {
+        if content.is_empty() {
+            return Ok(None);
+        }
+
+        let cleaned = fix_json_response(content);
+        info!(
+            "router response (fixed): {}",
+            cleaned.replace("\n", "\\n")
+        );
+
+        let parsed: LlmRouterResponse = serde_json::from_str(&cleaned)?;
+
+        // An empty route name means "no route", same as a missing one.
+        Ok(parsed.route.filter(|route| !route.is_empty()))
+    }
+}
+
+/// Cleans up common formatting quirks in a routing-model reply so that it can
+/// be parsed as JSON.
+///
+/// The following repairs are applied, in order:
+/// 1. single quotes are replaced with double quotes;
+/// 2. literal backslash-`n` sequences (the two characters `\` and `n`) are removed;
+/// 3. surrounding whitespace is trimmed, so fences are detected even when the
+///    reply starts or ends with blank space or a newline;
+/// 4. a leading Markdown fence (```` ```json ```` or a bare ```` ``` ````) and a
+///    trailing ```` ``` ```` fence are stripped;
+/// 5. whitespace left behind by the fences is trimmed.
+///
+/// Returns the cleaned text; no JSON validation is done here.
+fn fix_json_response(body: &str) -> String {
+    let normalized = body.replace("'", "\"").replace("\\n", "");
+
+    // Trim first: a trailing newline after the closing fence would otherwise
+    // hide it from `strip_suffix` and leave invalid JSON behind.
+    let mut json = normalized.trim();
+
+    if let Some(rest) = json
+        .strip_prefix("```json")
+        .or_else(|| json.strip_prefix("```"))
+    {
+        json = rest;
+    }
+
+    if let Some(rest) = json.strip_suffix("```") {
+        json = rest;
+    }
+
+    json.trim().to_string()
+}
+
+/// Opaque `Debug` output for router model trait objects: always prints the
+/// fixed text `RouterModel`, regardless of the concrete implementation.
+impl std::fmt::Debug for dyn RouterModel {
+    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        formatter.write_str("RouterModel")
+    }
+}
+
+/// Unit tests for `RouterModelV1`: prompt rendering and reply parsing.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
+    /// The rendered prompt must match the template byte-for-byte, with system
+    /// messages dropped and message contents JSON-encoded.
+    #[test]
+    fn test_system_prompt_format() {
+        let expected_prompt = r#"
+You are a helpful assistant designed to find the best suited route.
+You are provided with route description within <routes></routes> XML tags:
+<routes>
+route1: description1
+route2: description2
+</routes>
+
+Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
+1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
+2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
+3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
+
+Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
+{"route": "route_name"}
+
+
+<conversation>
+user: "Hello, I want to book a flight."
+assistant: "Sure, where would you like to go?"
+user: "seattle"
+</conversation>
+"#;
+
+        let routes_yaml = "route1: description1\nroute2: description2";
+        let routing_model = "test-model".to_string();
+        let router = RouterModelV1::new(routes_yaml.to_string(), routing_model.clone());
+
+        let messages = vec![
+            Message {
+                role: "system".to_string(),
+                content: Some("You are a helpful assistant.".to_string()),
+                ..Default::default()
+            },
+            Message {
+                role: "user".to_string(),
+                content: Some("Hello, I want to book a flight.".to_string()),
+                ..Default::default()
+            },
+            Message {
+                role: "assistant".to_string(),
+                content: Some("Sure, where would you like to go?".to_string()),
+                ..Default::default()
+            },
+            Message {
+                role: "user".to_string(),
+                content: Some("seattle".to_string()),
+                ..Default::default()
+            },
+        ];
+
+        let req = router.generate_request(&messages);
+
+        let prompt = req.messages[0].content.as_ref().unwrap();
+
+        println!("Prompt: {}", prompt);
+
+        assert_eq!(expected_prompt, prompt);
+    }
+
+    /// Covers valid, empty, null, missing, malformed and "repaired" replies.
+    // Lives inside `mod tests` so it shares the module's `#[cfg(test)]` gate
+    // and its `pretty_assertions::assert_eq` import.
+    #[test]
+    fn test_parse_response() {
+        let router = RouterModelV1::new(
+            "route1: description1\nroute2: description2".to_string(),
+            "test-model".to_string(),
+        );
+
+        // Case 1: Valid JSON with non-empty route
+        let input = r#"{"route": "route1"}"#;
+        let result = router.parse_response(input).unwrap();
+        assert_eq!(result, Some("route1".to_string()));
+
+        // Case 2: Valid JSON with empty route
+        let input = r#"{"route": ""}"#;
+        let result = router.parse_response(input).unwrap();
+        assert_eq!(result, None);
+
+        // Case 3: Valid JSON with null route
+        let input = r#"{"route": null}"#;
+        let result = router.parse_response(input).unwrap();
+        assert_eq!(result, None);
+
+        // Case 4: JSON missing route field
+        let input = r#"{}"#;
+        let result = router.parse_response(input).unwrap();
+        assert_eq!(result, None);
+
+        // Case 4.1: empty string
+        let input = r#""#;
+        let result = router.parse_response(input).unwrap();
+        assert_eq!(result, None);
+
+        // Case 5: Malformed JSON
+        let input = r#"{"route": "route1""#; // missing closing }
+        let result = router.parse_response(input);
+        assert!(result.is_err());
+
+        // Case 6: Single quotes and \n in JSON
+        let input = "{'route': 'route2'}\\n";
+        let result = router.parse_response(input).unwrap();
+        assert_eq!(result, Some("route2".to_string()));
+
+        // Case 7: Code block marker
+        let input = "```json\n{\"route\": \"route1\"}\n```";
+        let result = router.parse_response(input).unwrap();
+        assert_eq!(result, Some("route1".to_string()));
+    }
+}
--- a/crates/common/src/api/open_ai.rs
+++ b/crates/common/src/api/open_ai.rs
@ -171,6 +171,18 @@ pub struct Message {
    pub tool_call_id: Option<String>,
 }

+/// Default message: an assistant-role message with every optional field unset.
+impl Default for Message {
+    fn default() -> Self {
+        Self {
+            tool_call_id: None,
+            tool_calls: None,
+            model: None,
+            content: None,
+            role: String::from(ASSISTANT_ROLE),
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Choice {
    pub finish_reason: Option<String>,
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -6,6 +6,11 @@ use crate::api::open_ai::{
    ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType,
 };

+/// Routing settings, stored in the optional `routing` field of [`Configuration`].
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Routing {
+    /// Model name configured for routing.
+    pub model: String,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Configuration {
    pub version: String,
@ -19,6 +24,7 @@ pub struct Configuration {
    pub ratelimits: Option<Vec<Ratelimit>>,
    pub tracing: Option<Tracing>,
    pub mode: Option<GatewayMode>,
+    pub routing: Option<Routing>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -166,6 +172,7 @@ pub struct LlmProvider {
    pub endpoint: Option<String>,
    pub port: Option<u16>,
    pub rate_limits: Option<LlmRatelimit>,
+    pub usage: Option<String>,
 }

 impl Display for LlmProvider {
--- a/crates/common/src/consts.rs
+++ b/crates/common/src/consts.rs
@ -11,7 +11,8 @@ pub const MODEL_SERVER_NAME: &str = "model_server";
 pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
 pub const MESSAGES_KEY: &str = "messages";
 pub const ARCH_PROVIDER_HINT_HEADER: &str = "x-arch-llm-provider-hint";
-pub const CHAT_COMPLETIONS_PATH: [&str; 2] = ["/v1/chat/completions", "/openai/v1/chat/completions"];
+pub const CHAT_COMPLETIONS_PATH: [&str; 2] =
+    ["/v1/chat/completions", "/openai/v1/chat/completions"];
 pub const HEALTHZ_PATH: &str = "/healthz";
 pub const X_ARCH_STATE_HEADER: &str = "x-arch-state";
 pub const X_ARCH_API_RESPONSE: &str = "x-arch-api-response-message";
@ -27,3 +28,4 @@ pub const HALLUCINATION_TEMPLATE: &str =
    "It seems I'm missing some information. Could you provide the following details ";
 pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http";
 pub const OTEL_POST_PATH: &str = "/v1/traces";
+pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route";
--- a/crates/common/src/lib.rs
+++ b/crates/common/src/lib.rs
@ -11,3 +11,4 @@ pub mod routing;
 pub mod stats;
 pub mod tokenizer;
 pub mod tracing;
+pub mod utils;
--- a/crates/common/src/utils.rs
+++ b/crates/common/src/utils.rs
@ -0,0 +1,7 @@
+/// Shortens `s` for logging.
+///
+/// Strings of at most 80 bytes are returned unchanged. Longer strings are cut
+/// to at most 80 bytes and suffixed with `...`. The cut is moved back to the
+/// nearest UTF-8 character boundary, so multi-byte characters are never split
+/// (slicing in the middle of one would panic).
+pub fn shorten_string(s: &str) -> String {
+    const MAX_LEN: usize = 80;
+
+    if s.len() <= MAX_LEN {
+        return s.to_string();
+    }
+
+    // Index 0 is always a character boundary, so this loop terminates.
+    let mut end = MAX_LEN;
+    while !s.is_char_boundary(end) {
+        end -= 1;
+    }
+
+    format!("{}...", &s[..end])
+}
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -228,6 +228,7 @@ impl HttpContext for StreamContext {
                stream: None,
                port: None,
                rate_limits: None,
+                usage: None,
            }));
        } else {
            self.select_llm_provider();
@ -316,10 +317,6 @@ impl HttpContext for StreamContext {
                }
            };

-        // remove metadata from the request body
-        //TODO: move this to prompt gateway
-        // deserialized_body.metadata = None;
-        // delete model key from message array
        for message in deserialized_body.messages.iter_mut() {
            message.model = None;
        }
@ -342,24 +339,22 @@ impl HttpContext for StreamContext {
        };

        let model_requested = deserialized_body.model.clone();
-        if deserialized_body.model.is_empty() || deserialized_body.model.to_lowercase() == "none" {
-            deserialized_body.model = match model_name {
-                Some(model_name) => model_name.clone(),
-                None => {
-                    if use_agent_orchestrator {
-                        "agent_orchestrator".to_string()
-                    } else {
-                        self.send_server_error(
-                          ServerError::BadRequest {
-                              why: format!("No model specified in request and couldn't determine model name from arch_config. Model name in req: {}, arch_config, provider: {}, model: {:?}", deserialized_body.model, self.llm_provider().name, self.llm_provider().model).to_string(),
-                          },
-                          Some(StatusCode::BAD_REQUEST),
-                      );
-                        return Action::Continue;
-                    }
+        deserialized_body.model = match model_name {
+            Some(model_name) => model_name.clone(),
+            None => {
+                if use_agent_orchestrator {
+                    "agent_orchestrator".to_string()
+                } else {
+                    self.send_server_error(
+                      ServerError::BadRequest {
+                          why: format!("No model specified in request and couldn't determine model name from arch_config. Model name in req: {}, arch_config, provider: {}, model: {:?}", deserialized_body.model, self.llm_provider().name, self.llm_provider().model).to_string(),
+                      },
+                      Some(StatusCode::BAD_REQUEST),
+                  );
+                    return Action::Continue;
                }
            }
-        }
+        };

        info!(
            "on_http_request_body: provider: {}, model requested: {}, model selected: {}",
--- a/crates/llm_gateway/tests/integration.rs
+++ b/crates/llm_gateway/tests/integration.rs
@ -489,7 +489,6 @@ fn llm_gateway_override_model_name() {
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Debug), None)
        .expect_metric_record("input_sequence_length", 29)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
@ -777,18 +777,19 @@ impl StreamContext {

 fn check_intent_matched(model_server_response: &ChatCompletionsResponse) -> bool {
    let content = model_server_response
-        .choices.first()
+        .choices
+        .first()
        .and_then(|choice| choice.message.content.as_ref());

    let content_has_value = content.is_some() && !content.unwrap().is_empty();

    let tool_calls = model_server_response
-        .choices.first()
+        .choices
+        .first()
        .and_then(|choice| choice.message.tool_calls.as_ref());

     // intent was matched if content has some value or tool_calls is not empty

-
    content_has_value || (tool_calls.is_some() && !tool_calls.unwrap().is_empty())
 }

--- a/demos/shared/test_runner/run_demo_tests.sh
+++ b/demos/shared/test_runner/run_demo_tests.sh
@ -1,13 +1,16 @@
 #!/bin/bash
 set -eu

+echo "docker images"
+docker images
+
 # for demo in currency_exchange hr_agent
-for demo in currency_exchange
+for demo in samples_python/currency_exchange use_cases/preference_based_routing
 do
  echo "******************************************"
  echo "Running tests for $demo ..."
  echo "****************************************"
-  cd ../../samples_python/$demo
+  cd ../../$demo
  echo "starting archgw"
  archgw up arch_config.yaml
  echo "starting docker containers"
--- a/demos/use_cases/preference_based_routing/README.md
+++ b/demos/use_cases/preference_based_routing/README.md
@ -0,0 +1,2 @@
+# Usage based LLM Routing
+This demo shows how you can use user preferences to route user prompts to the appropriate LLM. See [arch_config.yaml](arch_config.yaml) for details on how to define user preferences.
--- a/demos/use_cases/preference_based_routing/arch_config.yaml
+++ b/demos/use_cases/preference_based_routing/arch_config.yaml
@ -0,0 +1,39 @@
+# Gateway configuration for the preference-based routing demo.
+version: "0.1-beta"
+
+# Routing settings.
+# NOTE(review): confirm how `model` relates to the `archgw-v1-router-model` provider below.
+routing:
+  model: gpt-4o
+
+# The demo clients (hurl tests, chatbot UI) send their requests to port 12000.
+listeners:
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 30s
+
+llm_providers:
+
+  # NOTE(review): hard-coded IP address; confirm this endpoint is still reachable.
+  - name: archgw-v1-router-model
+    provider_interface: openai
+    model: cotran2/llama-1b-4-26
+    base_url: http://35.192.87.187:8000/v1
+
+  # No `usage` description here; marked as the default provider.
+  - name: gpt-4o-mini
+    provider_interface: openai
+    access_key: $OPENAI_API_KEY
+    model: gpt-4o-mini
+    default: true
+
+  # `usage` describes the kind of prompt this provider should handle.
+  # Presumably these descriptions are what the router model chooses between; verify.
+  - name: gpt-4o
+    provider_interface: openai
+    access_key: $OPENAI_API_KEY
+    model: gpt-4o
+    usage: Generating original content such as scripts, articles, or creative materials.
+
+  # The hurl tests in this demo expect a personal-finance prompt to be answered by this model.
+  - name: o4-mini
+    provider_interface: openai
+    access_key: $OPENAI_API_KEY
+    model: o4-mini
+    usage: Requesting topic ideas specifically related to personal finance and budgeting.
+
+tracing:
+  random_sampling: 100
--- a/demos/use_cases/preference_based_routing/docker-compose.yaml
+++ b/demos/use_cases/preference_based_routing/docker-compose.yaml
@ -0,0 +1,32 @@
+# Supporting containers for the preference-based routing demo.
+services:
+
+  # Chat UI: host port 18080 maps to container port 8080.
+  chatbot_ui:
+    build:
+      context: ../../shared/chatbot_ui
+      dockerfile: Dockerfile
+    ports:
+      - "18080:8080"
+    environment:
+      # Port 12000 on the Docker host is the egress listener port in arch_config.yaml.
+      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:12000/v1
+    extra_hosts:
+      # Makes host.docker.internal resolve to the host gateway from inside the container.
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - ./arch_config.yaml:/app/arch_config.yaml
+
+  jaeger:
+    build:
+      context: ../../shared/jaeger
+    ports:
+      - "16686:16686"
+      - "4317:4317"
+      - "4318:4318"
+
+  # No ports are published to the host for this service.
+  prometheus:
+    build:
+      context: ../../shared/prometheus
+
+  grafana:
+    build:
+      context: ../../shared/grafana
+    ports:
+      - "3000:3000"
--- a/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl
+++ b/demos/use_cases/preference_based_routing/hurl_tests/simple.hurl
@ -0,0 +1,18 @@
+POST http://localhost:12000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "I am running under debt, how should I keep a tab on my expenses?"
+    }
+  ]
+}
+HTTP 200
+[Asserts]
+header "content-type" == "application/json"
+jsonpath "$.model" matches /^o4-mini/
+jsonpath "$.usage" != null
+jsonpath "$.choices[0].message.content" != null
+jsonpath "$.choices[0].message.role" == "assistant"
--- a/demos/use_cases/preference_based_routing/hurl_tests/simple_stream.hurl
+++ b/demos/use_cases/preference_based_routing/hurl_tests/simple_stream.hurl
@ -0,0 +1,16 @@
+POST http://localhost:12000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "I am running under debt, how should I keep a tab on my expenses?"
+    }
+  ],
+  "stream": true
+}
+HTTP 200
+[Asserts]
+header "content-type" matches /text\/event-stream/
+body matches /^data: .*?o4-mini.*?\n/
--- a/demos/use_cases/preference_based_routing/test_router_endpoint.rest
+++ b/demos/use_cases/preference_based_routing/test_router_endpoint.rest
@ -0,0 +1,24 @@
+@arch_llm_router_endpoint = http://35.192.87.187:8000
+
+POST {{arch_llm_router_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+
+{
+  "model": "cotran2/llama-1b-4-26",
+  "messages": [
+    {
+      "role": "user",
+      "content": "You are an advanced Routing Assistant designed to select the optimal route based on user requests. \nYour task is to analyze conversations and match them to the most appropriate predefined route.\nReview the available routes config:\n\n# ROUTES CONFIG START\n- name: gpt-4o()\n  description: \"complex reasoning problem, require multi step answer\\n\"\n- name: o4-mini()\n  description: \"simple requests, basic fact retrieval, easy to answer\\n\"\n\n# ROUTES CONFIG END\n\nExamine the following conversation between a user and an assistant:\n\n# CONVERSATION START\n\nuser: Hello\nassistant: Hi! How can I assist you today?\nuser: List us presidents who are born in odd years and are still alive. Order them by their age and I also know what is their home city they were born. And what year they became president. Also give me summary of which president was the best for economy of the US.\n\n# CONVERSATION END\n\nYour goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:\n\n1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.\n2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).\n3. Find the route that best matches.\n4. Use context clues from the entire conversation to determine the best fit.\n5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.\n6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. 
\n\n\n# OUTPUT FORMAT\nYour final output must follow this JSON format:\n{\n  \"route\": \"route_name\" # The matched route name, or empty string '' if no match\n}\n\nBased on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace."
+    }
+  ]
+}
+
+### test 2
+
+POST {{arch_llm_router_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+
+{"model":"cotran2/llama-1b-4-26","messages":[{"role":"user","content":"\nYou are an advanced Routing Assistant designed to select the optimal route based on user requests. \nYour task is to analyze conversations and match them to the most appropriate predefined route.\nReview the available routes config:\n\n# ROUTES CONFIG START\n- name: gpt-4o\n  description: simple requests, basic fact retrieval, easy to answer\n- name: o4-mini()\n  description: complex reasoning problem, require multi step answer\n# ROUTES CONFIG END\n\nExamine the following conversation between a user and an assistant:\n\n# CONVERSATION START\n[{\"role\":\"user\",\"content\":\"What is the capital of France?\"}]\n# CONVERSATION END\n\nYour goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:\n\n1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.\n2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).\n3. Find the route that best matches.\n4. Use context clues from the entire conversation to determine the best fit.\n5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.\n6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. \n\n# OUTPUT FORMAT\nYour final output must follow this JSON format:\n{\n  \"route\": \"route_name\" # The matched route name, or empty string '' if no match\n}\n\nBased on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.\n"}],"stream":false}
+
+### get model list (same router endpoint as the requests above)
+GET {{arch_llm_router_endpoint}}/v1/models HTTP/1.1
--- a/tests/rest/llm_routing.rest
+++ b/tests/rest/llm_routing.rest
@ -0,0 +1,77 @@
+@llm_endpoint = http://localhost:12000
+@openai_endpoint = https://api.openai.com
+@access_key = {{$dotenv OPENAI_API_KEY}}
+
+### openai request (non-streaming; the streaming variant follows below)
+POST {{openai_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+Authorization: Bearer {{access_key}}
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "hello"
+    }
+  ],
+  "model": "gpt-4o-mini",
+  "stream": false
+}
+
+### openai request (streaming)
+POST {{openai_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+Authorization: Bearer {{access_key}}
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "hello"
+    }
+  ],
+  "model": "gpt-4o-mini",
+  "stream": true
+}
+
+
+### llm gateway request
+POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "hello"
+    }
+  ]
+}
+
+### llm gateway request (streaming)
+POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "hello"
+    }
+  ],
+  "stream": true
+}
+
+### llm gateway request (provider hint)
+POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+x-arch-llm-provider-hint: gpt-3.5-turbo-0125
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "hello"
+    }
+  ]
+}