diff --git a/.github/workflows/e2e_archgw.yml b/.github/workflows/e2e_archgw.yml
index d6bcb771..b4c6d4ed 100644
--- a/.github/workflows/e2e_archgw.yml
+++ b/.github/workflows/e2e_archgw.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: build arch docker image
         run: |
-          cd ../../ && docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.0 -t katanemo/archgw:latest
+          cd ../../ && docker build -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.1 -t katanemo/archgw:latest
 
       - name: start archgw
         env:
diff --git a/.github/workflows/e2e_test_demos.yml b/.github/workflows/e2e_test_currency_convert.yml
similarity index 88%
rename from .github/workflows/e2e_test_demos.yml
rename to .github/workflows/e2e_test_currency_convert.yml
index a9f82a39..00f1d038 100644
--- a/.github/workflows/e2e_test_demos.yml
+++ b/.github/workflows/e2e_test_currency_convert.yml
@@ -1,4 +1,7 @@
-name: e2e demo tests
+name: e2e demo tests currency conversion
+
+permissions:
+  contents: read
 
 on:
   push:
@@ -21,7 +24,7 @@ jobs:
 
       - name: build arch docker image
         run: |
-          docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.0
+          docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.1
 
       - name: install poetry
         run: |
@@ -51,4 +54,4 @@ jobs:
           GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
         run: |
           source venv/bin/activate
-          cd demos/shared/test_runner && sh run_demo_tests.sh
+          cd demos/shared/test_runner && sh run_demo_tests.sh samples_python/currency_exchange
diff --git a/.github/workflows/e2e_test_preference_based_routing.yml b/.github/workflows/e2e_test_preference_based_routing.yml
new file mode 100644
index 00000000..de416c7d
--- /dev/null
+++ b/.github/workflows/e2e_test_preference_based_routing.yml
@@ -0,0 +1,58 @@
+name: e2e demo preference based routing tests
+
+permissions:  # scopes the workflow's GITHUB_TOKEN
+  contents: read  # read-only access to repository contents
+
+on:
+  push:
+    branches:
+      - main  # pushes to main only
+  pull_request:  # no branch filter given here
+
+jobs:
+  e2e_demo_tests:
+    runs-on: ubuntu-latest-m  # NOTE(review): non-standard runner label, presumably a larger hosted runner - confirm it exists for this org
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"  # quoted so YAML keeps it a string rather than a float
+
+      - name: build arch docker image  # image is tagged katanemo/archgw and katanemo/archgw:0.3.1
+        run: |
+          docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.1
+
+      - name: install poetry  # exports POETRY_VERSION=1.8.5 before piping the installer script to python3
+        run: |
+          export POETRY_VERSION=1.8.5
+          curl -sSL https://install.python-poetry.org | python3 -
+
+      - name: setup python venv  # creates ./venv, which later steps activate
+        run: |
+          python -m venv venv
+
+      - name: install hurl  # downloads the hurl 4.0.0 amd64 .deb release and installs it with dpkg
+        run: |
+          curl --location --remote-name https://github.com/Orange-OpenSource/hurl/releases/download/4.0.0/hurl_4.0.0_amd64.deb
+          sudo dpkg -i hurl_4.0.0_amd64.deb
+
+      - name: install model server, arch gateway and test dependencies  # activates venv, then runs `poetry install` in three directories
+        run: |
+          source venv/bin/activate
+          cd model_server/ && echo "installing model server" && poetry install
+          cd ../arch/tools && echo "installing archgw cli" && poetry install
+          cd ../../demos/shared/test_runner && echo "installing test dependencies" && poetry install
+
+      - name: run demo tests  # passes use_cases/preference_based_routing as the argument to run_demo_tests.sh
+        env:  # NOTE(review): secrets are not passed to runs triggered by pull requests from forks - confirm the tests cope with empty keys
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
+          ARCH_API_KEY: ${{ secrets.ARCH_API_KEY }}
+        run: |
+          source venv/bin/activate
+          cd demos/shared/test_runner && sh run_demo_tests.sh use_cases/preference_based_routing
diff --git a/.github/workflows/validate_arch_config.yml b/.github/workflows/validate_arch_config.yml
index 67bb8245..e1d85747 100644
--- a/.github/workflows/validate_arch_config.yml
+++ b/.github/workflows/validate_arch_config.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: build arch docker image
         run: |
-          docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.0
+          docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.1
 
       - name: validate arch config
         run: |
diff --git a/README.md b/README.md
index 518e816f..ce2f503d 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.
 ```console
 $ python -m venv venv
 $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-$ pip install archgw==0.3.0
+$ pip install archgw==0.3.1
 ```
 
 ### Build AI Agent with Arch Gateway
@@ -93,7 +93,7 @@ In following quickstart we will show you how easy it is to build AI agent with A
 Create `arch_config.yaml` file with following content,
 
 ```yaml
-version: v0.1
+version: v0.1.0
 
 listeners:
   ingress_traffic:
@@ -195,7 +195,7 @@ Arch operates based on a configuration file where you can define LLM providers,
 Create `arch_config.yaml` file with following content:
 
 ```yaml
-version: v0.1
+version: v0.1.0
 
 listeners:
   egress_traffic:
diff --git a/arch/Dockerfile b/arch/Dockerfile
index b3117630..53a20c0e 100644
--- a/arch/Dockerfile
+++ b/arch/Dockerfile
@@ -1,5 +1,5 @@
 # build docker image for arch gateway
-FROM rust:1.82.0 as builder
+FROM rust:1.82.0 AS builder
 RUN rustup -v target add wasm32-wasip1
 WORKDIR /arch
 COPY crates .
@@ -8,10 +8,10 @@ RUN cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gatewa
 RUN cargo build --release -p brightstaff
 
 # copy built filter into envoy image
-FROM docker.io/envoyproxy/envoy:v1.32-latest as envoy
+FROM docker.io/envoyproxy/envoy:v1.32-latest AS envoy
 
 #Build config generator, so that we have a single build image for both Rust and Python
-FROM python:3.12-slim as arch
+FROM python:3.12-slim AS arch
 
 RUN apt-get update && apt-get install -y supervisor gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/*
 
diff --git a/arch/arch_config_schema.yaml b/arch/arch_config_schema.yaml
index b30282a0..867836a0 100644
--- a/arch/arch_config_schema.yaml
+++ b/arch/arch_config_schema.yaml
@@ -3,6 +3,10 @@ type: object
 properties:
   version:
     type: string
+    enum:
+      - v0.1
+      - v0.1.0
+      - 0.1-beta
   listeners:
     type: object
     additionalProperties: false
@@ -70,6 +74,7 @@ properties:
         provider_interface:
           type: string
           enum:
+            - arch
             - claude
             - deepseek
             - groq
diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml
index 56070bfe..92db9f3b 100644
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@@ -472,6 +472,31 @@ static_resources:
 
   clusters:
 
+    - name: arch  # upstream cluster pointing at archfc.katanemo.dev:443 over TLS
+      connect_timeout: 0.5s
+      type: LOGICAL_DNS
+      dns_lookup_family: V4_ONLY  # resolve IPv4 addresses only
+      lb_policy: ROUND_ROBIN
+      load_assignment:
+        cluster_name: arch  # same value as the cluster name above
+        endpoints:
+          - lb_endpoints:
+              - endpoint:
+                  address:
+                    socket_address:
+                      address: archfc.katanemo.dev
+                      port_value: 443
+                  hostname: "archfc.katanemo.dev"
+      transport_socket:  # NOTE(review): no validation_context is configured here, so the upstream certificate is presumably not verified - confirm this is intended
+        name: envoy.transport_sockets.tls
+        typed_config:
+          "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
+          sni: archfc.katanemo.dev  # SNI sent in the TLS handshake; same host as the socket address
+          common_tls_context:
+            tls_params:  # accept TLS 1.2 through TLS 1.3
+              tls_minimum_protocol_version: TLSv1_2
+              tls_maximum_protocol_version: TLSv1_3
+
     - name: claude
       connect_timeout: 0.5s
       type: LOGICAL_DNS
@@ -752,22 +777,6 @@ static_resources:
                       port_value: 9091
                   hostname: localhost
 
-    - name: router_model_host
-      connect_timeout: 0.5s
-      type: LOGICAL_DNS
-      dns_lookup_family: V4_ONLY
-      lb_policy: ROUND_ROBIN
-      load_assignment:
-        cluster_name: router_model_host
-        endpoints:
-          - lb_endpoints:
-              - endpoint:
-                  address:
-                    socket_address:
-                      address: 34.30.16.38
-                      port_value: 8000
-                  hostname: router_model_host
-
     - name: arch_prompt_gateway_listener
       connect_timeout: 0.5s
       type: LOGICAL_DNS
diff --git a/arch/tools/README.md b/arch/tools/README.md
index 3b0669cc..d53e901a 100644
--- a/arch/tools/README.md
+++ b/arch/tools/README.md
@@ -19,7 +19,7 @@ source venv/bin/activate
 
 ### Step 3: Run the build script
 ```bash
-pip install archgw==0.3.0
+pip install archgw==0.3.1
 ```
 
 ## Uninstall Instructions: archgw CLI
diff --git a/arch/tools/cli/consts.py b/arch/tools/cli/consts.py
index c382b4ef..ed0efad5 100644
--- a/arch/tools/cli/consts.py
+++ b/arch/tools/cli/consts.py
@@ -10,4 +10,4 @@ SERVICE_NAME_MODEL_SERVER = "model_server"
 SERVICE_ALL = "all"
 MODEL_SERVER_LOG_FILE = "~/archgw_logs/modelserver.log"
 ARCHGW_DOCKER_NAME = "archgw"
-ARCHGW_DOCKER_IMAGE = os.getenv("ARCHGW_DOCKER_IMAGE", "katanemo/archgw:0.3.0")
+ARCHGW_DOCKER_IMAGE = os.getenv("ARCHGW_DOCKER_IMAGE", "katanemo/archgw:0.3.1")
diff --git a/arch/tools/poetry.lock b/arch/tools/poetry.lock
index 1da87f28..f12173c0 100644
--- a/arch/tools/poetry.lock
+++ b/arch/tools/poetry.lock
@@ -2,7 +2,7 @@
 
 [[package]]
 name = "archgw_modelserver"
-version = "0.3.0"
+version = "0.3.1"
 description = "A model server for serving models"
 optional = false
 python-versions = "*"
@@ -76,13 +76,13 @@ i18n = ["Babel (>=2.7)"]
 
 [[package]]
 name = "jsonschema"
-version = "4.23.0"
+version = "4.24.0"
 description = "An implementation of JSON Schema validation for Python"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 files = [
-    {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"},
-    {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"},
+    {file = "jsonschema-4.24.0-py3-none-any.whl", hash = "sha256:a462455f19f5faf404a7902952b6f0e3ce868f3ee09a359b05eca6673bd8412d"},
+    {file = "jsonschema-4.24.0.tar.gz", hash = "sha256:0b4e8069eb12aedfa881333004bccaec24ecef5a8a6a4b6df142b2cc9599d196"},
 ]
 
 [package.dependencies]
@@ -417,4 +417,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "f21ba72a4b91af8ddd1a1ec62eb033d0bae61490ae21a916ebcd7fe8c27045ca"
+content-hash = "50ac34641326134462d8324d6b1d595be25778c20a59dc6ec14fe064187132d4"
diff --git a/arch/tools/pyproject.toml b/arch/tools/pyproject.toml
index a47993a0..467b866d 100644
--- a/arch/tools/pyproject.toml
+++ b/arch/tools/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "archgw"
-version = "0.3.0"
+version = "0.3.1"
 description = "Python-based CLI tool to manage Arch Gateway."
 authors = ["Katanemo Labs, Inc."]
 packages = [
@@ -10,7 +10,7 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.10"
-archgw_modelserver = "^0.3.0"
+archgw_modelserver = "^0.3.1"
 click = "^8.1.7"
 jinja2 = "^3.1.4"
 jsonschema = "^4.23.0"
diff --git a/build_filter_image.sh b/build_filter_image.sh
index ba84878d..29413cde 100644
--- a/build_filter_image.sh
+++ b/build_filter_image.sh
@@ -1 +1 @@
-docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.0
+docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.1
diff --git a/crates/brightstaff/src/handlers/chat_completions.rs b/crates/brightstaff/src/handlers/chat_completions.rs
index cb3094fc..0a5bd25d 100644
--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@@ -116,62 +116,40 @@ pub async fn chat_completions(
         headers.insert(header_name, header_value.clone());
     }
 
-    if chat_completion_request.stream {
-        // channel to create async stream
-        let (tx, rx) = mpsc::channel::<Bytes>(16);
+    // channel to create async stream
+    let (tx, rx) = mpsc::channel::<Bytes>(16);
 
-        // Spawn a task to send data as it becomes available
-        tokio::spawn(async move {
-            let mut byte_stream = llm_response.bytes_stream();
+    // Spawn a task to send data as it becomes available
+    tokio::spawn(async move {
+        let mut byte_stream = llm_response.bytes_stream();
 
-            while let Some(item) = byte_stream.next().await {
-                let item = match item {
-                    Ok(item) => item,
-                    Err(err) => {
-                        warn!("Error receiving chunk: {:?}", err);
-                        break;
-                    }
-                };
-
-                if tx.send(item).await.is_err() {
-                    warn!("Receiver dropped");
+        while let Some(item) = byte_stream.next().await {
+            let item = match item {
+                Ok(item) => item,
+                Err(err) => {
+                    warn!("Error receiving chunk: {:?}", err);
                     break;
                 }
-            }
-        });
+            };
 
-        let stream = ReceiverStream::new(rx).map(|chunk| Ok::<_, hyper::Error>(Frame::data(chunk)));
-
-        let stream_body = BoxBody::new(StreamBody::new(stream));
-
-        match response.body(stream_body) {
-            Ok(response) => Ok(response),
-            Err(err) => {
-                let err_msg = format!("Failed to create response: {}", err);
-                let mut internal_error = Response::new(full(err_msg));
-                *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                Ok(internal_error)
+            if tx.send(item).await.is_err() {
+                warn!("Receiver dropped");
+                break;
             }
         }
-    } else {
-        let body = match llm_response.text().await {
-            Ok(body) => body,
-            Err(err) => {
-                let err_msg = format!("Failed to read response: {}", err);
-                let mut internal_error = Response::new(full(err_msg));
-                *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                return Ok(internal_error);
-            }
-        };
+    });
 
-        match response.body(full(body)) {
-            Ok(response) => Ok(response),
-            Err(err) => {
-                let err_msg = format!("Failed to create response: {}", err);
-                let mut internal_error = Response::new(full(err_msg));
-                *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                Ok(internal_error)
-            }
+    let stream = ReceiverStream::new(rx).map(|chunk| Ok::<_, hyper::Error>(Frame::data(chunk)));
+
+    let stream_body = BoxBody::new(StreamBody::new(stream));
+
+    match response.body(stream_body) {
+        Ok(response) => Ok(response),
+        Err(err) => {
+            let err_msg = format!("Failed to create response: {}", err);
+            let mut internal_error = Response::new(full(err_msg));
+            *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+            Ok(internal_error)
         }
     }
 }
diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs
index 8eb2d7e2..5502c983 100644
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@@ -101,20 +101,16 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
                             .with_context(parent_cx)
                             .await
                     }
-                    (&Method::GET, "/v1/models") => {
-                        Ok(list_models(llm_providers).await)
-                    }
+                    (&Method::GET, "/v1/models") => Ok(list_models(llm_providers).await),
                     (&Method::OPTIONS, "/v1/models") => {
                         let mut response = Response::new(empty());
                         *response.status_mut() = StatusCode::NO_CONTENT;
-                        response.headers_mut().insert(
-                            "Allow",
-                            "GET, OPTIONS".parse().unwrap(),
-                        );
-                        response.headers_mut().insert(
-                            "Access-Control-Allow-Origin",
-                            "*".parse().unwrap(),
-                        );
+                        response
+                            .headers_mut()
+                            .insert("Allow", "GET, OPTIONS".parse().unwrap());
+                        response
+                            .headers_mut()
+                            .insert("Access-Control-Allow-Origin", "*".parse().unwrap());
                         response.headers_mut().insert(
                             "Access-Control-Allow-Headers",
                             "Authorization, Content-Type".parse().unwrap(),
@@ -123,10 +119,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
                             "Access-Control-Allow-Methods",
                             "GET, POST, OPTIONS".parse().unwrap(),
                         );
-                        response.headers_mut().insert(
-                            "Content-Type",
-                            "application/json".parse().unwrap(),
-                        );
+                        response
+                            .headers_mut()
+                            .insert("Content-Type", "application/json".parse().unwrap());
 
                         Ok(response)
                     }
diff --git a/crates/brightstaff/src/router/llm_router.rs b/crates/brightstaff/src/router/llm_router.rs
index 8d8c057f..d4158388 100644
--- a/crates/brightstaff/src/router/llm_router.rs
+++ b/crates/brightstaff/src/router/llm_router.rs
@@ -2,7 +2,7 @@ use std::sync::Arc;
 
 use common::{
     api::open_ai::{ChatCompletionsResponse, ContentType, Message},
-    configuration::LlmProvider,
+    configuration::{LlmProvider, LlmRoute},
     consts::ARCH_PROVIDER_HINT_HEADER,
 };
 use hyper::header;
@@ -47,26 +47,10 @@ impl RouterService {
             .cloned()
             .collect::<Vec<LlmProvider>>();
 
-        // convert the llm_providers to yaml string but only include name and usage
-        let llm_providers_with_usage_yaml = providers_with_usage
-            .iter()
-            .map(|provider| {
-                format!(
-                    "- name: {}\n  description: {}",
-                    provider.name,
-                    provider.usage.as_ref().unwrap_or(&"".to_string())
-                )
-            })
-            .collect::<Vec<String>>()
-            .join("\n");
-
-        debug!(
-            "llm_providers from config with usage: {}...",
-            llm_providers_with_usage_yaml.replace("\n", "\\n")
-        );
+        let llm_routes: Vec<LlmRoute> = providers_with_usage.iter().map(LlmRoute::from).collect();
 
         let router_model = Arc::new(router_model_v1::RouterModelV1::new(
-            llm_providers_with_usage_yaml.clone(),
+            llm_routes,
             routing_model_name.clone(),
             router_model_v1::MAX_TOKEN_LEN,
         ));
@@ -120,6 +104,11 @@ impl RouterService {
             );
         }
 
+        llm_route_request_headers.insert(
+            header::HeaderName::from_static("model"),
+            header::HeaderValue::from_static("arch-router"),
+        );
+
         let start_time = std::time::Instant::now();
         let res = self
             .client
diff --git a/crates/brightstaff/src/router/router_model_v1.rs b/crates/brightstaff/src/router/router_model_v1.rs
index cbea39ff..bc69b475 100644
--- a/crates/brightstaff/src/router/router_model_v1.rs
+++ b/crates/brightstaff/src/router/router_model_v1.rs
@@ -1,6 +1,7 @@
 use common::{
     api::open_ai::{ChatCompletionsRequest, ContentType, Message},
-    consts::{SYSTEM_ROLE, USER_ROLE},
+    configuration::LlmRoute,
+    consts::{SYSTEM_ROLE, TOOL_ROLE, USER_ROLE},
 };
 use serde::{Deserialize, Serialize};
 use tracing::{debug, warn};
@@ -15,36 +16,33 @@ You are provided with route description within <routes></routes> XML tags:
 {routes}
 </routes>
 
-Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
-1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
-2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
-3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
-
-Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
-{"route": "route_name"}
-
-
 <conversation>
 {conversation}
 </conversation>
+
+Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
+1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
+2. You must analyze the route descriptions and find the best match route for user latest intent.
+3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
+
+Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
+{"route": "route_name"}
 "#;
 
 pub type Result<T> = std::result::Result<T, RoutingModelError>;
 pub struct RouterModelV1 {
-    llm_providers_with_usage_yaml: String,
+    llm_route_json_str: String,
     routing_model: String,
     max_token_length: usize,
 }
 impl RouterModelV1 {
-    pub fn new(
-        llm_providers_with_usage_yaml: String,
-        routing_model: String,
-        max_token_length: usize,
-    ) -> Self {
+    pub fn new(llm_routes: Vec<LlmRoute>, routing_model: String, max_token_length: usize) -> Self {
+        let llm_route_json_str =
+            serde_json::to_string(&llm_routes).unwrap_or_else(|_| "[]".to_string());
         RouterModelV1 {
-            llm_providers_with_usage_yaml,
             routing_model,
             max_token_length,
+            llm_route_json_str,
         }
     }
 }
@@ -58,9 +56,12 @@ const TOKEN_LENGTH_DIVISOR: usize = 4; // Approximate token length divisor for U
 
 impl RouterModel for RouterModelV1 {
     fn generate_request(&self, messages: &[Message]) -> ChatCompletionsRequest {
+        // Drop system prompts, tool-call responses, and messages without content.
+        // A message with no content is most likely an assistant tool call;
+        // a message whose role is `tool` is a tool-call response.
         let messages_vec = messages
             .iter()
-            .filter(|m| m.role != SYSTEM_ROLE)
+            .filter(|m| m.role != SYSTEM_ROLE && m.role != TOOL_ROLE && m.content.is_some())
             .collect::<Vec<&Message>>();
 
         // Following code is to ensure that the conversation does not exceed max token length
@@ -116,21 +117,23 @@ impl RouterModel for RouterModelV1 {
         }
 
         // Reverse the selected messages to maintain the conversation order
-
-        let selected_conversation_list_str = selected_messages_list_reversed
+        let selected_conversation_list = selected_messages_list_reversed
             .iter()
             .rev()
-            .map(|m| {
-                let content_json_str = serde_json::to_string(&m.content).unwrap_or_default();
-                format!("{}: {}", m.role, content_json_str)
+            .map(|message| {
+                Message::new(
+                    message.role.clone(),
+                    // we can unwrap here because we have already filtered out messages without content
+                    message.content.as_ref().unwrap().to_string(),
+                )
             })
-            .collect::<Vec<String>>();
+            .collect::<Vec<Message>>();
 
         let messages_content = ARCH_ROUTER_V1_SYSTEM_PROMPT
-            .replace("{routes}", &self.llm_providers_with_usage_yaml)
+            .replace("{routes}", &self.llm_route_json_str)
             .replace(
                 "{conversation}",
-                selected_conversation_list_str.join("\n").as_str(),
+                &serde_json::to_string(&selected_conversation_list).unwrap_or_default(),
             );
 
         ChatCompletionsRequest {
@@ -215,60 +218,53 @@ mod tests {
 You are a helpful assistant designed to find the best suited route.
 You are provided with route description within <routes></routes> XML tags:
 <routes>
-route1: description1
-route2: description2
+[{"name":"Image generation","description":"generating image"},{"name":"image conversion","description":"convert images to provided format"},{"name":"image search","description":"search image"},{"name":"Audio Processing","description":"Analyzing and interpreting audio input including speech, music, and environmental sounds"},{"name":"Speech Recognition","description":"Converting spoken language into written text"}]
 </routes>
 
+<conversation>
+[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
+</conversation>
+
 Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
-1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
-2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
-3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
+1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
+2. You must analyze the route descriptions and find the best match route for user latest intent.
+3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
 
 Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
 {"route": "route_name"}
-
-
-<conversation>
-user: "Hello, I want to book a flight."
-assistant: "Sure, where would you like to go?"
-user: "seattle"
-</conversation>
 "#;
-
-        let routes_yaml = "route1: description1\nroute2: description2";
+        let routes_str = r#"
+          [
+              {"name": "Image generation", "description": "generating image"},
+              {"name": "image conversion", "description": "convert images to provided format"},
+              {"name": "image search", "description": "search image"},
+              {"name": "Audio Processing", "description": "Analyzing and interpreting audio input including speech, music, and environmental sounds"},
+              {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
+          ]
+        "#;
+        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
-        let router = RouterModelV1::new(routes_yaml.to_string(), routing_model.clone(), usize::MAX);
+        let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX);
 
-        let messages = vec![
-            Message {
-                role: "system".to_string(),
-                content: Some(ContentType::Text(
-                    "You are a helpful assistant.".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text(
-                    "Hello, I want to book a flight.".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "assistant".to_string(),
-                content: Some(ContentType::Text(
-                    "Sure, where would you like to go?".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("seattle".to_string())),
-                ..Default::default()
-            },
-        ];
+        let conversation_str = r#"
+                    [
+                        {
+                            "role": "user",
+                            "content": "hi"
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "Hello! How can I assist you today?"
+                        },
+                        {
+                            "role": "user",
+                            "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
+                        }
+                    ]
+        "#;
+        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
 
-        let req = router.generate_request(&messages);
+        let req = router.generate_request(&conversation);
 
         let prompt = req.messages[0].content.as_ref().unwrap();
 
@@ -282,68 +278,55 @@ user: "seattle"
 You are a helpful assistant designed to find the best suited route.
 You are provided with route description within <routes></routes> XML tags:
 <routes>
-route1: description1
-route2: description2
+[{"name":"Image generation","description":"generating image"},{"name":"image conversion","description":"convert images to provided format"},{"name":"image search","description":"search image"},{"name":"Audio Processing","description":"Analyzing and interpreting audio input including speech, music, and environmental sounds"},{"name":"Speech Recognition","description":"Converting spoken language into written text"}]
 </routes>
 
+<conversation>
+[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
+</conversation>
+
 Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
-1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
-2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
-3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
+1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
+2. You must analyze the route descriptions and find the best match route for user latest intent.
+3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
 
 Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
 {"route": "route_name"}
-
-
-<conversation>
-user: "I want to book a flight."
-assistant: "Sure, where would you like to go?"
-user: "seattle"
-</conversation>
 "#;
 
-        let routes_yaml = "route1: description1\nroute2: description2";
+        let routes_str = r#"
+          [
+              {"name": "Image generation", "description": "generating image"},
+              {"name": "image conversion", "description": "convert images to provided format"},
+              {"name": "image search", "description": "search image"},
+              {"name": "Audio Processing", "description": "Analyzing and interpreting audio input including speech, music, and environmental sounds"},
+              {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
+          ]
+        "#;
+        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
-        let router = RouterModelV1::new(routes_yaml.to_string(), routing_model.clone(), 223);
+        let router = RouterModelV1::new(llm_routes, routing_model.clone(), 235);
 
-        let messages = vec![
-            Message {
-                role: "system".to_string(),
-                content: Some(ContentType::Text(
-                    "You are a helpful assistant.".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("Hi".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "assistant".to_string(),
-                content: Some(ContentType::Text("Hello! How can I assist you".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("I want to book a flight.".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "assistant".to_string(),
-                content: Some(ContentType::Text(
-                    "Sure, where would you like to go?".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("seattle".to_string())),
-                ..Default::default()
-            },
-        ];
+        let conversation_str = r#"
+                    [
+                        {
+                            "role": "user",
+                            "content": "hi"
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "Hello! How can I assist you today?"
+                        },
+                        {
+                            "role": "user",
+                            "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
+                        }
+                    ]
+        "#;
 
-        let req = router.generate_request(&messages);
+        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
+
+        let req = router.generate_request(&conversation);
 
         let prompt = req.messages[0].content.as_ref().unwrap();
 
@@ -357,69 +340,55 @@ user: "seattle"
 You are a helpful assistant designed to find the best suited route.
 You are provided with route description within <routes></routes> XML tags:
 <routes>
-route1: description1
-route2: description2
+[{"name":"Image generation","description":"generating image"},{"name":"image conversion","description":"convert images to provided format"},{"name":"image search","description":"search image"},{"name":"Audio Processing","description":"Analyzing and interpreting audio input including speech, music, and environmental sounds"},{"name":"Speech Recognition","description":"Converting spoken language into written text"}]
 </routes>
 
+<conversation>
+[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing."}]
+</conversation>
+
 Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
-1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
-2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
-3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
+1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
+2. You must analyze the route descriptions and find the best match route for user latest intent.
+3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
 
 Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
 {"route": "route_name"}
-
-
-<conversation>
-user: "Seatte, WA. But I also need to know about the weather there, and if there are any good restaurants nearby, and what the best time to visit is, and also if there are any events happening in the city."
-</conversation>
 "#;
 
-        let routes_yaml = "route1: description1\nroute2: description2";
+        let routes_str = r#"
+          [
+              {"name": "Image generation", "description": "generating image"},
+              {"name": "image conversion", "description": "convert images to provided format"},
+              {"name": "image search", "description": "search image"},
+              {"name": "Audio Processing", "description": "Analyzing and interpreting audio input including speech, music, and environmental sounds"},
+              {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
+          ]
+        "#;
+        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
-        let router = RouterModelV1::new(routes_yaml.to_string(), routing_model.clone(), 210);
+        let router = RouterModelV1::new(llm_routes, routing_model.clone(), 200);
 
-        let messages = vec![
-            Message {
-                role: "system".to_string(),
-                content: Some(ContentType::Text(
-                    "You are a helpful assistant.".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("Hi".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "assistant".to_string(),
-                content: Some(ContentType::Text("Hello! How can I assist you".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("I want to book a flight.".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "assistant".to_string(),
-                content: Some(ContentType::Text(
-                    "Sure, where would you like to go?".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("Seatte, WA. But I also need to know about the weather there, \
-                                                 and if there are any good restaurants nearby, and what the \
-                                                 best time to visit is, and also if there are any events \
-                                                 happening in the city.".to_string())),
-                ..Default::default()
-            },
-        ];
+        let conversation_str = r#"
+                    [
+                        {
+                            "role": "user",
+                            "content": "hi"
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "Hello! How can I assist you today?"
+                        },
+                        {
+                            "role": "user",
+                            "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing."
+                        }
+                    ]
+        "#;
 
-        let req = router.generate_request(&messages);
+        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
+
+        let req = router.generate_request(&conversation);
 
         let prompt = req.messages[0].content.as_ref().unwrap();
 
@@ -433,68 +402,229 @@ user: "Seatte, WA. But I also need to know about the weather there, and if there
 You are a helpful assistant designed to find the best suited route.
 You are provided with route description within <routes></routes> XML tags:
 <routes>
-route1: description1
-route2: description2
+[{"name":"Image generation","description":"generating image"},{"name":"image conversion","description":"convert images to provided format"},{"name":"image search","description":"search image"},{"name":"Audio Processing","description":"Analyzing and interpreting audio input including speech, music, and environmental sounds"},{"name":"Speech Recognition","description":"Converting spoken language into written text"}]
 </routes>
 
+<conversation>
+[{"role":"user","content":"given the image In style of Andy Warhol"},{"role":"assistant","content":"ok here is the image"},{"role":"user","content":"pls give me another image about Bart and Lisa"}]
+</conversation>
+
 Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
-1. If the latest intent from user is irrelevant, response with empty route {"route": ""}.
-2. If the user request is full fill and user thank or ending the conversation , response with empty route {"route": ""}.
-3. Understand user latest intent and find the best match route in <routes></routes> xml tags.
+1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
+2. You must analyze the route descriptions and find the best match route for user latest intent.
+3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
 
 Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
 {"route": "route_name"}
-
-
-<conversation>
-user: "I want to book a flight."
-assistant: "Sure, where would you like to go?"
-user: "seattle"
-</conversation>
 "#;
 
-        let routes_yaml = "route1: description1\nroute2: description2";
+        let routes_str = r#"
+          [
+              {"name": "Image generation", "description": "generating image"},
+              {"name": "image conversion", "description": "convert images to provided format"},
+              {"name": "image search", "description": "search image"},
+              {"name": "Audio Processing", "description": "Analyzing and interpreting audio input including speech, music, and environmental sounds"},
+              {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
+          ]
+        "#;
+        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
         let routing_model = "test-model".to_string();
-        let router = RouterModelV1::new(routes_yaml.to_string(), routing_model.clone(), 220);
+        let router = RouterModelV1::new(llm_routes, routing_model.clone(), 230);
 
-        let messages = vec![
-            Message {
-                role: "system".to_string(),
-                content: Some(ContentType::Text(
-                    "You are a helpful assistant.".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("Hi".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "assistant".to_string(),
-                content: Some(ContentType::Text("Hello! How can I assist you".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("I want to book a flight.".to_string())),
-                ..Default::default()
-            },
-            Message {
-                role: "assistant".to_string(),
-                content: Some(ContentType::Text(
-                    "Sure, where would you like to go?".to_string(),
-                )),
-                ..Default::default()
-            },
-            Message {
-                role: "user".to_string(),
-                content: Some(ContentType::Text("seattle".to_string())),
-                ..Default::default()
-            },
-        ];
+        let conversation_str = r#"
+                    [
+                        {
+                            "role": "user",
+                            "content": "hi"
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "Hello! How can I assist you today?"
+                        },
+                        {
+                            "role": "user",
+                            "content": "given the image In style of Andy Warhol"
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "ok here is the image"
+                        },
+                        {
+                            "role": "user",
+                            "content": "pls give me another image about Bart and Lisa"
+                        }
+                    ]
+        "#;
 
-        let req = router.generate_request(&messages);
+        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
+
+        let req = router.generate_request(&conversation);
+
+        let prompt = req.messages[0].content.as_ref().unwrap();
+
+        assert_eq!(expected_prompt, prompt.to_string());
+    }
+
+    #[test]
+    fn test_non_text_input() {
+        let expected_prompt = r#"
+You are a helpful assistant designed to find the best suited route.
+You are provided with route description within <routes></routes> XML tags:
+<routes>
+[{"name":"Image generation","description":"generating image"},{"name":"image conversion","description":"convert images to provided format"},{"name":"image search","description":"search image"},{"name":"Audio Processing","description":"Analyzing and interpreting audio input including speech, music, and environmental sounds"},{"name":"Speech Recognition","description":"Converting spoken language into written text"}]
+</routes>
+
+<conversation>
+[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
+</conversation>
+
+Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
+1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
+2. You must analyze the route descriptions and find the best match route for user latest intent.
+3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
+
+Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
+{"route": "route_name"}
+"#;
+        let routes_str = r#"
+          [
+              {"name": "Image generation", "description": "generating image"},
+              {"name": "image conversion", "description": "convert images to provided format"},
+              {"name": "image search", "description": "search image"},
+              {"name": "Audio Processing", "description": "Analyzing and interpreting audio input including speech, music, and environmental sounds"},
+              {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
+          ]
+        "#;
+        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let routing_model = "test-model".to_string();
+        let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX);
+
+        let conversation_str = r#"
+                    [
+                        {
+                            "role": "user",
+                            "content": [
+                              {
+                                "type": "text",
+                                "text": "hi"
+                              },
+                              {
+                                "type": "image_url",
+                                "image_url": {
+                                  "url": "https://example.com/image.png"
+                                }
+                              }
+                            ]
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "Hello! How can I assist you today?"
+                        },
+                        {
+                            "role": "user",
+                            "content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
+                        }
+                    ]
+        "#;
+        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
+        // First user turn is multi-part (text + image_url); the expected prompt keeps only its text ("hi").
+        let req = router.generate_request(&conversation);
+
+        let prompt = req.messages[0].content.as_ref().unwrap();
+
+        assert_eq!(expected_prompt, prompt.to_string());
+    }
+
+    #[test]
+    fn test_skip_tool_call() {
+        let expected_prompt = r#"
+You are a helpful assistant designed to find the best suited route.
+You are provided with route description within <routes></routes> XML tags:
+<routes>
+[{"name":"Image generation","description":"generating image"},{"name":"image conversion","description":"convert images to provided format"},{"name":"image search","description":"search image"},{"name":"Audio Processing","description":"Analyzing and interpreting audio input including speech, music, and environmental sounds"},{"name":"Speech Recognition","description":"Converting spoken language into written text"}]
+</routes>
+
+<conversation>
+[{"role":"user","content":"What's the weather like in Tokyo?"},{"role":"assistant","content":"The current weather in Tokyo is 22°C and sunny."},{"role":"user","content":"What about in New York?"}]
+</conversation>
+
+Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags.  Follow the instruction:
+1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
+2. You must analyze the route descriptions and find the best match route for user latest intent.
+3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
+
+Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
+{"route": "route_name"}
+"#;
+        let routes_str = r#"
+          [
+              {"name": "Image generation", "description": "generating image"},
+              {"name": "image conversion", "description": "convert images to provided format"},
+              {"name": "image search", "description": "search image"},
+              {"name": "Audio Processing", "description": "Analyzing and interpreting audio input including speech, music, and environmental sounds"},
+              {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
+          ]
+        "#;
+        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+        let routing_model = "test-model".to_string();
+        let router = RouterModelV1::new(llm_routes, routing_model.clone(), usize::MAX);
+
+        let conversation_str = r#"
+                                                [
+                                                  {
+                                                    "role": "user",
+                                                    "content": "What's the weather like in Tokyo?"
+                                                  },
+                                                  {
+                                                    "role": "assistant",
+                                                    "content": null,
+                                                    "tool_calls": [
+                                                      {
+                                                        "id": "toolcall-abc123",
+                                                        "type": "function",
+                                                        "function": {
+                                                          "name": "get_weather",
+                                                          "arguments": { "location": "Tokyo" }
+                                                        }
+                                                      }
+                                                    ]
+                                                  },
+                                                  {
+                                                    "role": "tool",
+                                                    "tool_call_id": "toolcall-abc123",
+                                                    "content": "{ \"temperature\": \"22°C\", \"condition\": \"Sunny\" }"
+                                                  },
+                                                  {
+                                                    "role": "assistant",
+                                                    "content": "The current weather in Tokyo is 22°C and sunny."
+                                                  },
+                                                  {
+                                                    "role": "user",
+                                                    "content": "What about in New York?"
+                                                  }
+                                                ]
+        "#;
+
+        // expects conversation to look like this
+
+        // [
+        //   {
+        //     "role": "user",
+        //     "content": "What's the weather like in Tokyo?"
+        //   },
+        //   {
+        //     "role": "assistant",
+        //     "content": "The current weather in Tokyo is 22°C and sunny."
+        //   },
+        //   {
+        //     "role": "user",
+        //     "content": "What about in New York?"
+        //   }
+        // ]
+
+        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
+
+        let req = router.generate_request(&conversation);
 
         let prompt = req.messages[0].content.as_ref().unwrap();
 
@@ -503,11 +633,18 @@ user: "seattle"
 
     #[test]
     fn test_parse_response() {
-        let router = RouterModelV1::new(
-            "route1: description1\nroute2: description2".to_string(),
-            "test-model".to_string(),
-            2000,
-        );
+        let routes_str = r#"
+[
+    {"name": "Image generation", "description": "generating image"},
+    {"name": "image conversion", "description": "convert images to provided format"},
+    {"name": "image search", "description": "search image"},
+    {"name": "Audio Processing", "description": "Analyzing and interpreting audio input including speech, music, and environmental sounds"},
+    {"name": "Speech Recognition", "description": "Converting spoken language into written text"}
+]
+"#;
+        let llm_routes = serde_json::from_str::<Vec<LlmRoute>>(routes_str).unwrap();
+
+        let router = RouterModelV1::new(llm_routes, "test-model".to_string(), 2000);
 
         // Case 1: Valid JSON with non-empty route
         let input = r#"{"route": "route1"}"#;
diff --git a/crates/common/src/api/hallucination.rs b/crates/common/src/api/hallucination.rs
index e90ea165..41ccf3d7 100644
--- a/crates/common/src/api/hallucination.rs
+++ b/crates/common/src/api/hallucination.rs
@@ -6,7 +6,6 @@ use crate::{
 };
 use serde::{Deserialize, Serialize};
 
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct HallucinationClassificationRequest {
     pub prompt: String,
diff --git a/crates/common/src/api/open_ai.rs b/crates/common/src/api/open_ai.rs
index d5d4ce2a..080923c1 100644
--- a/crates/common/src/api/open_ai.rs
+++ b/crates/common/src/api/open_ai.rs
@@ -162,6 +162,8 @@ pub struct StreamOptions {
 pub enum MultiPartContentType {
     #[serde(rename = "text")]
     Text,
+    #[serde(rename = "image_url")]
+    ImageUrl,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -188,6 +190,9 @@ impl Display for ContentType {
                     .filter_map(|part| {
                         if part.content_type == MultiPartContentType::Text {
                             part.text.clone()
+                        } else if part.content_type == MultiPartContentType::ImageUrl {
+                            // skip image URLs or their data in text representation
+                            None
                         } else {
                             panic!("Unsupported content type: {:?}", part.content_type);
                         }
@@ -217,6 +222,19 @@ pub struct Message {
     pub tool_call_id: Option<String>,
 }
 
+impl Message {
+    /// Builds a plain-text message for `role`; `model`, `tool_calls` and `tool_call_id` are `None`.
+    pub fn new(role: String, content: String) -> Self {
+        Self {
+            role,
+            content: Some(ContentType::Text(content)),
+            model: None,
+            tool_calls: None,
+            tool_call_id: None,
+        }
+    }
+}
+
 impl Default for Message {
     fn default() -> Self {
         Message {
diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs
index 87293583..5438b03e 100644
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@@ -145,6 +145,8 @@ pub struct EmbeddingProviver {
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
 pub enum LlmProviderType {
+    #[serde(rename = "arch")]
+    Arch,
     #[serde(rename = "claude")]
     Claude,
     #[serde(rename = "deepseek")]
@@ -160,6 +162,7 @@ pub enum LlmProviderType {
 impl Display for LlmProviderType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
+            LlmProviderType::Arch => write!(f, "arch"),
             LlmProviderType::Claude => write!(f, "claude"),
             LlmProviderType::Deepseek => write!(f, "deepseek"),
             LlmProviderType::Groq => write!(f, "groq"),
@@ -169,6 +172,25 @@ impl Display for LlmProviderType {
     }
 }
 
+/// Name/description pair describing one LLM route.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LlmRoute {
+    pub name: String,
+    pub description: String,
+}
+
+impl From<&LlmProvider> for LlmRoute {
+    /// Copies the provider's name and uses its `usage` text as the description,
+    /// falling back to a fixed placeholder when `usage` is `None`.
+    fn from(provider: &LlmProvider) -> Self {
+        let usage = provider.usage.clone();
+        Self {
+            name: provider.name.to_string(),
+            description: usage.unwrap_or_else(|| "No description available".to_string()),
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 //TODO: use enum for model, but if there is a new model, we need to update the code
 pub struct LlmProvider {
diff --git a/crates/common/src/tokenizer.rs b/crates/common/src/tokenizer.rs
index 11ce7295..46e39887 100644
--- a/crates/common/src/tokenizer.rs
+++ b/crates/common/src/tokenizer.rs
@@ -12,10 +12,9 @@ pub fn token_count(model_name: &str, text: &str) -> Result<usize, String> {
                 "tiktoken_rs: unsupported model: {}, using gpt-4 to compute token count",
                 model_name
             );
-
             "gpt-4"
         }
-        true => model_name,
+        true => model_name
     };
 
     // Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
diff --git a/crates/prompt_gateway/src/http_context.rs b/crates/prompt_gateway/src/http_context.rs
index bb673208..cd251064 100644
--- a/crates/prompt_gateway/src/http_context.rs
+++ b/crates/prompt_gateway/src/http_context.rs
@@ -237,9 +237,7 @@ impl HttpContext for StreamContext {
             Duration::from_secs(5),
         );
 
-        if let Some(content) =
-            self.user_prompt.as_ref().unwrap().content.as_ref()
-        {
+        if let Some(content) = self.user_prompt.as_ref().unwrap().content.as_ref() {
             let call_context = StreamCallContext {
                 response_handler_type: ResponseHandlerType::ArchFC,
                 user_message: Some(content.to_string()),
@@ -262,7 +260,6 @@ impl HttpContext for StreamContext {
             );
         }
         Action::Pause
-
     }
 
     fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
diff --git a/crates/prompt_gateway/tests/integration.rs b/crates/prompt_gateway/tests/integration.rs
index 563c9393..e749a007 100644
--- a/crates/prompt_gateway/tests/integration.rs
+++ b/crates/prompt_gateway/tests/integration.rs
@@ -1,5 +1,6 @@
 use common::api::open_ai::{
-    ChatCompletionsResponse, Choice, ContentType, FunctionCallDetail, Message, ToolCall, ToolType, Usage
+    ChatCompletionsResponse, Choice, ContentType, FunctionCallDetail, Message, ToolCall, ToolType,
+    Usage,
 };
 use common::configuration::Configuration;
 use http::StatusCode;
diff --git a/demos/samples_java/weather_forcecast_service/arch_config.yaml b/demos/samples_java/weather_forcecast_service/arch_config.yaml
index 8228d0f4..d11aaa96 100644
--- a/demos/samples_java/weather_forcecast_service/arch_config.yaml
+++ b/demos/samples_java/weather_forcecast_service/arch_config.yaml
@@ -1,4 +1,5 @@
-version: v0.1
+version: v0.1.0
+
 listeners:
   ingress_traffic:
     address: 0.0.0.0
diff --git a/demos/samples_python/currency_exchange/arch_config.yaml b/demos/samples_python/currency_exchange/arch_config.yaml
index 03e5a01d..a9c0bce0 100644
--- a/demos/samples_python/currency_exchange/arch_config.yaml
+++ b/demos/samples_python/currency_exchange/arch_config.yaml
@@ -1,4 +1,4 @@
-version: v0.1
+version: v0.1.0
 
 listeners:
   ingress_traffic:
diff --git a/demos/samples_python/human_resources_agent/arch_config.yaml b/demos/samples_python/human_resources_agent/arch_config.yaml
index 2ac97027..f46a6c7a 100644
--- a/demos/samples_python/human_resources_agent/arch_config.yaml
+++ b/demos/samples_python/human_resources_agent/arch_config.yaml
@@ -1,4 +1,5 @@
-version: v0.1
+version: v0.1.0
+
 listeners:
   ingress_traffic:
     address: 0.0.0.0
diff --git a/demos/samples_python/multi_turn_rag_agent/arch_config.yaml b/demos/samples_python/multi_turn_rag_agent/arch_config.yaml
index 8e8feb4f..706a374b 100644
--- a/demos/samples_python/multi_turn_rag_agent/arch_config.yaml
+++ b/demos/samples_python/multi_turn_rag_agent/arch_config.yaml
@@ -1,4 +1,4 @@
-version: v0.1
+version: v0.1.0
 
 listeners:
   ingress_traffic:
diff --git a/demos/samples_python/network_switch_operator_agent/arch_config.yaml b/demos/samples_python/network_switch_operator_agent/arch_config.yaml
index fa830d0b..0175e1c0 100644
--- a/demos/samples_python/network_switch_operator_agent/arch_config.yaml
+++ b/demos/samples_python/network_switch_operator_agent/arch_config.yaml
@@ -1,4 +1,4 @@
-version: v0.1
+version: v0.1.0
 listeners:
   ingress_traffic:
     address: 0.0.0.0
diff --git a/demos/samples_python/stock_quote/arch_config.yaml b/demos/samples_python/stock_quote/arch_config.yaml
index 96901620..6b4a427f 100644
--- a/demos/samples_python/stock_quote/arch_config.yaml
+++ b/demos/samples_python/stock_quote/arch_config.yaml
@@ -1,4 +1,4 @@
-version: v0.1
+version: v0.1.0
 
 listeners:
   ingress_traffic:
diff --git a/demos/samples_python/weather_forecast/arch_config.yaml b/demos/samples_python/weather_forecast/arch_config.yaml
index 8585bc6f..a51bf6a1 100644
--- a/demos/samples_python/weather_forecast/arch_config.yaml
+++ b/demos/samples_python/weather_forecast/arch_config.yaml
@@ -1,4 +1,4 @@
-version: "0.1-beta"
+version: v0.1.0
 
 listeners:
   ingress_traffic:
diff --git a/demos/shared/test_runner/run_demo_tests.sh b/demos/shared/test_runner/run_demo_tests.sh
index bcf4c608..2f44d3e0 100644
--- a/demos/shared/test_runner/run_demo_tests.sh
+++ b/demos/shared/test_runner/run_demo_tests.sh
@@ -1,11 +1,20 @@
 #!/bin/bash
 set -eu
 
-echo "docker images"
-docker images
+# load demo name from arguments
+if [ $# -eq 0 ]; then
+  echo "No demo names provided. Please provide demo names as arguments."
+  # print usage
+  echo "Usage: $0 <demo_name1> <demo_name2> ..."
+  exit 1
+fi
 
-# for demo in currency_exchange hr_agent
-for demo in samples_python/currency_exchange use_cases/preference_based_routing
+# extract demo names from arguments
+DEMOS="$@"
+
+echo "Running tests for demos: $DEMOS"
+
+for demo in $DEMOS
 do
   echo "******************************************"
   echo "Running tests for $demo ..."
@@ -14,13 +23,18 @@ do
   echo "starting archgw"
   archgw up arch_config.yaml
   echo "starting docker containers"
-  docker compose up -d 2>&1 > /dev/null
+  # only run docker compose for use_cases/preference_based_routing; POSIX "=" (not "==") keeps this test valid when CI runs the script via sh
+  if [ "$demo" = "use_cases/preference_based_routing" ]; then
+    echo "starting docker compose for $demo"
+    docker compose -f docker-compose.yaml up -d 2>&1 > /dev/null
+  else
+    echo "skipping docker compose for $demo"
+  fi
   echo "starting hurl tests"
-  hurl --test hurl_tests/*.hurl
-  if [ $? -ne 0 ]; then
+  if ! hurl hurl_tests/*.hurl; then
     echo "Hurl tests failed for $demo"
     echo "docker logs for archgw:"
-    docker logs archgw
+    docker logs archgw | tail -n 100
     exit 1
   fi
   echo "stopping docker containers and archgw"
diff --git a/demos/use_cases/llm_routing/arch_config.yaml b/demos/use_cases/llm_routing/arch_config.yaml
index 896fb795..0d38335e 100644
--- a/demos/use_cases/llm_routing/arch_config.yaml
+++ b/demos/use_cases/llm_routing/arch_config.yaml
@@ -1,4 +1,4 @@
-version: "0.1-beta"
+version: v0.1.0
 
 listeners:
   egress_traffic:
diff --git a/demos/use_cases/ollama/arch_config.yaml b/demos/use_cases/ollama/arch_config.yaml
index c933841d..24eb1bf1 100644
--- a/demos/use_cases/ollama/arch_config.yaml
+++ b/demos/use_cases/ollama/arch_config.yaml
@@ -1,4 +1,4 @@
-version: v0.1
+version: v0.1.0
 
 listeners:
   egress_traffic:
diff --git a/demos/use_cases/orchestrating_agents/arch_config.yaml b/demos/use_cases/orchestrating_agents/arch_config.yaml
index 7cffa101..639c5242 100644
--- a/demos/use_cases/orchestrating_agents/arch_config.yaml
+++ b/demos/use_cases/orchestrating_agents/arch_config.yaml
@@ -1,4 +1,4 @@
-version: "0.1-beta"
+version: v0.1.0
 
 listeners:
   ingress_traffic:
diff --git a/demos/use_cases/preference_based_routing/README.md b/demos/use_cases/preference_based_routing/README.md
index 4f703afe..8883d2b3 100644
--- a/demos/use_cases/preference_based_routing/README.md
+++ b/demos/use_cases/preference_based_routing/README.md
@@ -1,2 +1,53 @@
 # Usage based LLM Routing
 This demo shows how you can use user preferences to route user prompts to appropriate llm. See [arch_config.yaml](arch_config.yaml) for details on how you can define user preferences.
+
+## How to start the demo
+
+Make sure your machine is up to date with the [latest version of archgw](https://github.com/katanemo/archgw/tree/main?tab=readme-ov-file#prerequisites), and that you have activated the virtual environment.
+
+
+1. start the openwebui
+```bash
+(venv) $ cd demos/use_cases/preference_based_routing
+(venv) $ docker compose up -d
+```
+2. start archgw in the foreground
+```bash
+(venv) $ archgw up --service archgw --foreground
+2025-05-30 18:00:09,953 - cli.main - INFO - Starting archgw cli version: 0.3.1
+2025-05-30 18:00:09,953 - cli.main - INFO - Validating /Users/adilhafeez/src/intelligent-prompt-gateway/demos/use_cases/preference_based_routing/arch_config.yaml
+2025-05-30 18:00:10,422 - cli.core - INFO - Starting arch gateway, image name: archgw, tag: katanemo/archgw:0.3.1
+2025-05-30 18:00:10,662 - cli.core - INFO - archgw status: running, health status: starting
+2025-05-30 18:00:11,712 - cli.core - INFO - archgw status: running, health status: starting
+2025-05-30 18:00:12,761 - cli.core - INFO - archgw is running and is healthy!
+...
+```
+
+3. open openwebui http://localhost:8080/
+
+# Testing out preference based routing
+
+We have defined two routes: (1) code generation and (2) code understanding.
+
+For a code generation query, the LLM that is better suited for code generation will handle the request.
+
+
+If you look at the logs you'd see that code generation llm was selected,
+
+```
+...
+2025-05-31T01:02:19.382716Z  INFO brightstaff::router::llm_router: router response: {'route': 'code_generation'}, response time: 203ms
+...
+```
+
+<img width="1036" alt="image" src="https://github.com/user-attachments/assets/f923944b-ddbe-462e-9fd5-c75504adc8cf" />
+
+Now if you ask a query related to code understanding, you'd see that the LLM better suited for code understanding is selected,
+
+```
+...
+2025-05-31T01:06:33.555680Z  INFO brightstaff::router::llm_router: router response: {'route': 'code_understanding'}, response time: 327ms
+...
+```
+
+<img width="1081" alt="image" src="https://github.com/user-attachments/assets/e50d167c-46a0-4e3a-ba77-e84db1bd376d" />
diff --git a/demos/use_cases/preference_based_routing/arch_config.yaml b/demos/use_cases/preference_based_routing/arch_config.yaml
index 682527ca..41026481 100644
--- a/demos/use_cases/preference_based_routing/arch_config.yaml
+++ b/demos/use_cases/preference_based_routing/arch_config.yaml
@@ -1,7 +1,7 @@
-version: "0.1-beta"
+version: v0.1.0
 
 routing:
-  model: gpt-4o
+  model: arch-router
 
 listeners:
   egress_traffic:
@@ -12,10 +12,15 @@ listeners:
 
 llm_providers:
 
-  - name: archgw-v1-router-model
+  - name: arch-router
+    access_key: $ARCH_API_KEY
+    provider_interface: arch
+    model: Arch-Router
+
+  - name: gpt-4o-mini
     provider_interface: openai
-    model: cotran2/llama-4-epoch
-    base_url: http://34.46.85.85:8000/v1
+    access_key: $OPENAI_API_KEY
+    model: gpt-4o-mini
 
   - name: gpt-4o
     provider_interface: openai
diff --git a/demos/use_cases/preference_based_routing/docker-compose.yaml b/demos/use_cases/preference_based_routing/docker-compose.yaml
index c2d794c6..c0f30d5e 100644
--- a/demos/use_cases/preference_based_routing/docker-compose.yaml
+++ b/demos/use_cases/preference_based_routing/docker-compose.yaml
@@ -1,17 +1,17 @@
 services:
 
-  chatbot_ui:
-    build:
-      context: ../../shared/chatbot_ui
-      dockerfile: Dockerfile
+  open-web-ui:
+    image: ghcr.io/open-webui/open-webui:main
+    restart: always
     ports:
-      - "18080:8080"
+      - "8080:8080"
     environment:
-      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:12000/v1
+      - DEFAULT_MODELS=gpt-4o-mini
+      - ENABLE_OPENAI_API=true
+      - OPENAI_API_BASE_URL=http://host.docker.internal:12000/v1
+    # host.docker.internal is not resolvable on Linux Docker Engine without this mapping
     extra_hosts:
       - "host.docker.internal:host-gateway"
-    volumes:
-      - ./arch_config.yaml:/app/arch_config.yaml
 
   jaeger:
     build:
diff --git a/demos/use_cases/preference_based_routing/test_router_endpoint.rest b/demos/use_cases/preference_based_routing/test_router_endpoint.rest
index 9fc6f6fe..890206cb 100644
--- a/demos/use_cases/preference_based_routing/test_router_endpoint.rest
+++ b/demos/use_cases/preference_based_routing/test_router_endpoint.rest
@@ -1,10 +1,10 @@
 @arch_llm_router_endpoint = http://35.192.87.187:8000
 
-POST {{arch_llm_router_endpoint}}/v1/chat/completions HTTP/1.1
+POST https://archfc.katanemo.dev/v1/chat/completions HTTP/1.1
 Content-Type: application/json
 
 {
-  "model": "cotran2/llama-1b-4-26",
+  "model": "cotran2/qwen-4-epoch-2600",
   "messages": [
     {
       "role": "user",
@@ -21,4 +21,5 @@ Content-Type: application/json
 {"model":"cotran2/llama-1b-4-26","messages":[{"role":"user","content":"\nYou are an advanced Routing Assistant designed to select the optimal route based on user requests. \nYour task is to analyze conversations and match them to the most appropriate predefined route.\nReview the available routes config:\n\n# ROUTES CONFIG START\n- name: gpt-4o\n  description: simple requests, basic fact retrieval, easy to answer\n- name: o4-mini()\n  description: complex reasoning problem, require multi step answer\n# ROUTES CONFIG END\n\nExamine the following conversation between a user and an assistant:\n\n# CONVERSATION START\n[{\"role\":\"user\",\"content\":\"What is the capital of France?\"}]\n# CONVERSATION END\n\nYour goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:\n\n1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.\n2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).\n3. Find the route that best matches.\n4. Use context clues from the entire conversation to determine the best fit.\n5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.\n6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. \n\n# OUTPUT FORMAT\nYour final output must follow this JSON format:\n{\n  \"route\": \"route_name\" # The matched route name, or empty string '' if no match\n}\n\nBased on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.\n"}],"stream":false}
 
 ### get model list
-GET http://34.46.85.85:8000/v1/models HTTP/1.1
+# GET http://34.46.85.85:8000/v1/models HTTP/1.1
+GET https://archfc.katanemo.dev/arch-router/v1/models HTTP/1.1
diff --git a/demos/use_cases/spotify_bearer_auth/arch_config.yaml b/demos/use_cases/spotify_bearer_auth/arch_config.yaml
index 1d82a426..99a67401 100644
--- a/demos/use_cases/spotify_bearer_auth/arch_config.yaml
+++ b/demos/use_cases/spotify_bearer_auth/arch_config.yaml
@@ -1,4 +1,5 @@
-version: v0.1
+version: v0.1.0
+
 listeners:
   ingress_traffic:
     address: 0.0.0.0
diff --git a/docs/source/concepts/includes/arch_config.yaml b/docs/source/concepts/includes/arch_config.yaml
index a7d0a289..4523ae32 100644
--- a/docs/source/concepts/includes/arch_config.yaml
+++ b/docs/source/concepts/includes/arch_config.yaml
@@ -1,4 +1,4 @@
-version: v0.1
+version: v0.1.0
 
 listeners:
   ingress_traffic:
diff --git a/docs/source/conf.py b/docs/source/conf.py
index ac4398a4..3a351b35 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -15,7 +15,7 @@ from sphinxawesome_theme.postprocess import Icons
 project = "Arch Docs"
 copyright = "2025, Katanemo Labs, Inc"
 author = "Katanemo Labs, Inc"
-release = " v0.3.0"
+release = " v0.3.1"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst
index bfa559ed..cdedbaca 100644
--- a/docs/source/get_started/quickstart.rst
+++ b/docs/source/get_started/quickstart.rst
@@ -25,7 +25,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.
 
    $ python -m venv venv
    $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-   $ pip install archgw==0.3.0
+   $ pip install archgw==0.3.1
 
 
 Build AI Agent with Arch Gateway
@@ -40,7 +40,7 @@ Create ``arch_config.yaml`` file with the following content:
 
 .. code-block:: yaml
 
-   version: v0.1
+   version: v0.1.0
 
   listeners:
     ingress_traffic:
@@ -143,7 +143,7 @@ Create ``arch_config.yaml`` file with the following content:
 
 .. code-block:: yaml
 
-   version: v0.1
+   version: v0.1.0
 
   listeners:
     egress_traffic:
diff --git a/docs/source/guides/includes/arch_config.yaml b/docs/source/guides/includes/arch_config.yaml
index e86c6072..4ee46cbb 100644
--- a/docs/source/guides/includes/arch_config.yaml
+++ b/docs/source/guides/includes/arch_config.yaml
@@ -1,4 +1,4 @@
-version: v0.1
+version: v0.1.0
 
 listeners:
   ingress_traffic:
diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst
new file mode 100644
index 00000000..5e41672c
--- /dev/null
+++ b/docs/source/guides/llm_router.rst
@@ -0,0 +1,160 @@
+.. _llm_router:
+
+LLM Routing
+==============================================================
+
+LLM Router is an intelligent routing system that automatically selects the most appropriate large language model (LLM) for each user request based on the intent, domain, and complexity of the prompt. This enables optimal performance, cost efficiency, and response quality by matching requests with the most suitable model from your available LLM fleet.
+
+
+Routing Workflow
+-------------------------
+
+#. **Prompt Analysis**
+
+    When a user submits a prompt, the Router analyzes it to determine the domain (subject matter) or action (type of operation requested).
+
+#. **Model Selection**
+
+    Based on the analyzed intent and your configured routing preferences, the Router selects the most appropriate model from your available LLM fleet.
+
+#. **Request Forwarding**
+
+    Once the optimal model is identified, our gateway forwards the original prompt to the selected LLM endpoint. The routing decision is transparent and can be logged for monitoring and optimization purposes.
+
+#. **Response Handling**
+
+    After the selected model processes the request, the response is returned through the gateway. The gateway can optionally add routing metadata or performance metrics to help you understand and optimize your routing decisions.
+
+Arch-Router
+-------------------------
+The `Arch-Router <https://huggingface.co/katanemo/Arch-Router-1.5B>`_ is a state-of-the-art **preference-based routing model** specifically designed for intelligent LLM selection. This model delivers production-ready performance with low latency and high accuracy.
+
+To support effective routing, Arch-Router introduces two key concepts:
+
+- **Domain** – the high-level thematic category or subject matter of a request (e.g., legal, healthcare, programming).
+
+- **Action** – the specific type of operation the user wants performed (e.g., summarization, code generation, booking appointment, translation).
+
+Both domain and action configs are associated with preferred models or model variants. At inference time, Arch-Router analyzes the incoming prompt to infer its domain and action using semantic similarity, task indicators, and contextual cues. It then applies the user-defined routing preferences to select the model best suited to handle the request.
+
+In summary, Arch-Router demonstrates:
+
+- **Structured Preference Routing**: Aligns prompt request with model strengths using explicit domain–action mappings.
+
+- **Transparent and Controllable**: Makes routing decisions transparent and configurable, empowering users to customize system behavior.
+
+- **Flexible and Adaptive**: Supports evolving user needs, model updates, and new domains/actions without retraining the router.
+
+- **Production-Ready Performance**: Optimized for low-latency, high-throughput applications in multi-model environments.
+
+
+Implementing LLM Routing
+-----------------------------
+
+To configure LLM routing in our gateway, you need to define a prompt target configuration that specifies the routing model and the LLM providers. This configuration will allow Arch Gateway to route incoming prompts to the appropriate model based on the defined routes.
+
+Below is an example to show how to set up a prompt target for the Arch Router:
+
+- **Step 1: Define the routing model in the `routing` section**. You can use the ``archgw-v1-router-model`` as the katanemo routing model or any other routing model you prefer.
+
+- **Step 2: Define the listeners in the `listeners` section**. This is where you specify the address and port for incoming traffic, as well as the message format (e.g., OpenAI).
+
+- **Step 3: Define the LLM providers in the `llm_providers` section**. This is where you specify the routing model, and any other models you want to use for specific tasks and their route usage descriptions (e.g., code generation, code understanding).
+
+.. Note::
+  Make sure you define a model for default usage, such as ``gpt-4o``, which will be used when no specific route is matched for a user prompt.
+
+
+.. code-block:: yaml
+    :caption: Route Config Example
+
+
+    routing:
+      model: archgw-v1-router-model  # name of the routing provider defined under llm_providers
+
+    listeners:
+      egress_traffic:
+        address: 0.0.0.0
+        port: 12000
+        message_format: openai
+        timeout: 30s
+
+    llm_providers:
+      - name: archgw-v1-router-model
+        provider_interface: openai
+        model: katanemo/Arch-Router-1.5B
+        base_url: ...
+
+      - name: gpt-4o-mini
+        provider_interface: openai
+        access_key: $OPENAI_API_KEY
+        model: gpt-4o-mini
+        default: true  # used when no specific route matches the user prompt
+
+      - name: code_generation
+        provider_interface: openai
+        access_key: $OPENAI_API_KEY
+        model: gpt-4o
+        usage: Generating new code snippets, functions, or boilerplate based on user prompts or requirements
+
+      - name: code_understanding
+        provider_interface: openai
+        access_key: $OPENAI_API_KEY
+        model: gpt-4.1
+        usage: understand and explain existing code snippets, functions, or libraries
+
+
+Example Use Cases
+-------------------------
+Here are common scenarios where Arch-Router excels:
+
+- **Coding Tasks**: Distinguish between code generation requests ("write a Python function"), debugging needs ("fix this error"), and code optimization ("make this faster"), routing each to appropriately specialized models.
+
+- **Content Processing Workflows**: Classify requests as summarization ("summarize this document"), translation ("translate to Spanish"), or analysis ("what are the key themes"), enabling targeted model selection.
+
+- **Multi-Domain Applications**: Accurately identify whether requests fall into legal, healthcare, technical, or general domains, even when the subject matter isn't explicitly stated in the prompt.
+
+- **Conversational Routing**: Track conversation context to identify when topics shift between domains or when the type of assistance needed changes mid-conversation.
+
+
+Best practice
+-------------------------
+- **💡Consistent Naming:**  Route names should align with their descriptions.
+
+  - ❌ Bad::
+
+      {"name": "math", "description": "handle solving quadratic equations"}
+
+  - ✅ Good::
+
+      {"name": "quadratic_equation", "description": "solving quadratic equations"}
+
+
+- **💡 Clear Usage Description:**  Make your route names and descriptions specific and unambiguous, and minimize overlap between routes. The Router performs better when it can clearly distinguish between different types of requests.
+
+  - ❌ Bad::
+
+      {"name": "math", "description": "anything closely related to mathematics"}
+
+  - ✅ Good::
+
+      {"name": "math", "description": "solving, explaining math problems, concepts"}
+
+
+- **💡Nouns Descriptor:** Preference-based routers perform better with noun-centric descriptors, as they offer more stable and semantically rich signals for matching.
+
+- **💡Domain Inclusion:** For the best user experience, you should always include a domain route. This helps the router fall back to the domain when the action is not matched.
+
+.. Unsupported Features
+.. -------------------------
+
+.. The following features are **not supported** by the Arch-Router model:
+
+.. - **❌ Multi-Modality:**
+..   The model is not trained to process raw image or audio inputs. While it can handle textual queries *about* these modalities (e.g., "generate an image of a cat"), it cannot interpret encoded multimedia data directly.
+
+.. - **❌ Function Calling:**
+..   This model is designed for **semantic preference matching**, not exact intent classification or tool execution. For structured function invocation, use models in the **Arch-Function-Calling** collection.
+
+.. - **❌ System Prompt Dependency:**
+..   Arch-Router routes based solely on the user’s conversation history. It does not use or rely on system prompts for routing decisions.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index aefc63bf..0f7f4c84 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -52,6 +52,7 @@ Welcome to Arch!
       guides/prompt_guard
       guides/agent_routing
       guides/function_calling
+      guides/llm_router
       guides/observability/observability
 
   .. tab-item:: Build with Arch
diff --git a/model_server/poetry.lock b/model_server/poetry.lock
index a54bc30c..792db439 100644
--- a/model_server/poetry.lock
+++ b/model_server/poetry.lock
@@ -324,13 +324,13 @@ typing = ["typing-extensions (>=4.12.2)"]
 
 [[package]]
 name = "fsspec"
-version = "2025.5.0"
+version = "2025.5.1"
 description = "File-system specification"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "fsspec-2025.5.0-py3-none-any.whl", hash = "sha256:0ca253eca6b5333d8a2b8bd98c7326fe821f1f0fdbd34e1b445bddde8e804c95"},
-    {file = "fsspec-2025.5.0.tar.gz", hash = "sha256:e4f4623bb6221f7407fd695cc535d1f857a077eb247580f4ada34f5dc25fd5c8"},
+    {file = "fsspec-2025.5.1-py3-none-any.whl", hash = "sha256:24d3a2e663d5fc735ab256263c4075f374a174c3410c0b25e5bd1970bceaa462"},
+    {file = "fsspec-2025.5.1.tar.gz", hash = "sha256:2e55e47a540b91843b755e83ded97c6e897fa0942b11490113f09e9c443c2475"},
 ]
 
 [package.extras]
@@ -520,13 +520,13 @@ zstd = ["zstandard (>=0.18.0)"]
 
 [[package]]
 name = "huggingface-hub"
-version = "0.32.0"
+version = "0.32.3"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.32.0-py3-none-any.whl", hash = "sha256:e56e94109649ce6ebdb59b4e393ee3543ec0eca2eab4f41b269e1d885c88d08c"},
-    {file = "huggingface_hub-0.32.0.tar.gz", hash = "sha256:dd66c9365ea43049ec9b939bdcdb21a0051e1bd70026fc50304e4fb1bb6a15ba"},
+    {file = "huggingface_hub-0.32.3-py3-none-any.whl", hash = "sha256:e46f7ea7fe2b5e5f67cc4e37eb201140091946a314d7c2b134a9673dadd80b6a"},
+    {file = "huggingface_hub-0.32.3.tar.gz", hash = "sha256:752c889ebf3a63cbd39803f6d87ccc135a463bbcb36abfa2faff0ccbf1cec087"},
 ]
 
 [package.dependencies]
@@ -1023,13 +1023,13 @@ files = [
 
 [[package]]
 name = "openai"
-version = "1.82.0"
+version = "1.82.1"
 description = "The official Python library for the openai API"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "openai-1.82.0-py3-none-any.whl", hash = "sha256:8c40647fea1816516cb3de5189775b30b5f4812777e40b8768f361f232b61b30"},
-    {file = "openai-1.82.0.tar.gz", hash = "sha256:b0a009b9a58662d598d07e91e4219ab4b1e3d8ba2db3f173896a92b9b874d1a7"},
+    {file = "openai-1.82.1-py3-none-any.whl", hash = "sha256:334eb5006edf59aa464c9e932b9d137468d810b2659e5daea9b3a8c39d052395"},
+    {file = "openai-1.82.1.tar.gz", hash = "sha256:ffc529680018e0417acac85f926f92aa0bbcbc26e82e2621087303c66bc7f95d"},
 ]
 
 [package.dependencies]
@@ -1285,22 +1285,22 @@ testing = ["coverage", "pytest", "pytest-benchmark"]
 
 [[package]]
 name = "protobuf"
-version = "5.29.4"
+version = "5.29.5"
 description = ""
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"},
-    {file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"},
-    {file = "protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:307ecba1d852ec237e9ba668e087326a67564ef83e45a0189a772ede9e854dd0"},
-    {file = "protobuf-5.29.4-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:aec4962f9ea93c431d5714ed1be1c93f13e1a8618e70035ba2b0564d9e633f2e"},
-    {file = "protobuf-5.29.4-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:d7d3f7d1d5a66ed4942d4fefb12ac4b14a29028b209d4bfb25c68ae172059922"},
-    {file = "protobuf-5.29.4-cp38-cp38-win32.whl", hash = "sha256:1832f0515b62d12d8e6ffc078d7e9eb06969aa6dc13c13e1036e39d73bebc2de"},
-    {file = "protobuf-5.29.4-cp38-cp38-win_amd64.whl", hash = "sha256:476cb7b14914c780605a8cf62e38c2a85f8caff2e28a6a0bad827ec7d6c85d68"},
-    {file = "protobuf-5.29.4-cp39-cp39-win32.whl", hash = "sha256:fd32223020cb25a2cc100366f1dedc904e2d71d9322403224cdde5fdced0dabe"},
-    {file = "protobuf-5.29.4-cp39-cp39-win_amd64.whl", hash = "sha256:678974e1e3a9b975b8bc2447fca458db5f93a2fb6b0c8db46b6675b5b5346812"},
-    {file = "protobuf-5.29.4-py3-none-any.whl", hash = "sha256:3fde11b505e1597f71b875ef2fc52062b6a9740e5f7c8997ce878b6009145862"},
-    {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"},
+    {file = "protobuf-5.29.5-cp310-abi3-win32.whl", hash = "sha256:3f1c6468a2cfd102ff4703976138844f78ebd1fb45f49011afc5139e9e283079"},
+    {file = "protobuf-5.29.5-cp310-abi3-win_amd64.whl", hash = "sha256:3f76e3a3675b4a4d867b52e4a5f5b78a2ef9565549d4037e06cf7b0942b1d3fc"},
+    {file = "protobuf-5.29.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e38c5add5a311f2a6eb0340716ef9b039c1dfa428b28f25a7838ac329204a671"},
+    {file = "protobuf-5.29.5-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:fa18533a299d7ab6c55a238bf8629311439995f2e7eca5caaff08663606e9015"},
+    {file = "protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:63848923da3325e1bf7e9003d680ce6e14b07e55d0473253a690c3a8b8fd6e61"},
+    {file = "protobuf-5.29.5-cp38-cp38-win32.whl", hash = "sha256:ef91363ad4faba7b25d844ef1ada59ff1604184c0bcd8b39b8a6bef15e1af238"},
+    {file = "protobuf-5.29.5-cp38-cp38-win_amd64.whl", hash = "sha256:7318608d56b6402d2ea7704ff1e1e4597bee46d760e7e4dd42a3d45e24b87f2e"},
+    {file = "protobuf-5.29.5-cp39-cp39-win32.whl", hash = "sha256:6f642dc9a61782fa72b90878af134c5afe1917c89a568cd3476d758d3c3a0736"},
+    {file = "protobuf-5.29.5-cp39-cp39-win_amd64.whl", hash = "sha256:470f3af547ef17847a28e1f47200a1cbf0ba3ff57b7de50d22776607cd2ea353"},
+    {file = "protobuf-5.29.5-py3-none-any.whl", hash = "sha256:6cf42630262c59b2d8de33954443d94b746c952b01434fc58a417fdbd2e84bd5"},
+    {file = "protobuf-5.29.5.tar.gz", hash = "sha256:bc1463bafd4b0929216c35f437a8e28731a2b7fe3d98bb77a600efced5a15c84"},
 ]
 
 [[package]]
@@ -1482,13 +1482,13 @@ dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments
 
 [[package]]
 name = "pytest-asyncio"
-version = "0.26.0"
+version = "1.0.0"
 description = "Pytest support for asyncio"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "pytest_asyncio-0.26.0-py3-none-any.whl", hash = "sha256:7b51ed894f4fbea1340262bdae5135797ebbe21d8638978e35d31c6d19f72fb0"},
-    {file = "pytest_asyncio-0.26.0.tar.gz", hash = "sha256:c4df2a697648241ff39e7f0e4a73050b03f123f760673956cf0d72a4990e312f"},
+    {file = "pytest_asyncio-1.0.0-py3-none-any.whl", hash = "sha256:4f024da9f1ef945e680dc68610b52550e36590a67fd31bb3b4943979a1f90ef3"},
+    {file = "pytest_asyncio-1.0.0.tar.gz", hash = "sha256:d15463d13f4456e1ead2594520216b225a16f781e144f8fdf6c5bb4667c48b3f"},
 ]
 
 [package.dependencies]
@@ -2003,13 +2003,13 @@ telegram = ["requests"]
 
 [[package]]
 name = "transformers"
-version = "4.52.3"
+version = "4.52.4"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 optional = false
 python-versions = ">=3.9.0"
 files = [
-    {file = "transformers-4.52.3-py3-none-any.whl", hash = "sha256:cd04059da50e7cf2a617ce3143ba8beffbf119f8c25a0717c3454fd9d0f19609"},
-    {file = "transformers-4.52.3.tar.gz", hash = "sha256:2e1de29374f27920aaf6d589d4e6339f33def2fb08809e1a1d792e040e9fbce7"},
+    {file = "transformers-4.52.4-py3-none-any.whl", hash = "sha256:203f5c19416d5877e36e88633943761719538a25d9775977a24fe77a1e5adfc7"},
+    {file = "transformers-4.52.4.tar.gz", hash = "sha256:aff3764441c1adc192a08dba49740d3cbbcb72d850586075aed6bd89b98203e6"},
 ]
 
 [package.dependencies]
@@ -2026,22 +2026,22 @@ tqdm = ">=4.27"
 
 [package.extras]
 accelerate = ["accelerate (>=0.26.0)"]
-all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.4.4,<0.5)", "librosa", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.4.4,<0.5)", "librosa", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision"]
 audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 benchmark = ["optimum-benchmark (>=0.3.0)"]
 codecarbon = ["codecarbon (>=2.8.1)"]
 deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
-deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "datasets (!=2.5.0)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.4.4,<0.5)", "libcst", "librosa", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"]
-dev-tensorflow = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "datasets (!=2.5.0)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
-dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "datasets (!=2.5.0)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "kernels (>=0.4.4,<0.5)", "libcst", "librosa", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.4.4,<0.5)", "libcst", "librosa", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
+dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "kernels (>=0.4.4,<0.5)", "libcst", "librosa", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
 flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
 flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 ftfy = ["ftfy"]
-hf-xet = ["hf_xet"]
+hf-xet = ["hf-xet"]
 hub-kernels = ["kernels (>=0.4.4,<0.5)"]
 integrations = ["kernels (>=0.4.4,<0.5)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"]
-ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"]
+ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"]
 modelcreation = ["cookiecutter (==1.7.3)"]
 natten = ["natten (>=0.14.6,<0.15.0)"]
 num2words = ["num2words"]
@@ -2058,7 +2058,7 @@ serving = ["fastapi", "pydantic", "starlette", "uvicorn"]
 sigopt = ["sigopt"]
 sklearn = ["scikit-learn"]
 speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
-testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
+testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
 tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
 tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"]
 tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
@@ -2068,7 +2068,7 @@ tokenizers = ["tokenizers (>=0.21,<0.22)"]
 torch = ["accelerate (>=0.26.0)", "torch (>=2.1,<2.7)"]
 torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
 torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
-torchhub = ["filelock", "huggingface-hub (>=0.30.0,<1.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "tqdm (>=4.27)"]
+torchhub = ["filelock", "huggingface-hub (>=0.30.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.1,<2.7)", "tqdm (>=4.27)"]
 video = ["av"]
 vision = ["Pillow (>=10.0.1,<=15.0)"]
 
@@ -2290,13 +2290,13 @@ files = [
 
 [[package]]
 name = "zipp"
-version = "3.21.0"
+version = "3.22.0"
 description = "Backport of pathlib-compatible object wrapper for zip files"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"},
-    {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"},
+    {file = "zipp-3.22.0-py3-none-any.whl", hash = "sha256:fe208f65f2aca48b81f9e6fd8cf7b8b32c26375266b009b413d45306b6148343"},
+    {file = "zipp-3.22.0.tar.gz", hash = "sha256:dd2f28c3ce4bc67507bfd3781d21b7bb2be31103b51a4553ad7d90b84e57ace5"},
 ]
 
 [package.extras]
@@ -2304,7 +2304,7 @@ check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
 cover = ["pytest-cov"]
 doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
 enabler = ["pytest-enabler (>=2.2)"]
-test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
+test = ["big-O", "importlib_resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
 type = ["pytest-mypy"]
 
 [metadata]
diff --git a/model_server/pyproject.toml b/model_server/pyproject.toml
index b2c044c3..43702b01 100644
--- a/model_server/pyproject.toml
+++ b/model_server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "archgw_modelserver"
-version = "0.3.0"
+version = "0.3.1"
 description = "A model server for serving models"
 authors = ["Katanemo Labs, Inc <info@katanemo.com>"]
 license = "Apache 2.0"
diff --git a/tests/archgw/arch_config.yaml b/tests/archgw/arch_config.yaml
index d1990330..c702887b 100644
--- a/tests/archgw/arch_config.yaml
+++ b/tests/archgw/arch_config.yaml
@@ -1,4 +1,4 @@
-version: "0.1-beta"
+version: v0.1.0
 
 listeners:
   ingress_traffic: