From 662a840ac59dc2c1f6bce6b4414992c635da7b69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Ulises=20Ni=C3=B1o=20Rivera?= Date: Mon, 28 Oct 2024 20:05:06 -0400 Subject: [PATCH] Add support for streaming and fixes few issues (see description) (#202) --- .github/workflows/checks.yml | 35 - .github/workflows/e2e_tests.yml | 32 + .github/workflows/model-server-tests.yml | 2 +- .github/workflows/rust_tests.yml | 33 + .gitignore | 1 + arch/Dockerfile | 6 +- arch/arch_config_schema.yaml | 1 - arch/build_filter_image.sh | 2 +- arch/docker-compose.dev.yaml | 16 +- arch/docker-compose.e2e.yaml | 17 + arch/docker-compose.yaml | 2 +- arch/envoy.template.yaml | 46 ++ arch/tools/cli/config_generator.py | 15 +- archgw.code-workspace | 4 + chatbot_ui/.vscode/launch.json | 16 +- chatbot_ui/app/arch_util.py | 20 + chatbot_ui/app/run.py | 107 ++- chatbot_ui/app/run_stream.py | 36 - crates/common/src/common_types.rs | 308 +++++++- crates/common/src/configuration.rs | 10 +- crates/common/src/consts.rs | 4 +- crates/common/src/errors.rs | 5 +- crates/common/src/tokenizer.rs | 14 +- crates/llm_gateway/src/stream_context.rs | 229 +++--- crates/llm_gateway/tests/integration.rs | 28 +- crates/prompt_gateway/src/filter_context.rs | 2 +- crates/prompt_gateway/src/hallucination.rs | 24 +- crates/prompt_gateway/src/http_context.rs | 222 +++--- crates/prompt_gateway/src/stream_context.rs | 194 +++-- crates/prompt_gateway/tests/integration.rs | 8 +- demos/function_calling/api_server/app/main.py | 6 +- demos/function_calling/arch_config.yaml | 25 +- demos/function_calling/docker-compose.yaml | 6 +- demos/llm_routing/arch_config.yaml | 32 + demos/llm_routing/docker-compose.yaml | 14 + e2e_tests/.vscode/settings.json | 7 + e2e_tests/README.md | 34 + e2e_tests/common.py | 42 ++ e2e_tests/common_scripts.sh | 33 + e2e_tests/poetry.lock | 702 ++++++++++++++++++ e2e_tests/pyproject.toml | 23 + e2e_tests/run_e2e_tests.sh | 80 ++ e2e_tests/test_llm_gateway.py | 36 + e2e_tests/test_prompt_gateway.py 
| 262 +++++++ model_server/app/cli.py | 2 +- 45 files changed, 2266 insertions(+), 477 deletions(-) delete mode 100644 .github/workflows/checks.yml create mode 100644 .github/workflows/e2e_tests.yml create mode 100644 .github/workflows/rust_tests.yml create mode 100644 arch/docker-compose.e2e.yaml create mode 100644 chatbot_ui/app/arch_util.py delete mode 100644 chatbot_ui/app/run_stream.py create mode 100644 demos/llm_routing/arch_config.yaml create mode 100644 demos/llm_routing/docker-compose.yaml create mode 100644 e2e_tests/.vscode/settings.json create mode 100644 e2e_tests/README.md create mode 100644 e2e_tests/common.py create mode 100644 e2e_tests/common_scripts.sh create mode 100644 e2e_tests/poetry.lock create mode 100644 e2e_tests/pyproject.toml create mode 100644 e2e_tests/run_e2e_tests.sh create mode 100644 e2e_tests/test_llm_gateway.py create mode 100644 e2e_tests/test_prompt_gateway.py diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml deleted file mode 100644 index ac33c76c..00000000 --- a/.github/workflows/checks.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Checks - -on: - pull_request: - push: - branches: [main] - -jobs: - test: - name: Test - runs-on: ubuntu-latest - steps: - - name: Setup | Checkout - uses: actions/checkout@v4 - - - name: Setup | Rust - run: rustup toolchain install stable --profile minimal - - - name: Setup | Install wasm toolchain - run: rustup target add wasm32-wasi - - - name: Run Tests on common crate - run: cd crates/common && cargo test - - - name: Build wasm module for prompt_gateway - run: cd crates/prompt_gateway && cargo build --release --target=wasm32-wasi - - - name: Run Tests on prompt_gateway crate - run: cd crates/prompt_gateway && cargo test - - - name: Build wasm module for llm_gateway - run: cd crates/llm_gateway && cargo build --release --target=wasm32-wasi - - - name: Run Tests on llm_gateway crate - run: cd crates/llm_gateway && cargo test diff --git a/.github/workflows/e2e_tests.yml 
b/.github/workflows/e2e_tests.yml new file mode 100644 index 00000000..2d53a6d6 --- /dev/null +++ b/.github/workflows/e2e_tests.yml @@ -0,0 +1,32 @@ +name: e2e tests + +on: + push: + branches: + - main # Run tests on pushes to the main branch + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + export PATH="$HOME/.local/bin:$PATH" + + - name: Run e2e tests + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} + run: | + cd e2e_tests && bash run_e2e_tests.sh diff --git a/.github/workflows/model-server-tests.yml b/.github/workflows/model-server-tests.yml index 1b33b5fc..64489d34 100644 --- a/.github/workflows/model-server-tests.yml +++ b/.github/workflows/model-server-tests.yml @@ -1,4 +1,4 @@ -name: Run Model Server tests +name: model server tests on: push: diff --git a/.github/workflows/rust_tests.yml b/.github/workflows/rust_tests.yml new file mode 100644 index 00000000..548e74e1 --- /dev/null +++ b/.github/workflows/rust_tests.yml @@ -0,0 +1,33 @@ +name: rust tests (prompt and llm gateway) + +on: + pull_request: + push: + branches: [main] + +jobs: + test: + name: Test + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./crates + + steps: + - name: Setup | Checkout + uses: actions/checkout@v4 + + - name: Setup | Rust + run: rustup toolchain install stable --profile minimal + + - name: Setup | Install wasm toolchain + run: rustup target add wasm32-wasi + + - name: Build wasm module + run: cargo build --release --target=wasm32-wasi + + - name: Run unit tests + run: cargo test --lib + + - name: Run integration tests + run: cargo test --test integration diff --git a/.gitignore b/.gitignore index 9ed22f96..1a25cc1c 100644 --- a/.gitignore +++ b/.gitignore 
@@ -31,3 +31,4 @@ arch_logs/ dist/ crates/*/target/ crates/target/ +build.log diff --git a/arch/Dockerfile b/arch/Dockerfile index 073c0b6b..85721f58 100644 --- a/arch/Dockerfile +++ b/arch/Dockerfile @@ -12,6 +12,9 @@ FROM envoyproxy/envoy:v1.31-latest as envoy #Build config generator, so that we have a single build image for both Rust and Python FROM python:3-slim as arch + +RUN apt-get update && apt-get install -y gettext-base && apt-get clean && rm -rf /var/lib/apt/lists/* + COPY --from=builder /arch/target/wasm32-wasi/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm COPY --from=builder /arch/target/wasm32-wasi/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy @@ -22,4 +25,5 @@ COPY arch/tools/cli/config_generator.py . COPY arch/envoy.template.yaml . COPY arch/arch_config_schema.yaml . -CMD ["sh", "-c", "python config_generator.py && envoy -c /etc/envoy/envoy.yaml --component-log-level wasm:debug"] + +ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug"] diff --git a/arch/arch_config_schema.yaml b/arch/arch_config_schema.yaml index 9b63840e..142fe338 100644 --- a/arch/arch_config_schema.yaml +++ b/arch/arch_config_schema.yaml @@ -160,4 +160,3 @@ required: - version - listener - llm_providers - - prompt_targets diff --git a/arch/build_filter_image.sh b/arch/build_filter_image.sh index a0b6f55b..75ac81ce 100644 --- a/arch/build_filter_image.sh +++ b/arch/build_filter_image.sh @@ -1 +1 @@ -docker build -t archgw .. -f Dockerfile +docker build -f Dockerfile .. 
-t katanemo/archgw diff --git a/arch/docker-compose.dev.yaml b/arch/docker-compose.dev.yaml index 36c364bb..fdf024c6 100644 --- a/arch/docker-compose.dev.yaml +++ b/arch/docker-compose.dev.yaml @@ -1,6 +1,6 @@ services: archgw: - image: archgw:latest + image: katanemo/archgw:latest ports: - "10000:10000" - "11000:11000" @@ -10,9 +10,13 @@ services: - ${ARCH_CONFIG_FILE:-../demos/function_calling/arch_config.yaml}:/config/arch_config.yaml - /etc/ssl/cert.pem:/etc/ssl/cert.pem - ./envoy.template.yaml:/config/envoy.template.yaml - - ./target/wasm32-wasi/release/intelligent_prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/intelligent_prompt_gateway.wasm - ./arch_config_schema.yaml:/config/arch_config_schema.yaml - - ./tools/config_generator.py:/config/config_generator.py - - ./arch_logs:/var/log/ - env_file: - - stage.env + - ./tools/cli/config_generator.py:/config/config_generator.py + - ../crates/target/wasm32-wasi/release/llm_gateway.wasm:/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm + - ../crates/target/wasm32-wasi/release/prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm + - ~/archgw_logs:/var/log/ + extra_hosts: + - "host.docker.internal:host-gateway" + environment: + - OPENAI_API_KEY=${OPENAI_API_KEY:?error} + - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} diff --git a/arch/docker-compose.e2e.yaml b/arch/docker-compose.e2e.yaml new file mode 100644 index 00000000..ebff459e --- /dev/null +++ b/arch/docker-compose.e2e.yaml @@ -0,0 +1,17 @@ +services: + archgw: + image: katanemo/archgw:latest + ports: + - "10000:10000" + - "11000:11000" + - "12000:12000" + - "19901:9901" + volumes: + - ${ARCH_CONFIG_FILE:-../demos/function_calling/arch_config.yaml}:/config/arch_config.yaml + - /etc/ssl/cert.pem:/etc/ssl/cert.pem + - ~/archgw_logs:/var/log/ + extra_hosts: + - "host.docker.internal:host-gateway" + environment: + - OPENAI_API_KEY=${OPENAI_API_KEY:?error} + - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} diff --git a/arch/docker-compose.yaml 
b/arch/docker-compose.yaml index 0a2e5a99..78345547 100644 --- a/arch/docker-compose.yaml +++ b/arch/docker-compose.yaml @@ -7,7 +7,7 @@ services: - "12000:12000" - "19901:9901" volumes: - - ${ARCH_CONFIG_FILE:-./demos/function_calling/arch_confg.yaml}:/config/arch_config.yaml + - ${ARCH_CONFIG_FILE:-../demos/function_calling/arch_config.yaml}:/config/arch_config.yaml - /etc/ssl/cert.pem:/etc/ssl/cert.pem - ~/archgw_logs:/var/log/ env_file: diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml index 14e26e84..a1ed4472 100644 --- a/arch/envoy.template.yaml +++ b/arch/envoy.template.yaml @@ -52,6 +52,15 @@ static_resources: cluster: arch_llm_listener timeout: 60s http_filters: + - name: envoy.filters.http.compressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.compressor.v3.Compressor + compressor_library: + name: compress + typed_config: + "@type": type.googleapis.com/envoy.extensions.compression.gzip.compressor.v3.Gzip + memory_level: 3 + window_bits: 10 - name: envoy.filters.http.wasm typed_config: "@type": type.googleapis.com/udpa.type.v1.TypedStruct @@ -69,6 +78,17 @@ static_resources: code: local: filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm" + - name: envoy.filters.http.decompressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor + decompressor_library: + name: decompress + typed_config: + "@type": "type.googleapis.com/envoy.extensions.compression.gzip.decompressor.v3.Gzip" + window_bits: 9 + chunk_size: 8192 + # If this ratio is set too low, then body data will not be decompressed completely. 
+ max_inflate_ratio: 1000 - name: envoy.filters.http.router typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router @@ -187,6 +207,12 @@ static_resources: domains: - "*" routes: + - match: + prefix: "/healthz" + route: + auto_host_rewrite: true + cluster: openai + timeout: 60s {% for provider in arch_llm_providers %} - match: prefix: "/" @@ -206,6 +232,15 @@ static_resources: body: inline_string: "x-arch-llm-provider header not set, llm gateway cannot perform routing\n" http_filters: + - name: envoy.filters.http.compressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.compressor.v3.Compressor + compressor_library: + name: compress + typed_config: + "@type": type.googleapis.com/envoy.extensions.compression.gzip.compressor.v3.Gzip + memory_level: 3 + window_bits: 10 - name: envoy.filters.http.wasm typed_config: "@type": type.googleapis.com/udpa.type.v1.TypedStruct @@ -223,6 +258,17 @@ static_resources: code: local: filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm" + - name: envoy.filters.http.decompressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor + decompressor_library: + name: decompress + typed_config: + "@type": "type.googleapis.com/envoy.extensions.compression.gzip.decompressor.v3.Gzip" + window_bits: 9 + chunk_size: 8192 + # If this ratio is set too low, then body data will not be decompressed completely. 
+ max_inflate_ratio: 1000 - name: envoy.filters.http.router typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router diff --git a/arch/tools/cli/config_generator.py b/arch/tools/cli/config_generator.py index 33741ee9..1e5fd4a3 100644 --- a/arch/tools/cli/config_generator.py +++ b/arch/tools/cli/config_generator.py @@ -47,13 +47,14 @@ def validate_and_render_schema(): config_schema_yaml = yaml.safe_load(arch_config_schema) inferred_clusters = {} - for prompt_target in config_yaml["prompt_targets"]: - name = prompt_target.get("endpoint", {}).get("name", "") - if name not in inferred_clusters: - inferred_clusters[name] = { - "name": name, - "port": 80, # default port - } + if "prompt_targets" in config_yaml: + for prompt_target in config_yaml["prompt_targets"]: + name = prompt_target.get("endpoint", {}).get("name", "") + if name not in inferred_clusters: + inferred_clusters[name] = { + "name": name, + "port": 80, # default port + } print(inferred_clusters) endpoints = config_yaml.get("endpoints", {}) diff --git a/archgw.code-workspace b/archgw.code-workspace index 9148057d..230e2225 100644 --- a/archgw.code-workspace +++ b/archgw.code-workspace @@ -20,6 +20,10 @@ "name": "chatbot_ui", "path": "chatbot_ui" }, + { + "name": "e2e_tests", + "path": "e2e_tests" + }, { "name": "demos/function_calling", "path": "./demos/function_calling", diff --git a/chatbot_ui/.vscode/launch.json b/chatbot_ui/.vscode/launch.json index 8b42a191..2064a252 100644 --- a/chatbot_ui/.vscode/launch.json +++ b/chatbot_ui/.vscode/launch.json @@ -14,7 +14,9 @@ "console": "integratedTerminal", "env": { "LLM": "1", - "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1" + "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1", + "STREAMING": "True", + "ARCH_CONFIG": "../../demos/function_calling/arch_config.yaml" } }, { @@ -29,17 +31,5 @@ "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1" } }, - { - "name": "chatbot-ui streaming", - "cwd": 
"${workspaceFolder}/app", - "type": "debugpy", - "request": "launch", - "program": "run_stream.py", - "console": "integratedTerminal", - "env": { - "LLM": "1", - "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1" - } - } ] } diff --git a/chatbot_ui/app/arch_util.py b/chatbot_ui/app/arch_util.py new file mode 100644 index 00000000..567640e5 --- /dev/null +++ b/chatbot_ui/app/arch_util.py @@ -0,0 +1,20 @@ +import json + + +ARCH_STATE_HEADER = "x-arch-state" + + +def get_arch_messages(response_json): + arch_messages = [] + if response_json and "metadata" in response_json: + # load arch_state from metadata + arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}") + # parse arch_state into json object + arch_state = json.loads(arch_state_str) + # load messages from arch_state + arch_messages_str = arch_state.get("messages", "[]") + # parse messages into json object + arch_messages = json.loads(arch_messages_str) + # append messages from arch gateway to history + return arch_messages + return [] diff --git a/chatbot_ui/app/run.py b/chatbot_ui/app/run.py index 05a6a6db..b0d5acc6 100644 --- a/chatbot_ui/app/run.py +++ b/chatbot_ui/app/run.py @@ -2,14 +2,17 @@ import json import os import logging import yaml +from arch_util import get_arch_messages import gradio as gr from typing import List, Optional, Tuple -from openai import OpenAI, DefaultHttpxClient +from openai import OpenAI from dotenv import load_dotenv load_dotenv() +STREAM_RESPONSE = bool(os.getenv("STREAM_RESPOSE", True)) + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", @@ -20,7 +23,6 @@ log = logging.getLogger(__name__) CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT") log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}") -ARCH_STATE_HEADER = "x-arch-state" CSS_STYLE = """ .json-container { @@ -37,7 +39,7 @@ footer {visibility: hidden} client = OpenAI( api_key="--", base_url=CHAT_COMPLETION_ENDPOINT, - 
http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}), + # http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}), ) @@ -69,7 +71,7 @@ def convert_prompt_target_to_openai_format(target): def get_prompt_targets(): try: - with open("arch_config.yaml", "r") as file: + with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file: config = yaml.safe_load(file) available_tools = [] @@ -105,48 +107,85 @@ def chat(query: Optional[str], conversation: Optional[List[Tuple[str, str]]], st temperature=1.0, # metadata=metadata, extra_headers=custom_headers, + stream=STREAM_RESPONSE, ) except Exception as e: log.info(e) # remove last user message in case of exception history.pop() - log.info("Error calling gateway API: {}".format(e.message)) - raise gr.Error("Error calling gateway API: {}".format(e.message)) + log.info("Error calling gateway API: {}".format(e)) + raise gr.Error("Error calling gateway API: {}".format(e)) - log.error(f"raw_response: {raw_response.text}") - response = raw_response.parse() + if STREAM_RESPONSE: + response = raw_response.parse() + history.append({"role": "assistant", "content": "", "model": ""}) + # for gradio UI we don't want to show raw tool calls and messages from developer application + # so we're filtering those out + history_view = [h for h in history if h["role"] != "tool" and "content" in h] - # extract arch_state from metadata and store it in gradio session state - # this state must be passed back to the gateway in the next request - response_json = json.loads(raw_response.text) - log.info(response_json) - if response_json and "metadata" in response_json: - # load arch_state from metadata - arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}") - # parse arch_state into json object - arch_state = json.loads(arch_state_str) - # load messages from arch_state - arch_messages_str = arch_state.get("messages", "[]") - # parse messages into json object - arch_messages = 
json.loads(arch_messages_str) - # append messages from arch gateway to history - for message in arch_messages: - history.append(message) + messages = [ + (history_view[i]["content"], history_view[i + 1]["content"]) + for i in range(0, len(history_view) - 1, 2) + ] - content = response.choices[0].message.content + for chunk in response: + if len(chunk.choices) > 0: + if chunk.choices[0].delta.role: + if history[-1]["role"] != chunk.choices[0].delta.role: + history.append( + { + "role": chunk.choices[0].delta.role, + "content": chunk.choices[0].delta.content, + "model": chunk.model, + "tool_calls": chunk.choices[0].delta.tool_calls, + } + ) - history.append({"role": "assistant", "content": content, "model": response.model}) + history[-1]["model"] = chunk.model + if chunk.choices[0].delta.content: + if not history[-1]["content"]: + history[-1]["content"] = "" + history[-1]["content"] = ( + history[-1]["content"] + chunk.choices[0].delta.content + ) + if chunk.choices[0].delta.tool_calls: + history[-1]["tool_calls"] = chunk.choices[0].delta.tool_calls - # for gradio UI we don't want to show raw tool calls and messages from developer application - # so we're filtering those out - history_view = [h for h in history if h["role"] != "tool" and "content" in h] + if chunk.model and chunk.choices[0].delta.content: + messages[-1] = ( + messages[-1][0], + messages[-1][1] + chunk.choices[0].delta.content, + ) + yield "", messages, state + else: + log.error(f"raw_response: {raw_response.text}") + response = raw_response.parse() - messages = [ - (history_view[i]["content"], history_view[i + 1]["content"]) - for i in range(0, len(history_view) - 1, 2) - ] + # extract arch_state from metadata and store it in gradio session state + # this state must be passed back to the gateway in the next request + response_json = json.loads(raw_response.text) + log.info(response_json) - return "", messages, state + arch_messages = get_arch_messages(response_json) + for arch_message in 
arch_messages: + history.append(arch_message) + + content = response.choices[0].message.content + + history.append( + {"role": "assistant", "content": content, "model": response.model} + ) + + # for gradio UI we don't want to show raw tool calls and messages from developer application + # so we're filtering those out + history_view = [h for h in history if h["role"] != "tool" and "content" in h] + + messages = [ + (history_view[i]["content"], history_view[i + 1]["content"]) + for i in range(0, len(history_view) - 1, 2) + ] + + yield "", messages, state def main(): diff --git a/chatbot_ui/app/run_stream.py b/chatbot_ui/app/run_stream.py deleted file mode 100644 index 8be5a16b..00000000 --- a/chatbot_ui/app/run_stream.py +++ /dev/null @@ -1,36 +0,0 @@ -# copied from https://www.gradio.app/guides/creating-a-chatbot-fast#a-streaming-example-using-openai - -import os -from openai import OpenAI -import gradio as gr - -api_key = os.getenv("OPENAI_API_KEY") -CHAT_COMPLETION_ENDPOINT = os.getenv( - "CHAT_COMPLETION_ENDPOINT", "https://api.openai.com/v1" -) - -client = OpenAI(api_key=api_key, base_url=CHAT_COMPLETION_ENDPOINT) - - -def predict(message, history): - history_openai_format = [] - for human, assistant in history: - history_openai_format.append({"role": "user", "content": human}) - history_openai_format.append({"role": "assistant", "content": assistant}) - history_openai_format.append({"role": "user", "content": message}) - - response = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=history_openai_format, - temperature=1.0, - stream=True, - ) - - partial_message = "" - for chunk in response: - if chunk.choices[0].delta.content is not None: - partial_message = partial_message + chunk.choices[0].delta.content - yield partial_message - - -gr.ChatInterface(predict).launch(server_name="0.0.0.0", server_port=8081) diff --git a/crates/common/src/common_types.rs b/crates/common/src/common_types.rs index c8f91e0f..35404096 100644 --- 
a/crates/common/src/common_types.rs +++ b/crates/common/src/common_types.rs @@ -34,11 +34,16 @@ pub struct SearchPointResult { } pub mod open_ai { - use std::collections::HashMap; + use std::{ + collections::{HashMap, VecDeque}, + fmt::Display, + }; use serde::{ser::SerializeMap, Deserialize, Serialize}; use serde_yaml::Value; + use crate::consts::{ARCH_FC_MODEL_NAME, ASSISTANT_ROLE}; + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ChatCompletionsRequest { #[serde(default)] @@ -182,12 +187,16 @@ pub mod open_ai { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Message { pub role: String, + #[serde(skip_serializing_if = "Option::is_none")] pub content: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub model: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub tool_calls: Option>, + #[serde(skip_serializing_if = "Option::is_none")] pub tool_call_id: Option, } @@ -235,17 +244,116 @@ pub mod open_ai { pub metadata: Option>, } + impl ChatCompletionsResponse { + pub fn new(message: String) -> Self { + ChatCompletionsResponse { + choices: vec![Choice { + message: Message { + role: ASSISTANT_ROLE.to_string(), + content: Some(message), + model: Some(ARCH_FC_MODEL_NAME.to_string()), + tool_calls: None, + tool_call_id: None, + }, + index: 0, + finish_reason: "done".to_string(), + }], + usage: None, + model: ARCH_FC_MODEL_NAME.to_string(), + metadata: None, + } + } + } + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Usage { pub completion_tokens: usize, } #[derive(Debug, Clone, Serialize, Deserialize)] - pub struct ChatCompletionChunkResponse { - pub model: String, + pub struct ChatCompletionStreamResponse { + #[serde(skip_serializing_if = "Option::is_none")] + pub model: Option, pub choices: Vec, } + impl ChatCompletionStreamResponse { + pub fn new( + response: Option, + role: Option, + model: Option, + tool_calls: Option>, + ) -> Self { + ChatCompletionStreamResponse { + model, + choices: vec![ChunkChoice { + 
delta: Delta { + role, + content: response, + tool_calls, + model: None, + tool_call_id: None, + }, + finish_reason: None, + }], + } + } + } + + #[derive(Debug, thiserror::Error)] + pub enum ChatCompletionChunkResponseError { + #[error("failed to deserialize")] + Deserialization(#[from] serde_json::Error), + #[error("empty content in data chunk")] + EmptyContent, + #[error("no chunks present")] + NoChunks, + } + + pub struct ChatCompletionStreamResponseServerEvents { + pub events: Vec, + } + + impl Display for ChatCompletionStreamResponseServerEvents { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let tokens_str = self + .events + .iter() + .map(|response_chunk| { + if response_chunk.choices.is_empty() { + return "".to_string(); + } + response_chunk.choices[0] + .delta + .content + .clone() + .unwrap_or("".to_string()) + }) + .collect::>() + .join(""); + + write!(f, "{}", tokens_str) + } + } + + impl TryFrom<&str> for ChatCompletionStreamResponseServerEvents { + type Error = ChatCompletionChunkResponseError; + + fn try_from(value: &str) -> Result { + let response_chunks: VecDeque = value + .lines() + .filter(|line| line.starts_with("data: ")) + .map(|line| line.get(6..).unwrap()) + .filter(|data_chunk| *data_chunk != "[DONE]") + .map(serde_json::from_str::) + .collect::, _>>()?; + + Ok(ChatCompletionStreamResponseServerEvents { + events: response_chunks.into(), + }) + } + } + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ChunkChoice { pub delta: Delta, @@ -255,7 +363,30 @@ pub mod open_ai { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Delta { + #[serde(skip_serializing_if = "Option::is_none")] + pub role: Option, + + #[serde(skip_serializing_if = "Option::is_none")] pub content: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_calls: Option>, + + #[serde(skip_serializing_if = "Option::is_none")] + pub model: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_call_id: 
Option, + } + + pub fn to_server_events(chunks: Vec) -> String { + let mut response_str = String::new(); + for chunk in chunks.iter() { + response_str.push_str("data: "); + response_str.push_str(&serde_json::to_string(&chunk).unwrap()); + response_str.push_str("\n\n"); + } + response_str } } @@ -313,7 +444,7 @@ pub struct PromptGuardResponse { #[cfg(test)] mod test { - use crate::common_types::open_ai::Message; + use crate::common_types::open_ai::{ChatCompletionStreamResponseServerEvents, Message}; use pretty_assertions::{assert_eq, assert_ne}; use std::collections::HashMap; @@ -448,4 +579,173 @@ mod test { ParameterType::String ); } + + #[test] + fn stream_chunk_parse() { + use super::open_ai::{ChatCompletionStreamResponse, ChunkChoice, Delta}; + + const CHUNK_RESPONSE: &str = r#"data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}]} + +data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"Hello"},"logprobs":null,"finish_reason":null}]} + +data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"!"},"logprobs":null,"finish_reason":null}]} + +data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" How"},"logprobs":null,"finish_reason":null}]} + +data: 
{"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" can"},"logprobs":null,"finish_reason":null}]} + + +"#; + + let sever_events = + ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap(); + assert_eq!(sever_events.events.len(), 5); + assert_eq!( + sever_events.events[0].choices[0] + .delta + .content + .as_ref() + .unwrap(), + "" + ); + assert_eq!( + sever_events.events[1].choices[0] + .delta + .content + .as_ref() + .unwrap(), + "Hello" + ); + assert_eq!( + sever_events.events[2].choices[0] + .delta + .content + .as_ref() + .unwrap(), + "!" + ); + assert_eq!( + sever_events.events[3].choices[0] + .delta + .content + .as_ref() + .unwrap(), + " How" + ); + assert_eq!( + sever_events.events[4].choices[0] + .delta + .content + .as_ref() + .unwrap(), + " can" + ); + assert_eq!(sever_events.to_string(), "Hello! How can"); + } + + #[test] + fn stream_chunk_parse_done() { + use super::open_ai::{ChatCompletionStreamResponse, ChunkChoice, Delta}; + + const CHUNK_RESPONSE: &str = r#"data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" I"},"logprobs":null,"finish_reason":null}]} + +data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" assist"},"logprobs":null,"finish_reason":null}]} + +data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" you"},"logprobs":null,"finish_reason":null}]} + +data: 
{"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" today"},"logprobs":null,"finish_reason":null}]} + +data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}]} + +data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]} + +data: [DONE] +"#; + + let sever_events: ChatCompletionStreamResponseServerEvents = + ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap(); + assert_eq!(sever_events.events.len(), 6); + assert_eq!( + sever_events.events[0].choices[0] + .delta + .content + .as_ref() + .unwrap(), + " I" + ); + assert_eq!( + sever_events.events[1].choices[0] + .delta + .content + .as_ref() + .unwrap(), + " assist" + ); + assert_eq!( + sever_events.events[2].choices[0] + .delta + .content + .as_ref() + .unwrap(), + " you" + ); + assert_eq!( + sever_events.events[3].choices[0] + .delta + .content + .as_ref() + .unwrap(), + " today" + ); + assert_eq!( + sever_events.events[4].choices[0] + .delta + .content + .as_ref() + .unwrap(), + "?" 
+ ); + assert_eq!(sever_events.events[5].choices[0].delta.content, None); + + assert_eq!(sever_events.to_string(), " I assist you today?"); + } + + #[test] + fn stream_chunk_parse_mistral() { + use super::open_ai::{ChatCompletionStreamResponse, ChunkChoice, Delta}; + + const CHUNK_RESPONSE: &str = r#"data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"!"},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" How"},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" can"},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" I"},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" assist"},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" you"},"finish_reason":null}]} + +data: 
{"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" today"},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"?"},"finish_reason":null}]} + +data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":""},"finish_reason":"stop"}],"usage":{"prompt_tokens":4,"total_tokens":13,"completion_tokens":9}} + +data: [DONE] +"#; + + let sever_events: ChatCompletionStreamResponseServerEvents = + ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap(); + assert_eq!(sever_events.events.len(), 11); + + assert_eq!( + sever_events.to_string(), + "Hello! How can I assist you today?" + ); + } } diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 293dad09..ef57845a 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -27,12 +27,12 @@ pub enum GatewayMode { pub struct Configuration { pub version: String, pub listener: Listener, - pub endpoints: HashMap, + pub endpoints: Option>, pub llm_providers: Vec, pub overrides: Option, pub system_prompt: Option, pub prompt_guards: Option, - pub prompt_targets: Vec, + pub prompt_targets: Option>, pub error_target: Option, pub ratelimits: Option>, pub tracing: Option, @@ -246,8 +246,10 @@ mod test { ); let prompt_targets = &config.prompt_targets; - assert_eq!(prompt_targets.len(), 2); + assert_eq!(prompt_targets.as_ref().unwrap().len(), 2); let prompt_target = prompt_targets + .as_ref() + .unwrap() .iter() .find(|p| p.name == "reboot_network_device") .unwrap(); @@ -255,6 +257,8 @@ mod test { assert_eq!(prompt_target.default, None); let prompt_target = prompt_targets + .as_ref() + 
.unwrap() .iter() .find(|p| p.name == "information_extraction") .unwrap(); diff --git a/crates/common/src/consts.rs b/crates/common/src/consts.rs index 81df31f8..f782cf99 100644 --- a/crates/common/src/consts.rs +++ b/crates/common/src/consts.rs @@ -18,6 +18,7 @@ pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider"; pub const MESSAGES_KEY: &str = "messages"; pub const ARCH_PROVIDER_HINT_HEADER: &str = "x-arch-llm-provider-hint"; pub const CHAT_COMPLETIONS_PATH: &str = "/v1/chat/completions"; +pub const HEALTHZ_PATH: &str = "/healthz"; pub const ARCH_STATE_HEADER: &str = "x-arch-state"; pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function-1.5B"; pub const REQUEST_ID_HEADER: &str = "x-request-id"; @@ -25,4 +26,5 @@ pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal"; pub const ARCH_UPSTREAM_HOST_HEADER: &str = "x-arch-upstream"; pub const ARCH_LLM_UPSTREAM_LISTENER: &str = "arch_llm_listener"; pub const ARCH_MODEL_PREFIX: &str = "Arch"; -pub const HALLUCINATION_TEMPLATE: &str = "It seems I’m missing some information. Could you provide the following details "; +pub const HALLUCINATION_TEMPLATE: &str = + "It seems I'm missing some information. 
Could you provide the following details "; diff --git a/crates/common/src/errors.rs b/crates/common/src/errors.rs index 27b0341e..6808793d 100644 --- a/crates/common/src/errors.rs +++ b/crates/common/src/errors.rs @@ -1,6 +1,7 @@ use proxy_wasm::types::Status; +use serde_json::error; -use crate::ratelimit; +use crate::{common_types::open_ai::ChatCompletionChunkResponseError, ratelimit}; #[derive(thiserror::Error, Debug)] pub enum ClientError { @@ -37,4 +38,6 @@ pub enum ServerError { ExceededRatelimit(ratelimit::Error), #[error("{why}")] BadRequest { why: String }, + #[error("error in streaming response")] + Streaming(#[from] ChatCompletionChunkResponseError), } diff --git a/crates/common/src/tokenizer.rs b/crates/common/src/tokenizer.rs index 25ac924e..aa0870f2 100644 --- a/crates/common/src/tokenizer.rs +++ b/crates/common/src/tokenizer.rs @@ -1,17 +1,19 @@ use log::debug; -#[derive(Debug, PartialEq, Eq)] +#[derive(thiserror::Error, Debug, PartialEq, Eq)] #[allow(dead_code)] pub enum Error { - UnknownModel, - FailedToTokenize, + #[error("Unknown model: {model_name}")] + UnknownModel { model_name: String }, } #[allow(dead_code)] pub fn token_count(model_name: &str, text: &str) -> Result { debug!("getting token count model={}", model_name); // Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton? 
- let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel)?; + let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel { + model_name: model_name.to_string(), + })?; Ok(bpe.encode_ordinary(text).len()) } @@ -32,7 +34,9 @@ mod test { #[test] fn unrecognized_model() { assert_eq!( - Error::UnknownModel, + Error::UnknownModel { + model_name: "unknown".to_string() + }, token_count("unknown", "").expect_err("unknown model") ) } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index bd2fba5e..c0e1212c 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -1,23 +1,21 @@ use crate::filter_context::WasmMetrics; use common::common_types::open_ai::{ - ArchState, ChatCompletionChunkResponse, ChatCompletionsRequest, ChatCompletionsResponse, - Message, ToolCall, ToolCallState, + ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse, + StreamOptions, }; use common::configuration::LlmProvider; use common::consts::{ - ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, ARCH_STATE_HEADER, CHAT_COMPLETIONS_PATH, - RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, USER_ROLE, + ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, + RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, }; use common::errors::ServerError; use common::llm_providers::LlmProviders; use common::ratelimit::Header; use common::{ratelimit, routing, tokenizer}; use http::StatusCode; -use log::debug; +use log::{debug, trace, warn}; use proxy_wasm::traits::*; use proxy_wasm::types::*; -use serde_json::Value; -use sha2::{Digest, Sha256}; use std::num::NonZero; use std::rc::Rc; @@ -26,15 +24,10 @@ use common::stats::IncrementingMetric; pub struct StreamContext { context_id: u32, metrics: Rc, - tool_calls: Option>, - tool_call_response: Option, - arch_state: Option>, ratelimit_selector: Option
, streaming_response: bool, - user_prompt: Option, response_tokens: usize, is_chat_completions_request: bool, - chat_completions_request: Option, llm_providers: Rc, llm_provider: Option>, request_id: Option, @@ -45,13 +38,8 @@ impl StreamContext { StreamContext { context_id, metrics, - chat_completions_request: None, - tool_calls: None, - tool_call_response: None, - arch_state: None, ratelimit_selector: None, streaming_response: false, - user_prompt: None, response_tokens: 0, is_chat_completions_request: false, llm_providers, @@ -223,6 +211,21 @@ impl HttpContext for StreamContext { .clone_from(&self.llm_provider.as_ref().unwrap().model); let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap(); + trace!( + "arch => {:?}, body: {}", + deserialized_body.model, + chat_completion_request_str + ); + + if deserialized_body.stream { + self.streaming_response = true; + } + if deserialized_body.stream && deserialized_body.stream_options.is_none() { + deserialized_body.stream_options = Some(StreamOptions { + include_usage: true, + }); + } + // enforce ratelimits on ingress if let Err(e) = self.enforce_ratelimits(&deserialized_body.model, &chat_completion_request_str) @@ -235,10 +238,6 @@ impl HttpContext for StreamContext { return Action::Continue; } - debug!( - "arch => {:?}, body: {}", - deserialized_body.model, chat_completion_request_str - ); self.set_http_request_body(0, body_size, chat_completion_request_str.as_bytes()); Action::Continue @@ -246,78 +245,112 @@ impl HttpContext for StreamContext { fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action { debug!( - "recv [S={}] bytes={} end_stream={}", + "on_http_response_body [S={}] bytes={} end_stream={}", self.context_id, body_size, end_of_stream ); if !self.is_chat_completions_request { - if let Some(body_str) = self - .get_http_response_body(0, body_size) - .and_then(|bytes| String::from_utf8(bytes).ok()) - { - debug!("recv [S={}] body_str={}", 
self.context_id, body_str); - } + debug!("non-chatcompletion request"); return Action::Continue; } - if !end_of_stream { - return Action::Pause; - } - - let body = self - .get_http_response_body(0, body_size) - .expect("cant get response body"); - - if self.streaming_response { - let body_str = String::from_utf8(body).expect("body is not utf-8"); - debug!("streaming response"); - let chat_completions_data = match body_str.split_once("data: ") { - Some((_, chat_completions_data)) => chat_completions_data, + let body = if self.streaming_response { + if end_of_stream && body_size == 0 { + return Action::Continue; + } + let chunk_start = 0; + let chunk_size = body_size; + debug!( + "streaming response reading, {}..{}", + chunk_start, chunk_size + ); + let streaming_chunk = match self.get_http_response_body(0, chunk_size) { + Some(chunk) => chunk, None => { - self.send_server_error( - ServerError::LogicError(String::from("parsing error in streaming data")), - None, + warn!( + "response body empty, chunk_start: {}, chunk_size: {}", + chunk_start, chunk_size ); - return Action::Pause; + return Action::Continue; } }; - let chat_completions_chunk_response: ChatCompletionChunkResponse = - match serde_json::from_str(chat_completions_data) { - Ok(de) => de, - Err(_) => { - if chat_completions_data != "[NONE]" { - self.send_server_error( - ServerError::LogicError(String::from( - "error in streaming response", - )), - None, - ); - return Action::Continue; - } + if streaming_chunk.len() != chunk_size { + warn!( + "chunk size mismatch: read: {} != requested: {}", + streaming_chunk.len(), + chunk_size + ); + } + streaming_chunk + } else { + debug!("non streaming response bytes read: 0:{}", body_size); + match self.get_http_response_body(0, body_size) { + Some(body) => body, + None => { + warn!("non streaming response body empty"); + return Action::Continue; + } + } + }; + + let body_utf8 = match String::from_utf8(body) { + Ok(body_utf8) => body_utf8, + Err(e) => { + debug!("could 
not convert to utf8: {}", e); + return Action::Continue; + } + }; + + if self.streaming_response { + let chat_completions_chunk_response_events = + match ChatCompletionStreamResponseServerEvents::try_from(body_utf8.as_str()) { + Ok(response) => response, + Err(e) => { + debug!( + "invalid streaming response: body str: {}, {:?}", + body_utf8, e + ); return Action::Continue; } }; - if let Some(content) = chat_completions_chunk_response - .choices + if chat_completions_chunk_response_events.events.is_empty() { + debug!("empty streaming response"); + return Action::Continue; + } + + let mut model = chat_completions_chunk_response_events + .events .first() .unwrap() - .delta - .content - .as_ref() + .model + .clone(); + let tokens_str = chat_completions_chunk_response_events.to_string(); + //HACK: add support for tokenizing mistral and other models + //filed issue https://github.com/katanemo/arch/issues/222 + if model.as_ref().unwrap().starts_with("mistral") + || model.as_ref().unwrap().starts_with("ministral") { - let model = &chat_completions_chunk_response.model; - let token_count = tokenizer::token_count(model, content).unwrap_or(0); - self.response_tokens += token_count; + model = Some("gpt-4".to_string()); } + let token_count = + match tokenizer::token_count(model.as_ref().unwrap().as_str(), tokens_str.as_str()) + { + Ok(token_count) => token_count, + Err(e) => { + debug!("could not get token count: {:?}", e); + return Action::Continue; + } + }; + self.response_tokens += token_count; } else { debug!("non streaming response"); let chat_completions_response: ChatCompletionsResponse = - match serde_json::from_slice(&body) { + match serde_json::from_str(body_utf8.as_str()) { Ok(de) => de, Err(_e) => { - debug!("invalid response: {}", String::from_utf8_lossy(&body)); + debug!("invalid response: {}", body_utf8); return Action::Continue; } }; @@ -329,65 +362,6 @@ impl HttpContext for StreamContext { .unwrap() .completion_tokens; } - - if let Some(tool_calls) = 
self.tool_calls.as_ref() { - if !tool_calls.is_empty() { - if self.arch_state.is_none() { - self.arch_state = Some(Vec::new()); - } - - // compute sha hash from message history - let mut hasher = Sha256::new(); - let prompts: Vec = self - .chat_completions_request - .as_ref() - .unwrap() - .messages - .iter() - .filter(|msg| msg.role == USER_ROLE) - .map(|msg| msg.content.clone().unwrap()) - .collect(); - let prompts_merged = prompts.join("#.#"); - hasher.update(prompts_merged.clone()); - let hash_key = hasher.finalize(); - // conver hash to hex string - let hash_key_str = format!("{:x}", hash_key); - debug!("hash key: {}, prompts: {}", hash_key_str, prompts_merged); - - // create new tool call state - let tool_call_state = ToolCallState { - key: hash_key_str, - message: self.user_prompt.clone(), - tool_call: tool_calls[0].function.clone(), - tool_response: self.tool_call_response.clone().unwrap(), - }; - - // push tool call state to arch state - self.arch_state - .as_mut() - .unwrap() - .push(ArchState::ToolCall(vec![tool_call_state])); - - let mut data: Value = serde_json::from_slice(&body).unwrap(); - // use serde::Value to manipulate the json object and ensure that we don't lose any data - if let Value::Object(ref mut map) = data { - // serialize arch state and add to metadata - let arch_state_str = serde_json::to_string(&self.arch_state).unwrap(); - debug!("arch_state: {}", arch_state_str); - let metadata = map - .entry("metadata") - .or_insert(Value::Object(serde_json::Map::new())); - metadata.as_object_mut().unwrap().insert( - ARCH_STATE_HEADER.to_string(), - serde_json::Value::String(arch_state_str), - ); - - let data_serialized = serde_json::to_string(&data).unwrap(); - debug!("arch => user: {}", data_serialized); - self.set_http_response_body(0, body_size, data_serialized.as_bytes()); - }; - } - } } debug!( @@ -395,7 +369,6 @@ impl HttpContext for StreamContext { self.context_id, self.response_tokens, end_of_stream ); - // TODO:: ratelimit based on 
response tokens. Action::Continue } } diff --git a/crates/llm_gateway/tests/integration.rs b/crates/llm_gateway/tests/integration.rs index 7ec92ccd..5a5ed4a8 100644 --- a/crates/llm_gateway/tests/integration.rs +++ b/crates/llm_gateway/tests/integration.rs @@ -149,14 +149,14 @@ ratelimits: key: selector-key value: selector-value limit: - tokens: 50 + tokens: 100 unit: minute "# } #[test] #[serial] -fn successful_request_to_open_ai_chat_completions() { +fn llm_gateway_successful_request_to_open_ai_chat_completions() { let args = tester::MockSettings { wasm_path: wasm_module(), quiet: false, @@ -207,7 +207,7 @@ fn successful_request_to_open_ai_chat_completions() { ) .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) - .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Trace), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) @@ -217,7 +217,7 @@ fn successful_request_to_open_ai_chat_completions() { #[test] #[serial] -fn bad_request_to_open_ai_chat_completions() { +fn llm_gateway_bad_request_to_open_ai_chat_completions() { let args = tester::MockSettings { wasm_path: wasm_module(), quiet: false, @@ -279,7 +279,7 @@ fn bad_request_to_open_ai_chat_completions() { #[test] #[serial] -fn request_ratelimited() { +fn llm_gateway_request_ratelimited() { let args = tester::MockSettings { wasm_path: wasm_module(), quiet: false, @@ -306,11 +306,11 @@ fn request_ratelimited() { \"messages\": [\ {\ \"role\": \"system\",\ - \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\ + \"content\": \"You are a helpful poetic assistant!, skilled in explaining complex programming concepts with creative flair. 
Be sure to be concise and to the point.\"\ },\ {\ \"role\": \"user\",\ - \"content\": \"Compose a poem that explains the concept of recursion in programming. Compose a poem that explains the concept of recursion in programming. Compose a poem that explains the concept of recursion in programming. \"\ + \"content\": \"Compose a poem that explains the concept of recursion in programming. Compose a poem that explains the concept of recursion in programming. Compose a poem that explains the concept of recursion in programming. And also summarize it how a 4th graded would understand it.\"\ }\ ],\ \"model\": \"gpt-4\"\ @@ -325,6 +325,7 @@ fn request_ratelimited() { .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, we just need to grab the token_id + .expect_log(Some(LogLevel::Trace), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) @@ -342,7 +343,7 @@ fn request_ratelimited() { #[test] #[serial] -fn request_not_ratelimited() { +fn llm_gateway_request_not_ratelimited() { let args = tester::MockSettings { wasm_path: wasm_module(), quiet: false, @@ -388,17 +389,10 @@ fn request_not_ratelimited() { .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, we just need to grab the token_id + .expect_log(Some(LogLevel::Trace), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) - // .expect_metric_increment("active_http_calls", 1) - .expect_send_local_response( - Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()), - None, - None, - None, - ) - .expect_metric_increment("ratelimited_rq", 1) + .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::Action(Action::Continue)) 
.unwrap(); } diff --git a/crates/prompt_gateway/src/filter_context.rs b/crates/prompt_gateway/src/filter_context.rs index 3f1d3f0d..de120369 100644 --- a/crates/prompt_gateway/src/filter_context.rs +++ b/crates/prompt_gateway/src/filter_context.rs @@ -243,7 +243,7 @@ impl RootContext for FilterContext { self.overrides = Rc::new(config.overrides); let mut prompt_targets = HashMap::new(); - for pt in config.prompt_targets { + for pt in config.prompt_targets.unwrap_or_default() { prompt_targets.insert(pt.name.clone(), pt.clone()); } self.system_prompt = Rc::new(config.system_prompt); diff --git a/crates/prompt_gateway/src/hallucination.rs b/crates/prompt_gateway/src/hallucination.rs index c4425957..130f8723 100644 --- a/crates/prompt_gateway/src/hallucination.rs +++ b/crates/prompt_gateway/src/hallucination.rs @@ -1,9 +1,9 @@ use common::{ common_types::open_ai::Message, - consts::{ARCH_MODEL_PREFIX, USER_ROLE, HALLUCINATION_TEMPLATE}, + consts::{ARCH_MODEL_PREFIX, HALLUCINATION_TEMPLATE, USER_ROLE}, }; -pub fn extract_messages_for_hallucination(messages: &Vec) -> Vec { +pub fn extract_messages_for_hallucination(messages: &[Message]) -> Vec { let mut arch_assistant = false; let mut user_messages = Vec::new(); if messages.len() >= 2 { @@ -18,11 +18,11 @@ pub fn extract_messages_for_hallucination(messages: &Vec) -> Vec) -> Vec archgw: {}", String::from_utf8_lossy(&body_bytes)); + debug!( + "developer => archgw: {}", + String::from_utf8_lossy(&body_bytes) + ); // Deserialize body into spec. // Currently OpenAI API. 
- let mut deserialized_body: ChatCompletionsRequest = - match serde_json::from_slice(&body_bytes) { - Ok(deserialized) => deserialized, - Err(e) => { - self.send_server_error( - ServerError::Deserialization(e), - Some(StatusCode::BAD_REQUEST), - ); - return Action::Pause; - } - }; + let deserialized_body: ChatCompletionsRequest = match serde_json::from_slice(&body_bytes) { + Ok(deserialized) => deserialized, + Err(e) => { + self.send_server_error( + ServerError::Deserialization(e), + Some(StatusCode::BAD_REQUEST), + ); + return Action::Pause; + } + }; self.arch_state = match deserialized_body.metadata { Some(ref metadata) => { @@ -110,11 +121,6 @@ impl HttpContext for StreamContext { }; self.streaming_response = deserialized_body.stream; - if deserialized_body.stream && deserialized_body.stream_options.is_none() { - deserialized_body.stream_options = Some(StreamOptions { - include_usage: true, - }); - } let last_user_prompt = match deserialized_body .messages @@ -235,105 +241,111 @@ impl HttpContext for StreamContext { ); if !self.is_chat_completions_request { - if let Some(body_str) = self - .get_http_response_body(0, body_size) - .and_then(|bytes| String::from_utf8(bytes).ok()) - { - debug!("recv [S={}] body_str={}", self.context_id, body_str); - } + debug!("non-streaming request"); return Action::Continue; } - if !end_of_stream { - return Action::Pause; - } + let body = if self.streaming_response { + let streaming_chunk = match self.get_http_response_body(0, body_size) { + Some(chunk) => chunk, + None => { + warn!( + "response body empty, chunk_start: {}, chunk_size: {}", + 0, body_size + ); + return Action::Continue; + } + }; - let body = self - .get_http_response_body(0, body_size) - .expect("cant get response body"); + if streaming_chunk.len() != body_size { + warn!( + "chunk size mismatch: read: {} != requested: {}", + streaming_chunk.len(), + body_size + ); + } + + streaming_chunk + } else { + debug!("non streaming response bytes read: 0:{}", body_size); + 
match self.get_http_response_body(0, body_size) { + Some(body) => body, + None => { + warn!("non streaming response body empty"); + return Action::Continue; + } + } + }; + + let body_utf8 = match String::from_utf8(body) { + Ok(body_utf8) => body_utf8, + Err(e) => { + debug!("could not convert to utf8: {}", e); + return Action::Continue; + } + }; if self.streaming_response { trace!("streaming response"); - } else { - trace!("non streaming response"); - let chat_completions_response: ChatCompletionsResponse = - match serde_json::from_slice(&body) { - Ok(de) => de, - Err(e) => { - trace!( - "invalid response: {}, {}", - String::from_utf8_lossy(&body), - e - ); - return Action::Continue; - } - }; - if chat_completions_response.usage.is_some() { - self.response_tokens += chat_completions_response - .usage - .as_ref() - .unwrap() - .completion_tokens; + if self.tool_calls.is_some() && !self.tool_calls.as_ref().unwrap().is_empty() { + let chunks = vec![ + ChatCompletionStreamResponse::new( + None, + Some(ASSISTANT_ROLE.to_string()), + Some(ARCH_FC_MODEL_NAME.to_string()), + self.tool_calls.to_owned(), + ), + ChatCompletionStreamResponse::new( + self.tool_call_response.clone(), + Some(TOOL_ROLE.to_string()), + Some(ARCH_FC_MODEL_NAME.to_string()), + None, + ), + ]; + + let mut response_str = to_server_events(chunks); + // append the original response from the model to the stream + response_str.push_str(&body_utf8); + self.set_http_response_body(0, body_size, response_str.as_bytes()); + self.tool_calls = None; } + } else if let Some(tool_calls) = self.tool_calls.as_ref() { + if !tool_calls.is_empty() { + if self.arch_state.is_none() { + self.arch_state = Some(Vec::new()); + } - if let Some(tool_calls) = self.tool_calls.as_ref() { - if !tool_calls.is_empty() { - if self.arch_state.is_none() { - self.arch_state = Some(Vec::new()); + let mut data = serde_json::from_str(&body_utf8).unwrap(); + // use serde::Value to manipulate the json object and ensure that we don't lose any 
data + if let Value::Object(ref mut map) = data { + // serialize arch state and add to metadata + let metadata = map + .entry("metadata") + .or_insert(Value::Object(serde_json::Map::new())); + if metadata == &Value::Null { + *metadata = Value::Object(serde_json::Map::new()); } - let mut data = serde_json::from_slice(&body).unwrap(); - // use serde::Value to manipulate the json object and ensure that we don't lose any data - if let Value::Object(ref mut map) = data { - // serialize arch state and add to metadata - let metadata = map - .entry("metadata") - .or_insert(Value::Object(serde_json::Map::new())); - if metadata == &Value::Null { - *metadata = Value::Object(serde_json::Map::new()); - } - - // since arch gateway generates tool calls (using arch-fc) and calls upstream api to - // get response, we will send these back to developer so they can see the api response - // and tool call arch-fc generated - let fc_messages = vec![ - Message { - role: ASSISTANT_ROLE.to_string(), - content: None, - model: Some(ARCH_FC_MODEL_NAME.to_string()), - tool_calls: self.tool_calls.clone(), - tool_call_id: None, - }, - Message { - role: TOOL_ROLE.to_string(), - content: self.tool_call_response.clone(), - model: None, - tool_calls: None, - tool_call_id: Some(self.tool_calls.as_ref().unwrap()[0].id.clone()), - }, - ]; - let fc_messages_str = serde_json::to_string(&fc_messages).unwrap(); - let arch_state = HashMap::from([("messages".to_string(), fc_messages_str)]); - let arch_state_str = serde_json::to_string(&arch_state).unwrap(); - metadata.as_object_mut().unwrap().insert( - ARCH_STATE_HEADER.to_string(), - serde_json::Value::String(arch_state_str), - ); - let data_serialized = serde_json::to_string(&data).unwrap(); - debug!("archgw <= developer: {}", data_serialized); - self.set_http_response_body(0, body_size, data_serialized.as_bytes()); - }; - } + let fc_messages = vec![ + self.generate_toll_call_message(), + self.generate_api_response_message(), + ]; + let fc_messages_str = 
serde_json::to_string(&fc_messages).unwrap(); + let arch_state = HashMap::from([("messages".to_string(), fc_messages_str)]); + let arch_state_str = serde_json::to_string(&arch_state).unwrap(); + metadata.as_object_mut().unwrap().insert( + ARCH_STATE_HEADER.to_string(), + serde_json::Value::String(arch_state_str), + ); + let data_serialized = serde_json::to_string(&data).unwrap(); + debug!("archgw <= developer: {}", data_serialized); + self.set_http_response_body(0, body_size, data_serialized.as_bytes()); + }; } } - trace!( - "recv [S={}] total_tokens={} end_stream={}", - self.context_id, - self.response_tokens, - end_of_stream - ); + trace!("recv [S={}] end_stream={}", self.context_id, end_of_stream); Action::Continue } diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs index 6f4a36ea..5d79d181 100644 --- a/crates/prompt_gateway/src/stream_context.rs +++ b/crates/prompt_gateway/src/stream_context.rs @@ -2,9 +2,9 @@ use crate::filter_context::{EmbeddingsStore, WasmMetrics}; use crate::hallucination::extract_messages_for_hallucination; use acap::cos; use common::common_types::open_ai::{ - ArchState, ChatCompletionTool, ChatCompletionsRequest, ChatCompletionsResponse, Choice, - FunctionDefinition, FunctionParameter, FunctionParameters, Message, ParameterType, ToolCall, - ToolType, + to_server_events, ArchState, ChatCompletionStreamResponse, ChatCompletionTool, + ChatCompletionsRequest, ChatCompletionsResponse, FunctionDefinition, FunctionParameter, + FunctionParameters, Message, ParameterType, ToolCall, ToolType, }; use common::common_types::{ EmbeddingType, HallucinationClassificationRequest, HallucinationClassificationResponse, @@ -12,7 +12,12 @@ use common::common_types::{ }; use common::configuration::{Overrides, PromptGuards, PromptTarget}; use common::consts::{ - ARCH_FC_INTERNAL_HOST, ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, ARCH_INTERNAL_CLUSTER_NAME, MESSAGES_KEY, ARCH_MODEL_PREFIX, ARCH_STATE_HEADER, 
ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, DEFAULT_EMBEDDING_MODEL, HALLUCINATION_TEMPLATE, DEFAULT_HALLUCINATED_THRESHOLD, DEFAULT_INTENT_MODEL, DEFAULT_PROMPT_TARGET_THRESHOLD, EMBEDDINGS_INTERNAL_HOST, HALLUCINATION_INTERNAL_HOST, REQUEST_ID_HEADER, SYSTEM_ROLE, TOOL_ROLE, USER_ROLE, ZEROSHOT_INTERNAL_HOST + ARCH_FC_INTERNAL_HOST, ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, + ARCH_INTERNAL_CLUSTER_NAME, ARCH_MODEL_PREFIX, ARCH_STATE_HEADER, ARCH_UPSTREAM_HOST_HEADER, + ASSISTANT_ROLE, DEFAULT_EMBEDDING_MODEL, DEFAULT_HALLUCINATED_THRESHOLD, DEFAULT_INTENT_MODEL, + DEFAULT_PROMPT_TARGET_THRESHOLD, EMBEDDINGS_INTERNAL_HOST, HALLUCINATION_INTERNAL_HOST, + HALLUCINATION_TEMPLATE, MESSAGES_KEY, REQUEST_ID_HEADER, SYSTEM_ROLE, TOOL_ROLE, USER_ROLE, + ZEROSHOT_INTERNAL_HOST, }; use common::embeddings::{ CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse, @@ -57,7 +62,7 @@ pub struct StreamCallContext { pub struct StreamContext { system_prompt: Rc>, prompt_targets: Rc>, - embeddings_store: Option>, + pub embeddings_store: Option>, overrides: Rc>, pub metrics: Rc, pub callouts: RefCell>, @@ -66,9 +71,8 @@ pub struct StreamContext { pub tool_call_response: Option, pub arch_state: Option>, pub request_body_size: usize, - pub streaming_response: bool, pub user_prompt: Option, - pub response_tokens: usize, + pub streaming_response: bool, pub is_chat_completions_request: bool, pub chat_completions_request: Option, pub prompt_guards: Rc, @@ -99,7 +103,6 @@ impl StreamContext { request_body_size: 0, streaming_response: false, user_prompt: None, - response_tokens: 0, is_chat_completions_request: false, prompt_guards, overrides, @@ -300,13 +303,17 @@ impl StreamContext { body: Vec, callout_context: StreamCallContext, ) { - let boyd_str = String::from_utf8(body).expect("could not convert body to string"); - debug!("archgw <= hallucination response: {}", boyd_str); + let body_str = String::from_utf8(body).expect("could not convert body to string"); + 
debug!("archgw <= hallucination response: {}", body_str); let hallucination_response: HallucinationClassificationResponse = - match serde_json::from_str(boyd_str.as_str()) { + match serde_json::from_str(body_str.as_str()) { Ok(hallucination_response) => hallucination_response, Err(e) => { - warn!("error deserializing hallucination response: {}", e); + warn!( + "error deserializing hallucination response: {}, body: {}", + e, + body_str.as_str() + ); return self.send_server_error(ServerError::Deserialization(e), None); } }; @@ -323,37 +330,36 @@ impl StreamContext { if !keys_with_low_score.is_empty() { let response = - HALLUCINATION_TEMPLATE.to_string() - + &keys_with_low_score.join(", ") - + " ?"; - let message = Message { - role: ASSISTANT_ROLE.to_string(), - content: Some(response), - model: Some(ARCH_FC_MODEL_NAME.to_string()), - tool_calls: None, - tool_call_id: None, - }; + HALLUCINATION_TEMPLATE.to_string() + &keys_with_low_score.join(", ") + " ?"; - let chat_completion_response = ChatCompletionsResponse { - choices: vec![Choice { - message, - index: 0, - finish_reason: "done".to_string(), - }], - usage: None, - model: ARCH_FC_MODEL_NAME.to_string(), - metadata: None, - }; + let response_str = if self.streaming_response { + let chunks = vec![ + ChatCompletionStreamResponse::new( + None, + Some(ASSISTANT_ROLE.to_string()), + Some(ARCH_FC_MODEL_NAME.to_owned()), + None, + ), + ChatCompletionStreamResponse::new( + Some(response), + None, + Some(ARCH_FC_MODEL_NAME.to_owned()), + None, + ), + ]; - trace!("hallucination response: {:?}", chat_completion_response); + to_server_events(chunks) + } else { + let chat_completion_response = ChatCompletionsResponse::new(response); + serde_json::to_string(&chat_completion_response).unwrap() + }; + debug!("hallucination response: {:?}", response_str); + // make sure on_http_response_body does not attach tool calls and tool response to the response + self.tool_calls = None; self.send_http_response( 
StatusCode::OK.as_u16().into(), vec![("Powered-By", "Katanemo")], - Some( - serde_json::to_string(&chat_completion_response) - .unwrap() - .as_bytes(), - ), + Some(response_str.as_bytes()), ); } else { // not a hallucination, resume the flow @@ -629,6 +635,7 @@ impl StreamContext { .message .tool_calls .clone_into(&mut self.tool_calls); + if self.tool_calls.as_ref().unwrap().len() > 1 { warn!( "multiple tool calls not supported yet, tool_calls count found: {}", @@ -643,10 +650,39 @@ impl StreamContext { //TODO: add resolver name to the response so the client can send the response back to the correct resolver + let direct_response_str = if self.streaming_response { + let chunks = vec![ + ChatCompletionStreamResponse::new( + None, + Some(ASSISTANT_ROLE.to_string()), + Some(ARCH_FC_MODEL_NAME.to_owned()), + None, + ), + ChatCompletionStreamResponse::new( + Some( + arch_fc_response.choices[0] + .message + .content + .as_ref() + .unwrap() + .clone(), + ), + None, + Some(ARCH_FC_MODEL_NAME.to_owned()), + None, + ), + ]; + + to_server_events(chunks) + } else { + body_str + }; + + self.tool_calls = None; return self.send_http_response( StatusCode::OK.as_u16().into(), vec![("Powered-By", "Katanemo")], - Some(body_str.as_bytes()), + Some(direct_response_str.as_bytes()), ); } @@ -943,7 +979,7 @@ impl StreamContext { self.get_embeddings(callout_context); } - pub fn default_target_handler(&self, body: Vec, callout_context: StreamCallContext) { + pub fn default_target_handler(&self, body: Vec, mut callout_context: StreamCallContext) { let prompt_target = self .prompt_targets .get(callout_context.prompt_target_name.as_ref().unwrap()) @@ -951,8 +987,34 @@ impl StreamContext { .clone(); // check if the default target should be dispatched to the LLM provider - if !prompt_target.auto_llm_dispatch_on_response.unwrap_or(false) { - let default_target_response_str = String::from_utf8(body).unwrap(); + if !prompt_target + .auto_llm_dispatch_on_response + .unwrap_or_default() + { + let 
default_target_response_str = if self.streaming_response { + let chat_completion_response = + serde_json::from_slice::(&body).unwrap(); + + let chunks = vec![ + ChatCompletionStreamResponse::new( + None, + Some(ASSISTANT_ROLE.to_string()), + Some(chat_completion_response.model.clone()), + None, + ), + ChatCompletionStreamResponse::new( + chat_completion_response.choices[0].message.content.clone(), + None, + Some(chat_completion_response.model.clone()), + None, + ), + ]; + + to_server_events(chunks) + } else { + String::from_utf8(body).unwrap() + }; + self.send_http_response( StatusCode::OK.as_u16().into(), vec![("Powered-By", "Katanemo")], @@ -960,20 +1022,20 @@ impl StreamContext { ); return; } + let chat_completions_resp: ChatCompletionsResponse = match serde_json::from_slice(&body) { Ok(chat_completions_resp) => chat_completions_resp, Err(e) => { - warn!("error deserializing default target response: {}", e); + warn!( + "error deserializing default target response: {}, body str: {}", + e, + String::from_utf8(body).unwrap() + ); return self.send_server_error(ServerError::Deserialization(e), None); } }; - let api_resp = chat_completions_resp.choices[0] - .message - .content - .as_ref() - .unwrap(); - let mut messages = callout_context.request_body.messages; + let mut messages = Vec::new(); // add system prompt match prompt_target.system_prompt.as_ref() { None => {} @@ -989,13 +1051,24 @@ impl StreamContext { } } + messages.append(&mut callout_context.request_body.messages); + + let api_resp = chat_completions_resp.choices[0] + .message + .content + .as_ref() + .unwrap(); + + let user_message = messages.pop().unwrap(); + let message = format!("{}\ncontext: {}", user_message.content.unwrap(), api_resp); messages.push(Message { role: USER_ROLE.to_string(), - content: Some(api_resp.clone()), + content: Some(message), model: None, tool_calls: None, tool_call_id: None, }); + let chat_completion_request = ChatCompletionsRequest { model: self .chat_completions_request @@ 
-1009,11 +1082,32 @@ impl StreamContext { stream_options: callout_context.request_body.stream_options, metadata: None, }; + let json_resp = serde_json::to_string(&chat_completion_request).unwrap(); debug!("archgw => (default target) llm request: {}", json_resp); self.set_http_request_body(0, self.request_body_size, json_resp.as_bytes()); self.resume_http_request(); } + + pub fn generate_toll_call_message(&mut self) -> Message { + Message { + role: ASSISTANT_ROLE.to_string(), + content: None, + model: Some(ARCH_FC_MODEL_NAME.to_string()), + tool_calls: self.tool_calls.clone(), + tool_call_id: None, + } + } + + pub fn generate_api_response_message(&mut self) -> Message { + Message { + role: TOOL_ROLE.to_string(), + content: self.tool_call_response.clone(), + model: None, + tool_calls: None, + tool_call_id: Some(self.tool_calls.as_ref().unwrap()[0].id.clone()), + } + } } impl Client for StreamContext { diff --git a/crates/prompt_gateway/tests/integration.rs b/crates/prompt_gateway/tests/integration.rs index 27eac427..1bf581c5 100644 --- a/crates/prompt_gateway/tests/integration.rs +++ b/crates/prompt_gateway/tests/integration.rs @@ -375,7 +375,7 @@ ratelimits: #[test] #[serial] -fn successful_request_to_open_ai_chat_completions() { +fn prompt_gateway_successful_request_to_open_ai_chat_completions() { let args = tester::MockSettings { wasm_path: wasm_module(), quiet: false, @@ -438,7 +438,7 @@ fn successful_request_to_open_ai_chat_completions() { #[test] #[serial] -fn bad_request_to_open_ai_chat_completions() { +fn prompt_gateway_bad_request_to_open_ai_chat_completions() { let args = tester::MockSettings { wasm_path: wasm_module(), quiet: false, @@ -501,7 +501,7 @@ fn bad_request_to_open_ai_chat_completions() { #[test] #[serial] -fn request_to_llm_gateway() { +fn prompt_gateway_request_to_llm_gateway() { let args = tester::MockSettings { wasm_path: wasm_module(), quiet: false, @@ -669,8 +669,8 @@ fn request_to_llm_gateway() { 
.expect_get_buffer_bytes(Some(BufferType::HttpResponseBody)) .returning(Some(chat_completion_response_str.as_str())) .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) .expect_set_buffer_bytes(Some(BufferType::HttpResponseBody), None) - .expect_log(Some(LogLevel::Trace), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Trace), None) .execute_and_expect(ReturnType::Action(Action::Continue)) diff --git a/demos/function_calling/api_server/app/main.py b/demos/function_calling/api_server/app/main.py index 041a921d..e87a3a21 100644 --- a/demos/function_calling/api_server/app/main.py +++ b/demos/function_calling/api_server/app/main.py @@ -66,18 +66,18 @@ async def insurance_claim_details(req: InsuranceClaimDetailsRequest, res: Respon class DefaultTargetRequest(BaseModel): - arch_messages: list + messages: list @app.post("/default_target") async def default_target(req: DefaultTargetRequest, res: Response): - logger.info(f"Received arch_messages: {req.arch_messages}") + logger.info(f"Received arch_messages: {req.messages}") resp = { "choices": [ { "message": { "role": "assistant", - "content": "hello world from api server", + "content": "I can help you with weather forecast or insurance claim details", }, "finish_reason": "completed", "index": 0, diff --git a/demos/function_calling/arch_config.yaml b/demos/function_calling/arch_config.yaml index a2c92883..e7448c7e 100644 --- a/demos/function_calling/arch_config.yaml +++ b/demos/function_calling/arch_config.yaml @@ -16,12 +16,27 @@ overrides: prompt_target_intent_matching_threshold: 0.6 llm_providers: - - name: gpt - access_key: OPENAI_API_KEY + - name: gpt-4o-mini + access_key: $OPENAI_API_KEY provider: openai - model: gpt-3.5-turbo + model: gpt-4o-mini default: true + - name: gpt-3.5-turbo-0125 + access_key: $OPENAI_API_KEY + provider: openai + model: gpt-3.5-turbo-0125 + + - name: gpt-4o + access_key: $OPENAI_API_KEY + provider: openai + model: gpt-4o + + - name: 
ministral-3b + access_key: $MISTRAL_API_KEY + provider: mistral + model: ministral-3b-latest + system_prompt: | You are a helpful assistant. @@ -67,10 +82,10 @@ prompt_targets: name: api_server path: /default_target system_prompt: | - You are a helpful assistant. Use the information that is provided to you. + You are a helpful assistant! Summarize the user's request and provide a helpful response. # if it is set to false arch will send response that it received from this prompt target to the user # if true arch will forward the response to the default LLM - auto_llm_dispatch_on_response: true + auto_llm_dispatch_on_response: false tracing: random_sampling: 100 diff --git a/demos/function_calling/docker-compose.yaml b/demos/function_calling/docker-compose.yaml index 379b2cf7..40a01743 100644 --- a/demos/function_calling/docker-compose.yaml +++ b/demos/function_calling/docker-compose.yaml @@ -13,11 +13,11 @@ services: chatbot_ui: build: context: ../../chatbot_ui - dockerfile: Dockerfile ports: - "18080:8080" environment: - - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1 #this is only because we are running the sample app in the same docker container environemtn as archgw + # this is only because we are running the sample app in the same docker container environemtn as archgw + - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1 extra_hosts: - "host.docker.internal:host-gateway" volumes: @@ -38,6 +38,8 @@ services: - "${PORT_UI:-55679}:55679" - "${PORT_GRPC:-4317}:4317" - "${PORT_HTTP:-4318}:4318" + profiles: + - monitoring prometheus: image: prom/prometheus diff --git a/demos/llm_routing/arch_config.yaml b/demos/llm_routing/arch_config.yaml new file mode 100644 index 00000000..620a1d10 --- /dev/null +++ b/demos/llm_routing/arch_config.yaml @@ -0,0 +1,32 @@ +version: "0.1-beta" + +listener: + address: 0.0.0.0 + port: 10000 + message_format: huggingface + connect_timeout: 0.005s + +llm_providers: + - name: gpt-4o-mini + access_key: 
$OPENAI_API_KEY + provider: openai + model: gpt-4o-mini + default: true + + - name: gpt-3.5-turbo-0125 + access_key: $OPENAI_API_KEY + provider: openai + model: gpt-3.5-turbo-0125 + + - name: gpt-4o + access_key: $OPENAI_API_KEY + provider: openai + model: gpt-4o + + - name: ministral-3b + access_key: $MISTRAL_API_KEY + provider: mistral + model: ministral-3b-latest + +tracing: + random_sampling: 100 diff --git a/demos/llm_routing/docker-compose.yaml b/demos/llm_routing/docker-compose.yaml new file mode 100644 index 00000000..1ce6963b --- /dev/null +++ b/demos/llm_routing/docker-compose.yaml @@ -0,0 +1,14 @@ +services: + + chatbot_ui: + build: + context: ../../chatbot_ui + dockerfile: Dockerfile + ports: + - "18080:8080" + environment: + - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:12000/v1 + extra_hosts: + - "host.docker.internal:host-gateway" + volumes: + - ./arch_config.yaml:/app/arch_config.yaml diff --git a/e2e_tests/.vscode/settings.json b/e2e_tests/.vscode/settings.json new file mode 100644 index 00000000..98ba633e --- /dev/null +++ b/e2e_tests/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} diff --git a/e2e_tests/README.md b/e2e_tests/README.md new file mode 100644 index 00000000..368fdd13 --- /dev/null +++ b/e2e_tests/README.md @@ -0,0 +1,34 @@ +# e2e tests + +e2e tests for arch llm gateway and prompt gateway + +To be able to run e2e tests successfully run_e2e_script prepares environment in following way, + +1. build and start function_calling demo (using docker compose) +1. build, install and start model server async (using poetry) +1. build and start arch gateway (using docker compose) +1. wait for model server to be ready +1. wait for arch gateway to be ready +1. start e2e tests (using poetry) + 1. runs llm gateway tests for llm routing + 2. 
runs prompt gateway tests to test function calling, parameter gathering and summarization +2. cleanup + 1. stops arch gateway + 2. stops model server + 3. stops function_calling demo + +## How to run + +To run locally make sure that following requirements are met. + +### Requirements + +- Python 3.10 +- Poetry +- Docker + +### Running tests locally + +```sh +sh run_e2e_test.sh +``` diff --git a/e2e_tests/common.py b/e2e_tests/common.py new file mode 100644 index 00000000..7ccee7c4 --- /dev/null +++ b/e2e_tests/common.py @@ -0,0 +1,42 @@ +import json +import os + + +PROMPT_GATEWAY_ENDPOINT = os.getenv( + "PROMPT_GATEWAY_ENDPOINT", "http://localhost:10000/v1/chat/completions" +) +LLM_GATEWAY_ENDPOINT = os.getenv( + "LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1/chat/completions" +) +ARCH_STATE_HEADER = "x-arch-state" + + +def get_data_chunks(stream, n=1): + chunks = [] + for chunk in stream.iter_lines(): + if chunk: + chunk = chunk.decode("utf-8") + chunk_data_id = chunk[0:6] + assert chunk_data_id == "data: " + chunk_data = chunk[6:] + chunk_data = chunk_data.strip() + chunks.append(chunk_data) + if len(chunks) >= n: + break + return chunks + + +def get_arch_messages(response_json): + arch_messages = [] + if response_json and "metadata" in response_json: + # load arch_state from metadata + arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}") + # parse arch_state into json object + arch_state = json.loads(arch_state_str) + # load messages from arch_state + arch_messages_str = arch_state.get("messages", "[]") + # parse messages into json object + arch_messages = json.loads(arch_messages_str) + # append messages from arch gateway to history + return arch_messages + return [] diff --git a/e2e_tests/common_scripts.sh b/e2e_tests/common_scripts.sh new file mode 100644 index 00000000..90d31603 --- /dev/null +++ b/e2e_tests/common_scripts.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +log() { + timestamp=$(date +"%Y-%m-%d %H:%M:%S") + message="$*" + echo 
"$timestamp: $message" +} + +wait_for_healthz() { + local healthz_url="$1" + local timeout_seconds="${2:-30}" # Default timeout of 30 seconds + local sleep_between="${3:-5}" # Default sleep of 5 seconds + + local start_time=$(date +%s) + + while true; do + local response_code=$(curl -s -o /dev/null -w "%{http_code}" "$healthz_url") + + log "Healthz endpoint $healthz_url response code: $response_code" + if [[ "$response_code" -eq 200 ]]; then + log "Healthz endpoint is healthy. Proceeding..." + return 0 + fi + + local elapsed_time=$(( $(date +%s) - $start_time )) + if [[ $elapsed_time -ge $timeout_seconds ]]; then + log "Timeout reached. Healthz endpoint is still unhealthy. Exiting..." + return 1 + fi + + sleep $sleep_between + done +} diff --git a/e2e_tests/poetry.lock b/e2e_tests/poetry.lock new file mode 100644 index 00000000..68ebfcf5 --- /dev/null +++ b/e2e_tests/poetry.lock @@ -0,0 +1,702 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+ +[[package]] +name = "attrs" +version = "24.2.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + +[[package]] +name = "certifi" +version = "2024.8.30" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, + {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, +] + +[[package]] +name = "cffi" +version = "1.17.1" +description = "Foreign Function Interface for Python calling C code." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, + {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, + {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, + {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, + {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, + {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, + {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, + {file 
= "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, + {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, + {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, + {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, + {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, + {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, + {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, + {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, + {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, +] + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "charset-normalizer" +version = "3.4.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, + {file = 
"charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, + {file = 
"charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, + {file = 
"charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, + {file = 
"charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, + 
{file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, + 
{file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, + {file = 
"charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, + {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "coverage" +version = "7.6.4" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "coverage-7.6.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5f8ae553cba74085db385d489c7a792ad66f7f9ba2ee85bfa508aeb84cf0ba07"}, + {file = "coverage-7.6.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8165b796df0bd42e10527a3f493c592ba494f16ef3c8b531288e3d0d72c1f6f0"}, + {file = "coverage-7.6.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7c8b95bf47db6d19096a5e052ffca0a05f335bc63cef281a6e8fe864d450a72"}, + {file = "coverage-7.6.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ed9281d1b52628e81393f5eaee24a45cbd64965f41857559c2b7ff19385df51"}, + {file = "coverage-7.6.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0809082ee480bb8f7416507538243c8863ac74fd8a5d2485c46f0f7499f2b491"}, + {file = "coverage-7.6.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d541423cdd416b78626b55f123412fcf979d22a2c39fce251b350de38c15c15b"}, + {file = "coverage-7.6.4-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:58809e238a8a12a625c70450b48e8767cff9eb67c62e6154a642b21ddf79baea"}, + {file = "coverage-7.6.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c9b8e184898ed014884ca84c70562b4a82cbc63b044d366fedc68bc2b2f3394a"}, + {file = "coverage-7.6.4-cp310-cp310-win32.whl", hash = "sha256:6bd818b7ea14bc6e1f06e241e8234508b21edf1b242d49831831a9450e2f35fa"}, + {file = "coverage-7.6.4-cp310-cp310-win_amd64.whl", hash = "sha256:06babbb8f4e74b063dbaeb74ad68dfce9186c595a15f11f5d5683f748fa1d172"}, + {file = "coverage-7.6.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:73d2b73584446e66ee633eaad1a56aad577c077f46c35ca3283cd687b7715b0b"}, + {file = "coverage-7.6.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:51b44306032045b383a7a8a2c13878de375117946d68dcb54308111f39775a25"}, + {file = "coverage-7.6.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b3fb02fe73bed561fa12d279a417b432e5b50fe03e8d663d61b3d5990f29546"}, + {file = "coverage-7.6.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ed8fe9189d2beb6edc14d3ad19800626e1d9f2d975e436f84e19efb7fa19469b"}, + {file = "coverage-7.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b369ead6527d025a0fe7bd3864e46dbee3aa8f652d48df6174f8d0bac9e26e0e"}, + {file = "coverage-7.6.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ade3ca1e5f0ff46b678b66201f7ff477e8fa11fb537f3b55c3f0568fbfe6e718"}, + {file = "coverage-7.6.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:27fb4a050aaf18772db513091c9c13f6cb94ed40eacdef8dad8411d92d9992db"}, + {file = "coverage-7.6.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4f704f0998911abf728a7783799444fcbbe8261c4a6c166f667937ae6a8aa522"}, + {file = "coverage-7.6.4-cp311-cp311-win32.whl", hash = "sha256:29155cd511ee058e260db648b6182c419422a0d2e9a4fa44501898cf918866cf"}, + {file = "coverage-7.6.4-cp311-cp311-win_amd64.whl", 
hash = "sha256:8902dd6a30173d4ef09954bfcb24b5d7b5190cf14a43170e386979651e09ba19"}, + {file = "coverage-7.6.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:12394842a3a8affa3ba62b0d4ab7e9e210c5e366fbac3e8b2a68636fb19892c2"}, + {file = "coverage-7.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b6b4c83d8e8ea79f27ab80778c19bc037759aea298da4b56621f4474ffeb117"}, + {file = "coverage-7.6.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d5b8007f81b88696d06f7df0cb9af0d3b835fe0c8dbf489bad70b45f0e45613"}, + {file = "coverage-7.6.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b57b768feb866f44eeed9f46975f3d6406380275c5ddfe22f531a2bf187eda27"}, + {file = "coverage-7.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5915fcdec0e54ee229926868e9b08586376cae1f5faa9bbaf8faf3561b393d52"}, + {file = "coverage-7.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b58c672d14f16ed92a48db984612f5ce3836ae7d72cdd161001cc54512571f2"}, + {file = "coverage-7.6.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:2fdef0d83a2d08d69b1f2210a93c416d54e14d9eb398f6ab2f0a209433db19e1"}, + {file = "coverage-7.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8cf717ee42012be8c0cb205dbbf18ffa9003c4cbf4ad078db47b95e10748eec5"}, + {file = "coverage-7.6.4-cp312-cp312-win32.whl", hash = "sha256:7bb92c539a624cf86296dd0c68cd5cc286c9eef2d0c3b8b192b604ce9de20a17"}, + {file = "coverage-7.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:1032e178b76a4e2b5b32e19d0fd0abbce4b58e77a1ca695820d10e491fa32b08"}, + {file = "coverage-7.6.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:023bf8ee3ec6d35af9c1c6ccc1d18fa69afa1cb29eaac57cb064dbb262a517f9"}, + {file = "coverage-7.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b0ac3d42cb51c4b12df9c5f0dd2f13a4f24f01943627120ec4d293c9181219ba"}, + {file = 
"coverage-7.6.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8fe4984b431f8621ca53d9380901f62bfb54ff759a1348cd140490ada7b693c"}, + {file = "coverage-7.6.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5fbd612f8a091954a0c8dd4c0b571b973487277d26476f8480bfa4b2a65b5d06"}, + {file = "coverage-7.6.4-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dacbc52de979f2823a819571f2e3a350a7e36b8cb7484cdb1e289bceaf35305f"}, + {file = "coverage-7.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:dab4d16dfef34b185032580e2f2f89253d302facba093d5fa9dbe04f569c4f4b"}, + {file = "coverage-7.6.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:862264b12ebb65ad8d863d51f17758b1684560b66ab02770d4f0baf2ff75da21"}, + {file = "coverage-7.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5beb1ee382ad32afe424097de57134175fea3faf847b9af002cc7895be4e2a5a"}, + {file = "coverage-7.6.4-cp313-cp313-win32.whl", hash = "sha256:bf20494da9653f6410213424f5f8ad0ed885e01f7e8e59811f572bdb20b8972e"}, + {file = "coverage-7.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:182e6cd5c040cec0a1c8d415a87b67ed01193ed9ad458ee427741c7d8513d963"}, + {file = "coverage-7.6.4-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a181e99301a0ae128493a24cfe5cfb5b488c4e0bf2f8702091473d033494d04f"}, + {file = "coverage-7.6.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:df57bdbeffe694e7842092c5e2e0bc80fff7f43379d465f932ef36f027179806"}, + {file = "coverage-7.6.4-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bcd1069e710600e8e4cf27f65c90c7843fa8edfb4520fb0ccb88894cad08b11"}, + {file = "coverage-7.6.4-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99b41d18e6b2a48ba949418db48159d7a2e81c5cc290fc934b7d2380515bd0e3"}, + {file = 
"coverage-7.6.4-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6b1e54712ba3474f34b7ef7a41e65bd9037ad47916ccb1cc78769bae324c01a"}, + {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:53d202fd109416ce011578f321460795abfe10bb901b883cafd9b3ef851bacfc"}, + {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:c48167910a8f644671de9f2083a23630fbf7a1cb70ce939440cd3328e0919f70"}, + {file = "coverage-7.6.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cc8ff50b50ce532de2fa7a7daae9dd12f0a699bfcd47f20945364e5c31799fef"}, + {file = "coverage-7.6.4-cp313-cp313t-win32.whl", hash = "sha256:b8d3a03d9bfcaf5b0141d07a88456bb6a4c3ce55c080712fec8418ef3610230e"}, + {file = "coverage-7.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:f3ddf056d3ebcf6ce47bdaf56142af51bb7fad09e4af310241e9db7a3a8022e1"}, + {file = "coverage-7.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cb7fa111d21a6b55cbf633039f7bc2749e74932e3aa7cb7333f675a58a58bf3"}, + {file = "coverage-7.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11a223a14e91a4693d2d0755c7a043db43d96a7450b4f356d506c2562c48642c"}, + {file = "coverage-7.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a413a096c4cbac202433c850ee43fa326d2e871b24554da8327b01632673a076"}, + {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00a1d69c112ff5149cabe60d2e2ee948752c975d95f1e1096742e6077affd376"}, + {file = "coverage-7.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f76846299ba5c54d12c91d776d9605ae33f8ae2b9d1d3c3703cf2db1a67f2c0"}, + {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fe439416eb6380de434886b00c859304338f8b19f6f54811984f3420a2e03858"}, + {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = 
"sha256:0294ca37f1ba500667b1aef631e48d875ced93ad5e06fa665a3295bdd1d95111"}, + {file = "coverage-7.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6f01ba56b1c0e9d149f9ac85a2f999724895229eb36bd997b61e62999e9b0901"}, + {file = "coverage-7.6.4-cp39-cp39-win32.whl", hash = "sha256:bc66f0bf1d7730a17430a50163bb264ba9ded56739112368ba985ddaa9c3bd09"}, + {file = "coverage-7.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:c481b47f6b5845064c65a7bc78bc0860e635a9b055af0df46fdf1c58cebf8e8f"}, + {file = "coverage-7.6.4-pp39.pp310-none-any.whl", hash = "sha256:3c65d37f3a9ebb703e710befdc489a38683a5b152242664b973a7b7b22348a4e"}, + {file = "coverage-7.6.4.tar.gz", hash = "sha256:29fc0f17b1d3fea332f8001d4558f8214af7f1d87a345f3a133c901d60347c73"}, +] + +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + +[package.extras] +toml = ["tomli"] + +[[package]] +name = "deepdiff" +version = "8.0.1" +description = "Deep Difference and Search of any Python object/data. Recreate objects by adding adding deltas to each other." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "deepdiff-8.0.1-py3-none-any.whl", hash = "sha256:42e99004ce603f9a53934c634a57b04ad5900e0d8ed0abb15e635767489cbc05"}, + {file = "deepdiff-8.0.1.tar.gz", hash = "sha256:245599a4586ab59bb599ca3517a9c42f3318ff600ded5e80a3432693c8ec3c4b"}, +] + +[package.dependencies] +orderly-set = "5.2.2" + +[package.extras] +cli = ["click (==8.1.7)", "pyyaml (==6.0.1)"] +optimize = ["orjson"] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "idna" +version = "3.10" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.6" +files = [ + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, +] + +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false 
+python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "orderly-set" +version = "5.2.2" +description = "Orderly set" +optional = false +python-versions = ">=3.8" +files = [ + {file = "orderly_set-5.2.2-py3-none-any.whl", hash = "sha256:f7a37c95a38c01cdfe41c3ffb62925a318a2286ea0a41790c057fc802aec54da"}, + {file = "orderly_set-5.2.2.tar.gz", hash = "sha256:52a18b86aaf3f5d5a498bbdb27bf3253a4e5c57ab38e5b7a56fa00115cd28448"}, +] + +[[package]] +name = "outcome" +version = "1.3.0.post0" +description = "Capture the outcome of Python function calls." +optional = false +python-versions = ">=3.7" +files = [ + {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"}, + {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"}, +] + +[package.dependencies] +attrs = ">=19.2.0" + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + 
+[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pycparser" +version = "2.22" +description = "C parser in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, +] + +[[package]] +name = "pysocks" +version = "1.7.1" +description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, + {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, + {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, +] + +[[package]] +name = "pytest" +version = "7.4.4" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + 
+[[package]] +name = "pytest-cov" +version = "4.1.0" +description = "Pytest plugin for measuring coverage." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"}, + {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"}, +] + +[package.dependencies] +coverage = {version = ">=5.2.1", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] + +[[package]] +name = "pytest-sugar" +version = "1.0.0" +description = "pytest-sugar is a plugin for pytest that changes the default look and feel of pytest (e.g. progressbar, show tests that fail instantly)." +optional = false +python-versions = "*" +files = [ + {file = "pytest-sugar-1.0.0.tar.gz", hash = "sha256:6422e83258f5b0c04ce7c632176c7732cab5fdb909cb39cca5c9139f81276c0a"}, + {file = "pytest_sugar-1.0.0-py3-none-any.whl", hash = "sha256:70ebcd8fc5795dc457ff8b69d266a4e2e8a74ae0c3edc749381c64b5246c8dfd"}, +] + +[package.dependencies] +packaging = ">=21.3" +pytest = ">=6.2.0" +termcolor = ">=2.1.0" + +[package.extras] +dev = ["black", "flake8", "pre-commit"] + +[[package]] +name = "requests" +version = "2.32.3" +description = "Python HTTP for Humans." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "selenium" +version = "4.25.0" +description = "Official Python bindings for Selenium WebDriver" +optional = false +python-versions = ">=3.8" +files = [ + {file = "selenium-4.25.0-py3-none-any.whl", hash = "sha256:3798d2d12b4a570bc5790163ba57fef10b2afee958bf1d80f2a3cf07c4141f33"}, + {file = "selenium-4.25.0.tar.gz", hash = "sha256:95d08d3b82fb353f3c474895154516604c7f0e6a9a565ae6498ef36c9bac6921"}, +] + +[package.dependencies] +certifi = ">=2021.10.8" +trio = ">=0.17,<1.0" +trio-websocket = ">=0.9,<1.0" +typing_extensions = ">=4.9,<5.0" +urllib3 = {version = ">=1.26,<3", extras = ["socks"]} +websocket-client = ">=1.8,<2.0" + +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +optional = false +python-versions = "*" +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = 
"sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] + +[[package]] +name = "termcolor" +version = "2.5.0" +description = "ANSI color formatting for output in terminal" +optional = false +python-versions = ">=3.9" +files = [ + {file = "termcolor-2.5.0-py3-none-any.whl", hash = "sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8"}, + {file = "termcolor-2.5.0.tar.gz", hash = "sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + +[[package]] +name = "tomli" +version = "2.0.2" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, + {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, +] + +[[package]] +name = "trio" +version = "0.27.0" +description = "A friendly Python library for async concurrency and I/O" +optional = false +python-versions = ">=3.8" +files = [ + {file = "trio-0.27.0-py3-none-any.whl", hash = "sha256:68eabbcf8f457d925df62da780eff15ff5dc68fd6b367e2dde59f7aaf2a0b884"}, + {file = "trio-0.27.0.tar.gz", hash = "sha256:1dcc95ab1726b2da054afea8fd761af74bad79bd52381b84eae408e983c76831"}, +] + +[package.dependencies] +attrs = ">=23.2.0" +cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""} +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +idna = "*" +outcome = "*" +sniffio = ">=1.3.0" +sortedcontainers = "*" + +[[package]] +name = "trio-websocket" +version = "0.11.1" +description = "WebSocket library for Trio" +optional = false +python-versions = ">=3.7" +files = [ + {file = "trio-websocket-0.11.1.tar.gz", hash = "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f"}, + {file = "trio_websocket-0.11.1-py3-none-any.whl", hash = 
"sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638"}, +] + +[package.dependencies] +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +trio = ">=0.11" +wsproto = ">=0.14" + +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[[package]] +name = "urllib3" +version = "2.2.3" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, + {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, +] + +[package.dependencies] +pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "websocket-client" +version = "1.8.0" +description = "WebSocket client for Python with low level API options" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, + {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, +] + +[package.extras] +docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] +optional = 
["python-socks", "wsaccel"] +test = ["websockets"] + +[[package]] +name = "wsproto" +version = "1.2.0" +description = "WebSockets state-machine based protocol implementation" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"}, + {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, +] + +[package.dependencies] +h11 = ">=0.9.0,<1" + +[metadata] +lock-version = "2.0" +python-versions = "^3.10" +content-hash = "6ae4fa6397091b87b63698201a08d7d97628ed65992d46514f118768b46b99ce" diff --git a/e2e_tests/pyproject.toml b/e2e_tests/pyproject.toml new file mode 100644 index 00000000..68724c18 --- /dev/null +++ b/e2e_tests/pyproject.toml @@ -0,0 +1,23 @@ +[tool.poetry] +name = "e2e_tests" +version = "0.0.1" +description = "e2e tests for prompt and llm gateway" +authors = ["Katanemo Labs, Inc "] +license = "Apache 2.0" +readme = "README.md" +package-mode = false + +[tool.poetry.dependencies] +python = "^3.10" +pytest = "^7.3.1" +requests = "^2.29.0" +selenium = "^4.11.2" +pytest-sugar = "^1.0.0" +deepdiff = "^8.0.1" + +[tool.poetry.dev-dependencies] +pytest-cov = "^4.1.0" + +[tool.pytest.ini_options] +python_files = ["test*.py"] +addopts = ["-v", "-s"] diff --git a/e2e_tests/run_e2e_tests.sh b/e2e_tests/run_e2e_tests.sh new file mode 100644 index 00000000..47f56459 --- /dev/null +++ b/e2e_tests/run_e2e_tests.sh @@ -0,0 +1,80 @@ +#/bin/bash +# if any of the commands fail, the script will exit +set -e + +. 
#!/bin/bash
# End-to-end test driver: builds and starts the function_calling demo, the
# model server, and the arch gateway, runs the pytest e2e suite, then tears
# everything down. Build noise goes to ../build.log; progress goes to stdout
# via the `log` helper from common_scripts.sh.
# If any command fails, the script exits (set -e) after dumping debug logs.
set -e

. ./common_scripts.sh

# Dump the tails of the model-server and docker build logs so CI failures
# are diagnosable without shelling into the runner.
print_debug() {
  log "Received signal to stop"
  log "Printing debug logs for model_server"
  log "===================================="
  tail -n 500 ~/archgw_logs/modelserver.log
  log "Printing debug logs for docker"
  log "====================================="
  tail -n 500 ../build.log
}

trap 'print_debug' INT TERM ERR

log starting > ../build.log

log building function_calling demo
log ==============================
cd ../demos/function_calling
docker compose build -q

log starting the function_calling demo
docker compose up -d
cd -

log building model server
log =====================
cd ../model_server
# NOTE: the original used `2>&1 >> ../build.log`, which redirects stderr to
# the terminal and only stdout to the log; `>> file 2>&1` captures both.
poetry install >> ../build.log 2>&1
log starting model server
log =====================
mkdir -p ~/archgw_logs
touch ~/archgw_logs/modelserver.log
poetry run archgw_modelserver restart &
# Stream the model-server log to CI output while we wait for it to come up.
tail -F ~/archgw_logs/modelserver.log &
model_server_tail_pid=$!
cd -

log building llm and prompt gateway rust modules
log ============================================
cd ../arch
docker build -f Dockerfile .. -t katanemo/archgw -q
log starting the arch gateway service
log =================================
docker compose -f docker-compose.e2e.yaml down
log waiting for model service to be healthy
wait_for_healthz "http://localhost:51000/healthz" 300
# Model server is healthy; stop mirroring its log to stdout.
kill "$model_server_tail_pid"
docker compose -f docker-compose.e2e.yaml up -d
log waiting for arch gateway service to be healthy
wait_for_healthz "http://localhost:10000/healthz" 60
cd -

log running e2e tests
log =================
poetry install >> ../build.log 2>&1
poetry run pytest

log shutting down the arch gateway service
log ======================================
cd ../arch
docker compose -f docker-compose.e2e.yaml stop >> ../build.log 2>&1
cd -

log shutting down the function_calling demo
log =======================================
cd ../demos/function_calling
docker compose down >> ../build.log 2>&1
cd -

log shutting down the model server
log ==============================
cd ../model_server
poetry run archgw_modelserver stop >> ../build.log 2>&1
cd -
import json

import pytest
import requests

from common import LLM_GATEWAY_ENDPOINT, get_data_chunks


# Exercise the LLM gateway's default model routing, and the explicit override
# via the x-arch-llm-provider-hint header, in both streaming and
# non-streaming modes.
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("provider_hint", [None, "gpt-3.5-turbo-0125"])
def test_hello_llm_gateway_llm(stream, provider_hint):
    """A plain "hello" should be answered by the hinted LLM, or the default one."""
    expected_llm = provider_hint if provider_hint is not None else "gpt-4o-mini-2024-07-18"

    payload = {
        "messages": [{"role": "user", "content": "hello"}],
        "stream": stream,
    }
    request_headers = {"x-arch-llm-provider-hint": provider_hint} if provider_hint else {}

    resp = requests.post(
        LLM_GATEWAY_ENDPOINT, json=payload, stream=stream, headers=request_headers
    )
    assert resp.status_code == 200

    if not stream:
        assert resp.json().get("model") == expected_llm
        return

    # Streaming: the model name is reported on every SSE chunk; check the first.
    chunks = get_data_chunks(resp)
    assert len(chunks) > 0
    first_chunk = json.loads(chunks[0])
    assert first_chunk.get("model") == expected_llm
import json

import pytest
import requests
from deepdiff import DeepDiff

from common import PROMPT_GATEWAY_ENDPOINT, get_arch_messages, get_data_chunks


def _delta_tool_calls(choice):
    """Return the tool_calls list from a streamed choice's delta.

    The gateway may emit ``"tool_calls": null`` (and the original
    ``.get("tool_calls", [])`` would then return None and crash ``len``),
    so missing and null are both normalized to an empty list.
    """
    return (choice.get("delta") or {}).get("tool_calls") or []


def _assert_streamed_agent_flow(chunks, expected_tool_call):
    """Verify the three-phase streamed agent flow.

    Chunk 0: Arch emits the tool call (role=assistant); chunk 1: the API call
    result (role=tool); chunks 2..end: the LLM summarization (role=assistant).
    """
    # first chunk is tool calls (role = assistant)
    first = json.loads(chunks[0])
    # .get("model", "") so a missing field fails the assert instead of
    # raising AttributeError on None.startswith
    assert first.get("model", "").startswith("Arch")
    choices = first.get("choices", [])
    assert len(choices) > 0
    assert choices[0]["delta"].get("role") == "assistant"
    tool_calls = _delta_tool_calls(choices[0])
    assert len(tool_calls) > 0
    diff = DeepDiff(tool_calls[0]["function"], expected_tool_call, ignore_string_case=True)
    assert not diff

    # second chunk is api call result (role = tool)
    second = json.loads(chunks[1])
    choices = second.get("choices", [])
    assert len(choices) > 0
    assert choices[0]["delta"].get("role") == "tool"

    # third..end chunk is summarization (role = assistant)
    third = json.loads(chunks[2])
    assert third.get("model", "").startswith("gpt-4o-mini")
    choices = third.get("choices", [])
    assert len(choices) > 0
    assert choices[0]["delta"].get("role") == "assistant"


def _assert_non_streamed_tool_flow(response_json, expected_tool_call):
    """Verify a non-streamed response: LLM summary plus arch_messages metadata
    carrying the tool call and the API response."""
    assert response_json.get("model", "").startswith("gpt-4o-mini")
    choices = response_json.get("choices", [])
    assert len(choices) > 0
    assert choices[0]["message"].get("role") == "assistant"
    # verify arch_messages (tool call and api response) sent as response metadata
    arch_messages = get_arch_messages(response_json)
    assert len(arch_messages) == 2
    tool_calls = arch_messages[0].get("tool_calls") or []
    assert len(tool_calls) > 0
    diff = DeepDiff(tool_calls[0]["function"], expected_tool_call, ignore_string_case=True)
    assert not diff


@pytest.mark.parametrize("stream", [True, False])
def test_prompt_gateway(stream):
    """Prompt with all required parameters: full tool-call + summarize flow."""
    expected_tool_call = {
        "name": "weather_forecast",
        "arguments": {"city": "seattle", "days": 10},
    }
    body = {
        "messages": [
            {
                "role": "user",
                "content": "how is the weather in seattle for next 10 days",
            }
        ],
        "stream": stream,
    }
    response = requests.post(PROMPT_GATEWAY_ENDPOINT, json=body, stream=stream)
    assert response.status_code == 200
    if stream:
        chunks = get_data_chunks(response, n=20)
        assert len(chunks) > 2
        _assert_streamed_agent_flow(chunks, expected_tool_call)
    else:
        _assert_non_streamed_tool_flow(response.json(), expected_tool_call)


@pytest.mark.parametrize("stream", [True, False])
def test_prompt_gateway_arch_direct_response(stream):
    """Prompt missing every tool parameter: Arch answers directly, no tool call."""
    body = {
        "messages": [
            {
                "role": "user",
                "content": "how is the weather",
            }
        ],
        "stream": stream,
    }
    response = requests.post(PROMPT_GATEWAY_ENDPOINT, json=body, stream=stream)
    assert response.status_code == 200
    if stream:
        chunks = get_data_chunks(response, n=3)
        assert len(chunks) > 0
        response_json = json.loads(chunks[0])
        # make sure arch responded directly
        assert response_json.get("model", "").startswith("Arch")
        # and tool call is null
        choices = response_json.get("choices", [])
        assert len(choices) > 0
        assert len(_delta_tool_calls(choices[0])) == 0
    else:
        response_json = response.json()
        assert response_json.get("model", "").startswith("Arch")
        choices = response_json.get("choices", [])
        assert len(choices) > 0
        message = choices[0]["message"]["content"]
        # with no city at all, Arch should NOT be asking only for `days`
        assert "Could you provide the following details days" not in message


@pytest.mark.parametrize("stream", [True, False])
def test_prompt_gateway_param_gathering(stream):
    """Prompt with city but no days: Arch should ask for the missing parameter."""
    body = {
        "messages": [
            {
                "role": "user",
                "content": "how is the weather in seattle",
            }
        ],
        "stream": stream,
    }
    response = requests.post(PROMPT_GATEWAY_ENDPOINT, json=body, stream=stream)
    assert response.status_code == 200
    if stream:
        chunks = get_data_chunks(response, n=3)
        assert len(chunks) > 0
        response_json = json.loads(chunks[0])
        # make sure arch responded directly
        assert response_json.get("model", "").startswith("Arch")
        # and tool call is null
        choices = response_json.get("choices", [])
        assert len(choices) > 0
        assert len(_delta_tool_calls(choices[0])) == 0
    else:
        response_json = response.json()
        assert response_json.get("model", "").startswith("Arch")
        choices = response_json.get("choices", [])
        assert len(choices) > 0
        message = choices[0]["message"]["content"]
        assert "Could you provide the following details days" in message


@pytest.mark.parametrize("stream", [True, False])
def test_prompt_gateway_param_tool_call(stream):
    """Follow-up turn supplies the missing `days`; the tool call then proceeds."""
    expected_tool_call = {
        "name": "weather_forecast",
        "arguments": {"city": "seattle", "days": 2},
    }
    body = {
        "messages": [
            {
                "role": "user",
                "content": "how is the weather in seattle",
            },
            {
                "role": "assistant",
                "content": "Could you provide the following details days ?",
                "model": "Arch-Function-1.5B",
            },
            {
                "role": "user",
                "content": "2 days",
            },
        ],
        "stream": stream,
    }
    response = requests.post(PROMPT_GATEWAY_ENDPOINT, json=body, stream=stream)
    assert response.status_code == 200
    if stream:
        chunks = get_data_chunks(response, n=20)
        assert len(chunks) > 2
        _assert_streamed_agent_flow(chunks, expected_tool_call)
    else:
        _assert_non_streamed_tool_flow(response.json(), expected_tool_call)


@pytest.mark.parametrize("stream", [True, False])
def test_prompt_gateway_default_target(stream):
    """Prompt matching no tool routes to the default target (api_server)."""
    body = {
        "messages": [
            {
                "role": "user",
                "content": "hello, what can you do for me?",
            },
        ],
        "stream": stream,
    }
    response = requests.post(PROMPT_GATEWAY_ENDPOINT, json=body, stream=stream)
    assert response.status_code == 200
    if stream:
        chunks = get_data_chunks(response, n=3)
        # we index chunks[1] below, so require at least two chunks
        # (original guard was `> 0`, an off-by-one)
        assert len(chunks) > 1
        first = json.loads(chunks[0])
        assert first.get("model", "").startswith("api_server")
        assert len(first.get("choices", [])) > 0
        assert first["choices"][0]["delta"]["role"] == "assistant"

        second = json.loads(chunks[1])
        choices = second.get("choices", [])
        assert len(choices) > 0
        content = choices[0]["delta"]["content"]
        assert (
            content == "I can help you with weather forecast or insurance claim details"
        )
    else:
        response_json = response.json()
        assert response_json.get("model", "").startswith("api_server")
        assert len(response_json.get("choices", [])) > 0
        assert response_json["choices"][0]["message"]["role"] == "assistant"
        assert (
            response_json["choices"][0]["message"]["content"]
            == "I can help you with weather forecast or insurance claim details"
        )